changed VQSR to take care of mixed positions and made the script a li…

…ttle more general (now uppmax prpoject is not hard coded) and less dependent on folder structure (samples are passed via config file or via 00_samples.txt file
SciLifeLab · Jan 30, 2017 · c216772 · c216772
1 parent fa8d2c8
commit c216772
Show file tree

Hide file tree

Showing 10 changed files with 295 additions and 112 deletions.
diff --git a/common.py b/common.py
@@ -35,6 +35,19 @@ def find_VCF(project, uppmax_project):
     return samples
 
 
+def slurm_header(uppmax_project, job_name, working_dir):
+    header  = "#!/bin/bash -l\n"
+    header += "#SBATCH -A {}\n".format(uppmax_project)
+    header += "#SBATCH -p node\n"
+    header += "#SBATCH -n 16\n"
+    header += "#SBATCH -t 10-00:00:00\n"
+    header += "#SBATCH -J {}\n".format(job_name)
+    header += "#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name )
+    header += "#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name )
+    header += "module load bioinfo-tools\n"
+    header += "module load GATK/3.5.0\n"
+    return header
+
 
 
 def submit_jobs(sbatch_files, pending_jobs = None):

diff --git a/example/config.yaml b/example/config.yaml
@@ -7,8 +7,6 @@ samples:
     - /proj/ngi2016003/nobackup/NGI/ANALYSIS/P2652/piper_ngi/07_variant_calls/P2652_239.clean.dedup.recal.bam.genomic.vcf.gz
     - /proj/ngi2016003/nobackup/NGI/ANALYSIS/P2652/piper_ngi/07_variant_calls/P2652_241.clean.dedup.recal.bam.genomic.vcf.gz
     - /proj/ngi2016003/nobackup/NGI/ANALYSIS/P2652/piper_ngi/07_variant_calls/P2652_244.clean.dedup.recal.bam.genomic.vcf.gz
-#Assumes NGI structure, i.e., /proj/{}/nobackup/NGI/ANALYSIS/{}/piper_ngi/07_variant_calls/".format(uppmax_project, project)
-projects:
 #path to a folder containing an .interval file for each interval we want to work on
 intervals: /proj/ngi2016003/nobackup/vezzi/SwedishReferenceGenomeProjects/develop/test/00_intervals
 #True if one wants to use scratch
@@ -20,9 +18,7 @@ batch_size: 4
 #how the output file names should be called
 output_header: "SRG"
 #uppmax project where one need to seach for samples (used only if projects are specified)
-uppmax_projects:
-    - ngi2016001
-    - ngi2016003
+uppmax_project: a201002
 #options for walkers
 #path to GATK
 GATK : "/sw/apps/bioinfo/GATK/3.5.0/GenomeAnalysisTK.jar"
@@ -59,8 +55,9 @@ walkers:
              - "-an FS"
              - "-an SOR"
              - "-an DP"
-#             - "-an InbreedingCoeff"
+             - "-an InbreedingCoeff"
              - "-mode SNP"
+             - "tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 "
         - INDEL:
               - "--maxGaussians 4"
               - "-resource:mills,known=false,training=true,truth=true,prior=12.0 /lupus/ngi/resources/piper/gatk_bundle/2.8/b37/Mills_and_1000G_gold_standard.indels.b37.vcf" 
@@ -71,13 +68,14 @@ walkers:
               - "-an SOR" 
               - "-an ReadPosRankSum" 
               - "-an MQRankSum" 
-#              - "-an InbreedingCoeff"
+              - "-an InbreedingCoeff"
               - "-mode INDEL"
+              - "-tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0"
     ApplyRecalibration:
         - "-R /lupus/ngi/resources/piper/gatk_bundle/2.8/b37/human_g1k_v37.fasta" 
         - "-nt 16"
         - SNP:
-              - "--ts_filter_level 99.5"
+              - "--ts_filter_level 99.0"
               - "-mode SNP"
         - INDEL:
               - "--ts_filter_level 99.0"

diff --git a/joint_variant_calling.py b/joint_variant_calling.py
@@ -9,6 +9,7 @@
 from walkers.SelectVariants import  SelectVariants
 from walkers.VariantRecalibrator import VariantRecalibrator
 from walkers.ApplyRecalibration import ApplyRecalibration
+from walkers.VQSR import VQSR
 
 from utils.config import CONFIG
 
@@ -25,7 +26,7 @@ def check_configuration():
     
     :returns: True if all checks succed, false otherwise
     """
-    mandatory_args = ('dry_run', 'scratch', 'batch_size', 'output_header', 'intervals', 'walkers', 'samples', 'uppmax_projects', 'projects' , 'GATK')
+    mandatory_args = ('dry_run', 'scratch', 'batch_size', 'output_header', 'intervals', 'walkers',  'uppmax_project',  'GATK')
     for mandatory_arg in mandatory_args:
         if mandatory_arg not in CONFIG:
             print "ERROR: argument {} is mandatory. If you think it does not apply leave it empty".format(mandatory_arg)
@@ -38,32 +39,27 @@ def check_configuration():
             return False
 
     #now at least I know all keys are present
-    if CONFIG["samples"] is None and CONFIG["projects"] is None and not os.path.exists("00_samples.txt"):
-        print "ERROR: at least one one between projects and/or  samples must contain a list. In alternative specify the file 00_"
-        return False
-    #if project is not none than uppmax-project need do be specifed
-    if CONFIG["projects"] is not None and CONFIG["uppmax_projects"] is None:
-        print "ERROR: if projects are specified their uppmax-project need to be specified as well"
+    if ("samples" not in CONFIG or CONFIG["samples"] is None) and not os.path.exists("00_samples.txt"):
+        print "ERROR: at least one one between samples and/or 00_samples.txt must contain a list of samples"
         return False
     #check tha provided uppmax projects exists (if provided)
-    if CONFIG["uppmax_projects"] is not None:
-        for uppmax_project in CONFIG["uppmax_projects"]:
-            if not os.path.exists("/proj/{}".format(uppmax_project)):
-                print "ERROR: uppmax project {} does not exists.".format(uppmax_project)
-                return False
+    if CONFIG["uppmax_project"] is not None:
+        if not os.path.exists("/proj/{}".format(CONFIG["uppmax_project"])):
+            print "ERROR: uppmax project {} does not exists.".format(CONFIG["uppmax_project"])
+            return False
     #do the same sanity check for samples and create the to be join called list
     CONFIG["samples_JC"] = []
-    if CONFIG["samples"] is not None:
+    if os.path.exists("00_samples.txt"):
+        print "WARNING: file 00_samples.txt exists, I will use only samples specified in this file and I will not give a shit about those in the config."
+        with open("00_samples.txt", "r") as samplesFile:
+            for sample in samplesFile:
+                CONFIG["samples_JC"].append(sample.rstrip())
+    else:
         for sample in CONFIG["samples"]:
             if not  os.path.exists("{}".format(sample)):
                 print "ERROR: sample  {} does not exists.".format(sample)
                 return False
             CONFIG["samples_JC"].append(sample)
-    if CONFIG["projects"] is not None:
-        for project in CONFIG["projects"]:
-            for uppmax_project in CONFIG["uppmax_projects"]:
-                #for each project check the directory that is supposed to contain the abalysis
-                CONFIG["samples_JC"] += find_VCF(project, uppmax_project)
     #check for duplicates, if identified stop computation
     samples_to_be_joint_called_hash = {}
     for sample in CONFIG["samples_JC"]:
@@ -89,19 +85,12 @@ def check_configuration():
 def main(args):
     config = conf.load_yaml_config(args.configuration)
     if not check_configuration():
-        sys.exit("ERROR: configuraiton file was malformed, please edit it and retry")
+        sys.exit("ERROR: configuration file was malformed, please edit it and retry")
     #store in a file path to vcf that are going to be analysed
     if args.resume and os.path.exists("00_samples.txt"):
-        sys.exit("ERROR: --resume  specified, however 00_samples.txt found. Please if you want to resume analysis, remove/move 00_samples.txt, 02_GenotypeGVCF, 03_ ... ")
-    if os.path.exists("00_samples.txt"):
-        print "WARNING: file 00_samples.txt exists, I will replace samples in config file with those stored in this file."
-        #delete samples sorted in previous step
-        CONFIG["samples_JC"] = []
-        with open("00_samples.txt", "r") as samplesFile:
-            for sample in samplesFile:
-                CONFIG["samples_JC"].append(sample.rstrip())
-    elif not args.resume:
-        #otherwise create the file and store the samples
+        sys.exit("ERROR: -- resume  specified, however 00_samples.txt found. Please if you want to resume analysis, remove/move 00_samples.txt, 02_GenotypeGVCF, 03_ ... ")
+    if not args.resume:
+        #create the file 00_samples.txt in order to prevent deleting by mistake analysis
         with open("00_samples.txt", "w") as samplesFile:
             for sample in CONFIG["samples_JC"]:
                 samplesFile.write("{}\n".format(sample))
@@ -129,20 +118,26 @@ def main(args):
     if not CONFIG["dry_run"]:
         slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
     #now perofmr VQSR
-    #start with select variants and variant evaluation
-    sbatch_files = SelectVariants()
-    #and execute
-    if not CONFIG["dry_run"]:
-        slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
-    #then perfomr VQSR
-    sbatch_files = VariantRecalibrator()
-    #and execute
-    if not CONFIG["dry_run"]:
-        slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
-    #than ApplyRecalibration
-    sbatch_files = ApplyRecalibration()
-    if not CONFIG["dry_run"]:
-        slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
+    if args.mixed_positions:
+        sbatch_files = VQSR()
+        #and execute
+        if not CONFIG["dry_run"]:
+            slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
+    else:
+        #start with select variants and variant evaluation
+        sbatch_files = SelectVariants()
+        #and execute
+        if not CONFIG["dry_run"]:
+            slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
+        #then perfomr VQSR
+        sbatch_files = VariantRecalibrator()
+        #and execute
+        if not CONFIG["dry_run"]:
+            slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
+        #than ApplyRecalibration
+        sbatch_files = ApplyRecalibration()
+        if not CONFIG["dry_run"]:
+            slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
 
 
 
@@ -151,6 +146,7 @@ def main(args):
     parser = argparse.ArgumentParser("""Scripts performs join variant calling on all samples provided, or on all samples analysed belonging to the projects specified""")
     parser.add_argument('--configuration', help="configuration file, give a look to the example one to see what to do", type=str)
     parser.add_argument('--resume', action='store_true', help="if this option is specified 01_Combine_GVCFs needs to be created and populated, all other folders need to be deleted/moved. In this modality the script deletes only the last batch (if needed) and creates a n new (or more) extra batches to account for new samples",  default=False )
+    parser.add_argument('--mixed-positions', help="With this option run the BP as suggested in http://gatkforums.broadinstitute.org/gatk/discussion/2805/howto-recalibrate-variant-quality-scores-run-vqsr (this is the reccomended way)", action='store_true')
     args = parser.parse_args()
     main(args)
 

diff --git a/walkers/ApplyRecalibration.py b/walkers/ApplyRecalibration.py
@@ -5,6 +5,7 @@
 
 from utils.config import CONFIG
 from common import atoi, natural_keys
+from common import slurm_header
 
 
 
@@ -28,16 +29,9 @@ def build_ApplyRecalibration_sbatch(working_dir, variant_raw,  recal, tranches,
     #create the sbatch file to merge all varaints or to copy the already single one
     sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
     with open(sbatch_file, "w") as ApplyRecalibration:
-        ApplyRecalibration.write("#!/bin/bash -l\n")
-        ApplyRecalibration.write("#SBATCH -A ngi2016003\n")
-        ApplyRecalibration.write("#SBATCH -p node\n")
-        ApplyRecalibration.write("#SBATCH -n 16\n")
-        ApplyRecalibration.write("#SBATCH -t 10-00:00:00\n")
-        ApplyRecalibration.write("#SBATCH -J {}\n".format(job_name))
-        ApplyRecalibration.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
-        ApplyRecalibration.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
-        ApplyRecalibration.write("module load bioinfo-tools\n")
-        ApplyRecalibration.write("module load GATK/3.5.0\n")
+        slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
+        ApplyRecalibration.write(slurm)
+        ApplyRecalibration.write("\n")
 
         if scratch:
             ApplyRecalibration.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory

diff --git a/walkers/CatVariants.py b/walkers/CatVariants.py
@@ -5,6 +5,7 @@
 
 from utils.config import CONFIG
 from common import atoi, natural_keys
+from common import slurm_header
 
 
 
@@ -23,16 +24,9 @@ def build_CatVariants_sbatch(working_dir, variants_dir,  scratch=False):
     #create the sbatch file to merge all varaints or to copy the already single one
     sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
     with open(sbatch_file, "w") as CatVariants:
-        CatVariants.write("#!/bin/bash -l\n")
-        CatVariants.write("#SBATCH -A ngi2016003\n")
-        CatVariants.write("#SBATCH -p node\n")
-        CatVariants.write("#SBATCH -n 16\n")
-        CatVariants.write("#SBATCH -t 10-00:00:00\n")
-        CatVariants.write("#SBATCH -J {}\n".format(job_name))
-        CatVariants.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
-        CatVariants.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
-        CatVariants.write("module load bioinfo-tools\n")
-        CatVariants.write("module load GATK/3.5.0\n")
+        slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
+        CatVariants.write(slurm)
+        CatVariants.write("\n")
 
         if len(CONFIG["intervals_list"]) == 0:
             #in this case I need only to copy the already single file

diff --git a/walkers/CombineGVCF.py b/walkers/CombineGVCF.py
@@ -3,6 +3,7 @@
 import re
 
 from utils.config import CONFIG
+from common import slurm_header
 
 
 def build_CombineGVCFs_sbatch(working_dir, batch, current_batch, scratch=False, interval=None):
@@ -28,16 +29,9 @@ def build_CombineGVCFs_sbatch(working_dir, batch, current_batch, scratch=False,
     #create the sbatch file to analyse the current batch of samples
     sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
     with open(sbatch_file, "w") as CombineGVCFsFile:
-        CombineGVCFsFile.write("#!/bin/bash -l\n")
-        CombineGVCFsFile.write("#SBATCH -A ngi2016003\n")
-        CombineGVCFsFile.write("#SBATCH -p node\n")
-        CombineGVCFsFile.write("#SBATCH -n 8\n")
-        CombineGVCFsFile.write("#SBATCH -t 10-00:00:00\n")
-        CombineGVCFsFile.write("#SBATCH -J {}\n".format(job_name))
-        CombineGVCFsFile.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
-        CombineGVCFsFile.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
-        CombineGVCFsFile.write("module load bioinfo-tools\n")
-        CombineGVCFsFile.write("module load GATK/3.5.0\n")
+        slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
+        CombineGVCFsFile.write(slurm)
+        CombineGVCFsFile.write("\n")
         #rsync to scratch all samples
         if scratch:
             CombineGVCFsFile.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory

diff --git a/walkers/GenotypeGVCFs.py b/walkers/GenotypeGVCFs.py
@@ -2,7 +2,7 @@
 import re
 
 from utils.config import CONFIG
-
+from common import slurm_header
 
 
 def build_GenotypeGVCFs_sbatch(working_dir, combined_gvcf_files, scratch=False, interval=None):
@@ -28,16 +28,9 @@ def build_GenotypeGVCFs_sbatch(working_dir, combined_gvcf_files, scratch=False,
     #create the sbatch file to analyse the current batch of samples
     sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
     with open(sbatch_file, "w") as GenotypeGVCFs:
-        GenotypeGVCFs.write("#!/bin/bash -l\n")
-        GenotypeGVCFs.write("#SBATCH -A ngi2016003\n")
-        GenotypeGVCFs.write("#SBATCH -p node\n")
-        GenotypeGVCFs.write("#SBATCH -n 16\n")
-        GenotypeGVCFs.write("#SBATCH -t 10-00:00:00\n")
-        GenotypeGVCFs.write("#SBATCH -J {}\n".format(job_name))
-        GenotypeGVCFs.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
-        GenotypeGVCFs.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
-        GenotypeGVCFs.write("module load bioinfo-tools\n")
-        GenotypeGVCFs.write("module load GATK/3.5.0\n")
+        slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
+        GenotypeGVCFs.write(slurm)
+        GenotypeGVCFs.write("\n")
         #rsync to scratch all samples
         if scratch:
             GenotypeGVCFs.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory

diff --git a/walkers/SelectVariants.py b/walkers/SelectVariants.py
@@ -5,6 +5,7 @@
 
 from utils.config import CONFIG
 from common import atoi, natural_keys
+from common import slurm_header
 
 
 
@@ -26,16 +27,9 @@ def build_SelectVariants_sbatch(working_dir, variant_file, scratch=False):
     #create the sbatch file to merge all varaints or to copy the already single one
     sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
     with open(sbatch_file, "w") as SelectVariants:
-        SelectVariants.write("#!/bin/bash -l\n")
-        SelectVariants.write("#SBATCH -A ngi2016003\n")
-        SelectVariants.write("#SBATCH -p node\n")
-        SelectVariants.write("#SBATCH -n 16\n")
-        SelectVariants.write("#SBATCH -t 10-00:00:00\n")
-        SelectVariants.write("#SBATCH -J {}\n".format(job_name))
-        SelectVariants.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
-        SelectVariants.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
-        SelectVariants.write("module load bioinfo-tools\n")
-        SelectVariants.write("module load GATK/3.5.0\n")
+        slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
+        SelectVariants.write(slurm)
+        SelectVariants.write("\n")
 
         if scratch:
             SelectVariants.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory