diff --git a/common.py b/common.py
index 11570f2..c829cea 100644
--- a/common.py
+++ b/common.py
@@ -35,6 +35,19 @@ def find_VCF(project, uppmax_project):
     return samples
 
 
+def slurm_header(uppmax_project, working_dir, job_name):
+    header = "#!/bin/bash -l\n"
+    header += "#SBATCH -A {}\n".format(uppmax_project)
+    header += "#SBATCH -p node\n"
+    header += "#SBATCH -n 16\n"
+    header += "#SBATCH -t 10-00:00:00\n"
+    header += "#SBATCH -J {}\n".format(job_name)
+    header += "#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name)
+    header += "#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name)
+    header += "module load bioinfo-tools\n"
+    header += "module load GATK/3.5.0\n"
+    return header
+
 
 def submit_jobs(sbatch_files, pending_jobs = None):
diff --git a/example/config.yaml b/example/config.yaml
index c17e824..20327a4 100644
--- a/example/config.yaml
+++ b/example/config.yaml
@@ -7,8 +7,6 @@ samples:
  - /proj/ngi2016003/nobackup/NGI/ANALYSIS/P2652/piper_ngi/07_variant_calls/P2652_239.clean.dedup.recal.bam.genomic.vcf.gz
  - /proj/ngi2016003/nobackup/NGI/ANALYSIS/P2652/piper_ngi/07_variant_calls/P2652_241.clean.dedup.recal.bam.genomic.vcf.gz
  - /proj/ngi2016003/nobackup/NGI/ANALYSIS/P2652/piper_ngi/07_variant_calls/P2652_244.clean.dedup.recal.bam.genomic.vcf.gz
-#Assumes NGI structure, i.e., /proj/{}/nobackup/NGI/ANALYSIS/{}/piper_ngi/07_variant_calls/".format(uppmax_project, project)
-projects:
 #path to a folder containing an .interval file for each interval we want to work on
 intervals: /proj/ngi2016003/nobackup/vezzi/SwedishReferenceGenomeProjects/develop/test/00_intervals
 #True if one wants to use scratch
@@ -20,9 +18,7 @@ batch_size: 4
 #how the output file names should be called
 output_header: "SRG"
 #uppmax project where one need to seach for samples (used only if projects are specified)
-uppmax_projects:
- - ngi2016001
- - ngi2016003
+uppmax_project: a201002
 #options for walkers
 #path to GATK
 GATK : "/sw/apps/bioinfo/GATK/3.5.0/GenomeAnalysisTK.jar"
@@ -59,8 +55,9 @@ walkers:
       - "-an FS"
       - "-an SOR"
       - "-an DP"
-#      - "-an InbreedingCoeff"
+      - "-an InbreedingCoeff"
       - "-mode SNP"
+      - "-tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0"
     - INDEL:
       - "--maxGaussians 4"
       - "-resource:mills,known=false,training=true,truth=true,prior=12.0 /lupus/ngi/resources/piper/gatk_bundle/2.8/b37/Mills_and_1000G_gold_standard.indels.b37.vcf"
@@ -71,13 +68,14 @@ walkers:
       - "-an SOR"
       - "-an ReadPosRankSum"
       - "-an MQRankSum"
-#      - "-an InbreedingCoeff"
+      - "-an InbreedingCoeff"
      - "-mode INDEL"
+      - "-tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0"
   ApplyRecalibration:
     - "-R /lupus/ngi/resources/piper/gatk_bundle/2.8/b37/human_g1k_v37.fasta"
     - "-nt 16"
     - SNP:
-      - "--ts_filter_level 99.5"
+      - "--ts_filter_level 99.0"
       - "-mode SNP"
     - INDEL:
       - "--ts_filter_level 99.0"
diff --git a/joint_variant_calling.py b/joint_variant_calling.py
index fc72c9d..4ed2d21 100644
--- a/joint_variant_calling.py
+++ b/joint_variant_calling.py
@@ -9,6 +9,7 @@
 from walkers.SelectVariants import SelectVariants
 from walkers.VariantRecalibrator import VariantRecalibrator
 from walkers.ApplyRecalibration import ApplyRecalibration
+from walkers.VQSR import VQSR
 from utils.config import CONFIG
 
 
@@ -25,7 +26,7 @@ def check_configuration():
 
     :returns: True if all checks succed, false otherwise
     """
-    mandatory_args = ('dry_run', 'scratch', 'batch_size', 'output_header', 'intervals', 'walkers', 'samples', 'uppmax_projects', 'projects' , 'GATK')
+    mandatory_args = ('dry_run', 'scratch', 'batch_size', 'output_header', 'intervals', 'walkers', 'uppmax_project', 'GATK')
     for mandatory_arg in mandatory_args:
         if mandatory_arg not in CONFIG:
             print "ERROR: argument {} is mandatory. If you think it does not apply leave it empty".format(mandatory_arg)
             return False
@@ -38,32 +39,27 @@ def check_configuration():
             return False
     #now at least I know all keys are present
-    if CONFIG["samples"] is None and CONFIG["projects"] is None and not os.path.exists("00_samples.txt"):
-        print "ERROR: at least one one between projects and/or samples must contain a list. In alternative specify the file 00_"
-        return False
-    #if project is not none than uppmax-project need do be specifed
-    if CONFIG["projects"] is not None and CONFIG["uppmax_projects"] is None:
-        print "ERROR: if projects are specified their uppmax-project need to be specified as well"
+    if ("samples" not in CONFIG or CONFIG["samples"] is None) and not os.path.exists("00_samples.txt"):
+        print "ERROR: either samples in the configuration file or the file 00_samples.txt must contain a list of samples"
         return False
     #check tha provided uppmax projects exists (if provided)
-    if CONFIG["uppmax_projects"] is not None:
-        for uppmax_project in CONFIG["uppmax_projects"]:
-            if not os.path.exists("/proj/{}".format(uppmax_project)):
-                print "ERROR: uppmax project {} does not exists.".format(uppmax_project)
-                return False
+    if CONFIG["uppmax_project"] is not None:
+        if not os.path.exists("/proj/{}".format(CONFIG["uppmax_project"])):
+            print "ERROR: uppmax project {} does not exist.".format(CONFIG["uppmax_project"])
+            return False
     #do the same sanity check for samples and create the to be join called list
     CONFIG["samples_JC"] = []
-    if CONFIG["samples"] is not None:
+    if os.path.exists("00_samples.txt"):
+        print "WARNING: file 00_samples.txt exists, only the samples listed in this file will be used; samples specified in the config file will be ignored."
+        with open("00_samples.txt", "r") as samplesFile:
+            for sample in samplesFile:
+                CONFIG["samples_JC"].append(sample.rstrip())
+    else:
         for sample in CONFIG["samples"]:
             if not os.path.exists("{}".format(sample)):
                 print "ERROR: sample {} does not exists.".format(sample)
                 return False
             CONFIG["samples_JC"].append(sample)
-    if CONFIG["projects"] is not None:
-        for project in CONFIG["projects"]:
-            for uppmax_project in CONFIG["uppmax_projects"]:
-                #for each project check the directory that is supposed to contain the abalysis
-                CONFIG["samples_JC"] += find_VCF(project, uppmax_project)
     #check for duplicates, if identified stop computation
     samples_to_be_joint_called_hash = {}
     for sample in CONFIG["samples_JC"]:
@@ -89,19 +85,12 @@ def check_configuration():
 def main(args):
     config = conf.load_yaml_config(args.configuration)
     if not check_configuration():
-        sys.exit("ERROR: configuraiton file was malformed, please edit it and retry")
+        sys.exit("ERROR: configuration file was malformed, please edit it and retry")
     #store in a file path to vcf that are going to be analysed
     if args.resume and os.path.exists("00_samples.txt"):
-        sys.exit("ERROR: --resume specified, however 00_samples.txt found. Please if you want to resume analysis, remove/move 00_samples.txt, 02_GenotypeGVCF, 03_ ... ")
-    if os.path.exists("00_samples.txt"):
-        print "WARNING: file 00_samples.txt exists, I will replace samples in config file with those stored in this file."
-        #delete samples sorted in previous step
-        CONFIG["samples_JC"] = []
-        with open("00_samples.txt", "r") as samplesFile:
-            for sample in samplesFile:
-                CONFIG["samples_JC"].append(sample.rstrip())
-    elif not args.resume:
-        #otherwise create the file and store the samples
+        sys.exit("ERROR: --resume specified, however 00_samples.txt was found. If you want to resume the analysis, remove/move 00_samples.txt, 02_GenotypeGVCF, 03_ ... ")
+    if not args.resume:
+        #create the file 00_samples.txt to prevent an existing analysis from being deleted by mistake
         with open("00_samples.txt", "w") as samplesFile:
             for sample in CONFIG["samples_JC"]:
                 samplesFile.write("{}\n".format(sample))
@@ -129,20 +118,26 @@ def main(args):
         if not CONFIG["dry_run"]:
             slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
     #now perofmr VQSR
-    #start with select variants and variant evaluation
-    sbatch_files = SelectVariants()
-    #and execute
-    if not CONFIG["dry_run"]:
-        slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
-    #then perfomr VQSR
-    sbatch_files = VariantRecalibrator()
-    #and execute
-    if not CONFIG["dry_run"]:
-        slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
-    #than ApplyRecalibration
-    sbatch_files = ApplyRecalibration()
-    if not CONFIG["dry_run"]:
-        slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
+    if args.mixed_positions:
+        sbatch_files = VQSR()
+        #and execute
+        if not CONFIG["dry_run"]:
+            slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
+    else:
+        #start with select variants and variant evaluation
+        sbatch_files = SelectVariants()
+        #and execute
+        if not CONFIG["dry_run"]:
+            slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
+        #then perform VQSR
+        sbatch_files = VariantRecalibrator()
+        #and execute
+        if not CONFIG["dry_run"]:
+            slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
+        #then ApplyRecalibration
+        sbatch_files = ApplyRecalibration()
+        if not CONFIG["dry_run"]:
+            slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
@@ -151,6 +146,7 @@ def main(args):
     parser = argparse.ArgumentParser("""Scripts performs join variant calling on all samples provided, or on all samples analysed belonging to the projects specified""")
     parser.add_argument('--configuration', help="configuration file, give a look to the example one to see what to do", type=str)
     parser.add_argument('--resume', action='store_true', help="if this option is specified 01_Combine_GVCFs needs to be created and populated, all other folders need to be deleted/moved. In this modality the script deletes only the last batch (if needed) and creates a n new (or more) extra batches to account for new samples", default=False )
+    parser.add_argument('--mixed-positions', help="with this option VQSR is run in a single step over the whole call set, as suggested in http://gatkforums.broadinstitute.org/gatk/discussion/2805/howto-recalibrate-variant-quality-scores-run-vqsr (this is the recommended way)", action='store_true')
     args = parser.parse_args()
     main(args)
diff --git a/walkers/ApplyRecalibration.py b/walkers/ApplyRecalibration.py
index 451462d..7249014 100644
--- a/walkers/ApplyRecalibration.py
+++ b/walkers/ApplyRecalibration.py
@@ -5,6 +5,7 @@
 
 from utils.config import CONFIG
 from common import atoi, natural_keys
+from common import slurm_header
 
 
@@ -28,16 +29,9 @@
     #create the sbatch file to merge all varaints or to copy the already single one
     sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
     with open(sbatch_file, "w") as ApplyRecalibration:
-        ApplyRecalibration.write("#!/bin/bash -l\n")
-        ApplyRecalibration.write("#SBATCH -A ngi2016003\n")
-        ApplyRecalibration.write("#SBATCH -p node\n")
-        ApplyRecalibration.write("#SBATCH -n 16\n")
-        ApplyRecalibration.write("#SBATCH -t 10-00:00:00\n")
-        ApplyRecalibration.write("#SBATCH -J {}\n".format(job_name))
-        ApplyRecalibration.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
-        ApplyRecalibration.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
-        ApplyRecalibration.write("module load bioinfo-tools\n")
-        ApplyRecalibration.write("module load GATK/3.5.0\n")
+        slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
+        ApplyRecalibration.write(slurm)
+        ApplyRecalibration.write("\n")
         if scratch:
             ApplyRecalibration.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory
diff --git a/walkers/CatVariants.py b/walkers/CatVariants.py
index 1962f4c..c45d784 100644
--- a/walkers/CatVariants.py
+++ b/walkers/CatVariants.py
@@ -5,6 +5,7 @@
 
 from utils.config import CONFIG
 from common import atoi, natural_keys
+from common import slurm_header
 
 
@@ -23,16 +24,9 @@
     #create the sbatch file to merge all varaints or to copy the already single one
     sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
     with open(sbatch_file, "w") as CatVariants:
-        CatVariants.write("#!/bin/bash -l\n")
-        CatVariants.write("#SBATCH -A ngi2016003\n")
-        CatVariants.write("#SBATCH -p node\n")
-        CatVariants.write("#SBATCH -n 16\n")
-        CatVariants.write("#SBATCH -t 10-00:00:00\n")
-        CatVariants.write("#SBATCH -J {}\n".format(job_name))
-        CatVariants.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
-        CatVariants.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
-        CatVariants.write("module load bioinfo-tools\n")
-        CatVariants.write("module load GATK/3.5.0\n")
+        slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
+        CatVariants.write(slurm)
+        CatVariants.write("\n")
         if len(CONFIG["intervals_list"]) == 0:
             #in this case I need only to copy the already single file
diff --git a/walkers/CombineGVCF.py b/walkers/CombineGVCF.py
index c08353a..dab9bc2 100644
--- a/walkers/CombineGVCF.py
+++ b/walkers/CombineGVCF.py
@@ -3,6 +3,7 @@
 import re
 
 from utils.config import CONFIG
+from common import slurm_header
 
 
 def build_CombineGVCFs_sbatch(working_dir, batch, current_batch, scratch=False, interval=None):
@@ -28,16 +29,9 @@ def build_CombineGVCFs_sbatch(working_dir, batch, current_batch, scratch=False, interval=None):
     #create the sbatch file to analyse the current batch of samples
     sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
     with open(sbatch_file, "w") as CombineGVCFsFile:
-        CombineGVCFsFile.write("#!/bin/bash -l\n")
-        CombineGVCFsFile.write("#SBATCH -A ngi2016003\n")
-        CombineGVCFsFile.write("#SBATCH -p node\n")
-        CombineGVCFsFile.write("#SBATCH -n 8\n")
-        CombineGVCFsFile.write("#SBATCH -t 10-00:00:00\n")
-        CombineGVCFsFile.write("#SBATCH -J {}\n".format(job_name))
-        CombineGVCFsFile.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
-        CombineGVCFsFile.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
-        CombineGVCFsFile.write("module load bioinfo-tools\n")
-        CombineGVCFsFile.write("module load GATK/3.5.0\n")
+        slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
+        CombineGVCFsFile.write(slurm)
+        CombineGVCFsFile.write("\n")
         #rsync to scratch all samples
         if scratch:
             CombineGVCFsFile.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory
diff --git a/walkers/GenotypeGVCFs.py b/walkers/GenotypeGVCFs.py
index 3a563bb..51839fc 100644
--- a/walkers/GenotypeGVCFs.py
+++ b/walkers/GenotypeGVCFs.py
@@ -2,7 +2,7 @@
 import re
 
 from utils.config import CONFIG
-
+from common import slurm_header
 
 def build_GenotypeGVCFs_sbatch(working_dir, combined_gvcf_files, scratch=False, interval=None):
@@ -28,16 +28,9 @@ def build_GenotypeGVCFs_sbatch(working_dir, combined_gvcf_files, scratch=False, interval=None):
     #create the sbatch file to analyse the current batch of samples
     sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
     with open(sbatch_file, "w") as GenotypeGVCFs:
-        GenotypeGVCFs.write("#!/bin/bash -l\n")
-        GenotypeGVCFs.write("#SBATCH -A ngi2016003\n")
-        GenotypeGVCFs.write("#SBATCH -p node\n")
-        GenotypeGVCFs.write("#SBATCH -n 16\n")
-        GenotypeGVCFs.write("#SBATCH -t 10-00:00:00\n")
-        GenotypeGVCFs.write("#SBATCH -J {}\n".format(job_name))
-        GenotypeGVCFs.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
-        GenotypeGVCFs.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
-        GenotypeGVCFs.write("module load bioinfo-tools\n")
-        GenotypeGVCFs.write("module load GATK/3.5.0\n")
+        slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
+        GenotypeGVCFs.write(slurm)
+        GenotypeGVCFs.write("\n")
         #rsync to scratch all samples
         if scratch:
             GenotypeGVCFs.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory
diff --git a/walkers/SelectVariants.py b/walkers/SelectVariants.py
index 4272b50..a62dd5e 100644
--- a/walkers/SelectVariants.py
+++ b/walkers/SelectVariants.py
@@ -5,6 +5,7 @@
 
 from utils.config import CONFIG
 from common import atoi, natural_keys
+from common import slurm_header
 
 
@@ -26,16 +27,9 @@
     #create the sbatch file to merge all varaints or to copy the already single one
     sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
     with open(sbatch_file, "w") as SelectVariants:
-        SelectVariants.write("#!/bin/bash -l\n")
-        SelectVariants.write("#SBATCH -A ngi2016003\n")
-        SelectVariants.write("#SBATCH -p node\n")
-        SelectVariants.write("#SBATCH -n 16\n")
-        SelectVariants.write("#SBATCH -t 10-00:00:00\n")
-        SelectVariants.write("#SBATCH -J {}\n".format(job_name))
-        SelectVariants.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
-        SelectVariants.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
-        SelectVariants.write("module load bioinfo-tools\n")
-        SelectVariants.write("module load GATK/3.5.0\n")
+        slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
+        SelectVariants.write(slurm)
+        SelectVariants.write("\n")
         if scratch:
             SelectVariants.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory
diff --git a/walkers/VQSR.py b/walkers/VQSR.py
new file mode 100644
index 0000000..25af34c
--- /dev/null
+++ b/walkers/VQSR.py
@@ -0,0 +1,213 @@
+import sys, os, glob
+import random
+import subprocess
+import re
+
+from utils.config import CONFIG
+from common import atoi, natural_keys
+from common import slurm_header
+
+
+#### this applies VQSR as explained in
+#### http://gatkforums.broadinstitute.org/gatk/discussion/2805/howto-recalibrate-variant-quality-scores-run-vqsr
+#### it solves the problem with mixed positions
+
+
+def build_VQSR_sbatch(working_dir, variant_raw, scratch=False):
+    """Builds the sbatch file in order to run VQSR
+
+    :param str working_dir: directory where files will be created
+    :param str variant_raw: vcf containing the raw variants
+    :param bool scratch: if True works on scratch
+
+    :returns: path to the sbatch file
+    """
+
+    job_name = "VQSR"
+    #first build the model for SNPs
+    racal_file_name_snps = "{}_joincalled.snp.recal".format(CONFIG["output_header"])
+    tranches_file_name_snps = "{}_joincalled.snp.tranches".format(CONFIG["output_header"])
+    #apply the model to SNPs only
+    variant_recal_snp_raw_indels = "{}_joincalled.recal_snp_raw_indels.vcf.gz".format(CONFIG["output_header"])
+    #and then build the model for INDELs
+    racal_file_name_indels = "{}_joincalled.indel.recal".format(CONFIG["output_header"])
+    tranches_file_name_indels = "{}_joincalled.indel.tranches".format(CONFIG["output_header"])
+    variant_recal_snp_recal_indels = "{}_joincalled.recal_snp_recal_indels.vcf.gz".format(CONFIG["output_header"])
+    #create the sbatch file that runs the whole VQSR procedure
+    sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
+    with open(sbatch_file, "w") as VQSR:
+        slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
+        VQSR.write(slurm)
+        VQSR.write("\n")
+        ##############################################
+        #### compute recalibration tables for SNPs ###
+        ##############################################
+        if scratch:
+            VQSR.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory
+            VQSR.write("mkdir -p $SNIC_TMP/{}/VCF/ \n".format(job_name)) # create tmp directory
+        GATK_input = "-input {} \\\n".format(variant_raw)
+        if scratch:
+            VQSR.write("rsync -rptoDLv {}* $SNIC_TMP/{}/\n".format(variant_raw, job_name))
+            variant_raw_name = os.path.basename(variant_raw)
+            GATK_input = "-input $SNIC_TMP/{}/{} \\\n".format(job_name, variant_raw_name)
+        GATK_command = "java -Xmx64g -jar {} -T VariantRecalibrator \\\n".format(CONFIG["GATK"])
+        #add standard options
+        for option in CONFIG["walkers"]["VariantRecalibrator"]:
+            if isinstance(option, basestring):
+                GATK_command += "{} \\\n".format(option)
+        #now add the SNP specific options
+        added = False
+        for option in CONFIG["walkers"]["VariantRecalibrator"]:
+            if not isinstance(option, basestring) and "SNP" in option:
+                specific_options = option["SNP"]
+                added = True
+                for specific_option in specific_options:
+                    GATK_command += "{} \\\n".format(specific_option)
+        if not added:
+            print "WARNING: no SNP specific options were added in the VQSR step, there should be either SNP or INDEL specific options"
+        GATK_command += GATK_input
+        if scratch:
+            GATK_command += "-recalFile $SNIC_TMP/{}/VCF/{} \\\n".format(job_name, racal_file_name_snps)
+            GATK_command += "-tranchesFile $SNIC_TMP/{}/VCF/{} \n\n".format(job_name, tranches_file_name_snps)
+            GATK_command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/ \n".format(job_name, racal_file_name_snps , working_dir)
+            GATK_command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/ \n".format(job_name, tranches_file_name_snps , working_dir)
+        else:
+            GATK_command += "-recalFile {}/VCF/{} \\\n".format(working_dir, racal_file_name_snps)
+            GATK_command += "-tranchesFile {}/VCF/{} \n\n".format(working_dir, tranches_file_name_snps)
+        VQSR.write(GATK_command)
+        VQSR.write("\n")
+        ##########################################
+        ##### now apply recalibration for SNPs ###
+        ##########################################
+        GATK_command = "java -Xmx64g -jar {} -T ApplyRecalibration \\\n".format(CONFIG["GATK"])
+        #### GATK_input is the same
+        if scratch:
+            GATK_command += "-recalFile $SNIC_TMP/{}/VCF/{} \\\n".format(job_name, racal_file_name_snps)
+            GATK_command += "-tranchesFile $SNIC_TMP/{}/VCF/{} \\\n".format(job_name, tranches_file_name_snps)
+        else:
+            GATK_command += "-recalFile {}/VCF/{} \\\n".format(working_dir, racal_file_name_snps)
+            GATK_command += "-tranchesFile {}/VCF/{} \\\n".format(working_dir, tranches_file_name_snps)
+        GATK_command += GATK_input
+        #add standard options
+        for option in CONFIG["walkers"]["ApplyRecalibration"]:
+            if isinstance(option, basestring):
+                GATK_command += "{} \\\n".format(option)
+        #now add the SNP specific options
+        added = False
+        for option in CONFIG["walkers"]["ApplyRecalibration"]:
+            if not isinstance(option, basestring) and "SNP" in option:
+                specific_options = option["SNP"]
+                added = True
+                for specific_option in specific_options:
+                    GATK_command += "{} \\\n".format(specific_option)
+        if not added:
+            print "WARNING: no SNP specific options were added in the VQSR step, there should be either SNP or INDEL specific options"
+
+        if scratch:
+            GATK_command += "-o $SNIC_TMP/{}/VCF/{} \n\n".format(job_name, variant_recal_snp_raw_indels)
+            GATK_command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/ \n".format(job_name, variant_recal_snp_raw_indels , working_dir)
+        else:
+            GATK_command += "-o {}/VCF/{} \n\n".format(working_dir, variant_recal_snp_raw_indels)
+        VQSR.write(GATK_command)
+        VQSR.write("\n")
+        ################################################
+        #### compute recalibration tables for INDELs ###
+        ################################################
+        GATK_input = "-input {}/VCF/{} \\\n".format(working_dir, variant_recal_snp_raw_indels)
+        if scratch:
+            GATK_input = "-input $SNIC_TMP/{}/VCF/{} \\\n".format(job_name, variant_recal_snp_raw_indels)
+        GATK_command = "java -Xmx64g -jar {} -T VariantRecalibrator \\\n".format(CONFIG["GATK"])
+        #add standard options
+        for option in CONFIG["walkers"]["VariantRecalibrator"]:
+            if isinstance(option, basestring):
+                GATK_command += "{} \\\n".format(option)
+        #now add the INDEL specific options
+        added = False
+        for option in CONFIG["walkers"]["VariantRecalibrator"]:
+            if not isinstance(option, basestring) and "INDEL" in option:
+                specific_options = option["INDEL"]
+                added = True
+                for specific_option in specific_options:
+                    GATK_command += "{} \\\n".format(specific_option)
+        if not added:
+            print "WARNING: no INDEL specific options were added in the VQSR step, there should be either SNP or INDEL specific options"
+        GATK_command += GATK_input
+        if scratch:
+            GATK_command += "-recalFile $SNIC_TMP/{}/VCF/{} \\\n".format(job_name, racal_file_name_indels)
+            GATK_command += "-tranchesFile $SNIC_TMP/{}/VCF/{} \n\n".format(job_name, tranches_file_name_indels)
+            GATK_command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/ \n".format(job_name, racal_file_name_indels , working_dir)
+            GATK_command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/ \n".format(job_name, tranches_file_name_indels , working_dir)
+        else:
+            GATK_command += "-recalFile {}/VCF/{} \\\n".format(working_dir, racal_file_name_indels)
+            GATK_command += "-tranchesFile {}/VCF/{} \n\n".format(working_dir, tranches_file_name_indels)
+        VQSR.write(GATK_command)
+        VQSR.write("\n")
+        ############################################
+        ##### now apply recalibration for INDELs ###
+        ############################################
+        GATK_command = "java -Xmx64g -jar {} -T ApplyRecalibration \\\n".format(CONFIG["GATK"])
+        #### GATK_input is the same
+        if scratch:
+            GATK_command += "-recalFile $SNIC_TMP/{}/VCF/{} \\\n".format(job_name, racal_file_name_indels)
+            GATK_command += "-tranchesFile $SNIC_TMP/{}/VCF/{} \\\n".format(job_name, tranches_file_name_indels)
+        else:
+            GATK_command += "-recalFile {}/VCF/{} \\\n".format(working_dir, racal_file_name_indels)
+            GATK_command += "-tranchesFile {}/VCF/{} \\\n".format(working_dir, tranches_file_name_indels)
+        GATK_command += GATK_input
+        #add standard options
+        for option in CONFIG["walkers"]["ApplyRecalibration"]:
+            if isinstance(option, basestring):
+                GATK_command += "{} \\\n".format(option)
+        #now add the INDEL specific options
+        added = False
+        for option in CONFIG["walkers"]["ApplyRecalibration"]:
+            if not isinstance(option, basestring) and "INDEL" in option:
+                specific_options = option["INDEL"]
+                added = True
+                for specific_option in specific_options:
+                    GATK_command += "{} \\\n".format(specific_option)
+        if not added:
+            print "WARNING: no INDEL specific options were added in the VQSR step, there should be either SNP or INDEL specific options"
+
+        if scratch:
+            GATK_command += "-o $SNIC_TMP/{}/VCF/{} \n\n".format(job_name, variant_recal_snp_recal_indels)
+            GATK_command += "rsync $SNIC_TMP/{}/VCF/{}* {}/VCF/ \n".format(job_name, variant_recal_snp_recal_indels , working_dir)
+        else:
+            GATK_command += "-o {}/VCF/{} \n\n".format(working_dir, variant_recal_snp_recal_indels)
+
+        VQSR.write(GATK_command)
+        VQSR.write("\n")
+
+    return sbatch_file
+
+
+
+
+def VQSR():
+    """Run VQSR
+
+    :returns: list with the sbatch file to be executed
+    """
+    cwd = os.getcwd()
+    sbatch_files = []
+    if not os.path.isdir(os.path.join(cwd, "03_CatVariants")):
+        sys.exit("Directory 03_CatVariants does not exist, something went wrong here.")
+    if os.path.isdir(os.path.join(cwd, "04_VQSR")):
+        print "WARNING: 04_VQSR already present, assuming this step has been completed with success."
+        return sbatch_files
+    else:
+        #create the folder structure
+        os.mkdir(os.path.join(cwd, "04_VQSR"))
+        os.mkdir(os.path.join(cwd, "04_VQSR", "sbatch"))
+        os.mkdir(os.path.join(cwd, "04_VQSR", "std_err"))
+        os.mkdir(os.path.join(cwd, "04_VQSR", "std_out"))
+        os.mkdir(os.path.join(cwd, "04_VQSR", "VCF"))
+    #Build the sbatch file for the VQSR step
+    working_dir = os.path.join(cwd, "04_VQSR")
+    variant_file = os.path.join(cwd, "03_CatVariants", "VCF", "{}_joincalled.g.vcf.gz".format(CONFIG["output_header"]))
+    sbatch_file = build_VQSR_sbatch(working_dir, variant_file, CONFIG["scratch"])
+    sbatch_files.append(sbatch_file)
+    return sbatch_files
+
diff --git a/walkers/VariantRecalibrator.py b/walkers/VariantRecalibrator.py
index 1d8b665..962261f 100644
--- a/walkers/VariantRecalibrator.py
+++ b/walkers/VariantRecalibrator.py
@@ -5,6 +5,7 @@
 
 from utils.config import CONFIG
 from common import atoi, natural_keys
+from common import slurm_header
 
 
@@ -29,16 +30,9 @@ def build_VariantRecalibrator_sbatch(working_dir, variant_raw, type, scratch=False):
     #create the sbatch file to merge all varaints or to copy the already single one
     sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
     with open(sbatch_file, "w") as VariantRecalibrator:
-        VariantRecalibrator.write("#!/bin/bash -l\n")
-        VariantRecalibrator.write("#SBATCH -A ngi2016003\n")
-        VariantRecalibrator.write("#SBATCH -p node\n")
-        VariantRecalibrator.write("#SBATCH -n 16\n")
-        VariantRecalibrator.write("#SBATCH -t 10-00:00:00\n")
-        VariantRecalibrator.write("#SBATCH -J {}\n".format(job_name))
-        VariantRecalibrator.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
-        VariantRecalibrator.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
-        VariantRecalibrator.write("module load bioinfo-tools\n")
-        VariantRecalibrator.write("module load GATK/3.5.0\n")
+        slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
+        VariantRecalibrator.write(slurm)
+        VariantRecalibrator.write("\n")
         if scratch:
             VariantRecalibrator.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory