Skip to content

Commit

Permalink
changed VQSR to take care of mixed positions and made the script a li…
Browse files Browse the repository at this point in the history
…ttle more general (now uppmax prpoject is not hard coded) and less dependent on folder structure (samples are passed via config file or via 00_samples.txt file
  • Loading branch information
Francesco Vezzi committed Jan 30, 2017
1 parent fa8d2c8 commit c216772
Show file tree
Hide file tree
Showing 10 changed files with 295 additions and 112 deletions.
13 changes: 13 additions & 0 deletions common.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,19 @@ def find_VCF(project, uppmax_project):
return samples


def slurm_header(uppmax_project, job_name, working_dir):
header = "#!/bin/bash -l\n"
header += "#SBATCH -A {}\n".format(uppmax_project)
header += "#SBATCH -p node\n"
header += "#SBATCH -n 16\n"
header += "#SBATCH -t 10-00:00:00\n"
header += "#SBATCH -J {}\n".format(job_name)
header += "#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name )
header += "#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name )
header += "module load bioinfo-tools\n"
header += "module load GATK/3.5.0\n"
return header



def submit_jobs(sbatch_files, pending_jobs = None):
Expand Down
14 changes: 6 additions & 8 deletions example/config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ samples:
- /proj/ngi2016003/nobackup/NGI/ANALYSIS/P2652/piper_ngi/07_variant_calls/P2652_239.clean.dedup.recal.bam.genomic.vcf.gz
- /proj/ngi2016003/nobackup/NGI/ANALYSIS/P2652/piper_ngi/07_variant_calls/P2652_241.clean.dedup.recal.bam.genomic.vcf.gz
- /proj/ngi2016003/nobackup/NGI/ANALYSIS/P2652/piper_ngi/07_variant_calls/P2652_244.clean.dedup.recal.bam.genomic.vcf.gz
#Assumes NGI structure, i.e., /proj/{}/nobackup/NGI/ANALYSIS/{}/piper_ngi/07_variant_calls/".format(uppmax_project, project)
projects:
#path to a folder containing an .interval file for each interval we want to work on
intervals: /proj/ngi2016003/nobackup/vezzi/SwedishReferenceGenomeProjects/develop/test/00_intervals
#True if one wants to use scratch
Expand All @@ -20,9 +18,7 @@ batch_size: 4
#how the output file names should be called
output_header: "SRG"
#uppmax project where one need to seach for samples (used only if projects are specified)
uppmax_projects:
- ngi2016001
- ngi2016003
uppmax_project: a201002
#options for walkers
#path to GATK
GATK : "/sw/apps/bioinfo/GATK/3.5.0/GenomeAnalysisTK.jar"
Expand Down Expand Up @@ -59,8 +55,9 @@ walkers:
- "-an FS"
- "-an SOR"
- "-an DP"
# - "-an InbreedingCoeff"
- "-an InbreedingCoeff"
- "-mode SNP"
- "tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0 "
- INDEL:
- "--maxGaussians 4"
- "-resource:mills,known=false,training=true,truth=true,prior=12.0 /lupus/ngi/resources/piper/gatk_bundle/2.8/b37/Mills_and_1000G_gold_standard.indels.b37.vcf"
Expand All @@ -71,13 +68,14 @@ walkers:
- "-an SOR"
- "-an ReadPosRankSum"
- "-an MQRankSum"
# - "-an InbreedingCoeff"
- "-an InbreedingCoeff"
- "-mode INDEL"
- "-tranche 100.0 -tranche 99.9 -tranche 99.0 -tranche 90.0"
ApplyRecalibration:
- "-R /lupus/ngi/resources/piper/gatk_bundle/2.8/b37/human_g1k_v37.fasta"
- "-nt 16"
- SNP:
- "--ts_filter_level 99.5"
- "--ts_filter_level 99.0"
- "-mode SNP"
- INDEL:
- "--ts_filter_level 99.0"
Expand Down
82 changes: 39 additions & 43 deletions joint_variant_calling.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from walkers.SelectVariants import SelectVariants
from walkers.VariantRecalibrator import VariantRecalibrator
from walkers.ApplyRecalibration import ApplyRecalibration
from walkers.VQSR import VQSR

from utils.config import CONFIG

Expand All @@ -25,7 +26,7 @@ def check_configuration():
:returns: True if all checks succed, false otherwise
"""
mandatory_args = ('dry_run', 'scratch', 'batch_size', 'output_header', 'intervals', 'walkers', 'samples', 'uppmax_projects', 'projects' , 'GATK')
mandatory_args = ('dry_run', 'scratch', 'batch_size', 'output_header', 'intervals', 'walkers', 'uppmax_project', 'GATK')
for mandatory_arg in mandatory_args:
if mandatory_arg not in CONFIG:
print "ERROR: argument {} is mandatory. If you think it does not apply leave it empty".format(mandatory_arg)
Expand All @@ -38,32 +39,27 @@ def check_configuration():
return False

#now at least I know all keys are present
if CONFIG["samples"] is None and CONFIG["projects"] is None and not os.path.exists("00_samples.txt"):
print "ERROR: at least one one between projects and/or samples must contain a list. In alternative specify the file 00_"
return False
#if project is not none than uppmax-project need do be specifed
if CONFIG["projects"] is not None and CONFIG["uppmax_projects"] is None:
print "ERROR: if projects are specified their uppmax-project need to be specified as well"
if ("samples" not in CONFIG or CONFIG["samples"] is None) and not os.path.exists("00_samples.txt"):
print "ERROR: at least one one between samples and/or 00_samples.txt must contain a list of samples"
return False
#check tha provided uppmax projects exists (if provided)
if CONFIG["uppmax_projects"] is not None:
for uppmax_project in CONFIG["uppmax_projects"]:
if not os.path.exists("/proj/{}".format(uppmax_project)):
print "ERROR: uppmax project {} does not exists.".format(uppmax_project)
return False
if CONFIG["uppmax_project"] is not None:
if not os.path.exists("/proj/{}".format(CONFIG["uppmax_project"])):
print "ERROR: uppmax project {} does not exists.".format(CONFIG["uppmax_project"])
return False
#do the same sanity check for samples and create the to be join called list
CONFIG["samples_JC"] = []
if CONFIG["samples"] is not None:
if os.path.exists("00_samples.txt"):
print "WARNING: file 00_samples.txt exists, I will use only samples specified in this file and I will not give a shit about those in the config."
with open("00_samples.txt", "r") as samplesFile:
for sample in samplesFile:
CONFIG["samples_JC"].append(sample.rstrip())
else:
for sample in CONFIG["samples"]:
if not os.path.exists("{}".format(sample)):
print "ERROR: sample {} does not exists.".format(sample)
return False
CONFIG["samples_JC"].append(sample)
if CONFIG["projects"] is not None:
for project in CONFIG["projects"]:
for uppmax_project in CONFIG["uppmax_projects"]:
#for each project check the directory that is supposed to contain the abalysis
CONFIG["samples_JC"] += find_VCF(project, uppmax_project)
#check for duplicates, if identified stop computation
samples_to_be_joint_called_hash = {}
for sample in CONFIG["samples_JC"]:
Expand All @@ -89,19 +85,12 @@ def check_configuration():
def main(args):
config = conf.load_yaml_config(args.configuration)
if not check_configuration():
sys.exit("ERROR: configuraiton file was malformed, please edit it and retry")
sys.exit("ERROR: configuration file was malformed, please edit it and retry")
#store in a file path to vcf that are going to be analysed
if args.resume and os.path.exists("00_samples.txt"):
sys.exit("ERROR: --resume specified, however 00_samples.txt found. Please if you want to resume analysis, remove/move 00_samples.txt, 02_GenotypeGVCF, 03_ ... ")
if os.path.exists("00_samples.txt"):
print "WARNING: file 00_samples.txt exists, I will replace samples in config file with those stored in this file."
#delete samples sorted in previous step
CONFIG["samples_JC"] = []
with open("00_samples.txt", "r") as samplesFile:
for sample in samplesFile:
CONFIG["samples_JC"].append(sample.rstrip())
elif not args.resume:
#otherwise create the file and store the samples
sys.exit("ERROR: -- resume specified, however 00_samples.txt found. Please if you want to resume analysis, remove/move 00_samples.txt, 02_GenotypeGVCF, 03_ ... ")
if not args.resume:
#create the file 00_samples.txt in order to prevent deleting by mistake analysis
with open("00_samples.txt", "w") as samplesFile:
for sample in CONFIG["samples_JC"]:
samplesFile.write("{}\n".format(sample))
Expand Down Expand Up @@ -129,20 +118,26 @@ def main(args):
if not CONFIG["dry_run"]:
slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
#now perofmr VQSR
#start with select variants and variant evaluation
sbatch_files = SelectVariants()
#and execute
if not CONFIG["dry_run"]:
slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
#then perfomr VQSR
sbatch_files = VariantRecalibrator()
#and execute
if not CONFIG["dry_run"]:
slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
#than ApplyRecalibration
sbatch_files = ApplyRecalibration()
if not CONFIG["dry_run"]:
slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
if args.mixed_positions:
sbatch_files = VQSR()
#and execute
if not CONFIG["dry_run"]:
slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
else:
#start with select variants and variant evaluation
sbatch_files = SelectVariants()
#and execute
if not CONFIG["dry_run"]:
slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
#then perfomr VQSR
sbatch_files = VariantRecalibrator()
#and execute
if not CONFIG["dry_run"]:
slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)
#than ApplyRecalibration
sbatch_files = ApplyRecalibration()
if not CONFIG["dry_run"]:
slurm_jobs_id = submit_jobs(sbatch_files, slurm_jobs_id)



Expand All @@ -151,6 +146,7 @@ def main(args):
parser = argparse.ArgumentParser("""Scripts performs join variant calling on all samples provided, or on all samples analysed belonging to the projects specified""")
parser.add_argument('--configuration', help="configuration file, give a look to the example one to see what to do", type=str)
parser.add_argument('--resume', action='store_true', help="if this option is specified 01_Combine_GVCFs needs to be created and populated, all other folders need to be deleted/moved. In this modality the script deletes only the last batch (if needed) and creates a n new (or more) extra batches to account for new samples", default=False )
parser.add_argument('--mixed-positions', help="With this option run the BP as suggested in http://gatkforums.broadinstitute.org/gatk/discussion/2805/howto-recalibrate-variant-quality-scores-run-vqsr (this is the reccomended way)", action='store_true')
args = parser.parse_args()
main(args)

Expand Down
14 changes: 4 additions & 10 deletions walkers/ApplyRecalibration.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from utils.config import CONFIG
from common import atoi, natural_keys
from common import slurm_header



Expand All @@ -28,16 +29,9 @@ def build_ApplyRecalibration_sbatch(working_dir, variant_raw, recal, tranches,
#create the sbatch file to merge all varaints or to copy the already single one
sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
with open(sbatch_file, "w") as ApplyRecalibration:
ApplyRecalibration.write("#!/bin/bash -l\n")
ApplyRecalibration.write("#SBATCH -A ngi2016003\n")
ApplyRecalibration.write("#SBATCH -p node\n")
ApplyRecalibration.write("#SBATCH -n 16\n")
ApplyRecalibration.write("#SBATCH -t 10-00:00:00\n")
ApplyRecalibration.write("#SBATCH -J {}\n".format(job_name))
ApplyRecalibration.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
ApplyRecalibration.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
ApplyRecalibration.write("module load bioinfo-tools\n")
ApplyRecalibration.write("module load GATK/3.5.0\n")
slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
ApplyRecalibration.write(slurm)
ApplyRecalibration.write("\n")

if scratch:
ApplyRecalibration.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory
Expand Down
14 changes: 4 additions & 10 deletions walkers/CatVariants.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from utils.config import CONFIG
from common import atoi, natural_keys
from common import slurm_header



Expand All @@ -23,16 +24,9 @@ def build_CatVariants_sbatch(working_dir, variants_dir, scratch=False):
#create the sbatch file to merge all varaints or to copy the already single one
sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
with open(sbatch_file, "w") as CatVariants:
CatVariants.write("#!/bin/bash -l\n")
CatVariants.write("#SBATCH -A ngi2016003\n")
CatVariants.write("#SBATCH -p node\n")
CatVariants.write("#SBATCH -n 16\n")
CatVariants.write("#SBATCH -t 10-00:00:00\n")
CatVariants.write("#SBATCH -J {}\n".format(job_name))
CatVariants.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
CatVariants.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
CatVariants.write("module load bioinfo-tools\n")
CatVariants.write("module load GATK/3.5.0\n")
slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
CatVariants.write(slurm)
CatVariants.write("\n")

if len(CONFIG["intervals_list"]) == 0:
#in this case I need only to copy the already single file
Expand Down
14 changes: 4 additions & 10 deletions walkers/CombineGVCF.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import re

from utils.config import CONFIG
from common import slurm_header


def build_CombineGVCFs_sbatch(working_dir, batch, current_batch, scratch=False, interval=None):
Expand All @@ -28,16 +29,9 @@ def build_CombineGVCFs_sbatch(working_dir, batch, current_batch, scratch=False,
#create the sbatch file to analyse the current batch of samples
sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
with open(sbatch_file, "w") as CombineGVCFsFile:
CombineGVCFsFile.write("#!/bin/bash -l\n")
CombineGVCFsFile.write("#SBATCH -A ngi2016003\n")
CombineGVCFsFile.write("#SBATCH -p node\n")
CombineGVCFsFile.write("#SBATCH -n 8\n")
CombineGVCFsFile.write("#SBATCH -t 10-00:00:00\n")
CombineGVCFsFile.write("#SBATCH -J {}\n".format(job_name))
CombineGVCFsFile.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
CombineGVCFsFile.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
CombineGVCFsFile.write("module load bioinfo-tools\n")
CombineGVCFsFile.write("module load GATK/3.5.0\n")
slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
CombineGVCFsFile.write(slurm)
CombineGVCFsFile.write("\n")
#rsync to scratch all samples
if scratch:
CombineGVCFsFile.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory
Expand Down
15 changes: 4 additions & 11 deletions walkers/GenotypeGVCFs.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import re

from utils.config import CONFIG

from common import slurm_header


def build_GenotypeGVCFs_sbatch(working_dir, combined_gvcf_files, scratch=False, interval=None):
Expand All @@ -28,16 +28,9 @@ def build_GenotypeGVCFs_sbatch(working_dir, combined_gvcf_files, scratch=False,
#create the sbatch file to analyse the current batch of samples
sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
with open(sbatch_file, "w") as GenotypeGVCFs:
GenotypeGVCFs.write("#!/bin/bash -l\n")
GenotypeGVCFs.write("#SBATCH -A ngi2016003\n")
GenotypeGVCFs.write("#SBATCH -p node\n")
GenotypeGVCFs.write("#SBATCH -n 16\n")
GenotypeGVCFs.write("#SBATCH -t 10-00:00:00\n")
GenotypeGVCFs.write("#SBATCH -J {}\n".format(job_name))
GenotypeGVCFs.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
GenotypeGVCFs.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
GenotypeGVCFs.write("module load bioinfo-tools\n")
GenotypeGVCFs.write("module load GATK/3.5.0\n")
slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
GenotypeGVCFs.write(slurm)
GenotypeGVCFs.write("\n")
#rsync to scratch all samples
if scratch:
GenotypeGVCFs.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory
Expand Down
14 changes: 4 additions & 10 deletions walkers/SelectVariants.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from utils.config import CONFIG
from common import atoi, natural_keys
from common import slurm_header



Expand All @@ -26,16 +27,9 @@ def build_SelectVariants_sbatch(working_dir, variant_file, scratch=False):
#create the sbatch file to merge all varaints or to copy the already single one
sbatch_file = os.path.join(working_dir, "sbatch", "{}.sbatch".format(job_name))
with open(sbatch_file, "w") as SelectVariants:
SelectVariants.write("#!/bin/bash -l\n")
SelectVariants.write("#SBATCH -A ngi2016003\n")
SelectVariants.write("#SBATCH -p node\n")
SelectVariants.write("#SBATCH -n 16\n")
SelectVariants.write("#SBATCH -t 10-00:00:00\n")
SelectVariants.write("#SBATCH -J {}\n".format(job_name))
SelectVariants.write("#SBATCH -o {}/std_out/{}.out\n".format(working_dir, job_name ))
SelectVariants.write("#SBATCH -e {}/std_err/{}.err\n".format(working_dir, job_name ))
SelectVariants.write("module load bioinfo-tools\n")
SelectVariants.write("module load GATK/3.5.0\n")
slurm = slurm_header(CONFIG["uppmax_project"], working_dir, job_name)
SelectVariants.write(slurm)
SelectVariants.write("\n")

if scratch:
SelectVariants.write("mkdir -p $SNIC_TMP/{} \n".format(job_name)) # create tmp directory
Expand Down
Loading

0 comments on commit c216772

Please sign in to comment.