From f044be305f51e13131685a7e5f6bcc5f22e28754 Mon Sep 17 00:00:00 2001 From: Stella Date: Tue, 30 Jan 2024 12:54:35 -0500 Subject: [PATCH 01/21] adding files for CODEC pipeline --- CODEC/SingleSampleCODEC.wdl | 1038 ++++++++++++++++++++++++++++++++++ CODEC/codec_bcl2fastq.wdl | 91 +++ CODEC/demux_CODEC.wdl | 277 +++++++++ CODEC/prep_codec_metadata.py | 23 + CODEC/sampleIndex.csv | 13 + 5 files changed, 1442 insertions(+) create mode 100644 CODEC/SingleSampleCODEC.wdl create mode 100644 CODEC/codec_bcl2fastq.wdl create mode 100644 CODEC/demux_CODEC.wdl create mode 100644 CODEC/prep_codec_metadata.py create mode 100644 CODEC/sampleIndex.csv diff --git a/CODEC/SingleSampleCODEC.wdl b/CODEC/SingleSampleCODEC.wdl new file mode 100644 index 0000000..71e8d2b --- /dev/null +++ b/CODEC/SingleSampleCODEC.wdl @@ -0,0 +1,1038 @@ +version 1.0 + +workflow SingleSampleCODEC { + input { + String sample_id + File fastq1 + File fastq2 + File reference_fasta + File reference_fasta_index + File reference_dict + File reference_pac + File reference_amb + File reference_ann + File reference_bwt + File reference_sa + File germline_bam + File germline_bam_index + Int num_parallel + String sort_memory + File eval_genome_interval + } + call SplitFastq1 { + input: + fastq_read1 = fastq1, + nsplit = num_parallel, + sample_id = sample_id + } + call SplitFastq2 { + input: + fastq_read2 = fastq2, + nsplit = num_parallel, + sample_id = sample_id + } + scatter (split_index in range(num_parallel)) { + String output_prefix = "${sample_id}_split.${split_index}" + call Trim { + input: + read1 = SplitFastq1.split_read1[split_index], + read2 = SplitFastq2.split_read2[split_index], + output_prefix = output_prefix, + split = split_index, + sample_id = sample_id + } + call AlignRawTrimmed { + input: + bam_input = Trim.trimmed_bam, + reference_fasta = reference_fasta, + reference_fasta_index = reference_fasta_index, + reference_dict = reference_dict, + reference_pac = reference_pac, + reference_amb = reference_amb, + reference_ann = reference_ann, + reference_bwt = reference_bwt, + reference_sa = reference_sa, + sample_id = sample_id, + split = split_index + } + call ZipperBamAlignment { + input: + mapped_bam = AlignRawTrimmed.aligned_bam, + unmapped_bam = Trim.trimmed_bam, + reference_fasta = reference_fasta, + reference_fasta_index = reference_fasta_index, + reference_dict = reference_dict, + sample_id = sample_id, + split = split_index, + sort_memory = sort_memory + } + } + call MergeSplit { + input: + bam_files = ZipperBamAlignment.bam, + sample_id = sample_id + } + call MergeLogSplit { + input: + log_files = Trim.trimmed_log, + sample_id = sample_id + } + call SortBam { + input: + bam_file = MergeSplit.merged_bam, + sample_id = sample_id + } + call CDSByProduct { + input: + trim_log = MergeLogSplit.merged_log, + highconf_bam = SortBam.sorted_bam, + sample_id = sample_id + } + call ReplaceRawReadGroup { + input: + raw_bam = MergeSplit.merged_bam, + sample_id = sample_id + } + call MarkRawDuplicates { + input: + input_bam = ReplaceRawReadGroup.bam, + sample_id = sample_id + } + call CollectInsertSizeMetrics { + input: + input_bam = MarkRawDuplicates.dup_marked_bam, + sample_id = sample_id + } + call GroupReadByUMI { + input: + input_bam = MarkRawDuplicates.dup_marked_bam, + sample_id = sample_id + } + call FgbioCollapseReadFamilies { + input: + grouped_umi_bam = GroupReadByUMI.groupbyumi_bam, + sample_id = sample_id + } + call AlignMolecularConsensusReads { + input: + mol_consensus_bam = 
FgbioCollapseReadFamilies.mol_consensus_bam, + sample_id = sample_id, + reference_fasta = reference_fasta, + reference_fasta_index = reference_fasta_index, + reference_dict = reference_dict, + reference_pac = reference_pac, + reference_amb = reference_amb, + reference_ann = reference_ann, + reference_bwt = reference_bwt, + reference_sa = reference_sa + } + call MergeAndSortMoleculeConsensusReads { + input: + mapped_sam = AlignMolecularConsensusReads.aligned_bam, + unmapped_bam = FgbioCollapseReadFamilies.mol_consensus_bam, + reference_fasta = reference_fasta, + reference_fasta_index = reference_fasta_index, + reference_dict = reference_dict, + sample_id = sample_id, + sort_memory = sort_memory + } + call CollectRawWgsMetrics { + input: + ReplaceRGBam = MarkRawDuplicates.dup_marked_bam, + sample_id = sample_id, + reference_fasta = reference_fasta, + reference_fasta_index = reference_fasta_index, + reference_dict = reference_dict + } + call CollectConsensusWgsMetrics { + input: + ConsensusAlignedBam = MergeAndSortMoleculeConsensusReads.bam, + ConsensusAlignedBai = MergeAndSortMoleculeConsensusReads.bai, + sample_id = sample_id, + reference_fasta = reference_fasta, + reference_fasta_index = reference_fasta_index, + reference_dict = reference_dict + } + call CSS_SFC_ErrorMetrics { + input: + ConsensusAlignedBam = MergeAndSortMoleculeConsensusReads.bam, + ConsensusAlignedBai = MergeAndSortMoleculeConsensusReads.bai, + sample_id = sample_id, + reference_fasta = reference_fasta, + reference_fasta_index = reference_fasta_index, + reference_dict = reference_dict, + reference_pac = reference_pac, + reference_amb = reference_amb, + reference_ann = reference_ann, + reference_bwt = reference_bwt, + reference_sa = reference_sa, + germline_bam = germline_bam, + germline_bam_index = germline_bam_index + } + call RAW_SFC_ErrorMetrics { + input: + ReplaceRGBam = MarkRawDuplicates.dup_marked_bam, + ReplaceRGBai = MarkRawDuplicates.dup_marked_bai, + sample_id = sample_id, + reference_fasta = reference_fasta, + reference_fasta_index = reference_fasta_index, + reference_dict = reference_dict, + reference_pac = reference_pac, + reference_amb = reference_amb, + reference_ann = reference_ann, + reference_bwt = reference_bwt, + reference_sa = reference_sa, + germline_bam = germline_bam, + germline_bam_index = germline_bam_index + } + call QC_metrics { + input: + byproduct_metrics = CDSByProduct.byproduct_metrics, + RawWgsMetrics = CollectRawWgsMetrics.RawWgsMetrics, + DuplicationMetrics = MarkRawDuplicates.dup_metrics, + InsertSizeMetrics = CollectInsertSizeMetrics.insert_size_metrics, + ConsensusWgsMetrics = CollectConsensusWgsMetrics.ConsensusWgsMetrics, + mutant_metrics = CSS_SFC_ErrorMetrics.mutant_metrics, + } + call EvalGenomeBases { + input: + eval_genome_interval = eval_genome_interval + } + call CalculateDuplexDepth { + input: + eval_genome_bases = EvalGenomeBases.eval_genome_bases, + n_bases_eval = QC_metrics.n_bases_eval + } + + output { + File byproduct_metrics = CDSByProduct.byproduct_metrics + File RAW_BAM = MergeSplit.merged_bam + File RAW_BAM_index = MergeSplit.merged_bai + File MolConsensusBAM = MergeAndSortMoleculeConsensusReads.bam + File MolConsensusBAM_index = MergeAndSortMoleculeConsensusReads.bai + File InsertSizeMetrics = CollectInsertSizeMetrics.insert_size_metrics + File InsertSizeHistogram = CollectInsertSizeMetrics.insert_size_histogram + File DuplicationMetrics = MarkRawDuplicates.dup_metrics + File RawWgsMetrics = CollectRawWgsMetrics.RawWgsMetrics + File ConsensusWgsMetrics = 
CollectConsensusWgsMetrics.ConsensusWgsMetrics + File raw_mutant_metrics = RAW_SFC_ErrorMetrics.raw_mutant_metrics + File raw_context_count = RAW_SFC_ErrorMetrics.raw_context_count + File raw_variants_called = RAW_SFC_ErrorMetrics.raw_variants_called + File mutant_metrics = CSS_SFC_ErrorMetrics.mutant_metrics + File context_count = CSS_SFC_ErrorMetrics.context_count + File variants_called = CSS_SFC_ErrorMetrics.variants_called + + Int n_total_fastq = QC_metrics.n_total_fastq + Int n_correct_products = QC_metrics.n_correct_products + Float pct_correct_products = QC_metrics.pct_correct_products + Int n_double_ligation = QC_metrics.n_double_ligation + Float pct_double_ligation = QC_metrics.pct_double_ligation + Int n_intermol = QC_metrics.n_intermol + Float pct_intermol = QC_metrics.pct_intermol + Int n_adp_dimer = QC_metrics.n_adp_dimer + Float pct_adp_dimer = QC_metrics.pct_adp_dimer + Float raw_dedupped_mean_cov = QC_metrics.raw_dedupped_mean_cov + Int raw_dedupped_median_cov = QC_metrics.raw_dedupped_median_cov + Float raw_duplication_rate = QC_metrics.raw_duplication_rate + Float mean_insert_size = QC_metrics.mean_insert_size + Int median_insert_size = QC_metrics.median_insert_size + Float mol_consensus_mean_cov = QC_metrics.mol_consensus_mean_cov + Int mol_consensus_median_cov = QC_metrics.mol_consensus_median_cov + Int n_snv = QC_metrics.n_snv + Int n_indel = QC_metrics.n_indel + String n_bases_eval = QC_metrics.n_bases_eval + Float snv_rate = QC_metrics.snv_rate + Float indel_rate = QC_metrics.indel_rate + String eval_genome_bases = EvalGenomeBases.eval_genome_bases + Float duplex_depth = CalculateDuplexDepth.duplex_depth + } +} + +task SplitFastq1 { + input { + File fastq_read1 + String sample_id + Int nsplit + Int memory = 64 + Int disk_size = 200 + } + + command <<< + set -e + + zcat ~{fastq_read1} | /CODECsuite/snakemake/script/fastqsplit.pl ~{sample_id}_split_r1 ~{nsplit} + + >>> + + output { + Array[File] split_read1 = glob("~{sample_id}_split_r1.*.fastq") + } + + runtime { + docker: "us.gcr.io/tag-team-160914/codec:v1" + memory: memory + " GB" + disks: "local-disk " + disk_size + " HDD" + } +} + +task SplitFastq2 { + input { + File fastq_read2 + Int nsplit + String sample_id + Int memory = 64 + Int disk_size = 200 + } + + command <<< + set -e + zcat ~{fastq_read2} | /CODECsuite/snakemake/script/fastqsplit.pl ~{sample_id}_split_r2 ~{nsplit} + + >>> + + output { + Array[File] split_read2 = glob("~{sample_id}_split_r2.*.fastq") + } + + runtime { + docker: "us.gcr.io/tag-team-160914/codec:v1" + memory: memory + " GB" + disks: "local-disk " + disk_size + " HDD" + } +} + +task Trim { + input { + File read1 + File read2 + String sample_id + String output_prefix + Int split + Int mem = 16 + Int disk_size = 32 + } + + command { + set -e + + /CODECsuite/build/codec trim -1 ~{read1} -2 ~{read2} -o ~{output_prefix} -u 3 -U 3 -f 2 -t 2 -s ~{sample_id} > ~{output_prefix}.trim.log + + } + runtime { + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + memory: mem + " GB" + } + output { + File trimmed_bam = "${sample_id}_split.${split}.trim.bam" + File trimmed_log = "${sample_id}_split.${split}.trim.log" + } +} + +task AlignRawTrimmed { + input { + File bam_input + File reference_fasta + File reference_fasta_index + File reference_dict + File reference_pac + File reference_amb + File reference_ann + File reference_bwt + File reference_sa + Int? memory + Int? 
disk + Int mem = select_first([memory, 16]) + Int disk_size = select_first([disk, 32]) + String sample_id + Int split + } + + String output_bam_name = "${sample_id}_split.${split}.aligned_tmp.bam" + + command { + samtools fastq ~{bam_input} | \ + bwa mem \ + -K 100000000 \ + -p \ + -Y \ + ~{reference_fasta} - | samtools view - -S -b -o ~{output_bam_name} + } + + output { + File aligned_bam = output_bam_name + } + + runtime { + docker: "us.gcr.io/tag-team-160914/codec:v1" + memory: mem + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: 3 + } +} + +task ZipperBamAlignment { + input { + File mapped_bam + File unmapped_bam + File reference_fasta + File reference_fasta_index + File reference_dict + Int? mem + Int? disk_size + String sample_id + Int split + String sort_memory + + } + + String bamOutput = "${sample_id}_split.${split}.trim.aligned.bam" + String baiOutput = "${sample_id}_split.${split}.trim.aligned.bam.bai" + + command { + java -jar /dependencies/fgbio-2.1.0.jar --compression 0 --async-io ZipperBams \ + -i ~{mapped_bam} \ + --unmapped ~{unmapped_bam} \ + --ref ~{reference_fasta} \ + | samtools sort -m ~{sort_memory} - -o ~{bamOutput} -O BAM && samtools index ~{bamOutput} + } + + output { + File bam = bamOutput + File bai = baiOutput + } + + runtime { + docker: "us.gcr.io/tag-team-160914/codec:v1" + memory: select_first([mem, 8]) + " GB" + disks: "local-disk " + select_first([disk_size, 16]) + " HDD" + preemptible: 3 + } +} + +task MergeSplit { + input { + Array[File] bam_files + String sample_id + Int memory = 64 + Int disk_size =200 + } + + command { + set -e + samtools merge -@ 4 ~{sample_id}.raw.aligned.bam ~{sep=' ' bam_files} && \ + samtools index ~{sample_id}.raw.aligned.bam + } + + output { + File merged_bam = "~{sample_id}.raw.aligned.bam" + File merged_bai = "~{sample_id}.raw.aligned.bam.bai" + } + + runtime { + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + memory: memory + " GB" + } +} + +task MergeLogSplit { + input { + Array[File] log_files + String sample_id + Int mem = 32 + Int disk_size = 64 + } + + command { + set -e + python3 /CODECsuite/snakemake/script/agg_log.py ~{sep=' ' log_files} ~{sample_id}.trim.log + } + + output { + File merged_log = "~{sample_id}.trim.log" + } + + runtime { + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + memory: mem + " GB" + } +} + +task SortBam { + input { + File bam_file + String sample_id + Int mem = 64 + Int disk_size = 200 + } + + command { + samtools sort -n ~{bam_file} -o ~{sample_id}.raw.aligned.sortbyname.bam + } + + output { + File sorted_bam = "~{sample_id}.raw.aligned.sortbyname.bam" + } + + runtime { + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + memory: mem + " GB" + preemptible: 2 + } +} + +task CDSByProduct { + input { + File trim_log + File highconf_bam + String sample_id + Int mem = 32 + Int disk_size = 100 + } + + command { + python3 /CODECsuite/snakemake/script/cds_summarize.py --sample_id ~{sample_id} --trim_log ~{trim_log} \ + --highconf_bam ~{highconf_bam} > ~{sample_id}.byproduct.txt + } + + output { + File byproduct_metrics = "~{sample_id}.byproduct.txt" + } + + runtime { + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + memory: mem + " GB" + } +} + +task ReplaceRawReadGroup { + input { + File raw_bam + String sample_id + Int memory = 64 + Int disk_size = 200 + } + + command { + java -jar /dependencies/picard.jar 
AddOrReplaceReadGroups \ + I=~{raw_bam} \ + O=~{sample_id}.raw.replacerg.bam \ + CREATE_INDEX=true \ + RGID=4 \ + RGLB=lib1 \ + RGPL=ILLUMINA \ + RGPU=unit1 \ + RGSM=~{sample_id} + } + + output { + File bam = "~{sample_id}.raw.replacerg.bam" + File bai = "~{sample_id}.raw.replacerg.bai" + } + + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + } +} + +task MarkRawDuplicates { + input { + File input_bam + String sample_id + Int memory = 64 + Int disk_size = 200 + } + + command { + java -jar /dependencies/picard.jar MarkDuplicates \ + I=~{input_bam} \ + O=~{sample_id}.raw.replacerg.markdup.bam \ + M=~{sample_id}.raw.marked_duplicates.txt \ + CREATE_INDEX=true \ + TAG_DUPLICATE_SET_MEMBERS=true \ + TAGGING_POLICY=All + + samtools index ~{sample_id}.raw.replacerg.markdup.bam + } + + output { + File dup_marked_bam = "~{sample_id}.raw.replacerg.markdup.bam" + File dup_marked_bai = "~{sample_id}.raw.replacerg.markdup.bam.bai" + File dup_metrics = "~{sample_id}.raw.marked_duplicates.txt" + } + + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + } +} + +task CollectInsertSizeMetrics { + input { + File input_bam + String sample_id + Int memory = 32 + Int disk_size = 200 + } + + command { + java -jar /dependencies/picard.jar CollectInsertSizeMetrics \ + I=~{input_bam} \ + O=~{sample_id}.raw.insert_size_metrics.txt \ + H=~{sample_id}.raw.insert_size_histogram.pdf \ + M=0.5 W=600 DEVIATIONS=100 + } + + output { + File insert_size_metrics = "~{sample_id}.raw.insert_size_metrics.txt" + File insert_size_histogram = "~{sample_id}.raw.insert_size_histogram.pdf" + } + + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + } +} + +task GroupReadByUMI { + input { + File input_bam + String sample_id + Int memory = 64 + Int disk_size = 200 + } + + command { + java -jar /dependencies/fgbio-2.1.0.jar --compression 1 --async-io \ + GroupReadsByUmi \ + -i ~{input_bam} \ + -o ~{sample_id}.GroupedByUmi.bam \ + -f ~{sample_id}.umiHistogram.txt \ + -m 0 \ + --strategy=paired + } + + output { + File groupbyumi_bam = "~{sample_id}.GroupedByUmi.bam" + File umi_histogram = "~{sample_id}.umiHistogram.txt" + } + + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + } +} + +task FgbioCollapseReadFamilies { + input { + File grouped_umi_bam + String sample_id + Int memory = 64 + Int disk_size = 200 + } + + command { + java -jar /dependencies/fgbio-2.1.0.jar --compression 1 CallMolecularConsensusReads \ + -i ~{grouped_umi_bam} \ + -o ~{sample_id}.mol_consensus.bam \ + -p ~{sample_id} \ + --threads 2 \ + --consensus-call-overlapping-bases false \ + -M 1 + } + + output { + File mol_consensus_bam = "~{sample_id}.mol_consensus.bam" + } + + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + } +} + +task AlignMolecularConsensusReads { + input { + File mol_consensus_bam + String sample_id + File reference_fasta + File reference_fasta_index + File reference_dict + File reference_pac + File reference_amb + File reference_ann + File reference_bwt + File reference_sa + Int memory = 64 + Int disk_size = 200 + Int threads = 4 + Int cpu_cores = 1 + } + String output_bam_name = "${sample_id}.mol_consensus.aligned_tmp.bam" + + command { + samtools fastq 
~{mol_consensus_bam} \ + | bwa mem -K 100000000 -t ~{threads} -p -Y ~{reference_fasta} - | samtools view - -S -b -o ~{output_bam_name} + } + + output { + File aligned_bam = "~{sample_id}.mol_consensus.aligned_tmp.bam" + } + + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + cpu: cpu_cores + preemptible: 3 + } +} + +task MergeAndSortMoleculeConsensusReads { + input { + File mapped_sam + File unmapped_bam + File reference_fasta + File reference_fasta_index + File reference_dict + String sample_id + Int memory = 64 + Int disk_size = 200 + String sort_memory + } + + command { + java -jar /dependencies/fgbio-2.1.0.jar --compression 0 --async-io ZipperBams \ + -i ~{mapped_sam} \ + --unmapped ~{unmapped_bam} \ + --ref ~{reference_fasta} \ + --tags-to-reverse Consensus \ + --tags-to-revcomp Consensus \ + | samtools sort -m ~{sort_memory} - -o ~{sample_id}.mol_consensus.aligned.bam -O BAM -@ 4 && samtools index ~{sample_id}.mol_consensus.aligned.bam + } + + output { + File bam = "~{sample_id}.mol_consensus.aligned.bam" + File bai = "~{sample_id}.mol_consensus.aligned.bam.bai" + } + + runtime { + memory: memory+ " GB" + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + preemptible: 3 + } +} + +task CollectRawWgsMetrics { + input { + File ReplaceRGBam + String sample_id + File reference_fasta + File reference_fasta_index + File reference_dict + Int memory = 64 + Int disk_size = 200 + } + + + command { + java -jar /dependencies/picard.jar CollectWgsMetrics \ + I=~{ReplaceRGBam} O=~{sample_id}.raw.wgs_metrics.txt R=~{reference_fasta} INTERVALS=/reference_files/GRCh38_notinalldifficultregions.interval_list \ + COUNT_UNPAIRED=true MINIMUM_BASE_QUALITY=0 MINIMUM_MAPPING_QUALITY=0 + } + + output { + File RawWgsMetrics = "~{sample_id}.raw.wgs_metrics.txt" + } + + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + preemptible: 3 + } +} + +task CollectConsensusWgsMetrics { + input { + File ConsensusAlignedBam + File ConsensusAlignedBai + String sample_id + File reference_fasta + File reference_fasta_index + File reference_dict + Int memory = 64 + Int disk_size = 200 + } + + command { + java -jar /dependencies/picard.jar CollectWgsMetrics \ + I=~{ConsensusAlignedBam} O=~{sample_id}.mol_consensus.wgs_metrics.txt R=~{reference_fasta} INTERVALS=/reference_files/GRCh38_notinalldifficultregions.interval_list \ + INCLUDE_BQ_HISTOGRAM=true MINIMUM_BASE_QUALITY=30 + } + + output { + File ConsensusWgsMetrics = "~{sample_id}.mol_consensus.wgs_metrics.txt" + } + + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + preemptible: 3 + } +} + + +task CSS_SFC_ErrorMetrics { + input { + File ConsensusAlignedBam + File ConsensusAlignedBai + String sample_id + File reference_fasta + File reference_fasta_index + File reference_dict + File reference_pac + File reference_amb + File reference_ann + File reference_bwt + File reference_sa + File germline_bam + File germline_bam_index + Int memory = 64 + Int disk_size = 200 + } + + command { + /CODECsuite/build/codec call -b ~{ConsensusAlignedBam} \ + -L /reference_files/GRCh38_notinalldifficultregions.bed \ + -r ~{reference_fasta} \ + -m 60 \ + -q 30 \ + -d 12 \ + -n ~{germline_bam} \ + -V /reference_files/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz \ + -x 6 \ + -c 4 \ + -5 \ + -g 30 \ + -G 250 \ + -Q 0.7 \ + -B 0.6 
\ + -N 0.05 \ + -Y 5 \ + -W 1 \ + -a ~{sample_id}.mutant_metrics.txt \ + -e ~{sample_id}.variants_called.txt \ + -C ~{sample_id}.context_count.txt + } + + output { + File mutant_metrics = "~{sample_id}.mutant_metrics.txt" + File variants_called = "~{sample_id}.variants_called.txt" + File context_count = "~{sample_id}.context_count.txt" + } + + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + preemptible: 3 + } +} + +task RAW_SFC_ErrorMetrics { + input { + File ReplaceRGBam + File ReplaceRGBai + String sample_id + File reference_fasta + File reference_fasta_index + File reference_dict + File reference_pac + File reference_amb + File reference_ann + File reference_bwt + File reference_sa + File germline_bam + File germline_bam_index + Int memory = 32 + Int disk_size = 200 + } + command { + /CODECsuite/build/codec call -b ~{ReplaceRGBam} \ + -L /reference_files/GRCh38_notinalldifficultregions.bed \ + -r ~{reference_fasta} \ + -m 60 \ + -n ~{germline_bam} \ + -q 30 \ + -d 12 \ + -V /reference_files/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz \ + -x 6 \ + -c 4 \ + -5 \ + -g 30 \ + -G 250 \ + -Q 0.6 \ + -B 0.6 \ + -N 0.1 \ + -Y 5 \ + -W 1 \ + -a ~{sample_id}.raw.mutant_metrics.txt \ + -e ~{sample_id}.raw.variants_called.txt \ + -C ~{sample_id}.raw.context_count.txt + } + + output { + File raw_mutant_metrics = "~{sample_id}.raw.mutant_metrics.txt" + File raw_variants_called = "~{sample_id}.raw.variants_called.txt" + File raw_context_count = "~{sample_id}.raw.context_count.txt" + } + + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + preemptible: 3 + } +} + +task QC_metrics { + input { + File byproduct_metrics + File RawWgsMetrics + File DuplicationMetrics + File InsertSizeMetrics + File ConsensusWgsMetrics + File mutant_metrics + Int memory = 16 + Int disk_size = 16 + + } + command <<< + set -e + cat ~{byproduct_metrics} | awk 'NR==2 {print $NF}' > n_total_fastq.txt + cat ~{byproduct_metrics} | awk 'NR==2 {print $9}' > n_correct.txt + cat ~{byproduct_metrics} | awk 'NR==2 {print $2}' > pct_correct.txt + cat ~{byproduct_metrics} | awk 'NR==2 {print $10}' > n_double_ligation.txt + cat ~{byproduct_metrics} | awk 'NR==2 {print $3}' > pct_double_ligation.txt + cat ~{byproduct_metrics} | awk 'NR==2 {print $12}' > n_intermol.txt + cat ~{byproduct_metrics} | awk 'NR==2 {print $5}' > pct_intermol.txt + cat ~{byproduct_metrics} | awk 'NR==2 {print $11}' > n_adp_dimer.txt + cat ~{byproduct_metrics} | awk 'NR==2 {print $4}' > pct_adp_dimer.txt + cat ~{RawWgsMetrics} | grep -v "#" | awk 'NR==3 {print $2}' > raw_dedupped_mean_cov.txt + cat ~{RawWgsMetrics} | grep -v "#" | awk 'NR==3 {print $4}' > raw_dedupped_median_cov.txt + cat ~{DuplicationMetrics} | grep -v "#" | awk 'NR==3 {print $9}' > raw_duplication_rate.txt + cat ~{InsertSizeMetrics} | grep -v "#" | awk 'NR==3 {print $1}' > median_insert_size.txt + cat ~{InsertSizeMetrics} | grep -v "#" | awk 'NR==3 {print $6}' > mean_insert_size.txt + cat ~{ConsensusWgsMetrics} | grep -v "#" | awk 'NR==3 {print $2}' > mol_consensus_mean_cov.txt + cat ~{ConsensusWgsMetrics} | grep -v "#" | awk 'NR==3 {print $4}' > mol_consensus_median_cov.txt + cat ~{mutant_metrics} | awk 'NR==2 {print $8}' > n_snv.txt + cat ~{mutant_metrics} | awk 'NR==2 {print $10}' > n_indel.txt + cat ~{mutant_metrics} | awk 'NR==2 {print $3}' > n_bases_eval.txt + cat ~{mutant_metrics} | awk 'NR==2 {print $9}' > snv_rate.txt + cat 
~{mutant_metrics} | awk 'NR==2 {print $11}' > indel_rate.txt + + + >>> + output { + Int n_total_fastq = read_int("n_total_fastq.txt") + Int n_correct_products = read_int("n_correct.txt") + Float pct_correct_products = read_float("pct_correct.txt") + Int n_double_ligation = read_int("n_double_ligation.txt") + Float pct_double_ligation = read_float("pct_double_ligation.txt") + Int n_intermol = read_int("n_intermol.txt") + Float pct_intermol = read_float("pct_intermol.txt") + Int n_adp_dimer = read_int("n_adp_dimer.txt") + Float pct_adp_dimer = read_float("pct_adp_dimer.txt") + Float raw_dedupped_mean_cov = read_float("raw_dedupped_mean_cov.txt") + Int raw_dedupped_median_cov = read_int("raw_dedupped_median_cov.txt") + Float raw_duplication_rate = read_float("raw_duplication_rate.txt") + Float mean_insert_size = read_float("mean_insert_size.txt") + Int median_insert_size = read_int("median_insert_size.txt") + Float mol_consensus_mean_cov = read_float("mol_consensus_mean_cov.txt") + Int mol_consensus_median_cov = read_int("mol_consensus_median_cov.txt") + Int n_snv = read_int("n_snv.txt") + Int n_indel = read_int("n_indel.txt") + String n_bases_eval = read_string("n_bases_eval.txt") + Float snv_rate = read_float("snv_rate.txt") + Float indel_rate = read_float("indel_rate.txt") + } + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-team-160914/picard_docker" + disks: "local-disk " + disk_size + " HDD" + preemptible: 2 + } +} + + +task EvalGenomeBases { + input { + File eval_genome_interval + Int memory = 16 + Int disk_size = 8 + } + + command { + java -jar /dependencies/picard.jar IntervalListTools \ + I=~{eval_genome_interval} COUNT_OUTPUT=eval_genome_bases.txt OUTPUT_VALUE=BASES + + } + + output { + String eval_genome_bases = read_string("eval_genome_bases.txt") + } + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-team-160914/picard_docker" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + } +} + + +task CalculateDuplexDepth { + input { + String eval_genome_bases + String n_bases_eval + Int memory = 16 + Int disk_size = 8 + } + + command <<< + + python3 <>> + + output { + Float duplex_depth = read_float(stdout()) + } + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-team-160914/picard_docker" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + } +} diff --git a/CODEC/codec_bcl2fastq.wdl b/CODEC/codec_bcl2fastq.wdl new file mode 100644 index 0000000..4b217a8 --- /dev/null +++ b/CODEC/codec_bcl2fastq.wdl @@ -0,0 +1,91 @@ +version 1.0 + +workflow codec_bcl2fastq { + input { + String input_bcl_directory + String output_directory + Int read_threads + Int write_threads + String memory = "128G" + Int disk_space = 2000 + Int preemptible = 3 + Boolean delete_input_bcl_directory + } + + call run_bcl2fastq { + input: + input_bcl_directory = sub(input_bcl_directory, "/+$", ""), + output_directory = sub(output_directory, "/+$", ""), + read_threads = read_threads, + write_threads = write_threads, + memory = memory, + disk_space = disk_space, + preemptible = preemptible, + delete_input_bcl_directory = delete_input_bcl_directory + } + + output { + String job_stat = run_bcl2fastq.job_stat + } +} + +task run_bcl2fastq { + input { + String input_bcl_directory + String output_directory + String memory + Int disk_space + Int preemptible + Int read_threads = 4 + Int write_threads = 4 + String run_id = basename(input_bcl_directory) + Boolean delete_input_bcl_directory + Int num_cpu = 2 + } + + command { + set -e + export TMPDIR=/tmp + + gsutil -q -m cp -r 
${input_bcl_directory} . + + cd ${run_id} + + bcl2fastq \ + --output-dir out \ + -w ~{write_threads} \ + -r ~{read_threads} + + cd out + + gsutil -q -m cp -r . ${output_directory}/${run_id}_fastqs/ + + if [ $? -eq 0 ]; then + echo "Fastq files generated and copied successfully to ${output_directory}/${run_id}_fastqs/." + else + echo "Error in copying files." + fi + + if [ "${delete_input_bcl_directory}" = "true" ]; then + gsutil -q -m rm -r "${input_bcl_directory}" + if [ $? -eq 0 ]; then + echo "Input bcl folder has been successfully deleted!" + else + echo "Unable to delete BCL directory" + fi + fi + } + + output { + String job_stat = read_string(stdout()) + } + + runtime { + docker: "us.gcr.io/tag-team-160914/bcl2fastq_codec:v1" + memory: memory + bootDiskSizeGb: 12 + disks: "local-disk ${disk_space} HDD" + cpu: num_cpu + preemptible: preemptible +} +} \ No newline at end of file diff --git a/CODEC/demux_CODEC.wdl b/CODEC/demux_CODEC.wdl new file mode 100644 index 0000000..4a0dc00 --- /dev/null +++ b/CODEC/demux_CODEC.wdl @@ -0,0 +1,277 @@ +version 1.0 + + +workflow demux_CODEC { + input { + Array[File] fastq1 + Array[File] fastq2 + Array[String] batch_ids + Int num_parallel + Array[File] sample_sheet + String workspace + } + scatter (batch_index in range(length(batch_ids))) { + call SplitFastq1 { + input: + batch_id =batch_ids[batch_index], + fastq_read1 = fastq1[batch_index], + nsplit = num_parallel + } + call SplitFastq2 { + input: + batch_id =batch_ids[batch_index], + fastq_read2 = fastq2[batch_index], + nsplit = num_parallel + } + scatter (split_index in range(num_parallel)) { + call Demux { + input: + paired_fastqs = (SplitFastq1.split_read1[split_index], SplitFastq2.split_read2[split_index]), + sample_sheet = sample_sheet[batch_index], + batch_id = batch_ids[batch_index], + split = split_index + } + } + } + Array[File] demultiplexed_fastq1_all_lanes = flatten(flatten(Demux.demux_read1)) + Array[File] demultiplexed_fastq2_all_lanes = flatten(flatten(Demux.demux_read2)) + + call GroupFastqFiles as GroupFastq1 { + input: + fastq_files = demultiplexed_fastq1_all_lanes + } + call GroupFastqFiles as GroupFastq2 { + input: + fastq_files = demultiplexed_fastq2_all_lanes + } + scatter (txt_file in GroupFastq1.grouped_fastq_lists) { + call concatenate_and_compress_fastq as concatenate_and_compress_fastq1 { + input: + grouped_fastq_list = txt_file, + output_file = basename(txt_file, ".txt") + ".1.gz" + } + } + + scatter (txt_file in GroupFastq2.grouped_fastq_lists) { + call concatenate_and_compress_fastq as concatenate_and_compress_fastq2 { + input: + grouped_fastq_list = txt_file, + output_file = basename(txt_file, ".txt") + ".2.gz" + } + } + + call UploadDataTable { + input: + fastq1_files = concatenate_and_compress_fastq1.output_fastq_gz, + fastq2_files = concatenate_and_compress_fastq2.output_fastq_gz, + workspace = workspace + + } + + output { + Array[File] merged_fastq1_per_sample = concatenate_and_compress_fastq1.output_fastq_gz + Array[File] merged_fastq2_per_sample = concatenate_and_compress_fastq2.output_fastq_gz + File data_table = UploadDataTable.data_table + } +} + + +task SplitFastq1 { + input { + File fastq_read1 + Int nsplit + String batch_id + Int memory = 128 + Int disk_size = 1024 + + } + + command <<< + set -e + + zcat ~{fastq_read1} | /CODECsuite/snakemake/script/fastqsplit.pl ~{batch_id}_split_r1 ~{nsplit} + + >>> + + output { + Array[File] split_read1 = glob("~{batch_id}_split_r1.*.fastq") + } + + runtime { + docker: "us.gcr.io/tag-team-160914/codec:v1" + memory: 
memory + " GB" + disks: "local-disk " + disk_size + " HDD" + } +} + +task SplitFastq2 { + input { + File fastq_read2 + Int nsplit + String batch_id + Int memory = 128 + Int disk_size = 1024 + } + + command <<< + set -e + zcat ~{fastq_read2} | /CODECsuite/snakemake/script/fastqsplit.pl ~{batch_id}_split_r2 ~{nsplit} + + >>> + + output { + Array[File] split_read2 = glob("~{batch_id}_split_r2.*.fastq") + } + + runtime { + docker: "us.gcr.io/tag-team-160914/codec:v1" + memory: memory + " GB" + disks: "local-disk " + disk_size + " HDD" + } +} + +task Demux { + input { + Pair[File, File] paired_fastqs + File sample_sheet + Int memory = 64 + Int disk_size = 100 + Int split + String batch_id + } + + command { + set -e + + /CODECsuite/build/codec demux -1 ${paired_fastqs.left} -2 ${paired_fastqs.right} -p ${sample_sheet} -o ${batch_id}_split.${split} > ${batch_id}_split.${split}.log + } + runtime { + docker: "us.gcr.io/tag-team-160914/codec:v1" + memory: memory + " GB" + disks: "local-disk " + disk_size + " HDD" + } + output { + Array[File] demux_read1 = glob("~{batch_id}_split.~{split}.*.1.fastq.gz") + Array[File] demux_read2 = glob("~{batch_id}_split.~{split}.*.2.fastq.gz") + File log = "~{batch_id}_split.~{split}.log" + } +} +task GroupFastqFiles { + input { + Array[File] fastq_files + Int memory = 64 + Int disk_size = 100 + } + + command <<< + set -e + + for fastq_file in ~{sep=' ' fastq_files}; do + sample_name=$(basename "$fastq_file" | cut -d '.' -f 3) + echo "$fastq_file" >> "${sample_name}.txt" + done + + for txt_file in *.txt; do + sed -i 's|/cromwell_root/|gs://|g' "$txt_file" + done + >>> + + output { + Array[File] grouped_fastq_lists = glob("*.txt") + } + + runtime { + docker: "us.gcr.io/tag-team-160914/codec:v1" + disks: "local-disk " + disk_size + " HDD" + memory: memory + " GB" + } +} + +task concatenate_and_compress_fastq { + input { + File grouped_fastq_list + String output_file + Int memory = 64 + Int disk_size = 64 + } + + command { + set -e + while IFS= read -r line; do + gsutil cp "$line" temp.fastq.gz + + # Check if the file was successfully copied + if [ -f temp.fastq.gz ]; then + gunzip -c temp.fastq.gz + else + echo "Failed to copy file from $line" + exit 1 + fi + done < ~{grouped_fastq_list} | gzip > ~{output_file} + } + output { + File output_fastq_gz = output_file + } + + runtime { + docker: "us.gcr.io/tag-team-160914/gcloudsdk" + disks: "local-disk " + disk_size + " HDD" + memory: memory + " GB" + } +} + + +task UploadDataTable { + input { + Array[String] fastq1_files + Array[String] fastq2_files + String output_data_table = "sample.tsv" + String workspace + String namespace = "broadtagteam" + Int memory = 32 + Int disk_size = 32 + } + + command <<< + python3 <>> + + output { + File data_table = output_data_table + String upload_stats = read_string(stdout()) + } + + runtime { + docker: "us.gcr.io/tag-team-160914/metadata_upload" + disks: "local-disk " + disk_size + " HDD" + memory: memory + " GB" + } +} \ No newline at end of file diff --git a/CODEC/prep_codec_metadata.py b/CODEC/prep_codec_metadata.py new file mode 100644 index 0000000..691b1ec --- /dev/null +++ b/CODEC/prep_codec_metadata.py @@ -0,0 +1,23 @@ +import pandas as pd +import sys + +if len(sys.argv) != 3: + print("Usage: python3 script.py ") + sys.exit(1) + +# Assigning command-line arguments to variables +metadata_excel_file = sys.argv[1] +index_csv_file = sys.argv[2] + +xlsx_df = pd.read_excel(metadata_excel_file) +index_df = pd.read_csv(index_csv_file) + +merged_df = pd.merge( + xlsx_df, index_df, 
left_on='CODEC index', right_on='index') + +for lane, group in merged_df.groupby('lanes'): + + output_df = group[['submission_id', 'IndexBarcode1', 'IndexBarcode2']] + output_df.columns = ['SampleName', 'IndexBarcode1', 'IndexBarcode2'] + + output_df.to_csv(f'sample_sheet_L00{lane}.csv', index=False) diff --git a/CODEC/sampleIndex.csv b/CODEC/sampleIndex.csv new file mode 100644 index 0000000..d0dc938 --- /dev/null +++ b/CODEC/sampleIndex.csv @@ -0,0 +1,13 @@ +index,IndexBarcode1,IndexBarcode2 +1,CTTGAACGGACTGTCCAC,CACCGAGCGTTAGACTAC +2,GAGCCTACTCAGTCAACG,GTGTCGAACACTTGACGG +3,AGCTTGTAAGGCAGGTTA,ACTGATCTTCAGCTGACT +4,TCAAGCGTCTTACATGGT,TGAATCTGAGGCACTGTA +5,CTGGTCCAAGAACGTCTG,CTCTGAACGATCGAGCTC +6,GATCCAGTTCTGTCGAGC,GAGGTGCATGCACCTTAG +7,ACCTATAGGTGCAACGAA,ACTAACTTCCATTGCACT +8,TGAAGGTCCACTGTATCT,TGACCTGGATGGATAGGA +9,CACTGCTTCGAGACGAAG,CTCCAGTTACTGAGACGG +10,GTGATACCTCGATGCTCC,GAGGTCCAGTCTCTGTCC +11,ACTCAGAGAACTCATGGA,ACTACAGGTGGATCCAAT +12,TGAGCTGAGTTCGTACTT,TGATGTACCAACGATGTA From 20c00d977dc0b3fbfbb17bd65c8e354eec7a48f3 Mon Sep 17 00:00:00 2001 From: Stella Date: Tue, 6 Feb 2024 13:57:15 -0500 Subject: [PATCH 02/21] change fgbio version in the wdl to match on prem --- CODEC/SingleSampleCODEC.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CODEC/SingleSampleCODEC.wdl b/CODEC/SingleSampleCODEC.wdl index 71e8d2b..c4281bd 100644 --- a/CODEC/SingleSampleCODEC.wdl +++ b/CODEC/SingleSampleCODEC.wdl @@ -390,7 +390,7 @@ task ZipperBamAlignment { String baiOutput = "${sample_id}_split.${split}.trim.aligned.bam.bai" command { - java -jar /dependencies/fgbio-2.1.0.jar --compression 0 --async-io ZipperBams \ + java -jar /dependencies/fgbio-2.0.2.jar --compression 0 --async-io ZipperBams \ -i ~{mapped_bam} \ --unmapped ~{unmapped_bam} \ --ref ~{reference_fasta} \ @@ -611,7 +611,7 @@ task GroupReadByUMI { } command { - java -jar /dependencies/fgbio-2.1.0.jar --compression 1 --async-io \ + java -jar /dependencies/fgbio-2.0.2.jar --compression 1 --async-io \ GroupReadsByUmi \ -i ~{input_bam} \ -o ~{sample_id}.GroupedByUmi.bam \ @@ -641,7 +641,7 @@ task FgbioCollapseReadFamilies { } command { - java -jar /dependencies/fgbio-2.1.0.jar --compression 1 CallMolecularConsensusReads \ + java -jar /dependencies/fgbio-2.0.2.jar --compression 1 CallMolecularConsensusReads \ -i ~{grouped_umi_bam} \ -o ~{sample_id}.mol_consensus.bam \ -p ~{sample_id} \ @@ -712,7 +712,7 @@ task MergeAndSortMoleculeConsensusReads { } command { - java -jar /dependencies/fgbio-2.1.0.jar --compression 0 --async-io ZipperBams \ + java -jar /dependencies/fgbio-2.0.2.jar --compression 0 --async-io ZipperBams \ -i ~{mapped_sam} \ --unmapped ~{unmapped_bam} \ --ref ~{reference_fasta} \ From d5856984983e144ee9631cfbafdd6cc390c92fd9 Mon Sep 17 00:00:00 2001 From: Stella Date: Tue, 13 Feb 2024 09:40:55 -0500 Subject: [PATCH 03/21] make eval_genome_interval/bed tunable in multiple tasks --- CODEC/SingleSampleCODEC.wdl | 35 ++++++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/CODEC/SingleSampleCODEC.wdl b/CODEC/SingleSampleCODEC.wdl index c4281bd..a12d100 100644 --- a/CODEC/SingleSampleCODEC.wdl +++ b/CODEC/SingleSampleCODEC.wdl @@ -17,7 +17,8 @@ workflow SingleSampleCODEC { File germline_bam_index Int num_parallel String sort_memory - File eval_genome_interval + File eval_genome_interval = "gs://gptag/CODEC/GRCh38_notinalldifficultregions.interval_list" + File eval_genome_bed = "gs://gptag/CODEC/GRCh38_notinalldifficultregions.bed" } call SplitFastq1 { input: @@ -142,7 
+143,8 @@ workflow SingleSampleCODEC { sample_id = sample_id, reference_fasta = reference_fasta, reference_fasta_index = reference_fasta_index, - reference_dict = reference_dict + reference_dict = reference_dict, + eval_genome_interval = eval_genome_interval } call CollectConsensusWgsMetrics { input: @@ -151,7 +153,8 @@ workflow SingleSampleCODEC { sample_id = sample_id, reference_fasta = reference_fasta, reference_fasta_index = reference_fasta_index, - reference_dict = reference_dict + reference_dict = reference_dict, + eval_genome_interval = eval_genome_interval } call CSS_SFC_ErrorMetrics { input: @@ -167,7 +170,8 @@ workflow SingleSampleCODEC { reference_bwt = reference_bwt, reference_sa = reference_sa, germline_bam = germline_bam, - germline_bam_index = germline_bam_index + germline_bam_index = germline_bam_index, + eval_genome_bed = eval_genome_bed } call RAW_SFC_ErrorMetrics { input: @@ -183,7 +187,8 @@ workflow SingleSampleCODEC { reference_bwt = reference_bwt, reference_sa = reference_sa, germline_bam = germline_bam, - germline_bam_index = germline_bam_index + germline_bam_index = germline_bam_index, + eval_genome_bed = eval_genome_bed } call QC_metrics { input: @@ -741,6 +746,7 @@ task CollectRawWgsMetrics { File reference_fasta File reference_fasta_index File reference_dict + File eval_genome_interval Int memory = 64 Int disk_size = 200 } @@ -748,7 +754,7 @@ task CollectRawWgsMetrics { command { java -jar /dependencies/picard.jar CollectWgsMetrics \ - I=~{ReplaceRGBam} O=~{sample_id}.raw.wgs_metrics.txt R=~{reference_fasta} INTERVALS=/reference_files/GRCh38_notinalldifficultregions.interval_list \ + I=~{ReplaceRGBam} O=~{sample_id}.raw.wgs_metrics.txt R=~{reference_fasta} INTERVALS=~{eval_genome_interval} \ COUNT_UNPAIRED=true MINIMUM_BASE_QUALITY=0 MINIMUM_MAPPING_QUALITY=0 } @@ -772,13 +778,14 @@ task CollectConsensusWgsMetrics { File reference_fasta File reference_fasta_index File reference_dict + File eval_genome_interval Int memory = 64 Int disk_size = 200 } command { java -jar /dependencies/picard.jar CollectWgsMetrics \ - I=~{ConsensusAlignedBam} O=~{sample_id}.mol_consensus.wgs_metrics.txt R=~{reference_fasta} INTERVALS=/reference_files/GRCh38_notinalldifficultregions.interval_list \ + I=~{ConsensusAlignedBam} O=~{sample_id}.mol_consensus.wgs_metrics.txt R=~{reference_fasta} INTERVALS=~{eval_genome_interval} \ INCLUDE_BQ_HISTOGRAM=true MINIMUM_BASE_QUALITY=30 } @@ -810,19 +817,22 @@ task CSS_SFC_ErrorMetrics { File reference_sa File germline_bam File germline_bam_index + File eval_genome_bed + File population_based_vcf = "gs://gptag/CODEC/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz" + File population_based_vcf_index = "gs://gptag/CODEC/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz.tbi" Int memory = 64 Int disk_size = 200 } command { /CODECsuite/build/codec call -b ~{ConsensusAlignedBam} \ - -L /reference_files/GRCh38_notinalldifficultregions.bed \ + -L ~{eval_genome_bed} \ -r ~{reference_fasta} \ -m 60 \ -q 30 \ -d 12 \ -n ~{germline_bam} \ - -V /reference_files/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz \ + -V ~{population_based_vcf} \ -x 6 \ -c 4 \ -5 \ @@ -867,18 +877,21 @@ task RAW_SFC_ErrorMetrics { File reference_sa File germline_bam File germline_bam_index + File eval_genome_bed + File population_based_vcf = "gs://gptag/CODEC/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz" + File population_based_vcf_index = "gs://gptag/CODEC/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz.tbi" Int memory = 32 Int disk_size = 200 } command { /CODECsuite/build/codec call -b ~{ReplaceRGBam} 
\
-      -L /reference_files/GRCh38_notinalldifficultregions.bed \
+      -L ~{eval_genome_bed} \
       -r ~{reference_fasta} \
       -m 60 \
       -n ~{germline_bam} \
       -q 30 \
       -d 12 \
-      -V /reference_files/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz \
+      -V ~{population_based_vcf} \
       -x 6 \
       -c 4 \
       -5 \

From bcc80bbc273cac8cda3bd9db598255fbe81e280a Mon Sep 17 00:00:00 2001
From: Stella
Date: Fri, 16 Feb 2024 13:56:03 -0500
Subject: [PATCH 04/21] add Dockerfile that builds the codec docker

---
 CODEC/Dockerfile | 58 ++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)
 create mode 100644 CODEC/Dockerfile

diff --git a/CODEC/Dockerfile b/CODEC/Dockerfile
new file mode 100644
index 0000000..4b389a3
--- /dev/null
+++ b/CODEC/Dockerfile
@@ -0,0 +1,58 @@
+FROM --platform=linux/amd64 ubuntu:20.04
+
+LABEL maintainer="lining@broadinstitute.org"
+
+ENV DEBIAN_FRONTEND noninteractive
+
+RUN apt-get update \
+    && apt-get install -y software-properties-common \
+    && add-apt-repository ppa:deadsnakes/ppa \
+    && apt-get update \
+    && apt-get install -y python3.8 python3.8-dev python3.8-venv python3-pip \
+    && pip3 install pandas argparse numpy pysam
+
+RUN apt-get install -y r-base r-base-dev
+
+RUN apt-get install -y \
+    git \
+    wget \
+    bwa \
+    libssl-dev \
+    g++ \
+    zlib1g-dev \
+    autoconf \
+    libbz2-dev \
+    liblzma-dev \
+    libcurl4-gnutls-dev \
+    libssl-dev \
+    build-essential \
+    software-properties-common
+
+RUN apt-get update -qq
+RUN apt-get install -y openjdk-11-jdk
+
+
+
+# Clone the CODECsuite repository
+RUN git clone --recursive https://github.com/broadinstitute/CODECsuite.git /CODECsuite
+
+
+RUN wget https://github.com/Kitware/CMake/releases/download/v3.28.0-rc3/cmake-3.28.0-rc3-linux-x86_64.tar.gz \
+    && tar -xzvf cmake-3.28.0-rc3-linux-x86_64.tar.gz \
+    && mv cmake-3.28.0-rc3-linux-x86_64 /opt/cmake-3.28 \
+    && ln -s /opt/cmake-3.28/bin/cmake /usr/local/bin/cmake \
+    && ln -s /opt/cmake-3.28/bin/ctest /usr/local/bin/ctest \
+    && ln -s /opt/cmake-3.28/bin/cpack /usr/local/bin/cpack \
+    && rm cmake-3.28.0-rc3-linux-x86_64.tar.gz
+
+RUN cd CODECsuite && \
+    mkdir build && \
+    cd build && \
+    cmake ..
&& \ + make + +COPY dependencies/ /dependencies/ +COPY reference_files/ /reference_files/ + +RUN cp /dependencies/samtools-1.9/samtools /usr/bin/ +RUN chmod +x /CODECsuite/snakemake/script/agg_log.py \ No newline at end of file From 801bdc431bd422a3eb150a133aa7dba9647606dd Mon Sep 17 00:00:00 2001 From: Stella Date: Fri, 16 Feb 2024 14:43:46 -0500 Subject: [PATCH 05/21] add inputs.json and modify .dockstore.yml to add 2 wdls to Dockstore --- .dockstore.yml | 17 +++++++- CODEC/SingleSampleCODEC.inputs.json | 63 +++++++++++++++++++++++++++++ CODEC/codec_bcl2fastq.inputs.json | 12 ++++++ CODEC/demux_CODEC.inputs.json | 21 ++++++++++ 4 files changed, 112 insertions(+), 1 deletion(-) create mode 100644 CODEC/SingleSampleCODEC.inputs.json create mode 100644 CODEC/codec_bcl2fastq.inputs.json create mode 100644 CODEC/demux_CODEC.inputs.json diff --git a/.dockstore.yml b/.dockstore.yml index 74db3c3..78edf41 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -118,4 +118,19 @@ workflows: subclass: WDL primaryDescriptorPath: /CleanupFailedSubmissions/Cleanup_Failed_Submissions.wdl testParameterFiles: - - /CleanupFailedSubmissions/Cleanup_Failed_Submissions.inputs.json \ No newline at end of file + - /CleanupFailedSubmissions/Cleanup_Failed_Submissions.inputs.json + - name: codec_bcl2fastq + subclass: WDL + primaryDescriptorPath: /CODEC/codec_bcl2fastq.wdl + testParameterFiles: + - /CODEC/codec_bcl2fastq.inputs.json + - name: demux_CODEC + subclass: WDL + primaryDescriptorPath: /CODEC/demux_CODEC.wdl + testParameterFiles: + - /CODEC/demux_CODEC.inputs.json + - name: SingleSampleCODEC + subclass: WDL + primaryDescriptorPath: /CODEC/SingleSampleCODEC.wdl + testParameterFiles: + - /CODEC/SingleSampleCODEC.inputs.json diff --git a/CODEC/SingleSampleCODEC.inputs.json b/CODEC/SingleSampleCODEC.inputs.json new file mode 100644 index 0000000..94a48f3 --- /dev/null +++ b/CODEC/SingleSampleCODEC.inputs.json @@ -0,0 +1,63 @@ +{ + "SingleSampleCODEC.AlignMolecularConsensusReads.cpu_cores": "${2}", + "SingleSampleCODEC.AlignMolecularConsensusReads.disk_size": "${}", + "SingleSampleCODEC.AlignMolecularConsensusReads.memory": "${}", + "SingleSampleCODEC.AlignMolecularConsensusReads.threads": "${8}", + "SingleSampleCODEC.AlignRawTrimmed.disk": "${64}", + "SingleSampleCODEC.AlignRawTrimmed.disk_size": "${}", + "SingleSampleCODEC.AlignRawTrimmed.mem": "${}", + "SingleSampleCODEC.AlignRawTrimmed.memory": "${64}", + "SingleSampleCODEC.CDSByProduct.disk_size": "${}", + "SingleSampleCODEC.CDSByProduct.mem": "${}", + "SingleSampleCODEC.CSS_SFC_ErrorMetrics.disk_size": "${800}", + "SingleSampleCODEC.CSS_SFC_ErrorMetrics.memory": "${}", + "SingleSampleCODEC.CollectConsensusWgsMetrics.disk_size": "${160}", + "SingleSampleCODEC.CollectConsensusWgsMetrics.memory": "${}", + "SingleSampleCODEC.CollectInsertSizeMetrics.disk_size": "${100}", + "SingleSampleCODEC.CollectInsertSizeMetrics.memory": "${32}", + "SingleSampleCODEC.CollectRawWgsMetrics.disk_size": "${160}", + "SingleSampleCODEC.CollectRawWgsMetrics.memory": "${}", + "SingleSampleCODEC.FgbioCollapseReadFamilies.disk_size": "${200}", + "SingleSampleCODEC.FgbioCollapseReadFamilies.memory": "${}", + "SingleSampleCODEC.GroupReadByUMI.disk_size": "${400}", + "SingleSampleCODEC.GroupReadByUMI.memory": "${32}", + "SingleSampleCODEC.MarkRawDuplicates.disk_size": "${200}", + "SingleSampleCODEC.MarkRawDuplicates.memory": "${64}", + "SingleSampleCODEC.MergeAndSortMoleculeConsensusReads.disk_size": "${160}", + "SingleSampleCODEC.MergeAndSortMoleculeConsensusReads.memory": "${64}", + 
"SingleSampleCODEC.MergeLogSplit.disk_size": "${}", + "SingleSampleCODEC.MergeLogSplit.mem": "${}", + "SingleSampleCODEC.MergeSplit.disk_size": "${200}", + "SingleSampleCODEC.MergeSplit.memory": "${32}", + "SingleSampleCODEC.RAW_SFC_ErrorMetrics.disk_size": "${800}", + "SingleSampleCODEC.RAW_SFC_ErrorMetrics.memory": "${32}", + "SingleSampleCODEC.ReplaceRawReadGroup.disk_size": "${200}", + "SingleSampleCODEC.ReplaceRawReadGroup.memory": "${32}", + "SingleSampleCODEC.SortBam.disk_size": "${200}", + "SingleSampleCODEC.SortBam.mem": "${}", + "SingleSampleCODEC.SplitFastq1.disk_size": "${400}", + "SingleSampleCODEC.SplitFastq1.memory": "${32}", + "SingleSampleCODEC.SplitFastq2.disk_size": "${400}", + "SingleSampleCODEC.SplitFastq2.memory": "${32}", + "SingleSampleCODEC.Trim.disk_size": "${64}", + "SingleSampleCODEC.Trim.mem": "${32}", + "SingleSampleCODEC.ZipperBamAlignment.disk_size": "${200}", + "SingleSampleCODEC.ZipperBamAlignment.mem": "${32}", + "SingleSampleCODEC.eval_genome_bed": "gs://gptag/CODEC/ddbtp_codec_easy_regions.hg38.bed", + "SingleSampleCODEC.eval_genome_interval": "gs://gptag/CODEC/ddbtp_codec_easy_regions.hg38.interval_list", + "SingleSampleCODEC.fastq1": "${this.fastq1}", + "SingleSampleCODEC.fastq2": "${this.fastq2}", + "SingleSampleCODEC.germline_bam": "${this.germline_bam}", + "SingleSampleCODEC.germline_bam_index": "${this.germline_bam_index}", + "SingleSampleCODEC.num_parallel": "${40}", + "SingleSampleCODEC.reference_amb": "${workspace.referenceData_hg38_ref_amb}", + "SingleSampleCODEC.reference_ann": "${workspace.referenceData_hg38_ref_ann}", + "SingleSampleCODEC.reference_bwt": "${workspace.referenceData_hg38_ref_bwt}", + "SingleSampleCODEC.reference_dict": "${workspace.referenceData_hg38_ref_dict}", + "SingleSampleCODEC.reference_fasta": "${workspace.referenceData_hg38_ref_fasta}", + "SingleSampleCODEC.reference_fasta_index": "${workspace.referenceData_hg38_ref_fasta_index}", + "SingleSampleCODEC.reference_pac": "${workspace.referenceData_hg38_ref_pac}", + "SingleSampleCODEC.reference_sa": "${workspace.referenceData_hg38_ref_sa}", + "SingleSampleCODEC.sample_id": "${this.sample_id}", + "SingleSampleCODEC.sort_memory": "2G" +} \ No newline at end of file diff --git a/CODEC/codec_bcl2fastq.inputs.json b/CODEC/codec_bcl2fastq.inputs.json new file mode 100644 index 0000000..bcef386 --- /dev/null +++ b/CODEC/codec_bcl2fastq.inputs.json @@ -0,0 +1,12 @@ +{ + "codec_bcl2fastq.delete_input_bcl_directory": "${false}", + "codec_bcl2fastq.disk_space": "${3000}", + "codec_bcl2fastq.input_bcl_directory": "gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0/1705429346/", + "codec_bcl2fastq.memory": "${}", + "codec_bcl2fastq.output_directory": "gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0", + "codec_bcl2fastq.preemptible": "${4}", + "codec_bcl2fastq.read_threads": "${8}", + "codec_bcl2fastq.run_bcl2fastq.num_cpu": "${2}", + "codec_bcl2fastq.run_bcl2fastq.run_id": "${}", + "codec_bcl2fastq.write_threads": "${8}" +} \ No newline at end of file diff --git a/CODEC/demux_CODEC.inputs.json b/CODEC/demux_CODEC.inputs.json new file mode 100644 index 0000000..aa785e5 --- /dev/null +++ b/CODEC/demux_CODEC.inputs.json @@ -0,0 +1,21 @@ +{ + "demux_CODEC.Demux.disk_size": "${200}", + "demux_CODEC.Demux.memory": "${64}", + "demux_CODEC.GroupFastq1.disk_size": "${1600}", + "demux_CODEC.GroupFastq2.disk_size": "${1600}", + "demux_CODEC.SplitFastq1.disk_size": "${2048}", + "demux_CODEC.SplitFastq1.memory": "${128}", + "demux_CODEC.SplitFastq2.disk_size": "${2048}", + "demux_CODEC.SplitFastq2.memory": 
"${128}", + "demux_CODEC.UploadDataTable.namespace": "broadtagteam", + "demux_CODEC.batch_ids": "${[\"L001\",\"L002\"]}", + "demux_CODEC.concatenate_and_compress_fastq1.disk_size": "${100}", + "demux_CODEC.concatenate_and_compress_fastq1.memory": "${64}", + "demux_CODEC.concatenate_and_compress_fastq2.disk_size": "${100}", + "demux_CODEC.concatenate_and_compress_fastq2.memory": "${64}", + "demux_CODEC.fastq1": "${[\"gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0/1705429346_fastqs/Undetermined_S0_L001_R1_001.fastq.gz\",\"gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0/1705429346_fastqs/Undetermined_S0_L002_R1_001.fastq.gz\"]}", + "demux_CODEC.fastq2": "${[\"gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0/1705429346_fastqs/Undetermined_S0_L001_R2_001.fastq.gz\",\"gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0/1705429346_fastqs/Undetermined_S0_L002_R2_001.fastq.gz\"]}", + "demux_CODEC.num_parallel": "${40}", + "demux_CODEC.sample_sheet": "${[\"gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0/sample_sheet/sample_sheet_L001.csv\",\"gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0/sample_sheet/sample_sheet_L002.csv\"]}", + "demux_CODEC.workspace": "CODEC_example_workspace" +} \ No newline at end of file From 110f8ac4da7ccfadfc0207e776baa50708453f42 Mon Sep 17 00:00:00 2001 From: Stella Date: Wed, 28 Feb 2024 15:03:23 -0500 Subject: [PATCH 06/21] change duplication rate calculation, add duplex efficiency calculation, get rid of rawsfc task, and modified CollectRawWgsMetrics part --- CODEC/SingleSampleCODEC.wdl | 199 +++++++++--------------------------- 1 file changed, 47 insertions(+), 152 deletions(-) diff --git a/CODEC/SingleSampleCODEC.wdl b/CODEC/SingleSampleCODEC.wdl index a12d100..bcd46c9 100644 --- a/CODEC/SingleSampleCODEC.wdl +++ b/CODEC/SingleSampleCODEC.wdl @@ -137,16 +137,7 @@ workflow SingleSampleCODEC { sample_id = sample_id, sort_memory = sort_memory } - call CollectRawWgsMetrics { - input: - ReplaceRGBam = MarkRawDuplicates.dup_marked_bam, - sample_id = sample_id, - reference_fasta = reference_fasta, - reference_fasta_index = reference_fasta_index, - reference_dict = reference_dict, - eval_genome_interval = eval_genome_interval - } - call CollectConsensusWgsMetrics { + call CollectWgsMetrics { input: ConsensusAlignedBam = MergeAndSortMoleculeConsensusReads.bam, ConsensusAlignedBai = MergeAndSortMoleculeConsensusReads.bai, @@ -173,30 +164,12 @@ workflow SingleSampleCODEC { germline_bam_index = germline_bam_index, eval_genome_bed = eval_genome_bed } - call RAW_SFC_ErrorMetrics { - input: - ReplaceRGBam = MarkRawDuplicates.dup_marked_bam, - ReplaceRGBai = MarkRawDuplicates.dup_marked_bai, - sample_id = sample_id, - reference_fasta = reference_fasta, - reference_fasta_index = reference_fasta_index, - reference_dict = reference_dict, - reference_pac = reference_pac, - reference_amb = reference_amb, - reference_ann = reference_ann, - reference_bwt = reference_bwt, - reference_sa = reference_sa, - germline_bam = germline_bam, - germline_bam_index = germline_bam_index, - eval_genome_bed = eval_genome_bed - } call QC_metrics { input: byproduct_metrics = CDSByProduct.byproduct_metrics, - RawWgsMetrics = CollectRawWgsMetrics.RawWgsMetrics, - DuplicationMetrics = MarkRawDuplicates.dup_metrics, + WgsMetrics = CollectWgsMetrics.WgsMetrics, + umiHistogram = GroupReadByUMI.umi_histogram, InsertSizeMetrics = CollectInsertSizeMetrics.insert_size_metrics, - ConsensusWgsMetrics = CollectConsensusWgsMetrics.ConsensusWgsMetrics, mutant_metrics = CSS_SFC_ErrorMetrics.mutant_metrics, } call EvalGenomeBases { @@ -206,7 
+179,9 @@ workflow SingleSampleCODEC { call CalculateDuplexDepth { input: eval_genome_bases = EvalGenomeBases.eval_genome_bases, - n_bases_eval = QC_metrics.n_bases_eval + n_bases_eval = QC_metrics.n_bases_eval, + mean_insert_size = QC_metrics.mean_insert_size, + n_total_fastq = QC_metrics.n_total_fastq } output { @@ -218,11 +193,8 @@ workflow SingleSampleCODEC { File InsertSizeMetrics = CollectInsertSizeMetrics.insert_size_metrics File InsertSizeHistogram = CollectInsertSizeMetrics.insert_size_histogram File DuplicationMetrics = MarkRawDuplicates.dup_metrics - File RawWgsMetrics = CollectRawWgsMetrics.RawWgsMetrics - File ConsensusWgsMetrics = CollectConsensusWgsMetrics.ConsensusWgsMetrics - File raw_mutant_metrics = RAW_SFC_ErrorMetrics.raw_mutant_metrics - File raw_context_count = RAW_SFC_ErrorMetrics.raw_context_count - File raw_variants_called = RAW_SFC_ErrorMetrics.raw_variants_called + File umiHistogram = GroupReadByUMI.umi_histogram + File WgsMetrics = CollectWgsMetrics.WgsMetrics File mutant_metrics = CSS_SFC_ErrorMetrics.mutant_metrics File context_count = CSS_SFC_ErrorMetrics.context_count File variants_called = CSS_SFC_ErrorMetrics.variants_called @@ -238,11 +210,9 @@ workflow SingleSampleCODEC { Float pct_adp_dimer = QC_metrics.pct_adp_dimer Float raw_dedupped_mean_cov = QC_metrics.raw_dedupped_mean_cov Int raw_dedupped_median_cov = QC_metrics.raw_dedupped_median_cov - Float raw_duplication_rate = QC_metrics.raw_duplication_rate + Float duplication_rate = QC_metrics.duplication_rate Float mean_insert_size = QC_metrics.mean_insert_size Int median_insert_size = QC_metrics.median_insert_size - Float mol_consensus_mean_cov = QC_metrics.mol_consensus_mean_cov - Int mol_consensus_median_cov = QC_metrics.mol_consensus_median_cov Int n_snv = QC_metrics.n_snv Int n_indel = QC_metrics.n_indel String n_bases_eval = QC_metrics.n_bases_eval @@ -250,6 +220,8 @@ workflow SingleSampleCODEC { Float indel_rate = QC_metrics.indel_rate String eval_genome_bases = EvalGenomeBases.eval_genome_bases Float duplex_depth = CalculateDuplexDepth.duplex_depth + Float duplex_efficiency = CalculateDuplexDepth.duplex_efficiency + } } @@ -739,38 +711,7 @@ task MergeAndSortMoleculeConsensusReads { } } -task CollectRawWgsMetrics { - input { - File ReplaceRGBam - String sample_id - File reference_fasta - File reference_fasta_index - File reference_dict - File eval_genome_interval - Int memory = 64 - Int disk_size = 200 - } - - - command { - java -jar /dependencies/picard.jar CollectWgsMetrics \ - I=~{ReplaceRGBam} O=~{sample_id}.raw.wgs_metrics.txt R=~{reference_fasta} INTERVALS=~{eval_genome_interval} \ - COUNT_UNPAIRED=true MINIMUM_BASE_QUALITY=0 MINIMUM_MAPPING_QUALITY=0 - } - - output { - File RawWgsMetrics = "~{sample_id}.raw.wgs_metrics.txt" - } - - runtime { - memory: memory + " GB" - docker: "us.gcr.io/tag-team-160914/codec:v1" - disks: "local-disk " + disk_size + " HDD" - preemptible: 3 - } -} - -task CollectConsensusWgsMetrics { +task CollectWgsMetrics { input { File ConsensusAlignedBam File ConsensusAlignedBai @@ -785,12 +726,12 @@ task CollectConsensusWgsMetrics { command { java -jar /dependencies/picard.jar CollectWgsMetrics \ - I=~{ConsensusAlignedBam} O=~{sample_id}.mol_consensus.wgs_metrics.txt R=~{reference_fasta} INTERVALS=~{eval_genome_interval} \ - INCLUDE_BQ_HISTOGRAM=true MINIMUM_BASE_QUALITY=30 + I=~{ConsensusAlignedBam} O=~{sample_id}.wgs_metrics.txt R=~{reference_fasta} INTERVALS=~{eval_genome_interval} \ + COUNT_UNPAIRED=true MINIMUM_BASE_QUALITY=0 MINIMUM_MAPPING_QUALITY=0 } output { - 
File ConsensusWgsMetrics = "~{sample_id}.mol_consensus.wgs_metrics.txt" + File WgsMetrics = "~{sample_id}.wgs_metrics.txt" } runtime { @@ -862,72 +803,13 @@ task CSS_SFC_ErrorMetrics { } } -task RAW_SFC_ErrorMetrics { - input { - File ReplaceRGBam - File ReplaceRGBai - String sample_id - File reference_fasta - File reference_fasta_index - File reference_dict - File reference_pac - File reference_amb - File reference_ann - File reference_bwt - File reference_sa - File germline_bam - File germline_bam_index - File eval_genome_bed - File population_based_vcf = "gs://gptag/CODEC/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz" - File population_based_vcf_index = "gs://gptag/CODEC/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz.tbi" - Int memory = 32 - Int disk_size = 200 - } - command { - /CODECsuite/build/codec call -b ~{ReplaceRGBam} \ - -L ~{eval_genome_bed} \ - -r ~{reference_fasta} \ - -m 60 \ - -n ~{germline_bam} \ - -q 30 \ - -d 12 \ - -V ~{population_based_vcf} \ - -x 6 \ - -c 4 \ - -5 \ - -g 30 \ - -G 250 \ - -Q 0.6 \ - -B 0.6 \ - -N 0.1 \ - -Y 5 \ - -W 1 \ - -a ~{sample_id}.raw.mutant_metrics.txt \ - -e ~{sample_id}.raw.variants_called.txt \ - -C ~{sample_id}.raw.context_count.txt - } - - output { - File raw_mutant_metrics = "~{sample_id}.raw.mutant_metrics.txt" - File raw_variants_called = "~{sample_id}.raw.variants_called.txt" - File raw_context_count = "~{sample_id}.raw.context_count.txt" - } - - runtime { - memory: memory + " GB" - docker: "us.gcr.io/tag-team-160914/codec:v1" - disks: "local-disk " + disk_size + " HDD" - preemptible: 3 - } -} task QC_metrics { input { File byproduct_metrics - File RawWgsMetrics - File DuplicationMetrics + File WgsMetrics + File umiHistogram File InsertSizeMetrics - File ConsensusWgsMetrics File mutant_metrics Int memory = 16 Int disk_size = 16 @@ -935,6 +817,16 @@ task QC_metrics { } command <<< set -e + + # Use umi histogram to calculate duplication rate + cat ~{umiHistogram} | awk -F'\t' 'NR > 1 { + numerator += ($1 - 1) * $2; + denominator += $1 * $2; + } END { + if (denominator > 0) print numerator / denominator; + else print "NA"; + }' > duplication_rate.txt + cat ~{byproduct_metrics} | awk 'NR==2 {print $NF}' > n_total_fastq.txt cat ~{byproduct_metrics} | awk 'NR==2 {print $9}' > n_correct.txt cat ~{byproduct_metrics} | awk 'NR==2 {print $2}' > pct_correct.txt @@ -944,13 +836,10 @@ task QC_metrics { cat ~{byproduct_metrics} | awk 'NR==2 {print $5}' > pct_intermol.txt cat ~{byproduct_metrics} | awk 'NR==2 {print $11}' > n_adp_dimer.txt cat ~{byproduct_metrics} | awk 'NR==2 {print $4}' > pct_adp_dimer.txt - cat ~{RawWgsMetrics} | grep -v "#" | awk 'NR==3 {print $2}' > raw_dedupped_mean_cov.txt - cat ~{RawWgsMetrics} | grep -v "#" | awk 'NR==3 {print $4}' > raw_dedupped_median_cov.txt - cat ~{DuplicationMetrics} | grep -v "#" | awk 'NR==3 {print $9}' > raw_duplication_rate.txt + cat ~{WgsMetrics} | grep -v "#" | awk 'NR==3 {print $2}' > raw_dedupped_mean_cov.txt + cat ~{WgsMetrics} | grep -v "#" | awk 'NR==3 {print $4}' > raw_dedupped_median_cov.txt cat ~{InsertSizeMetrics} | grep -v "#" | awk 'NR==3 {print $1}' > median_insert_size.txt cat ~{InsertSizeMetrics} | grep -v "#" | awk 'NR==3 {print $6}' > mean_insert_size.txt - cat ~{ConsensusWgsMetrics} | grep -v "#" | awk 'NR==3 {print $2}' > mol_consensus_mean_cov.txt - cat ~{ConsensusWgsMetrics} | grep -v "#" | awk 'NR==3 {print $4}' > mol_consensus_median_cov.txt cat ~{mutant_metrics} | awk 'NR==2 {print $8}' > n_snv.txt cat ~{mutant_metrics} | awk 'NR==2 {print $10}' > n_indel.txt cat 
~{mutant_metrics} | awk 'NR==2 {print $3}' > n_bases_eval.txt @@ -960,6 +849,7 @@ task QC_metrics { >>> output { + Float duplication_rate = read_float("duplication_rate.txt") Int n_total_fastq = read_int("n_total_fastq.txt") Int n_correct_products = read_int("n_correct.txt") Float pct_correct_products = read_float("pct_correct.txt") @@ -971,11 +861,8 @@ task QC_metrics { Float pct_adp_dimer = read_float("pct_adp_dimer.txt") Float raw_dedupped_mean_cov = read_float("raw_dedupped_mean_cov.txt") Int raw_dedupped_median_cov = read_int("raw_dedupped_median_cov.txt") - Float raw_duplication_rate = read_float("raw_duplication_rate.txt") Float mean_insert_size = read_float("mean_insert_size.txt") Int median_insert_size = read_int("median_insert_size.txt") - Float mol_consensus_mean_cov = read_float("mol_consensus_mean_cov.txt") - Int mol_consensus_median_cov = read_int("mol_consensus_median_cov.txt") Int n_snv = read_int("n_snv.txt") Int n_indel = read_int("n_indel.txt") String n_bases_eval = read_string("n_bases_eval.txt") @@ -1020,27 +907,35 @@ task CalculateDuplexDepth { input { String eval_genome_bases String n_bases_eval + Float mean_insert_size + Int n_total_fastq Int memory = 16 Int disk_size = 8 } - command <<< - + command <<< python3 <>> output { - Float duplex_depth = read_float(stdout()) + Float duplex_depth = read_float('duplex_depth.txt') + Float duplex_efficiency = read_float('duplex_efficiency.txt') } runtime { memory: memory + " GB" From 7bb7ede7c042ac51a7a6b14864be8b089b56ce97 Mon Sep 17 00:00:00 2001 From: Stella Date: Thu, 29 Feb 2024 14:56:32 -0500 Subject: [PATCH 07/21] add ceil() to help with disk size allocation --- CODEC/SingleSampleCODEC.wdl | 22 +++++++++++++--------- CODEC/codec_bcl2fastq.wdl | 35 +++++++++++++++++++++++++++++++---- CODEC/demux_CODEC.wdl | 8 ++++---- CODEC/prep_codec_metadata.py | 12 +++++++++--- 4 files changed, 57 insertions(+), 20 deletions(-) diff --git a/CODEC/SingleSampleCODEC.wdl b/CODEC/SingleSampleCODEC.wdl index bcd46c9..b29727c 100644 --- a/CODEC/SingleSampleCODEC.wdl +++ b/CODEC/SingleSampleCODEC.wdl @@ -231,7 +231,8 @@ task SplitFastq1 { String sample_id Int nsplit Int memory = 64 - Int disk_size = 200 + Int? extra_disk + Int disk_size = ceil(size(fastq_read1, "GB") * 15) + select_first([extra_disk, 0]) } command <<< @@ -258,7 +259,8 @@ task SplitFastq2 { Int nsplit String sample_id Int memory = 64 - Int disk_size = 200 + Int? extra_disk + Int disk_size = ceil(size(fastq_read2, "GB") * 15) + select_first([extra_disk, 0]) } command <<< @@ -286,7 +288,8 @@ task Trim { String output_prefix Int split Int mem = 16 - Int disk_size = 32 + Int? extra_disk + Int disk_size = ceil(size(read1, "GB") * 8) + select_first([extra_disk, 0]) } command { @@ -317,10 +320,9 @@ task AlignRawTrimmed { File reference_ann File reference_bwt File reference_sa - Int? memory - Int? disk - Int mem = select_first([memory, 16]) - Int disk_size = select_first([disk, 32]) + Int mem = 16 + Int? extra_disk + Int disk_size = ceil(size(bam_input, "GB") * 8) + select_first([extra_disk, 0]) String sample_id Int split } @@ -584,7 +586,8 @@ task GroupReadByUMI { File input_bam String sample_id Int memory = 64 - Int disk_size = 200 + Int? extra_disk + Int disk_size = ceil(size(input_bam, "GB") * 15) + select_first([extra_disk, 0]) } command { @@ -614,7 +617,8 @@ task FgbioCollapseReadFamilies { File grouped_umi_bam String sample_id Int memory = 64 - Int disk_size = 200 + Int? 
extra_disk + Int disk_size = ceil(size(grouped_umi_bam, "GB") * 10) + select_first([extra_disk, 0]) } command { diff --git a/CODEC/codec_bcl2fastq.wdl b/CODEC/codec_bcl2fastq.wdl index 4b217a8..45670fc 100644 --- a/CODEC/codec_bcl2fastq.wdl +++ b/CODEC/codec_bcl2fastq.wdl @@ -7,10 +7,14 @@ workflow codec_bcl2fastq { Int read_threads Int write_threads String memory = "128G" - Int disk_space = 2000 Int preemptible = 3 Boolean delete_input_bcl_directory } + call GetBclBucketSize { + input: + input_bcl_directory = input_bcl_directory, + memory = 16 + } call run_bcl2fastq { input: @@ -19,7 +23,7 @@ workflow codec_bcl2fastq { read_threads = read_threads, write_threads = write_threads, memory = memory, - disk_space = disk_space, + input_bcl_size = GetBclBucketSize.input_bcl_size, preemptible = preemptible, delete_input_bcl_directory = delete_input_bcl_directory } @@ -29,18 +33,41 @@ workflow codec_bcl2fastq { } } +task GetBclBucketSize { + input { + String input_bcl_directory + Int memory = 16 + } + + command <<< + gsutil du -s ${input_bcl_directory} | awk '{print $1/1024/1024/1024}' > input_bcl_size.txt + >>> + + output { + Float input_bcl_size = read_float("input_bcl_size.txt") + } + + runtime { + docker: "us.gcr.io/tag-team-160914/bcl2fastq_codec:v1" + memory: memory + " GB" + disks: "local-disk 16 HDD" + } +} + task run_bcl2fastq { input { String input_bcl_directory String output_directory String memory - Int disk_space - Int preemptible + Float input_bcl_size + Int disk_space = ceil(input_bcl_size * 1.5) + select_first([extra_disk, 0]) + Int? extra_disk Int read_threads = 4 Int write_threads = 4 String run_id = basename(input_bcl_directory) Boolean delete_input_bcl_directory Int num_cpu = 2 + Int preemptible } command { diff --git a/CODEC/demux_CODEC.wdl b/CODEC/demux_CODEC.wdl index 4a0dc00..21200cf 100644 --- a/CODEC/demux_CODEC.wdl +++ b/CODEC/demux_CODEC.wdl @@ -3,8 +3,8 @@ version 1.0 workflow demux_CODEC { input { - Array[File] fastq1 - Array[File] fastq2 + Array[File] multisample_fastq1 + Array[File] multisample_fastq2 Array[String] batch_ids Int num_parallel Array[File] sample_sheet @@ -14,13 +14,13 @@ workflow demux_CODEC { call SplitFastq1 { input: batch_id =batch_ids[batch_index], - fastq_read1 = fastq1[batch_index], + fastq_read1 = multisample_fastq1[batch_index], nsplit = num_parallel } call SplitFastq2 { input: batch_id =batch_ids[batch_index], - fastq_read2 = fastq2[batch_index], + fastq_read2 = multisample_fastq2[batch_index], nsplit = num_parallel } scatter (split_index in range(num_parallel)) { diff --git a/CODEC/prep_codec_metadata.py b/CODEC/prep_codec_metadata.py index 691b1ec..835b3da 100644 --- a/CODEC/prep_codec_metadata.py +++ b/CODEC/prep_codec_metadata.py @@ -1,11 +1,17 @@ -import pandas as pd +# Running this script is a prerequisite for demux_CODEC wdl. +# This script is taking an Excel file provided by collaborator +# with Barcode assigned to each sample and a default Index reference file, +# to generate a sample_sheet_L00{lane}.csv file for each lane. 
+ + import sys +import pandas as pd + if len(sys.argv) != 3: - print("Usage: python3 script.py ") + print("Usage: python3 prep_codec_metadata.py ") sys.exit(1) -# Assigning command-line arguments to variables metadata_excel_file = sys.argv[1] index_csv_file = sys.argv[2] From 0feeada03b7abdc0e80fe1b69cf4a318db1a5c74 Mon Sep 17 00:00:00 2001 From: Stella Date: Fri, 1 Mar 2024 15:16:47 -0500 Subject: [PATCH 08/21] minor changes to task name and QC metrics collection --- CODEC/SingleSampleCODEC.wdl | 41 ++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 19 deletions(-) diff --git a/CODEC/SingleSampleCODEC.wdl b/CODEC/SingleSampleCODEC.wdl index b29727c..8f50358 100644 --- a/CODEC/SingleSampleCODEC.wdl +++ b/CODEC/SingleSampleCODEC.wdl @@ -83,7 +83,7 @@ workflow SingleSampleCODEC { bam_file = MergeSplit.merged_bam, sample_id = sample_id } - call CDSByProduct { + call ByProductMetrics { input: trim_log = MergeLogSplit.merged_log, highconf_bam = SortBam.sorted_bam, @@ -166,7 +166,7 @@ workflow SingleSampleCODEC { } call QC_metrics { input: - byproduct_metrics = CDSByProduct.byproduct_metrics, + byproduct_metrics = ByProductMetrics.byproduct_metrics, WgsMetrics = CollectWgsMetrics.WgsMetrics, umiHistogram = GroupReadByUMI.umi_histogram, InsertSizeMetrics = CollectInsertSizeMetrics.insert_size_metrics, @@ -185,7 +185,7 @@ workflow SingleSampleCODEC { } output { - File byproduct_metrics = CDSByProduct.byproduct_metrics + File byproduct_metrics = ByProductMetrics.byproduct_metrics File RAW_BAM = MergeSplit.merged_bam File RAW_BAM_index = MergeSplit.merged_bai File MolConsensusBAM = MergeAndSortMoleculeConsensusReads.bam @@ -463,7 +463,7 @@ task SortBam { } } -task CDSByProduct { +task ByProductMetrics { input { File trim_log File highconf_bam @@ -831,24 +831,27 @@ task QC_metrics { else print "NA"; }' > duplication_rate.txt - cat ~{byproduct_metrics} | awk 'NR==2 {print $NF}' > n_total_fastq.txt - cat ~{byproduct_metrics} | awk 'NR==2 {print $9}' > n_correct.txt - cat ~{byproduct_metrics} | awk 'NR==2 {print $2}' > pct_correct.txt - cat ~{byproduct_metrics} | awk 'NR==2 {print $10}' > n_double_ligation.txt - cat ~{byproduct_metrics} | awk 'NR==2 {print $3}' > pct_double_ligation.txt - cat ~{byproduct_metrics} | awk 'NR==2 {print $12}' > n_intermol.txt - cat ~{byproduct_metrics} | awk 'NR==2 {print $5}' > pct_intermol.txt - cat ~{byproduct_metrics} | awk 'NR==2 {print $11}' > n_adp_dimer.txt - cat ~{byproduct_metrics} | awk 'NR==2 {print $4}' > pct_adp_dimer.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_total") col=i} NR==2 {print $col}' ~{byproduct_metrics} > n_total_fastq.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_correct") col=i} NR==2 {print $col}' ~{byproduct_metrics} > n_correct.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="pct_correct") col=i} NR==2 {print $col}' ~{byproduct_metrics} > pct_correct.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_double_ligation") col=i} NR==2 {print $col}' ~{byproduct_metrics} > n_double_ligation.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="pct_double_ligation") col=i} NR==2 {print $col}' ~{byproduct_metrics} > pct_double_ligation.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_intermol") col=i} NR==2 {print $col}' ~{byproduct_metrics} > n_intermol.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="pct_intermol") col=i} NR==2 {print $col}' ~{byproduct_metrics} > pct_intermol.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_adp_dimer") col=i} NR==2 {print $col}' ~{byproduct_metrics} > n_adp_dimer.txt + awk 'NR==1 
{for (i=1; i<=NF; i++) if ($i=="pct_adp_dimer") col=i} NR==2 {print $col}' ~{byproduct_metrics} > pct_adp_dimer.txt + cat ~{WgsMetrics} | grep -v "#" | awk 'NR==3 {print $2}' > raw_dedupped_mean_cov.txt cat ~{WgsMetrics} | grep -v "#" | awk 'NR==3 {print $4}' > raw_dedupped_median_cov.txt cat ~{InsertSizeMetrics} | grep -v "#" | awk 'NR==3 {print $1}' > median_insert_size.txt cat ~{InsertSizeMetrics} | grep -v "#" | awk 'NR==3 {print $6}' > mean_insert_size.txt - cat ~{mutant_metrics} | awk 'NR==2 {print $8}' > n_snv.txt - cat ~{mutant_metrics} | awk 'NR==2 {print $10}' > n_indel.txt - cat ~{mutant_metrics} | awk 'NR==2 {print $3}' > n_bases_eval.txt - cat ~{mutant_metrics} | awk 'NR==2 {print $9}' > snv_rate.txt - cat ~{mutant_metrics} | awk 'NR==2 {print $11}' > indel_rate.txt + + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_snv") col=i} NR==2 {print $col}' ~{mutant_metrics} > n_snv.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_indel") col=i} NR==2 {print $col}' ~{mutant_metrics} > n_indel.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_bases_eval") col=i} NR==2 {print $col}' ~{mutant_metrics} > n_bases_eval.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="snv_rate") col=i} NR==2 {print $col}' ~{mutant_metrics} > snv_rate.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="indel_rate") col=i} NR==2 {print $col}' ~{mutant_metrics} > indel_rate.txt + >>> @@ -947,4 +950,4 @@ task CalculateDuplexDepth { disks: "local-disk " + disk_size + " HDD" preemptible: 1 } -} +} \ No newline at end of file From 97bcfef6936c741f1b1c0e982b53657f29d7635b Mon Sep 17 00:00:00 2001 From: Stella Date: Fri, 1 Mar 2024 16:30:11 -0500 Subject: [PATCH 09/21] minor changes to disk size input --- CODEC/SingleSampleCODEC.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CODEC/SingleSampleCODEC.wdl b/CODEC/SingleSampleCODEC.wdl index 8f50358..6a4fb08 100644 --- a/CODEC/SingleSampleCODEC.wdl +++ b/CODEC/SingleSampleCODEC.wdl @@ -322,7 +322,7 @@ task AlignRawTrimmed { File reference_sa Int mem = 16 Int? 
extra_disk - Int disk_size = ceil(size(bam_input, "GB") * 8) + select_first([extra_disk, 0]) + Int disk_size = ceil(size(bam_input, "GB") * 20) + select_first([extra_disk, 0]) String sample_id Int split } From d71532ff36d14ae37a14b9bf89e30ff6d76f3e5f Mon Sep 17 00:00:00 2001 From: Stella Date: Fri, 3 May 2024 11:49:17 -0400 Subject: [PATCH 10/21] change variant calling parameters and remove MarkDuplicated task --- CODEC/SingleSampleCODEC.wdl | 53 +++++-------------------------------- 1 file changed, 7 insertions(+), 46 deletions(-) diff --git a/CODEC/SingleSampleCODEC.wdl b/CODEC/SingleSampleCODEC.wdl index 6a4fb08..c292918 100644 --- a/CODEC/SingleSampleCODEC.wdl +++ b/CODEC/SingleSampleCODEC.wdl @@ -94,19 +94,14 @@ workflow SingleSampleCODEC { raw_bam = MergeSplit.merged_bam, sample_id = sample_id } - call MarkRawDuplicates { - input: - input_bam = ReplaceRawReadGroup.bam, - sample_id = sample_id - } call CollectInsertSizeMetrics { input: - input_bam = MarkRawDuplicates.dup_marked_bam, + input_bam = ReplaceRawReadGroup.bam, sample_id = sample_id } call GroupReadByUMI { input: - input_bam = MarkRawDuplicates.dup_marked_bam, + input_bam = ReplaceRawReadGroup.bam, sample_id = sample_id } call FgbioCollapseReadFamilies { @@ -192,7 +187,6 @@ workflow SingleSampleCODEC { File MolConsensusBAM_index = MergeAndSortMoleculeConsensusReads.bai File InsertSizeMetrics = CollectInsertSizeMetrics.insert_size_metrics File InsertSizeHistogram = CollectInsertSizeMetrics.insert_size_histogram - File DuplicationMetrics = MarkRawDuplicates.dup_metrics File umiHistogram = GroupReadByUMI.umi_histogram File WgsMetrics = CollectWgsMetrics.WgsMetrics File mutant_metrics = CSS_SFC_ErrorMetrics.mutant_metrics @@ -520,39 +514,6 @@ task ReplaceRawReadGroup { } } -task MarkRawDuplicates { - input { - File input_bam - String sample_id - Int memory = 64 - Int disk_size = 200 - } - - command { - java -jar /dependencies/picard.jar MarkDuplicates \ - I=~{input_bam} \ - O=~{sample_id}.raw.replacerg.markdup.bam \ - M=~{sample_id}.raw.marked_duplicates.txt \ - CREATE_INDEX=true \ - TAG_DUPLICATE_SET_MEMBERS=true \ - TAGGING_POLICY=All - - samtools index ~{sample_id}.raw.replacerg.markdup.bam - } - - output { - File dup_marked_bam = "~{sample_id}.raw.replacerg.markdup.bam" - File dup_marked_bai = "~{sample_id}.raw.replacerg.markdup.bam.bai" - File dup_metrics = "~{sample_id}.raw.marked_duplicates.txt" - } - - runtime { - memory: memory + " GB" - docker: "us.gcr.io/tag-team-160914/codec:v1" - disks: "local-disk " + disk_size + " HDD" - } -} - task CollectInsertSizeMetrics { input { File input_bam @@ -724,7 +685,7 @@ task CollectWgsMetrics { File reference_fasta_index File reference_dict File eval_genome_interval - Int memory = 64 + Int memory = 32 Int disk_size = 200 } @@ -786,7 +747,7 @@ task CSS_SFC_ErrorMetrics { -Q 0.7 \ -B 0.6 \ -N 0.05 \ - -Y 5 \ + -Y 10 \ -W 1 \ -a ~{sample_id}.mutant_metrics.txt \ -e ~{sample_id}.variants_called.txt \ @@ -815,7 +776,7 @@ task QC_metrics { File umiHistogram File InsertSizeMetrics File mutant_metrics - Int memory = 16 + Int memory = 32 Int disk_size = 16 } @@ -916,8 +877,8 @@ task CalculateDuplexDepth { String n_bases_eval Float mean_insert_size Int n_total_fastq - Int memory = 16 - Int disk_size = 8 + Int memory = 32 + Int disk_size = 64 } command <<< From c2347f1b2f8dd210dd5d54398e46554e53bcbaca Mon Sep 17 00:00:00 2001 From: Stella Date: Thu, 23 May 2024 11:10:31 -0400 Subject: [PATCH 11/21] change docker images since it is now public --- CODEC/codec_bcl2fastq.wdl | 4 ++-- 1 file 
changed, 2 insertions(+), 2 deletions(-) diff --git a/CODEC/codec_bcl2fastq.wdl b/CODEC/codec_bcl2fastq.wdl index 45670fc..0d9a26b 100644 --- a/CODEC/codec_bcl2fastq.wdl +++ b/CODEC/codec_bcl2fastq.wdl @@ -48,7 +48,7 @@ task GetBclBucketSize { } runtime { - docker: "us.gcr.io/tag-team-160914/bcl2fastq_codec:v1" + docker: "us.gcr.io/tag-public/bcl2fastq_codec:v1" memory: memory + " GB" disks: "local-disk 16 HDD" } @@ -108,7 +108,7 @@ task run_bcl2fastq { } runtime { - docker: "us.gcr.io/tag-team-160914/bcl2fastq_codec:v1" + docker: "us.gcr.io/tag-public/bcl2fastq_codec:v1" memory: memory bootDiskSizeGb: 12 disks: "local-disk ${disk_space} HDD" From a73fe6c5fdfbec06503e858c225b24043f032266 Mon Sep 17 00:00:00 2001 From: Stella Date: Thu, 23 May 2024 11:49:32 -0400 Subject: [PATCH 12/21] not yet pushing to public --- CODEC/codec_bcl2fastq.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CODEC/codec_bcl2fastq.wdl b/CODEC/codec_bcl2fastq.wdl index 0d9a26b..713c23d 100644 --- a/CODEC/codec_bcl2fastq.wdl +++ b/CODEC/codec_bcl2fastq.wdl @@ -48,7 +48,7 @@ task GetBclBucketSize { } runtime { - docker: "us.gcr.io/tag-public/bcl2fastq_codec:v1" + docker: "us-central1-docker.pkg.dev/tag-team-160914/gptag-dockers/bcl2fastq_codec:v1" memory: memory + " GB" disks: "local-disk 16 HDD" } @@ -108,7 +108,7 @@ task run_bcl2fastq { } runtime { - docker: "us.gcr.io/tag-public/bcl2fastq_codec:v1" + docker: "us-central1-docker.pkg.dev/tag-team-160914/gptag-dockers/bcl2fastq_codec:v1" memory: memory bootDiskSizeGb: 12 disks: "local-disk ${disk_space} HDD" From ca095bec9e12ba6d954c6ee70e4b7150190864c7 Mon Sep 17 00:00:00 2001 From: Stella Date: Sat, 25 May 2024 11:46:30 -0400 Subject: [PATCH 13/21] docker images chaneg to public ones --- CODEC/demux_CODEC.wdl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CODEC/demux_CODEC.wdl b/CODEC/demux_CODEC.wdl index 21200cf..7b5a561 100644 --- a/CODEC/demux_CODEC.wdl +++ b/CODEC/demux_CODEC.wdl @@ -214,7 +214,7 @@ task concatenate_and_compress_fastq { } runtime { - docker: "us.gcr.io/tag-team-160914/gcloudsdk" + docker: "us.gcr.io/tag-public/bcl2fastq_codec:v1" disks: "local-disk " + disk_size + " HDD" memory: memory + " GB" } @@ -270,7 +270,7 @@ task UploadDataTable { } runtime { - docker: "us.gcr.io/tag-team-160914/metadata_upload" + docker: "us.gcr.io/tag-public/metadata_upload" disks: "local-disk " + disk_size + " HDD" memory: memory + " GB" } From 21cf843d1c53c591c4bb8cd9dde484965482f99f Mon Sep 17 00:00:00 2001 From: Stella Date: Sat, 25 May 2024 17:43:40 -0400 Subject: [PATCH 14/21] switch to public docker image --- CODEC/SingleSampleCODEC.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CODEC/SingleSampleCODEC.wdl b/CODEC/SingleSampleCODEC.wdl index c292918..2163e33 100644 --- a/CODEC/SingleSampleCODEC.wdl +++ b/CODEC/SingleSampleCODEC.wdl @@ -839,7 +839,7 @@ task QC_metrics { } runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-team-160914/picard_docker" + docker: "us.gcr.io/tag-public/metadata_upload" disks: "local-disk " + disk_size + " HDD" preemptible: 2 } @@ -864,7 +864,7 @@ task EvalGenomeBases { } runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-team-160914/picard_docker" + docker: "us.gcr.io/tag-public/metadata_upload" disks: "local-disk " + disk_size + " HDD" preemptible: 1 } @@ -907,7 +907,7 @@ task CalculateDuplexDepth { } runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-team-160914/picard_docker" + docker: "us.gcr.io/tag-public/metadata_upload" disks: 
"local-disk " + disk_size + " HDD" preemptible: 1 } From dd0d44e330618211dc4f9190b26da3d08ebe2f12 Mon Sep 17 00:00:00 2001 From: Stella Date: Wed, 29 May 2024 11:56:27 -0400 Subject: [PATCH 15/21] correct docker images --- CODEC/codec_bcl2fastq.wdl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CODEC/codec_bcl2fastq.wdl b/CODEC/codec_bcl2fastq.wdl index 713c23d..8e9058f 100644 --- a/CODEC/codec_bcl2fastq.wdl +++ b/CODEC/codec_bcl2fastq.wdl @@ -40,7 +40,7 @@ task GetBclBucketSize { } command <<< - gsutil du -s ${input_bcl_directory} | awk '{print $1/1024/1024/1024}' > input_bcl_size.txt + gsutil du -sh ${input_bcl_directory} | awk '{print $1/1024/1024/1024}' > input_bcl_size.txt >>> output { @@ -48,7 +48,7 @@ task GetBclBucketSize { } runtime { - docker: "us-central1-docker.pkg.dev/tag-team-160914/gptag-dockers/bcl2fastq_codec:v1" + docker: "us.gcr.io/tag-public/bcl2fastq_codec:v1" memory: memory + " GB" disks: "local-disk 16 HDD" } @@ -108,7 +108,7 @@ task run_bcl2fastq { } runtime { - docker: "us-central1-docker.pkg.dev/tag-team-160914/gptag-dockers/bcl2fastq_codec:v1" + docker: "us.gcr.io/tag-public/bcl2fastq_codec:v1" memory: memory bootDiskSizeGb: 12 disks: "local-disk ${disk_space} HDD" From 852be3e621e3782b470e7d88c8822448bf5adf7a Mon Sep 17 00:00:00 2001 From: Stella Date: Thu, 30 May 2024 11:41:27 -0400 Subject: [PATCH 16/21] change to public docker from tag-public --- CODEC/SingleSampleCODEC.wdl | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/CODEC/SingleSampleCODEC.wdl b/CODEC/SingleSampleCODEC.wdl index 2163e33..233d80d 100644 --- a/CODEC/SingleSampleCODEC.wdl +++ b/CODEC/SingleSampleCODEC.wdl @@ -241,7 +241,7 @@ task SplitFastq1 { } runtime { - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" memory: memory + " GB" disks: "local-disk " + disk_size + " HDD" } @@ -268,7 +268,7 @@ task SplitFastq2 { } runtime { - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" memory: memory + " GB" disks: "local-disk " + disk_size + " HDD" } @@ -293,7 +293,7 @@ task Trim { } runtime { - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" disks: "local-disk " + disk_size + " HDD" memory: mem + " GB" } @@ -337,7 +337,7 @@ task AlignRawTrimmed { } runtime { - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" memory: mem + " GB" disks: "local-disk " + disk_size + " HDD" preemptible: 3 @@ -376,7 +376,7 @@ task ZipperBamAlignment { } runtime { - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" memory: select_first([mem, 8]) + " GB" disks: "local-disk " + select_first([disk_size, 16]) + " HDD" preemptible: 3 @@ -403,7 +403,7 @@ task MergeSplit { } runtime { - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" disks: "local-disk " + disk_size + " HDD" memory: memory + " GB" } @@ -427,7 +427,7 @@ task MergeLogSplit { } runtime { - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" disks: "local-disk " + disk_size + " HDD" memory: mem + " GB" } @@ -450,7 +450,7 @@ task SortBam { } runtime { - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" disks: "local-disk " + disk_size + " HDD" memory: mem + " GB" preemptible: 2 @@ -476,7 +476,7 @@ task ByProductMetrics { } runtime { - docker: 
"us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" disks: "local-disk " + disk_size + " HDD" memory: mem + " GB" } @@ -509,7 +509,7 @@ task ReplaceRawReadGroup { runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" disks: "local-disk " + disk_size + " HDD" } } @@ -537,7 +537,7 @@ task CollectInsertSizeMetrics { runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" disks: "local-disk " + disk_size + " HDD" } } @@ -568,7 +568,7 @@ task GroupReadByUMI { runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" disks: "local-disk " + disk_size + " HDD" } } @@ -598,7 +598,7 @@ task FgbioCollapseReadFamilies { runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" disks: "local-disk " + disk_size + " HDD" } } @@ -633,7 +633,7 @@ task AlignMolecularConsensusReads { runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" disks: "local-disk " + disk_size + " HDD" cpu: cpu_cores preemptible: 3 @@ -670,7 +670,7 @@ task MergeAndSortMoleculeConsensusReads { runtime { memory: memory+ " GB" - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" disks: "local-disk " + disk_size + " HDD" preemptible: 3 } @@ -701,7 +701,7 @@ task CollectWgsMetrics { runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" disks: "local-disk " + disk_size + " HDD" preemptible: 3 } @@ -762,7 +762,7 @@ task CSS_SFC_ErrorMetrics { runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" disks: "local-disk " + disk_size + " HDD" preemptible: 3 } From aaa259ec431ce00bfd7dbbccead4a16cc2ab2485 Mon Sep 17 00:00:00 2001 From: Stella Date: Thu, 30 May 2024 11:57:06 -0400 Subject: [PATCH 17/21] switch to public dockers --- CODEC/demux_CODEC.wdl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/CODEC/demux_CODEC.wdl b/CODEC/demux_CODEC.wdl index 7b5a561..850db8b 100644 --- a/CODEC/demux_CODEC.wdl +++ b/CODEC/demux_CODEC.wdl @@ -98,7 +98,7 @@ task SplitFastq1 { } runtime { - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" memory: memory + " GB" disks: "local-disk " + disk_size + " HDD" } @@ -124,7 +124,7 @@ task SplitFastq2 { } runtime { - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" memory: memory + " GB" disks: "local-disk " + disk_size + " HDD" } @@ -146,7 +146,7 @@ task Demux { /CODECsuite/build/codec demux -1 ${paired_fastqs.left} -2 ${paired_fastqs.right} -p ${sample_sheet} -o ${batch_id}_split.${split} > ${batch_id}_split.${split}.log } runtime { - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/codec:v1.1.1" memory: memory + " GB" disks: "local-disk " + disk_size + " HDD" } @@ -181,7 +181,7 @@ task GroupFastqFiles { } runtime { - docker: "us.gcr.io/tag-team-160914/codec:v1" + docker: "us.gcr.io/tag-public/bcl2fastq_codec:v1" disks: "local-disk " + disk_size + " HDD" memory: memory + " GB" } From e0dc64bce311a6893904b1b108c44273cc99e56e Mon Sep 17 00:00:00 2001 From: Stella Date: Thu, 11 Jul 2024 11:31:33 -0400 Subject: [PATCH 18/21] Add 
files for new module Signature Profiler of CODEC pipeline and modified the dockstore.yml to add this pipeline to Public workflows --- .dockstore.yml | 5 ++ CODEC/Dockerfile | 58 -------------- CODEC/README.md | 19 +++++ CODEC/SigProfiler.inputs.json | 1 + CODEC/SigProfiler.wdl | 145 ++++++++++++++++++++++++++++++++++ CODEC/prep_codec_metadata.py | 29 ------- 6 files changed, 170 insertions(+), 87 deletions(-) delete mode 100644 CODEC/Dockerfile create mode 100644 CODEC/README.md create mode 100644 CODEC/SigProfiler.inputs.json create mode 100644 CODEC/SigProfiler.wdl delete mode 100644 CODEC/prep_codec_metadata.py diff --git a/.dockstore.yml b/.dockstore.yml index 78edf41..8947045 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -134,3 +134,8 @@ workflows: primaryDescriptorPath: /CODEC/SingleSampleCODEC.wdl testParameterFiles: - /CODEC/SingleSampleCODEC.inputs.json + - name: SigProfiler + subclass: WDL + primaryDescriptorPath: /CODEC/SigProfiler.wdl + testParameterFiles: + - /CODEC/SigProfiler.inputs.json diff --git a/CODEC/Dockerfile b/CODEC/Dockerfile deleted file mode 100644 index 4b389a3..0000000 --- a/CODEC/Dockerfile +++ /dev/null @@ -1,58 +0,0 @@ -FROM --platform=linux/amd64 ubuntu:20.04 - -LABEL maintainer="lining@broadinstitute.org" - -ENV DEBIAN_FRONTEND noninteractive - -RUN apt-get update \ - && apt-get install -y software-properties-common \ - && add-apt-repository ppa:deadsnakes/ppa \ - && apt-get update \ - && apt-get install -y python3.8 python3.8-dev python3.8-venv python3-pip \ - && pip3 install pandas argparse numpy pysam - -RUN apt-get install -y r-base r-base-dev - -RUN apt-get install -y \ - git \ - wget \ - bwa \ - libssl-dev \ - g++ \ - zlib1g-dev \ - autoconf \ - libbz2-dev \ - liblzma-dev \ - libcurl4-gnutls-dev \ - libssl-dev \ - build-essential \ - software-properties-common - -RUN apt-get update -qq -RUN apt-get install -y openjdk-11-jdk - - - -# Clone the CODECsuite repository -RUN git clone --recursive https://github.com/broadinstitute/CODECsuite.git /CODECsuite - - -RUN wget https://github.com/Kitware/CMake/releases/download/v3.28.0-rc3/cmake-3.28.0-rc3-linux-x86_64.tar.gz \ - && tar -xzvf cmake-3.28.0-rc3-linux-x86_64.tar.gz \ - && mv cmake-3.28.0-rc3-linux-x86_64 /opt/cmake-3.28 \ - && ln -s /opt/cmake-3.28/bin/cmake /usr/local/bin/cmake \ - && ln -s /opt/cmake-3.28/bin/ctest /usr/local/bin/ctest \ - && ln -s /opt/cmake-3.28/bin/cpack /usr/local/bin/cpack \ - && rm cmake-3.28.0-rc3-linux-x86_64.tar.gz - -RUN cd CODECsuite && \ - mkdir build && \ - cd build && \ - cmake .. && \ - make - -COPY dependencies/ /dependencies/ -COPY reference_files/ /reference_files/ - -RUN cp /dependencies/samtools-1.9/samtools /usr/bin/ -RUN chmod +x /CODECsuite/snakemake/script/agg_log.py \ No newline at end of file diff --git a/CODEC/README.md b/CODEC/README.md new file mode 100644 index 0000000..d26ad54 --- /dev/null +++ b/CODEC/README.md @@ -0,0 +1,19 @@ +# Signature Profiling WDL for CODEC Mutlist Output +CODEC pipeline: SingleSampleCODEC pipeline provides text files for discovered mutations. + +This workflow summarizes and plots mutation spectrums in 96 trinucleotide contexts and generate Mutation Matrix that will be later used to subtract SBS(Single Base Substitution) signatures from SNVs with database reference from COSMIC(https://cancer.sanger.ac.uk/signatures/). + +The Signature Profiling tool is from https://github.com/AlexandrovLab/SigProfilerAssignment and has been implanted to the docker image. 
+ +The output of this WDL includes: +1) SpectrumPlots +2) MutationMetrics +3) SignatureCount +4) SignatureProportionPDF +5) SignatureStackedPlot +6) TMBPlot +7) DecomposedSignatureProbabilities + + +### Citation +Díaz-Gay et al. 2023 Bioinformatics and Tate et al. 2019 Nucleic Acids Research \ No newline at end of file diff --git a/CODEC/SigProfiler.inputs.json b/CODEC/SigProfiler.inputs.json new file mode 100644 index 0000000..07ec31a --- /dev/null +++ b/CODEC/SigProfiler.inputs.json @@ -0,0 +1 @@ +{"SigProfiler.GenomeFasta":"${workspace.referenceData_hg38_ref_fasta}","SigProfiler.MutlistFiles":"${this.samples.variants_called}","SigProfiler.mutlist_to_96_contexts.GenomeFastaIndex":"${workspace.referenceData_hg38_ref_fasta_index}"} \ No newline at end of file diff --git a/CODEC/SigProfiler.wdl b/CODEC/SigProfiler.wdl new file mode 100644 index 0000000..01f8fa6 --- /dev/null +++ b/CODEC/SigProfiler.wdl @@ -0,0 +1,145 @@ +version 1.0 + +workflow SigProfiler { + input { + Array[File] MutlistFiles + File GenomeFasta + } + + call mutlist_to_96_contexts { + input: + MutlistFiles = MutlistFiles, + GenomeFasta = GenomeFasta + } + call sigprofiler_analysis { + input: + MutationMetrics = mutlist_to_96_contexts.MutationMetrics + + } + call PlotSignatures { + input: + SignatureCount = sigprofiler_analysis.SignatureCount + } + + output { + File MutationMetrics = mutlist_to_96_contexts.MutationMetrics + File SpectrumPlots = mutlist_to_96_contexts.SpectrumPlots + File DecomposedSignatureProbabilities = sigprofiler_analysis.DecomposedSignatureProbabilities + File SignatureStackedPlot = sigprofiler_analysis.SignatureStackedPlot + File TMBPlot = sigprofiler_analysis.TMBPlot + File SignatureCount = sigprofiler_analysis.SignatureCount + File SignatureProportionPDF = PlotSignatures.signature_proportions_pdf + } +} + + + +task mutlist_to_96_contexts { + input { + Array[File] MutlistFiles + File GenomeFasta + File GenomeFastaIndex + } + + command { + Rscript /scripts/96_contexts_mutations.R "~{sep=' ' MutlistFiles}" ~{GenomeFasta} + } + + output { + File MutationMetrics = "trinuc_mutation_metrics.txt" + File SpectrumPlots = "all_sample_spectrums.pdf" + } + + runtime { + docker: "us.gcr.io/tag-public/sigprofiler:v1" + memory: "8 GB" + disks: "local-disk 20 HDD" + } +} + +task sigprofiler_analysis { + input { + File MutationMetrics + String OutputFolder = "SigProfiler-output" + } + + command { + python3 < 0] + + # Plot the data + plt.figure(figsize=(16, 9)) + sns.scatterplot(data=SigCounts_long, x="Samples", y="Signature", size="Proportion", sizes=(20, 200), legend=False) + plt.xticks(rotation=90) + plt.xlabel("Sample Name", fontsize=16) + plt.ylabel("Signature", fontsize=16) + plt.title("Signature Proportions by Sample", fontsize=20, pad = 20) + plt.grid(axis='y') + ax = plt.gca() + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['left'].set_visible(False) + ax.spines['bottom'].set_visible(False) + plt.tight_layout() + plt.savefig("signature_proportions.pdf", format="pdf") + EOF + } + + output { + File signature_proportions_pdf = "signature_proportions.pdf" + } + + runtime { + docker: "us.gcr.io/tag-public/sigprofiler:v1" + memory: "8 GB" + disks: "local-disk 20 HDD" + } +} diff --git a/CODEC/prep_codec_metadata.py b/CODEC/prep_codec_metadata.py deleted file mode 100644 index 835b3da..0000000 --- a/CODEC/prep_codec_metadata.py +++ /dev/null @@ -1,29 +0,0 @@ -# Running this script is a prerequisite for demux_CODEC wdl. 
-# This script is taking an Excel file provided by collaborator -# with Barcode assigned to each sample and a default Index reference file, -# to generate a sample_sheet_L00{lane}.csv file for each lane. - - -import sys -import pandas as pd - - -if len(sys.argv) != 3: - print("Usage: python3 prep_codec_metadata.py ") - sys.exit(1) - -metadata_excel_file = sys.argv[1] -index_csv_file = sys.argv[2] - -xlsx_df = pd.read_excel(metadata_excel_file) -index_df = pd.read_csv(index_csv_file) - -merged_df = pd.merge( - xlsx_df, index_df, left_on='CODEC index', right_on='index') - -for lane, group in merged_df.groupby('lanes'): - - output_df = group[['submission_id', 'IndexBarcode1', 'IndexBarcode2']] - output_df.columns = ['SampleName', 'IndexBarcode1', 'IndexBarcode2'] - - output_df.to_csv(f'sample_sheet_L00{lane}.csv', index=False) From 3b2f657804e3460a7f369116d92697dde96e10ad Mon Sep 17 00:00:00 2001 From: Stella Date: Tue, 15 Oct 2024 15:52:31 -0400 Subject: [PATCH 19/21] adding and merging pipelines for captured data --- CODEC/SingleSampleCODEC.wdl | 386 +++++++++++++++++++++++++++++------- 1 file changed, 313 insertions(+), 73 deletions(-) diff --git a/CODEC/SingleSampleCODEC.wdl b/CODEC/SingleSampleCODEC.wdl index 233d80d..aebdaba 100644 --- a/CODEC/SingleSampleCODEC.wdl +++ b/CODEC/SingleSampleCODEC.wdl @@ -1,6 +1,6 @@ version 1.0 -workflow SingleSampleCODEC { +workflow SingleSampleCODEC_targeted { input { String sample_id File fastq1 @@ -13,12 +13,16 @@ workflow SingleSampleCODEC { File reference_ann File reference_bwt File reference_sa - File germline_bam - File germline_bam_index + File? germline_bam + File? germline_bam_index Int num_parallel - String sort_memory + String sort_memory File eval_genome_interval = "gs://gptag/CODEC/GRCh38_notinalldifficultregions.interval_list" File eval_genome_bed = "gs://gptag/CODEC/GRCh38_notinalldifficultregions.bed" + File? bait_intervals + File? 
target_intervals + Boolean is_captured_data + Boolean create_maf } call SplitFastq1 { input: @@ -104,6 +108,20 @@ workflow SingleSampleCODEC { input_bam = ReplaceRawReadGroup.bam, sample_id = sample_id } + if (is_captured_data) { + call DuplexRecoveryMetrics { + input: + groupbyumi_bam = GroupReadByUMI.groupbyumi_bam, + sample_id = sample_id, + duplex_eval_bed = bait_intervals + + } + call PlotDuplexRecoveryByTarget { + input: + sample_id = sample_id, + duplex_recovery_metrics = DuplexRecoveryMetrics.duplex_recovery_metrics + } + } call FgbioCollapseReadFamilies { input: grouped_umi_bam = GroupReadByUMI.groupbyumi_bam, @@ -132,15 +150,22 @@ workflow SingleSampleCODEC { sample_id = sample_id, sort_memory = sort_memory } - call CollectWgsMetrics { - input: - ConsensusAlignedBam = MergeAndSortMoleculeConsensusReads.bam, - ConsensusAlignedBai = MergeAndSortMoleculeConsensusReads.bai, - sample_id = sample_id, - reference_fasta = reference_fasta, - reference_fasta_index = reference_fasta_index, + if (is_captured_data) { + call CollectSelectionMetrics { + input: + reference = reference_fasta, + reference_index = reference_fasta_index, reference_dict = reference_dict, - eval_genome_interval = eval_genome_interval + bam_file = MergeAndSortMoleculeConsensusReads.bam, + bam_index = MergeAndSortMoleculeConsensusReads.bai, + sample_id = sample_id, + bait_intervals = bait_intervals, + target_intervals = target_intervals + } + call Hs_metrics { + input: + output_selection_metrics = CollectSelectionMetrics.output_selection_metrics + } } call CSS_SFC_ErrorMetrics { input: @@ -159,10 +184,16 @@ workflow SingleSampleCODEC { germline_bam_index = germline_bam_index, eval_genome_bed = eval_genome_bed } + if (create_maf) { + call codec2MAF { + input: + variants_called = CSS_SFC_ErrorMetrics.variants_called, + sample_id = sample_id + } + } call QC_metrics { input: byproduct_metrics = ByProductMetrics.byproduct_metrics, - WgsMetrics = CollectWgsMetrics.WgsMetrics, umiHistogram = GroupReadByUMI.umi_histogram, InsertSizeMetrics = CollectInsertSizeMetrics.insert_size_metrics, mutant_metrics = CSS_SFC_ErrorMetrics.mutant_metrics, @@ -188,7 +219,11 @@ workflow SingleSampleCODEC { File InsertSizeMetrics = CollectInsertSizeMetrics.insert_size_metrics File InsertSizeHistogram = CollectInsertSizeMetrics.insert_size_histogram File umiHistogram = GroupReadByUMI.umi_histogram - File WgsMetrics = CollectWgsMetrics.WgsMetrics + + File? output_selection_metrics = CollectSelectionMetrics.output_selection_metrics + File? output_per_target_selection_metrics = CollectSelectionMetrics.output_per_target_selection_metrics + File? 
output_theoretical_sensitivity = CollectSelectionMetrics.output_theoretical_sensitivity + File mutant_metrics = CSS_SFC_ErrorMetrics.mutant_metrics File context_count = CSS_SFC_ErrorMetrics.context_count File variants_called = CSS_SFC_ErrorMetrics.variants_called @@ -201,9 +236,7 @@ workflow SingleSampleCODEC { Int n_intermol = QC_metrics.n_intermol Float pct_intermol = QC_metrics.pct_intermol Int n_adp_dimer = QC_metrics.n_adp_dimer - Float pct_adp_dimer = QC_metrics.pct_adp_dimer - Float raw_dedupped_mean_cov = QC_metrics.raw_dedupped_mean_cov - Int raw_dedupped_median_cov = QC_metrics.raw_dedupped_median_cov + Float pct_adp_dimer = QC_metrics.pct_adp_dimer Float duplication_rate = QC_metrics.duplication_rate Float mean_insert_size = QC_metrics.mean_insert_size Int median_insert_size = QC_metrics.median_insert_size @@ -213,9 +246,14 @@ workflow SingleSampleCODEC { Float snv_rate = QC_metrics.snv_rate Float indel_rate = QC_metrics.indel_rate String eval_genome_bases = EvalGenomeBases.eval_genome_bases + Float? pct_selected_bases = Hs_metrics.pct_selected_bases + Float? mean_bait_coverage = Hs_metrics.mean_bait_coverage + Float? mean_target_coverage = Hs_metrics.mean_target_coverage Float duplex_depth = CalculateDuplexDepth.duplex_depth Float duplex_efficiency = CalculateDuplexDepth.duplex_efficiency - + File? duplex_recovery_metrics = DuplexRecoveryMetrics.duplex_recovery_metrics + File? ds_duplex_plots = PlotDuplexRecoveryByTarget.ds_duplex_plots + File? outputMAF = codec2MAF.output_maf } } @@ -227,6 +265,7 @@ task SplitFastq1 { Int memory = 64 Int? extra_disk Int disk_size = ceil(size(fastq_read1, "GB") * 15) + select_first([extra_disk, 0]) + String? docker_override } command <<< @@ -241,7 +280,7 @@ task SplitFastq1 { } runtime { - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) memory: memory + " GB" disks: "local-disk " + disk_size + " HDD" } @@ -255,6 +294,7 @@ task SplitFastq2 { Int memory = 64 Int? extra_disk Int disk_size = ceil(size(fastq_read2, "GB") * 15) + select_first([extra_disk, 0]) + String? docker_override } command <<< @@ -268,7 +308,7 @@ task SplitFastq2 { } runtime { - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) memory: memory + " GB" disks: "local-disk " + disk_size + " HDD" } @@ -284,6 +324,7 @@ task Trim { Int mem = 16 Int? extra_disk Int disk_size = ceil(size(read1, "GB") * 8) + select_first([extra_disk, 0]) + String? docker_override } command { @@ -293,7 +334,7 @@ task Trim { } runtime { - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) disks: "local-disk " + disk_size + " HDD" memory: mem + " GB" } @@ -319,6 +360,7 @@ task AlignRawTrimmed { Int disk_size = ceil(size(bam_input, "GB") * 20) + select_first([extra_disk, 0]) String sample_id Int split + String? docker_override } String output_bam_name = "${sample_id}_split.${split}.aligned_tmp.bam" @@ -337,7 +379,7 @@ task AlignRawTrimmed { } runtime { - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) memory: mem + " GB" disks: "local-disk " + disk_size + " HDD" preemptible: 3 @@ -356,6 +398,7 @@ task ZipperBamAlignment { String sample_id Int split String sort_memory + String? 
docker_override } @@ -376,7 +419,7 @@ task ZipperBamAlignment { } runtime { - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) memory: select_first([mem, 8]) + " GB" disks: "local-disk " + select_first([disk_size, 16]) + " HDD" preemptible: 3 @@ -389,6 +432,7 @@ task MergeSplit { String sample_id Int memory = 64 Int disk_size =200 + String? docker_override } command { @@ -403,7 +447,7 @@ task MergeSplit { } runtime { - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) disks: "local-disk " + disk_size + " HDD" memory: memory + " GB" } @@ -415,6 +459,7 @@ task MergeLogSplit { String sample_id Int mem = 32 Int disk_size = 64 + String? docker_override } command { @@ -427,18 +472,112 @@ task MergeLogSplit { } runtime { - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) disks: "local-disk " + disk_size + " HDD" memory: mem + " GB" } } +task CollectSelectionMetrics { + input{ + File reference + File reference_index + File reference_dict + File bam_file + File bam_index + String sample_id + File bait_intervals + File target_intervals + Int? preemptible_attempts + Int memory = 32 + Int disk_pad = 0 + Int disk_size = ceil(size(bam_file, "GB") * 2) + disk_pad + String? docker_override + } + + command { + + java -jar /dependencies/picard.jar CollectHsMetrics \ + I=~{bam_file} \ + O=~{sample_id}.selection_metrics \ + PER_TARGET_COVERAGE=~{sample_id}.per_target_selection_metrics \ + THEORETICAL_SENSITIVITY_OUTPUT=~{sample_id}.theoretical_sensitivity \ + BAIT_INTERVALS=~{bait_intervals} \ + TARGET_INTERVALS=~{target_intervals} \ + REFERENCE_SEQUENCE=~{reference} + } + + output { + File output_selection_metrics = "~{sample_id}.selection_metrics" + File output_per_target_selection_metrics = "~{sample_id}.per_target_selection_metrics" + File output_theoretical_sensitivity = "~{sample_id}.theoretical_sensitivity" + } + + runtime { + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + memory: memory + " GB" + preemptible: select_first([preemptible_attempts, 3]) + } +} + +task Hs_metrics { + input { + File output_selection_metrics + } + + command <<< + python3 <>> + + output { + Float pct_selected_bases = read_float("pct_selected_bases.txt") + Float mean_bait_coverage = read_float("mean_bait_coverage.txt") + Float mean_target_coverage = read_float("mean_target_coverage.txt") + } + + runtime { + memory: "16 GB" + docker: "us.gcr.io/tag-public/metadata_upload" + disks: "local-disk 16 HDD" + preemptible: 1 + } +} + task SortBam { input { File bam_file String sample_id Int mem = 64 Int disk_size = 200 + String? docker_override } command { @@ -450,7 +589,7 @@ task SortBam { } runtime { - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) disks: "local-disk " + disk_size + " HDD" memory: mem + " GB" preemptible: 2 @@ -464,6 +603,7 @@ task ByProductMetrics { String sample_id Int mem = 32 Int disk_size = 100 + String? 
docker_override } command { @@ -476,7 +616,7 @@ task ByProductMetrics { } runtime { - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) disks: "local-disk " + disk_size + " HDD" memory: mem + " GB" } @@ -488,6 +628,7 @@ task ReplaceRawReadGroup { String sample_id Int memory = 64 Int disk_size = 200 + String? docker_override } command { @@ -509,7 +650,7 @@ task ReplaceRawReadGroup { runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) disks: "local-disk " + disk_size + " HDD" } } @@ -520,6 +661,7 @@ task CollectInsertSizeMetrics { String sample_id Int memory = 32 Int disk_size = 200 + String? docker_override } command { @@ -537,7 +679,7 @@ task CollectInsertSizeMetrics { runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) disks: "local-disk " + disk_size + " HDD" } } @@ -549,6 +691,7 @@ task GroupReadByUMI { Int memory = 64 Int? extra_disk Int disk_size = ceil(size(input_bam, "GB") * 15) + select_first([extra_disk, 0]) + String? docker_override } command { @@ -568,11 +711,108 @@ task GroupReadByUMI { runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) disks: "local-disk " + disk_size + " HDD" } } + +task DuplexRecoveryMetrics { + input { + File groupbyumi_bam + String sample_id + File duplex_eval_bed + Int memory = 32 + Int extra_disk = 0 + Int disk_size = ceil(size(groupbyumi_bam , "GB") * 3) + select_first([extra_disk, 0]) + String sorted_bam = basename(groupbyumi_bam, ".bam") + ".sorted.bam" + String? docker_override + } + + command { + + samtools sort ~{groupbyumi_bam} -o ~{sorted_bam} + + python3 /scripts/collect_duplex_metrics.py \ + --bam_file ~{sorted_bam} \ + --output_file ~{sample_id}.duplex_metrics.txt \ + --interval_list ~{duplex_eval_bed} \ + --per_target \ + --is_cds + + } + + output { + File duplex_recovery_metrics = "~{sample_id}.duplex_metrics.txt" + } + + runtime { + memory: memory + " GB" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + } +} + +task PlotDuplexRecoveryByTarget { + input { + File duplex_recovery_metrics + String sample_id + } + + command <<< + python3 <>> + + + output { + File ds_duplex_plots = "~{sample_id}.ds_duplex_plots.pdf" + } + + runtime { + memory: "16 GB" + docker: "us.gcr.io/tag-public/python:test" + disks: "local-disk 16 HDD" + } +} + task FgbioCollapseReadFamilies { input { File grouped_umi_bam @@ -580,6 +820,7 @@ task FgbioCollapseReadFamilies { Int memory = 64 Int? extra_disk Int disk_size = ceil(size(grouped_umi_bam, "GB") * 10) + select_first([extra_disk, 0]) + String? docker_override } command { @@ -598,7 +839,7 @@ task FgbioCollapseReadFamilies { runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) disks: "local-disk " + disk_size + " HDD" } } @@ -619,6 +860,7 @@ task AlignMolecularConsensusReads { Int disk_size = 200 Int threads = 4 Int cpu_cores = 1 + String? 
docker_override } String output_bam_name = "${sample_id}.mol_consensus.aligned_tmp.bam" @@ -633,7 +875,7 @@ task AlignMolecularConsensusReads { runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) disks: "local-disk " + disk_size + " HDD" cpu: cpu_cores preemptible: 3 @@ -651,6 +893,7 @@ task MergeAndSortMoleculeConsensusReads { Int memory = 64 Int disk_size = 200 String sort_memory + String? docker_override } command { @@ -670,38 +913,7 @@ task MergeAndSortMoleculeConsensusReads { runtime { memory: memory+ " GB" - docker: "us.gcr.io/tag-public/codec:v1.1.1" - disks: "local-disk " + disk_size + " HDD" - preemptible: 3 - } -} - -task CollectWgsMetrics { - input { - File ConsensusAlignedBam - File ConsensusAlignedBai - String sample_id - File reference_fasta - File reference_fasta_index - File reference_dict - File eval_genome_interval - Int memory = 32 - Int disk_size = 200 - } - - command { - java -jar /dependencies/picard.jar CollectWgsMetrics \ - I=~{ConsensusAlignedBam} O=~{sample_id}.wgs_metrics.txt R=~{reference_fasta} INTERVALS=~{eval_genome_interval} \ - COUNT_UNPAIRED=true MINIMUM_BASE_QUALITY=0 MINIMUM_MAPPING_QUALITY=0 - } - - output { - File WgsMetrics = "~{sample_id}.wgs_metrics.txt" - } - - runtime { - memory: memory + " GB" - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) disks: "local-disk " + disk_size + " HDD" preemptible: 3 } @@ -721,13 +933,14 @@ task CSS_SFC_ErrorMetrics { File reference_ann File reference_bwt File reference_sa - File germline_bam - File germline_bam_index + File? germline_bam + File? germline_bam_index File eval_genome_bed File population_based_vcf = "gs://gptag/CODEC/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz" File population_based_vcf_index = "gs://gptag/CODEC/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz.tbi" Int memory = 64 Int disk_size = 200 + String? 
docker_override } command { @@ -737,7 +950,7 @@ task CSS_SFC_ErrorMetrics { -m 60 \ -q 30 \ -d 12 \ - -n ~{germline_bam} \ + ~{if defined(germline_bam) then "-n " + germline_bam else ""} \ -V ~{population_based_vcf} \ -x 6 \ -c 4 \ @@ -762,7 +975,7 @@ task CSS_SFC_ErrorMetrics { runtime { memory: memory + " GB" - docker: "us.gcr.io/tag-public/codec:v1.1.1" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) disks: "local-disk " + disk_size + " HDD" preemptible: 3 } @@ -772,7 +985,6 @@ task CSS_SFC_ErrorMetrics { task QC_metrics { input { File byproduct_metrics - File WgsMetrics File umiHistogram File InsertSizeMetrics File mutant_metrics @@ -802,8 +1014,6 @@ task QC_metrics { awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_adp_dimer") col=i} NR==2 {print $col}' ~{byproduct_metrics} > n_adp_dimer.txt awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="pct_adp_dimer") col=i} NR==2 {print $col}' ~{byproduct_metrics} > pct_adp_dimer.txt - cat ~{WgsMetrics} | grep -v "#" | awk 'NR==3 {print $2}' > raw_dedupped_mean_cov.txt - cat ~{WgsMetrics} | grep -v "#" | awk 'NR==3 {print $4}' > raw_dedupped_median_cov.txt cat ~{InsertSizeMetrics} | grep -v "#" | awk 'NR==3 {print $1}' > median_insert_size.txt cat ~{InsertSizeMetrics} | grep -v "#" | awk 'NR==3 {print $6}' > mean_insert_size.txt @@ -827,8 +1037,6 @@ task QC_metrics { Float pct_intermol = read_float("pct_intermol.txt") Int n_adp_dimer = read_int("n_adp_dimer.txt") Float pct_adp_dimer = read_float("pct_adp_dimer.txt") - Float raw_dedupped_mean_cov = read_float("raw_dedupped_mean_cov.txt") - Int raw_dedupped_median_cov = read_int("raw_dedupped_median_cov.txt") Float mean_insert_size = read_float("mean_insert_size.txt") Int median_insert_size = read_int("median_insert_size.txt") Int n_snv = read_int("n_snv.txt") @@ -911,4 +1119,36 @@ task CalculateDuplexDepth { disks: "local-disk " + disk_size + " HDD" preemptible: 1 } +} + +task codec2MAF { + input { + File variants_called + String sample_id + String? docker_override + } + + command <<< + + # Only run this script when there are variants called. + + if [[ $(wc -l < ~{variants_called}) -gt 2 ]]; then + Rscript /scripts/codec2maf.R --inputmaf ~{variants_called} --outputmaf ~{sample_id}.maf + else + echo "Warning: File ~{variants_called} has no variants. Skipping this task and returning empty maf." + touch ~{sample_id}.maf + fi + + >>> + + output { + File output_maf = "~{sample_id}.maf" + } + + runtime{ + memory: "16G" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec2maf_r_docker:v1"]) + disks: "local-disk 16 HDD" + preemptible: 1 + } } \ No newline at end of file From 60db35160d1c37616f5a6a02fc9cc60260b6a8e9 Mon Sep 17 00:00:00 2001 From: Stella Date: Tue, 15 Oct 2024 16:00:24 -0400 Subject: [PATCH 20/21] adding and merging pipelines for captured data --- CODEC/SingleSampleCODEC.wdl | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/CODEC/SingleSampleCODEC.wdl b/CODEC/SingleSampleCODEC.wdl index aebdaba..10aac0a 100644 --- a/CODEC/SingleSampleCODEC.wdl +++ b/CODEC/SingleSampleCODEC.wdl @@ -114,7 +114,6 @@ workflow SingleSampleCODEC_targeted { groupbyumi_bam = GroupReadByUMI.groupbyumi_bam, sample_id = sample_id, duplex_eval_bed = bait_intervals - } call PlotDuplexRecoveryByTarget { input: @@ -486,8 +485,8 @@ task CollectSelectionMetrics { File bam_file File bam_index String sample_id - File bait_intervals - File target_intervals + File? bait_intervals + File? target_intervals Int? 
preemptible_attempts Int memory = 32 Int disk_pad = 0 @@ -721,7 +720,7 @@ task DuplexRecoveryMetrics { input { File groupbyumi_bam String sample_id - File duplex_eval_bed + File? duplex_eval_bed Int memory = 32 Int extra_disk = 0 Int disk_size = ceil(size(groupbyumi_bam , "GB") * 3) + select_first([extra_disk, 0]) @@ -755,7 +754,7 @@ task DuplexRecoveryMetrics { task PlotDuplexRecoveryByTarget { input { - File duplex_recovery_metrics + File? duplex_recovery_metrics String sample_id } From 6bd9d6fa1d319d7061365f86392d507d436d0ac6 Mon Sep 17 00:00:00 2001 From: Stella Date: Mon, 28 Oct 2024 11:33:22 -0400 Subject: [PATCH 21/21] adding files for GDC pipeline --- GDCRealignment/CheckContaminationSomatic.wdl | 84 ++ GDCRealignment/CramToUnmappedBams.wdl | 332 ++++++++ ...WholeGenomeSomaticSingleSample.inputs.json | 6 + .../GDCWholeGenomeSomaticSingleSample.wdl | 802 ++++++++++++++++++ .../README.md | 12 + 5 files changed, 1236 insertions(+) create mode 100644 GDCRealignment/CheckContaminationSomatic.wdl create mode 100644 GDCRealignment/CramToUnmappedBams.wdl create mode 100644 GDCRealignment/GDCWholeGenomeSomaticSingleSample/GDCWholeGenomeSomaticSingleSample.inputs.json create mode 100644 GDCRealignment/GDCWholeGenomeSomaticSingleSample/GDCWholeGenomeSomaticSingleSample.wdl create mode 100644 GDCRealignment/GDCWholeGenomeSomaticSingleSample/README.md diff --git a/GDCRealignment/CheckContaminationSomatic.wdl b/GDCRealignment/CheckContaminationSomatic.wdl new file mode 100644 index 0000000..dd70628 --- /dev/null +++ b/GDCRealignment/CheckContaminationSomatic.wdl @@ -0,0 +1,84 @@ +version 1.0 + +task CalculateSomaticContamination { + input { + File? intervals + File reference + File reference_dict + File reference_index + File tumor_cram_or_bam + File tumor_crai_or_bai + File? normal_cram_or_bam + File? normal_crai_or_bai + File contamination_vcf + File contamination_vcf_index + + # runtime + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.2.4.1" + File? gatk_override + Int? additional_disk + Int memory_mb = 3000 + Int? preemptible_attempts + Int? 
max_retries + } + + Int disk_size = ceil(size(tumor_cram_or_bam,"GB") + size(normal_cram_or_bam,"GB")) + select_first([additional_disk, 10]) + + Int command_mem = memory_mb - 500 + + command <<< + set -e + + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} + + gatk --java-options "-Xmx~{command_mem}m" GetPileupSummaries \ + -R ~{reference} \ + -I ~{tumor_cram_or_bam} \ + ~{"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ~{contamination_vcf} \ + -L ~{contamination_vcf} \ + -O pileups.table + + if [[ -f "~{normal_cram_or_bam}" ]]; + then + gatk --java-options "-Xmx~{command_mem}m" GetPileupSummaries \ + -R ~{reference} \ + -I ~{normal_cram_or_bam} \ + ~{"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ~{contamination_vcf} \ + -L ~{contamination_vcf} \ + -O normal_pileups.table + + gatk --java-options "-Xmx~{command_mem}m" CalculateContamination \ + -I pileups.table \ + -O contamination.table \ + --tumor-segmentation segments.table \ + -matched normal_pileups.table + else + touch normal_pileups.table + gatk --java-options "-Xmx~{command_mem}m" CalculateContamination \ + -I pileups.table \ + -O contamination.table \ + --tumor-segmentation segments.table + fi + + grep -v ^sample contamination.table | awk '{print($2)}' > contam.txt + >>> + + runtime { + docker: gatk_docker + bootDiskSizeGb: 12 + memory: memory_mb + " MiB" + maxRetries: select_first([max_retries, 2]) + disks: "local-disk " + disk_size + " HDD" + preemptible: select_first([preemptible_attempts, 3]) + } + + output { + File pileups = "pileups.table" + File normal_pileups = "normal_pileups.table" + File contamination_table = "contamination.table" + File maf_segments = "segments.table" + File contamination = "contam.txt" + } +} diff --git a/GDCRealignment/CramToUnmappedBams.wdl b/GDCRealignment/CramToUnmappedBams.wdl new file mode 100644 index 0000000..8f54641 --- /dev/null +++ b/GDCRealignment/CramToUnmappedBams.wdl @@ -0,0 +1,332 @@ +version 1.0 + +# Exactly one of input_cram and input_bam should be supplied to this workflow. If an input_cram is supplied, a ref_fasta +# and ref_fasta_index must also be supplied. The ref_fasta and ref_fasta_index are used to generate a bam, so if an +# input_cram is not supplied, the input_bam is used instead and the ref_fasta and ref_fasta_index are not needed. + +# If the output_map file is provided, it is expected to be a tab-separated file containing a list of all the read group ids +# found in the input_cram / input_bam and the desired name of the unmapped bams generated for each. +# If the file is not provided, the output names of the unmapped bams will be the read_group_id +workflow CramToUnmappedBams { + + String pipeline_version = "1.1.3" + + input { + File? input_cram + File? input_bam + File? ref_fasta + File? ref_fasta_index + File? 
output_map + String base_file_name + String unmapped_bam_suffix = ".unmapped.bam" + Int additional_disk = 20 + } + + if (defined(input_cram)) { + Float cram_size = size(input_cram, "GiB") + String bam_from_cram_name = basename(input_cram_path, ".cram") + String input_cram_path = select_first([input_cram]) + + call CramToBam { + input: + ref_fasta = select_first([ref_fasta]), + ref_fasta_index = select_first([ref_fasta_index]), + cram_file = select_first([input_cram]), + output_basename = bam_from_cram_name, + disk_size = ceil(cram_size * 6) + additional_disk + } + } + + File input_file = select_first([CramToBam.output_bam, input_bam]) + Float input_size = size(input_file, "GiB") + + if (!defined(output_map)) { + call GenerateOutputMap { + input: + input_bam = input_file, + unmapped_bam_suffix = unmapped_bam_suffix, + disk_size = ceil(input_size) + additional_disk + } + } + + call SplitUpOutputMapFile { + input: + read_group_map_file = select_first([output_map, GenerateOutputMap.output_map]) + } + + scatter (rg_map_file in SplitUpOutputMapFile.rg_to_ubam_file) { + call SplitOutUbamByReadGroup { + input: + input_bam = input_file, + rg_to_ubam_file = rg_map_file, + disk_size = ceil(input_size * 2) + additional_disk + } + + String unmapped_bam_filename = basename(SplitOutUbamByReadGroup.output_bam) + + call RevertSam { + input: + input_bam = SplitOutUbamByReadGroup.output_bam, + output_bam_filename = unmapped_bam_filename, + disk_size = ceil(input_size * 3) + additional_disk + } + + call SortSam { + input: + input_bam = RevertSam.output_bam, + output_bam_filename = unmapped_bam_filename + } + + Float unmapped_bam_size = size(SortSam.output_bam, "GiB") + + call ValidateSamFile { + input: + input_bam = SortSam.output_bam, + report_filename = unmapped_bam_filename + ".validation_report", + disk_size = ceil(unmapped_bam_size) + additional_disk + } + } + + output { + Array[File] validation_report = ValidateSamFile.report + Array[File] unmapped_bams = SortSam.output_bam + } + meta { + allowNestedInputs: true + } +} + +task RevertSam { + input { + File input_bam + String output_bam_filename + Int disk_size + Int memory_in_MiB = 3000 + } + + Int java_mem = memory_in_MiB - 1000 + Int max_heap = memory_in_MiB - 500 + + command <<< + java -Xms~{java_mem}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar \ + RevertSam \ + --INPUT ~{input_bam} \ + --OUTPUT ~{output_bam_filename} \ + --VALIDATION_STRINGENCY LENIENT \ + --ATTRIBUTE_TO_CLEAR FT \ + --ATTRIBUTE_TO_CLEAR CO \ + --ATTRIBUTE_TO_CLEAR PA \ + --ATTRIBUTE_TO_CLEAR OA \ + --ATTRIBUTE_TO_CLEAR XA \ + --SORT_ORDER coordinate \ + --RESTORE_HARDCLIPS false + + >>> + + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + disks: "local-disk " + disk_size + " HDD" + memory: "~{memory_in_MiB} MiB" + preemptible: 3 + } + + output { + File output_bam = output_bam_filename + } +} + +# This task is slower than converting straight from cram to bam (note we stream through sam format in between cram and bam) +# This is currently necessary due to a difference in the way the NM tag is calculated in order to produce a valid bam. 
+task CramToBam { + input { + File ref_fasta + File ref_fasta_index + File cram_file + String output_basename + Int disk_size + Int memory_in_MiB = 7000 + } + + command <<< + + set -e + set -o pipefail + + samtools view -h -T ~{ref_fasta} ~{cram_file} | + samtools view -b -o ~{output_basename}.bam - + samtools index -b ~{output_basename}.bam + mv ~{output_basename}.bam.bai ~{output_basename}.bai + + >>> + + runtime { + docker: "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616" + cpu: 3 + memory: "~{memory_in_MiB} MiB" + disks: "local-disk " + disk_size + " HDD" + preemptible: 3 + } + + output { + File output_bam = "~{output_basename}.bam" + File output_bam_index = "~{output_basename}.bai" + } +} + +task GenerateOutputMap { + input { + File input_bam + String unmapped_bam_suffix + Int disk_size + Int memory_in_MiB = 3000 + } + + command <<< + + set -e + + samtools view -H ~{input_bam} | grep '^@RG' | cut -f2 | sed s/ID:// > readgroups.txt + + echo -e "#READ_GROUP_ID\tOUTPUT" > output_map.tsv + + for rg in `cat readgroups.txt`; do + echo -e "$rg\t$rg~{unmapped_bam_suffix}" >> output_map.tsv + done + + >>> + + runtime { + docker: "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616" + disks: "local-disk " + disk_size + " HDD" + memory: "~{memory_in_MiB} MiB" + preemptible: 3 + } + + output { + File output_map = "output_map.tsv" + } +} + +task SplitUpOutputMapFile { + input { + File read_group_map_file + Int disk_size = 10 + Int memory_in_MiB = 3000 + } + + command <<< + mkdir rgtemp + cd rgtemp + + # splits list of mappings into single files. One line each. + grep -v '^#' ~{read_group_map_file} | split -l 1 - rg_to_ubam_ + >>> + + runtime { + docker: "gcr.io/gcp-runtimes/ubuntu_16_0_4@sha256:025124e2f1cf4d29149958f17270596bffe13fc6acca6252977c572dd5ba01bf" + disks: "local-disk " + disk_size + " HDD" + memory: "~{memory_in_MiB} MiB" + } + + output { + Array[File] rg_to_ubam_file = glob("rgtemp/rg_to_ubam_*") + } +} + +task SplitOutUbamByReadGroup { + + input { + File input_bam + File rg_to_ubam_file + Int disk_size + Int memory_in_MiB = 30000 + } + + Array[Array[String]] tmp = read_tsv(rg_to_ubam_file) + + command <<< + echo "Read Group ~{tmp[0][0]} from ~{input_bam} is being written to ~{tmp[0][1]}" + samtools view -b -h -r ~{tmp[0][0]} -o ~{tmp[0][1]} ~{input_bam} + >>> + + output { + File output_bam = tmp[0][1] + } + + runtime { + docker: "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616" + cpu: 2 + disks: "local-disk " + disk_size + " HDD" + memory: "~{memory_in_MiB} MiB" + preemptible: 3 + } +} + +task ValidateSamFile { + input { + File input_bam + String report_filename + Int disk_size + Int memory_in_MiB = 3000 + } + + Int java_mem = memory_in_MiB - 1000 + Int max_heap = memory_in_MiB - 500 + + command <<< + + java -Xms~{java_mem}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar \ + ValidateSamFile \ + --INPUT ~{input_bam} \ + --OUTPUT ~{report_filename} \ + --MODE VERBOSE \ + --IS_BISULFITE_SEQUENCED false + + >>> + + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + disks: "local-disk " + disk_size + " HDD" + memory: "~{memory_in_MiB} MiB" + preemptible: 3 + } + + output { + File report = "~{report_filename}" + } +} + +task SortSam { + input { + File input_bam + String output_bam_filename + Int memory_in_MiB = 7000 + Float sort_sam_disk_multiplier = 6 + } + # SortSam spills to disk a lot more because we are only store 300000 records in RAM now because its faster for our data so it needs + # more disk space. 
Also it spills to disk in an uncompressed format so we need to account for that with a larger multiplier + Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20 + Int java_mem = memory_in_MiB - 1000 + Int max_heap = memory_in_MiB - 500 + + command <<< + java -Xms~{java_mem}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar \ + SortSam \ + --INPUT ~{input_bam} \ + --OUTPUT ~{output_bam_filename} \ + --SORT_ORDER queryname \ + --MAX_RECORDS_IN_RAM 300000 + + >>> + + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + disks: "local-disk " + disk_size + " HDD" + memory: "~{memory_in_MiB} MiB" + preemptible: 3 + } + + output { + File output_bam = output_bam_filename + } +} diff --git a/GDCRealignment/GDCWholeGenomeSomaticSingleSample/GDCWholeGenomeSomaticSingleSample.inputs.json b/GDCRealignment/GDCWholeGenomeSomaticSingleSample/GDCWholeGenomeSomaticSingleSample.inputs.json new file mode 100644 index 0000000..28f61b4 --- /dev/null +++ b/GDCRealignment/GDCWholeGenomeSomaticSingleSample/GDCWholeGenomeSomaticSingleSample.inputs.json @@ -0,0 +1,6 @@ +{ + "read_from_cache": true, + "write_to_cache": true, + "google_legacy_machine_selection": true, + "monitoring_script": "gs://broad-gotc-test-storage/cromwell_monitoring_script.sh" +} diff --git a/GDCRealignment/GDCWholeGenomeSomaticSingleSample/GDCWholeGenomeSomaticSingleSample.wdl b/GDCRealignment/GDCWholeGenomeSomaticSingleSample/GDCWholeGenomeSomaticSingleSample.wdl new file mode 100644 index 0000000..4dbe651 --- /dev/null +++ b/GDCRealignment/GDCWholeGenomeSomaticSingleSample/GDCWholeGenomeSomaticSingleSample.wdl @@ -0,0 +1,802 @@ +version 1.0 + +import "../CramToUnmappedBams.wdl" as ToUbams +import "../CheckContaminationSomatic.wdl" as CheckContamination + +struct FastqPairRecord { + File forward_fastq + File reverse_fastq + String readgroup + String readgroup_id +} + +struct FastqSingleRecord { + File fastq + String readgroup + String readgroup_id +} + +task bam_readgroup_to_contents { + input { + File bam + Int preemptible = 10 + Int max_retries = 0 + Int cpu = 1 + Int additional_memory_mb = 0 + Int additional_disk_gb = 0 + } + Int mem = ceil(size(bam, "MiB")) + 2000 + additional_memory_mb + Int disk_space = ceil(size(bam, "GiB")) + 10 + additional_disk_gb + + parameter_meta { + bam: {localization_optional: true} + } + + command <<< + set -euo pipefail + export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token) + samtools view -H ~{bam} | \ + awk 'BEGIN { \ + OFS = FS = "\t"; \ + header = "ID\tBC\tCN\tDS\tDT\tFO\tKS\tLB\tPG\tPI\tPL\tPM\tPU\tSM"; \ + split(header, header_ary, "\t"); \ + for (i=1; i in header_ary; i++) { \ + header_pos[header_ary[i]] = i \ + }; \ + print header \ + } \ + /^@RG/ { \ + for (i=2; i<=NF; i++) { \ + split($i, rg, ":"); \ + row_ary[header_pos[rg[1]]] = rg[2]; \ + }; \ + row = row_ary[1]; \ + for (i=2; i in header_ary; i++) { \ + row = row "\t"; \ + if (i in row_ary) \ + row = row row_ary[i]; \ + }; \ + delete row_ary; \ + print row \ + }' + >>> + + output { + Array[Object] readgroups = read_objects(stdout()) + } + + runtime { + docker: "broadgdac/samtools:1.10" + memory: mem + " MiB" + disks: "local-disk " + disk_space + " HDD" + preemptible: preemptible + maxRetries: max_retries + cpu: cpu + } +} + +task biobambam_bamtofastq { + input { + Int collate = 1 + String exclude = "QCFAIL,SECONDARY,SUPPLEMENTARY" + File filename + Int gz = 1 + String inputformat = "bam" + Int level = 5 + String outputdir = "." 
+ Int outputperreadgroup = 1 + String outputperreadgroupsuffixF = "_1.fq.gz" + String outputperreadgroupsuffixF2 = "_2.fq.gz" + String outputperreadgroupsuffixO = "_o1.fq.gz" + String outputperreadgroupsuffixO2 = "_o2.fq.gz" + String outputperreadgroupsuffixS = "_s.fq.gz" + Int tryoq = 1 + String T = "tempfq" + Int cpu = 1 + Int preemptible = 2 + Int max_retries = 0 + Int additional_memory_mb = 0 + Int additional_disk_gb = 0 + } + Int mem = ceil(size(filename, "MiB")) + 2000 + additional_memory_mb + Int disk_space = ceil(size(filename, "GiB") * 2) + 10 + additional_disk_gb + + command { + set -euo pipefail + /usr/local/bin/bamtofastq \ + T=~{T} \ + collate=~{collate} \ + exclude=~{exclude} \ + filename=~{filename} \ + gz=~{gz} \ + inputformat=~{inputformat} \ + level=~{level} \ + outputdir=~{outputdir} \ + outputperreadgroup=~{outputperreadgroup} \ + outputperreadgroupsuffixF=~{outputperreadgroupsuffixF} \ + outputperreadgroupsuffixF2=~{outputperreadgroupsuffixF2} \ + outputperreadgroupsuffixO=~{outputperreadgroupsuffixO} \ + outputperreadgroupsuffixO2=~{outputperreadgroupsuffixO2} \ + outputperreadgroupsuffixS=~{outputperreadgroupsuffixS} \ + tryoq=~{tryoq} + } + + output { + Array[File] output_fastq1 = glob("*~{outputperreadgroupsuffixF}") + Array[File] output_fastq2 = glob("*~{outputperreadgroupsuffixF2}") + Array[File] output_fastq_o1 = glob("*~{outputperreadgroupsuffixO}") + Array[File] output_fastq_o2 = glob("*~{outputperreadgroupsuffixO2}") + Array[File] output_fastq_s = glob("*~{outputperreadgroupsuffixS}") + } + + runtime { + docker: "broadgdac/biobambam2:2.0.87-release-20180301132713" + memory: mem + " MiB" + disks: "local-disk " + disk_space + " HDD" + preemptible: preemptible + maxRetries: max_retries + cpu: cpu + } +} + +task emit_pe_records { + input { + Array[File]+ fastq1_files + Array[File]+ fastq2_files + Array[Object]+ readgroups + String fastq1_suffix = "_1.fq.gz" + Int preemptible = 10 + Int max_retries = 0 + Int cpu = 1 + Int additional_memory_mb = 0 + Int additional_disk_gb = 0 + } + Int mem = ceil(size(fastq1_files, "MiB") + size(fastq2_files, "MiB")) + 2000 + additional_memory_mb + Int disk_space = ceil(size(fastq1_files, "GiB") + size(fastq2_files, "GiB")) + 10 + additional_disk_gb + + File readgroups_tsv = write_objects(readgroups) + + parameter_meta { + fastq1_files: {localization_optional: true} + fastq2_files: {localization_optional: true} + } + + command <<< + set -euo pipefail + + python <>> + + output { + Array[FastqPairRecord] fastq_pair_records = read_objects(stdout()) + } + + runtime { + docker: "python:3.8-slim" + memory: mem + " MiB" + disks: "local-disk " + disk_space + " HDD" + preemptible: preemptible + maxRetries: max_retries + cpu: cpu + } +} + +task emit_se_records { + input { + Array[File] fastq_o1_files + Array[File] fastq_o2_files + Array[File] fastq_s_files + Array[Object]+ readgroups + String fastq_o1_suffix = "_o1.fq.gz" + String fastq_o2_suffix = "_o2.fq.gz" + String fastq_s_suffix = "_s.fq.gz" + Int preemptible = 10 + Int max_retries = 0 + Int cpu = 1 + Int additional_memory_mb = 0 + Int additional_disk_gb = 0 + } + Int mem = ceil(size(fastq_o1_files, "MiB") + size(fastq_o2_files, "MiB") + size(fastq_s_files, "MiB")) + 2000 + additional_memory_mb + Int disk_space = ceil(size(fastq_o1_files, "GiB") + size(fastq_o2_files, "GiB") + size(fastq_s_files, "GiB")) + 10 + additional_disk_gb + + File readgroups_tsv = write_objects(readgroups) + + parameter_meta { + fastq_o1_files: {localization_optional: true} + fastq_o2_files: 
{localization_optional: true} + fastq_s_files: {localization_optional: true} + } + + command <<< + set -euo pipefail + + python <>> + + output { + Array[FastqSingleRecord] fastq_single_records = read_objects(stdout()) + } + + runtime { + docker: "python:3.8-slim" + memory: mem + " MiB" + disks: "local-disk " + disk_space + " HDD" + preemptible: preemptible + maxRetries: max_retries + cpu: cpu + } +} + +task bwa_pe { + input { + FastqPairRecord fastq_record + File ref_fasta + File ref_fai + File ref_dict + File ref_amb + File ref_ann + File ref_bwt + File ref_pac + File ref_sa + Int cpu = 16 + Int preemptible = 1 + Int max_retries = 0 + Int additional_memory_mb = 0 + Int additional_disk_gb = 0 + } + File fastq1 = fastq_record.forward_fastq + File fastq2 = fastq_record.reverse_fastq + String readgroup = fastq_record.readgroup + String outbam = fastq_record.readgroup_id + ".bam" + Float ref_size =size([ref_fasta, ref_dict, ref_amb, ref_ann, ref_bwt, ref_pac, ref_sa, ref_fai], "GiB") + Int mem = ceil(size([fastq1, fastq2], "MiB")) + 10000 + additional_memory_mb + Int disk_space = ceil((size([fastq1, fastq2], "GiB") * 4) + ref_size) + 10 + additional_disk_gb + + command { + set -euo pipefail + bwa mem \ + -t ~{cpu} \ + -T 0 \ + -R "~{readgroup}" \ + ~{ref_fasta} \ + ~{fastq1} \ + ~{fastq2} \ + | samtools view \ + -Shb \ + -o ~{outbam} \ + - + } + + output { + File bam = "~{outbam}" + } + + runtime { + docker: "broadgdac/bwa:0.7.15-r1142-dirty" + memory: mem + " MiB" + disks: "local-disk " + disk_space + " HDD" + preemptible: preemptible + maxRetries: max_retries + cpu: cpu + } +} + +task bwa_se { + input { + FastqSingleRecord fastq_record + File ref_fasta + File ref_dict + File ref_amb + File ref_ann + File ref_bwt + File ref_pac + File ref_sa + File ref_fai + Int cpu = 16 + Int preemptible = 1 + Int max_retries = 0 + Int additional_memory_mb = 0 + Int additional_disk_gb = 0 + } + File fastq = fastq_record.fastq + String readgroup = fastq_record.readgroup + String outbam = fastq_record.readgroup_id + ".bam" + Float ref_size = size([ref_fasta, ref_dict, ref_amb, ref_ann, ref_bwt, ref_pac, ref_sa, ref_fai], "GiB") + Int mem = ceil(size(fastq, "MiB")) + 10000 + additional_memory_mb + Int disk_space = ceil((size(fastq, "GiB") * 4) + ref_size) + 10 + additional_disk_gb + + command { + set -euo pipefail + bwa mem \ + -t ~{cpu} \ + -T 0 \ + -R "~{readgroup}" \ + ~{ref_fasta} \ + ~{fastq} \ + | samtools view \ + -Shb \ + -o ~{outbam} \ + - + } + + output { + File bam = "~{outbam}" + } + + runtime { + docker: "broadgdac/bwa:0.7.15-r1142-dirty" + memory: mem + " MiB" + disks: "local-disk " + disk_space + " HDD" + preemptible: preemptible + maxRetries: max_retries + cpu: cpu + } +} + +task picard_markduplicates { + input { + Array[File]+ bams + String outbam + + Int compression_level = 2 + Int preemptible_tries = 1 + Int max_retries = 0 + String validation_stringency = "SILENT" + String assume_sort_order = "queryname" + + # The program default for READ_NAME_REGEX is appropriate in nearly every case. + # Sometimes we wish to supply "null" in order to turn off optical duplicate detection + # This can be desirable if you don't mind the estimated library size being wrong and optical duplicate detection is taking >7 days and failing + String? read_name_regex + Int memory_multiplier = 1 + Int additional_disk_gb = 20 + + Float? 
sorting_collection_size_ratio + } + Float total_input_size = size(bams, "GiB") + String metrics_filename = outbam + ".metrics" + # The merged bam will be smaller than the sum of the parts so we need to account for the unmerged inputs and the merged output. + # Mark Duplicates takes in as input readgroup bams and outputs a slightly smaller aggregated bam. Giving .25 as wiggleroom + Float md_disk_multiplier = 3 + Int disk_size = ceil(md_disk_multiplier * total_input_size) + additional_disk_gb + + Float memory_size = 7.5 * memory_multiplier + Int java_memory_size = (ceil(memory_size) - 2) + + # Task is assuming query-sorted input so that the Secondary and Supplementary reads get marked correctly + # This works because the output of BWA is query-grouped and therefore, so is the output of MergeBamAlignment. + # While query-grouped isn't actually query-sorted, it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname" + + command { + java -Dsamjdk.compression_level=~{compression_level} -Xms~{java_memory_size}g -jar /usr/picard/picard.jar \ + MarkDuplicates \ + INPUT=~{sep=' INPUT=' bams} \ + OUTPUT=~{outbam} \ + METRICS_FILE=~{metrics_filename} \ + VALIDATION_STRINGENCY=~{validation_stringency} \ + ASSUME_SORT_ORDER=~{assume_sort_order} \ + ~{"SORTING_COLLECTION_SIZE_RATIO=" + sorting_collection_size_ratio} \ + ~{"READ_NAME_REGEX=" + read_name_regex} + } + + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + preemptible: preemptible_tries + memory: "~{memory_size} GiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File bam = "~{outbam}" + File metrics = "~{metrics_filename}" + } +} + +task sort_and_index_markdup_bam { + input { + File input_bam + String tmp_prefix = "tmp_srt" + Int cpu = 8 + Int preemptible = 2 + Int max_retries = 0 + Int additional_memory_mb = 0 + Int additional_disk_gb = 0 + } + Int mem = ceil(size(input_bam, "MiB")) + 10000 + additional_memory_mb + Int disk_space = ceil(size(input_bam, "GiB") * 3.25) + 20 + additional_disk_gb + Int mem_per_thread = floor(mem / cpu * 0.85) + Int index_threads = cpu - 1 + String output_bam = basename(input_bam) + String output_bai = basename(input_bam, ".bam") + ".bai" + + command { + set -euo pipefail + samtools sort \ + -@ ~{cpu} \ + -o ~{output_bam} \ + -T ~{tmp_prefix} \ + -m ~{mem_per_thread}M \ + ~{input_bam} + samtools index \ + -b \ + -@ ~{index_threads} \ + ~{output_bam} \ + ~{output_bai} + } + + output { + File bam = "~{output_bam}" + File bai = "~{output_bai}" + } + + runtime { + docker: "us.gcr.io/broad-gotc-prod/samtools:1.10" + memory: mem + " MiB" + disks: "local-disk " + disk_space + " HDD" + preemptible: preemptible + maxRetries: max_retries + cpu: cpu + } +} + + +task gatk_baserecalibrator { + input { + File bam + File dbsnp_vcf + File dbsnp_vcf_index + File ref_dict + File ref_fasta + File ref_fai + Int cpu = 2 + Int preemptible = 2 + Int max_retries = 0 + Int additional_memory_mb = 0 + Int additional_disk_gb = 0 + } + String output_grp = basename(bam, ".bam") + "_bqsr.grp" + Float ref_size = size([ref_fasta, ref_fai, ref_dict], "GiB") + Float dbsnp_size = size([dbsnp_vcf, dbsnp_vcf_index], "GiB") + Int mem = ceil(size(bam, "MiB")) + 6000 + additional_memory_mb + Int jvm_mem = mem - 1000 + Int max_heap = mem - 500 + Int disk_space = ceil(size(bam, "GiB") + ref_size + dbsnp_size) + 20 + additional_disk_gb + + parameter_meta { + bam: {localization_optional: true} + dbsnp_vcf: {localization_optional: true} + dbsnp_vcf_index: {localization_optional: true} + ref_dict: 
{localization_optional: true} + ref_fasta: {localization_optional: true} + ref_fai: {localization_optional: true} + } + + command { + gatk --java-options "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal -Xlog:gc=debug:file=gc_log.log -Xms~{jvm_mem}m -Xmx~{max_heap}m" \ + BaseRecalibrator \ + --input ~{bam} \ + --known-sites ~{dbsnp_vcf} \ + --reference ~{ref_fasta} \ + --tmp-dir . \ + --output ~{output_grp} + } + + output { + File bqsr_recal_file = "~{output_grp}" + } + + runtime { + docker: "us.gcr.io/broad-gatk/gatk:4.6.0.0" + memory: mem + " MiB" + disks: "local-disk " + disk_space + " HDD" + preemptible: preemptible + maxRetries: max_retries + cpu: cpu + } +} + +task gatk_applybqsr { + input { + File input_bam + File bqsr_recal_file + Boolean emit_original_quals = true + Int cpu = 2 + Int preemptible = 2 + Int max_retries = 0 + Int additional_memory_mb = 0 + Int additional_disk_gb = 0 + } + String output_bam = basename(input_bam) + String output_bai = basename(input_bam, ".bam") + ".bai" + Int mem = ceil(size(input_bam, "MiB")) + 4000 + additional_memory_mb + Int jvm_mem = mem - 1000 + Int max_heap = mem - 500 + Int disk_space = ceil((size(input_bam, "GiB") * 3)) + 20 + additional_disk_gb + + parameter_meta { + input_bam: {localization_optional: true} + } + + command { + gatk --java-options "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal -Xlog:gc=debug:file=gc_log.log -Xms~{jvm_mem}m -Xmx~{max_heap}m" \ + ApplyBQSR \ + --input ~{input_bam} \ + --bqsr-recal-file ~{bqsr_recal_file} \ + --emit-original-quals ~{emit_original_quals} \ + --tmp-dir . \ + --output ~{output_bam} + } + + output { + File bam = "~{output_bam}" + File bai = "~{output_bai}" + } + + runtime { + docker: "us.gcr.io/broad-gatk/gatk:4.6.0.0" + memory: mem + " MiB" + disks: "local-disk " + disk_space + " HDD" + preemptible: preemptible + maxRetries: max_retries + cpu: cpu + } +} + +# Collect quality metrics from the aggregated bam +task collect_insert_size_metrics { + input { + File input_bam + String output_bam_prefix + Int additional_memory_mb = 0 + Int additional_disk_gb = 0 + } + Int mem = ceil(size(input_bam, "GiB")) + 7000 + additional_memory_mb + Int jvm_mem = mem - 1000 + Int max_heap = mem - 500 + Int disk_size = ceil(size(input_bam, "GiB")) + 20 + additional_disk_gb + + command { + java -Xms~{jvm_mem}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar \ + CollectInsertSizeMetrics \ + INPUT=~{input_bam} \ + OUTPUT=~{output_bam_prefix}.insert_size_metrics \ + HISTOGRAM_FILE=~{output_bam_prefix}.insert_size_histogram.pdf + } + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + memory: mem + " MiB" + disks: "local-disk " + disk_size + " HDD" + } + output { + File insert_size_histogram_pdf = "~{output_bam_prefix}.insert_size_histogram.pdf" + File insert_size_metrics = "~{output_bam_prefix}.insert_size_metrics" + } +} + + +workflow GDCWholeGenomeSomaticSingleSample { + + String pipeline_version = "1.3.3" + + input { + File? input_cram + File? input_bam + File? cram_ref_fasta + File? cram_ref_fasta_index + File? output_map + String? unmapped_bam_suffix + String base_file_name + + File? 
ubam + + File contamination_vcf + File contamination_vcf_index + File dbsnp_vcf + File dbsnp_vcf_index + + File ref_fasta + File ref_fai + File ref_dict + File ref_amb + File ref_ann + File ref_bwt + File ref_pac + File ref_sa + } + + String outbam = if (defined(ubam) || defined(input_bam)) then basename(select_first([ubam, input_bam]), ".bam") + ".aln.mrkdp.bam" + else basename(select_first([input_cram]), ".cram") + ".aln.mrkdp.bam" + + if (!defined(ubam)) { + call ToUbams.CramToUnmappedBams { + input: + input_cram = input_cram, + input_bam = input_bam, + ref_fasta = select_first([cram_ref_fasta, ref_fasta]), + ref_fasta_index = select_first([cram_ref_fasta_index, ref_fai]), + output_map = output_map, + base_file_name = base_file_name, + unmapped_bam_suffix = unmapped_bam_suffix + } + } + + Array[File] ubams = if defined(ubam) then [select_first([ubam])] else select_first([CramToUnmappedBams.unmapped_bams]) + + scatter (ubam in ubams) { + call bam_readgroup_to_contents { + input: bam = ubam + } + + call biobambam_bamtofastq { + input: filename = ubam + } + } + + Array[Object] readgroups = flatten(bam_readgroup_to_contents.readgroups) + + Array[File] fastq1 = flatten(biobambam_bamtofastq.output_fastq1) + Array[File] fastq2 = flatten(biobambam_bamtofastq.output_fastq2) + Array[File] fastq_o1 = flatten(biobambam_bamtofastq.output_fastq_o1) + Array[File] fastq_o2 = flatten(biobambam_bamtofastq.output_fastq_o2) + Array[File] fastq_s = flatten(biobambam_bamtofastq.output_fastq_s) + + Int pe_count = length(fastq1) + Int o1_count = length(fastq_o1) + Int o2_count = length(fastq_o2) + Int s_count = length(fastq_s) + + if (pe_count > 0) { + call emit_pe_records { + input: + fastq1_files = fastq1, + fastq2_files = fastq2, + readgroups = readgroups + } + scatter (pe_record in emit_pe_records.fastq_pair_records) { + call bwa_pe { + input: + fastq_record = pe_record, + ref_fasta = ref_fasta, + ref_fai = ref_fai, + ref_dict = ref_dict, + ref_amb = ref_amb, + ref_ann = ref_ann, + ref_bwt = ref_bwt, + ref_pac = ref_pac, + ref_sa = ref_sa + } + } + } + + if (o1_count + o2_count + s_count > 0) { + call emit_se_records { + input: + fastq_o1_files = fastq_o1, + fastq_o2_files = fastq_o2, + fastq_s_files = fastq_s, + readgroups = readgroups + } + scatter (se_record in emit_se_records.fastq_single_records) { + call bwa_se { + input: + fastq_record = se_record, + ref_fasta = ref_fasta, + ref_fai = ref_fai, + ref_dict = ref_dict, + ref_amb = ref_amb, + ref_ann = ref_ann, + ref_bwt = ref_bwt, + ref_pac = ref_pac, + ref_sa = ref_sa + } + } + } + + Array[File] aligned_bams = flatten([select_first([bwa_pe.bam, []]), select_first([bwa_se.bam, []])]) + + call picard_markduplicates { + input: + bams = aligned_bams, + outbam = outbam + } + + call sort_and_index_markdup_bam { + input: input_bam = picard_markduplicates.bam + } + + call CheckContamination.CalculateSomaticContamination as check_contamination { + input: + reference = ref_fasta, + reference_dict = ref_dict, + reference_index = ref_fai, + tumor_cram_or_bam = sort_and_index_markdup_bam.bam, + tumor_crai_or_bai = sort_and_index_markdup_bam.bai, + contamination_vcf = contamination_vcf, + contamination_vcf_index = contamination_vcf_index + } + + call gatk_baserecalibrator { + input: + bam = sort_and_index_markdup_bam.bam, + ref_fasta = ref_fasta, + ref_fai = ref_fai, + ref_dict = ref_dict, + dbsnp_vcf = dbsnp_vcf, + dbsnp_vcf_index = dbsnp_vcf_index + } + + call gatk_applybqsr { + input: + input_bam = sort_and_index_markdup_bam.bam, + bqsr_recal_file = 
gatk_baserecalibrator.bqsr_recal_file + } + + String output_bam_prefix = basename(gatk_applybqsr.bam, ".bam") + + call collect_insert_size_metrics { + input: + input_bam = gatk_applybqsr.bam, + output_bam_prefix = output_bam_prefix + } + + output { + Array[File]? validation_report = CramToUnmappedBams.validation_report + Array[File]? unmapped_bams = CramToUnmappedBams.unmapped_bams + File bam = gatk_applybqsr.bam + File bai = gatk_applybqsr.bai + File md_metrics = picard_markduplicates.metrics + File insert_size_metrics = collect_insert_size_metrics.insert_size_metrics + File insert_size_histogram_pdf = collect_insert_size_metrics.insert_size_histogram_pdf + File contamination = check_contamination.contamination + } + meta { + allowNestedInputs: true + } +} diff --git a/GDCRealignment/GDCWholeGenomeSomaticSingleSample/README.md b/GDCRealignment/GDCWholeGenomeSomaticSingleSample/README.md new file mode 100644 index 0000000..0ddff73 --- /dev/null +++ b/GDCRealignment/GDCWholeGenomeSomaticSingleSample/README.md @@ -0,0 +1,12 @@ +## Announcement: GDC is Deprecated 9/12/2024 + +The GDC Whole Genome Somatic Single Sample workflow is deprecated and no longer supported. However, documentation is still available. Read about the [GDC Whole Genome Somatic Single Sample Pipeline](https://broadinstitute.github.io/warp/documentation/Pipelines/Genomic_Data_Commons_Whole_Genome_Somatic/) on the [WARP documentation site](https://broadinstitute.github.io/warp/)! + +## Introduction to the GDC Whole Genome Somatic Single Sample Pipeline +The GDC Whole Genome Somatic Single Sample (abbreviated GDC here) pipeline is the alignment and preprocessing workflow for genomic data designed for the National Cancer Institute's [Genomic Data Commons](https://gdc.cancer.gov/about-gdc). + +A high-level overview of the pipeline in addition to tool parameters are available on the [GDC Documentation site](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline/). + +Overall, the pipeline converts reads (CRAM or BAM) to FASTQ and (re)aligns them to the latest human reference genome (see the [GDC Reference Genome](#gdc-reference-genome) section below). Each read group is aligned separately. Read group alignments that belong to a single sample are then merged and duplicate reads are flagged for downstream variant calling. + +
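
For orientation, the sketch below strings those stages together as plain shell commands, mirroring the task definitions in `GDCWholeGenomeSomaticSingleSample.wdl` above (biobambam2 `bamtofastq`, `bwa mem`, Picard `MarkDuplicates`, `samtools sort`/`index`, GATK `BaseRecalibrator`/`ApplyBQSR`). It is a minimal illustration only: the input/output file names, read-group string, reference path, and tool locations (e.g. a local `picard.jar`) are placeholders, and the WDL workflow remains the authoritative definition of exact parameters, scattering, and resource settings.

```bash
#!/usr/bin/env bash
# Illustrative sketch of the realignment flow; all paths and read-group values are placeholders.
set -euo pipefail

REF=ref.fa                    # GDC GRCh38 reference FASTA (placeholder path)
RG_ID=HK3TJBCX2.1             # example read-group ID taken from the input BAM header
RG="@RG\tID:${RG_ID}\tSM:sample1\tLB:lib1\tPL:ILLUMINA"   # placeholder read-group string

# 1. Convert the input BAM back to per-read-group FASTQs (as in biobambam_bamtofastq).
bamtofastq collate=1 exclude=QCFAIL,SECONDARY,SUPPLEMENTARY gz=1 \
    filename=input.bam outputperreadgroup=1 outputdir=.

# 2. Align each read group separately (as in bwa_pe), keeping all alignments (-T 0).
bwa mem -t 16 -T 0 -R "${RG}" "${REF}" "${RG_ID}_1.fq.gz" "${RG_ID}_2.fq.gz" \
    | samtools view -Shb -o "${RG_ID}.bam" -

# 3. Flag duplicates; repeat INPUT= for each read-group BAM to merge them
#    (as in picard_markduplicates, which assumes queryname-grouped input).
java -jar picard.jar MarkDuplicates \
    INPUT="${RG_ID}.bam" OUTPUT=sample1.aln.mrkdp.bam \
    METRICS_FILE=sample1.aln.mrkdp.bam.metrics \
    ASSUME_SORT_ORDER=queryname VALIDATION_STRINGENCY=SILENT

# 4. Coordinate-sort and index before recalibration (as in sort_and_index_markdup_bam).
samtools sort -o sample1.sorted.bam sample1.aln.mrkdp.bam
samtools index -b sample1.sorted.bam sample1.sorted.bai

# 5. Base quality score recalibration (as in gatk_baserecalibrator / gatk_applybqsr).
gatk BaseRecalibrator --input sample1.sorted.bam --reference "${REF}" \
    --known-sites dbsnp.vcf.gz --output sample1_bqsr.grp
gatk ApplyBQSR --input sample1.sorted.bam --bqsr-recal-file sample1_bqsr.grp \
    --emit-original-quals true --output sample1.final.bam
```

In the workflow itself these steps run as separate WDL tasks, each with its own Docker image, disk sizing, and scatter over read groups, so the sketch should be read as a conceptual map of the pipeline rather than a runnable substitute for it.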