diff --git a/.dockstore.yml b/.dockstore.yml index 74db3c3..8947045 100644 --- a/.dockstore.yml +++ b/.dockstore.yml @@ -118,4 +118,24 @@ workflows: subclass: WDL primaryDescriptorPath: /CleanupFailedSubmissions/Cleanup_Failed_Submissions.wdl testParameterFiles: - - /CleanupFailedSubmissions/Cleanup_Failed_Submissions.inputs.json \ No newline at end of file + - /CleanupFailedSubmissions/Cleanup_Failed_Submissions.inputs.json + - name: codec_bcl2fastq + subclass: WDL + primaryDescriptorPath: /CODEC/codec_bcl2fastq.wdl + testParameterFiles: + - /CODEC/codec_bcl2fastq.inputs.json + - name: demux_CODEC + subclass: WDL + primaryDescriptorPath: /CODEC/demux_CODEC.wdl + testParameterFiles: + - /CODEC/demux_CODEC.inputs.json + - name: SingleSampleCODEC + subclass: WDL + primaryDescriptorPath: /CODEC/SingleSampleCODEC.wdl + testParameterFiles: + - /CODEC/SingleSampleCODEC.inputs.json + - name: SigProfiler + subclass: WDL + primaryDescriptorPath: /CODEC/SigProfiler.wdl + testParameterFiles: + - /CODEC/SigProfiler.inputs.json diff --git a/CODEC/README.md b/CODEC/README.md new file mode 100644 index 0000000..d26ad54 --- /dev/null +++ b/CODEC/README.md @@ -0,0 +1,19 @@ +# Signature Profiling WDL for CODEC Mutlist Output +The SingleSampleCODEC pipeline produces text files (mutlists) of the mutations it discovers. + +This workflow summarizes and plots mutation spectra in the 96 trinucleotide contexts and generates a mutation matrix that is then used to assign SBS (Single Base Substitution) signatures to the SNVs, using the reference database from COSMIC (https://cancer.sanger.ac.uk/signatures/). + +The signature profiling tool comes from https://github.com/AlexandrovLab/SigProfilerAssignment and has been installed in the Docker image. + +The output of this WDL includes: +1) SpectrumPlots +2) MutationMetrics +3) SignatureCount +4) SignatureProportionPDF +5) SignatureStackedPlot +6) TMBPlot +7) DecomposedSignatureProbabilities
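+
+As a quick orientation, the sketch below shows how SigProfilerAssignment is typically driven against the 96-context matrix this workflow produces. The matrix file name and output folder mirror the defaults in SigProfiler.wdl, but the exact options used inside the task's Python heredoc are not shown in this diff, so treat the flags here as assumptions rather than the task's literal code:
+
+```python
+# Minimal sketch: fit COSMIC SBS signatures to a 96-context mutation matrix.
+from SigProfilerAssignment import Analyzer as Analyze
+
+Analyze.cosmic_fit(
+    samples="trinuc_mutation_metrics.txt",  # matrix from mutlist_to_96_contexts
+    output="SigProfiler-output",            # the task's OutputFolder default
+    input_type="matrix",                    # a 96-context matrix, not per-sample VCFs
+    context_type="96",
+    genome_build="GRCh38",                  # this PR runs against hg38 references
+)
+```
+
+### Citation
+Díaz-Gay et al. 2023 Bioinformatics and Tate et al.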
2019 Nucleic Acids Research \ No newline at end of file diff --git a/CODEC/SigProfiler.inputs.json b/CODEC/SigProfiler.inputs.json new file mode 100644 index 0000000..07ec31a --- /dev/null +++ b/CODEC/SigProfiler.inputs.json @@ -0,0 +1 @@ +{"SigProfiler.GenomeFasta":"${workspace.referenceData_hg38_ref_fasta}","SigProfiler.MutlistFiles":"${this.samples.variants_called}","SigProfiler.mutlist_to_96_contexts.GenomeFastaIndex":"${workspace.referenceData_hg38_ref_fasta_index}"} \ No newline at end of file diff --git a/CODEC/SigProfiler.wdl b/CODEC/SigProfiler.wdl new file mode 100644 index 0000000..01f8fa6 --- /dev/null +++ b/CODEC/SigProfiler.wdl @@ -0,0 +1,145 @@ +version 1.0 + +workflow SigProfiler { + input { + Array[File] MutlistFiles + File GenomeFasta + } + + call mutlist_to_96_contexts { + input: + MutlistFiles = MutlistFiles, + GenomeFasta = GenomeFasta + } + call sigprofiler_analysis { + input: + MutationMetrics = mutlist_to_96_contexts.MutationMetrics + + } + call PlotSignatures { + input: + SignatureCount = sigprofiler_analysis.SignatureCount + } + + output { + File MutationMetrics = mutlist_to_96_contexts.MutationMetrics + File SpectrumPlots = mutlist_to_96_contexts.SpectrumPlots + File DecomposedSignatureProbabilities = sigprofiler_analysis.DecomposedSignatureProbabilities + File SignatureStackedPlot = sigprofiler_analysis.SignatureStackedPlot + File TMBPlot = sigprofiler_analysis.TMBPlot + File SignatureCount = sigprofiler_analysis.SignatureCount + File SignatureProportionPDF = PlotSignatures.signature_proportions_pdf + } +} + + + +task mutlist_to_96_contexts { + input { + Array[File] MutlistFiles + File GenomeFasta + File GenomeFastaIndex + } + + command { + Rscript /scripts/96_contexts_mutations.R "~{sep=' ' MutlistFiles}" ~{GenomeFasta} + } + + output { + File MutationMetrics = "trinuc_mutation_metrics.txt" + File SpectrumPlots = "all_sample_spectrums.pdf" + } + + runtime { + docker: "us.gcr.io/tag-public/sigprofiler:v1" + memory: "8 GB" + disks: "local-disk 20 HDD" + } +} + +task sigprofiler_analysis { + input { + File MutationMetrics + String OutputFolder = "SigProfiler-output" + } + + command { + python3 < 0] + + # Plot the data + plt.figure(figsize=(16, 9)) + sns.scatterplot(data=SigCounts_long, x="Samples", y="Signature", size="Proportion", sizes=(20, 200), legend=False) + plt.xticks(rotation=90) + plt.xlabel("Sample Name", fontsize=16) + plt.ylabel("Signature", fontsize=16) + plt.title("Signature Proportions by Sample", fontsize=20, pad = 20) + plt.grid(axis='y') + ax = plt.gca() + ax.spines['top'].set_visible(False) + ax.spines['right'].set_visible(False) + ax.spines['left'].set_visible(False) + ax.spines['bottom'].set_visible(False) + plt.tight_layout() + plt.savefig("signature_proportions.pdf", format="pdf") + EOF + } + + output { + File signature_proportions_pdf = "signature_proportions.pdf" + } + + runtime { + docker: "us.gcr.io/tag-public/sigprofiler:v1" + memory: "8 GB" + disks: "local-disk 20 HDD" + } +} diff --git a/CODEC/SingleSampleCODEC.inputs.json b/CODEC/SingleSampleCODEC.inputs.json new file mode 100644 index 0000000..94a48f3 --- /dev/null +++ b/CODEC/SingleSampleCODEC.inputs.json @@ -0,0 +1,63 @@ +{ + "SingleSampleCODEC.AlignMolecularConsensusReads.cpu_cores": "${2}", + "SingleSampleCODEC.AlignMolecularConsensusReads.disk_size": "${}", + "SingleSampleCODEC.AlignMolecularConsensusReads.memory": "${}", + "SingleSampleCODEC.AlignMolecularConsensusReads.threads": "${8}", + "SingleSampleCODEC.AlignRawTrimmed.disk": "${64}", + 
"SingleSampleCODEC.AlignRawTrimmed.disk_size": "${}", + "SingleSampleCODEC.AlignRawTrimmed.mem": "${}", + "SingleSampleCODEC.AlignRawTrimmed.memory": "${64}", + "SingleSampleCODEC.CDSByProduct.disk_size": "${}", + "SingleSampleCODEC.CDSByProduct.mem": "${}", + "SingleSampleCODEC.CSS_SFC_ErrorMetrics.disk_size": "${800}", + "SingleSampleCODEC.CSS_SFC_ErrorMetrics.memory": "${}", + "SingleSampleCODEC.CollectConsensusWgsMetrics.disk_size": "${160}", + "SingleSampleCODEC.CollectConsensusWgsMetrics.memory": "${}", + "SingleSampleCODEC.CollectInsertSizeMetrics.disk_size": "${100}", + "SingleSampleCODEC.CollectInsertSizeMetrics.memory": "${32}", + "SingleSampleCODEC.CollectRawWgsMetrics.disk_size": "${160}", + "SingleSampleCODEC.CollectRawWgsMetrics.memory": "${}", + "SingleSampleCODEC.FgbioCollapseReadFamilies.disk_size": "${200}", + "SingleSampleCODEC.FgbioCollapseReadFamilies.memory": "${}", + "SingleSampleCODEC.GroupReadByUMI.disk_size": "${400}", + "SingleSampleCODEC.GroupReadByUMI.memory": "${32}", + "SingleSampleCODEC.MarkRawDuplicates.disk_size": "${200}", + "SingleSampleCODEC.MarkRawDuplicates.memory": "${64}", + "SingleSampleCODEC.MergeAndSortMoleculeConsensusReads.disk_size": "${160}", + "SingleSampleCODEC.MergeAndSortMoleculeConsensusReads.memory": "${64}", + "SingleSampleCODEC.MergeLogSplit.disk_size": "${}", + "SingleSampleCODEC.MergeLogSplit.mem": "${}", + "SingleSampleCODEC.MergeSplit.disk_size": "${200}", + "SingleSampleCODEC.MergeSplit.memory": "${32}", + "SingleSampleCODEC.RAW_SFC_ErrorMetrics.disk_size": "${800}", + "SingleSampleCODEC.RAW_SFC_ErrorMetrics.memory": "${32}", + "SingleSampleCODEC.ReplaceRawReadGroup.disk_size": "${200}", + "SingleSampleCODEC.ReplaceRawReadGroup.memory": "${32}", + "SingleSampleCODEC.SortBam.disk_size": "${200}", + "SingleSampleCODEC.SortBam.mem": "${}", + "SingleSampleCODEC.SplitFastq1.disk_size": "${400}", + "SingleSampleCODEC.SplitFastq1.memory": "${32}", + "SingleSampleCODEC.SplitFastq2.disk_size": "${400}", + "SingleSampleCODEC.SplitFastq2.memory": "${32}", + "SingleSampleCODEC.Trim.disk_size": "${64}", + "SingleSampleCODEC.Trim.mem": "${32}", + "SingleSampleCODEC.ZipperBamAlignment.disk_size": "${200}", + "SingleSampleCODEC.ZipperBamAlignment.mem": "${32}", + "SingleSampleCODEC.eval_genome_bed": "gs://gptag/CODEC/ddbtp_codec_easy_regions.hg38.bed", + "SingleSampleCODEC.eval_genome_interval": "gs://gptag/CODEC/ddbtp_codec_easy_regions.hg38.interval_list", + "SingleSampleCODEC.fastq1": "${this.fastq1}", + "SingleSampleCODEC.fastq2": "${this.fastq2}", + "SingleSampleCODEC.germline_bam": "${this.germline_bam}", + "SingleSampleCODEC.germline_bam_index": "${this.germline_bam_index}", + "SingleSampleCODEC.num_parallel": "${40}", + "SingleSampleCODEC.reference_amb": "${workspace.referenceData_hg38_ref_amb}", + "SingleSampleCODEC.reference_ann": "${workspace.referenceData_hg38_ref_ann}", + "SingleSampleCODEC.reference_bwt": "${workspace.referenceData_hg38_ref_bwt}", + "SingleSampleCODEC.reference_dict": "${workspace.referenceData_hg38_ref_dict}", + "SingleSampleCODEC.reference_fasta": "${workspace.referenceData_hg38_ref_fasta}", + "SingleSampleCODEC.reference_fasta_index": "${workspace.referenceData_hg38_ref_fasta_index}", + "SingleSampleCODEC.reference_pac": "${workspace.referenceData_hg38_ref_pac}", + "SingleSampleCODEC.reference_sa": "${workspace.referenceData_hg38_ref_sa}", + "SingleSampleCODEC.sample_id": "${this.sample_id}", + "SingleSampleCODEC.sort_memory": "2G" +} \ No newline at end of file diff --git a/CODEC/SingleSampleCODEC.wdl 
b/CODEC/SingleSampleCODEC.wdl new file mode 100644 index 0000000..10aac0a --- /dev/null +++ b/CODEC/SingleSampleCODEC.wdl @@ -0,0 +1,1153 @@ +version 1.0 + +workflow SingleSampleCODEC_targeted { + input { + String sample_id + File fastq1 + File fastq2 + File reference_fasta + File reference_fasta_index + File reference_dict + File reference_pac + File reference_amb + File reference_ann + File reference_bwt + File reference_sa + File? germline_bam + File? germline_bam_index + Int num_parallel + String sort_memory + File eval_genome_interval = "gs://gptag/CODEC/GRCh38_notinalldifficultregions.interval_list" + File eval_genome_bed = "gs://gptag/CODEC/GRCh38_notinalldifficultregions.bed" + File? bait_intervals + File? target_intervals + Boolean is_captured_data + Boolean create_maf + } + call SplitFastq1 { + input: + fastq_read1 = fastq1, + nsplit = num_parallel, + sample_id = sample_id + } + call SplitFastq2 { + input: + fastq_read2 = fastq2, + nsplit = num_parallel, + sample_id = sample_id + } + scatter (split_index in range(num_parallel)) { + String output_prefix = "${sample_id}_split.${split_index}" + call Trim { + input: + read1 = SplitFastq1.split_read1[split_index], + read2 = SplitFastq2.split_read2[split_index], + output_prefix = output_prefix, + split = split_index, + sample_id = sample_id + } + call AlignRawTrimmed { + input: + bam_input = Trim.trimmed_bam, + reference_fasta = reference_fasta, + reference_fasta_index = reference_fasta_index, + reference_dict = reference_dict, + reference_pac = reference_pac, + reference_amb = reference_amb, + reference_ann = reference_ann, + reference_bwt = reference_bwt, + reference_sa = reference_sa, + sample_id = sample_id, + split = split_index + } + call ZipperBamAlignment { + input: + mapped_bam = AlignRawTrimmed.aligned_bam, + unmapped_bam = Trim.trimmed_bam, + reference_fasta = reference_fasta, + reference_fasta_index = reference_fasta_index, + reference_dict = reference_dict, + sample_id = sample_id, + split = split_index, + sort_memory = sort_memory + } + } + call MergeSplit { + input: + bam_files = ZipperBamAlignment.bam, + sample_id = sample_id + } + call MergeLogSplit { + input: + log_files = Trim.trimmed_log, + sample_id = sample_id + } + call SortBam { + input: + bam_file = MergeSplit.merged_bam, + sample_id = sample_id + } + call ByProductMetrics { + input: + trim_log = MergeLogSplit.merged_log, + highconf_bam = SortBam.sorted_bam, + sample_id = sample_id + } + call ReplaceRawReadGroup { + input: + raw_bam = MergeSplit.merged_bam, + sample_id = sample_id + } + call CollectInsertSizeMetrics { + input: + input_bam = ReplaceRawReadGroup.bam, + sample_id = sample_id + } + call GroupReadByUMI { + input: + input_bam = ReplaceRawReadGroup.bam, + sample_id = sample_id + } + if (is_captured_data) { + call DuplexRecoveryMetrics { + input: + groupbyumi_bam = GroupReadByUMI.groupbyumi_bam, + sample_id = sample_id, + duplex_eval_bed = bait_intervals + } + call PlotDuplexRecoveryByTarget { + input: + sample_id = sample_id, + duplex_recovery_metrics = DuplexRecoveryMetrics.duplex_recovery_metrics + } + } + call FgbioCollapseReadFamilies { + input: + grouped_umi_bam = GroupReadByUMI.groupbyumi_bam, + sample_id = sample_id + } + call AlignMolecularConsensusReads { + input: + mol_consensus_bam = FgbioCollapseReadFamilies.mol_consensus_bam, + sample_id = sample_id, + reference_fasta = reference_fasta, + reference_fasta_index = reference_fasta_index, + reference_dict = reference_dict, + reference_pac = reference_pac, + reference_amb = reference_amb, + 
reference_ann = reference_ann, + reference_bwt = reference_bwt, + reference_sa = reference_sa + } + call MergeAndSortMoleculeConsensusReads { + input: + mapped_sam = AlignMolecularConsensusReads.aligned_bam, + unmapped_bam = FgbioCollapseReadFamilies.mol_consensus_bam, + reference_fasta = reference_fasta, + reference_fasta_index = reference_fasta_index, + reference_dict = reference_dict, + sample_id = sample_id, + sort_memory = sort_memory + } + if (is_captured_data) { + call CollectSelectionMetrics { + input: + reference = reference_fasta, + reference_index = reference_fasta_index, + reference_dict = reference_dict, + bam_file = MergeAndSortMoleculeConsensusReads.bam, + bam_index = MergeAndSortMoleculeConsensusReads.bai, + sample_id = sample_id, + bait_intervals = bait_intervals, + target_intervals = target_intervals + } + call Hs_metrics { + input: + output_selection_metrics = CollectSelectionMetrics.output_selection_metrics + } + } + call CSS_SFC_ErrorMetrics { + input: + ConsensusAlignedBam = MergeAndSortMoleculeConsensusReads.bam, + ConsensusAlignedBai = MergeAndSortMoleculeConsensusReads.bai, + sample_id = sample_id, + reference_fasta = reference_fasta, + reference_fasta_index = reference_fasta_index, + reference_dict = reference_dict, + reference_pac = reference_pac, + reference_amb = reference_amb, + reference_ann = reference_ann, + reference_bwt = reference_bwt, + reference_sa = reference_sa, + germline_bam = germline_bam, + germline_bam_index = germline_bam_index, + eval_genome_bed = eval_genome_bed + } + if (create_maf) { + call codec2MAF { + input: + variants_called = CSS_SFC_ErrorMetrics.variants_called, + sample_id = sample_id + } + } + call QC_metrics { + input: + byproduct_metrics = ByProductMetrics.byproduct_metrics, + umiHistogram = GroupReadByUMI.umi_histogram, + InsertSizeMetrics = CollectInsertSizeMetrics.insert_size_metrics, + mutant_metrics = CSS_SFC_ErrorMetrics.mutant_metrics, + } + call EvalGenomeBases { + input: + eval_genome_interval = eval_genome_interval + } + call CalculateDuplexDepth { + input: + eval_genome_bases = EvalGenomeBases.eval_genome_bases, + n_bases_eval = QC_metrics.n_bases_eval, + mean_insert_size = QC_metrics.mean_insert_size, + n_total_fastq = QC_metrics.n_total_fastq + } + + output { + File byproduct_metrics = ByProductMetrics.byproduct_metrics + File RAW_BAM = MergeSplit.merged_bam + File RAW_BAM_index = MergeSplit.merged_bai + File MolConsensusBAM = MergeAndSortMoleculeConsensusReads.bam + File MolConsensusBAM_index = MergeAndSortMoleculeConsensusReads.bai + File InsertSizeMetrics = CollectInsertSizeMetrics.insert_size_metrics + File InsertSizeHistogram = CollectInsertSizeMetrics.insert_size_histogram + File umiHistogram = GroupReadByUMI.umi_histogram + + File? output_selection_metrics = CollectSelectionMetrics.output_selection_metrics + File? output_per_target_selection_metrics = CollectSelectionMetrics.output_per_target_selection_metrics + File? 
output_theoretical_sensitivity = CollectSelectionMetrics.output_theoretical_sensitivity + + File mutant_metrics = CSS_SFC_ErrorMetrics.mutant_metrics + File context_count = CSS_SFC_ErrorMetrics.context_count + File variants_called = CSS_SFC_ErrorMetrics.variants_called + + Int n_total_fastq = QC_metrics.n_total_fastq + Int n_correct_products = QC_metrics.n_correct_products + Float pct_correct_products = QC_metrics.pct_correct_products + Int n_double_ligation = QC_metrics.n_double_ligation + Float pct_double_ligation = QC_metrics.pct_double_ligation + Int n_intermol = QC_metrics.n_intermol + Float pct_intermol = QC_metrics.pct_intermol + Int n_adp_dimer = QC_metrics.n_adp_dimer + Float pct_adp_dimer = QC_metrics.pct_adp_dimer + Float duplication_rate = QC_metrics.duplication_rate + Float mean_insert_size = QC_metrics.mean_insert_size + Int median_insert_size = QC_metrics.median_insert_size + Int n_snv = QC_metrics.n_snv + Int n_indel = QC_metrics.n_indel + String n_bases_eval = QC_metrics.n_bases_eval + Float snv_rate = QC_metrics.snv_rate + Float indel_rate = QC_metrics.indel_rate + String eval_genome_bases = EvalGenomeBases.eval_genome_bases + Float? pct_selected_bases = Hs_metrics.pct_selected_bases + Float? mean_bait_coverage = Hs_metrics.mean_bait_coverage + Float? mean_target_coverage = Hs_metrics.mean_target_coverage + Float duplex_depth = CalculateDuplexDepth.duplex_depth + Float duplex_efficiency = CalculateDuplexDepth.duplex_efficiency + File? duplex_recovery_metrics = DuplexRecoveryMetrics.duplex_recovery_metrics + File? ds_duplex_plots = PlotDuplexRecoveryByTarget.ds_duplex_plots + File? outputMAF = codec2MAF.output_maf + } +} + +task SplitFastq1 { + input { + File fastq_read1 + String sample_id + Int nsplit + Int memory = 64 + Int? extra_disk + Int disk_size = ceil(size(fastq_read1, "GB") * 15) + select_first([extra_disk, 0]) + String? docker_override + } + + command <<< + set -e + + zcat ~{fastq_read1} | /CODECsuite/snakemake/script/fastqsplit.pl ~{sample_id}_split_r1 ~{nsplit} + + >>> + + output { + Array[File] split_read1 = glob("~{sample_id}_split_r1.*.fastq") + } + + runtime { + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + memory: memory + " GB" + disks: "local-disk " + disk_size + " HDD" + } +} + +task SplitFastq2 { + input { + File fastq_read2 + Int nsplit + String sample_id + Int memory = 64 + Int? extra_disk + Int disk_size = ceil(size(fastq_read2, "GB") * 15) + select_first([extra_disk, 0]) + String? docker_override + } + + command <<< + set -e + zcat ~{fastq_read2} | /CODECsuite/snakemake/script/fastqsplit.pl ~{sample_id}_split_r2 ~{nsplit} + + >>> + + output { + Array[File] split_read2 = glob("~{sample_id}_split_r2.*.fastq") + } + + runtime { + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + memory: memory + " GB" + disks: "local-disk " + disk_size + " HDD" + } +} + +task Trim { + input { + File read1 + File read2 + String sample_id + String output_prefix + Int split + Int mem = 16 + Int? extra_disk + Int disk_size = ceil(size(read1, "GB") * 8) + select_first([extra_disk, 0]) + String? 
docker_override + } + + command { + set -e + + /CODECsuite/build/codec trim -1 ~{read1} -2 ~{read2} -o ~{output_prefix} -u 3 -U 3 -f 2 -t 2 -s ~{sample_id} > ~{output_prefix}.trim.log + + } + runtime { + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + memory: mem + " GB" + } + output { + File trimmed_bam = "${sample_id}_split.${split}.trim.bam" + File trimmed_log = "${sample_id}_split.${split}.trim.log" + } +} + +task AlignRawTrimmed { + input { + File bam_input + File reference_fasta + File reference_fasta_index + File reference_dict + File reference_pac + File reference_amb + File reference_ann + File reference_bwt + File reference_sa + Int mem = 16 + Int? extra_disk + Int disk_size = ceil(size(bam_input, "GB") * 20) + select_first([extra_disk, 0]) + String sample_id + Int split + String? docker_override + } + + String output_bam_name = "${sample_id}_split.${split}.aligned_tmp.bam" + + command { + samtools fastq ~{bam_input} | \ + bwa mem \ + -K 100000000 \ + -p \ + -Y \ + ~{reference_fasta} - | samtools view - -S -b -o ~{output_bam_name} + } + + output { + File aligned_bam = output_bam_name + } + + runtime { + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + memory: mem + " GB" + disks: "local-disk " + disk_size + " HDD" + preemptible: 3 + } +} + +task ZipperBamAlignment { + input { + File mapped_bam + File unmapped_bam + File reference_fasta + File reference_fasta_index + File reference_dict + Int? mem + Int? disk_size + String sample_id + Int split + String sort_memory + String? docker_override + + } + + String bamOutput = "${sample_id}_split.${split}.trim.aligned.bam" + String baiOutput = "${sample_id}_split.${split}.trim.aligned.bam.bai" + + command { + java -jar /dependencies/fgbio-2.0.2.jar --compression 0 --async-io ZipperBams \ + -i ~{mapped_bam} \ + --unmapped ~{unmapped_bam} \ + --ref ~{reference_fasta} \ + | samtools sort -m ~{sort_memory} - -o ~{bamOutput} -O BAM && samtools index ~{bamOutput} + } + + output { + File bam = bamOutput + File bai = baiOutput + } + + runtime { + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + memory: select_first([mem, 8]) + " GB" + disks: "local-disk " + select_first([disk_size, 16]) + " HDD" + preemptible: 3 + } +} + +task MergeSplit { + input { + Array[File] bam_files + String sample_id + Int memory = 64 + Int disk_size =200 + String? docker_override + } + + command { + set -e + samtools merge -@ 4 ~{sample_id}.raw.aligned.bam ~{sep=' ' bam_files} && \ + samtools index ~{sample_id}.raw.aligned.bam + } + + output { + File merged_bam = "~{sample_id}.raw.aligned.bam" + File merged_bai = "~{sample_id}.raw.aligned.bam.bai" + } + + runtime { + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + memory: memory + " GB" + } +} + +task MergeLogSplit { + input { + Array[File] log_files + String sample_id + Int mem = 32 + Int disk_size = 64 + String? 
docker_override + } + + command { + set -e + python3 /CODECsuite/snakemake/script/agg_log.py ~{sep=' ' log_files} ~{sample_id}.trim.log + } + + output { + File merged_log = "~{sample_id}.trim.log" + } + + runtime { + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + memory: mem + " GB" + } +} + +task CollectSelectionMetrics { + input{ + File reference + File reference_index + File reference_dict + File bam_file + File bam_index + String sample_id + File? bait_intervals + File? target_intervals + Int? preemptible_attempts + Int memory = 32 + Int disk_pad = 0 + Int disk_size = ceil(size(bam_file, "GB") * 2) + disk_pad + String? docker_override + } + + command { + + java -jar /dependencies/picard.jar CollectHsMetrics \ + I=~{bam_file} \ + O=~{sample_id}.selection_metrics \ + PER_TARGET_COVERAGE=~{sample_id}.per_target_selection_metrics \ + THEORETICAL_SENSITIVITY_OUTPUT=~{sample_id}.theoretical_sensitivity \ + BAIT_INTERVALS=~{bait_intervals} \ + TARGET_INTERVALS=~{target_intervals} \ + REFERENCE_SEQUENCE=~{reference} + } + + output { + File output_selection_metrics = "~{sample_id}.selection_metrics" + File output_per_target_selection_metrics = "~{sample_id}.per_target_selection_metrics" + File output_theoretical_sensitivity = "~{sample_id}.theoretical_sensitivity" + } + + runtime { + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + memory: memory + " GB" + preemptible: select_first([preemptible_attempts, 3]) + } +} + +task Hs_metrics { + input { + File output_selection_metrics + } + + command <<< + python3 <>> + + output { + Float pct_selected_bases = read_float("pct_selected_bases.txt") + Float mean_bait_coverage = read_float("mean_bait_coverage.txt") + Float mean_target_coverage = read_float("mean_target_coverage.txt") + } + + runtime { + memory: "16 GB" + docker: "us.gcr.io/tag-public/metadata_upload" + disks: "local-disk 16 HDD" + preemptible: 1 + } +} + +task SortBam { + input { + File bam_file + String sample_id + Int mem = 64 + Int disk_size = 200 + String? docker_override + } + + command { + samtools sort -n ~{bam_file} -o ~{sample_id}.raw.aligned.sortbyname.bam + } + + output { + File sorted_bam = "~{sample_id}.raw.aligned.sortbyname.bam" + } + + runtime { + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + memory: mem + " GB" + preemptible: 2 + } +} + +task ByProductMetrics { + input { + File trim_log + File highconf_bam + String sample_id + Int mem = 32 + Int disk_size = 100 + String? docker_override + } + + command { + python3 /CODECsuite/snakemake/script/cds_summarize.py --sample_id ~{sample_id} --trim_log ~{trim_log} \ + --highconf_bam ~{highconf_bam} > ~{sample_id}.byproduct.txt + } + + output { + File byproduct_metrics = "~{sample_id}.byproduct.txt" + } + + runtime { + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + memory: mem + " GB" + } +} + +task ReplaceRawReadGroup { + input { + File raw_bam + String sample_id + Int memory = 64 + Int disk_size = 200 + String? 
docker_override + } + + command { + java -jar /dependencies/picard.jar AddOrReplaceReadGroups \ + I=~{raw_bam} \ + O=~{sample_id}.raw.replacerg.bam \ + CREATE_INDEX=true \ + RGID=4 \ + RGLB=lib1 \ + RGPL=ILLUMINA \ + RGPU=unit1 \ + RGSM=~{sample_id} + } + + output { + File bam = "~{sample_id}.raw.replacerg.bam" + File bai = "~{sample_id}.raw.replacerg.bai" + } + + runtime { + memory: memory + " GB" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + } +} + +task CollectInsertSizeMetrics { + input { + File input_bam + String sample_id + Int memory = 32 + Int disk_size = 200 + String? docker_override + } + + command { + java -jar /dependencies/picard.jar CollectInsertSizeMetrics \ + I=~{input_bam} \ + O=~{sample_id}.raw.insert_size_metrics.txt \ + H=~{sample_id}.raw.insert_size_histogram.pdf \ + M=0.5 W=600 DEVIATIONS=100 + } + + output { + File insert_size_metrics = "~{sample_id}.raw.insert_size_metrics.txt" + File insert_size_histogram = "~{sample_id}.raw.insert_size_histogram.pdf" + } + + runtime { + memory: memory + " GB" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + } +} + +task GroupReadByUMI { + input { + File input_bam + String sample_id + Int memory = 64 + Int? extra_disk + Int disk_size = ceil(size(input_bam, "GB") * 15) + select_first([extra_disk, 0]) + String? docker_override + } + + command { + java -jar /dependencies/fgbio-2.0.2.jar --compression 1 --async-io \ + GroupReadsByUmi \ + -i ~{input_bam} \ + -o ~{sample_id}.GroupedByUmi.bam \ + -f ~{sample_id}.umiHistogram.txt \ + -m 0 \ + --strategy=paired + } + + output { + File groupbyumi_bam = "~{sample_id}.GroupedByUmi.bam" + File umi_histogram = "~{sample_id}.umiHistogram.txt" + } + + runtime { + memory: memory + " GB" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + } +} + + +task DuplexRecoveryMetrics { + input { + File groupbyumi_bam + String sample_id + File? duplex_eval_bed + Int memory = 32 + Int extra_disk = 0 + Int disk_size = ceil(size(groupbyumi_bam , "GB") * 3) + select_first([extra_disk, 0]) + String sorted_bam = basename(groupbyumi_bam, ".bam") + ".sorted.bam" + String? docker_override + } + + command { + + samtools sort ~{groupbyumi_bam} -o ~{sorted_bam} + + python3 /scripts/collect_duplex_metrics.py \ + --bam_file ~{sorted_bam} \ + --output_file ~{sample_id}.duplex_metrics.txt \ + --interval_list ~{duplex_eval_bed} \ + --per_target \ + --is_cds + + } + + output { + File duplex_recovery_metrics = "~{sample_id}.duplex_metrics.txt" + } + + runtime { + memory: memory + " GB" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + } +} + +task PlotDuplexRecoveryByTarget { + input { + File? duplex_recovery_metrics + String sample_id + } + + command <<< + python3 <>> + + + output { + File ds_duplex_plots = "~{sample_id}.ds_duplex_plots.pdf" + } + + runtime { + memory: "16 GB" + docker: "us.gcr.io/tag-public/python:test" + disks: "local-disk 16 HDD" + } +} + +task FgbioCollapseReadFamilies { + input { + File grouped_umi_bam + String sample_id + Int memory = 64 + Int? extra_disk + Int disk_size = ceil(size(grouped_umi_bam, "GB") * 10) + select_first([extra_disk, 0]) + String? 
docker_override + } + + command { + java -jar /dependencies/fgbio-2.0.2.jar --compression 1 CallMolecularConsensusReads \ + -i ~{grouped_umi_bam} \ + -o ~{sample_id}.mol_consensus.bam \ + -p ~{sample_id} \ + --threads 2 \ + --consensus-call-overlapping-bases false \ + -M 1 + } + + output { + File mol_consensus_bam = "~{sample_id}.mol_consensus.bam" + } + + runtime { + memory: memory + " GB" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + } +} + +task AlignMolecularConsensusReads { + input { + File mol_consensus_bam + String sample_id + File reference_fasta + File reference_fasta_index + File reference_dict + File reference_pac + File reference_amb + File reference_ann + File reference_bwt + File reference_sa + Int memory = 64 + Int disk_size = 200 + Int threads = 4 + Int cpu_cores = 1 + String? docker_override + } + String output_bam_name = "${sample_id}.mol_consensus.aligned_tmp.bam" + + command { + samtools fastq ~{mol_consensus_bam} \ + | bwa mem -K 100000000 -t ~{threads} -p -Y ~{reference_fasta} - | samtools view - -S -b -o ~{output_bam_name} + } + + output { + File aligned_bam = "~{sample_id}.mol_consensus.aligned_tmp.bam" + } + + runtime { + memory: memory + " GB" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + cpu: cpu_cores + preemptible: 3 + } +} + +task MergeAndSortMoleculeConsensusReads { + input { + File mapped_sam + File unmapped_bam + File reference_fasta + File reference_fasta_index + File reference_dict + String sample_id + Int memory = 64 + Int disk_size = 200 + String sort_memory + String? docker_override + } + + command { + java -jar /dependencies/fgbio-2.0.2.jar --compression 0 --async-io ZipperBams \ + -i ~{mapped_sam} \ + --unmapped ~{unmapped_bam} \ + --ref ~{reference_fasta} \ + --tags-to-reverse Consensus \ + --tags-to-revcomp Consensus \ + | samtools sort -m ~{sort_memory} - -o ~{sample_id}.mol_consensus.aligned.bam -O BAM -@ 4 && samtools index ~{sample_id}.mol_consensus.aligned.bam + } + + output { + File bam = "~{sample_id}.mol_consensus.aligned.bam" + File bai = "~{sample_id}.mol_consensus.aligned.bam.bai" + } + + runtime { + memory: memory+ " GB" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + preemptible: 3 + } +} + + +task CSS_SFC_ErrorMetrics { + input { + File ConsensusAlignedBam + File ConsensusAlignedBai + String sample_id + File reference_fasta + File reference_fasta_index + File reference_dict + File reference_pac + File reference_amb + File reference_ann + File reference_bwt + File reference_sa + File? germline_bam + File? germline_bam_index + File eval_genome_bed + File population_based_vcf = "gs://gptag/CODEC/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz" + File population_based_vcf_index = "gs://gptag/CODEC/alfa_all.freq.breakmulti.hg38.af0001.vcf.gz.tbi" + Int memory = 64 + Int disk_size = 200 + String? 
docker_override + } + + command { + /CODECsuite/build/codec call -b ~{ConsensusAlignedBam} \ + -L ~{eval_genome_bed} \ + -r ~{reference_fasta} \ + -m 60 \ + -q 30 \ + -d 12 \ + ~{if defined(germline_bam) then "-n " + germline_bam else ""} \ + -V ~{population_based_vcf} \ + -x 6 \ + -c 4 \ + -5 \ + -g 30 \ + -G 250 \ + -Q 0.7 \ + -B 0.6 \ + -N 0.05 \ + -Y 10 \ + -W 1 \ + -a ~{sample_id}.mutant_metrics.txt \ + -e ~{sample_id}.variants_called.txt \ + -C ~{sample_id}.context_count.txt + } + + output { + File mutant_metrics = "~{sample_id}.mutant_metrics.txt" + File variants_called = "~{sample_id}.variants_called.txt" + File context_count = "~{sample_id}.context_count.txt" + } + + runtime { + memory: memory + " GB" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec:v1.1.4"]) + disks: "local-disk " + disk_size + " HDD" + preemptible: 3 + } +} + + +task QC_metrics { + input { + File byproduct_metrics + File umiHistogram + File InsertSizeMetrics + File mutant_metrics + Int memory = 32 + Int disk_size = 16 + + } + command <<< + set -e + + # Use umi histogram to calculate duplication rate + cat ~{umiHistogram} | awk -F'\t' 'NR > 1 { + numerator += ($1 - 1) * $2; + denominator += $1 * $2; + } END { + if (denominator > 0) print numerator / denominator; + else print "NA"; + }' > duplication_rate.txt + + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_total") col=i} NR==2 {print $col}' ~{byproduct_metrics} > n_total_fastq.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_correct") col=i} NR==2 {print $col}' ~{byproduct_metrics} > n_correct.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="pct_correct") col=i} NR==2 {print $col}' ~{byproduct_metrics} > pct_correct.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_double_ligation") col=i} NR==2 {print $col}' ~{byproduct_metrics} > n_double_ligation.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="pct_double_ligation") col=i} NR==2 {print $col}' ~{byproduct_metrics} > pct_double_ligation.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_intermol") col=i} NR==2 {print $col}' ~{byproduct_metrics} > n_intermol.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="pct_intermol") col=i} NR==2 {print $col}' ~{byproduct_metrics} > pct_intermol.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_adp_dimer") col=i} NR==2 {print $col}' ~{byproduct_metrics} > n_adp_dimer.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="pct_adp_dimer") col=i} NR==2 {print $col}' ~{byproduct_metrics} > pct_adp_dimer.txt + + cat ~{InsertSizeMetrics} | grep -v "#" | awk 'NR==3 {print $1}' > median_insert_size.txt + cat ~{InsertSizeMetrics} | grep -v "#" | awk 'NR==3 {print $6}' > mean_insert_size.txt + + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_snv") col=i} NR==2 {print $col}' ~{mutant_metrics} > n_snv.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_indel") col=i} NR==2 {print $col}' ~{mutant_metrics} > n_indel.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="n_bases_eval") col=i} NR==2 {print $col}' ~{mutant_metrics} > n_bases_eval.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="snv_rate") col=i} NR==2 {print $col}' ~{mutant_metrics} > snv_rate.txt + awk 'NR==1 {for (i=1; i<=NF; i++) if ($i=="indel_rate") col=i} NR==2 {print $col}' ~{mutant_metrics} > indel_rate.txt + + + + >>> + output { + Float duplication_rate = read_float("duplication_rate.txt") + Int n_total_fastq = read_int("n_total_fastq.txt") + Int n_correct_products = read_int("n_correct.txt") + Float pct_correct_products = read_float("pct_correct.txt") + Int n_double_ligation = 
read_int("n_double_ligation.txt") + Float pct_double_ligation = read_float("pct_double_ligation.txt") + Int n_intermol = read_int("n_intermol.txt") + Float pct_intermol = read_float("pct_intermol.txt") + Int n_adp_dimer = read_int("n_adp_dimer.txt") + Float pct_adp_dimer = read_float("pct_adp_dimer.txt") + Float mean_insert_size = read_float("mean_insert_size.txt") + Int median_insert_size = read_int("median_insert_size.txt") + Int n_snv = read_int("n_snv.txt") + Int n_indel = read_int("n_indel.txt") + String n_bases_eval = read_string("n_bases_eval.txt") + Float snv_rate = read_float("snv_rate.txt") + Float indel_rate = read_float("indel_rate.txt") + } + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-public/metadata_upload" + disks: "local-disk " + disk_size + " HDD" + preemptible: 2 + } +} + + +task EvalGenomeBases { + input { + File eval_genome_interval + Int memory = 16 + Int disk_size = 8 + } + + command { + java -jar /dependencies/picard.jar IntervalListTools \ + I=~{eval_genome_interval} COUNT_OUTPUT=eval_genome_bases.txt OUTPUT_VALUE=BASES + + } + + output { + String eval_genome_bases = read_string("eval_genome_bases.txt") + } + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-public/metadata_upload" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + } +} + + +task CalculateDuplexDepth { + input { + String eval_genome_bases + String n_bases_eval + Float mean_insert_size + Int n_total_fastq + Int memory = 32 + Int disk_size = 64 + } + + command <<< + python3 <>> + + output { + Float duplex_depth = read_float('duplex_depth.txt') + Float duplex_efficiency = read_float('duplex_efficiency.txt') + } + runtime { + memory: memory + " GB" + docker: "us.gcr.io/tag-public/metadata_upload" + disks: "local-disk " + disk_size + " HDD" + preemptible: 1 + } +} + +task codec2MAF { + input { + File variants_called + String sample_id + String? docker_override + } + + command <<< + + # Only run this script when there are variants called. + + if [[ $(wc -l < ~{variants_called}) -gt 2 ]]; then + Rscript /scripts/codec2maf.R --inputmaf ~{variants_called} --outputmaf ~{sample_id}.maf + else + echo "Warning: File ~{variants_called} has no variants. Skipping this task and returning empty maf." 
+ touch ~{sample_id}.maf + fi + + >>> + + output { + File output_maf = "~{sample_id}.maf" + } + + runtime { + memory: "16 GB" + docker: select_first([docker_override, "us.gcr.io/tag-public/codec2maf_r_docker:v1"]) + disks: "local-disk 16 HDD" + preemptible: 1 + } +} \ No newline at end of file diff --git a/CODEC/codec_bcl2fastq.inputs.json b/CODEC/codec_bcl2fastq.inputs.json new file mode 100644 index 0000000..bcef386 --- /dev/null +++ b/CODEC/codec_bcl2fastq.inputs.json @@ -0,0 +1,12 @@ +{ + "codec_bcl2fastq.delete_input_bcl_directory": "${false}", + "codec_bcl2fastq.disk_space": "${3000}", + "codec_bcl2fastq.input_bcl_directory": "gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0/1705429346/", + "codec_bcl2fastq.memory": "${}", + "codec_bcl2fastq.output_directory": "gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0", + "codec_bcl2fastq.preemptible": "${4}", + "codec_bcl2fastq.read_threads": "${8}", + "codec_bcl2fastq.run_bcl2fastq.num_cpu": "${2}", + "codec_bcl2fastq.run_bcl2fastq.run_id": "${}", + "codec_bcl2fastq.write_threads": "${8}" +} \ No newline at end of file diff --git a/CODEC/codec_bcl2fastq.wdl b/CODEC/codec_bcl2fastq.wdl new file mode 100644 index 0000000..8e9058f --- /dev/null +++ b/CODEC/codec_bcl2fastq.wdl @@ -0,0 +1,118 @@ +version 1.0 + +workflow codec_bcl2fastq { + input { + String input_bcl_directory + String output_directory + Int read_threads + Int write_threads + String memory = "128G" + Int preemptible = 3 + Boolean delete_input_bcl_directory + } + call GetBclBucketSize { + input: + input_bcl_directory = input_bcl_directory, + memory = 16 + } + + call run_bcl2fastq { + input: + input_bcl_directory = sub(input_bcl_directory, "/+$", ""), + output_directory = sub(output_directory, "/+$", ""), + read_threads = read_threads, + write_threads = write_threads, + memory = memory, + input_bcl_size = GetBclBucketSize.input_bcl_size, + preemptible = preemptible, + delete_input_bcl_directory = delete_input_bcl_directory + } + + output { + String job_stat = run_bcl2fastq.job_stat + } +} + +task GetBclBucketSize { + input { + String input_bcl_directory + Int memory = 16 + } + + command <<< + # du -s prints total bytes; convert to GiB (du -sh would print a human-readable unit and break the awk arithmetic) + gsutil du -s ~{input_bcl_directory} | awk '{print $1/1024/1024/1024}' > input_bcl_size.txt + >>> + + output { + Float input_bcl_size = read_float("input_bcl_size.txt") + } + + runtime { + docker: "us.gcr.io/tag-public/bcl2fastq_codec:v1" + memory: memory + " GB" + disks: "local-disk 16 HDD" + } +} + +task run_bcl2fastq { + input { + String input_bcl_directory + String output_directory + String memory + Float input_bcl_size + Int disk_space = ceil(input_bcl_size * 1.5) + select_first([extra_disk, 0]) + Int? extra_disk + Int read_threads = 4 + Int write_threads = 4 + String run_id = basename(input_bcl_directory) + Boolean delete_input_bcl_directory + Int num_cpu = 2 + Int preemptible + } + + command { + set -e + export TMPDIR=/tmp + + gsutil -q -m cp -r ${input_bcl_directory} . + + cd ${run_id} + + bcl2fastq \ + --output-dir out \ + -w ~{write_threads} \ + -r ~{read_threads} + + cd out + + gsutil -q -m cp -r . ${output_directory}/${run_id}_fastqs/ + + if [ $? -eq 0 ]; then + echo "Fastq files generated and copied successfully to ${output_directory}/${run_id}_fastqs/." + else + echo "Error in copying files." + fi + + if [ "${delete_input_bcl_directory}" = "true" ]; then + gsutil -q -m rm -r "${input_bcl_directory}" + if [ $? -eq 0 ]; then + echo "Input bcl folder has been successfully deleted!"
+ else + echo "Unable to delete BCL directory" + fi + fi + } + + output { + String job_stat = read_string(stdout()) + } + + runtime { + docker: "us.gcr.io/tag-public/bcl2fastq_codec:v1" + memory: memory + bootDiskSizeGb: 12 + disks: "local-disk ${disk_space} HDD" + cpu: num_cpu + preemptible: preemptible +} +} \ No newline at end of file diff --git a/CODEC/demux_CODEC.inputs.json b/CODEC/demux_CODEC.inputs.json new file mode 100644 index 0000000..aa785e5 --- /dev/null +++ b/CODEC/demux_CODEC.inputs.json @@ -0,0 +1,21 @@ +{ + "demux_CODEC.Demux.disk_size": "${200}", + "demux_CODEC.Demux.memory": "${64}", + "demux_CODEC.GroupFastq1.disk_size": "${1600}", + "demux_CODEC.GroupFastq2.disk_size": "${1600}", + "demux_CODEC.SplitFastq1.disk_size": "${2048}", + "demux_CODEC.SplitFastq1.memory": "${128}", + "demux_CODEC.SplitFastq2.disk_size": "${2048}", + "demux_CODEC.SplitFastq2.memory": "${128}", + "demux_CODEC.UploadDataTable.namespace": "broadtagteam", + "demux_CODEC.batch_ids": "${[\"L001\",\"L002\"]}", + "demux_CODEC.concatenate_and_compress_fastq1.disk_size": "${100}", + "demux_CODEC.concatenate_and_compress_fastq1.memory": "${64}", + "demux_CODEC.concatenate_and_compress_fastq2.disk_size": "${100}", + "demux_CODEC.concatenate_and_compress_fastq2.memory": "${64}", + "demux_CODEC.fastq1": "${[\"gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0/1705429346_fastqs/Undetermined_S0_L001_R1_001.fastq.gz\",\"gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0/1705429346_fastqs/Undetermined_S0_L002_R1_001.fastq.gz\"]}", + "demux_CODEC.fastq2": "${[\"gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0/1705429346_fastqs/Undetermined_S0_L001_R2_001.fastq.gz\",\"gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0/1705429346_fastqs/Undetermined_S0_L002_R2_001.fastq.gz\"]}", + "demux_CODEC.num_parallel": "${40}", + "demux_CODEC.sample_sheet": "${[\"gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0/sample_sheet/sample_sheet_L001.csv\",\"gs://fc-685a4bff-2f8c-4c0c-ad21-bbfdeab7b5e0/sample_sheet/sample_sheet_L002.csv\"]}", + "demux_CODEC.workspace": "CODEC_example_workspace" +} \ No newline at end of file diff --git a/CODEC/demux_CODEC.wdl b/CODEC/demux_CODEC.wdl new file mode 100644 index 0000000..850db8b --- /dev/null +++ b/CODEC/demux_CODEC.wdl @@ -0,0 +1,277 @@ +version 1.0 + + +workflow demux_CODEC { + input { + Array[File] multisample_fastq1 + Array[File] multisample_fastq2 + Array[String] batch_ids + Int num_parallel + Array[File] sample_sheet + String workspace + } + scatter (batch_index in range(length(batch_ids))) { + call SplitFastq1 { + input: + batch_id =batch_ids[batch_index], + fastq_read1 = multisample_fastq1[batch_index], + nsplit = num_parallel + } + call SplitFastq2 { + input: + batch_id =batch_ids[batch_index], + fastq_read2 = multisample_fastq2[batch_index], + nsplit = num_parallel + } + scatter (split_index in range(num_parallel)) { + call Demux { + input: + paired_fastqs = (SplitFastq1.split_read1[split_index], SplitFastq2.split_read2[split_index]), + sample_sheet = sample_sheet[batch_index], + batch_id = batch_ids[batch_index], + split = split_index + } + } + } + Array[File] demultiplexed_fastq1_all_lanes = flatten(flatten(Demux.demux_read1)) + Array[File] demultiplexed_fastq2_all_lanes = flatten(flatten(Demux.demux_read2)) + + call GroupFastqFiles as GroupFastq1 { + input: + fastq_files = demultiplexed_fastq1_all_lanes + } + call GroupFastqFiles as GroupFastq2 { + input: + fastq_files = demultiplexed_fastq2_all_lanes + } + scatter (txt_file in GroupFastq1.grouped_fastq_lists) { + call concatenate_and_compress_fastq 
as concatenate_and_compress_fastq1 { + input: + grouped_fastq_list = txt_file, + output_file = basename(txt_file, ".txt") + ".1.gz" + } + } + + scatter (txt_file in GroupFastq2.grouped_fastq_lists) { + call concatenate_and_compress_fastq as concatenate_and_compress_fastq2 { + input: + grouped_fastq_list = txt_file, + output_file = basename(txt_file, ".txt") + ".2.gz" + } + } + + call UploadDataTable { + input: + fastq1_files = concatenate_and_compress_fastq1.output_fastq_gz, + fastq2_files = concatenate_and_compress_fastq2.output_fastq_gz, + workspace = workspace + + } + + output { + Array[File] merged_fastq1_per_sample = concatenate_and_compress_fastq1.output_fastq_gz + Array[File] merged_fastq2_per_sample = concatenate_and_compress_fastq2.output_fastq_gz + File data_table = UploadDataTable.data_table + } +} + + +task SplitFastq1 { + input { + File fastq_read1 + Int nsplit + String batch_id + Int memory = 128 + Int disk_size = 1024 + + } + + command <<< + set -e + + zcat ~{fastq_read1} | /CODECsuite/snakemake/script/fastqsplit.pl ~{batch_id}_split_r1 ~{nsplit} + + >>> + + output { + Array[File] split_read1 = glob("~{batch_id}_split_r1.*.fastq") + } + + runtime { + docker: "us.gcr.io/tag-public/codec:v1.1.1" + memory: memory + " GB" + disks: "local-disk " + disk_size + " HDD" + } +} + +task SplitFastq2 { + input { + File fastq_read2 + Int nsplit + String batch_id + Int memory = 128 + Int disk_size = 1024 + } + + command <<< + set -e + zcat ~{fastq_read2} | /CODECsuite/snakemake/script/fastqsplit.pl ~{batch_id}_split_r2 ~{nsplit} + + >>> + + output { + Array[File] split_read2 = glob("~{batch_id}_split_r2.*.fastq") + } + + runtime { + docker: "us.gcr.io/tag-public/codec:v1.1.1" + memory: memory + " GB" + disks: "local-disk " + disk_size + " HDD" + } +} + +task Demux { + input { + Pair[File, File] paired_fastqs + File sample_sheet + Int memory = 64 + Int disk_size = 100 + Int split + String batch_id + } + + command { + set -e + + /CODECsuite/build/codec demux -1 ${paired_fastqs.left} -2 ${paired_fastqs.right} -p ${sample_sheet} -o ${batch_id}_split.${split} > ${batch_id}_split.${split}.log + } + runtime { + docker: "us.gcr.io/tag-public/codec:v1.1.1" + memory: memory + " GB" + disks: "local-disk " + disk_size + " HDD" + } + output { + Array[File] demux_read1 = glob("~{batch_id}_split.~{split}.*.1.fastq.gz") + Array[File] demux_read2 = glob("~{batch_id}_split.~{split}.*.2.fastq.gz") + File log = "~{batch_id}_split.~{split}.log" + } +} +task GroupFastqFiles { + input { + Array[File] fastq_files + Int memory = 64 + Int disk_size = 100 + } + + command <<< + set -e + + for fastq_file in ~{sep=' ' fastq_files}; do + sample_name=$(basename "$fastq_file" | cut -d '.' 
-f 3) + echo "$fastq_file" >> "${sample_name}.txt" + done + + for txt_file in *.txt; do + sed -i 's|/cromwell_root/|gs://|g' "$txt_file" + done + >>> + + output { + Array[File] grouped_fastq_lists = glob("*.txt") + } + + runtime { + docker: "us.gcr.io/tag-public/bcl2fastq_codec:v1" + disks: "local-disk " + disk_size + " HDD" + memory: memory + " GB" + } +} + +task concatenate_and_compress_fastq { + input { + File grouped_fastq_list + String output_file + Int memory = 64 + Int disk_size = 64 + } + + command { + set -e + while IFS= read -r line; do + gsutil cp "$line" temp.fastq.gz + + # Check if the file was successfully copied + if [ -f temp.fastq.gz ]; then + gunzip -c temp.fastq.gz + else + echo "Failed to copy file from $line" + exit 1 + fi + done < ~{grouped_fastq_list} | gzip > ~{output_file} + } + output { + File output_fastq_gz = output_file + } + + runtime { + docker: "us.gcr.io/tag-public/bcl2fastq_codec:v1" + disks: "local-disk " + disk_size + " HDD" + memory: memory + " GB" + } +} + + +task UploadDataTable { + input { + Array[String] fastq1_files + Array[String] fastq2_files + String output_data_table = "sample.tsv" + String workspace + String namespace = "broadtagteam" + Int memory = 32 + Int disk_size = 32 + } + + command <<< + python3 <>> + + output { + File data_table = output_data_table + String upload_stats = read_string(stdout()) + } + + runtime { + docker: "us.gcr.io/tag-public/metadata_upload" + disks: "local-disk " + disk_size + " HDD" + memory: memory + " GB" + } +} \ No newline at end of file diff --git a/CODEC/sampleIndex.csv b/CODEC/sampleIndex.csv new file mode 100644 index 0000000..d0dc938 --- /dev/null +++ b/CODEC/sampleIndex.csv @@ -0,0 +1,13 @@ +index,IndexBarcode1,IndexBarcode2 +1,CTTGAACGGACTGTCCAC,CACCGAGCGTTAGACTAC +2,GAGCCTACTCAGTCAACG,GTGTCGAACACTTGACGG +3,AGCTTGTAAGGCAGGTTA,ACTGATCTTCAGCTGACT +4,TCAAGCGTCTTACATGGT,TGAATCTGAGGCACTGTA +5,CTGGTCCAAGAACGTCTG,CTCTGAACGATCGAGCTC +6,GATCCAGTTCTGTCGAGC,GAGGTGCATGCACCTTAG +7,ACCTATAGGTGCAACGAA,ACTAACTTCCATTGCACT +8,TGAAGGTCCACTGTATCT,TGACCTGGATGGATAGGA +9,CACTGCTTCGAGACGAAG,CTCCAGTTACTGAGACGG +10,GTGATACCTCGATGCTCC,GAGGTCCAGTCTCTGTCC +11,ACTCAGAGAACTCATGGA,ACTACAGGTGGATCCAAT +12,TGAGCTGAGTTCGTACTT,TGATGTACCAACGATGTA diff --git a/GDCRealignment/CheckContaminationSomatic.wdl b/GDCRealignment/CheckContaminationSomatic.wdl new file mode 100644 index 0000000..dd70628 --- /dev/null +++ b/GDCRealignment/CheckContaminationSomatic.wdl @@ -0,0 +1,84 @@ +version 1.0 + +task CalculateSomaticContamination { + input { + File? intervals + File reference + File reference_dict + File reference_index + File tumor_cram_or_bam + File tumor_crai_or_bai + File? normal_cram_or_bam + File? normal_crai_or_bai + File contamination_vcf + File contamination_vcf_index + + # runtime + String gatk_docker = "us.gcr.io/broad-gatk/gatk:4.2.4.1" + File? gatk_override + Int? additional_disk + Int memory_mb = 3000 + Int? preemptible_attempts + Int? 
max_retries + } + + Int disk_size = ceil(size(tumor_cram_or_bam,"GB") + size(normal_cram_or_bam,"GB")) + select_first([additional_disk, 10]) + + Int command_mem = memory_mb - 500 + + command <<< + set -e + + export GATK_LOCAL_JAR=~{default="/root/gatk.jar" gatk_override} + + gatk --java-options "-Xmx~{command_mem}m" GetPileupSummaries \ + -R ~{reference} \ + -I ~{tumor_cram_or_bam} \ + ~{"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ~{contamination_vcf} \ + -L ~{contamination_vcf} \ + -O pileups.table + + if [[ -f "~{normal_cram_or_bam}" ]]; + then + gatk --java-options "-Xmx~{command_mem}m" GetPileupSummaries \ + -R ~{reference} \ + -I ~{normal_cram_or_bam} \ + ~{"--interval-set-rule INTERSECTION -L " + intervals} \ + -V ~{contamination_vcf} \ + -L ~{contamination_vcf} \ + -O normal_pileups.table + + gatk --java-options "-Xmx~{command_mem}m" CalculateContamination \ + -I pileups.table \ + -O contamination.table \ + --tumor-segmentation segments.table \ + -matched normal_pileups.table + else + touch normal_pileups.table + gatk --java-options "-Xmx~{command_mem}m" CalculateContamination \ + -I pileups.table \ + -O contamination.table \ + --tumor-segmentation segments.table + fi + + grep -v ^sample contamination.table | awk '{print($2)}' > contam.txt + >>> + + runtime { + docker: gatk_docker + bootDiskSizeGb: 12 + memory: memory_mb + " MiB" + maxRetries: select_first([max_retries, 2]) + disks: "local-disk " + disk_size + " HDD" + preemptible: select_first([preemptible_attempts, 3]) + } + + output { + File pileups = "pileups.table" + File normal_pileups = "normal_pileups.table" + File contamination_table = "contamination.table" + File maf_segments = "segments.table" + File contamination = "contam.txt" + } +} diff --git a/GDCRealignment/CramToUnmappedBams.wdl b/GDCRealignment/CramToUnmappedBams.wdl new file mode 100644 index 0000000..8f54641 --- /dev/null +++ b/GDCRealignment/CramToUnmappedBams.wdl @@ -0,0 +1,332 @@ +version 1.0 + +# Exactly one of input_cram and input_bam should be supplied to this workflow. If an input_cram is supplied, a ref_fasta +# and ref_fasta_index must also be supplied. The ref_fasta and ref_fasta_index are used to generate a bam, so if an +# input_cram is not supplied, the input_bam is used instead and the ref_fasta and ref_fasta_index are not needed. + +# If the output_map file is provided, it is expected to be a tab-separated file containing a list of all the read group ids +# found in the input_cram / input_bam and the desired name of the unmapped bams generated for each. +# If the file is not provided, the output names of the unmapped bams will be the read_group_id +workflow CramToUnmappedBams { + + String pipeline_version = "1.1.3" + + input { + File? input_cram + File? input_bam + File? ref_fasta + File? ref_fasta_index + File? 
output_map + String base_file_name + String unmapped_bam_suffix = ".unmapped.bam" + Int additional_disk = 20 + } + + if (defined(input_cram)) { + Float cram_size = size(input_cram, "GiB") + String bam_from_cram_name = basename(input_cram_path, ".cram") + String input_cram_path = select_first([input_cram]) + + call CramToBam { + input: + ref_fasta = select_first([ref_fasta]), + ref_fasta_index = select_first([ref_fasta_index]), + cram_file = select_first([input_cram]), + output_basename = bam_from_cram_name, + disk_size = ceil(cram_size * 6) + additional_disk + } + } + + File input_file = select_first([CramToBam.output_bam, input_bam]) + Float input_size = size(input_file, "GiB") + + if (!defined(output_map)) { + call GenerateOutputMap { + input: + input_bam = input_file, + unmapped_bam_suffix = unmapped_bam_suffix, + disk_size = ceil(input_size) + additional_disk + } + } + + call SplitUpOutputMapFile { + input: + read_group_map_file = select_first([output_map, GenerateOutputMap.output_map]) + } + + scatter (rg_map_file in SplitUpOutputMapFile.rg_to_ubam_file) { + call SplitOutUbamByReadGroup { + input: + input_bam = input_file, + rg_to_ubam_file = rg_map_file, + disk_size = ceil(input_size * 2) + additional_disk + } + + String unmapped_bam_filename = basename(SplitOutUbamByReadGroup.output_bam) + + call RevertSam { + input: + input_bam = SplitOutUbamByReadGroup.output_bam, + output_bam_filename = unmapped_bam_filename, + disk_size = ceil(input_size * 3) + additional_disk + } + + call SortSam { + input: + input_bam = RevertSam.output_bam, + output_bam_filename = unmapped_bam_filename + } + + Float unmapped_bam_size = size(SortSam.output_bam, "GiB") + + call ValidateSamFile { + input: + input_bam = SortSam.output_bam, + report_filename = unmapped_bam_filename + ".validation_report", + disk_size = ceil(unmapped_bam_size) + additional_disk + } + } + + output { + Array[File] validation_report = ValidateSamFile.report + Array[File] unmapped_bams = SortSam.output_bam + } + meta { + allowNestedInputs: true + } +} + +task RevertSam { + input { + File input_bam + String output_bam_filename + Int disk_size + Int memory_in_MiB = 3000 + } + + Int java_mem = memory_in_MiB - 1000 + Int max_heap = memory_in_MiB - 500 + + command <<< + java -Xms~{java_mem}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar \ + RevertSam \ + --INPUT ~{input_bam} \ + --OUTPUT ~{output_bam_filename} \ + --VALIDATION_STRINGENCY LENIENT \ + --ATTRIBUTE_TO_CLEAR FT \ + --ATTRIBUTE_TO_CLEAR CO \ + --ATTRIBUTE_TO_CLEAR PA \ + --ATTRIBUTE_TO_CLEAR OA \ + --ATTRIBUTE_TO_CLEAR XA \ + --SORT_ORDER coordinate \ + --RESTORE_HARDCLIPS false + + >>> + + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + disks: "local-disk " + disk_size + " HDD" + memory: "~{memory_in_MiB} MiB" + preemptible: 3 + } + + output { + File output_bam = output_bam_filename + } +} + +# This task is slower than converting straight from cram to bam (note we stream through sam format in between cram and bam) +# This is currently necessary due to a difference in the way the NM tag is calculated in order to produce a valid bam. 
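+# The task below streams CRAM through SAM to BAM with samtools, then indexes the BAM and renames the index to the .bai the output block expects.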
+task CramToBam { + input { + File ref_fasta + File ref_fasta_index + File cram_file + String output_basename + Int disk_size + Int memory_in_MiB = 7000 + } + + command <<< + + set -e + set -o pipefail + + samtools view -h -T ~{ref_fasta} ~{cram_file} | + samtools view -b -o ~{output_basename}.bam - + samtools index -b ~{output_basename}.bam + mv ~{output_basename}.bam.bai ~{output_basename}.bai + + >>> + + runtime { + docker: "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616" + cpu: 3 + memory: "~{memory_in_MiB} MiB" + disks: "local-disk " + disk_size + " HDD" + preemptible: 3 + } + + output { + File output_bam = "~{output_basename}.bam" + File output_bam_index = "~{output_basename}.bai" + } +} + +task GenerateOutputMap { + input { + File input_bam + String unmapped_bam_suffix + Int disk_size + Int memory_in_MiB = 3000 + } + + command <<< + + set -e + + samtools view -H ~{input_bam} | grep '^@RG' | cut -f2 | sed s/ID:// > readgroups.txt + + echo -e "#READ_GROUP_ID\tOUTPUT" > output_map.tsv + + for rg in `cat readgroups.txt`; do + echo -e "$rg\t$rg~{unmapped_bam_suffix}" >> output_map.tsv + done + + >>> + + runtime { + docker: "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616" + disks: "local-disk " + disk_size + " HDD" + memory: "~{memory_in_MiB} MiB" + preemptible: 3 + } + + output { + File output_map = "output_map.tsv" + } +} + +task SplitUpOutputMapFile { + input { + File read_group_map_file + Int disk_size = 10 + Int memory_in_MiB = 3000 + } + + command <<< + mkdir rgtemp + cd rgtemp + + # splits list of mappings into single files. One line each. + grep -v '^#' ~{read_group_map_file} | split -l 1 - rg_to_ubam_ + >>> + + runtime { + docker: "gcr.io/gcp-runtimes/ubuntu_16_0_4@sha256:025124e2f1cf4d29149958f17270596bffe13fc6acca6252977c572dd5ba01bf" + disks: "local-disk " + disk_size + " HDD" + memory: "~{memory_in_MiB} MiB" + } + + output { + Array[File] rg_to_ubam_file = glob("rgtemp/rg_to_ubam_*") + } +} + +task SplitOutUbamByReadGroup { + + input { + File input_bam + File rg_to_ubam_file + Int disk_size + Int memory_in_MiB = 30000 + } + + Array[Array[String]] tmp = read_tsv(rg_to_ubam_file) + + command <<< + echo "Read Group ~{tmp[0][0]} from ~{input_bam} is being written to ~{tmp[0][1]}" + samtools view -b -h -r ~{tmp[0][0]} -o ~{tmp[0][1]} ~{input_bam} + >>> + + output { + File output_bam = tmp[0][1] + } + + runtime { + docker: "us.gcr.io/broad-gotc-prod/samtools:1.0.0-1.11-1624651616" + cpu: 2 + disks: "local-disk " + disk_size + " HDD" + memory: "~{memory_in_MiB} MiB" + preemptible: 3 + } +} + +task ValidateSamFile { + input { + File input_bam + String report_filename + Int disk_size + Int memory_in_MiB = 3000 + } + + Int java_mem = memory_in_MiB - 1000 + Int max_heap = memory_in_MiB - 500 + + command <<< + + java -Xms~{java_mem}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar \ + ValidateSamFile \ + --INPUT ~{input_bam} \ + --OUTPUT ~{report_filename} \ + --MODE VERBOSE \ + --IS_BISULFITE_SEQUENCED false + + >>> + + runtime { + docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10" + disks: "local-disk " + disk_size + " HDD" + memory: "~{memory_in_MiB} MiB" + preemptible: 3 + } + + output { + File report = "~{report_filename}" + } +} + +task SortSam { + input { + File input_bam + String output_bam_filename + Int memory_in_MiB = 7000 + Float sort_sam_disk_multiplier = 6 + } + # SortSam spills to disk a lot more because we only store 300000 records in RAM now, because it's faster for our data, so it needs + more disk space.
+task SortSam {
+  input {
+    File input_bam
+    String output_bam_filename
+    Int memory_in_MiB = 7000
+    Float sort_sam_disk_multiplier = 6
+  }
+
+  # SortSam spills to disk a lot more because we now store only 300000 records in RAM
+  # (which is faster for our data), so it needs more disk space. It also spills to disk
+  # in an uncompressed format, so we need to account for that with a larger multiplier.
+  Int disk_size = ceil(sort_sam_disk_multiplier * size(input_bam, "GiB")) + 20
+  Int java_mem = memory_in_MiB - 1000
+  Int max_heap = memory_in_MiB - 500
+
+  command <<<
+    java -Xms~{java_mem}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar \
+      SortSam \
+      --INPUT ~{input_bam} \
+      --OUTPUT ~{output_bam_filename} \
+      --SORT_ORDER queryname \
+      --MAX_RECORDS_IN_RAM 300000
+  >>>
+
+  runtime {
+    docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10"
+    disks: "local-disk " + disk_size + " HDD"
+    memory: "~{memory_in_MiB} MiB"
+    preemptible: 3
+  }
+
+  output {
+    File output_bam = output_bam_filename
+  }
+}
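
To make the disk multiplier concrete: for a hypothetical 10 GiB input BAM, disk_size = ceil(6 x 10) + 20 = 80 GiB. The sixfold factor is there to cover the uncompressed spill files SortSam writes while holding only 300000 records in RAM, and the flat 20 GiB leaves headroom for the sorted output and scratch space.
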
diff --git a/GDCRealignment/GDCWholeGenomeSomaticSingleSample/GDCWholeGenomeSomaticSingleSample.inputs.json b/GDCRealignment/GDCWholeGenomeSomaticSingleSample/GDCWholeGenomeSomaticSingleSample.inputs.json
new file mode 100644
index 0000000..28f61b4
--- /dev/null
+++ b/GDCRealignment/GDCWholeGenomeSomaticSingleSample/GDCWholeGenomeSomaticSingleSample.inputs.json
@@ -0,0 +1,6 @@
+{
+  "read_from_cache": true,
+  "write_to_cache": true,
+  "google_legacy_machine_selection": true,
+  "monitoring_script": "gs://broad-gotc-test-storage/cromwell_monitoring_script.sh"
+}
diff --git a/GDCRealignment/GDCWholeGenomeSomaticSingleSample/GDCWholeGenomeSomaticSingleSample.wdl b/GDCRealignment/GDCWholeGenomeSomaticSingleSample/GDCWholeGenomeSomaticSingleSample.wdl
new file mode 100644
index 0000000..4dbe651
--- /dev/null
+++ b/GDCRealignment/GDCWholeGenomeSomaticSingleSample/GDCWholeGenomeSomaticSingleSample.wdl
@@ -0,0 +1,802 @@
+version 1.0
+
+import "../CramToUnmappedBams.wdl" as ToUbams
+import "../CheckContaminationSomatic.wdl" as CheckContamination
+
+struct FastqPairRecord {
+  File forward_fastq
+  File reverse_fastq
+  String readgroup
+  String readgroup_id
+}
+
+struct FastqSingleRecord {
+  File fastq
+  String readgroup
+  String readgroup_id
+}
+
+task bam_readgroup_to_contents {
+  input {
+    File bam
+    Int preemptible = 10
+    Int max_retries = 0
+    Int cpu = 1
+    Int additional_memory_mb = 0
+    Int additional_disk_gb = 0
+  }
+  Int mem = ceil(size(bam, "MiB")) + 2000 + additional_memory_mb
+  Int disk_space = ceil(size(bam, "GiB")) + 10 + additional_disk_gb
+
+  parameter_meta {
+    bam: {localization_optional: true}
+  }
+
+  command <<<
+    set -euo pipefail
+    export GCS_OAUTH_TOKEN=$(gcloud auth application-default print-access-token)
+    # Emit one TSV row per @RG header line, with a fixed column order.
+    samtools view -H ~{bam} | \
+    awk 'BEGIN { \
+      OFS = FS = "\t"; \
+      header = "ID\tBC\tCN\tDS\tDT\tFO\tKS\tLB\tPG\tPI\tPL\tPM\tPU\tSM"; \
+      split(header, header_ary, "\t"); \
+      for (i=1; i in header_ary; i++) { \
+        header_pos[header_ary[i]] = i \
+      }; \
+      print header \
+    } \
+    /^@RG/ { \
+      for (i=2; i<=NF; i++) { \
+        split($i, rg, ":"); \
+        row_ary[header_pos[rg[1]]] = rg[2]; \
+      }; \
+      row = row_ary[1]; \
+      for (i=2; i in header_ary; i++) { \
+        row = row "\t"; \
+        if (i in row_ary) \
+          row = row row_ary[i]; \
+      }; \
+      delete row_ary; \
+      print row \
+    }'
+  >>>
+
+  output {
+    Array[Object] readgroups = read_objects(stdout())
+  }
+
+  runtime {
+    docker: "broadgdac/samtools:1.10"
+    memory: mem + " MiB"
+    disks: "local-disk " + disk_space + " HDD"
+    preemptible: preemptible
+    maxRetries: max_retries
+    cpu: cpu
+  }
+}
+
+task biobambam_bamtofastq {
+  input {
+    Int collate = 1
+    String exclude = "QCFAIL,SECONDARY,SUPPLEMENTARY"
+    File filename
+    Int gz = 1
+    String inputformat = "bam"
+    Int level = 5
+    String outputdir = "."
+    Int outputperreadgroup = 1
+    String outputperreadgroupsuffixF = "_1.fq.gz"
+    String outputperreadgroupsuffixF2 = "_2.fq.gz"
+    String outputperreadgroupsuffixO = "_o1.fq.gz"
+    String outputperreadgroupsuffixO2 = "_o2.fq.gz"
+    String outputperreadgroupsuffixS = "_s.fq.gz"
+    Int tryoq = 1
+    String T = "tempfq"
+    Int cpu = 1
+    Int preemptible = 2
+    Int max_retries = 0
+    Int additional_memory_mb = 0
+    Int additional_disk_gb = 0
+  }
+  Int mem = ceil(size(filename, "MiB")) + 2000 + additional_memory_mb
+  Int disk_space = ceil(size(filename, "GiB") * 2) + 10 + additional_disk_gb
+
+  command {
+    set -euo pipefail
+    /usr/local/bin/bamtofastq \
+      T=~{T} \
+      collate=~{collate} \
+      exclude=~{exclude} \
+      filename=~{filename} \
+      gz=~{gz} \
+      inputformat=~{inputformat} \
+      level=~{level} \
+      outputdir=~{outputdir} \
+      outputperreadgroup=~{outputperreadgroup} \
+      outputperreadgroupsuffixF=~{outputperreadgroupsuffixF} \
+      outputperreadgroupsuffixF2=~{outputperreadgroupsuffixF2} \
+      outputperreadgroupsuffixO=~{outputperreadgroupsuffixO} \
+      outputperreadgroupsuffixO2=~{outputperreadgroupsuffixO2} \
+      outputperreadgroupsuffixS=~{outputperreadgroupsuffixS} \
+      tryoq=~{tryoq}
+  }
+
+  output {
+    Array[File] output_fastq1 = glob("*~{outputperreadgroupsuffixF}")
+    Array[File] output_fastq2 = glob("*~{outputperreadgroupsuffixF2}")
+    Array[File] output_fastq_o1 = glob("*~{outputperreadgroupsuffixO}")
+    Array[File] output_fastq_o2 = glob("*~{outputperreadgroupsuffixO2}")
+    Array[File] output_fastq_s = glob("*~{outputperreadgroupsuffixS}")
+  }
+
+  runtime {
+    docker: "broadgdac/biobambam2:2.0.87-release-20180301132713"
+    memory: mem + " MiB"
+    disks: "local-disk " + disk_space + " HDD"
+    preemptible: preemptible
+    maxRetries: max_retries
+    cpu: cpu
+  }
+}
+
+task emit_pe_records {
+  input {
+    Array[File]+ fastq1_files
+    Array[File]+ fastq2_files
+    Array[Object]+ readgroups
+    String fastq1_suffix = "_1.fq.gz"
+    Int preemptible = 10
+    Int max_retries = 0
+    Int cpu = 1
+    Int additional_memory_mb = 0
+    Int additional_disk_gb = 0
+  }
+  Int mem = ceil(size(fastq1_files, "MiB") + size(fastq2_files, "MiB")) + 2000 + additional_memory_mb
+  Int disk_space = ceil(size(fastq1_files, "GiB") + size(fastq2_files, "GiB")) + 10 + additional_disk_gb
+
+  File readgroups_tsv = write_objects(readgroups)
+
+  parameter_meta {
+    fastq1_files: {localization_optional: true}
+    fastq2_files: {localization_optional: true}
+  }
+
+  command <<<
+    set -euo pipefail
+
+    # NOTE: the body of this Python heredoc did not survive in the source diff;
+    # only the surrounding command scaffolding is reproduced here.
+    python <<CODE
+CODE
+  >>>
+
+  output {
+    Array[FastqPairRecord] fastq_pair_records = read_objects(stdout())
+  }
+
+  runtime {
+    docker: "python:3.8-slim"
+    memory: mem + " MiB"
+    disks: "local-disk " + disk_space + " HDD"
+    preemptible: preemptible
+    maxRetries: max_retries
+    cpu: cpu
+  }
+}
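
Since the Python body above was lost, here is a rough, hypothetical sketch of what such a pairing script could look like (this is not the GDC pipeline's actual script): it reads the read-group TSV produced by bam_readgroup_to_contents, matches each _1/_2 FASTQ pair to its read group by filename prefix, and prints a forward_fastq/reverse_fastq/readgroup/readgroup_id TSV to stdout for read_objects() to consume.

    task emit_pe_records_sketch {
      input {
        Array[File]+ fastq1_files
        Array[File]+ fastq2_files
        File readgroups_tsv
        String fastq1_suffix = "_1.fq.gz"
      }

      command <<<
        set -euo pipefail
        python3 <<CODE
    import csv
    import os

    # Assumes FASTQ paths contain no ';' and that sorting pairs _1 with _2.
    fastq1 = sorted("~{sep=';' fastq1_files}".split(";"))
    fastq2 = sorted("~{sep=';' fastq2_files}".split(";"))

    # Index the read-group rows (as emitted by bam_readgroup_to_contents) by ID.
    with open("~{readgroups_tsv}") as fh:
        readgroups = {row["ID"]: row for row in csv.DictReader(fh, delimiter="\t")}

    print("forward_fastq\treverse_fastq\treadgroup\treadgroup_id")
    for f1, f2 in zip(fastq1, fastq2):
        # bamtofastq names per-read-group FASTQs "<RG_ID><suffix>".
        rg_id = os.path.basename(f1)[:-len("~{fastq1_suffix}")]
        rg = readgroups[rg_id]
        # Rebuild an @RG string (tabs escaped) suitable for bwa mem -R.
        tags = "\\t".join("%s:%s" % (k, v) for k, v in rg.items() if v)
        print("%s\t%s\t@RG\\t%s\t%s" % (f1, f2, tags, rg_id))
    CODE
      >>>

      output {
        Array[FastqPairRecord] fastq_pair_records = read_objects(stdout())
      }

      runtime {
        docker: "python:3.8-slim"
      }
    }
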
+task emit_se_records {
+  input {
+    Array[File] fastq_o1_files
+    Array[File] fastq_o2_files
+    Array[File] fastq_s_files
+    Array[Object]+ readgroups
+    String fastq_o1_suffix = "_o1.fq.gz"
+    String fastq_o2_suffix = "_o2.fq.gz"
+    String fastq_s_suffix = "_s.fq.gz"
+    Int preemptible = 10
+    Int max_retries = 0
+    Int cpu = 1
+    Int additional_memory_mb = 0
+    Int additional_disk_gb = 0
+  }
+  Int mem = ceil(size(fastq_o1_files, "MiB") + size(fastq_o2_files, "MiB") + size(fastq_s_files, "MiB")) + 2000 + additional_memory_mb
+  Int disk_space = ceil(size(fastq_o1_files, "GiB") + size(fastq_o2_files, "GiB") + size(fastq_s_files, "GiB")) + 10 + additional_disk_gb
+
+  File readgroups_tsv = write_objects(readgroups)
+
+  parameter_meta {
+    fastq_o1_files: {localization_optional: true}
+    fastq_o2_files: {localization_optional: true}
+    fastq_s_files: {localization_optional: true}
+  }
+
+  command <<<
+    set -euo pipefail
+
+    # NOTE: as in emit_pe_records, the body of this Python heredoc did not
+    # survive in the source diff.
+    python <<CODE
+CODE
+  >>>
+
+  output {
+    Array[FastqSingleRecord] fastq_single_records = read_objects(stdout())
+  }
+
+  runtime {
+    docker: "python:3.8-slim"
+    memory: mem + " MiB"
+    disks: "local-disk " + disk_space + " HDD"
+    preemptible: preemptible
+    maxRetries: max_retries
+    cpu: cpu
+  }
+}
+
+task bwa_pe {
+  input {
+    FastqPairRecord fastq_record
+    File ref_fasta
+    File ref_fai
+    File ref_dict
+    File ref_amb
+    File ref_ann
+    File ref_bwt
+    File ref_pac
+    File ref_sa
+    Int cpu = 16
+    Int preemptible = 1
+    Int max_retries = 0
+    Int additional_memory_mb = 0
+    Int additional_disk_gb = 0
+  }
+  File fastq1 = fastq_record.forward_fastq
+  File fastq2 = fastq_record.reverse_fastq
+  String readgroup = fastq_record.readgroup
+  String outbam = fastq_record.readgroup_id + ".bam"
+  Float ref_size = size([ref_fasta, ref_dict, ref_amb, ref_ann, ref_bwt, ref_pac, ref_sa, ref_fai], "GiB")
+  Int mem = ceil(size([fastq1, fastq2], "MiB")) + 10000 + additional_memory_mb
+  Int disk_space = ceil((size([fastq1, fastq2], "GiB") * 4) + ref_size) + 10 + additional_disk_gb
+
+  command {
+    set -euo pipefail
+    bwa mem \
+      -t ~{cpu} \
+      -T 0 \
+      -R "~{readgroup}" \
+      ~{ref_fasta} \
+      ~{fastq1} \
+      ~{fastq2} \
+    | samtools view \
+      -Shb \
+      -o ~{outbam} \
+      -
+  }
+
+  output {
+    File bam = "~{outbam}"
+  }
+
+  runtime {
+    docker: "broadgdac/bwa:0.7.15-r1142-dirty"
+    memory: mem + " MiB"
+    disks: "local-disk " + disk_space + " HDD"
+    preemptible: preemptible
+    maxRetries: max_retries
+    cpu: cpu
+  }
+}
+
+task bwa_se {
+  input {
+    FastqSingleRecord fastq_record
+    File ref_fasta
+    File ref_dict
+    File ref_amb
+    File ref_ann
+    File ref_bwt
+    File ref_pac
+    File ref_sa
+    File ref_fai
+    Int cpu = 16
+    Int preemptible = 1
+    Int max_retries = 0
+    Int additional_memory_mb = 0
+    Int additional_disk_gb = 0
+  }
+  File fastq = fastq_record.fastq
+  String readgroup = fastq_record.readgroup
+  String outbam = fastq_record.readgroup_id + ".bam"
+  Float ref_size = size([ref_fasta, ref_dict, ref_amb, ref_ann, ref_bwt, ref_pac, ref_sa, ref_fai], "GiB")
+  Int mem = ceil(size(fastq, "MiB")) + 10000 + additional_memory_mb
+  Int disk_space = ceil((size(fastq, "GiB") * 4) + ref_size) + 10 + additional_disk_gb
+
+  command {
+    set -euo pipefail
+    bwa mem \
+      -t ~{cpu} \
+      -T 0 \
+      -R "~{readgroup}" \
+      ~{ref_fasta} \
+      ~{fastq} \
+    | samtools view \
+      -Shb \
+      -o ~{outbam} \
+      -
+  }
+
+  output {
+    File bam = "~{outbam}"
+  }
+
+  runtime {
+    docker: "broadgdac/bwa:0.7.15-r1142-dirty"
+    memory: mem + " MiB"
+    disks: "local-disk " + disk_space + " HDD"
+    preemptible: preemptible
+    maxRetries: max_retries
+    cpu: cpu
+  }
+}
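
For orientation, the readgroup string interpolated into -R above is a single @RG header line with escaped tabs; a hypothetical example: @RG\tID:HLTFC.1\tLB:lib1\tPL:ILLUMINA\tPU:HLTFC.1.ACGT\tSM:sample1. bwa mem stamps every read aligned from that FASTQ with this read group, which is what lets MarkDuplicates later merge the per-read-group BAMs without losing provenance.
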
+task picard_markduplicates {
+  input {
+    Array[File]+ bams
+    String outbam
+
+    Int compression_level = 2
+    Int preemptible_tries = 1
+    Int max_retries = 0
+    String validation_stringency = "SILENT"
+    String assume_sort_order = "queryname"
+
+    # The program default for READ_NAME_REGEX is appropriate in nearly every case.
+    # Sometimes we wish to supply "null" in order to turn off optical duplicate detection.
+    # This can be desirable if you don't mind the estimated library size being wrong and
+    # optical duplicate detection is taking >7 days and failing.
+    String? read_name_regex
+    Int memory_multiplier = 1
+    Int additional_disk_gb = 20
+
+    Float? sorting_collection_size_ratio
+  }
+  Float total_input_size = size(bams, "GiB")
+  String metrics_filename = outbam + ".metrics"
+  # The merged bam will be smaller than the sum of the parts, but we need disk for both
+  # the unmerged inputs and the merged output. MarkDuplicates takes read-group bams as
+  # input and writes a slightly smaller aggregated bam, so the multiplier leaves some
+  # wiggle room on top of that.
+  Float md_disk_multiplier = 3
+  Int disk_size = ceil(md_disk_multiplier * total_input_size) + additional_disk_gb
+
+  Float memory_size = 7.5 * memory_multiplier
+  Int java_memory_size = (ceil(memory_size) - 2)
+
+  # This task assumes query-sorted input so that secondary and supplementary reads get
+  # marked correctly. This works because the output of BWA is query-grouped, and therefore
+  # so is the output of MergeBamAlignment. While query-grouped isn't actually query-sorted,
+  # it's good enough for MarkDuplicates with ASSUME_SORT_ORDER="queryname".
+  command {
+    java -Dsamjdk.compression_level=~{compression_level} -Xms~{java_memory_size}g -jar /usr/picard/picard.jar \
+      MarkDuplicates \
+      INPUT=~{sep=' INPUT=' bams} \
+      OUTPUT=~{outbam} \
+      METRICS_FILE=~{metrics_filename} \
+      VALIDATION_STRINGENCY=~{validation_stringency} \
+      ASSUME_SORT_ORDER=~{assume_sort_order} \
+      ~{"SORTING_COLLECTION_SIZE_RATIO=" + sorting_collection_size_ratio} \
+      ~{"READ_NAME_REGEX=" + read_name_regex}
+  }
+
+  runtime {
+    docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10"
+    preemptible: preemptible_tries
+    memory: "~{memory_size} GiB"
+    disks: "local-disk " + disk_size + " HDD"
+  }
+
+  output {
+    File bam = "~{outbam}"
+    File metrics = "~{metrics_filename}"
+  }
+}
+
+task sort_and_index_markdup_bam {
+  input {
+    File input_bam
+    String tmp_prefix = "tmp_srt"
+    Int cpu = 8
+    Int preemptible = 2
+    Int max_retries = 0
+    Int additional_memory_mb = 0
+    Int additional_disk_gb = 0
+  }
+  Int mem = ceil(size(input_bam, "MiB")) + 10000 + additional_memory_mb
+  Int disk_space = ceil(size(input_bam, "GiB") * 3.25) + 20 + additional_disk_gb
+  Int mem_per_thread = floor(mem / cpu * 0.85)
+  Int index_threads = cpu - 1
+  String output_bam = basename(input_bam)
+  String output_bai = basename(input_bam, ".bam") + ".bai"
+
+  command {
+    set -euo pipefail
+    samtools sort \
+      -@ ~{cpu} \
+      -o ~{output_bam} \
+      -T ~{tmp_prefix} \
+      -m ~{mem_per_thread}M \
+      ~{input_bam}
+    samtools index \
+      -b \
+      -@ ~{index_threads} \
+      ~{output_bam} \
+      ~{output_bai}
+  }
+
+  output {
+    File bam = "~{output_bam}"
+    File bai = "~{output_bai}"
+  }
+
+  runtime {
+    docker: "us.gcr.io/broad-gotc-prod/samtools:1.10"
+    memory: mem + " MiB"
+    disks: "local-disk " + disk_space + " HDD"
+    preemptible: preemptible
+    maxRetries: max_retries
+    cpu: cpu
+  }
+}
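
As a worked example of the sort memory split above: for a hypothetical 10 GiB duplicate-marked BAM, mem = 10240 + 10000 = 20240 MiB; with the default 8 CPUs, mem_per_thread = floor(20240 / 8 x 0.85) = 2150 MiB per sort thread. The 0.85 factor keeps roughly 15% of the container's memory free for samtools overhead outside the per-thread sort buffers.
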
+task gatk_baserecalibrator {
+  input {
+    File bam
+    File dbsnp_vcf
+    File dbsnp_vcf_index
+    File ref_dict
+    File ref_fasta
+    File ref_fai
+    Int cpu = 2
+    Int preemptible = 2
+    Int max_retries = 0
+    Int additional_memory_mb = 0
+    Int additional_disk_gb = 0
+  }
+  String output_grp = basename(bam, ".bam") + "_bqsr.grp"
+  Float ref_size = size([ref_fasta, ref_fai, ref_dict], "GiB")
+  Float dbsnp_size = size([dbsnp_vcf, dbsnp_vcf_index], "GiB")
+  Int mem = ceil(size(bam, "MiB")) + 6000 + additional_memory_mb
+  Int jvm_mem = mem - 1000
+  Int max_heap = mem - 500
+  Int disk_space = ceil(size(bam, "GiB") + ref_size + dbsnp_size) + 20 + additional_disk_gb
+
+  parameter_meta {
+    bam: {localization_optional: true}
+    dbsnp_vcf: {localization_optional: true}
+    dbsnp_vcf_index: {localization_optional: true}
+    ref_dict: {localization_optional: true}
+    ref_fasta: {localization_optional: true}
+    ref_fai: {localization_optional: true}
+  }
+
+  command {
+    gatk --java-options "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal -Xlog:gc=debug:file=gc_log.log -Xms~{jvm_mem}m -Xmx~{max_heap}m" \
+      BaseRecalibrator \
+      --input ~{bam} \
+      --known-sites ~{dbsnp_vcf} \
+      --reference ~{ref_fasta} \
+      --tmp-dir . \
+      --output ~{output_grp}
+  }
+
+  output {
+    File bqsr_recal_file = "~{output_grp}"
+  }
+
+  runtime {
+    docker: "us.gcr.io/broad-gatk/gatk:4.6.0.0"
+    memory: mem + " MiB"
+    disks: "local-disk " + disk_space + " HDD"
+    preemptible: preemptible
+    maxRetries: max_retries
+    cpu: cpu
+  }
+}
+
+task gatk_applybqsr {
+  input {
+    File input_bam
+    File bqsr_recal_file
+    Boolean emit_original_quals = true
+    Int cpu = 2
+    Int preemptible = 2
+    Int max_retries = 0
+    Int additional_memory_mb = 0
+    Int additional_disk_gb = 0
+  }
+  String output_bam = basename(input_bam)
+  String output_bai = basename(input_bam, ".bam") + ".bai"
+  Int mem = ceil(size(input_bam, "MiB")) + 4000 + additional_memory_mb
+  Int jvm_mem = mem - 1000
+  Int max_heap = mem - 500
+  Int disk_space = ceil((size(input_bam, "GiB") * 3)) + 20 + additional_disk_gb
+
+  parameter_meta {
+    input_bam: {localization_optional: true}
+  }
+
+  command {
+    gatk --java-options "-XX:GCTimeLimit=50 -XX:GCHeapFreeLimit=10 -XX:+PrintFlagsFinal -Xlog:gc=debug:file=gc_log.log -Xms~{jvm_mem}m -Xmx~{max_heap}m" \
+      ApplyBQSR \
+      --input ~{input_bam} \
+      --bqsr-recal-file ~{bqsr_recal_file} \
+      --emit-original-quals ~{emit_original_quals} \
+      --tmp-dir . \
+      --output ~{output_bam}
+  }
+
+  output {
+    File bam = "~{output_bam}"
+    File bai = "~{output_bai}"
+  }
+
+  runtime {
+    docker: "us.gcr.io/broad-gatk/gatk:4.6.0.0"
+    memory: mem + " MiB"
+    disks: "local-disk " + disk_space + " HDD"
+    preemptible: preemptible
+    maxRetries: max_retries
+    cpu: cpu
+  }
+}
+
+# Collect quality metrics from the aggregated bam.
+task collect_insert_size_metrics {
+  input {
+    File input_bam
+    String output_bam_prefix
+    Int additional_memory_mb = 0
+    Int additional_disk_gb = 0
+  }
+  Int mem = ceil(size(input_bam, "GiB")) + 7000 + additional_memory_mb
+  Int jvm_mem = mem - 1000
+  Int max_heap = mem - 500
+  Int disk_size = ceil(size(input_bam, "GiB")) + 20 + additional_disk_gb
+
+  command {
+    java -Xms~{jvm_mem}m -Xmx~{max_heap}m -jar /usr/picard/picard.jar \
+      CollectInsertSizeMetrics \
+      INPUT=~{input_bam} \
+      OUTPUT=~{output_bam_prefix}.insert_size_metrics \
+      HISTOGRAM_FILE=~{output_bam_prefix}.insert_size_histogram.pdf
+  }
+
+  runtime {
+    docker: "us.gcr.io/broad-gotc-prod/picard-cloud:2.26.10"
+    memory: mem + " MiB"
+    disks: "local-disk " + disk_size + " HDD"
+  }
+
+  output {
+    File insert_size_histogram_pdf = "~{output_bam_prefix}.insert_size_histogram.pdf"
+    File insert_size_metrics = "~{output_bam_prefix}.insert_size_metrics"
+  }
+}
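
The Java tasks above share one heap-sizing convention: jvm_mem = mem - 1000 sets the initial heap (-Xms) and max_heap = mem - 500 sets the ceiling (-Xmx), so the heap may grow to within 500 MiB of the container's memory. For a hypothetical 10 GiB input BAM, gatk_baserecalibrator gets mem = 10240 + 6000 = 16240 MiB, hence -Xms15240m -Xmx15740m inside a 16240 MiB container.
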
+workflow GDCWholeGenomeSomaticSingleSample {
+
+  String pipeline_version = "1.3.3"
+
+  input {
+    File? input_cram
+    File? input_bam
+    File? cram_ref_fasta
+    File? cram_ref_fasta_index
+    File? output_map
+    String? unmapped_bam_suffix
+    String base_file_name
+
+    File? ubam
+
+    File contamination_vcf
+    File contamination_vcf_index
+    File dbsnp_vcf
+    File dbsnp_vcf_index
+
+    File ref_fasta
+    File ref_fai
+    File ref_dict
+    File ref_amb
+    File ref_ann
+    File ref_bwt
+    File ref_pac
+    File ref_sa
+  }
+
+  String outbam = if (defined(ubam) || defined(input_bam))
+    then basename(select_first([ubam, input_bam]), ".bam") + ".aln.mrkdp.bam"
+    else basename(select_first([input_cram]), ".cram") + ".aln.mrkdp.bam"
+
+  if (!defined(ubam)) {
+    call ToUbams.CramToUnmappedBams {
+      input:
+        input_cram = input_cram,
+        input_bam = input_bam,
+        ref_fasta = select_first([cram_ref_fasta, ref_fasta]),
+        ref_fasta_index = select_first([cram_ref_fasta_index, ref_fai]),
+        output_map = output_map,
+        base_file_name = base_file_name,
+        unmapped_bam_suffix = unmapped_bam_suffix
+    }
+  }
+
+  Array[File] ubams = if defined(ubam) then [select_first([ubam])] else select_first([CramToUnmappedBams.unmapped_bams])
+
+  scatter (ubam in ubams) {
+    call bam_readgroup_to_contents {
+      input: bam = ubam
+    }
+
+    call biobambam_bamtofastq {
+      input: filename = ubam
+    }
+  }
+
+  Array[Object] readgroups = flatten(bam_readgroup_to_contents.readgroups)
+
+  Array[File] fastq1 = flatten(biobambam_bamtofastq.output_fastq1)
+  Array[File] fastq2 = flatten(biobambam_bamtofastq.output_fastq2)
+  Array[File] fastq_o1 = flatten(biobambam_bamtofastq.output_fastq_o1)
+  Array[File] fastq_o2 = flatten(biobambam_bamtofastq.output_fastq_o2)
+  Array[File] fastq_s = flatten(biobambam_bamtofastq.output_fastq_s)
+
+  Int pe_count = length(fastq1)
+  Int o1_count = length(fastq_o1)
+  Int o2_count = length(fastq_o2)
+  Int s_count = length(fastq_s)
+
+  if (pe_count > 0) {
+    call emit_pe_records {
+      input:
+        fastq1_files = fastq1,
+        fastq2_files = fastq2,
+        readgroups = readgroups
+    }
+    scatter (pe_record in emit_pe_records.fastq_pair_records) {
+      call bwa_pe {
+        input:
+          fastq_record = pe_record,
+          ref_fasta = ref_fasta,
+          ref_fai = ref_fai,
+          ref_dict = ref_dict,
+          ref_amb = ref_amb,
+          ref_ann = ref_ann,
+          ref_bwt = ref_bwt,
+          ref_pac = ref_pac,
+          ref_sa = ref_sa
+      }
+    }
+  }
+
+  if (o1_count + o2_count + s_count > 0) {
+    call emit_se_records {
+      input:
+        fastq_o1_files = fastq_o1,
+        fastq_o2_files = fastq_o2,
+        fastq_s_files = fastq_s,
+        readgroups = readgroups
+    }
+    scatter (se_record in emit_se_records.fastq_single_records) {
+      call bwa_se {
+        input:
+          fastq_record = se_record,
+          ref_fasta = ref_fasta,
+          ref_fai = ref_fai,
+          ref_dict = ref_dict,
+          ref_amb = ref_amb,
+          ref_ann = ref_ann,
+          ref_bwt = ref_bwt,
+          ref_pac = ref_pac,
+          ref_sa = ref_sa
+      }
+    }
+  }
+
+  Array[File] aligned_bams = flatten([select_first([bwa_pe.bam, []]), select_first([bwa_se.bam, []])])
+
+  call picard_markduplicates {
+    input:
+      bams = aligned_bams,
+      outbam = outbam
+  }
+
+  call sort_and_index_markdup_bam {
+    input: input_bam = picard_markduplicates.bam
+  }
+
+  call CheckContamination.CalculateSomaticContamination as check_contamination {
+    input:
+      reference = ref_fasta,
+      reference_dict = ref_dict,
+      reference_index = ref_fai,
+      tumor_cram_or_bam = sort_and_index_markdup_bam.bam,
+      tumor_crai_or_bai = sort_and_index_markdup_bam.bai,
+      contamination_vcf = contamination_vcf,
+      contamination_vcf_index = contamination_vcf_index
+  }
+
+  call gatk_baserecalibrator {
+    input:
+      bam = sort_and_index_markdup_bam.bam,
+      ref_fasta = ref_fasta,
+      ref_fai = ref_fai,
+      ref_dict = ref_dict,
+      dbsnp_vcf = dbsnp_vcf,
+      dbsnp_vcf_index = dbsnp_vcf_index
+  }
+  call gatk_applybqsr {
+    input:
+      input_bam = sort_and_index_markdup_bam.bam,
+      bqsr_recal_file = gatk_baserecalibrator.bqsr_recal_file
+  }
+
+  String output_bam_prefix = basename(gatk_applybqsr.bam, ".bam")
+
+  call collect_insert_size_metrics {
+    input:
+      input_bam = gatk_applybqsr.bam,
+      output_bam_prefix = output_bam_prefix
+  }
+
+  output {
+    Array[File]? validation_report = CramToUnmappedBams.validation_report
+    Array[File]? unmapped_bams = CramToUnmappedBams.unmapped_bams
+    File bam = gatk_applybqsr.bam
+    File bai = gatk_applybqsr.bai
+    File md_metrics = picard_markduplicates.metrics
+    File insert_size_metrics = collect_insert_size_metrics.insert_size_metrics
+    File insert_size_histogram_pdf = collect_insert_size_metrics.insert_size_histogram_pdf
+    File contamination = check_contamination.contamination
+  }
+
+  meta {
+    allowNestedInputs: true
+  }
+}
diff --git a/GDCRealignment/GDCWholeGenomeSomaticSingleSample/README.md b/GDCRealignment/GDCWholeGenomeSomaticSingleSample/README.md
new file mode 100644
index 0000000..0ddff73
--- /dev/null
+++ b/GDCRealignment/GDCWholeGenomeSomaticSingleSample/README.md
@@ -0,0 +1,12 @@
+## Announcement: GDC is Deprecated 9/12/2024
+
+The GDC Whole Genome Somatic Single Sample workflow is deprecated and no longer supported. However, documentation is still available. Read about the [GDC Whole Genome Somatic Single Sample Pipeline](https://broadinstitute.github.io/warp/documentation/Pipelines/Genomic_Data_Commons_Whole_Genome_Somatic/) on the [WARP documentation site](https://broadinstitute.github.io/warp/)!
+
+## Introduction to the GDC Whole Genome Somatic Single Sample Pipeline
+The GDC Whole Genome Somatic Single Sample (abbreviated GDC here) pipeline is the alignment and preprocessing workflow for genomic data designed for the National Cancer Institute's [Genomic Data Commons](https://gdc.cancer.gov/about-gdc).
+
+A high-level overview of the pipeline, along with tool parameters, is available on the [GDC Documentation site](https://docs.gdc.cancer.gov/Data/Bioinformatics_Pipelines/DNA_Seq_Variant_Calling_Pipeline/).
+
+Overall, the pipeline converts reads (CRAM or BAM) to FASTQ and (re)aligns them to the latest human reference genome (see the [GDC Reference Genome](#gdc-reference-genome) section below). Each read group is aligned separately. Read group alignments that belong to a single sample are then merged, and duplicate reads are flagged for downstream variant calling.
+
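
For reference, a minimal inputs file for the workflow above might look like the following (the gs:// paths are hypothetical placeholders; supply exactly one of input_cram, input_bam, or ubam, and note that GRCh38.d1.vd1 is the GDC reference build):

    {
      "GDCWholeGenomeSomaticSingleSample.input_cram": "gs://my-bucket/sample1.cram",
      "GDCWholeGenomeSomaticSingleSample.base_file_name": "sample1",
      "GDCWholeGenomeSomaticSingleSample.contamination_vcf": "gs://my-bucket/refs/contamination.vcf.gz",
      "GDCWholeGenomeSomaticSingleSample.contamination_vcf_index": "gs://my-bucket/refs/contamination.vcf.gz.tbi",
      "GDCWholeGenomeSomaticSingleSample.dbsnp_vcf": "gs://my-bucket/refs/dbsnp.vcf.gz",
      "GDCWholeGenomeSomaticSingleSample.dbsnp_vcf_index": "gs://my-bucket/refs/dbsnp.vcf.gz.tbi",
      "GDCWholeGenomeSomaticSingleSample.ref_fasta": "gs://my-bucket/refs/GRCh38.d1.vd1.fa",
      "GDCWholeGenomeSomaticSingleSample.ref_fai": "gs://my-bucket/refs/GRCh38.d1.vd1.fa.fai",
      "GDCWholeGenomeSomaticSingleSample.ref_dict": "gs://my-bucket/refs/GRCh38.d1.vd1.dict",
      "GDCWholeGenomeSomaticSingleSample.ref_amb": "gs://my-bucket/refs/GRCh38.d1.vd1.fa.amb",
      "GDCWholeGenomeSomaticSingleSample.ref_ann": "gs://my-bucket/refs/GRCh38.d1.vd1.fa.ann",
      "GDCWholeGenomeSomaticSingleSample.ref_bwt": "gs://my-bucket/refs/GRCh38.d1.vd1.fa.bwt",
      "GDCWholeGenomeSomaticSingleSample.ref_pac": "gs://my-bucket/refs/GRCh38.d1.vd1.fa.pac",
      "GDCWholeGenomeSomaticSingleSample.ref_sa": "gs://my-bucket/refs/GRCh38.d1.vd1.fa.sa"
    }

The BWA index files (.amb/.ann/.bwt/.pac/.sa) must be built from the same FASTA given as ref_fasta, since bwa mem loads them as a set.
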