From 0367531a88a4b8f4dfb606aef3f315cb0f1efa5a Mon Sep 17 00:00:00 2001 From: dmiller15 Date: Wed, 8 Jan 2025 17:24:48 -0500 Subject: [PATCH 1/8] :tada: add hardfilter annotation filter plotting --- scripts/gatk_plot_annotations.R | 53 +++++++++++++++++++ subworkflows/kfdrc-gatk-hardfiltering.cwl | 43 +++++++++++++++ tools/filtering_defaults.cwl | 6 +++ tools/gatk_plot_annotations.cwl | 31 +++++++++++ tools/gatk_variantstotable.cwl | 34 ++++++++++++ tools/tar.cwl | 29 ++++++++++ .../kfdrc-single-sample-genotyping-wf.cwl | 7 ++- 7 files changed, 201 insertions(+), 2 deletions(-) create mode 100644 scripts/gatk_plot_annotations.R create mode 100644 tools/gatk_plot_annotations.cwl create mode 100644 tools/gatk_variantstotable.cwl create mode 100644 tools/tar.cwl diff --git a/scripts/gatk_plot_annotations.R b/scripts/gatk_plot_annotations.R new file mode 100644 index 0000000..5f15b80 --- /dev/null +++ b/scripts/gatk_plot_annotations.R @@ -0,0 +1,53 @@ +#!/usr/bin/env Rscript + +# plotting.R script loads ggplot and gridExtra libraries and defines functions to plot variant annotations + +library(ggplot2) +library(gridExtra) +library(readr) + +# Function for making density plots of a single annotation +makeDensityPlot <- function(dataframe, xvar, split, xmin=min(dataframe[xvar], na.rm=TRUE), xmax=max(dataframe[xvar], na.rm=TRUE), alpha=0.5, log10=FALSE) { + if(missing(split)) { + plot = ggplot(data=dataframe, aes_string(x=xvar)) + xlim(xmin,xmax) + geom_density() + if (log10) { + plot = plot + scale_x_log10() + } + return(plot) + } + else { + return(ggplot(data=dataframe, aes_string(x=xvar, fill=split)) + xlim(xmin,xmax) + geom_density(alpha=alpha) ) + } +} + +args <- commandArgs(trailingOnly=TRUE) + +if (length(args) < 3) { + stop("Three arguments are required. First, an ouptut_basename. Second, the input type. 
Third, the input table.", call.=FALSE) +} + +output_basename <- args[1] +input_type <- args[2] +input_file <- args[3] +input_fields <- if (length(args) > 3) tail(args, -3) + +message(paste("Loading", input_file)) +sampleSNP <- read_delim(input_file, "\t", escape_double = FALSE, trim_ws = TRUE) + +message(paste("Picking annotations for", input_type)) +snp_annots <- c("QD", "QUAL", "SOR", "FS", "MQ", "MQRankSum", "ReadPosRankSum") +indel_annots <- c("QD", "QUAL", "FS", "ReadPosRankSum") +mode_annots <- if (input_type=="SNP") snp_annots else indel_annots +picked_annots <- if (is.null(input_fields)) mode_annots else input_fields + +for (annot in picked_annots) { + message(paste("Creating Density Plot for", annot)) + if (annot %in% c("FS", "QUAL")) { + plot <- makeDensityPlot(sampleSNP, annot, log10 = TRUE) + } + else { + plot <- makeDensityPlot(sampleSNP, annot) + } + message(paste("Saving plot to", paste(output_basename, input_type, annot, "plot.pdf", sep="."))) + ggsave(plot, filename=paste(output_basename, input_type, annot, "plot.pdf", sep=".")) +} diff --git a/subworkflows/kfdrc-gatk-hardfiltering.cwl b/subworkflows/kfdrc-gatk-hardfiltering.cwl index 79d5588..087cf0d 100644 --- a/subworkflows/kfdrc-gatk-hardfiltering.cwl +++ b/subworkflows/kfdrc-gatk-hardfiltering.cwl @@ -14,6 +14,8 @@ doc: |- inputs: input_vcf: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], doc: "Input VCF containing INDEL and SNP variants"} output_basename: {type: 'string', doc: "String value to use as the base for the filename of the output"} + snp_plot_annots: {type: 'string[]?', doc: "The name of a standard VCF field or an INFO field to include in the output table for SNPs"} + indel_plot_annots: {type: 'string[]?', doc: "The name of a standard VCF field or an INFO field to include in the output table for INDELs"} snp_hardfilters: {type: 'string', doc: "String value of hardfilters to set for SNPs in input_vcf" } indel_hardfilters: {type: 'string', doc: "String value of 
hardfilters to set for INDELs in input_vcf" } snp_filtration_extra_args: {type: 'string?', doc: "Any extra arguments for SNP VariantFiltration" } @@ -22,6 +24,7 @@ inputs: filtration_ram: { type: 'int?', doc: "GB of RAM to allocate to GATK VariantFiltration" } outputs: + annotation_plots: {type: 'File', outputSource: tar_plots/output} hardfiltered_vcf: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], outputSource: bcftools_concat_snps_indels/output} steps: @@ -39,6 +42,46 @@ steps: output_basename: output_basename selection: {valueFrom: "INDEL"} out: [output] + gatk_variantstotable_snps: + run: ../tools/gatk_variantstotable.cwl + in: + input_vcf: gatk_selectvariants_snps/output + fields: snp_plot_annots + output_filename: { valueFrom: "temp.snp.tsv"} + out: [output] + gatk_variantstotable_indels: + run: ../tools/gatk_variantstotable.cwl + in: + input_vcf: gatk_selectvariants_indels/output + fields: indel_plot_annots + output_filename: {valueFrom: "temp.snp.tsv"} + out: [output] + gatk_plot_annotations_snps: + run: ../tools/gatk_plot_annotations.cwl + in: + input_table: gatk_variantstotable_snps/output + input_type: {valueFrom: "SNP"} + output_basename: output_basename + annotation_fields: snp_plot_annots + out: [plots] + gatk_plot_annotations_indels: + run: ../tools/gatk_plot_annotations.cwl + in: + input_table: gatk_variantstotable_snps/output + input_type: {valueFrom: "INDEL"} + output_basename: output_basename + annotation_fields: indel_plot_annots + out: [plots] + tar_plots: + run: ../tools/tar.cwl + in: + output_filename: + source: output_basename + valueFrom: '$(self).annotation_plots.tar.gz' + input_files: + source: [gatk_plot_annotations_snps/plots, gatk_plot_annotations_indels/plots] + valueFrom: '$(self[0].concat(self[1]))' + out: [output] gatk_variantfiltration_snps: run: ../tools/gatk_variantfiltration.cwl in: diff --git a/tools/filtering_defaults.cwl b/tools/filtering_defaults.cwl index e9e507e..1ab7f5c 100644 --- 
a/tools/filtering_defaults.cwl +++ b/tools/filtering_defaults.cwl @@ -32,6 +32,8 @@ outputs: indel_ts_filter_level: float? snp_hardfilter: string? indel_hardfilter: string? + snp_plot_annots: string[]? + indel_plot_annots: string[]? expression: | ${ var OUTPUTS = { @@ -44,6 +46,8 @@ expression: | "indel_ts_filter_level": null, "snp_hardfilter": null, "indel_hardfilter": null, + "snp_plot_annots": null, + "indel_plot_annots": null, }; var IS_LOW_DATA = { "WGS": false, @@ -79,10 +83,12 @@ expression: | '-filter "MQ < 40.0" --filter-name "MQ40_SNP" ' + '-filter "MQRankSum < -12.5" --filter-name "MQRankSum-12.5_SNP" ' + '-filter "ReadPosRankSum < -8.0" --filter-name "ReadPosRankSum-8_SNP"', + "snp_plot_annots": ["QD", "QUAL", "SOR", "FS", "MQ", "MQRankSum", "ReadPosRankSum"], "indel_hardfilter": '-filter "QD < 2.0" --filter-name "QD2" ' + '-filter "QUAL < 30.0" --filter-name "QUAL30" ' + '-filter "FS > 200.0" --filter-name "FS200_INDEL" ' + '-filter "ReadPosRankSum < -20.0" --filter-name "ReadPosRankSum-20_INDEL"', + "indel_plot_annots": ["QD", "QUAL", "FS", "ReadPosRankSum"], }; var PICKED = IS_LOW_DATA[inputs.experiment_type] ? LOW_DATA_FILTERS : HIGH_DATA_FILTERS[inputs.experiment_type]; return Object.assign({}, OUTPUTS, PICKED); diff --git a/tools/gatk_plot_annotations.cwl b/tools/gatk_plot_annotations.cwl new file mode 100644 index 0000000..e7ca488 --- /dev/null +++ b/tools/gatk_plot_annotations.cwl @@ -0,0 +1,31 @@ +cwlVersion: v1.2 +class: CommandLineTool +id: gatk_plot_annotations +doc: | + Plot annotations relevant to GATK hardfiltering. Take a TSV table generated + by VariantsToTable and create density charts for relevant annotations. + + Return the density plots as individual PDF files. 
+requirements: + - class: InlineJavascriptRequirement + - class: ShellCommandRequirement + - class: ResourceRequirement + ramMin: $(inputs.ram * 1000) + coresMin: $(inputs.cpu) + - class: DockerRequirement + dockerPull: 'dmiller15/tidyverse:4.4.2-gatk-plotter' + - class: InitialWorkDirRequirement + listing: + - entryname: gatk_plot_annotations.R + entry: + $include: ../scripts/gatk_plot_annotations.R +baseCommand: [Rscript, gatk_plot_annotations.R] +inputs: + input_table: { type: 'File', inputBinding: { position: 8 }, doc: "TSV table with variants and annotations." } + input_type: { type: 'string', inputBinding: { position: 7 }, doc: "Type of input. SNP or INDEL." } + output_basename: { type: 'string', inputBinding: { position: 6 }, doc: "String to use as basename for outputs." } + annotation_fields: { type: 'string[]?', inputBinding: { position: 9 }, doc: "Annotation fields being examined." } + ram: { type: 'int?', default: 4, doc: "GB of RAM to allocate to the task." } + cpu: { type: 'int?', default: 2, doc: "Minimum reserved number of CPU cores for the task." } +outputs: + plots: { type: 'File[]', outputBinding: { glob: '*.pdf' } } diff --git a/tools/gatk_variantstotable.cwl b/tools/gatk_variantstotable.cwl new file mode 100644 index 0000000..13b3c0d --- /dev/null +++ b/tools/gatk_variantstotable.cwl @@ -0,0 +1,34 @@ +cwlVersion: v1.2 +class: CommandLineTool +id: gatk_variantstotable +doc: | + Extract fields from a VCF file to a tab-delimited table +requirements: + - class: InlineJavascriptRequirement + - class: ShellCommandRequirement + - class: ResourceRequirement + ramMin: $(inputs.ram * 1000) + coresMin: $(inputs.cpu) + - class: DockerRequirement + dockerPull: 'broadinstitute/gatk:4.6.1.0' +baseCommand: [gatk, VariantsToTable] +inputs: + input_vcf: { type: 'File', secondaryFiles: [{pattern: '.tbi', required: false}], inputBinding: { position: 2, prefix: "--variant" }, doc: "The input VCF file to convert to a table." 
} + input_intervals: { type: 'File?', inputBinding: { position: 2, prefix: "--intervals" }, doc: "One or more genomic intervals over which to operate" } + reference: { type: 'File?', secondaryFiles: [{pattern: '.fai', required: true}, {pattern: '^.dict', required: true}], inputBinding: { position: 2, prefix: "--reference" }, doc: "Reference sequence" } + fields: + type: + - 'null' + - type: array + items: string + inputBinding: + prefix: '--fields' + inputBinding: + position: 2 + doc: "The name of a standard VCF field or an INFO field to include in the output table" + output_filename: { type: 'string?', default: "out.interval_list", inputBinding: { position: 2, prefix: "--output" }, doc: "Name for output file." } + extra_args: { type: 'string?', inputBinding: { position: 2 }, doc: "Extra args for this task" } + ram: { type: 'int?', default: 4, doc: "GB of RAM to allocate to the task." } + cpu: { type: 'int?', default: 2, doc: "Minimum reserved number of CPU cores for the task." } +outputs: + output: { type: 'File', outputBinding: { glob: $(inputs.output_filename) } } diff --git a/tools/tar.cwl b/tools/tar.cwl new file mode 100644 index 0000000..55a3859 --- /dev/null +++ b/tools/tar.cwl @@ -0,0 +1,29 @@ +cwlVersion: v1.2 +class: CommandLineTool +id: tar +requirements: + - class: ShellCommandRequirement + - class: InlineJavascriptRequirement + - class: LoadListingRequirement + - class: ResourceRequirement + coresMin: $(inputs.cpu) + ramMin: $(inputs.ram * 1000) + - class: InitialWorkDirRequirement + listing: $(inputs.input_files) +baseCommand: [tar, czf] +inputs: + output_filename: { type: 'string', inputBinding: { position: 1 } } + input_files: + type: + type: array + items: File + inputBinding: + valueFrom: $(self.basename) + inputBinding: { position: 9 } + cpu: { type: 'int?', default: 1, doc: "Number of threads to use." } + ram: { type: 'int?', default: 4, doc: "GB of RAM to allocate to this task." 
} +outputs: + output: + type: File + outputBinding: + glob: $(inputs.output_filename) diff --git a/workflows/kfdrc-single-sample-genotyping-wf.cwl b/workflows/kfdrc-single-sample-genotyping-wf.cwl index a77dbd0..bffc10b 100644 --- a/workflows/kfdrc-single-sample-genotyping-wf.cwl +++ b/workflows/kfdrc-single-sample-genotyping-wf.cwl @@ -203,6 +203,7 @@ outputs: peddy_csv: {type: 'File[]', doc: 'csv details of peddy results', outputSource: peddy/output_csv} peddy_ped: {type: 'File[]', doc: 'ped format summary of peddy results', outputSource: peddy/output_peddy} hardfiltered_vcf: {type: 'File?', secondaryFiles: [{pattern: '.tbi', required: true}], outputSource: gatk_hardfiltering/hardfiltered_vcf} + annotation_plots: {type: 'File?', outputSource: gatk_hardfiltering/annotation_plots} vep_annotated_vcf: {type: 'File[]', outputSource: annotate_vcf/annotated_vcf} steps: @@ -214,7 +215,7 @@ steps: valueFrom: $(self.length) experiment_type: experiment_type out: [low_data, snp_tranches, indel_tranches, snp_annotations, indel_annotations, snp_ts_filter_level, indel_ts_filter_level, - snp_hardfilter, indel_hardfilter] + snp_hardfilter, indel_hardfilter, snp_plot_annots, indel_plot_annots] dynamicallycombineintervals: run: ../tools/script_dynamicallycombineintervals.cwl hints: @@ -304,11 +305,13 @@ steps: indel_hardfilters: source: [hardfilter_indel_filters, filtering_defaults/indel_hardfilter] valueFrom: "$(self[0] != null ? 
self[0] : self[1])" + snp_plot_annots: filtering_defaults/snp_plot_annots + indel_plot_annots: filtering_defaults/indel_plot_annots snp_filtration_extra_args: hardfilter_snp_filter_extra_args indel_filtration_extra_args: hardfilter_indel_filter_extra_args filtration_cpu: hardfilter_filtertration_cpu filtration_ram: hardfilter_filtertration_ram - out: [hardfiltered_vcf] + out: [hardfiltered_vcf, annotation_plots] peddy: run: ../tools/kfdrc_peddy_tool.cwl doc: 'QC family relationships and sex assignment' From 4d73e61f90aa015b2a337fdcc833b890e08d5827 Mon Sep 17 00:00:00 2001 From: dmiller15 Date: Fri, 10 Jan 2025 14:34:17 -0500 Subject: [PATCH 2/8] :wrench: plotting as a subworkflow --- .../gatk_plot_genotyping_annotations.cwl | 64 +++++++++++++++++++ subworkflows/kfdrc-gatk-hardfiltering.cwl | 48 +++----------- 2 files changed, 73 insertions(+), 39 deletions(-) create mode 100644 subworkflows/gatk_plot_genotyping_annotations.cwl diff --git a/subworkflows/gatk_plot_genotyping_annotations.cwl b/subworkflows/gatk_plot_genotyping_annotations.cwl new file mode 100644 index 0000000..7277de8 --- /dev/null +++ b/subworkflows/gatk_plot_genotyping_annotations.cwl @@ -0,0 +1,64 @@ +cwlVersion: v1.2 +class: Workflow +id: gatk_plot_genotyping_annotations +requirements: +- class: StepInputExpressionRequirement +- class: InlineJavascriptRequirement +doc: |- + This workflow creates density plots of the variant annotations that are used for hard filtering. For background on those annotations, see the + [documentation from Broad](https://gatk.broadinstitute.org/hc/en-us/articles/360035531112--How-to-Filter-variants-either-with-VQSR-or-by-hard-filtering#2). + + The input SNP and INDEL VCFs are each converted to a table using GATK VariantsToTable. Density plots are then created for the annotations in each table. + Finally the plots are bundled together into a single TAR file and returned. 
+ +inputs: + snps_vcf: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: false}], doc: "Input VCF containing SNP variants"} + indels_vcf: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: false}], doc: "Input VCF containing INDEL variants"} + output_basename: {type: 'string', doc: "String value to use as the base for the filename of the output"} + snp_plot_annots: {type: 'string[]?', doc: "The name of a standard VCF field or an INFO field to include in the output table for SNPs"} + indel_plot_annots: {type: 'string[]?', doc: "The name of a standard VCF field or an INFO field to include in the output table for INDELs"} + +outputs: + annotation_plots: {type: 'File', outputSource: tar_plots/output} + +steps: + gatk_variantstotable_snps: + run: ../tools/gatk_variantstotable.cwl + in: + input_vcf: snps_vcf + fields: snp_plot_annots + output_filename: { valueFrom: "temp.snp.tsv"} + out: [output] + gatk_variantstotable_indels: + run: ../tools/gatk_variantstotable.cwl + in: + input_vcf: indels_vcf + fields: indel_plot_annots + output_filename: {valueFrom: "temp.snp.tsv"} + out: [output] + gatk_plot_annotations_snps: + run: ../tools/gatk_plot_annotations.cwl + in: + input_table: gatk_variantstotable_snps/output + input_type: {valueFrom: "SNP"} + output_basename: output_basename + annotation_fields: snp_plot_annots + out: [plots] + gatk_plot_annotations_indels: + run: ../tools/gatk_plot_annotations.cwl + in: + input_table: gatk_variantstotable_snps/output + input_type: {valueFrom: "INDEL"} + output_basename: output_basename + annotation_fields: indel_plot_annots + out: [plots] + tar_plots: + run: ../tools/tar.cwl + in: + output_filename: + source: output_basename + valueFrom: '$(self).annotation_plots.tar.gz' + input_files: + source: [gatk_plot_annotations_snps/plots, gatk_plot_annotations_indels/plots] + valueFrom: '$(self[0].concat(self[1]))' + out: [output] diff --git a/subworkflows/kfdrc-gatk-hardfiltering.cwl 
b/subworkflows/kfdrc-gatk-hardfiltering.cwl index 087cf0d..94d8fd0 100644 --- a/subworkflows/kfdrc-gatk-hardfiltering.cwl +++ b/subworkflows/kfdrc-gatk-hardfiltering.cwl @@ -4,6 +4,7 @@ id: kfdrc-gatk-hardfiltering requirements: - class: StepInputExpressionRequirement - class: InlineJavascriptRequirement +- class: SubworkflowFeatureRequirement doc: |- This workflow performs manual site-level variant filtration on an input VCF using the generic hard-filtering thresholds and example commands in the [documentation from Broad](https://gatk.broadinstitute.org/hc/en-us/articles/360035531112--How-to-Filter-variants-either-with-VQSR-or-by-hard-filtering#2). @@ -24,7 +25,7 @@ inputs: filtration_ram: { type: 'int?', doc: "GB of RAM to allocate to GATK VariantFiltration" } outputs: - annotation_plots: {type: 'File', outputSource: tar_plots/output} + annotation_plots: {type: 'File', outputSource: gatk_plot_genotyping_annotations/annotation_plots} hardfiltered_vcf: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], outputSource: bcftools_concat_snps_indels/output} steps: @@ -42,46 +43,15 @@ steps: output_basename: output_basename selection: {valueFrom: "INDEL"} out: [output] - gatk_variantstotable_snps: - run: ../tools/gatk_variantstotable.cwl + gatk_plot_genotyping_annotations: + run: ../subworkflows/gatk_plot_genotyping_annotations.cwl in: - input_vcf: gatk_selectvariants_snps/output - fields: snp_plot_annots - output_filename: { valueFrom: "temp.snp.tsv"} - out: [output] - gatk_variantstotable_indels: - run: ../tools/gatk_variantstotable.cwl - in: - input_vcf: gatk_selectvariants_indels/output - fields: indel_plot_annots - output_filename: {valueFrom: "temp.snp.tsv"} - out: [output] - gatk_plot_annotations_snps: - run: ../tools/gatk_plot_annotations.cwl - in: - input_table: gatk_variantstotable_snps/output - input_type: {valueFrom: "SNP"} + snps_vcf: gatk_selectvariants_snps/output + indels_vcf: gatk_selectvariants_indels/output output_basename: 
output_basename - annotation_fields: snp_plot_annots - out: [plots] - gatk_plot_annotations_indels: - run: ../tools/gatk_plot_annotations.cwl - in: - input_table: gatk_variantstotable_snps/output - input_type: {valueFrom: "INDEL"} - output_basename: output_basename - annotation_fields: indel_plot_annots - out: [plots] - tar_plots: - run: ../tools/tar.cwl - in: - output_filename: - source: output_basename - valueFrom: '$(self).annotation_plots.tar.gz' - input_files: - source: [gatk_plot_annotations_snps/plots, gatk_plot_annotations_indels/plots] - valueFrom: '$(self[0].concat(self[1]))' - out: [output] + snp_plot_annots: snp_plot_annots + indel_plot_annots: indel_plot_annots + out: [annotation_plots] gatk_variantfiltration_snps: run: ../tools/gatk_variantfiltration.cwl in: From bf8acf10db6695b92e7ba4814c36548c9401166c Mon Sep 17 00:00:00 2001 From: dmiller15 Date: Fri, 10 Jan 2025 15:16:16 -0500 Subject: [PATCH 3/8] :wrench: open ports for hardfilter outs --- docs/GERMLINE_SNV_README.md | 2 ++ docs/GERMLINE_VARIANT_README.md | 2 ++ workflows/kfdrc-germline-snv-wf.cwl | 4 +++- workflows/kfdrc-germline-variant-wf.cwl | 5 ++++- 4 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/GERMLINE_SNV_README.md b/docs/GERMLINE_SNV_README.md index 89277a1..cec91d3 100644 --- a/docs/GERMLINE_SNV_README.md +++ b/docs/GERMLINE_SNV_README.md @@ -121,6 +121,8 @@ For more information on the specific annotations, please see the documentation. 
- `peddy_html`: HTML metrics files from Peddy - `peddy_csv`: CSV metrics for het_check, ped_check, and sex_check from Peddy - `peddy_ped`: PED file with additional metrics information from Peddy + - `gatk_hardfiltered_vcf`: VCF containing the all genotyping variants with their hardfilters + - `gatk_annotation_plots`: TAR file containing plots of all annotations used for hardfiltering - Strelka2 - `strelka2_prepass_variants`: Raw variants output from Strelka2 - `strelka2_gvcfs`: gVCF output from Strelka2 diff --git a/docs/GERMLINE_VARIANT_README.md b/docs/GERMLINE_VARIANT_README.md index 3a94e9e..b978d42 100644 --- a/docs/GERMLINE_VARIANT_README.md +++ b/docs/GERMLINE_VARIANT_README.md @@ -146,6 +146,8 @@ user must provide the associated gVCF in the `input_gvcf` input. - `peddy_csv`: CSV metrics for het_check, ped_check, and sex_check from Peddy - `peddy_ped`: PED file with additional metrics information from Peddy - `verifybamid_output`: VerifyBAMID output, including contamination score + - `gatk_hardfiltered_vcf`: VCF containing the all genotyping variants with their hardfilters + - `gatk_annotation_plots`: TAR file containing plots of all annotations used for hardfiltering - Strelka2 - `strelka2_prepass_variants`: Raw variants output from Strelka2 - `strelka2_gvcfs`: gVCF output from Strelka2 diff --git a/workflows/kfdrc-germline-snv-wf.cwl b/workflows/kfdrc-germline-snv-wf.cwl index 45cc6a0..0e7f2a7 100644 --- a/workflows/kfdrc-germline-snv-wf.cwl +++ b/workflows/kfdrc-germline-snv-wf.cwl @@ -279,6 +279,8 @@ outputs: peddy_html: {type: 'File[]', doc: 'html summary of peddy results', outputSource: single_sample_genotyping/peddy_html} peddy_csv: {type: 'File[]', doc: 'csv details of peddy results', outputSource: single_sample_genotyping/peddy_csv} peddy_ped: {type: 'File[]', doc: 'ped format summary of peddy results', outputSource: single_sample_genotyping/peddy_ped} + gatk_hardfiltered_vcf: {type: 'File?', doc: "VCF containing the all genotyping variants with 
their hardfilters", outputSource: single_sample_genotyping/hardfiltered_vcf} + gatk_annotation_plots: {type: 'File?', doc: "TAR file containing plots of all annotations used for hardfiltering", outputSource: single_sample_genotyping/annotation_plots} freebayes_unfiltered_vcf: {type: 'File', outputSource: freebayes/unfiltered_vcf} strelka2_prepass_variants: {type: 'File', outputSource: strelka2/prepass_variants_vcf} strelka2_gvcfs: {type: 'File[]', outputSource: strelka2/genome_vcfs} @@ -479,7 +481,7 @@ steps: cadd_indels: cadd_indels cadd_snvs: cadd_snvs intervar: intervar - out: [collectvariantcallingmetrics, peddy_html, peddy_csv, peddy_ped, vep_annotated_vcf] + out: [collectvariantcallingmetrics, peddy_html, peddy_csv, peddy_ped, vep_annotated_vcf, annotation_plots, hardfiltered_vcf] hints: - class: "sbg:maxNumberOfParallelInstances" value: 3 diff --git a/workflows/kfdrc-germline-variant-wf.cwl b/workflows/kfdrc-germline-variant-wf.cwl index e0ee171..9e27465 100644 --- a/workflows/kfdrc-germline-variant-wf.cwl +++ b/workflows/kfdrc-germline-variant-wf.cwl @@ -456,6 +456,8 @@ outputs: peddy_html: {type: 'File[]?', doc: 'html summary of peddy results', outputSource: snv/peddy_html} peddy_csv: {type: 'File[]?', doc: 'csv details of peddy results', outputSource: snv/peddy_csv} peddy_ped: {type: 'File[]?', doc: 'ped format summary of peddy results', outputSource: snv/peddy_ped} + gatk_hardfiltered_vcf: {type: 'File?', doc: "VCF containing the all genotyping variants with their hardfilters", outputSource: snv/gatk_hardfiltered_vcf} + gatk_annotation_plots: {type: 'File?', doc: "TAR file containing plots of all annotations used for hardfiltering", outputSource: snv/gatk_annotation_plots} vep_annotated_gatk_vcf: {type: 'File[]?', outputSource: snv/vep_annotated_gatk_vcf} vep_annotated_strelka_vcf: {type: 'File[]?', outputSource: snv/vep_annotated_strelka_vcf} vep_annotated_freebayes_vcf: {type: 'File[]?', outputSource: snv/vep_annotated_freebayes_vcf} @@ -609,7 +611,8 
@@ steps: run_freebayes: run_freebayes run_strelka: run_strelka out: [gatk_gvcf, gatk_gvcf_metrics, verifybamid_output, gatk_vcf_metrics, peddy_html, peddy_csv, peddy_ped, vep_annotated_gatk_vcf, - freebayes_unfiltered_vcf, vep_annotated_freebayes_vcf, vep_annotated_strelka_vcf, strelka2_prepass_variants, strelka2_gvcfs] + freebayes_unfiltered_vcf, vep_annotated_freebayes_vcf, vep_annotated_strelka_vcf, strelka2_prepass_variants, strelka2_gvcfs, + gatk_hardfiltered_vcf, gatk_annotation_plots] sv: run: ../workflows/kfdrc-germline-sv-wf.cwl in: From 2bdab65395042cef4278571a44de4d8b55061047 Mon Sep 17 00:00:00 2001 From: dmiller15 Date: Mon, 13 Jan 2025 09:53:15 -0500 Subject: [PATCH 4/8] :wrench: more descriptive name --- subworkflows/gatk_plot_genotyping_annotations.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/subworkflows/gatk_plot_genotyping_annotations.cwl b/subworkflows/gatk_plot_genotyping_annotations.cwl index 7277de8..6cc5035 100644 --- a/subworkflows/gatk_plot_genotyping_annotations.cwl +++ b/subworkflows/gatk_plot_genotyping_annotations.cwl @@ -57,7 +57,7 @@ steps: in: output_filename: source: output_basename - valueFrom: '$(self).annotation_plots.tar.gz' + valueFrom: '$(self).genotyping_annotation_plots.tar.gz' input_files: source: [gatk_plot_annotations_snps/plots, gatk_plot_annotations_indels/plots] valueFrom: '$(self[0].concat(self[1]))' From a3d301a3295b1ef1450e3748024871d69928dc5f Mon Sep 17 00:00:00 2001 From: dmiller15 Date: Mon, 13 Jan 2025 11:02:00 -0500 Subject: [PATCH 5/8] :whale: update plotter docker --- tools/gatk_plot_annotations.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/gatk_plot_annotations.cwl b/tools/gatk_plot_annotations.cwl index e7ca488..ce11a4a 100644 --- a/tools/gatk_plot_annotations.cwl +++ b/tools/gatk_plot_annotations.cwl @@ -13,7 +13,7 @@ requirements: ramMin: $(inputs.ram * 1000) coresMin: $(inputs.cpu) - class: DockerRequirement - dockerPull: 
'dmiller15/tidyverse:4.4.2-gatk-plotter' + dockerPull: 'pgc-images.sbgenomics.com/d3b-bixu/tidyverse:4.4.2-gatk-plotter' - class: InitialWorkDirRequirement listing: - entryname: gatk_plot_annotations.R From 96aaa4a8a8f25f868cb32c498533896e5f8ce88a Mon Sep 17 00:00:00 2001 From: "github-actions[bot]" <41898282+github-actions[bot]@users.noreply.github.com> Date: Mon, 13 Jan 2025 15:57:48 -0500 Subject: [PATCH 6/8] update docker table (#60) Co-authored-by: dmiller15 --- docs/dockers_gatk_genotyping.md | 3 +++ docs/dockers_germline_variant.md | 3 +++ 2 files changed, 6 insertions(+) diff --git a/docs/dockers_gatk_genotyping.md b/docs/dockers_gatk_genotyping.md index 8f5db6e..84b9712 100644 --- a/docs/dockers_gatk_genotyping.md +++ b/docs/dockers_gatk_genotyping.md @@ -15,13 +15,16 @@ gatk_gathervcfs.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0 gatk_genomicsdbimport_genotypegvcfs.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0 gatk_indelsvariantrecalibrator.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0 gatk_makesitesonlyvcf.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0 +gatk_plot_annotations.cwl|pgc-images.sbgenomics.com/d3b-bixu/tidyverse:4.4.2-gatk-plotter gatk_selectvariants.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.2.0.0R gatk_snpsvariantrecalibratorcreatemodel.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0 gatk_snpsvariantrecalibratorscattered.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0 gatk_variantfiltration.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.2.0.0R +gatk_variantstotable.cwl|broadinstitute/gatk:4.6.1.0 generic_rename_outputs.cwl|None kfdrc_peddy_tool.cwl|pgc-images.sbgenomics.com/d3b-bixu/peddy:latest normalize_vcf.cwl|pgc-images.sbgenomics.com/d3b-bixu/vcfutils:latest picard_collectvariantcallingmetrics.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0 script_dynamicallycombineintervals.cwl|pgc-images.sbgenomics.com/d3b-bixu/python:2.7.13 +tar.cwl|None 
variant_effect_predictor_105.cwl|ensemblorg/ensembl-vep:release_105.0 diff --git a/docs/dockers_germline_variant.md b/docs/dockers_germline_variant.md index 1d1a02c..43caca2 100644 --- a/docs/dockers_germline_variant.md +++ b/docs/dockers_germline_variant.md @@ -41,12 +41,14 @@ gatk_intervallisttobed.cwl|broadinstitute/gatk:4.4.0.0 gatk_intervallisttools.cwl|broadinstitute/gatk:4.4.0.0 gatk_makesitesonlyvcf.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0 gatk_mergevcfs.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.1.1.0 +gatk_plot_annotations.cwl|pgc-images.sbgenomics.com/d3b-bixu/tidyverse:4.4.2-gatk-plotter gatk_postprocessgermlinecnvcalls.cwl|broadinstitute/gatk:4.2.0.0 gatk_preprocessintervals.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.2.0.0R gatk_selectvariants.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.2.0.0R gatk_snpsvariantrecalibratorcreatemodel.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0 gatk_snpsvariantrecalibratorscattered.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0 gatk_variantfiltration.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.2.0.0R +gatk_variantstotable.cwl|broadinstitute/gatk:4.6.1.0 generic_rename_outputs.cwl|None guess_bin_size.cwl|None kfdrc_peddy_tool.cwl|pgc-images.sbgenomics.com/d3b-bixu/peddy:latest @@ -60,5 +62,6 @@ scatter_ploidy_calls_by_sample.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.2.0 script_dynamicallycombineintervals.cwl|pgc-images.sbgenomics.com/d3b-bixu/python:2.7.13 strelka2_germline.cwl|pgc-images.sbgenomics.com/d3b-bixu/strelka:v2.9.10 svaba.cwl|pgc-images.sbgenomics.com/d3b-bixu/svaba:1.1.0 +tar.cwl|None variant_effect_predictor_105.cwl|ensemblorg/ensembl-vep:release_105.0 verifybamid_contamination_conditional.cwl|pgc-images.sbgenomics.com/d3b-bixu/verifybamid:1.0.2 From cd7b0a2753e6ea0ffe19c0984802d35f94ad9835 Mon Sep 17 00:00:00 2001 From: dmiller15 Date: Thu, 16 Jan 2025 10:03:48 -0500 Subject: [PATCH 7/8] :fire: remove hardfilter output hooks --- workflows/kfdrc-germline-snv-wf.cwl | 
3 +-- workflows/kfdrc-germline-variant-wf.cwl | 3 +-- workflows/kfdrc-single-sample-genotyping-wf.cwl | 1 - 3 files changed, 2 insertions(+), 5 deletions(-) diff --git a/workflows/kfdrc-germline-snv-wf.cwl b/workflows/kfdrc-germline-snv-wf.cwl index 0e7f2a7..13217ad 100644 --- a/workflows/kfdrc-germline-snv-wf.cwl +++ b/workflows/kfdrc-germline-snv-wf.cwl @@ -279,7 +279,6 @@ outputs: peddy_html: {type: 'File[]', doc: 'html summary of peddy results', outputSource: single_sample_genotyping/peddy_html} peddy_csv: {type: 'File[]', doc: 'csv details of peddy results', outputSource: single_sample_genotyping/peddy_csv} peddy_ped: {type: 'File[]', doc: 'ped format summary of peddy results', outputSource: single_sample_genotyping/peddy_ped} - gatk_hardfiltered_vcf: {type: 'File?', doc: "VCF containing the all genotyping variants with their hardfilters", outputSource: single_sample_genotyping/hardfiltered_vcf} gatk_annotation_plots: {type: 'File?', doc: "TAR file containing plots of all annotations used for hardfiltering", outputSource: single_sample_genotyping/annotation_plots} freebayes_unfiltered_vcf: {type: 'File', outputSource: freebayes/unfiltered_vcf} strelka2_prepass_variants: {type: 'File', outputSource: strelka2/prepass_variants_vcf} @@ -481,7 +480,7 @@ steps: cadd_indels: cadd_indels cadd_snvs: cadd_snvs intervar: intervar - out: [collectvariantcallingmetrics, peddy_html, peddy_csv, peddy_ped, vep_annotated_vcf, annotation_plots, hardfiltered_vcf] + out: [collectvariantcallingmetrics, peddy_html, peddy_csv, peddy_ped, vep_annotated_vcf, annotation_plots] hints: - class: "sbg:maxNumberOfParallelInstances" value: 3 diff --git a/workflows/kfdrc-germline-variant-wf.cwl b/workflows/kfdrc-germline-variant-wf.cwl index 9e27465..8835638 100644 --- a/workflows/kfdrc-germline-variant-wf.cwl +++ b/workflows/kfdrc-germline-variant-wf.cwl @@ -456,7 +456,6 @@ outputs: peddy_html: {type: 'File[]?', doc: 'html summary of peddy results', outputSource: snv/peddy_html} peddy_csv: 
{type: 'File[]?', doc: 'csv details of peddy results', outputSource: snv/peddy_csv} peddy_ped: {type: 'File[]?', doc: 'ped format summary of peddy results', outputSource: snv/peddy_ped} - gatk_hardfiltered_vcf: {type: 'File?', doc: "VCF containing the all genotyping variants with their hardfilters", outputSource: snv/gatk_hardfiltered_vcf} gatk_annotation_plots: {type: 'File?', doc: "TAR file containing plots of all annotations used for hardfiltering", outputSource: snv/gatk_annotation_plots} vep_annotated_gatk_vcf: {type: 'File[]?', outputSource: snv/vep_annotated_gatk_vcf} vep_annotated_strelka_vcf: {type: 'File[]?', outputSource: snv/vep_annotated_strelka_vcf} @@ -612,7 +611,7 @@ steps: run_strelka: run_strelka out: [gatk_gvcf, gatk_gvcf_metrics, verifybamid_output, gatk_vcf_metrics, peddy_html, peddy_csv, peddy_ped, vep_annotated_gatk_vcf, freebayes_unfiltered_vcf, vep_annotated_freebayes_vcf, vep_annotated_strelka_vcf, strelka2_prepass_variants, strelka2_gvcfs, - gatk_hardfiltered_vcf, gatk_annotation_plots] + gatk_annotation_plots] sv: run: ../workflows/kfdrc-germline-sv-wf.cwl in: diff --git a/workflows/kfdrc-single-sample-genotyping-wf.cwl b/workflows/kfdrc-single-sample-genotyping-wf.cwl index bffc10b..e85b5b2 100644 --- a/workflows/kfdrc-single-sample-genotyping-wf.cwl +++ b/workflows/kfdrc-single-sample-genotyping-wf.cwl @@ -202,7 +202,6 @@ outputs: peddy_html: {type: 'File[]', doc: 'html summary of peddy results', outputSource: peddy/output_html} peddy_csv: {type: 'File[]', doc: 'csv details of peddy results', outputSource: peddy/output_csv} peddy_ped: {type: 'File[]', doc: 'ped format summary of peddy results', outputSource: peddy/output_peddy} - hardfiltered_vcf: {type: 'File?', secondaryFiles: [{pattern: '.tbi', required: true}], outputSource: gatk_hardfiltering/hardfiltered_vcf} annotation_plots: {type: 'File?', outputSource: gatk_hardfiltering/annotation_plots} vep_annotated_vcf: {type: 'File[]', outputSource: annotate_vcf/annotated_vcf} From 
e5e3f67ac4e793e2d88aef2c2bd4b995b736346e Mon Sep 17 00:00:00 2001 From: dmiller15 Date: Thu, 16 Jan 2025 11:31:50 -0500 Subject: [PATCH 8/8] :broom: fixes from review --- docs/GERMLINE_SNV_README.md | 1 - docs/GERMLINE_VARIANT_README.md | 1 - scripts/gatk_plot_annotations.R | 6 +++++- subworkflows/gatk_plot_genotyping_annotations.cwl | 4 ++-- 4 files changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/GERMLINE_SNV_README.md b/docs/GERMLINE_SNV_README.md index cec91d3..4594384 100644 --- a/docs/GERMLINE_SNV_README.md +++ b/docs/GERMLINE_SNV_README.md @@ -121,7 +121,6 @@ For more information on the specific annotations, please see the documentation. - `peddy_html`: HTML metrics files from Peddy - `peddy_csv`: CSV metrics for het_check, ped_check, and sex_check from Peddy - `peddy_ped`: PED file with additional metrics information from Peddy - - `gatk_hardfiltered_vcf`: VCF containing the all genotyping variants with their hardfilters - `gatk_annotation_plots`: TAR file containing plots of all annotations used for hardfiltering - Strelka2 - `strelka2_prepass_variants`: Raw variants output from Strelka2 diff --git a/docs/GERMLINE_VARIANT_README.md b/docs/GERMLINE_VARIANT_README.md index b978d42..e431868 100644 --- a/docs/GERMLINE_VARIANT_README.md +++ b/docs/GERMLINE_VARIANT_README.md @@ -146,7 +146,6 @@ user must provide the associated gVCF in the `input_gvcf` input. 
- `peddy_csv`: CSV metrics for het_check, ped_check, and sex_check from Peddy - `peddy_ped`: PED file with additional metrics information from Peddy - `verifybamid_output`: VerifyBAMID output, including contamination score - - `gatk_hardfiltered_vcf`: VCF containing the all genotyping variants with their hardfilters - `gatk_annotation_plots`: TAR file containing plots of all annotations used for hardfiltering - Strelka2 - `strelka2_prepass_variants`: Raw variants output from Strelka2 diff --git a/scripts/gatk_plot_annotations.R b/scripts/gatk_plot_annotations.R index 5f15b80..677f546 100644 --- a/scripts/gatk_plot_annotations.R +++ b/scripts/gatk_plot_annotations.R @@ -1,6 +1,10 @@ #!/usr/bin/env Rscript # plotting.R script loads ggplot and gridExtra libraries and defines functions to plot variant annotations +# Usage: +# Rscript gatk_plot_annotations.R <output_basename> <input_type> <input_table> ... +# <output_basename>, <input_type>, and <input_table> are required +# ... are optional overrides to the field names that will be plotted library(ggplot2) library(gridExtra) @@ -23,7 +27,7 @@ makeDensityPlot <- function(dataframe, xvar, split, xmin=min(dataframe[xvar], na args <- commandArgs(trailingOnly=TRUE) if (length(args) < 3) { - stop("Three arguments are required. First, an ouptut_basename. Second, the input type. Third, the input table.", call.=FALSE) + stop("Three arguments are required. First, an output_basename. Second, the input type. 
Third, the input table.", call.=FALSE) } output_basename <- args[1] diff --git a/subworkflows/gatk_plot_genotyping_annotations.cwl b/subworkflows/gatk_plot_genotyping_annotations.cwl index 6cc5035..f409069 100644 --- a/subworkflows/gatk_plot_genotyping_annotations.cwl +++ b/subworkflows/gatk_plot_genotyping_annotations.cwl @@ -34,7 +34,7 @@ steps: in: input_vcf: indels_vcf fields: indel_plot_annots - output_filename: {valueFrom: "temp.snp.tsv"} + output_filename: {valueFrom: "temp.indel.tsv"} out: [output] gatk_plot_annotations_snps: run: ../tools/gatk_plot_annotations.cwl @@ -47,7 +47,7 @@ steps: gatk_plot_annotations_indels: run: ../tools/gatk_plot_annotations.cwl in: - input_table: gatk_variantstotable_snps/output + input_table: gatk_variantstotable_indels/output input_type: {valueFrom: "INDEL"} output_basename: output_basename annotation_fields: indel_plot_annots