diff --git a/docs/GATK_GERMLINE_README.md b/docs/GATK_GERMLINE_README.md index c5a3069..4969ac3 100644 --- a/docs/GATK_GERMLINE_README.md +++ b/docs/GATK_GERMLINE_README.md @@ -4,7 +4,14 @@ Kids First Data Resource Center Single Sample Genotyping Workflow. This workflow While the Joint Genotyping Workflow is meant to be used with whole genome sequenced trios, this workflow is meant for processing single samples from any sequencing experiment. The key difference between the different approaches is -the filtering process. +the filtering process. Whole Genome samples will be filtered using GATK's +Variant Quality Score Recalibration (VQSR). Whole Exome and Targeted Sequencing +samples will be filtered using GATK's recommended Hard Filter. See below for +more information on both of these filtering processes. Note: it should be +possible to run a whole exome cohort of 30 or more samples through this +workflow. In that case, the workflow will use VQSR to process the cohort. +No internal testing has been performed for this approach so be prepared to +make adjustments to the workflow if you are attempting to run a cohort. While non-germline samples can be run through this workflow, be wary that the filtering process (VQSR/Hard Filtering) is specifically tuned for germline diff --git a/subworkflows/gatk_plot_genotyping_annotations.cwl b/subworkflows/gatk_plot_genotyping_annotations.cwl index f409069..0ce48aa 100644 --- a/subworkflows/gatk_plot_genotyping_annotations.cwl +++ b/subworkflows/gatk_plot_genotyping_annotations.cwl @@ -57,7 +57,7 @@ steps: in: output_filename: source: output_basename - valueFrom: '$(self).genotyping_annotation_plots.tar.gz' + valueFrom: '$(self).single.gatk.genotyped.annotation_plots.tar.gz' input_files: source: [gatk_plot_annotations_snps/plots, gatk_plot_annotations_indels/plots] valueFrom: '$(self[0].concat(self[1]))' diff --git a/workflows/kfdrc-germline-snv-wf.cwl b/workflows/kfdrc-germline-snv-wf.cwl index 13217ad..5c9ffb9 100644 --- a/workflows/kfdrc-germline-snv-wf.cwl +++ b/workflows/kfdrc-germline-snv-wf.cwl @@ -164,7 +164,7 @@ inputs: path: 60639019357c3a53540ca7e7, name: Homo_sapiens_assembly38.dict}, {class: File, path: 60639016357c3a53540ca7af, name: Homo_sapiens_assembly38.fasta.fai}]} output_basename: {type: 'string', doc: "String to use as the base for output filenames"} biospecimen_name: {type: 'string', doc: "String name of biospcimen"} - input_reads: {type: 'File', secondaryFiles: [{pattern: '.bai', required: false}, {pattern: '^.bai', required: false}, {pattern: '.crai', + input_reads: {type: 'File?', secondaryFiles: [{pattern: '.bai', required: false}, {pattern: '^.bai', required: false}, {pattern: '.crai', required: false}, {pattern: '^.crai', required: false}], doc: "Aligned reads files to be analyzed", "sbg:fileTypes": "BAM,CRAM"} input_gvcf: {type: 'File?', secondaryFiles: [{pattern: '.tbi', required: true}], doc: "gVCF associated with input_reads. Providing this value will skip gVCF creation for the GATK pipeline.", "sbg:fileTypes": "VCF.GZ"} @@ -297,7 +297,7 @@ steps: out: [out_file_array] samtools_view: run: ../tools/samtools_view.cwl - when: $(inputs.input_reads.nameext == '.cram' && inputs.run_gatk) + when: $(inputs.input_reads != null && inputs.input_reads.nameext == '.cram' && inputs.run_gatk) in: run_gatk: run_gatk input_reads: input_reads @@ -308,7 +308,7 @@ steps: valueFrom: $(1 == 1) output_filename: valueFrom: | - $(inputs.input_reads.nameroot).bam##idx##$(inputs.input_reads.nameroot).bam.bai + $(inputs.input_reads ? inputs.input_reads.nameroot : 'ph').bam##idx##$(inputs.input_reads ? inputs.input_reads.nameroot : 'ph').bam.bai cpu: valueFrom: $(8) ram: @@ -423,7 +423,7 @@ steps: run_gatk: boolean_to_boolean_gvcf/out_bool input_bam: source: [samtools_view/output, input_reads] - pickValue: first_non_null + valueFrom: '$(self[0] ? self[0] : self[1])' indexed_reference_fasta: indexed_reference_fasta scattered_calling_interval_lists: scatter_regions/scattered_intervallists biospecimen_name: biospecimen_name @@ -466,7 +466,7 @@ steps: genomicsdbimport_extra_args: genomicsdbimport_extra_args output_basename: output_basename tool_name: - valueFrom: "single.vqsr.filtered.vep_105" + valueFrom: "single.gatk.genotyped.filtered.vep_105" bcftools_annot_clinvar_columns: bcftools_annot_clinvar_columns clinvar_annotation_vcf: clinvar_annotation_vcf echtvar_anno_zips: echtvar_anno_zips @@ -500,5 +500,5 @@ $namespaces: - VCF - VEP "sbg:links": -- id: 'https://github.com/kids-first/kf-germline-workflow/releases/tag/v1.1.1' +- id: 'https://github.com/kids-first/kf-germline-workflow/releases/tag/v1.2.0' label: github-release diff --git a/workflows/kfdrc-germline-variant-wf.cwl b/workflows/kfdrc-germline-variant-wf.cwl index 8835638..41c09d5 100644 --- a/workflows/kfdrc-germline-variant-wf.cwl +++ b/workflows/kfdrc-germline-variant-wf.cwl @@ -215,7 +215,7 @@ inputs: name: Homo_sapiens_assembly38.fasta.64.amb}, {class: File, path: 6063901f357c3a53540ca849, name: Homo_sapiens_assembly38.fasta.64.ann}, {class: File, path: 6063901d357c3a53540ca81e, name: Homo_sapiens_assembly38.fasta.64.bwt}, {class: File, path: 6063901c357c3a53540ca801, name: Homo_sapiens_assembly38.fasta.64.pac}, {class: File, path: 60639015357c3a53540ca7a9, name: Homo_sapiens_assembly38.fasta.64.sa}]} - aligned_reads: {type: 'File', secondaryFiles: [{pattern: '.bai', required: false}, {pattern: '^.bai', required: false}, {pattern: '.crai', + aligned_reads: {type: 'File?', secondaryFiles: [{pattern: '.bai', required: false}, {pattern: '^.bai', required: false}, {pattern: '.crai', required: false}, {pattern: '^.crai', required: false}], doc: "Aligned Reads file(s) from which Germline Variants will be discovered", "sbg:fileTypes": "BAM, CRAM"} input_gvcf: {type: 'File?', secondaryFiles: [{pattern: '.tbi', required: true}], doc: "gVCF associated with aligned_reads. Providing @@ -227,11 +227,12 @@ inputs: name: experiment_type symbols: ["WGS", "WXS", "Targeted Sequencing"] doc: "Experimental strategy used to sequence the data of the aligned_reads" + default: "WGS" output_basename: {type: 'string', doc: "String value to use for the basename of all outputs"} cnv_intervals_padding: {type: 'int?', doc: "Length (in bp) of the padding regions on each side of the intervals. This must be the same value used for all case samples."} cnv_intervals_bin_length: {type: 'int?', doc: "Length (in bp) of the bins. If zero, no binning will be performed."} - cnv_intervals: {type: 'File', doc: "Picard or GATK-style interval list of regions to process. For WGS, this should typically only + cnv_intervals: {type: 'File?', doc: "Picard or GATK-style interval list of regions to process. For WGS, this should typically only include the chromosomes of interest.", "sbg:fileTypes": "INTERVALS, INTERVAL_LIST, LIST"} cnv_blacklist_intervals: {type: 'File?', doc: "Picard or GATK-style interval list of regions to ignore.", "sbg:fileTypes": "INTERVALS, INTERVAL_LIST, LIST"} @@ -286,7 +287,7 @@ inputs: cnvnator_disable_gc_correction: {type: 'boolean?', doc: "Do not to use GC corrected RD signal"} contig_ploidy_model_tar: {type: 'File?', doc: "The contig-ploidy model directory generated by the DetermineGermlineContigPloidyCohortMode task in the Cohort workflow.", "sbg:fileTypes": "TAR.GZ"} - gcnv_model_tars: {type: 'File[]', doc: "Array of tars of the contig-ploidy model directories generated by the GermlineCNVCallerCohortMode + gcnv_model_tars: {type: 'File[]?', doc: "Array of tars of the contig-ploidy model directories generated by the GermlineCNVCallerCohortMode tasks in the Cohort workflow.", "sbg:fileTypes": "TAR.GZ"} disabled_read_filters_for_collect_counts: {type: 'string[]?', doc: "Read filters to be disabled before analysis by GATK CollectReadCounts."} ploidy_mapping_error_rate: {type: 'float?', doc: "Typical mapping error rate."} @@ -567,6 +568,7 @@ steps: indexed_reference_fasta: indexed_reference_fasta input_reads: aligned_reads input_gvcf: input_gvcf + experiment_type: experiment_type output_basename: output_basename biospecimen_name: biospecimen_name calling_regions: snv_calling_regions diff --git a/workflows/kfdrc-single-sample-genotyping-wf.cwl b/workflows/kfdrc-single-sample-genotyping-wf.cwl index c4699ed..2f79459 100644 --- a/workflows/kfdrc-single-sample-genotyping-wf.cwl +++ b/workflows/kfdrc-single-sample-genotyping-wf.cwl @@ -106,10 +106,12 @@ inputs: input_vcfs: {type: 'File[]', doc: 'Input array of individual sample gVCF files'} experiment_type: type: + - 'null' - type: enum name: experiment_type symbols: ["WGS", "WXS", "Targeted Sequencing"] doc: "Experimental strategy used to sequence the data in the input_vcfs" + default: "WGS" axiomPoly_resource_vcf: {type: File, secondaryFiles: [{pattern: '.tbi', required: true}], doc: 'Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz', "sbg:suggestedValue": {class: File, path: 60639016357c3a53540ca7c7, name: Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz, secondaryFiles: [{class: File, path: 6063901d357c3a53540ca81b, name: Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi}]}} @@ -140,7 +142,7 @@ inputs: genomicsdbimport_extra_args: {type: 'string?', doc: "Any extra arguments to give to GenomicsDBImport"} genotypegvcfs_extra_args: {type: 'string?', doc: "Any extra arguments to give to GenotypeGVCFs"} output_basename: string - tool_name: {type: 'string?', default: "single.vqsr.filtered.vep_105", doc: "File name string suffx to use for output files"} + tool_name: {type: 'string?', default: "single.gatk.genotyped.filtered.vep_105", doc: "File name string suffx to use for output files"} # VQSR Options vqsr_snp_max_gaussians: {type: 'int?', doc: "Interger value for max gaussians in SNP VariantRecalibration. If a dataset gives fewer @@ -335,7 +337,7 @@ steps: $(self.secondaryFiles.filter(function(e) {return e.nameext == '.dict'})[0]) output_basename: source: output_basename - valueFrom: $(self).gatk.germline.hardfiltered + valueFrom: $(self).single.gatk.genotyped.filtered dbsnp_vcf: dbsnp_vcf wgs_evaluation_interval_list: wgs_evaluation_interval_list out: [output] @@ -378,5 +380,5 @@ hints: - VCF - VEP "sbg:links": -- id: 'https://github.com/kids-first/kf-germline-workflow/releases/tag/v1.1.1' +- id: 'https://github.com/kids-first/kf-germline-workflow/releases/tag/v1.2.0' label: github-release