kids-first
diff --git a/‎docs/GATK_GERMLINE_README.md
+46-4 b/‎docs/GATK_GERMLINE_README.md
+46-4
diff --git a/‎docs/dockers_gatk_genotyping.md
+3-1 b/‎docs/dockers_gatk_genotyping.md
+3-1
diff --git a/‎docs/dockers_germline_variant.md
+3-1 b/‎docs/dockers_germline_variant.md
+3-1
diff --git a/‎subworkflows/kfdrc-gatk-hardfiltering.cwl
+28-7 b/‎subworkflows/kfdrc-gatk-hardfiltering.cwl
+28-7
diff --git a/‎subworkflows/kfdrc-gatk-vqsr.cwl
+177 b/‎subworkflows/kfdrc-gatk-vqsr.cwl
+177
@@ -1,9 +1,51 @@
 # Kids First DRC Single Sample Genotyping Workflow
 Kids First Data Resource Center Single Sample Genotyping Workflow. This workflow closely mirrors the [Kids First DRC Joint Genotyping Workflow](https://github.com/kids-first/kf-jointgenotyping-workflow/blob/master/workflow/kfdrc-jointgenotyping-refinement-workflow.cwl).
-While the Joint Genotyping Workflow is meant to be used with trios, this workflow is meant for processing single samples.
-The key difference in this pipeline is a change in filtering between when the final VCF is gathered by GATK GatherVcfCloud and when it is annotated by VEP bcftools (see [Kids First DRC Germline SNV Annotation Workflow docs](https://github.com/kids-first/kf-annotation-tools/blob/v1.1.0/docs/GERMLINE_SNV_ANNOT_README.md) ).
-Unlike the Joint Genotyping Workflow, a germline-oriented [GATK hard filtering process](https://gatk.broadinstitute.org/hc/en-us/articles/360035890471-Hard-filtering-germline-short-variants) is performed and CalculateGenotypePosteriors has been removed.
-While somatic samples can be run through this workflow, be wary that the filtering process is specifically tuned for germline data.
+
+While the Joint Genotyping Workflow is meant to be used with whole genome
+sequenced trios, this workflow is meant for processing single samples from any
+sequencing experiment. The key difference between the different approaches is
+the filtering process.
+
+While non-germline samples can be run through this workflow, be wary that the
+filtering process (VQSR/Hard Filtering) is specifically tuned for germline
+data. We strongly recommend manually adjusting this process to fit your data.
+See the available `vqsr_` and `hardfilter_` options.
+
+## GATK Genotype Site-Level Filtering
+
+Coming out of the GATK Genotyping process, site-level filtering must be done to
+remove variants that might adversely affect downstream analysis.
+
+GATK provides many different approaches to filtering:
+- Variant Quality Score Recalibration (VQSR)
+- CNNScoreVariants/NVScoreVariants
+- Variant Extract-Train-Score (VETS)
+- Hard Filtering
+
+The first three are all complex, model-based approaches that attempt to infer
+cutoff points based on the data provided. Hard Filtering involves manually setting
+thresholds and removing variants that fail to meet those thresholds. For this
+workflow, we only make use of VQSR and Hard Filtering at this time.
+
+VQSR, being a model-based approach, needs sufficient data to construct that
+model. Normally in the joint filtering context, this means having hundreds of
+samples. According to the documentation: "it is not suitable for some
+small-scale experiments, such as targeted gene panels or exome studies with
+fewer than 30 exomes." Therefore, VQSR is only activated in this workflow when
+the input gVCFs for this workflow come from whole genome sequencing experiments
+or when the user provides 30 or more exome gVCFs. The 30+ samples will be jointly
+genotyped and that genotyped VCF will be provided to VQSR.
+
+Hard Filtering is really only constrained by having sufficient depth. In the
+case of exome and targeted sequencing, the depths are more than sufficient. Our
+current approach for hard filtering mirrors the default approach outlined in
+the GATK documentation. However, as they point out, "You absolutely SHOULD
+expect to have to evaluate your results critically and TRY AGAIN with some
+parameter adjustments until you find the settings that are right for your
+data." As such, the workflow also allows you to provide your own hard filters
+to replace the defaults in this workflow.
+
+## Running the Workflow
 
 If you would like to run this workflow using the CAVATICA public app, a basic primer on running public apps can be found [here](https://www.notion.so/d3b/Starting-From-Scratch-Running-Cavatica-af5ebb78c38a4f3190e32e67b4ce12bb).
 Alternatively, if you'd like to run it locally using `cwltool`, a basic primer on that can be found [here](https://www.notion.so/d3b/Starting-From-Scratch-Running-CWLtool-b8dbbde2dc7742e4aff290b0a878344d) and combined with app-specific info from the readme below.
 
@@ -7,12 +7,14 @@ bcftools_concat.cwl|pgc-images.sbgenomics.com/d3b-bixu/vcfutils:latest
 bcftools_filter_vcf.cwl|pgc-images.sbgenomics.com/d3b-bixu/bcftools:1.20
 bcftools_strip_ann.cwl|pgc-images.sbgenomics.com/d3b-bixu/vcfutils:latest
 echtvar_anno.cwl|pgc-images.sbgenomics.com/d3b-bixu/echtvar:0.2.0
+filtering_defaults.cwl|None
 gatk_applyrecalibration.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
 gatk_gatherfinalvcf.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
 gatk_gathertranches.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
 gatk_gathervcfs.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
-gatk_import_genotype_filtergvcf_merge.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
+gatk_genomicsdbimport_genotypegvcfs.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
 gatk_indelsvariantrecalibrator.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
+gatk_makesitesonlyvcf.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
 gatk_selectvariants.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.2.0.0R
 gatk_snpsvariantrecalibratorcreatemodel.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
 gatk_snpsvariantrecalibratorscattered.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
 
@@ -24,6 +24,7 @@ echtvar_anno.cwl|pgc-images.sbgenomics.com/d3b-bixu/echtvar:0.2.0
 expression_create_index_array.cwl|None
 expression_transpose_two_dimension_array.cwl|None
 file_to_file_array.cwl|None
+filtering_defaults.cwl|None
 freebayes.cwl|staphb/freebayes:1.3.6
 gatk_applyrecalibration.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
 gatk_bedtointervallist.cwl|broadinstitute/gatk:4.4.0.0
@@ -32,12 +33,13 @@ gatk_determinegermlinecontigploidy_case.cwl|broadinstitute/gatk:4.2.0.0
 gatk_gatherfinalvcf.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
 gatk_gathertranches.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
 gatk_gathervcfs.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
+gatk_genomicsdbimport_genotypegvcfs.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
 gatk_germlinecnvcaller_case.cwl|broadinstitute/gatk:4.2.0.0
 gatk_haplotypecaller.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.beta.1-3.5
-gatk_import_genotype_filtergvcf_merge.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
 gatk_indelsvariantrecalibrator.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
 gatk_intervallisttobed.cwl|broadinstitute/gatk:4.4.0.0
 gatk_intervallisttools.cwl|broadinstitute/gatk:4.4.0.0
+gatk_makesitesonlyvcf.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.0.12.0
 gatk_mergevcfs.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.1.1.0
 gatk_postprocessgermlinecnvcalls.cwl|broadinstitute/gatk:4.2.0.0
 gatk_preprocessintervals.cwl|pgc-images.sbgenomics.com/d3b-bixu/gatk:4.2.0.0R
 
@@ -1,6 +1,9 @@
-cwlVersion: v1.0
+cwlVersion: v1.2
 class: Workflow
 id: kfdrc-gatk-hardfiltering
+requirements:
+# Required for the `valueFrom` fields on the VariantFiltration step inputs.
+- class: StepInputExpressionRequirement
+# NOTE(review): the visible `valueFrom` fields only use the `$(self)` parameter reference; confirm JavaScript is needed.
+- class: InlineJavascriptRequirement
 doc: |-
   This workflow performs manual site-level variant filtration on an input VCF using the generic hard-filtering thresholds and example commands in the
   [documentation from Broad](https://gatk.broadinstitute.org/hc/en-us/articles/360035531112--How-to-Filter-variants-either-with-VQSR-or-by-hard-filtering#2).
@@ -9,11 +12,17 @@ doc: |-
   Finally the VCFs are merged back together using bcftools concat and returned.
 
 inputs:
-  input_vcf: {type: 'File', secondaryFiles: [.tbi], doc: "Input VCF containing INDEL and SNP variants"}
+  input_vcf: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], doc: "Input VCF containing INDEL and SNP variants"}
   output_basename: {type: 'string', doc: "String value to use as the base for the filename of the output"}
+  # Filter strings and optional extra arguments, one set per variant class.
+  snp_hardfilters: {type: 'string', doc: "String value of hardfilters to set for SNPs in input_vcf"}
+  indel_hardfilters: {type: 'string', doc: "String value of hardfilters to set for INDELs in input_vcf"}
+  snp_filtration_extra_args: {type: 'string?', doc: "Any extra arguments for SNP VariantFiltration"}
+  indel_filtration_extra_args: {type: 'string?', doc: "Any extra arguments for INDEL VariantFiltration"}
+  # Optional resource overrides shared by both VariantFiltration steps.
+  filtration_cpu: {type: 'int?', doc: "CPUs to allocate to GATK VariantFiltration"}
+  filtration_ram: {type: 'int?', doc: "GB of RAM to allocate to GATK VariantFiltration"}
 
 outputs:
-  hardfiltered_vcf: {type: 'File', outputSource: bcftools_concat_snps_indels/output}
+  # Output of the final bcftools concat step; its .tbi index is declared as required.
+  hardfiltered_vcf: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], outputSource: bcftools_concat_snps_indels/output}
 
 steps:
   gatk_selectvariants_snps:
@@ -34,15 +43,27 @@ steps:
     run: ../tools/gatk_variantfiltration.cwl
     in:
       input_vcf: gatk_selectvariants_snps/output
-      output_basename: output_basename
-      selection: {valueFrom: "SNP"}
+      output_basename:
+        source: output_basename
+        # "|-" chomps the block scalar's final newline so it cannot end up in the
+        # interpolated basename on engines that do not strip whitespace themselves.
+        valueFrom: |-
+          $(self).snp.filtered
+      # SNP-specific filter string and optional extra arguments from the workflow inputs.
+      variant_filters: snp_hardfilters
+      extra_args: snp_filtration_extra_args
+      # Optional resource overrides shared with the INDEL filtration step.
+      max_memory: filtration_ram
+      cpu: filtration_cpu
     out: [output]
   gatk_variantfiltration_indels:
     run: ../tools/gatk_variantfiltration.cwl
     in:
       input_vcf: gatk_selectvariants_indels/output
-      output_basename: output_basename
-      selection: {valueFrom: "INDEL"}
+      output_basename:
+        source: output_basename
+        # "|-" chomps the block scalar's final newline so it cannot end up in the
+        # interpolated basename on engines that do not strip whitespace themselves.
+        valueFrom: |-
+          $(self).indel.filtered
+      # INDEL-specific filter string and optional extra arguments from the workflow inputs.
+      variant_filters: indel_hardfilters
+      extra_args: indel_filtration_extra_args
+      # Optional resource overrides shared with the SNP filtration step.
+      max_memory: filtration_ram
+      cpu: filtration_cpu
     out: [output]
   bcftools_concat_snps_indels:
     run: ../tools/bcftools_concat.cwl
 
@@ -0,0 +1,177 @@
+cwlVersion: v1.2
+class: Workflow
+id: kfdrc-gatk-vqsr
+doc: |-
+  GATK workflow for Variant Quality Score Recalibration (VQSR)
+requirements:
+# Several steps below fan out over arrays of VCFs with `scatter`.
+- class: ScatterFeatureRequirement
+# NOTE(review): no JavaScript expression is visible in this workflow; confirm this is still needed.
+- class: InlineJavascriptRequirement
+# Required for the `valueFrom` fields on step inputs (the fixed output names and the ExcessHet filter).
+- class: StepInputExpressionRequirement
+
+inputs:
+  # Array of genotyped VCFs (the first step scatters over it) and the basename given to the final gather step.
+  genotyped_vcfs: {type: 'File[]', secondaryFiles: [{pattern: '.tbi', required: true}], doc: "Input VCF that has been jointly genotyped"}
+  output_basename: {type: 'string', doc: "String value to use as the base for the filename of the output"}
+
+  # Resource VCFs handed to the VariantRecalibrator steps: the SNP steps receive dbsnp, hapmap, omni and
+  # 1000G; the INDEL step receives axiomPoly, dbsnp and mills.
+  axiomPoly_resource_vcf: {type: File, secondaryFiles: [{pattern: '.tbi', required: true}], doc: 'Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz',
+    "sbg:suggestedValue": {class: File, path: 60639016357c3a53540ca7c7, name: Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz,
+      secondaryFiles: [{class: File, path: 6063901d357c3a53540ca81b, name: Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi}]}}
+  dbsnp_vcf: {type: File, secondaryFiles: [{pattern: '.idx', required: true}], doc: 'Homo_sapiens_assembly38.dbsnp138.vcf', "sbg:suggestedValue": {
+      class: File, path: 6063901f357c3a53540ca84b, name: Homo_sapiens_assembly38.dbsnp138.vcf, secondaryFiles: [{class: File, path: 6063901e357c3a53540ca834,
+          name: Homo_sapiens_assembly38.dbsnp138.vcf.idx}]}}
+  hapmap_resource_vcf: {type: File, secondaryFiles: [{pattern: '.tbi', required: true}], doc: 'Hapmap genotype SNP input vcf', "sbg:suggestedValue": {
+      class: File, path: 60639016357c3a53540ca7be, name: hapmap_3.3.hg38.vcf.gz, secondaryFiles: [{class: File, path: 60639016357c3a53540ca7c5,
+          name: hapmap_3.3.hg38.vcf.gz.tbi}]}}
+  mills_resource_vcf: {type: File, secondaryFiles: [{pattern: '.tbi', required: true}], doc: 'Mills_and_1000G_gold_standard.indels.hg38.vcf.gz',
+    "sbg:suggestedValue": {class: File, path: 6063901a357c3a53540ca7f3, name: Mills_and_1000G_gold_standard.indels.hg38.vcf.gz, secondaryFiles: [
+        {class: File, path: 6063901c357c3a53540ca806, name: Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi}]}}
+  omni_resource_vcf: {type: File, secondaryFiles: [{pattern: '.tbi', required: true}], doc: '1000G_omni2.5.hg38.vcf.gz', "sbg:suggestedValue": {
+      class: File, path: 6063901e357c3a53540ca835, name: 1000G_omni2.5.hg38.vcf.gz, secondaryFiles: [{class: File, path: 60639016357c3a53540ca7b1,
+          name: 1000G_omni2.5.hg38.vcf.gz.tbi}]}}
+  one_thousand_genomes_resource_vcf: {type: File, secondaryFiles: [{pattern: '.tbi', required: true}], doc: '1000G_phase1.snps.high_confidence.hg38.vcf.gz,
+      high confidence snps', "sbg:suggestedValue": {class: File, path: 6063901c357c3a53540ca80f, name: 1000G_phase1.snps.high_confidence.hg38.vcf.gz,
+      secondaryFiles: [{class: File, path: 6063901e357c3a53540ca845, name: 1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi}]}}
+  # VQSR tuning parameters: the max-gaussians overrides are optional, the rest are required.
+  snp_max_gaussians: {type: 'int?', doc: "Integer value for max gaussians in SNP VariantRecalibration. If a dataset gives fewer variants
+      than the expected scale, the number of Gaussians for training should be turned down. Lowering the max-Gaussians forces the program
+      to group variants into a smaller number of clusters, which results in more variants per cluster."}
+  indel_max_gaussians: {type: 'int?', doc: "Integer value for max gaussians in INDEL VariantRecalibration. If a dataset gives fewer
+      variants than the expected scale, the number of Gaussians for training should be turned down. Lowering the max-Gaussians forces
+      the program to group variants into a smaller number of clusters, which results in more variants per cluster."}
+  snp_tranches: { type: 'string[]', doc: "The levels of truth sensitivity at which to slice the SNP recalibration data, in percent." }
+  snp_annotations: { type: 'string[]', doc: "The names of the annotations which should be used for SNP recalibration calculations." }
+  indel_tranches: { type: 'string[]', doc: "The levels of truth sensitivity at which to slice the INDEL recalibration data, in percent." }
+  indel_annotations: { type: 'string[]', doc: "The names of the annotations which should be used for INDEL recalibration calculations." }
+  snp_ts_filter_level: { type: 'float', doc: "The truth sensitivity level at which to start filtering SNP data" }
+  indel_ts_filter_level: { type: 'float', doc: "The truth sensitivity level at which to start filtering INDEL data" }
+
+  # Resource Control
+  # Optional overrides. Each cpu/ram pair is wired to the `cpu`/`ram` inputs of one step:
+  #   snp_model_*    -> gatk_snpsvariantrecalibratorcreatemodel
+  #   indel_recal_*  -> gatk_indelsvariantrecalibrator
+  #   snp_recal_*    -> gatk_snpsvariantrecalibratorscattered
+  #   gathertranche_* -> gatk_gathertranches
+  #   apply_*        -> gatk_applyrecalibration
+  #   gathervcf_*    -> gatk_gatherfinalvcf
+  snp_model_cpu: { type: 'int?', doc: "CPUs to allocate to VariantRecalibrator for SNP model creation." }
+  snp_model_ram: { type: 'int?', doc: "GB of RAM to allocate to VariantRecalibrator for SNP model creation." }
+  indel_recal_cpu: { type: 'int?', doc: "CPUs to allocate to VariantRecalibrator for INDEL recalibration." }
+  indel_recal_ram: { type: 'int?', doc: "GB of RAM to allocate to VariantRecalibrator for INDEL recalibration." }
+  snp_recal_cpu: { type: 'int?', doc: "CPUs to allocate to VariantRecalibrator for scattered SNP recalibration." }
+  snp_recal_ram: { type: 'int?', doc: "GB of RAM to allocate to VariantRecalibrator for scattered SNP recalibration." }
+  gathertranche_cpu: { type: 'int?', doc: "CPUs to allocate to GatherTranches." }
+  gathertranche_ram: { type: 'int?', doc: "GB of RAM to allocate to GatherTranches." }
+  apply_cpu: { type: 'int?', doc: "CPUs to allocate to ApplyVQSR for INDELs and SNPs." }
+  apply_ram: { type: 'int?', doc: "GB of RAM to allocate to ApplyVQSR for INDELs and SNPs." }
+  gathervcf_cpu: { type: 'int?', doc: "CPUs to allocate to GatherVcfsCloud." }
+  gathervcf_ram: { type: 'int?', doc: "GB of RAM to allocate to GatherVcfsCloud." }
+
+outputs:
+  # Explicit v1.2 form so the .tbi index is required; a bare-string pattern is optional on outputs.
+  recalibrated_vcf: { type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], outputSource: gatk_gatherfinalvcf/output }
+
+steps:
+  # Runs the VariantFiltration tool once per genotyped VCF, passing a fixed filter
+  # expression (ExcessHet > 54.69, filter name ExcessHet) and a fixed output basename.
+  gatk_filter_excesshet:
+    run: ../tools/gatk_variantfiltration.cwl
+    scatter: [input_vcf]
+    hints:
+    - class: 'sbg:AWSInstanceType'
+      value: m5.4xlarge
+    in:
+      input_vcf: genotyped_vcfs
+      output_basename:
+        valueFrom: 'excesshet_filtered'
+      variant_filters:
+        valueFrom: '--filter-expression "ExcessHet > 54.69" --filter-name ExcessHet'
+    out: [output]
+  # Runs the make-sites-only tool once per ExcessHet-filtered VCF; every scattered job
+  # uses the same fixed output filename.
+  gatk_makesitesonlyvcf:
+    run: ../tools/gatk_makesitesonlyvcf.cwl
+    scatter: [input_vcf]
+    hints:
+    - class: 'sbg:AWSInstanceType'
+      value: m5.4xlarge
+    in:
+      input_vcf: gatk_filter_excesshet/output
+      output_filename:
+        valueFrom: 'sites_only.variant_filtered.vcf.gz'
+    out: [sites_vcf]
+  # Gathers the per-input sites-only VCFs into the single VCF consumed by the
+  # SNP model-creation and INDEL recalibration steps.
+  gatk_gathervcfs:
+    run: ../tools/gatk_gathervcfs.cwl
+    in:
+      input_vcfs: gatk_makesitesonlyvcf/sites_vcf
+    out: [output]
+  # Produces the SNP model report from the gathered sites-only VCF; the report is
+  # reused by every job of the scattered SNP recalibration step.
+  gatk_snpsvariantrecalibratorcreatemodel:
+    run: ../tools/gatk_snpsvariantrecalibratorcreatemodel.cwl
+    in:
+      dbsnp_resource_vcf: dbsnp_vcf
+      hapmap_resource_vcf: hapmap_resource_vcf
+      omni_resource_vcf: omni_resource_vcf
+      one_thousand_genomes_resource_vcf: one_thousand_genomes_resource_vcf
+      sites_only_variant_filtered_vcf: gatk_gathervcfs/output
+      max_gaussians: snp_max_gaussians
+      tranche: snp_tranches
+      annotations: snp_annotations
+      cpu: snp_model_cpu
+      ram: snp_model_ram
+    out: [model_report]
+  # INDEL recalibration on the gathered sites-only VCF (not scattered); its recalibration
+  # and tranches outputs go straight to gatk_applyrecalibration.
+  gatk_indelsvariantrecalibrator:
+    run: ../tools/gatk_indelsvariantrecalibrator.cwl
+    in:
+      axiomPoly_resource_vcf: axiomPoly_resource_vcf
+      dbsnp_resource_vcf: dbsnp_vcf
+      mills_resource_vcf: mills_resource_vcf
+      sites_only_variant_filtered_vcf: gatk_gathervcfs/output
+      max_gaussians: indel_max_gaussians
+      tranche: indel_tranches
+      annotations: indel_annotations
+      cpu: indel_recal_cpu
+      ram: indel_recal_ram
+    out: [recalibration, tranches]
+  # SNP recalibration, one job per VCF, each reusing the shared model report.
+  # NOTE(review): the input is named sites_only_variant_filtered_vcf but is fed the full
+  # ExcessHet-filtered VCFs rather than gatk_makesitesonlyvcf/sites_vcf — confirm this is intended.
+  gatk_snpsvariantrecalibratorscattered:
+    run: ../tools/gatk_snpsvariantrecalibratorscattered.cwl
+    scatter: [sites_only_variant_filtered_vcf]
+    hints:
+    - class: 'sbg:AWSInstanceType'
+      value: r5.2xlarge
+    in:
+      sites_only_variant_filtered_vcf: gatk_filter_excesshet/output
+      model_report: gatk_snpsvariantrecalibratorcreatemodel/model_report
+      hapmap_resource_vcf: hapmap_resource_vcf
+      omni_resource_vcf: omni_resource_vcf
+      one_thousand_genomes_resource_vcf: one_thousand_genomes_resource_vcf
+      dbsnp_resource_vcf: dbsnp_vcf
+      max_gaussians: snp_max_gaussians
+      tranche: snp_tranches
+      annotations: snp_annotations
+      cpu: snp_recal_cpu
+      ram: snp_recal_ram
+    out: [recalibration, tranches]
+  # Combines the tranches files from the scattered SNP recalibration jobs into the
+  # single tranches file passed to gatk_applyrecalibration.
+  gatk_gathertranches:
+    run: ../tools/gatk_gathertranches.cwl
+    hints:
+    - class: 'sbg:AWSInstanceType'
+      value: r5.2xlarge
+    in:
+      tranches: gatk_snpsvariantrecalibratorscattered/tranches
+      cpu: gathertranche_cpu
+      ram: gathertranche_ram
+    out: [output]
+  # Applies the recalibrations per VCF. The dotproduct scatter pairs the i-th ExcessHet-filtered
+  # VCF with the i-th SNP recalibration, so both arrays must keep the same order and length.
+  # The INDEL recalibration/tranches and the gathered SNP tranches are shared by every job.
+  gatk_applyrecalibration:
+    run: ../tools/gatk_applyrecalibration.cwl
+    scatter: [input_vcf, snps_recalibration]
+    scatterMethod: dotproduct
+    hints:
+    - class: 'sbg:AWSInstanceType'
+      value: r5.2xlarge
+    in:
+      indels_recalibration: gatk_indelsvariantrecalibrator/recalibration
+      indels_tranches: gatk_indelsvariantrecalibrator/tranches
+      input_vcf: gatk_filter_excesshet/output
+      snps_recalibration: gatk_snpsvariantrecalibratorscattered/recalibration
+      snps_tranches: gatk_gathertranches/output
+      snp_ts_filter_level: snp_ts_filter_level
+      indel_ts_filter_level: indel_ts_filter_level
+      cpu: apply_cpu
+      ram: apply_ram
+    out: [recalibrated_vcf]
+  # Gathers the recalibrated per-input VCFs into the workflow's single output,
+  # named from output_basename.
+  gatk_gatherfinalvcf:
+    run: ../tools/gatk_gatherfinalvcf.cwl
+    in:
+      input_vcfs: gatk_applyrecalibration/recalibrated_vcf
+      output_basename: output_basename
+      cpu: gathervcf_cpu
+      ram: gathervcf_ram
+    out: [output]
+
+# Declares the `sbg:` prefix used by the sbg:suggestedValue and sbg:AWSInstanceType entries above.
+$namespaces:
+  sbg: https://sevenbridges.com