Skip to content

Commit 4ee1df2

Browse files
committed
🚧 rework hardfiltering
🐛 fix typos 🧹 cleanup unused tools 🧹 return old output name
1 parent 499cbc6 commit 4ee1df2

18 files changed

+183
-257
lines changed

subworkflows/kfdrc-gatk-hardfiltering.cwl

+22-6
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,17 @@ doc: |-
1212
Finally the VCFs are merged back together using bcftools concat and returned.
1313

1414
inputs:
15-
input_vcf: {type: 'File', secondaryFiles: [.tbi], doc: "Input VCF containing INDEL and SNP variants"}
15+
input_vcf: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], doc: "Input VCF containing INDEL and SNP variants"}
1616
output_basename: {type: 'string', doc: "String value to use as the base for the filename of the output"}
1717
snp_hardfilters: {type: 'string', doc: "String value of hardfilters to set for SNPs in input_vcf" }
1818
indel_hardfilters: {type: 'string', doc: "String value of hardfilters to set for INDELs in input_vcf" }
19+
snp_filtration_extra_args: {type: 'string?', doc: "Any extra arguments for SNP VariantFiltration" }
20+
indel_filtration_extra_args: {type: 'string?', doc: "Any extra arguments for INDEL VariantFiltration" }
21+
filtration_cpu: { type: 'int?', doc: "CPUs to allocate to GATK VariantFiltration" }
22+
filtration_ram: { type: 'int?', doc: "GB of RAM to allocate to GATK VariantFiltration" }
1923

2024
outputs:
21-
hardfiltered_vcf: {type: 'File', outputSource: bcftools_concat_snps_indels/output}
25+
hardfiltered_vcf: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], outputSource: bcftools_concat_snps_indels/output}
2226

2327
steps:
2428
gatk_selectvariants_snps:
@@ -39,15 +43,27 @@ steps:
3943
run: ../tools/gatk_variantfiltration.cwl
4044
in:
4145
input_vcf: gatk_selectvariants_snps/output
42-
output_basename: output_basename
43-
selection: {valueFrom: "SNP"}
46+
output_basename:
47+
source: output_basename
48+
valueFrom: |
49+
$(self).snp.filtered
50+
variant_filters: snp_hardfilters
51+
extra_args: snp_filtration_extra_args
52+
max_memory: filtration_ram
53+
cpu: filtration_cpu
4454
out: [output]
4555
gatk_variantfiltration_indels:
4656
run: ../tools/gatk_variantfiltration.cwl
4757
in:
4858
input_vcf: gatk_selectvariants_indels/output
49-
output_basename: output_basename
50-
selection: {valueFrom: "INDEL"}
59+
output_basename:
60+
source: output_basename
61+
valueFrom: |
62+
$(self).indel.filtered
63+
variant_filters: indel_hardfilters
64+
extra_args: indel_filtration_extra_args
65+
max_memory: filtration_ram
66+
cpu: filtration_cpu
5167
out: [output]
5268
bcftools_concat_snps_indels:
5369
run: ../tools/bcftools_concat.cwl

subworkflows/kfdrc-gatk-vqsr.cwl

+22-12
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@ doc: |-
66
requirements:
77
- class: ScatterFeatureRequirement
88
- class: InlineJavascriptRequirement
9+
- class: StepInputExpressionRequirement
910

1011
inputs:
1112
genotyped_vcfs: {type: 'File[]', secondaryFiles: [{pattern: '.tbi', required: true}], doc: "Input VCF that has been jointly genotyped"}
@@ -61,20 +62,26 @@ outputs:
6162

6263
steps:
6364
gatk_filter_excesshet:
64-
run: ../tools/tools/gatk_variantfiltration2.cwl
65+
run: ../tools/gatk_variantfiltration.cwl
6566
scatter: [input_vcf]
67+
hints:
68+
- class: 'sbg:AWSInstanceType'
69+
value: m5.4xlarge
6670
in:
6771
input_vcf: genotyped_vcfs
68-
output_filename:
69-
valueFrom: 'variant_filtered.vcf.gz'
72+
output_basename:
73+
valueFrom: 'excesshet_filtered'
7074
variant_filters:
7175
valueFrom: '--filter-expression "ExcessHet > 54.69" --filter-name ExcessHet'
72-
out: [filtered_vcf]
76+
out: [output]
7377
gatk_makesitesonlyvcf:
7478
run: ../tools/gatk_makesitesonlyvcf.cwl
7579
scatter: [input_vcf]
80+
hints:
81+
- class: 'sbg:AWSInstanceType'
82+
value: m5.4xlarge
7683
in:
77-
input_vcf: gatk_filter_excesshet/filtered_vcf
84+
input_vcf: gatk_filter_excesshet/output
7885
output_filename:
7986
valueFrom: 'sites_only.variant_filtered.vcf.gz'
8087
out: [sites_vcf]
@@ -92,7 +99,7 @@ steps:
9299
one_thousand_genomes_resource_vcf: one_thousand_genomes_resource_vcf
93100
sites_only_variant_filtered_vcf: gatk_gathervcfs/output
94101
max_gaussians: snp_max_gaussians
95-
tranches: snp_tranches
102+
tranche: snp_tranches
96103
annotations: snp_annotations
97104
cpu: snp_model_cpu
98105
ram: snp_model_ram
@@ -105,7 +112,7 @@ steps:
105112
mills_resource_vcf: mills_resource_vcf
106113
sites_only_variant_filtered_vcf: gatk_gathervcfs/output
107114
max_gaussians: indel_max_gaussians
108-
tranches: indel_tranches
115+
tranche: indel_tranches
109116
annotations: indel_annotations
110117
cpu: indel_recal_cpu
111118
ram: indel_recal_ram
@@ -115,22 +122,25 @@ steps:
115122
scatter: [sites_only_variant_filtered_vcf]
116123
hints:
117124
- class: 'sbg:AWSInstanceType'
118-
value: r5.4xlarge
125+
value: r5.2xlarge
119126
in:
120-
sites_only_variant_filtered_vcf: gatk_filter_execesshet/filtered_vcf
127+
sites_only_variant_filtered_vcf: gatk_filter_excesshet/output
121128
model_report: gatk_snpsvariantrecalibratorcreatemodel/model_report
122129
hapmap_resource_vcf: hapmap_resource_vcf
123130
omni_resource_vcf: omni_resource_vcf
124131
one_thousand_genomes_resource_vcf: one_thousand_genomes_resource_vcf
125132
dbsnp_resource_vcf: dbsnp_vcf
126133
max_gaussians: snp_max_gaussians
127-
tranches: snp_tranches
134+
tranche: snp_tranches
128135
annotations: snp_annotations
129136
cpu: snp_recal_cpu
130137
ram: snp_recal_ram
131138
out: [recalibration, tranches]
132139
gatk_gathertranches:
133140
run: ../tools/gatk_gathertranches.cwl
141+
hints:
142+
- class: 'sbg:AWSInstanceType'
143+
value: r5.2xlarge
134144
in:
135145
tranches: gatk_snpsvariantrecalibratorscattered/tranches
136146
cpu: gathertranche_cpu
@@ -142,11 +152,11 @@ steps:
142152
scatterMethod: dotproduct
143153
hints:
144154
- class: 'sbg:AWSInstanceType'
145-
value: r5.4xlarge
155+
value: r5.2xlarge
146156
in:
147157
indels_recalibration: gatk_indelsvariantrecalibrator/recalibration
148158
indels_tranches: gatk_indelsvariantrecalibrator/tranches
149-
input_vcf: variants_vcfs
159+
input_vcf: gatk_filter_excesshet/output
150160
snps_recalibration: gatk_snpsvariantrecalibratorscattered/recalibration
151161
snps_tranches: gatk_gathertranches/output
152162
snp_ts_filter_level: snp_ts_filter_level

tools/bcftools_concat.cwl

+4-4
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
cwlVersion: v1.0
1+
cwlVersion: v1.2
22
class: CommandLineTool
33
id: bcftools_concat
44
requirements:
@@ -27,8 +27,8 @@ arguments:
2727
tabix $(inputs.output_basename).merged.vcf.gz
2828

2929
inputs:
30-
indel_vcf: { type: 'File', secondaryFiles: ['.tbi'], doc: "VCF file containing INDELs" }
31-
snp_vcf: { type: 'File', secondaryFiles: ['.tbi'], doc: "VCF file containing SNPs" }
30+
indel_vcf: { type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], doc: "VCF file containing INDELs" }
31+
snp_vcf: { type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], doc: "VCF file containing SNPs" }
3232
output_basename: { type: 'string', doc: "String value to use as the base of the output filename" }
3333
ram: { type: 'int?', default: 8, doc: "GB of memory to allocate to this task. default: 8; softcap" }
3434
cpu: { type: 'int?', default: 4, doc: "Number of CPUs to allocate to this task. default: 4" }
@@ -38,4 +38,4 @@ outputs:
3838
type: 'File'
3939
outputBinding:
4040
glob: '$(inputs.output_basename).merged.vcf.gz'
41-
secondaryFiles: ['.tbi']
41+
secondaryFiles: [{pattern: '.tbi', required: true}]

tools/filtering_defaults.cwl

+6-1
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,12 @@ requirements:
1616
- class: InlineJavascriptRequirement
1717
inputs:
1818
num_vcfs: int
19-
experiment_type: string
19+
experiment_type:
20+
type:
21+
- type: enum
22+
name: experiment_type
23+
symbols: ["WGS", "WXS", "Targeted Sequencing"]
24+
doc: "Experimental strategy used to sequence the data in the input_vcfs"
2025
outputs:
2126
low_data: boolean
2227
snp_tranches: string[]?

tools/gatk_applyrecalibration.cwl

+8-8
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
cwlVersion: v1.0
1+
cwlVersion: v1.2
22
class: CommandLineTool
33
id: gatk_applyrecalibration
44
requirements:
@@ -14,7 +14,7 @@ arguments:
1414
- position: 0
1515
shellQuote: false
1616
valueFrom: >-
17-
/gatk --java-options "-Xmx5g -Xms5g"
17+
/gatk --java-options "-Xms$(Math.floor((inputs.ram - 1)*1000/1.074-1))m -Xmx$(Math.floor(inputs.ram*1000/1.074-1))m"
1818
ApplyVQSR
1919
-O tmp.indel.recalibrated.vcf
2020
-V $(inputs.input_vcf.path)
@@ -24,7 +24,7 @@ arguments:
2424
--create-output-bam-index true
2525
-mode INDEL
2626

27-
/gatk --java-options "-Xmx5g -Xms5g"
27+
/gatk --java-options "-Xms$(Math.floor((inputs.ram - 1)*1000/1.074-1))m -Xmx$(Math.floor(inputs.ram*1000/1.074-1))m"
2828
ApplyVQSR
2929
-O scatter.filtered.vcf.gz
3030
-V tmp.indel.recalibrated.vcf
@@ -36,23 +36,23 @@ arguments:
3636
inputs:
3737
input_vcf:
3838
type: File
39-
secondaryFiles: [.tbi]
39+
secondaryFiles: [{pattern: '.tbi', required: true}]
4040
indels_recalibration:
4141
type: File
42-
secondaryFiles: [.idx]
42+
secondaryFiles: [{pattern: '.idx', required: true}]
4343
indels_tranches: File
4444
snps_recalibration:
4545
type: File
46-
secondaryFiles: [.idx]
46+
secondaryFiles: [{pattern: '.idx', required: true}]
4747
snps_tranches: File
4848
snp_ts_filter_level: { type: 'float', doc: "The truth sensitivity level at which to start filtering SNP data" }
4949
indel_ts_filter_level: { type: 'float', doc: "The truth sensitivity level at which to start filtering INDEL data" }
50-
cpu: { type: 'int?', default: 2, doc: "CPUs to allocate to this task." }
50+
cpu: { type: 'int?', default: 1, doc: "CPUs to allocate to this task." }
5151
ram: { type: 'int?', default: 7, doc: "GB of RAM to allocate to this task." }
5252

5353
outputs:
5454
recalibrated_vcf:
5555
type: File
5656
outputBinding:
5757
glob: scatter.filtered.vcf.gz
58-
secondaryFiles: [.tbi]
58+
secondaryFiles: [{pattern: '.tbi', required: true}]

tools/gatk_gatherfinalvcf.cwl

+1-1
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
cwlVersion: v1.0
1+
cwlVersion: v1.2
22
class: CommandLineTool
33
id: gatk_gathervcfs
44
requirements:

tools/gatk_gathertranches.cwl

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
cwlVersion: v1.0
1+
cwlVersion: v1.2
22
class: CommandLineTool
33
id: gatk_gathertranches
44
requirements:
@@ -13,7 +13,7 @@ arguments:
1313
- position: 0
1414
shellQuote: false
1515
valueFrom: >-
16-
/gatk --java-options "-Xmx6g -Xms6g"
16+
/gatk --java-options "-Xmx$(Math.floor(inputs.ram*1000/1.074-1))m -Xms$(Math.floor((inputs.ram - 1)*1000/1.074-1))m"
1717
GatherTranches
1818
--output snps.gathered.tranches
1919
inputs:

tools/gatk_gathervcfs.cwl

+2-2
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
cwlVersion: v1.0
1+
cwlVersion: v1.2
22
class: CommandLineTool
33
id: gatk_gathervcfs
44
requirements:
@@ -37,4 +37,4 @@ outputs:
3737
type: File
3838
outputBinding:
3939
glob: sites_only.vcf.gz
40-
secondaryFiles: [.tbi]
40+
secondaryFiles: [{pattern: '.tbi', required: true}]

tools/gatk_genomicsdbimport_genotypegvcfs.cwl

+4-4
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ arguments:
1414
- position: 0
1515
shellQuote: false
1616
valueFrom: >-
17-
/gatk --java-options "-Xms$(((inputs.max_memory - 5) * 500)m -Xmx$((inputs.max_memory - 5) * 1000)m"
17+
/gatk --java-options "-Xms$((inputs.ram - 5) * 500)m -Xmx$((inputs.ram - 5) * 1000)m"
1818
GenomicsDBImport
1919
-L $(inputs.interval.path)
2020
--interval-padding 5
@@ -24,7 +24,7 @@ arguments:
2424
- position: 10
2525
shellQuote: false
2626
valueFrom: >-
27-
&& /gatk --java-options "-Xms$(Math.floor(inputs.max_memory*500/1.074-1))m -Xmx$(Math.floor(inputs.max_memory*1000/1.074-1))m"
27+
&& /gatk --java-options "-Xms$(Math.floor(inputs.ram*500/1.074-1))m -Xmx$(Math.floor(inputs.ram*1000/1.074-1))m"
2828
GenotypeGVCFs
2929
-R $(inputs.reference_fasta.path)
3030
-D $(inputs.dbsnp_vcf.path)
@@ -40,9 +40,9 @@ inputs:
4040
interval: File
4141
reference_fasta: { type: 'File', secondaryFiles: [{pattern: '^.dict', required: true}, {pattern: '.fai', required: true}]}
4242
dbsnp_vcf: { type: 'File', secondaryFiles: [{pattern: '.idx', required: true}]}
43-
input_vcfs: { type: { type: array, items: 'File', inputBinding: { prefix: -V } }, inputBinding: { position: 1 }, secondaryFiles: [{pattern: '.tbi', requred: true}] }
43+
input_vcfs: { type: { type: array, items: 'File', inputBinding: { prefix: '-V' } }, inputBinding: { position: 1 }, secondaryFiles: [{pattern: '.tbi', required: true}] }
4444
genomicsdbimport_extra_args: { type: 'string?', inputBinding: { position: 1, shellQuote: false } }
45-
genotypegvcfs_extra_args: { type: 'string?' inputBinding: { position: 11, shellQuote: false } }
45+
genotypegvcfs_extra_args: { type: 'string?', inputBinding: { position: 11, shellQuote: false } }
4646
cpu: { type: 'int?', default: 5, doc: "CPUs to allocate to this task" }
4747
ram: { type: 'int?', default: 10, doc: "GB of RAM to allocate to this task" }
4848
outputs:

tools/gatk_import_genotype_filtergvcf_merge.cwl

-93
This file was deleted.

0 commit comments

Comments
 (0)