From 66034ae3b63ffa0adc2acd1b9b1203aec26d5c08 Mon Sep 17 00:00:00 2001 From: Haodong Chen Date: Fri, 8 Nov 2024 08:37:32 -0800 Subject: [PATCH 01/11] Add Sentieon HiFi workflow --- ...entieon-pacbio-hifi-longreads-workflow.cwl | 73 ++++++ tools/download_DNAscope_model.cwl | 73 ++++++ tools/sentieon_DNAscope_LongRead_CLI.cwl | 247 ++++++++++++++++++ 3 files changed, 393 insertions(+) create mode 100644 subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl create mode 100644 tools/download_DNAscope_model.cwl create mode 100644 tools/sentieon_DNAscope_LongRead_CLI.cwl diff --git a/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl b/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl new file mode 100644 index 0000000..66fd0f5 --- /dev/null +++ b/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl @@ -0,0 +1,73 @@ +cwlVersion: v1.2 +class: Workflow +id: sentieon-pacbio-hifi-longreads-workflow +doc: | + Run Sentieon PacBio HiFi workflow + Minimap2 + DNAscope + LongReadSV +requirements: +- class: InlineJavascriptRequirement +- class: MultipleInputFeatureRequirement +- class: ScatterFeatureRequirement +- class: StepInputExpressionRequirement +- class: SubworkflowFeatureRequirement +inputs: + input_unaligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: false}], + doc: "Unaligned BAM file and index containing long reads generated by an PacBio sequencer.", + "sbg:fileTypes": "BAM"} + indexed_reference_fasta: {type: 'File', secondaryFiles: [{pattern: '.fai', required: true}, + {pattern: '^.dict', required: false}], doc: "Reference fasta and fai index.", + "sbg:suggestedValue": {class: File, path: 60639014357c3a53540ca7a3, name: Homo_sapiens_assembly38.fasta, + secondaryFiles: [{class: File, path: 60639016357c3a53540ca7af, name: Homo_sapiens_assembly38.fasta.fai}, + {class: File, path: 60639019357c3a53540ca7e7, name: Homo_sapiens_assembly38.dict}]}, + "sbg:fileTypes": "FASTA, FA"} + output_basename: {type: 'string', doc: "String to use 
as basename for all workflow\ + \ outputs."} + sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\ + \ tools.", default: "10.5.64.221:8990"} + dnascope_cpu: {type: 'int?', doc: "CPU Cores for dnascope to use."} + dnascope_ram: {type: 'int?', doc: "RAM (in GB) for dnascope to use."} +outputs: + minimap2_aligned_bam: {type: 'File[]?', secondaryFiles: [{pattern: '.bai', required: true}], + outputSource: sentieon_longread_cli/out_alignments, doc: "Aligned BAM file from Minimap2."} + dnascope_small_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], + outputSource: sentieon_longread_cli/small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\ + \ small variant calls."} + longreadsv_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', + required: true}], outputSource: sentieon_longread_cli/structural_variants, doc: "VCF.GZ\ + \ file and index containing Sentieon LongReadSV-generated SV calls."} +steps: + download_model: + run: ../tools/download_DNAscope_model.cwl + in: + model_name: + valueFrom: "PacBio_HiFi-WGS" + out: [model_bundle] + sentieon_longread_cli: + run: ../tools/sentieon_DNAscope_LongRead_CLI.cwl + in: + input_bam: + source: + - input_unaligned_bam + linkMerge: merge_nested + reference: indexed_reference_fasta + output_vcf: + source: output_basename + valueFrom: $(self).vcf.gz + sentieon_license: sentieon_license + model_bundle: download_model/model_bundle + align: + valueFrom: | + $(1 == 1) + tech: + valueFrom: "HiFi" + bam_format: + valueFrom: | + $(1 == 1) + cpu_per_job: dnascope_cpu + mem_per_job: dnascope_ram + out: [small_variants, structural_variants, out_alignments, mosdepth_out] +$namespaces: + sbg: https://sevenbridges.com + diff --git a/tools/download_DNAscope_model.cwl b/tools/download_DNAscope_model.cwl new file mode 100644 index 0000000..73b5571 --- /dev/null +++ b/tools/download_DNAscope_model.cwl @@ -0,0 +1,73 @@ +cwlVersion: v1.2 +class: 
CommandLineTool +label: Download DNAscope model bundle +hints: + - class: ResourceRequirement + coresMin: 1 +requirements: + - class: ShellCommandRequirement + - class: InlineJavascriptRequirement + - class: DockerRequirement + dockerPull: python:3.7-slim + - class: InitialWorkDirRequirement + listing: + - entryname: get_dnascope_model.py + entry: | + #!/usr/bin/env python3 + + import argparse + import yaml + import requests + import sys + + def main(): + parser = argparse.ArgumentParser(description="Download DNAscope model bundle") + parser.add_argument("model_name", help="the name of the model bundle, e.g. Illumina_WGS") + args = parser.parse_args() + model_name = args.model_name.split("-") + sentieon_models_yaml = "https://github.com/Sentieon/sentieon-models/raw/refs/heads/main/sentieon_models.yaml" + response = requests.get(sentieon_models_yaml, allow_redirects=True) + content = response.content.decode("utf-8") + content = yaml.safe_load(content) + try: + url = content["DNAscope_bundles"][model_name[0]][model_name[1]] + r = requests.get(url, allow_redirects=True) + open(url.split("/")[-1], 'wb').write(r.content) + except: + open('empty.bundle', 'wb') + print('Models updated on: ' + content["Updated on"], file=sys.stderr) + + if __name__ == '__main__': + main() + + +arguments: + - position: 0 + valueFrom: 'pip install pyyaml requests;' + shellQuote: false + - position: 1 + valueFrom: 'python get_dnascope_model.py' + shellQuote: false +inputs: + - id: model_name + label: Model name + doc: Model platform and data type. 
For example, Illumina-WGS + type: + - type: enum + symbols: + - Illumina-WGS + - Illumina-WES + - MGI-WGS + - MGI-WES + - Element_Biosciences-WGS + - PacBio_HiFi-WGS + - Oxford_Nanopore-WGS + inputBinding: + position: 2 +outputs: + - id: model_bundle + label: DNAscope Model bundle + type: File + outputBinding: + glob: '*.bundle' + diff --git a/tools/sentieon_DNAscope_LongRead_CLI.cwl b/tools/sentieon_DNAscope_LongRead_CLI.cwl new file mode 100644 index 0000000..d51124c --- /dev/null +++ b/tools/sentieon_DNAscope_LongRead_CLI.cwl @@ -0,0 +1,247 @@ +cwlVersion: v1.2 +class: CommandLineTool +label: Sentieon_DNAscope_LongRead +doc: |- + This tool uses **Sentieon DNAscope** to call germline variants from PacBio HiFi reads [1]. + + ###References + + [1] [https://github.com/Sentieon/sentieon-cli/blob/main/docs/dnascope-longread.md](https://github.com/Sentieon/sentieon-cli/blob/main/docs/dnascope-longread.md) + +requirements: +- class: ShellCommandRequirement +- class: ResourceRequirement + coresMin: |- + ${ + if (inputs.cpu_per_job) + { + return inputs.cpu_per_job + } + else + { + return 36 + } + } + ramMin: |- + ${ + if (inputs.mem_per_job) + { + return inputs.mem_per_job + } + else + { + return 71000 + } + } +- class: DockerRequirement + dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202308.03 +- class: EnvVarRequirement + envDef: + - envName: SENTIEON_LICENSE + envValue: $(inputs.sentieon_license) +- class: InlineJavascriptRequirement + +inputs: + sentieon_license: + label: Sentieon license + doc: License server host and port + type: string + reference: + type: File + doc: "Fasta for reference genome" + inputBinding: + position: 1 + prefix: -r + secondaryFiles: + - pattern: .fai + required: true + - pattern: ^.dict + required: false + sbg:fileTypes: FA, FASTA + fastq: + type: File[]? + doc: "Sample fastq files" + inputBinding: + position: 2 + prefix: --fastq + readgroups: + type: string[]? 
+ doc: "Readgroup information for the fastq files" + inputBinding: + position: 3 + prefix: --readgroups + input_bam: + type: File[]? + doc: "sample BAM or CRAM file" + inputBinding: + position: 4 + prefix: -i + sbg:fileTypes: BAM, CRAM + secondaryFiles: + - pattern: ^.bai + required: false + - pattern: ^.crai + required: false + - pattern: .bai + required: false + - pattern: .crai + required: false + align: + type: boolean? + default: false + inputBinding: + position: 5 + prefix: --align + model_bundle: + type: File + doc: "The model bundle file" + inputBinding: + position: 6 + prefix: -m + tech: + type: string? + doc: "{HiFi,ONT} Sequencing technology used to generate the reads. (default: 'HiFi')" + inputBinding: + position: 7 + prefix: --tech + dbSNP: + type: File? + doc: "dbSNP vcf file Supplying this file will annotate variants with their dbSNP refSNP ID numbers." + inputBinding: + position: 8 + prefix: -d + secondaryFiles: + - pattern: .tbi + required: false + - pattern: .idx + required: false + diploid_bed: + type: File? + doc: "Region BED file. Supplying this file will limit variant calling to the intervals inside the BED file." + inputBinding: + position: 9 + prefix: -b + sbg:fileTypes: BED + haploid_bed: + type: File? + inputBinding: + position: 10 + prefix: --haploid- + sbg:fileTypes: BED + gvcf: + type: boolean? + default: false + inputBinding: + position: 11 + prefix: --gvcf + bam_format: + type: boolean? + default: false + doc: "Use the BAM format instead of CRAM for output aligned files (default: False)" + inputBinding: + position: 12 + prefix: --bam_format + cores: + type: int? + doc: "Number of threads/processes to use" + inputBinding: + position: 13 + prefix: -t + skip-small-variants: + type: boolean? + default: false + doc: "Skip small variant (SNV/indel) calling (default: False)" + inputBinding: + position: 14 + prefix: --skip-small-variants + skip-svs: + type: boolean? 
+ default: false + doc: "Skip SV calling (default: False)" + inputBinding: + position: 15 + prefix: --skip-svs + skip-mosdepth: + type: boolean? + default: false + doc: "Skip QC with mosdepth (default: False)" + inputBinding: + position: 16 + prefix: --skip-mosdepth + input_ref: + type: File? + doc: "Used to decode the input alignment file. Required if the input file is in the CRAM/uCRAM formats" + inputBinding: + position: 17 + prefix: --input_ref + secondaryFiles: + - pattern: .fai + required: true + fastq_taglist: + type: string? + doc: "A comma-separated list of tags to retain. Defaults to ''*'' and the 'RG' tag is required" + inputBinding: + position: 18 + prefix: --fastq_taglist + minimap2_args: + type: string? + doc: "Extra arguments for sentieon minimap2 (default: '-Y')" + inputBinding: + position: 20 + prefix: --minimap2_args + util_sort_args: + type: string? + doc: "Extra arguments for sentieon util sort (default: '--cram_write_options version=3.0,compressor=rans')" + inputBinding: + position: 21 + prefix: --util_sort_args + output_vcf: + type: string + doc: "Output VCF File. The file name must end in .vcf.gz" + inputBinding: + position: 100 + cpu_per_job: + label: CPU per job + doc: CPU per job + type: int? + mem_per_job: + label: Memory per job + doc: Memory per job[MB]. + type: int? + +outputs: + small_variants: + type: File + secondaryFiles: + - pattern: .tbi + required: true + outputBinding: + glob: $(inputs.output_vcf) + sbg:fileTypes: VCF.GZ + structural_variants: + type: File + secondaryFiles: + - pattern: .tbi + required: true + outputBinding: + glob: $(inputs.output_vcf.replace(".vcf.gz", ".sv.vcf.gz")) + sbg:fileTypes: VCF.GZ + out_alignments: + type: File[]? + secondaryFiles: + - pattern: .bai + required: false + - pattern: .crai + required: false + outputBinding: + glob: ["*.cram", "*.bam"] + mosdepth_out: + type: File[]? 
+ outputBinding: + glob: '*_mosdepth_*' + +baseCommand: +- sentieon-cli +- dnascope-longread +$namespaces: + sbg: https://sevenbridges.com From 2eb39a009ab32fb836360d59cd701162db89c55a Mon Sep 17 00:00:00 2001 From: Haodong Chen Date: Thu, 12 Dec 2024 19:54:33 -0800 Subject: [PATCH 02/11] Update PacBio HiFi workflow --- ...entieon-pacbio-hifi-longreads-workflow.cwl | 32 ++- ...reads-workflow-hifi-ubam-single-sample.cwl | 230 ++++++++++++++++++ 2 files changed, 259 insertions(+), 3 deletions(-) create mode 100644 workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl diff --git a/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl b/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl index 66fd0f5..efc5ba9 100644 --- a/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl +++ b/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl @@ -14,7 +14,7 @@ requirements: - class: SubworkflowFeatureRequirement inputs: input_unaligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: false}], - doc: "Unaligned BAM file and index containing long reads generated by an PacBio sequencer.", + doc: "Unaligned BAM file and index containing long reads generated by a PacBio sequencer.", "sbg:fileTypes": "BAM"} indexed_reference_fasta: {type: 'File', secondaryFiles: [{pattern: '.fai', required: true}, {pattern: '^.dict', required: false}], doc: "Reference fasta and fai index.", @@ -29,8 +29,8 @@ inputs: dnascope_cpu: {type: 'int?', doc: "CPU Cores for dnascope to use."} dnascope_ram: {type: 'int?', doc: "RAM (in GB) for dnascope to use."} outputs: - minimap2_aligned_bam: {type: 'File[]?', secondaryFiles: [{pattern: '.bai', required: true}], - outputSource: sentieon_longread_cli/out_alignments, doc: "Aligned BAM file from Minimap2."} + minimap2_aligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: true}], + outputSource: array_to_file/out_alignments, doc: "Aligned BAM file from Minimap2."} dnascope_small_variants: {type: 
'File', secondaryFiles: [{pattern: '.tbi', required: true}], outputSource: sentieon_longread_cli/small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\ \ small variant calls."} @@ -68,6 +68,32 @@ steps: cpu_per_job: dnascope_cpu mem_per_job: dnascope_ram out: [small_variants, structural_variants, out_alignments, mosdepth_out] + array_to_file: + in: + infile: + source: sentieon_longread_cli/out_alignments + valueFrom: | + $(self[0]) + out: [out_alignments] + run: + cwlVersion: v1.2 + class: CommandLineTool + requirements: + - class: InlineJavascriptRequirement + doc: | + Select the first item from an array of BAM files. + baseCommand: [echo, done] + inputs: + infile: { type: 'File', secondaryFiles: [{pattern: '.bai', required: true}]} + outputs: + out_alignments: + type: File + outputBinding: + outputEval: | + $(inputs.infile) + secondaryFiles: + - pattern: .bai + required: true $namespaces: sbg: https://sevenbridges.com diff --git a/workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl b/workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl new file mode 100644 index 0000000..58010c7 --- /dev/null +++ b/workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl @@ -0,0 +1,230 @@ +cwlVersion: v1.2 +class: Workflow +id: kfdrc-pacbio-longreads-workflow-hifi-single-sample-ubam +label: Kids First DRC PacBio LongReads Workflow (HiFi single-sample uBAM) +doc: | + # Kids First Data Resource Center Pacific Biosciences Long Reads Alignment and Variant Calling Workflow + +

+ +

+ + The Kids First Data Resource Center (KFDRC) Pacific Biosciences (PacBio) + Long Reads Alignment and Variant Calling Workflow is a Common Workflow Language + (CWL) implementation of various softwares used to take reads information + generated by PacBio long reads sequencers and generate alignment and variant + information. This pipeline was made possible thanks to significant software and + support contributions from both Sentieon and Wang Genomics Lab. For more + information on our collaborators, check out their websites: + - Sentieon: https://www.sentieon.com/ + - Wang Genomics Lab: https://wglab.org/ + + ## Relevant Softwares and Versions + - [samtools head](http://www.htslib.org/doc/samtools-head.html): `1.17` + - [samtools fastq](http://www.htslib.org/doc/samtools-fastq.html): `1.15.1` + - [Sentieon Minimap2](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#minimap2-binary): `202308.03` + - [Sentieon util sort](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#util-binary): `202308.03` + - [Sentieon DNAScope HiFi](https://support.sentieon.com/manual/): `202308.03` + - [Sentieon LongReadSV](https://support.sentieon.com/manual/): `202308.03` + - [LongReadSum](https://github.com/WGLab/LongReadSum#readme): `1.2.0` + - [Sniffles](https://github.com/fritzsedlazeck/Sniffles#readme): `2.0.7` + - [pbsv](https://github.com/PacificBiosciences/pbsv#readme): `2.9.0` + + ## Input Files + - `input_unaligned_bam`: The primary input of the PacBio HiFi Long Reads Workflow is an unaligned BAM. RG fields are required. Only one SM (Sample name) is allowed. + - `indexed_reference_fasta`: Any suitable human reference genome. KFDRC uses `Homo_sapiens_assembly38.fasta` from Broad Institute. + + ## Output Files + - `dnascope_small_variants`: BGZIP and TABIX indexed VCF containing small variant calls made by Sentieon DNAScope HiFi on `minimap2_aligned_bam`. 
+ - `longreadsum_bam_metrics`: BGZIP TAR containing various metrics collected by LongReadSum from the `minimap2_aligned_bam`. + - `minimap2_aligned_bam`: Indexed BAM file containing reads from the `input_unaligned_bam` aligned to the `indexed_reference_fasta`. + - `pbsv_structural_variants`: BGZIP and TABIX indexed VCF containing structural variant calls made by pbsv on the `minimap2_aligned_bam`. + - `sniffles_structural_variants`: BGZIP and TABIX indexed VCF containing structural variant calls made by Sniffles on the `minimap2_aligned_bam`. + - `longreadsv_structural_variants`: BGZIP and TABIX indexed VCF containing structural variant calls made by Sentieon LongReadSV on the `minimap2_aligned_bam`. + + ## Generalized Process + 1. Read group information (`@RG`) is harvested from the `input_unaligned_bam` header using `samtools head` and `grep`. + 1. Align `input_unaligned_bam` to `indexed_reference_fasta` with the above `@RG` information using samtools fastq, Sentieon Minimap2, and Sentieon sort. + 1. Generate long reads alignment metrics from the `minimap2_aligned_bam` using LongReadSum. + 1. Generate structural variant calls from the `minimap2_aligned_bam` using pbsv. + 1. Generate structural variant calls from the `minimap2_aligned_bam` using Sniffles. + 1. Generate structural variant calls from the `minimap2_aligned_bam` using Sentieon LongReadSV. + 1. Generate small variant from the `minimap2_aligned_bam` using Sentieon DNAScope HiFi. 
+ + ## Basic Info + - [D3b dockerfiles](https://github.com/d3b-center/bixtools) + - Testing Tools: + - [Seven Bridges Cavatica Platform](https://cavatica.sbgenomics.com/) + - [Common Workflow Language reference implementation (cwltool)](https://github.com/common-workflow-language/cwltool/) + + ## References + - KFDRC AWS s3 bucket: s3://kids-first-seq-data/broad-references/ + - Cavatica: https://cavatica.sbgenomics.com/u/kfdrc-harmonization/kf-references/ + - Broad Institute Google Cloud: https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0/ +requirements: +- class: InlineJavascriptRequirement +- class: MultipleInputFeatureRequirement +- class: ScatterFeatureRequirement +- class: StepInputExpressionRequirement +- class: SubworkflowFeatureRequirement +inputs: + input_unaligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: false}], + doc: "Unaligned BAM file and index containing HiFi long reads generated by a PacBio sequencer.", + "sbg:fileTypes": "BAM"} + indexed_reference_fasta: {type: 'File', secondaryFiles: [{pattern: '.fai', required: true}, + {pattern: '^.dict', required: true}], doc: "Reference fasta and fai index.", + "sbg:suggestedValue": {class: File, path: 60639014357c3a53540ca7a3, name: Homo_sapiens_assembly38.fasta, + secondaryFiles: [{class: File, path: 60639016357c3a53540ca7af, name: Homo_sapiens_assembly38.fasta.fai}, + {class: File, path: 60639019357c3a53540ca7e7, name: Homo_sapiens_assembly38.dict}]}, + "sbg:fileTypes": "FASTA, FA"} + output_basename: {type: 'string', doc: "String to use as basename for all workflow\ + \ outputs."} + sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\ + \ tools.", default: "10.5.64.221:8990"} + longreadsum_cpu: {type: 'int?', doc: "CPU Cores for longreadsum to use."} + dnascope_cpu: {type: 'int?', doc: "CPU Cores for dnascope to use."} + dnascope_ram: {type: 'int?', doc: "RAM (in GB) for dnascope to use."} + pbsv_cpu: {type: 
'int?', doc: "CPU Cores for pbsv to use."} + pbsv_ram: {type: 'int?', doc: "RAM (in GB) for pbsv to use."} + sniffles_cpu: {type: 'int?', doc: "CPU Cores for sniffles to use."} + sniffles_ram: {type: 'int?', doc: "RAM (in GB) for sniffles to use."} +outputs: + minimap2_aligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: true}], + outputSource: dnascope_longread/minimap2_aligned_bam, doc: "Aligned BAM file from Minimap2."} + longreadsum_bam_metrics: {type: 'File', outputSource: tar_longreadsum_dir/output, + doc: "TAR.GZ file containing longreadsum-generated metrics."} + dnascope_small_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], + outputSource: dnascope_longread/dnascope_small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\ + \ small variant calls."} + pbsv_strucutural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], + outputSource: bgzip_tabix_index_pbsv_vcf/output, doc: "VCF.GZ file and index containing\ + \ pbsv-generated SV calls."} + sniffles_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', + required: true}], outputSource: sniffles/output_vcf, doc: "VCF.GZ file and\ + \ index containing sniffles-generated SV calls."} + longreadsv_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', + required: true}], outputSource: dnascope_longread/longreadsv_structural_variants, doc: "VCF.GZ\ + \ file and index containing Sentieon LongReadSV-generated SV calls."} +steps: + dnascope_longread: + run: ../subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl + in: + input_unaligned_bam: input_unaligned_bam + indexed_reference_fasta: indexed_reference_fasta + output_basename: output_basename + sentieon_license: sentieon_license + dnascope_cpu: dnascope_cpu + dnascope_ram: dnascope_ram + out: [minimap2_aligned_bam, dnascope_small_variants, longreadsv_structural_variants] + longreadsum: + hints: + - class: "sbg:AWSInstanceType" + value: 
c5.9xlarge + run: ../tools/longreadsum.cwl + in: + input_type: + valueFrom: "bam" + input_file: dnascope_longread/minimap2_aligned_bam + output_dir: output_basename + output_basename: output_basename + log: + valueFrom: "test.log" + log_level: + valueFrom: "2" + cpu: longreadsum_cpu + out: [outputs] + tar_longreadsum_dir: + run: ../tools/tar.cwl + in: + output_filename: + source: output_basename + valueFrom: $(self).longreadsum.tar.gz + input_dir: longreadsum/outputs + out: [output] + pbsv_discover: + hints: + - class: "sbg:AWSInstanceType" + value: c5.9xlarge + run: ../tools/pbsv_discover.cwl + in: + input_bam: dnascope_longread/minimap2_aligned_bam + output_filename: + source: output_basename + valueFrom: $(self).pbsv.svsig.gz + hifi_preset: + valueFrom: | + $(1 == 1) + cpu: pbsv_cpu + ram: pbsv_ram + out: [output_svsig] + pbsv_call: + hints: + - class: "sbg:AWSInstanceType" + value: c5.9xlarge + run: ../tools/pbsv_call.cwl + in: + reference_fasta: indexed_reference_fasta + input_svsig: pbsv_discover/output_svsig + output_filename: + source: output_basename + valueFrom: $(self).pbsv.vcf + hifi_preset: + valueFrom: | + $(1 == 1) + cpu: pbsv_cpu + ram: pbsv_ram + out: [output_vcf] + bgzip_tabix_index_pbsv_vcf: + run: ../tools/bgzip_tabix_index.cwl + in: + input_vcf: pbsv_call/output_vcf + cpu: pbsv_cpu + out: [output] + sniffles: + hints: + - class: "sbg:AWSInstanceType" + value: c5.9xlarge + run: ../tools/sniffles.cwl + in: + input_bam: + source: dnascope_longread/minimap2_aligned_bam + valueFrom: $([self]) + vcf_output_filename: + source: output_basename + valueFrom: $(self).sniffles.vcf.gz + reference_fasta: indexed_reference_fasta + cpu: sniffles_cpu + ram: sniffles_ram + out: [output_vcf, output_snf] +$namespaces: + sbg: https://sevenbridges.com +hints: +- class: "sbg:maxNumberOfParallelInstances" + value: 2 +"sbg:license": Apache License 2.0 +"sbg:publisher": KFDRC +"sbg:categories": +- ALIGNMENT +- DNA +- INDEL +- LONG +- LONGREADS +- LONGREADSUM +- METRICS 
+- NANOCALLER +- PACBIO +- PACIFIC +- PBMM2 +- PBSV +- SENTIEON +- SNIFFLES +- SNP +- SOMATIC +- STRUCTURAL +- SV +- VARIANT +- WGS +- WXS +"sbg:links": +- id: 'https://github.com/kids-first/kf-longreads-workflow/releases/tag/v2.0.2' + label: github-release From 49f0ae983c1993f93dab49dfb981d1273c6dba77 Mon Sep 17 00:00:00 2001 From: Haodong Chen Date: Thu, 12 Dec 2024 19:57:21 -0800 Subject: [PATCH 03/11] Update doc --- docs/dockers_pacbio.md | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/docs/dockers_pacbio.md b/docs/dockers_pacbio.md index f804e8f..c262759 100644 --- a/docs/dockers_pacbio.md +++ b/docs/dockers_pacbio.md @@ -10,9 +10,6 @@ pbsv_call.cwl|quay.io/biocontainers/pbsv:2.9.0--h9ee0642_0 pbsv_discover.cwl|quay.io/biocontainers/pbsv:2.9.0--h9ee0642_0 samtools_head.cwl|staphb/samtools:1.17 samtools_split.cwl|staphb/samtools:1.17 -sentieon_DNAscope_LongRead.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202112.01_hifi -sentieon_LongReadSV.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202112.06 -sentieon_ReadWriter.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202112.01_hifi -sentieon_minimap2.cwl|pgc-images.sbgenomics.com/d3b-bixu/sentieon:202112.01_hifi +sentieon_DNAscope_LongRead_CLI.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202308.03 sniffles.cwl|pgc-images.sbgenomics.com/d3b-bixu/sniffles:2.0.7 tar.cwl|None From bd37fd27cd3d307e870d87baf5cc17d93ee05fa0 Mon Sep 17 00:00:00 2001 From: Haodong Chen Date: Thu, 12 Dec 2024 23:52:13 -0800 Subject: [PATCH 04/11] Update the PacBio pipeline --- tools/sentieon_DNAscope_LongRead.cwl | 169 ------------------ tools/sentieon_DNAscope_LongRead_CLI.cwl | 2 +- tools/sentieon_LongReadSV.cwl | 134 -------------- tools/sentieon_ReadWriter.cwl | 2 +- tools/sentieon_minimap2.cwl | 2 +- workflows/kfdrc-pacbio-longreads-workflow.cwl | 49 +++-- 6 files changed, 26 insertions(+), 332 deletions(-) delete mode 100644 tools/sentieon_DNAscope_LongRead.cwl delete mode 100644 tools/sentieon_LongReadSV.cwl diff 
--git a/tools/sentieon_DNAscope_LongRead.cwl b/tools/sentieon_DNAscope_LongRead.cwl deleted file mode 100644 index 14855f2..0000000 --- a/tools/sentieon_DNAscope_LongRead.cwl +++ /dev/null @@ -1,169 +0,0 @@ -cwlVersion: v1.2 -class: CommandLineTool -label: Sentieon_DNAscope_LongRead -doc: |- - This tool uses **Sentieon DNAscope** to call germline variants from PacBio HiFi reads [1]. - - ### Input data requirements - - - **Aligned reads**: The pipeline will take PacBio HiFi reads that have been aligned to a reference genome with `pbmm2` or `minimap2`. - - **The Reference genome**: A reference genome file in FASTA format with its index file (.fai). - - ### Common Issues and Important Notes - - * By suppling an optional MHC BED file, additional special handling can be applied to the MHC region to further increase variant calling accuracy. - * Currently, the pipeline is only recommended for use with samples from diploid organisms. For samples with both diploid and haploid chromosomes, the `-b INTERVAL` option can be used to limit variant calling to diploid chromosomes. 
- - ###References - - [1] [https://support.sentieon.com/appnotes/dnascope_hifi/](https://support.sentieon.com/appnotes/dnascope_hifi/) - -requirements: -- class: ShellCommandRequirement -- class: ResourceRequirement - coresMin: |- - ${ - if (inputs.cpu_per_job) - { - return inputs.cpu_per_job - } - else - { - return 36 - } - } - ramMin: |- - ${ - if (inputs.mem_per_job) - { - return inputs.mem_per_job - } - else - { - return 71000 - } - } -- class: DockerRequirement - dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202112.01_hifi -- class: EnvVarRequirement - envDef: - - envName: SENTIEON_LICENSE - envValue: $(inputs.sentieon_license) -- class: InlineJavascriptRequirement - -inputs: -- id: sentieon_license - label: Sentieon license - doc: License server host and port - type: string -- id: reference - label: Reference - doc: Reference fasta with associated fai index - type: File - secondaryFiles: - - pattern: .fai - required: true - - pattern: ^.dict - required: false - inputBinding: - prefix: -r - position: 10 - shellQuote: false - sbg:fileTypes: FA, FASTA -- id: input_bam - label: Input BAM - doc: Input BAM file - type: File - secondaryFiles: - - pattern: ^.bai - required: false - - pattern: ^.crai - required: false - - pattern: .bai - required: false - - pattern: .crai - required: false - inputBinding: - prefix: -i - position: 11 - shellQuote: false - sbg:fileTypes: BAM, CRAM -- id: output_file_name - label: Output file name - doc: The output VCF file name. Must end with ".gz". - type: string? -- id: dbSNP - label: dbSNP VCF file - doc: |- - Supplying this file will annotate variants with their dbSNP refSNP ID numbers. (optional) - type: File? - secondaryFiles: - - pattern: .tbi - required: false - - pattern: .idx - required: false - inputBinding: - prefix: -d - position: 30 - shellQuote: false -- id: bed - label: Region BED file - doc: |- - Supplying this file will limit variant calling to the intervals inside the BED file. (optional) - type: File? 
- inputBinding: - prefix: -b - position: 39 - shellQuote: false - sbg:fileTypes: BED -- id: mhc - label: MHC BED file - doc: |- - Supplying this file will enable the special handling of the MHC region. (optional) - type: File? - inputBinding: - prefix: -B - position: 60 - shellQuote: false - sbg:fileTypes: BED -- id: cpu_per_job - label: CPU per job - doc: CPU per job - type: int? -- id: mem_per_job - label: Memory per job - doc: Memory per job[MB] - type: int? - -outputs: -- id: output_vcf - type: File - secondaryFiles: - - pattern: .tbi - required: true - outputBinding: - glob: '*.vcf.gz' - sbg:fileTypes: VCF.GZ - -baseCommand: -- /bin/bash -- /opt/dnascope_hifi/DNAscopeHiFiBeta0.4.pipeline/dnascope_HiFi.sh -arguments: -- prefix: '' - position: 1 - valueFrom: -m /opt/dnascope_hifi/DNAscopeHiFiBeta0.4.pipeline/DNAscopeHiFiBeta0.4.model - shellQuote: false -- prefix: '' - position: 100 - valueFrom: |- - ${ - if (inputs.output_file_name) - return inputs.output_file_name - else - var basename = inputs.input_bam.nameroot - return basename.concat(".vcf.gz") - } - shellQuote: false - -$namespaces: - sbg: https://sevenbridges.com diff --git a/tools/sentieon_DNAscope_LongRead_CLI.cwl b/tools/sentieon_DNAscope_LongRead_CLI.cwl index d51124c..350506d 100644 --- a/tools/sentieon_DNAscope_LongRead_CLI.cwl +++ b/tools/sentieon_DNAscope_LongRead_CLI.cwl @@ -126,7 +126,7 @@ inputs: type: File? inputBinding: position: 10 - prefix: --haploid- + prefix: --haploid-bed sbg:fileTypes: BED gvcf: type: boolean? diff --git a/tools/sentieon_LongReadSV.cwl b/tools/sentieon_LongReadSV.cwl deleted file mode 100644 index a3d8ca6..0000000 --- a/tools/sentieon_LongReadSV.cwl +++ /dev/null @@ -1,134 +0,0 @@ -cwlVersion: v1.2 -class: CommandLineTool -id: sentieon_LongReadSV -doc: |- - Sentieon SV calling for PacBio HiFi and Oxford Nanopore long reads. - - ### Inputs: - #### Required - - ``Reference``: Location of the reference FASTA file. - - ``Input BAM``: Location of the BAM/CRAM input file. 
- - ``Platform``: PacBio HiFi or Oxford Nanopore - -requirements: -- class: ShellCommandRequirement -- class: InlineJavascriptRequirement -- class: ResourceRequirement - coresMin: $(inputs.cpu) - ramMin: $(inputs.ram * 1000) -- class: DockerRequirement - dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202112.06 -- class: EnvVarRequirement - envDef: - - envName: SENTIEON_LICENSE - envValue: $(inputs.sentieon_license) -baseCommand: -- sentieon -- driver -arguments: -- prefix: '--algo' - position: 10 - valueFrom: LongReadSV - shellQuote: false -inputs: - sentieon_license: - type: 'string' - doc: License server host and port - reference: - type: 'File' - secondaryFiles: - - pattern: .fai - required: true - - pattern: ^.dict - required: true - inputBinding: - prefix: -r - position: 0 - shellQuote: false - doc: Reference fasta with associated fai index - sbg:fileTypes: FA, FASTA - input_bam: - type: 'File' - secondaryFiles: - - pattern: ^.bai - required: false - - pattern: ^.crai - required: false - - pattern: .bai - required: false - - pattern: .crai - required: false - inputBinding: - prefix: -i - position: 1 - shellQuote: false - doc: Input BAM file - sbg:fileTypes: BAM, CRAM - platform: - type: - - 'null' - - name: platform - type: enum - symbols: - - PacBioHiFi - - ONT - default: PacBioHiFi - inputBinding: - prefix: --model - position: 11 - shellQuote: true - valueFrom: |- - ${ - if (self === "PacBioHiFi") { - return "/opt/dnascope_models/SentieonLongReadSVHiFiBeta0.1.model"; - } - else if (self === "ONT") { - return "/opt/dnascope_models/SentieonLongReadSVONTBeta0.1.model"; - } - return "" - } - doc: |- - PacBio HiFi or Oxford Nanopore (ONT) - sbg:toolDefaultValue: PacBioHiFi - min_sv_size: - type: 'int?' - inputBinding: - prefix: --min_sv_size - shellQuote: true - position: 12 - doc: minimum SV size in basepairs to output - sbg:toolDefaultValue: 40 - min_map_qual: - type: 'int?' 
- inputBinding: - prefix: --min_map_qual - shellQuote: true - position: 12 - doc: minimum read mapping quality - sbg:toolDefaultValue: 20 - output_file_name: - type: 'string' - inputBinding: - position: 100 - shellQuote: true - doc: The output VCF file name. Must end with ".vcf.gz". - cpu: - type: 'int?' - default: 36 - doc: CPUs to allocate to this task - ram: - type: 'int?' - default: 36 - doc: GB of RAM to allocate to this task -outputs: - output_vcf: - type: 'File' - secondaryFiles: - - pattern: .tbi - required: true - outputBinding: - glob: '*.vcf.gz' - sbg:fileTypes: VCF.GZ - -$namespaces: - sbg: https://sevenbridges.com diff --git a/tools/sentieon_ReadWriter.cwl b/tools/sentieon_ReadWriter.cwl index cb4dfb7..68a8cc4 100644 --- a/tools/sentieon_ReadWriter.cwl +++ b/tools/sentieon_ReadWriter.cwl @@ -17,7 +17,7 @@ requirements: ramMin: | $(inputs.mem_per_job ? inputs.mem_per_job : 16000) - class: DockerRequirement - dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202112.01_hifi + dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202308.03 - class: EnvVarRequirement envDef: - envName: SENTIEON_LICENSE diff --git a/tools/sentieon_minimap2.cwl b/tools/sentieon_minimap2.cwl index 5e0bbad..2bfc931 100644 --- a/tools/sentieon_minimap2.cwl +++ b/tools/sentieon_minimap2.cwl @@ -18,7 +18,7 @@ requirements: coresMin: $(inputs.cpu_per_job) ramMin: $(inputs.mem_per_job * 1000) - class: DockerRequirement - dockerPull: pgc-images.sbgenomics.com/d3b-bixu/sentieon:202112.01_hifi + dockerPull: pgc-images.sbgenomic6s.com/hdchen/sentieon:202308.03 - class: EnvVarRequirement envDef: - envName: SENTIEON_LICENSE diff --git a/workflows/kfdrc-pacbio-longreads-workflow.cwl b/workflows/kfdrc-pacbio-longreads-workflow.cwl index d0bf8d5..42e0e57 100644 --- a/workflows/kfdrc-pacbio-longreads-workflow.cwl +++ b/workflows/kfdrc-pacbio-longreads-workflow.cwl @@ -22,10 +22,10 @@ doc: | ## Relevant Softwares and Versions - [samtools head](http://www.htslib.org/doc/samtools-head.html): 
`1.17` - [samtools fastq](http://www.htslib.org/doc/samtools-fastq.html): `1.15.1` - - [Sentieon Minimap2](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#minimap2-binary): `202112.01` - - [Sentieon util sort](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#util-binary): `202112.01` - - [Sentieon DNAScope HiFi](https://support.sentieon.com/manual/): `202112.01` - - [Sentieon LongReadSV](https://support.sentieon.com/manual/): `202112.06` + - [Sentieon Minimap2](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#minimap2-binary): `202308.03` + - [Sentieon util sort](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#util-binary): `202308.03` + - [Sentieon DNAScope HiFi](https://support.sentieon.com/manual/): `202308.03` + - [Sentieon LongReadSV](https://support.sentieon.com/manual/): `202308.03` - [LongReadSum](https://github.com/WGLab/LongReadSum#readme): `1.2.0` - [Sniffles](https://github.com/fritzsedlazeck/Sniffles#readme): `2.0.7` - [pbsv](https://github.com/PacificBiosciences/pbsv#readme): `2.9.0` @@ -112,15 +112,13 @@ inputs: pbsv_ram: {type: 'int?', doc: "RAM (in GB) for pbsv to use."} sniffles_cpu: {type: 'int?', doc: "CPU Cores for sniffles to use."} sniffles_ram: {type: 'int?', doc: "RAM (in GB) for sniffles to use."} - longreadsv_cpu: {type: 'int?', doc: "CPU Cores for Sentieon LongReadSV to use."} - longreadsv_ram: {type: 'int?', doc: "RAM (in GB) for Sentieon LongReadSV to use."} outputs: minimap2_aligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: true}], outputSource: clt_pickvalue/outfile, doc: "Aligned BAM file from Minimap2."} longreadsum_bam_metrics: {type: 'File', outputSource: tar_longreadsum_dir/output, doc: "TAR.GZ file containing longreadsum-generated metrics."} dnascope_small_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], - outputSource: dnascope/output_vcf, doc: "VCF.GZ file and index containing 
DNAscope-generated\ + outputSource: dnascope/small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\ \ small variant calls."} pbsv_strucutural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], outputSource: bgzip_tabix_index_pbsv_vcf/output, doc: "VCF.GZ file and index containing\ @@ -129,7 +127,7 @@ outputs: required: true}], outputSource: sniffles/output_vcf, doc: "VCF.GZ file and\ \ index containing sniffles-generated SV calls."} longreadsv_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', - required: true}], outputSource: sentieon_longreadsv/output_vcf, doc: "VCF.GZ\ + required: true}], outputSource: dnascope/structural_variants, doc: "VCF.GZ\ \ file and index containing Sentieon LongReadSV-generated SV calls."} steps: samtools_split: @@ -228,20 +226,33 @@ steps: valueFrom: $(self).longreadsum.tar.gz input_dir: longreadsum/outputs out: [output] + download_model: + run: ../tools/download_DNAscope_model.cwl + in: + model_name: + valueFrom: "PacBio_HiFi-WGS" + out: [model_bundle] dnascope: - run: ../tools/sentieon_DNAscope_LongRead.cwl + run: ../tools/sentieon_DNAscope_LongRead_CLI.cwl when: $(inputs.minimap2_preset != "map-pb") in: minimap2_preset: minimap2_preset sentieon_license: sentieon_license reference: indexed_reference_fasta - input_bam: clt_pickvalue/outfile - output_file_name: + input_bam: + source: [clt_pickvalue/outfile] + linkMerge: merge_flattened + model_bundle: download_model/model_bundle + tech: + valueFrom: "HiFi" + output_vcf: source: output_basename valueFrom: $(self).dnascope.vcf.gz + skip-mosdepth: + default: true cpu_per_job: dnascope_cpu mem_per_job: dnascope_ram - out: [output_vcf] + out: [small_variants, structural_variants] pbsv_discover: hints: - class: "sbg:AWSInstanceType" @@ -302,20 +313,6 @@ steps: cpu: sniffles_cpu ram: sniffles_ram out: [output_vcf, output_snf] - sentieon_longreadsv: - run: ../tools/sentieon_LongReadSV.cwl - in: - sentieon_license: 
sentieon_license - reference: indexed_reference_fasta - input_bam: clt_pickvalue/outfile - platform: - valueFrom: "PacBioHiFi" - output_file_name: - source: output_basename - valueFrom: $(self).longreadsv.vcf.gz - cpu: longreadsv_cpu - ram: longreadsv_ram - out: [output_vcf] $namespaces: sbg: https://sevenbridges.com hints: From 3f9640ff34085f8d68963c9092405f9035d39308 Mon Sep 17 00:00:00 2001 From: Haodong Chen Date: Fri, 13 Dec 2024 00:26:38 -0800 Subject: [PATCH 05/11] Fix a typo --- tools/sentieon_minimap2.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/sentieon_minimap2.cwl b/tools/sentieon_minimap2.cwl index 2bfc931..b1dc64a 100644 --- a/tools/sentieon_minimap2.cwl +++ b/tools/sentieon_minimap2.cwl @@ -18,7 +18,7 @@ requirements: coresMin: $(inputs.cpu_per_job) ramMin: $(inputs.mem_per_job * 1000) - class: DockerRequirement - dockerPull: pgc-images.sbgenomic6s.com/hdchen/sentieon:202308.03 + dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202308.03 - class: EnvVarRequirement envDef: - envName: SENTIEON_LICENSE From 74a073a24cbabc40a35ca2e0efd2a9a9fcee990a Mon Sep 17 00:00:00 2001 From: Haodong Chen Date: Fri, 13 Dec 2024 10:14:27 -0800 Subject: [PATCH 06/11] Allow SV for CLR --- ...entieon-pacbio-hifi-longreads-workflow.cwl | 99 -------- ...reads-workflow-hifi-ubam-single-sample.cwl | 230 ------------------ workflows/kfdrc-pacbio-longreads-workflow.cwl | 7 +- 3 files changed, 4 insertions(+), 332 deletions(-) delete mode 100644 subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl delete mode 100644 workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl diff --git a/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl b/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl deleted file mode 100644 index efc5ba9..0000000 --- a/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl +++ /dev/null @@ -1,99 +0,0 @@ -cwlVersion: v1.2 -class: Workflow -id: sentieon-pacbio-hifi-longreads-workflow -doc: | - 
Run Sentieon PacBio HiFi workflow - Minimap2 - DNAscope - LongReadSV -requirements: -- class: InlineJavascriptRequirement -- class: MultipleInputFeatureRequirement -- class: ScatterFeatureRequirement -- class: StepInputExpressionRequirement -- class: SubworkflowFeatureRequirement -inputs: - input_unaligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: false}], - doc: "Unaligned BAM file and index containing long reads generated by a PacBio sequencer.", - "sbg:fileTypes": "BAM"} - indexed_reference_fasta: {type: 'File', secondaryFiles: [{pattern: '.fai', required: true}, - {pattern: '^.dict', required: false}], doc: "Reference fasta and fai index.", - "sbg:suggestedValue": {class: File, path: 60639014357c3a53540ca7a3, name: Homo_sapiens_assembly38.fasta, - secondaryFiles: [{class: File, path: 60639016357c3a53540ca7af, name: Homo_sapiens_assembly38.fasta.fai}, - {class: File, path: 60639019357c3a53540ca7e7, name: Homo_sapiens_assembly38.dict}]}, - "sbg:fileTypes": "FASTA, FA"} - output_basename: {type: 'string', doc: "String to use as basename for all workflow\ - \ outputs."} - sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\ - \ tools.", default: "10.5.64.221:8990"} - dnascope_cpu: {type: 'int?', doc: "CPU Cores for dnascope to use."} - dnascope_ram: {type: 'int?', doc: "RAM (in GB) for dnascope to use."} -outputs: - minimap2_aligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: true}], - outputSource: array_to_file/out_alignments, doc: "Aligned BAM file from Minimap2."} - dnascope_small_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], - outputSource: sentieon_longread_cli/small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\ - \ small variant calls."} - longreadsv_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', - required: true}], outputSource: sentieon_longread_cli/structural_variants, doc: "VCF.GZ\ - \ file and index 
containing Sentieon LongReadSV-generated SV calls."} -steps: - download_model: - run: ../tools/download_DNAscope_model.cwl - in: - model_name: - valueFrom: "PacBio_HiFi-WGS" - out: [model_bundle] - sentieon_longread_cli: - run: ../tools/sentieon_DNAscope_LongRead_CLI.cwl - in: - input_bam: - source: - - input_unaligned_bam - linkMerge: merge_nested - reference: indexed_reference_fasta - output_vcf: - source: output_basename - valueFrom: $(self).vcf.gz - sentieon_license: sentieon_license - model_bundle: download_model/model_bundle - align: - valueFrom: | - $(1 == 1) - tech: - valueFrom: "HiFi" - bam_format: - valueFrom: | - $(1 == 1) - cpu_per_job: dnascope_cpu - mem_per_job: dnascope_ram - out: [small_variants, structural_variants, out_alignments, mosdepth_out] - array_to_file: - in: - infile: - source: sentieon_longread_cli/out_alignments - valueFrom: | - $(self[0]) - out: [out_alignments] - run: - cwlVersion: v1.2 - class: CommandLineTool - requirements: - - class: InlineJavascriptRequirement - doc: | - Select the first item from an array of BAM files. - baseCommand: [echo, done] - inputs: - infile: { type: 'File', secondaryFiles: [{pattern: '.bai', required: true}]} - outputs: - out_alignments: - type: File - outputBinding: - outputEval: | - $(inputs.infile) - secondaryFiles: - - pattern: .bai - required: true -$namespaces: - sbg: https://sevenbridges.com - diff --git a/workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl b/workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl deleted file mode 100644 index 58010c7..0000000 --- a/workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl +++ /dev/null @@ -1,230 +0,0 @@ -cwlVersion: v1.2 -class: Workflow -id: kfdrc-pacbio-longreads-workflow-hifi-single-sample-ubam -label: Kids First DRC PacBio LongReads Workflow (HiFi single-sample uBAM) -doc: | - # Kids First Data Resource Center Pacific Biosciences Long Reads Alignment and Variant Calling Workflow - -

- -

- - The Kids First Data Resource Center (KFDRC) Pacific Biosciences (PacBio) - Long Reads Alignment and Variant Calling Workflow is a Common Workflow Language - (CWL) implementation of various softwares used to take reads information - generated by PacBio long reads sequencers and generate alignment and variant - information. This pipeline was made possible thanks to significant software and - support contributions from both Sentieon and Wang Genomics Lab. For more - information on our collaborators, check out their websites: - - Sentieon: https://www.sentieon.com/ - - Wang Genomics Lab: https://wglab.org/ - - ## Relevant Softwares and Versions - - [samtools head](http://www.htslib.org/doc/samtools-head.html): `1.17` - - [samtools fastq](http://www.htslib.org/doc/samtools-fastq.html): `1.15.1` - - [Sentieon Minimap2](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#minimap2-binary): `202308.03` - - [Sentieon util sort](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#util-binary): `202308.03` - - [Sentieon DNAScope HiFi](https://support.sentieon.com/manual/): `202308.03` - - [Sentieon LongReadSV](https://support.sentieon.com/manual/): `202308.03` - - [LongReadSum](https://github.com/WGLab/LongReadSum#readme): `1.2.0` - - [Sniffles](https://github.com/fritzsedlazeck/Sniffles#readme): `2.0.7` - - [pbsv](https://github.com/PacificBiosciences/pbsv#readme): `2.9.0` - - ## Input Files - - `input_unaligned_bam`: The primary input of the PacBio HiFi Long Reads Workflow is an unaligned BAM. RG fields are required. Only one SM (Sample name) is allowed. - - `indexed_reference_fasta`: Any suitable human reference genome. KFDRC uses `Homo_sapiens_assembly38.fasta` from Broad Institute. - - ## Output Files - - `dnascope_small_variants`: BGZIP and TABIX indexed VCF containing small variant calls made by Sentieon DNAScope HiFi on `minimap2_aligned_bam`. 
- - `longreadsum_bam_metrics`: BGZIP TAR containing various metrics collected by LongReadSum from the `minimap2_aligned_bam`. - - `minimap2_aligned_bam`: Indexed BAM file containing reads from the `input_unaligned_bam` aligned to the `indexed_reference_fasta`. - - `pbsv_structural_variants`: BGZIP and TABIX indexed VCF containing structural variant calls made by pbsv on the `minimap2_aligned_bam`. - - `sniffles_structural_variants`: BGZIP and TABIX indexed VCF containing structural variant calls made by Sniffles on the `minimap2_aligned_bam`. - - `longreadsv_structural_variants`: BGZIP and TABIX indexed VCF containing structural variant calls made by Sentieon LongReadSV on the `minimap2_aligned_bam`. - - ## Generalized Process - 1. Read group information (`@RG`) is harvested from the `input_unaligned_bam` header using `samtools head` and `grep`. - 1. Align `input_unaligned_bam` to `indexed_reference_fasta` with the above `@RG` information using samtools fastq, Sentieon Minimap2, and Sentieon sort. - 1. Generate long reads alignment metrics from the `minimap2_aligned_bam` using LongReadSum. - 1. Generate structural variant calls from the `minimap2_aligned_bam` using pbsv. - 1. Generate structural variant calls from the `minimap2_aligned_bam` using Sniffles. - 1. Generate structural variant calls from the `minimap2_aligned_bam` using Sentieon LongReadSV. - 1. Generate small variant from the `minimap2_aligned_bam` using Sentieon DNAScope HiFi. 
- - ## Basic Info - - [D3b dockerfiles](https://github.com/d3b-center/bixtools) - - Testing Tools: - - [Seven Bridges Cavatica Platform](https://cavatica.sbgenomics.com/) - - [Common Workflow Language reference implementation (cwltool)](https://github.com/common-workflow-language/cwltool/) - - ## References - - KFDRC AWS s3 bucket: s3://kids-first-seq-data/broad-references/ - - Cavatica: https://cavatica.sbgenomics.com/u/kfdrc-harmonization/kf-references/ - - Broad Institute Goolge Cloud: https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0/ -requirements: -- class: InlineJavascriptRequirement -- class: MultipleInputFeatureRequirement -- class: ScatterFeatureRequirement -- class: StepInputExpressionRequirement -- class: SubworkflowFeatureRequirement -inputs: - input_unaligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: false}], - doc: "Unaligned BAM file and index containing HiFi long reads generated by a PacBio sequencer.", - "sbg:fileTypes": "BAM"} - indexed_reference_fasta: {type: 'File', secondaryFiles: [{pattern: '.fai', required: true}, - {pattern: '^.dict', required: true}], doc: "Reference fasta and fai index.", - "sbg:suggestedValue": {class: File, path: 60639014357c3a53540ca7a3, name: Homo_sapiens_assembly38.fasta, - secondaryFiles: [{class: File, path: 60639016357c3a53540ca7af, name: Homo_sapiens_assembly38.fasta.fai}, - {class: File, path: 60639019357c3a53540ca7e7, name: Homo_sapiens_assembly38.dict}]}, - "sbg:fileTypes": "FASTA, FA"} - output_basename: {type: 'string', doc: "String to use as basename for all workflow\ - \ outputs."} - sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\ - \ tools.", default: "10.5.64.221:8990"} - longreadsum_cpu: {type: 'int?', doc: "CPU Cores for longreadsum to use."} - dnascope_cpu: {type: 'int?', doc: "CPU Cores for dnascope to use."} - dnascope_ram: {type: 'int?', doc: "RAM (in GB) for dnascope to use."} - pbsv_cpu: {type: 
'int?', doc: "CPU Cores for pbsv to use."} - pbsv_ram: {type: 'int?', doc: "RAM (in GB) for pbsv to use."} - sniffles_cpu: {type: 'int?', doc: "CPU Cores for sniffles to use."} - sniffles_ram: {type: 'int?', doc: "RAM (in GB) for sniffles to use."} -outputs: - minimap2_aligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: true}], - outputSource: dnascope_longread/minimap2_aligned_bam, doc: "Aligned BAM file from Minimap2."} - longreadsum_bam_metrics: {type: 'File', outputSource: tar_longreadsum_dir/output, - doc: "TAR.GZ file containing longreadsum-generated metrics."} - dnascope_small_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], - outputSource: dnascope_longread/dnascope_small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\ - \ small variant calls."} - pbsv_strucutural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], - outputSource: bgzip_tabix_index_pbsv_vcf/output, doc: "VCF.GZ file and index containing\ - \ pbsv-generated SV calls."} - sniffles_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', - required: true}], outputSource: sniffles/output_vcf, doc: "VCF.GZ file and\ - \ index containing sniffles-generated SV calls."} - longreadsv_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', - required: true}], outputSource: dnascope_longread/longreadsv_structural_variants, doc: "VCF.GZ\ - \ file and index containing Sentieon LongReadSV-generated SV calls."} -steps: - dnascope_longread: - run: ../subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl - in: - input_unaligned_bam: input_unaligned_bam - indexed_reference_fasta: indexed_reference_fasta - output_basename: output_basename - sentieon_license: sentieon_license - dnascope_cpu: dnascope_cpu - dnascope_ram: dnascope_ram - out: [minimap2_aligned_bam, dnascope_small_variants, longreadsv_structural_variants] - longreadsum: - hints: - - class: "sbg:AWSInstanceType" - value: 
c5.9xlarge - run: ../tools/longreadsum.cwl - in: - input_type: - valueFrom: "bam" - input_file: dnascope_longread/minimap2_aligned_bam - output_dir: output_basename - output_basename: output_basename - log: - valueFrom: "test.log" - log_level: - valueFrom: "2" - cpu: longreadsum_cpu - out: [outputs] - tar_longreadsum_dir: - run: ../tools/tar.cwl - in: - output_filename: - source: output_basename - valueFrom: $(self).longreadsum.tar.gz - input_dir: longreadsum/outputs - out: [output] - pbsv_discover: - hints: - - class: "sbg:AWSInstanceType" - value: c5.9xlarge - run: ../tools/pbsv_discover.cwl - in: - input_bam: dnascope_longread/minimap2_aligned_bam - output_filename: - source: output_basename - valueFrom: $(self).pbsv.svsig.gz - hifi_preset: - valueFrom: | - $(1 == 1) - cpu: pbsv_cpu - ram: pbsv_ram - out: [output_svsig] - pbsv_call: - hints: - - class: "sbg:AWSInstanceType" - value: c5.9xlarge - run: ../tools/pbsv_call.cwl - in: - reference_fasta: indexed_reference_fasta - input_svsig: pbsv_discover/output_svsig - output_filename: - source: output_basename - valueFrom: $(self).pbsv.vcf - hifi_preset: - valueFrom: | - $(1 == 1) - cpu: pbsv_cpu - ram: pbsv_ram - out: [output_vcf] - bgzip_tabix_index_pbsv_vcf: - run: ../tools/bgzip_tabix_index.cwl - in: - input_vcf: pbsv_call/output_vcf - cpu: pbsv_cpu - out: [output] - sniffles: - hints: - - class: "sbg:AWSInstanceType" - value: c5.9xlarge - run: ../tools/sniffles.cwl - in: - input_bam: - source: dnascope_longread/minimap2_aligned_bam - valueFrom: $([self]) - vcf_output_filename: - source: output_basename - valueFrom: $(self).sniffles.vcf.gz - reference_fasta: indexed_reference_fasta - cpu: sniffles_cpu - ram: sniffles_ram - out: [output_vcf, output_snf] -$namespaces: - sbg: https://sevenbridges.com -hints: -- class: "sbg:maxNumberOfParallelInstances" - value: 2 -"sbg:license": Apache License 2.0 -"sbg:publisher": KFDRC -"sbg:categories": -- ALIGNMENT -- DNA -- INDEL -- LONG -- LONGREADS -- LONGREADSUM -- METRICS 
-- NANOCALLER -- PACBIO -- PACIFIC -- PBMM2 -- PBSV -- SENTIEON -- SNIFFLES -- SNP -- SOMATIC -- STRUCTURAL -- SV -- VARIANT -- WGS -- WXS -"sbg:links": -- id: 'https://github.com/kids-first/kf-longreads-workflow/releases/tag/v2.0.2' - label: github-release diff --git a/workflows/kfdrc-pacbio-longreads-workflow.cwl b/workflows/kfdrc-pacbio-longreads-workflow.cwl index 42e0e57..d178457 100644 --- a/workflows/kfdrc-pacbio-longreads-workflow.cwl +++ b/workflows/kfdrc-pacbio-longreads-workflow.cwl @@ -117,7 +117,7 @@ outputs: outputSource: clt_pickvalue/outfile, doc: "Aligned BAM file from Minimap2."} longreadsum_bam_metrics: {type: 'File', outputSource: tar_longreadsum_dir/output, doc: "TAR.GZ file containing longreadsum-generated metrics."} - dnascope_small_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], + dnascope_small_variants: {type: 'File?', secondaryFiles: [{pattern: '.tbi', required: true}], outputSource: dnascope/small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\ \ small variant calls."} pbsv_strucutural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], @@ -234,9 +234,7 @@ steps: out: [model_bundle] dnascope: run: ../tools/sentieon_DNAscope_LongRead_CLI.cwl - when: $(inputs.minimap2_preset != "map-pb") in: - minimap2_preset: minimap2_preset sentieon_license: sentieon_license reference: indexed_reference_fasta input_bam: @@ -250,6 +248,9 @@ steps: valueFrom: $(self).dnascope.vcf.gz skip-mosdepth: default: true + skip-small-variants: + source: minimap2_preset + valueFrom: $(inputs.minimap2_preset == "map-pb") cpu_per_job: dnascope_cpu mem_per_job: dnascope_ram out: [small_variants, structural_variants] From 425873bd2d9c5b4d4984b1c23e7f4fa1eca30bdb Mon Sep 17 00:00:00 2001 From: Haodong Chen Date: Fri, 13 Dec 2024 12:00:19 -0800 Subject: [PATCH 07/11] Fix a typo --- workflows/kfdrc-pacbio-longreads-workflow.cwl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/workflows/kfdrc-pacbio-longreads-workflow.cwl b/workflows/kfdrc-pacbio-longreads-workflow.cwl index d178457..6cee14c 100644 --- a/workflows/kfdrc-pacbio-longreads-workflow.cwl +++ b/workflows/kfdrc-pacbio-longreads-workflow.cwl @@ -250,7 +250,7 @@ steps: default: true skip-small-variants: source: minimap2_preset - valueFrom: $(inputs.minimap2_preset == "map-pb") + valueFrom: $(self == "map-pb") cpu_per_job: dnascope_cpu mem_per_job: dnascope_ram out: [small_variants, structural_variants] From b021bcf96656e9a51cfdf14b2468c3438cb667c1 Mon Sep 17 00:00:00 2001 From: Haodong Chen Date: Thu, 16 Jan 2025 14:05:56 -0800 Subject: [PATCH 08/11] Add sentieon-cli to the ONT workflow --- workflows/kfdrc-ont-longreads-workflow.cwl | 38 +++++++++++++++------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/workflows/kfdrc-ont-longreads-workflow.cwl b/workflows/kfdrc-ont-longreads-workflow.cwl index ecb1d9d..7733d9b 100644 --- a/workflows/kfdrc-ont-longreads-workflow.cwl +++ b/workflows/kfdrc-ont-longreads-workflow.cwl @@ -190,8 +190,8 @@ inputs: cutesv_ram: {type: 'int?', doc: "RAM (in GB) for cutesv to use."} sniffles_cpu: {type: 'int?', doc: "CPU Cores for sniffles to use."} sniffles_ram: {type: 'int?', doc: "RAM (in GB) for sniffles to use."} - longreadsv_cpu: {type: 'int?', doc: "CPU Cores for Sentieon LongReadSV to use."} - longreadsv_ram: {type: 'int?', doc: "RAM (in GB) for Sentieon LongReadSV to use."} + dnascope_cpu: {type: 'int?', doc: "CPU Cores for Sentieon DNAscope to use."} + dnascope_ram: {type: 'int?', doc: "RAM (in GB) for Sentieon DNAscope to use."} outputs: minimap2_aligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: true}], outputSource: clt_pickvalue/outfile, doc: "Aligned BAM file from Minimap2."} @@ -206,8 +206,11 @@ outputs: sniffles_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}], outputSource: sniffles/output_vcf, doc: "VCF.GZ file and\ \ index containing sniffles-generated SV 
calls."} + dnascope_small_variants: {type: 'File?', secondaryFiles: [{pattern: '.tbi', required: true}], + outputSource: dnascope/small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\ + \ small variant calls."} longreadsv_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', - required: true}], outputSource: sentieon_longreadsv/output_vcf, doc: "VCF.GZ\ + required: true}], outputSource: dnascope/structural_variants, doc: "VCF.GZ\ \ file and index containing Sentieon LongReadSV-generated SV calls."} steps: samtools_split: @@ -281,20 +284,31 @@ steps: $(self[0] == null ? self[1][0] : self[0]) cpu: minimap2_cpu out: [outfile] - sentieon_longreadsv: - run: ../tools/sentieon_LongReadSV.cwl + download_model: + run: ../tools/download_DNAscope_model.cwl + in: + model_name: + valueFrom: "Oxford_Nanopore-WGS" + out: [model_bundle] + dnascope: + run: ../tools/sentieon_DNAscope_LongRead_CLI.cwl in: sentieon_license: sentieon_license reference: indexed_reference_fasta - input_bam: clt_pickvalue/outfile - platform: + input_bam: + source: [clt_pickvalue/outfile] + linkMerge: merge_flattened + model_bundle: download_model/model_bundle + tech: valueFrom: "ONT" - output_file_name: + output_vcf: source: output_basename - valueFrom: $(self).longreadsv.vcf.gz - cpu: longreadsv_cpu - ram: longreadsv_ram - out: [output_vcf] + valueFrom: $(self).dnascope.vcf.gz + skip-mosdepth: + default: true + cpu_per_job: dnascope_cpu + mem_per_job: dnascope_ram + out: [small_variants, structural_variants] longreadsum: run: ../tools/longreadsum.cwl hints: From 359f1f55c1f855a6840dc7b63c8913b0e69f33d3 Mon Sep 17 00:00:00 2001 From: dmiller15 Date: Fri, 14 Feb 2025 11:17:00 -0500 Subject: [PATCH 09/11] :wrench: cleanup changes from Sentieon --- docs/dockers_ont.md | 6 ++-- docs/dockers_pacbio.md | 2 ++ scripts/get_dnascope_model.py | 28 +++++++++++++++++ tools/download_DNAscope_model.cwl | 31 ++----------------- tools/sentieon_DNAscope_LongRead_CLI.cwl | 30 
++++-------------- workflows/kfdrc-ont-longreads-workflow.cwl | 17 ++++------ workflows/kfdrc-pacbio-longreads-workflow.cwl | 9 ++---- 7 files changed, 49 insertions(+), 74 deletions(-) create mode 100644 scripts/get_dnascope_model.py diff --git a/docs/dockers_ont.md b/docs/dockers_ont.md index 834b635..80eec55 100644 --- a/docs/dockers_ont.md +++ b/docs/dockers_ont.md @@ -12,8 +12,8 @@ nanocaller_merge.cwl|genomicslab/nanocaller:3.2.0 samtools_coverage.cwl|staphb/samtools:1.17 samtools_head.cwl|staphb/samtools:1.17 samtools_split.cwl|staphb/samtools:1.17 -sentieon_LongReadSV.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202112.06 -sentieon_ReadWriter.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202112.01_hifi -sentieon_minimap2.cwl|pgc-images.sbgenomics.com/d3b-bixu/sentieon:202112.01_hifi +sentieon_DNAscope_LongRead_CLI.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202308.03 +sentieon_ReadWriter.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202308.03 +sentieon_minimap2.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202308.03 sniffles.cwl|pgc-images.sbgenomics.com/d3b-bixu/sniffles:2.0.7 tar.cwl|None diff --git a/docs/dockers_pacbio.md b/docs/dockers_pacbio.md index c262759..73343fd 100644 --- a/docs/dockers_pacbio.md +++ b/docs/dockers_pacbio.md @@ -11,5 +11,7 @@ pbsv_discover.cwl|quay.io/biocontainers/pbsv:2.9.0--h9ee0642_0 samtools_head.cwl|staphb/samtools:1.17 samtools_split.cwl|staphb/samtools:1.17 sentieon_DNAscope_LongRead_CLI.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202308.03 +sentieon_ReadWriter.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202308.03 +sentieon_minimap2.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202308.03 sniffles.cwl|pgc-images.sbgenomics.com/d3b-bixu/sniffles:2.0.7 tar.cwl|None diff --git a/scripts/get_dnascope_model.py b/scripts/get_dnascope_model.py new file mode 100644 index 0000000..54ae516 --- /dev/null +++ b/scripts/get_dnascope_model.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python3 + +import argparse +import yaml +import 
requests +import sys + + +def main(): + parser = argparse.ArgumentParser(description="Download DNAscope model bundle") + parser.add_argument("model_name", help="the name of the model bundle, e.g. Illumina_WGS") + args = parser.parse_args() + model_name = args.model_name.split("-") + sentieon_models_yaml = "https://github.com/Sentieon/sentieon-models/raw/refs/heads/main/sentieon_models.yaml" + response = requests.get(sentieon_models_yaml, allow_redirects=True) + content = response.content.decode("utf-8") + content = yaml.safe_load(content) + try: + url = content["DNAscope_bundles"][model_name[0]][model_name[1]] + r = requests.get(url, allow_redirects=True) + open(url.split("/")[-1], 'wb').write(r.content) + except: + open('empty.bundle', 'wb') + print('Models updated on: ' + content["Updated on"], file=sys.stderr) + + +if __name__ == '__main__': + main() diff --git a/tools/download_DNAscope_model.cwl b/tools/download_DNAscope_model.cwl index 73b5571..1eb3bc4 100644 --- a/tools/download_DNAscope_model.cwl +++ b/tools/download_DNAscope_model.cwl @@ -12,35 +12,8 @@ requirements: - class: InitialWorkDirRequirement listing: - entryname: get_dnascope_model.py - entry: | - #!/usr/bin/env python3 - - import argparse - import yaml - import requests - import sys - - def main(): - parser = argparse.ArgumentParser(description="Download DNAscope model bundle") - parser.add_argument("model_name", help="the name of the model bundle, e.g. 
Illumina_WGS") - args = parser.parse_args() - model_name = args.model_name.split("-") - sentieon_models_yaml = "https://github.com/Sentieon/sentieon-models/raw/refs/heads/main/sentieon_models.yaml" - response = requests.get(sentieon_models_yaml, allow_redirects=True) - content = response.content.decode("utf-8") - content = yaml.safe_load(content) - try: - url = content["DNAscope_bundles"][model_name[0]][model_name[1]] - r = requests.get(url, allow_redirects=True) - open(url.split("/")[-1], 'wb').write(r.content) - except: - open('empty.bundle', 'wb') - print('Models updated on: ' + content["Updated on"], file=sys.stderr) - - if __name__ == '__main__': - main() - - + entry: + $include: ../scripts/get_dnascope_model.py arguments: - position: 0 valueFrom: 'pip install pyyaml requests;' diff --git a/tools/sentieon_DNAscope_LongRead_CLI.cwl b/tools/sentieon_DNAscope_LongRead_CLI.cwl index 350506d..cdd383b 100644 --- a/tools/sentieon_DNAscope_LongRead_CLI.cwl +++ b/tools/sentieon_DNAscope_LongRead_CLI.cwl @@ -1,6 +1,6 @@ cwlVersion: v1.2 class: CommandLineTool -label: Sentieon_DNAscope_LongRead +id: sentieon_DNAscope_LongRead_CLI doc: |- This tool uses **Sentieon DNAscope** to call germline variants from PacBio HiFi reads [1]. @@ -11,28 +11,8 @@ doc: |- requirements: - class: ShellCommandRequirement - class: ResourceRequirement - coresMin: |- - ${ - if (inputs.cpu_per_job) - { - return inputs.cpu_per_job - } - else - { - return 36 - } - } - ramMin: |- - ${ - if (inputs.mem_per_job) - { - return inputs.mem_per_job - } - else - { - return 71000 - } - } + coresMin: $(inputs.cpu_per_job) + ramMin: $(inputs.mem_per_job * 1000) - class: DockerRequirement dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202308.03 - class: EnvVarRequirement @@ -204,10 +184,12 @@ inputs: label: CPU per job doc: CPU per job type: int? + default: 36 mem_per_job: label: Memory per job - doc: Memory per job[MB]. + doc: Memory per job[GB]. type: int? 
+ default: 71 outputs: small_variants: diff --git a/workflows/kfdrc-ont-longreads-workflow.cwl b/workflows/kfdrc-ont-longreads-workflow.cwl index 7733d9b..fd87cd4 100644 --- a/workflows/kfdrc-ont-longreads-workflow.cwl +++ b/workflows/kfdrc-ont-longreads-workflow.cwl @@ -22,9 +22,9 @@ doc: | ## Relevant Softwares and Versions - [samtools head](http://www.htslib.org/doc/samtools-head.html): `1.17` - [samtools fastq](http://www.htslib.org/doc/samtools-fastq.html): `1.15.1` - - [Sentieon Minimap2](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#minimap2-binary): `202112.01` - - [Sentieon util sort](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#util-binary): `202112.01` - - [Sentieon LongReadSV](https://support.sentieon.com/manual/): `202112.06` + - [Sentieon Minimap2](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#minimap2-binary): `202308.03` + - [Sentieon util sort](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#util-binary): `202308.03` + - [Sentieon LongReadSV](https://support.sentieon.com/manual/): `202308.03` - [LongReadSum](https://github.com/WGLab/LongReadSum#readme): `1.2.0` - [Sniffles](https://github.com/fritzsedlazeck/Sniffles#readme): `2.0.7` - [CuteSV](https://github.com/tjiangHIT/cuteSV#readme): `2.0.3` @@ -90,6 +90,7 @@ inputs: \ header."} sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\ \ tools.", default: "10.5.64.221:8990"} + sentieon_dnascope_model: { type: 'File', doc: "Sentieon DNAscoep model bundle." } minimap2_preset: type: - name: minimap2_preset @@ -284,21 +285,15 @@ steps: $(self[0] == null ? 
self[1][0] : self[0]) cpu: minimap2_cpu out: [outfile] - download_model: - run: ../tools/download_DNAscope_model.cwl - in: - model_name: - valueFrom: "Oxford_Nanopore-WGS" - out: [model_bundle] dnascope: run: ../tools/sentieon_DNAscope_LongRead_CLI.cwl in: sentieon_license: sentieon_license reference: indexed_reference_fasta - input_bam: + input_bam: source: [clt_pickvalue/outfile] linkMerge: merge_flattened - model_bundle: download_model/model_bundle + model_bundle: sentieon_dnascope_model tech: valueFrom: "ONT" output_vcf: diff --git a/workflows/kfdrc-pacbio-longreads-workflow.cwl b/workflows/kfdrc-pacbio-longreads-workflow.cwl index 6cee14c..0ffee9b 100644 --- a/workflows/kfdrc-pacbio-longreads-workflow.cwl +++ b/workflows/kfdrc-pacbio-longreads-workflow.cwl @@ -84,6 +84,7 @@ inputs: \ this value will override the SM value provided in the input_unaligned_bam."} sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\ \ tools.", default: "10.5.64.221:8990"} + sentieon_dnascope_model: { type: 'File', doc: "Sentieon DNAscoep model bundle." 
} minimap2_preset: type: - name: minimap2_preset @@ -226,12 +227,6 @@ steps: valueFrom: $(self).longreadsum.tar.gz input_dir: longreadsum/outputs out: [output] - download_model: - run: ../tools/download_DNAscope_model.cwl - in: - model_name: - valueFrom: "PacBio_HiFi-WGS" - out: [model_bundle] dnascope: run: ../tools/sentieon_DNAscope_LongRead_CLI.cwl in: @@ -240,7 +235,7 @@ steps: input_bam: source: [clt_pickvalue/outfile] linkMerge: merge_flattened - model_bundle: download_model/model_bundle + model_bundle: sentieon_dnascope_model tech: valueFrom: "HiFi" output_vcf: From 0eb1d0afdc3b958c7b051de2bc79ae5d7821e3fd Mon Sep 17 00:00:00 2001 From: dmiller15 Date: Mon, 17 Feb 2025 10:12:42 -0500 Subject: [PATCH 10/11] :bug: optional longreadsv --- workflows/kfdrc-pacbio-longreads-workflow.cwl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/workflows/kfdrc-pacbio-longreads-workflow.cwl b/workflows/kfdrc-pacbio-longreads-workflow.cwl index 0ffee9b..6e3e6b0 100644 --- a/workflows/kfdrc-pacbio-longreads-workflow.cwl +++ b/workflows/kfdrc-pacbio-longreads-workflow.cwl @@ -245,7 +245,10 @@ steps: default: true skip-small-variants: source: minimap2_preset - valueFrom: $(self == "map-pb") + valueFrom: $(self != "map-hifi") + skip-svs: + source: minimap2_preset + valueFrom: $(self != "map-hifi") cpu_per_job: dnascope_cpu mem_per_job: dnascope_ram out: [small_variants, structural_variants] From 6c25e0d70cc84dcff3d605b1450d5c25089d7386 Mon Sep 17 00:00:00 2001 From: Dan Miller Date: Thu, 27 Feb 2025 11:26:12 -0500 Subject: [PATCH 11/11] :broom: Cleanup typos Co-authored-by: Miguel Brown --- tools/sentieon_DNAscope_LongRead_CLI.cwl | 2 +- workflows/kfdrc-ont-longreads-workflow.cwl | 2 +- workflows/kfdrc-pacbio-longreads-workflow.cwl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/sentieon_DNAscope_LongRead_CLI.cwl b/tools/sentieon_DNAscope_LongRead_CLI.cwl index cdd383b..d6ec424 100644 --- a/tools/sentieon_DNAscope_LongRead_CLI.cwl 
+++ b/tools/sentieon_DNAscope_LongRead_CLI.cwl @@ -80,7 +80,7 @@ inputs: prefix: -m tech: type: string? - doc: "{HiFi,ONT} Sequencing technology used to generate the reads. (default: 'HiFi')" + doc: "{HiFi,ONT} Sequencing technology used to generate the reads. (default: 'HiFi')" inputBinding: position: 7 prefix: --tech diff --git a/workflows/kfdrc-ont-longreads-workflow.cwl b/workflows/kfdrc-ont-longreads-workflow.cwl index fd87cd4..a5fffc4 100644 --- a/workflows/kfdrc-ont-longreads-workflow.cwl +++ b/workflows/kfdrc-ont-longreads-workflow.cwl @@ -90,7 +90,7 @@ inputs: \ header."} sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\ \ tools.", default: "10.5.64.221:8990"} - sentieon_dnascope_model: { type: 'File', doc: "Sentieon DNAscoep model bundle." } + sentieon_dnascope_model: { type: 'File', doc: "Sentieon DNAscope model bundle." } minimap2_preset: type: - name: minimap2_preset diff --git a/workflows/kfdrc-pacbio-longreads-workflow.cwl b/workflows/kfdrc-pacbio-longreads-workflow.cwl index 6e3e6b0..d457571 100644 --- a/workflows/kfdrc-pacbio-longreads-workflow.cwl +++ b/workflows/kfdrc-pacbio-longreads-workflow.cwl @@ -84,7 +84,7 @@ inputs: \ this value will override the SM value provided in the input_unaligned_bam."} sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\ \ tools.", default: "10.5.64.221:8990"} - sentieon_dnascope_model: { type: 'File', doc: "Sentieon DNAscoep model bundle." } + sentieon_dnascope_model: { type: 'File', doc: "Sentieon DNAscope model bundle." } minimap2_preset: type: - name: minimap2_preset