From 66034ae3b63ffa0adc2acd1b9b1203aec26d5c08 Mon Sep 17 00:00:00 2001
From: Haodong Chen <chen.hdong@gmail.com>
Date: Fri, 8 Nov 2024 08:37:32 -0800
Subject: [PATCH 01/11] Add Sentieon HiFi workflow

---
 ...entieon-pacbio-hifi-longreads-workflow.cwl |  73 ++++++
 tools/download_DNAscope_model.cwl             |  73 ++++++
 tools/sentieon_DNAscope_LongRead_CLI.cwl      | 247 ++++++++++++++++++
 3 files changed, 393 insertions(+)
 create mode 100644 subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl
 create mode 100644 tools/download_DNAscope_model.cwl
 create mode 100644 tools/sentieon_DNAscope_LongRead_CLI.cwl

diff --git a/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl b/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl
new file mode 100644
index 0000000..66fd0f5
--- /dev/null
+++ b/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl
@@ -0,0 +1,73 @@
+cwlVersion: v1.2
+class: Workflow
+id: sentieon-pacbio-hifi-longreads-workflow
+doc: |
+  Run Sentieon PacBio HiFi workflow
+  Minimap2
+  DNAscope
+  LongReadSV
+requirements:
+- class: InlineJavascriptRequirement
+- class: MultipleInputFeatureRequirement
+- class: ScatterFeatureRequirement
+- class: StepInputExpressionRequirement
+- class: SubworkflowFeatureRequirement
+inputs:
+  input_unaligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: false}],
+    doc: "Unaligned BAM file and index containing long reads generated by an PacBio sequencer.",
+    "sbg:fileTypes": "BAM"}
+  indexed_reference_fasta: {type: 'File', secondaryFiles: [{pattern: '.fai', required: true},
+      {pattern: '^.dict', required: false}], doc: "Reference fasta and fai index.",
+    "sbg:suggestedValue": {class: File, path: 60639014357c3a53540ca7a3, name: Homo_sapiens_assembly38.fasta,
+      secondaryFiles: [{class: File, path: 60639016357c3a53540ca7af, name: Homo_sapiens_assembly38.fasta.fai},
+        {class: File, path: 60639019357c3a53540ca7e7, name: Homo_sapiens_assembly38.dict}]},
+    "sbg:fileTypes": "FASTA, FA"}
+  output_basename: {type: 'string', doc: "String to use as basename for all workflow\
+      \ outputs."}
+  sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\
+      \ tools.", default: "10.5.64.221:8990"}
+  dnascope_cpu: {type: 'int?', doc: "CPU Cores for dnascope to use."}
+  dnascope_ram: {type: 'int?', doc: "RAM (in GB) for dnascope to use."}
+outputs:
+  minimap2_aligned_bam: {type: 'File[]?', secondaryFiles: [{pattern: '.bai', required: true}],
+    outputSource: sentieon_longread_cli/out_alignments, doc: "Aligned BAM file from Minimap2."}
+  dnascope_small_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}],
+    outputSource: sentieon_longread_cli/small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\
+      \ small variant calls."}
+  longreadsv_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi',
+        required: true}], outputSource: sentieon_longread_cli/structural_variants, doc: "VCF.GZ\
+      \ file and index containing Sentieon LongReadSV-generated SV calls."}
+steps:
+  download_model:
+    run: ../tools/download_DNAscope_model.cwl
+    in:
+      model_name: 
+        valueFrom: "PacBio_HiFi-WGS"
+    out: [model_bundle]
+  sentieon_longread_cli:
+    run: ../tools/sentieon_DNAscope_LongRead_CLI.cwl
+    in:
+      input_bam: 
+        source: 
+          - input_unaligned_bam
+        linkMerge: merge_nested
+      reference: indexed_reference_fasta
+      output_vcf:
+        source: output_basename
+        valueFrom: $(self).vcf.gz
+      sentieon_license: sentieon_license
+      model_bundle: download_model/model_bundle
+      align:
+        valueFrom: |
+          $(1 == 1)
+      tech:
+        valueFrom: "HiFi"
+      bam_format:
+        valueFrom: |
+          $(1 == 1)
+      cpu_per_job: dnascope_cpu
+      mem_per_job: dnascope_ram
+    out: [small_variants, structural_variants, out_alignments, mosdepth_out]
+$namespaces:
+  sbg: https://sevenbridges.com
+
diff --git a/tools/download_DNAscope_model.cwl b/tools/download_DNAscope_model.cwl
new file mode 100644
index 0000000..73b5571
--- /dev/null
+++ b/tools/download_DNAscope_model.cwl
@@ -0,0 +1,73 @@
+cwlVersion: v1.2
+class: CommandLineTool
+label: Download DNAscope model bundle
+hints:
+  - class: ResourceRequirement
+    coresMin: 1
+requirements:
+  - class: ShellCommandRequirement
+  - class: InlineJavascriptRequirement
+  - class: DockerRequirement
+    dockerPull: python:3.7-slim
+  - class: InitialWorkDirRequirement
+    listing:
+      - entryname: get_dnascope_model.py
+        entry: |
+          #!/usr/bin/env python3
+          import argparse
+          import yaml
+          import requests
+          import sys
+
+          def main():
+              parser = argparse.ArgumentParser(description="Download DNAscope model bundle")
+              parser.add_argument("model_name", help="the name of the model bundle, e.g. Illumina-WGS")
+              args = parser.parse_args()
+              model_name = args.model_name.split("-")
+              sentieon_models_yaml = "https://github.com/Sentieon/sentieon-models/raw/refs/heads/main/sentieon_models.yaml"
+              response = requests.get(sentieon_models_yaml, allow_redirects=True)
+              content = yaml.safe_load(response.content.decode("utf-8"))
+              try:
+                  url = content["DNAscope_bundles"][model_name[0]][model_name[1]]
+                  r = requests.get(url, allow_redirects=True)
+                  r.raise_for_status()
+                  with open(url.split("/")[-1], 'wb') as bundle:
+                      bundle.write(r.content)
+              except Exception as err:
+                  # Best-effort fallback: report why, then leave an empty placeholder for the '*.bundle' glob.
+                  print('Model bundle download failed: ' + repr(err), file=sys.stderr)
+                  open('empty.bundle', 'wb').close()
+              print('Models updated on: ' + content["Updated on"], file=sys.stderr)
+
+          if __name__ == '__main__':
+              main()
+
+arguments:
+  - position: 0
+    valueFrom: 'pip install pyyaml requests;'
+    shellQuote: false
+  - position: 1
+    valueFrom: 'python get_dnascope_model.py'
+    shellQuote: false
+inputs:
+  - id: model_name
+    label: Model name
+    doc: Model platform and data type. For example, Illumina-WGS
+    type:
+    - type: enum
+      symbols:
+        - Illumina-WGS
+        - Illumina-WES
+        - MGI-WGS
+        - MGI-WES
+        - Element_Biosciences-WGS
+        - PacBio_HiFi-WGS
+        - Oxford_Nanopore-WGS
+    inputBinding:
+      position: 2
+outputs:
+  - id: model_bundle
+    label: DNAscope Model bundle
+    type: File
+    outputBinding:
+      glob: '*.bundle'
diff --git a/tools/sentieon_DNAscope_LongRead_CLI.cwl b/tools/sentieon_DNAscope_LongRead_CLI.cwl
new file mode 100644
index 0000000..d51124c
--- /dev/null
+++ b/tools/sentieon_DNAscope_LongRead_CLI.cwl
@@ -0,0 +1,247 @@
+cwlVersion: v1.2
+class: CommandLineTool
+label: Sentieon_DNAscope_LongRead
+doc: |-
+  This tool uses **Sentieon DNAscope** to call germline variants from PacBio HiFi reads [1].
+
+  ###References
+
+  [1] [https://github.com/Sentieon/sentieon-cli/blob/main/docs/dnascope-longread.md](https://github.com/Sentieon/sentieon-cli/blob/main/docs/dnascope-longread.md)
+
+requirements:
+- class: ShellCommandRequirement
+- class: ResourceRequirement
+  coresMin: |-
+    ${
+        if (inputs.cpu_per_job)
+        {
+            return inputs.cpu_per_job
+        }
+        else
+        {
+            return 36
+        }
+    }
+  ramMin: |-
+    ${
+        if (inputs.mem_per_job)
+        {
+            return inputs.mem_per_job
+        }
+        else
+        {
+            return 71000
+        }
+    }
+- class: DockerRequirement
+  dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202308.03
+- class: EnvVarRequirement
+  envDef:
+  - envName: SENTIEON_LICENSE
+    envValue: $(inputs.sentieon_license)
+- class: InlineJavascriptRequirement
+
+inputs:
+  sentieon_license:
+    label: Sentieon license
+    doc: License server host and port
+    type: string
+  reference:
+    type: File
+    doc: "Fasta for reference genome"
+    inputBinding:
+      position: 1
+      prefix: -r
+    secondaryFiles:
+    - pattern: .fai
+      required: true
+    - pattern: ^.dict
+      required: false
+    sbg:fileTypes: FA, FASTA
+  fastq:
+    type: File[]?
+    doc: "Sample fastq files"
+    inputBinding:
+      position: 2
+      prefix: --fastq
+  readgroups:
+    type: string[]?
+    doc: "Readgroup information for the fastq files"
+    inputBinding:
+      position: 3
+      prefix: --readgroups
+  input_bam:
+    type: File[]?
+    doc: "sample BAM or CRAM file"
+    inputBinding:
+      position: 4
+      prefix: -i
+    sbg:fileTypes: BAM, CRAM
+    secondaryFiles:
+    - pattern: ^.bai
+      required: false
+    - pattern: ^.crai
+      required: false
+    - pattern: .bai
+      required: false
+    - pattern: .crai
+      required: false
+  align:
+    type: boolean?
+    default: false
+    inputBinding:
+      position: 5
+      prefix: --align
+  model_bundle:
+    type: File
+    doc: "The model bundle file"
+    inputBinding:
+      position: 6
+      prefix: -m
+  tech:
+    type: string?
+    doc: "{HiFi,ONT}     Sequencing technology used to generate the reads. (default: 'HiFi')"
+    inputBinding:
+      position: 7
+      prefix: --tech
+  dbSNP:
+    type: File?
+    doc: "dbSNP vcf file Supplying this file will annotate variants with their dbSNP refSNP ID numbers."
+    inputBinding:
+      position: 8
+      prefix: -d
+    secondaryFiles:
+    - pattern: .tbi
+      required: false
+    - pattern: .idx
+      required: false
+  diploid_bed:
+    type: File?
+    doc: "Region BED file. Supplying this file will limit variant calling to the intervals inside the BED file."
+    inputBinding:
+      position: 9
+      prefix: -b
+    sbg:fileTypes: BED
+  haploid_bed:
+    type: File?
+    inputBinding:
+      position: 10
+      prefix: --haploid-
+    sbg:fileTypes: BED
+  gvcf:
+    type: boolean?
+    default: false
+    inputBinding:
+      position: 11
+      prefix: --gvcf
+  bam_format:
+    type: boolean?
+    default: false
+    doc: "Use the BAM format instead of CRAM for output aligned files (default: False)"
+    inputBinding:
+      position: 12
+      prefix: --bam_format
+  cores:
+    type: int?
+    doc: "Number of threads/processes to use"
+    inputBinding:
+      position: 13
+      prefix: -t
+  skip-small-variants:
+    type: boolean?
+    default: false
+    doc: "Skip small variant (SNV/indel) calling (default: False)"
+    inputBinding:
+      position: 14
+      prefix: --skip-small-variants
+  skip-svs:
+    type: boolean?
+    default: false
+    doc: "Skip SV calling (default: False)"
+    inputBinding:
+      position: 15
+      prefix: --skip-svs
+  skip-mosdepth:
+    type: boolean?
+    default: false
+    doc: "Skip QC with mosdepth (default: False)"
+    inputBinding:
+      position: 16
+      prefix: --skip-mosdepth
+  input_ref:
+    type: File?
+    doc: "Used to decode the input alignment file. Required if the input file is in the CRAM/uCRAM formats"
+    inputBinding:
+      position: 17
+      prefix: --input_ref
+    secondaryFiles:
+    - pattern: .fai
+      required: true
+  fastq_taglist:
+    type: string?
+    doc: "A comma-separated list of tags to retain. Defaults to '*' and the 'RG' tag is required"
+    inputBinding:
+      position: 18
+      prefix: --fastq_taglist
+  minimap2_args:
+    type: string?
+    doc: "Extra arguments for sentieon minimap2 (default: '-Y')"
+    inputBinding:
+      position: 20
+      prefix: --minimap2_args
+  util_sort_args:
+    type: string?
+    doc: "Extra arguments for sentieon util sort (default: '--cram_write_options version=3.0,compressor=rans')"
+    inputBinding:
+      position: 21
+      prefix: --util_sort_args
+  output_vcf:
+    type: string
+    doc: "Output VCF File. The file name must end in .vcf.gz"
+    inputBinding:
+      position: 100
+  cpu_per_job:
+    label: CPU per job
+    doc: CPU per job
+    type: int?
+  mem_per_job:
+    label: Memory per job
+    doc: Memory per job[MB].
+    type: int?
+
+outputs:
+  small_variants:
+    type: File
+    secondaryFiles:
+    - pattern: .tbi
+      required: true
+    outputBinding:
+      glob: $(inputs.output_vcf)
+    sbg:fileTypes: VCF.GZ
+  structural_variants:
+    type: File
+    secondaryFiles:
+    - pattern: .tbi
+      required: true
+    outputBinding:
+      glob: $(inputs.output_vcf.replace(".vcf.gz", ".sv.vcf.gz"))
+    sbg:fileTypes: VCF.GZ
+  out_alignments:
+    type: File[]?
+    secondaryFiles:
+    - pattern: .bai
+      required: false
+    - pattern: .crai
+      required: false
+    outputBinding:
+      glob: ["*.cram", "*.bam"]
+  mosdepth_out:
+    type: File[]?
+    outputBinding:
+      glob: '*_mosdepth_*'
+
+baseCommand:
+- sentieon-cli
+- dnascope-longread
+$namespaces:
+  sbg: https://sevenbridges.com

From 2eb39a009ab32fb836360d59cd701162db89c55a Mon Sep 17 00:00:00 2001
From: Haodong Chen <chen.hdong@gmail.com>
Date: Thu, 12 Dec 2024 19:54:33 -0800
Subject: [PATCH 02/11] Update PacBio HiFi workflow

---
 ...entieon-pacbio-hifi-longreads-workflow.cwl |  32 ++-
 ...reads-workflow-hifi-ubam-single-sample.cwl | 230 ++++++++++++++++++
 2 files changed, 259 insertions(+), 3 deletions(-)
 create mode 100644 workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl

diff --git a/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl b/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl
index 66fd0f5..efc5ba9 100644
--- a/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl
+++ b/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl
@@ -14,7 +14,7 @@ requirements:
 - class: SubworkflowFeatureRequirement
 inputs:
   input_unaligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: false}],
-    doc: "Unaligned BAM file and index containing long reads generated by an PacBio sequencer.",
+    doc: "Unaligned BAM file and index containing long reads generated by a PacBio sequencer.",
     "sbg:fileTypes": "BAM"}
   indexed_reference_fasta: {type: 'File', secondaryFiles: [{pattern: '.fai', required: true},
       {pattern: '^.dict', required: false}], doc: "Reference fasta and fai index.",
@@ -29,8 +29,8 @@ inputs:
   dnascope_cpu: {type: 'int?', doc: "CPU Cores for dnascope to use."}
   dnascope_ram: {type: 'int?', doc: "RAM (in GB) for dnascope to use."}
 outputs:
-  minimap2_aligned_bam: {type: 'File[]?', secondaryFiles: [{pattern: '.bai', required: true}],
-    outputSource: sentieon_longread_cli/out_alignments, doc: "Aligned BAM file from Minimap2."}
+  minimap2_aligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: true}],
+    outputSource: array_to_file/out_alignments, doc: "Aligned BAM file from Minimap2."}
   dnascope_small_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}],
     outputSource: sentieon_longread_cli/small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\
       \ small variant calls."}
@@ -68,6 +68,32 @@ steps:
       cpu_per_job: dnascope_cpu
       mem_per_job: dnascope_ram
     out: [small_variants, structural_variants, out_alignments, mosdepth_out]
+  array_to_file:
+    in:
+      infile:
+        source: sentieon_longread_cli/out_alignments
+        valueFrom: |
+          $(self[0])
+    out: [out_alignments]
+    run:
+      cwlVersion: v1.2
+      class: CommandLineTool
+      requirements:
+        - class: InlineJavascriptRequirement
+      doc: |
+        Select the first item from an array of BAM files.
+      baseCommand: [echo, done]
+      inputs:
+        infile: { type: 'File', secondaryFiles: [{pattern: '.bai', required: true}]}
+      outputs:
+        out_alignments:
+          type: File
+          outputBinding:
+            outputEval: |
+              $(inputs.infile)
+          secondaryFiles:
+          - pattern: .bai
+            required: true
 $namespaces:
   sbg: https://sevenbridges.com
 
diff --git a/workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl b/workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl
new file mode 100644
index 0000000..58010c7
--- /dev/null
+++ b/workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl
@@ -0,0 +1,230 @@
+cwlVersion: v1.2
+class: Workflow
+id: kfdrc-pacbio-longreads-workflow-hifi-single-sample-ubam
+label: Kids First DRC PacBio LongReads Workflow (HiFi single-sample uBAM)
+doc: |
+  # Kids First Data Resource Center Pacific Biosciences Long Reads Alignment and Variant Calling Workflow
+
+  <p align="center">
+    <img src="https://github.com/d3b-center/d3b-research-workflows/raw/master/doc/kfdrc-logo-sm.png">
+  </p>
+
+  The Kids First Data Resource Center (KFDRC) Pacific Biosciences (PacBio)
+  Long Reads Alignment and Variant Calling Workflow is a Common Workflow Language
+  (CWL) implementation of various softwares used to take reads information
+  generated by PacBio long reads sequencers and generate alignment and variant
+  information. This pipeline was made possible thanks to significant software and
+  support contributions from both Sentieon and Wang Genomics Lab. For more
+  information on our collaborators, check out their websites:
+  - Sentieon: https://www.sentieon.com/
+  - Wang Genomics Lab: https://wglab.org/
+
+  ## Relevant Softwares and Versions
+  - [samtools head](http://www.htslib.org/doc/samtools-head.html): `1.17`
+  - [samtools fastq](http://www.htslib.org/doc/samtools-fastq.html): `1.15.1`
+  - [Sentieon Minimap2](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#minimap2-binary): `202308.03`
+  - [Sentieon util sort](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#util-binary): `202308.03`
+  - [Sentieon DNAScope HiFi](https://support.sentieon.com/manual/): `202308.03`
+  - [Sentieon LongReadSV](https://support.sentieon.com/manual/): `202308.03`
+  - [LongReadSum](https://github.com/WGLab/LongReadSum#readme): `1.2.0`
+  - [Sniffles](https://github.com/fritzsedlazeck/Sniffles#readme): `2.0.7`
+  - [pbsv](https://github.com/PacificBiosciences/pbsv#readme): `2.9.0`
+
+  ## Input Files
+  - `input_unaligned_bam`: The primary input of the PacBio HiFi Long Reads Workflow is an unaligned BAM. RG fields are required. Only one SM (Sample name) is allowed.
+  - `indexed_reference_fasta`: Any suitable human reference genome. KFDRC uses `Homo_sapiens_assembly38.fasta` from Broad Institute.
+
+  ## Output Files
+  - `dnascope_small_variants`: BGZIP and TABIX indexed VCF containing small variant calls made by Sentieon DNAScope HiFi on `minimap2_aligned_bam`.
+  - `longreadsum_bam_metrics`: BGZIP TAR containing various metrics collected by LongReadSum from the `minimap2_aligned_bam`.
+  - `minimap2_aligned_bam`: Indexed BAM file containing reads from the `input_unaligned_bam` aligned to the `indexed_reference_fasta`.
+  - `pbsv_structural_variants`: BGZIP and TABIX indexed VCF containing structural variant calls made by pbsv on the `minimap2_aligned_bam`.
+  - `sniffles_structural_variants`: BGZIP and TABIX indexed VCF containing structural variant calls made by Sniffles on the `minimap2_aligned_bam`.
+  - `longreadsv_structural_variants`: BGZIP and TABIX indexed VCF containing structural variant calls made by Sentieon LongReadSV on the `minimap2_aligned_bam`.
+
+  ## Generalized Process
+  1. Read group information (`@RG`) is harvested from the `input_unaligned_bam` header using `samtools head` and `grep`.
+  1. Align `input_unaligned_bam` to `indexed_reference_fasta` with the above `@RG` information using samtools fastq, Sentieon Minimap2, and Sentieon sort.
+  1. Generate long reads alignment metrics from the `minimap2_aligned_bam` using LongReadSum.
+  1. Generate structural variant calls from the `minimap2_aligned_bam` using pbsv.
+  1. Generate structural variant calls from the `minimap2_aligned_bam` using Sniffles.
+  1. Generate structural variant calls from the `minimap2_aligned_bam` using Sentieon LongReadSV.
+  1. Generate small variant calls from the `minimap2_aligned_bam` using Sentieon DNAScope HiFi.
+
+  ## Basic Info
+  - [D3b dockerfiles](https://github.com/d3b-center/bixtools)
+  - Testing Tools:
+      - [Seven Bridges Cavatica Platform](https://cavatica.sbgenomics.com/)
+      - [Common Workflow Language reference implementation (cwltool)](https://github.com/common-workflow-language/cwltool/)
+
+  ## References
+  - KFDRC AWS s3 bucket: s3://kids-first-seq-data/broad-references/
+  - Cavatica: https://cavatica.sbgenomics.com/u/kfdrc-harmonization/kf-references/
+  - Broad Institute Google Cloud: https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0/
+requirements:
+- class: InlineJavascriptRequirement
+- class: MultipleInputFeatureRequirement
+- class: ScatterFeatureRequirement
+- class: StepInputExpressionRequirement
+- class: SubworkflowFeatureRequirement
+inputs:
+  input_unaligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: false}],
+    doc: "Unaligned BAM file and index containing HiFi long reads generated by a PacBio sequencer.",
+    "sbg:fileTypes": "BAM"}
+  indexed_reference_fasta: {type: 'File', secondaryFiles: [{pattern: '.fai', required: true},
+      {pattern: '^.dict', required: true}], doc: "Reference fasta and fai index.",
+    "sbg:suggestedValue": {class: File, path: 60639014357c3a53540ca7a3, name: Homo_sapiens_assembly38.fasta,
+      secondaryFiles: [{class: File, path: 60639016357c3a53540ca7af, name: Homo_sapiens_assembly38.fasta.fai},
+        {class: File, path: 60639019357c3a53540ca7e7, name: Homo_sapiens_assembly38.dict}]},
+    "sbg:fileTypes": "FASTA, FA"}
+  output_basename: {type: 'string', doc: "String to use as basename for all workflow\
+      \ outputs."}
+  sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\
+      \ tools.", default: "10.5.64.221:8990"}
+  longreadsum_cpu: {type: 'int?', doc: "CPU Cores for longreadsum to use."}
+  dnascope_cpu: {type: 'int?', doc: "CPU Cores for dnascope to use."}
+  dnascope_ram: {type: 'int?', doc: "RAM (in GB) for dnascope to use."}
+  pbsv_cpu: {type: 'int?', doc: "CPU Cores for pbsv to use."}
+  pbsv_ram: {type: 'int?', doc: "RAM (in GB) for pbsv to use."}
+  sniffles_cpu: {type: 'int?', doc: "CPU Cores for sniffles to use."}
+  sniffles_ram: {type: 'int?', doc: "RAM (in GB) for sniffles to use."}
+outputs:
+  minimap2_aligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: true}],
+    outputSource: dnascope_longread/minimap2_aligned_bam, doc: "Aligned BAM file from Minimap2."}
+  longreadsum_bam_metrics: {type: 'File', outputSource: tar_longreadsum_dir/output,
+    doc: "TAR.GZ file containing longreadsum-generated metrics."}
+  dnascope_small_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}],
+    outputSource: dnascope_longread/dnascope_small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\
+      \ small variant calls."}
+  pbsv_strucutural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}],
+    outputSource: bgzip_tabix_index_pbsv_vcf/output, doc: "VCF.GZ file and index containing\
+      \ pbsv-generated SV calls."}
+  sniffles_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi',
+        required: true}], outputSource: sniffles/output_vcf, doc: "VCF.GZ file and\
+      \ index containing sniffles-generated SV calls."}
+  longreadsv_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi',
+        required: true}], outputSource: dnascope_longread/longreadsv_structural_variants, doc: "VCF.GZ\
+      \ file and index containing Sentieon LongReadSV-generated SV calls."}
+steps:
+  dnascope_longread:
+    run: ../subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl
+    in:
+      input_unaligned_bam: input_unaligned_bam
+      indexed_reference_fasta: indexed_reference_fasta
+      output_basename: output_basename
+      sentieon_license: sentieon_license
+      dnascope_cpu: dnascope_cpu
+      dnascope_ram: {source: dnascope_ram, valueFrom: "$(self == null ? null : self * 1000)"}  # input is in GB; the subworkflow forwards this value unchanged to the tool's ramMin (MB)
+    out: [minimap2_aligned_bam, dnascope_small_variants, longreadsv_structural_variants]
+  longreadsum:
+    hints:
+    - class: "sbg:AWSInstanceType"
+      value: c5.9xlarge
+    run: ../tools/longreadsum.cwl
+    in:
+      input_type:
+        valueFrom: "bam"
+      input_file: dnascope_longread/minimap2_aligned_bam
+      output_dir: output_basename
+      output_basename: output_basename
+      log:
+        valueFrom: "test.log"
+      log_level:
+        valueFrom: "2"
+      cpu: longreadsum_cpu
+    out: [outputs]
+  tar_longreadsum_dir:
+    run: ../tools/tar.cwl
+    in:
+      output_filename:
+        source: output_basename
+        valueFrom: $(self).longreadsum.tar.gz
+      input_dir: longreadsum/outputs
+    out: [output]
+  pbsv_discover:
+    hints:
+    - class: "sbg:AWSInstanceType"
+      value: c5.9xlarge
+    run: ../tools/pbsv_discover.cwl
+    in:
+      input_bam: dnascope_longread/minimap2_aligned_bam
+      output_filename:
+        source: output_basename
+        valueFrom: $(self).pbsv.svsig.gz
+      hifi_preset:
+        valueFrom: |
+          $(1 == 1)
+      cpu: pbsv_cpu
+      ram: pbsv_ram
+    out: [output_svsig]
+  pbsv_call:
+    hints:
+    - class: "sbg:AWSInstanceType"
+      value: c5.9xlarge
+    run: ../tools/pbsv_call.cwl
+    in:
+      reference_fasta: indexed_reference_fasta
+      input_svsig: pbsv_discover/output_svsig
+      output_filename:
+        source: output_basename
+        valueFrom: $(self).pbsv.vcf
+      hifi_preset:
+        valueFrom: |
+          $(1 == 1)
+      cpu: pbsv_cpu
+      ram: pbsv_ram
+    out: [output_vcf]
+  bgzip_tabix_index_pbsv_vcf:
+    run: ../tools/bgzip_tabix_index.cwl
+    in:
+      input_vcf: pbsv_call/output_vcf
+      cpu: pbsv_cpu
+    out: [output]
+  sniffles:
+    hints:
+    - class: "sbg:AWSInstanceType"
+      value: c5.9xlarge
+    run: ../tools/sniffles.cwl
+    in:
+      input_bam:
+        source: dnascope_longread/minimap2_aligned_bam
+        valueFrom: $([self])
+      vcf_output_filename:
+        source: output_basename
+        valueFrom: $(self).sniffles.vcf.gz
+      reference_fasta: indexed_reference_fasta
+      cpu: sniffles_cpu
+      ram: sniffles_ram
+    out: [output_vcf, output_snf]
+$namespaces:
+  sbg: https://sevenbridges.com
+hints:
+- class: "sbg:maxNumberOfParallelInstances"
+  value: 2
+"sbg:license": Apache License 2.0
+"sbg:publisher": KFDRC
+"sbg:categories":
+- ALIGNMENT
+- DNA
+- INDEL
+- LONG
+- LONGREADS
+- LONGREADSUM
+- METRICS
+- NANOCALLER
+- PACBIO
+- PACIFIC
+- PBMM2
+- PBSV
+- SENTIEON
+- SNIFFLES
+- SNP
+- SOMATIC
+- STRUCTURAL
+- SV
+- VARIANT
+- WGS
+- WXS
+"sbg:links":
+- id: 'https://github.com/kids-first/kf-longreads-workflow/releases/tag/v2.0.2'
+  label: github-release

From 49f0ae983c1993f93dab49dfb981d1273c6dba77 Mon Sep 17 00:00:00 2001
From: Haodong Chen <chen.hdong@gmail.com>
Date: Thu, 12 Dec 2024 19:57:21 -0800
Subject: [PATCH 03/11] Update doc

---
 docs/dockers_pacbio.md | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/docs/dockers_pacbio.md b/docs/dockers_pacbio.md
index f804e8f..c262759 100644
--- a/docs/dockers_pacbio.md
+++ b/docs/dockers_pacbio.md
@@ -10,9 +10,6 @@ pbsv_call.cwl|quay.io/biocontainers/pbsv:2.9.0--h9ee0642_0
 pbsv_discover.cwl|quay.io/biocontainers/pbsv:2.9.0--h9ee0642_0
 samtools_head.cwl|staphb/samtools:1.17
 samtools_split.cwl|staphb/samtools:1.17
-sentieon_DNAscope_LongRead.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202112.01_hifi
-sentieon_LongReadSV.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202112.06
-sentieon_ReadWriter.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202112.01_hifi
-sentieon_minimap2.cwl|pgc-images.sbgenomics.com/d3b-bixu/sentieon:202112.01_hifi
+sentieon_DNAscope_LongRead_CLI.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202308.03
 sniffles.cwl|pgc-images.sbgenomics.com/d3b-bixu/sniffles:2.0.7
 tar.cwl|None

From bd37fd27cd3d307e870d87baf5cc17d93ee05fa0 Mon Sep 17 00:00:00 2001
From: Haodong Chen <chen.hdong@gmail.com>
Date: Thu, 12 Dec 2024 23:52:13 -0800
Subject: [PATCH 04/11] Update the PacBio pipeline

---
 tools/sentieon_DNAscope_LongRead.cwl          | 169 ------------------
 tools/sentieon_DNAscope_LongRead_CLI.cwl      |   2 +-
 tools/sentieon_LongReadSV.cwl                 | 134 --------------
 tools/sentieon_ReadWriter.cwl                 |   2 +-
 tools/sentieon_minimap2.cwl                   |   2 +-
 workflows/kfdrc-pacbio-longreads-workflow.cwl |  49 +++--
 6 files changed, 26 insertions(+), 332 deletions(-)
 delete mode 100644 tools/sentieon_DNAscope_LongRead.cwl
 delete mode 100644 tools/sentieon_LongReadSV.cwl

diff --git a/tools/sentieon_DNAscope_LongRead.cwl b/tools/sentieon_DNAscope_LongRead.cwl
deleted file mode 100644
index 14855f2..0000000
--- a/tools/sentieon_DNAscope_LongRead.cwl
+++ /dev/null
@@ -1,169 +0,0 @@
-cwlVersion: v1.2
-class: CommandLineTool
-label: Sentieon_DNAscope_LongRead
-doc: |-
-  This tool uses **Sentieon DNAscope** to call germline variants from PacBio HiFi reads [1].
-
-  ### Input data requirements
-
-  - **Aligned reads**: The pipeline will take PacBio HiFi reads that have been aligned to a reference genome with `pbmm2` or `minimap2`.
-  - **The Reference genome**: A reference genome file in FASTA format with its index file (.fai). 
-
-  ### Common Issues and Important Notes
-
-  * By suppling an optional MHC BED file, additional special handling can be applied to the MHC region to further increase variant calling accuracy.
-  * Currently, the pipeline is only recommended for use with samples from diploid organisms. For samples with both diploid and haploid chromosomes, the `-b INTERVAL` option can be used to limit variant calling to diploid chromosomes.
-
-  ###References
-
-  [1] [https://support.sentieon.com/appnotes/dnascope_hifi/](https://support.sentieon.com/appnotes/dnascope_hifi/)
-
-requirements:
-- class: ShellCommandRequirement
-- class: ResourceRequirement
-  coresMin: |-
-    ${
-        if (inputs.cpu_per_job)
-        {
-            return inputs.cpu_per_job
-        }
-        else
-        {
-            return 36
-        }
-    }
-  ramMin: |-
-    ${
-        if (inputs.mem_per_job)
-        {
-            return inputs.mem_per_job
-        }
-        else
-        {
-            return 71000
-        }
-    }
-- class: DockerRequirement
-  dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202112.01_hifi
-- class: EnvVarRequirement
-  envDef:
-  - envName: SENTIEON_LICENSE
-    envValue: $(inputs.sentieon_license)
-- class: InlineJavascriptRequirement
-
-inputs:
-- id: sentieon_license
-  label: Sentieon license
-  doc: License server host and port
-  type: string
-- id: reference
-  label: Reference
-  doc: Reference fasta with associated fai index
-  type: File
-  secondaryFiles:
-  - pattern: .fai
-    required: true
-  - pattern: ^.dict
-    required: false
-  inputBinding:
-    prefix: -r
-    position: 10
-    shellQuote: false
-  sbg:fileTypes: FA, FASTA
-- id: input_bam
-  label: Input BAM
-  doc: Input BAM file
-  type: File
-  secondaryFiles:
-  - pattern: ^.bai
-    required: false
-  - pattern: ^.crai
-    required: false
-  - pattern: .bai
-    required: false
-  - pattern: .crai
-    required: false
-  inputBinding:
-    prefix: -i
-    position: 11
-    shellQuote: false
-  sbg:fileTypes: BAM, CRAM
-- id: output_file_name
-  label: Output file name
-  doc: The output VCF file name. Must end with ".gz".
-  type: string?
-- id: dbSNP
-  label: dbSNP VCF file
-  doc: |-
-    Supplying this file will annotate variants with their dbSNP refSNP ID numbers. (optional)
-  type: File?
-  secondaryFiles:
-  - pattern: .tbi
-    required: false
-  - pattern: .idx
-    required: false
-  inputBinding:
-    prefix: -d
-    position: 30
-    shellQuote: false
-- id: bed
-  label: Region BED file
-  doc: |-
-    Supplying this file will limit variant calling to the intervals inside the BED file. (optional)
-  type: File?
-  inputBinding:
-    prefix: -b
-    position: 39
-    shellQuote: false
-  sbg:fileTypes: BED
-- id: mhc
-  label: MHC BED file
-  doc: |-
-    Supplying this file will enable the special handling of the MHC region. (optional)
-  type: File?
-  inputBinding:
-    prefix: -B
-    position: 60
-    shellQuote: false
-  sbg:fileTypes: BED
-- id: cpu_per_job
-  label: CPU per job
-  doc: CPU per job
-  type: int?
-- id: mem_per_job
-  label: Memory per job
-  doc: Memory per job[MB]
-  type: int?
-
-outputs:
-- id: output_vcf
-  type: File
-  secondaryFiles:
-  - pattern: .tbi
-    required: true
-  outputBinding:
-    glob: '*.vcf.gz'
-  sbg:fileTypes: VCF.GZ
-
-baseCommand:
-- /bin/bash
-- /opt/dnascope_hifi/DNAscopeHiFiBeta0.4.pipeline/dnascope_HiFi.sh
-arguments:
-- prefix: ''
-  position: 1
-  valueFrom: -m /opt/dnascope_hifi/DNAscopeHiFiBeta0.4.pipeline/DNAscopeHiFiBeta0.4.model
-  shellQuote: false
-- prefix: ''
-  position: 100
-  valueFrom: |-
-    ${
-        if (inputs.output_file_name)
-            return inputs.output_file_name
-        else
-            var basename = inputs.input_bam.nameroot
-            return basename.concat(".vcf.gz")
-    }
-  shellQuote: false
-
-$namespaces:
-  sbg: https://sevenbridges.com
diff --git a/tools/sentieon_DNAscope_LongRead_CLI.cwl b/tools/sentieon_DNAscope_LongRead_CLI.cwl
index d51124c..350506d 100644
--- a/tools/sentieon_DNAscope_LongRead_CLI.cwl
+++ b/tools/sentieon_DNAscope_LongRead_CLI.cwl
@@ -126,7 +126,7 @@ inputs:
     type: File?
     inputBinding:
       position: 10
-      prefix: --haploid-
+      prefix: --haploid-bed
     sbg:fileTypes: BED
   gvcf:
     type: boolean?
diff --git a/tools/sentieon_LongReadSV.cwl b/tools/sentieon_LongReadSV.cwl
deleted file mode 100644
index a3d8ca6..0000000
--- a/tools/sentieon_LongReadSV.cwl
+++ /dev/null
@@ -1,134 +0,0 @@
-cwlVersion: v1.2
-class: CommandLineTool
-id: sentieon_LongReadSV
-doc: |-
-  Sentieon SV calling for PacBio HiFi and Oxford Nanopore long reads.
-  
-  ### Inputs:
-  #### Required
-  - ``Reference``: Location of the reference FASTA file.
-  - ``Input BAM``: Location of the BAM/CRAM input file.
-  - ``Platform``: PacBio HiFi or Oxford Nanopore
-
-requirements:
-- class: ShellCommandRequirement
-- class: InlineJavascriptRequirement
-- class: ResourceRequirement
-  coresMin: $(inputs.cpu) 
-  ramMin: $(inputs.ram * 1000) 
-- class: DockerRequirement
-  dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202112.06
-- class: EnvVarRequirement
-  envDef:
-  - envName: SENTIEON_LICENSE
-    envValue: $(inputs.sentieon_license)
-baseCommand:
-- sentieon
-- driver
-arguments:
-- prefix: '--algo'
-  position: 10
-  valueFrom: LongReadSV
-  shellQuote: false
-inputs:
-  sentieon_license:
-    type: 'string'
-    doc: License server host and port
-  reference:
-    type: 'File'
-    secondaryFiles:
-    - pattern: .fai
-      required: true
-    - pattern: ^.dict
-      required: true
-    inputBinding:
-      prefix: -r
-      position: 0
-      shellQuote: false
-    doc: Reference fasta with associated fai index
-    sbg:fileTypes: FA, FASTA
-  input_bam:
-    type: 'File'
-    secondaryFiles:
-    - pattern: ^.bai
-      required: false
-    - pattern: ^.crai
-      required: false
-    - pattern: .bai
-      required: false
-    - pattern: .crai
-      required: false
-    inputBinding:
-      prefix: -i
-      position: 1
-      shellQuote: false
-    doc: Input BAM file
-    sbg:fileTypes: BAM, CRAM
-  platform:
-    type:
-    - 'null'
-    - name: platform
-      type: enum
-      symbols:
-      - PacBioHiFi
-      - ONT
-    default: PacBioHiFi
-    inputBinding:
-      prefix: --model
-      position: 11
-      shellQuote: true
-      valueFrom: |-
-        ${
-            if (self === "PacBioHiFi") {
-                return "/opt/dnascope_models/SentieonLongReadSVHiFiBeta0.1.model";
-            }
-            else if (self === "ONT") {
-                return "/opt/dnascope_models/SentieonLongReadSVONTBeta0.1.model";
-            }
-            return ""
-         }
-    doc: |-
-      PacBio HiFi or Oxford Nanopore (ONT)
-    sbg:toolDefaultValue: PacBioHiFi
-  min_sv_size:
-    type: 'int?'
-    inputBinding:
-      prefix: --min_sv_size
-      shellQuote: true
-      position: 12
-    doc:  minimum SV size in basepairs to output
-    sbg:toolDefaultValue: 40
-  min_map_qual:
-    type: 'int?'
-    inputBinding:
-      prefix: --min_map_qual
-      shellQuote: true
-      position: 12
-    doc:  minimum read mapping quality
-    sbg:toolDefaultValue: 20
-  output_file_name:
-    type: 'string'
-    inputBinding:
-      position: 100
-      shellQuote: true
-    doc: The output VCF file name. Must end with ".vcf.gz".
-  cpu:
-    type: 'int?'
-    default: 36
-    doc: CPUs to allocate to this task
-  ram:
-    type: 'int?'
-    default: 36
-    doc: GB of RAM to allocate to this task 
-outputs:
-  output_vcf:
-    type: 'File'
-    secondaryFiles:
-    - pattern: .tbi
-      required: true
-    outputBinding:
-      glob: '*.vcf.gz'
-    sbg:fileTypes: VCF.GZ
-
-$namespaces:
-  sbg: https://sevenbridges.com
diff --git a/tools/sentieon_ReadWriter.cwl b/tools/sentieon_ReadWriter.cwl
index cb4dfb7..68a8cc4 100644
--- a/tools/sentieon_ReadWriter.cwl
+++ b/tools/sentieon_ReadWriter.cwl
@@ -17,7 +17,7 @@ requirements:
   ramMin: |
     $(inputs.mem_per_job ? inputs.mem_per_job : 16000)
 - class: DockerRequirement
-  dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202112.01_hifi
+  dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202308.03
 - class: EnvVarRequirement
   envDef:
   - envName: SENTIEON_LICENSE
diff --git a/tools/sentieon_minimap2.cwl b/tools/sentieon_minimap2.cwl
index 5e0bbad..2bfc931 100644
--- a/tools/sentieon_minimap2.cwl
+++ b/tools/sentieon_minimap2.cwl
@@ -18,7 +18,7 @@ requirements:
   coresMin: $(inputs.cpu_per_job)
   ramMin: $(inputs.mem_per_job * 1000)
 - class: DockerRequirement
-  dockerPull: pgc-images.sbgenomics.com/d3b-bixu/sentieon:202112.01_hifi
+  dockerPull: pgc-images.sbgenomic6s.com/hdchen/sentieon:202308.03
 - class: EnvVarRequirement
   envDef:
   - envName: SENTIEON_LICENSE
diff --git a/workflows/kfdrc-pacbio-longreads-workflow.cwl b/workflows/kfdrc-pacbio-longreads-workflow.cwl
index d0bf8d5..42e0e57 100644
--- a/workflows/kfdrc-pacbio-longreads-workflow.cwl
+++ b/workflows/kfdrc-pacbio-longreads-workflow.cwl
@@ -22,10 +22,10 @@ doc: |
   ## Relevant Softwares and Versions
   - [samtools head](http://www.htslib.org/doc/samtools-head.html): `1.17`
   - [samtools fastq](http://www.htslib.org/doc/samtools-fastq.html): `1.15.1`
-  - [Sentieon Minimap2](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#minimap2-binary): `202112.01`
-  - [Sentieon util sort](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#util-binary): `202112.01`
-  - [Sentieon DNAScope HiFi](https://support.sentieon.com/manual/): `202112.01`
-  - [Sentieon LongReadSV](https://support.sentieon.com/manual/): `202112.06`
+  - [Sentieon Minimap2](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#minimap2-binary): `202308.03`
+  - [Sentieon util sort](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#util-binary): `202308.03`
+  - [Sentieon DNAScope HiFi](https://support.sentieon.com/manual/): `202308.03`
+  - [Sentieon LongReadSV](https://support.sentieon.com/manual/): `202308.03`
   - [LongReadSum](https://github.com/WGLab/LongReadSum#readme): `1.2.0`
   - [Sniffles](https://github.com/fritzsedlazeck/Sniffles#readme): `2.0.7`
   - [pbsv](https://github.com/PacificBiosciences/pbsv#readme): `2.9.0`
@@ -112,15 +112,13 @@ inputs:
   pbsv_ram: {type: 'int?', doc: "RAM (in GB) for pbsv to use."}
   sniffles_cpu: {type: 'int?', doc: "CPU Cores for sniffles to use."}
   sniffles_ram: {type: 'int?', doc: "RAM (in GB) for sniffles to use."}
-  longreadsv_cpu: {type: 'int?', doc: "CPU Cores for Sentieon LongReadSV to use."}
-  longreadsv_ram: {type: 'int?', doc: "RAM (in GB) for Sentieon LongReadSV to use."}
 outputs:
   minimap2_aligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: true}],
     outputSource: clt_pickvalue/outfile, doc: "Aligned BAM file from Minimap2."}
   longreadsum_bam_metrics: {type: 'File', outputSource: tar_longreadsum_dir/output,
     doc: "TAR.GZ file containing longreadsum-generated metrics."}
   dnascope_small_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}],
-    outputSource: dnascope/output_vcf, doc: "VCF.GZ file and index containing DNAscope-generated\
+    outputSource: dnascope/small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\
       \ small variant calls."}
   pbsv_strucutural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}],
     outputSource: bgzip_tabix_index_pbsv_vcf/output, doc: "VCF.GZ file and index containing\
@@ -129,7 +127,7 @@ outputs:
         required: true}], outputSource: sniffles/output_vcf, doc: "VCF.GZ file and\
       \ index containing sniffles-generated SV calls."}
   longreadsv_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi',
-        required: true}], outputSource: sentieon_longreadsv/output_vcf, doc: "VCF.GZ\
+        required: true}], outputSource: dnascope/structural_variants, doc: "VCF.GZ\
       \ file and index containing Sentieon LongReadSV-generated SV calls."}
 steps:
   samtools_split:
@@ -228,20 +226,33 @@ steps:
         valueFrom: $(self).longreadsum.tar.gz
       input_dir: longreadsum/outputs
     out: [output]
+  download_model:
+    run: ../tools/download_DNAscope_model.cwl
+    in:
+      model_name:
+        valueFrom: "PacBio_HiFi-WGS"
+    out: [model_bundle]
   dnascope:
-    run: ../tools/sentieon_DNAscope_LongRead.cwl
+    run: ../tools/sentieon_DNAscope_LongRead_CLI.cwl
     when: $(inputs.minimap2_preset != "map-pb")
     in:
       minimap2_preset: minimap2_preset
       sentieon_license: sentieon_license
       reference: indexed_reference_fasta
-      input_bam: clt_pickvalue/outfile
-      output_file_name:
+      input_bam: 
+        source: [clt_pickvalue/outfile]
+        linkMerge: merge_flattened
+      model_bundle: download_model/model_bundle
+      tech:
+        valueFrom: "HiFi"
+      output_vcf:
         source: output_basename
         valueFrom: $(self).dnascope.vcf.gz
+      skip-mosdepth:
+        default: true
       cpu_per_job: dnascope_cpu
       mem_per_job: dnascope_ram
-    out: [output_vcf]
+    out: [small_variants, structural_variants]
   pbsv_discover:
     hints:
     - class: "sbg:AWSInstanceType"
@@ -302,20 +313,6 @@ steps:
       cpu: sniffles_cpu
       ram: sniffles_ram
     out: [output_vcf, output_snf]
-  sentieon_longreadsv:
-    run: ../tools/sentieon_LongReadSV.cwl
-    in:
-      sentieon_license: sentieon_license
-      reference: indexed_reference_fasta
-      input_bam: clt_pickvalue/outfile
-      platform:
-        valueFrom: "PacBioHiFi"
-      output_file_name:
-        source: output_basename
-        valueFrom: $(self).longreadsv.vcf.gz
-      cpu: longreadsv_cpu
-      ram: longreadsv_ram
-    out: [output_vcf]
 $namespaces:
   sbg: https://sevenbridges.com
 hints:

From 3f9640ff34085f8d68963c9092405f9035d39308 Mon Sep 17 00:00:00 2001
From: Haodong Chen <chen.hdong@gmail.com>
Date: Fri, 13 Dec 2024 00:26:38 -0800
Subject: [PATCH 05/11] Fix typo in sentieon_minimap2 Docker image hostname

---
 tools/sentieon_minimap2.cwl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/sentieon_minimap2.cwl b/tools/sentieon_minimap2.cwl
index 2bfc931..b1dc64a 100644
--- a/tools/sentieon_minimap2.cwl
+++ b/tools/sentieon_minimap2.cwl
@@ -18,7 +18,7 @@ requirements:
   coresMin: $(inputs.cpu_per_job)
   ramMin: $(inputs.mem_per_job * 1000)
 - class: DockerRequirement
-  dockerPull: pgc-images.sbgenomic6s.com/hdchen/sentieon:202308.03
+  dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202308.03
 - class: EnvVarRequirement
   envDef:
   - envName: SENTIEON_LICENSE

From 74a073a24cbabc40a35ca2e0efd2a9a9fcee990a Mon Sep 17 00:00:00 2001
From: Haodong Chen <chen.hdong@gmail.com>
Date: Fri, 13 Dec 2024 10:14:27 -0800
Subject: [PATCH 06/11] Allow SV for CLR

---
 ...entieon-pacbio-hifi-longreads-workflow.cwl |  99 --------
 ...reads-workflow-hifi-ubam-single-sample.cwl | 230 ------------------
 workflows/kfdrc-pacbio-longreads-workflow.cwl |   7 +-
 3 files changed, 4 insertions(+), 332 deletions(-)
 delete mode 100644 subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl
 delete mode 100644 workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl

diff --git a/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl b/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl
deleted file mode 100644
index efc5ba9..0000000
--- a/subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl
+++ /dev/null
@@ -1,99 +0,0 @@
-cwlVersion: v1.2
-class: Workflow
-id: sentieon-pacbio-hifi-longreads-workflow
-doc: |
-  Run Sentieon PacBio HiFi workflow
-  Minimap2
-  DNAscope
-  LongReadSV
-requirements:
-- class: InlineJavascriptRequirement
-- class: MultipleInputFeatureRequirement
-- class: ScatterFeatureRequirement
-- class: StepInputExpressionRequirement
-- class: SubworkflowFeatureRequirement
-inputs:
-  input_unaligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: false}],
-    doc: "Unaligned BAM file and index containing long reads generated by a PacBio sequencer.",
-    "sbg:fileTypes": "BAM"}
-  indexed_reference_fasta: {type: 'File', secondaryFiles: [{pattern: '.fai', required: true},
-      {pattern: '^.dict', required: false}], doc: "Reference fasta and fai index.",
-    "sbg:suggestedValue": {class: File, path: 60639014357c3a53540ca7a3, name: Homo_sapiens_assembly38.fasta,
-      secondaryFiles: [{class: File, path: 60639016357c3a53540ca7af, name: Homo_sapiens_assembly38.fasta.fai},
-        {class: File, path: 60639019357c3a53540ca7e7, name: Homo_sapiens_assembly38.dict}]},
-    "sbg:fileTypes": "FASTA, FA"}
-  output_basename: {type: 'string', doc: "String to use as basename for all workflow\
-      \ outputs."}
-  sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\
-      \ tools.", default: "10.5.64.221:8990"}
-  dnascope_cpu: {type: 'int?', doc: "CPU Cores for dnascope to use."}
-  dnascope_ram: {type: 'int?', doc: "RAM (in GB) for dnascope to use."}
-outputs:
-  minimap2_aligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: true}],
-    outputSource: array_to_file/out_alignments, doc: "Aligned BAM file from Minimap2."}
-  dnascope_small_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}],
-    outputSource: sentieon_longread_cli/small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\
-      \ small variant calls."}
-  longreadsv_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi',
-        required: true}], outputSource: sentieon_longread_cli/structural_variants, doc: "VCF.GZ\
-      \ file and index containing Sentieon LongReadSV-generated SV calls."}
-steps:
-  download_model:
-    run: ../tools/download_DNAscope_model.cwl
-    in:
-      model_name: 
-        valueFrom: "PacBio_HiFi-WGS"
-    out: [model_bundle]
-  sentieon_longread_cli:
-    run: ../tools/sentieon_DNAscope_LongRead_CLI.cwl
-    in:
-      input_bam: 
-        source: 
-          - input_unaligned_bam
-        linkMerge: merge_nested
-      reference: indexed_reference_fasta
-      output_vcf:
-        source: output_basename
-        valueFrom: $(self).vcf.gz
-      sentieon_license: sentieon_license
-      model_bundle: download_model/model_bundle
-      align:
-        valueFrom: |
-          $(1 == 1)
-      tech:
-        valueFrom: "HiFi"
-      bam_format:
-        valueFrom: |
-          $(1 == 1)
-      cpu_per_job: dnascope_cpu
-      mem_per_job: dnascope_ram
-    out: [small_variants, structural_variants, out_alignments, mosdepth_out]
-  array_to_file:
-    in:
-      infile:
-        source: sentieon_longread_cli/out_alignments
-        valueFrom: |
-          $(self[0])
-    out: [out_alignments]
-    run:
-      cwlVersion: v1.2
-      class: CommandLineTool
-      requirements:
-        - class: InlineJavascriptRequirement
-      doc: |
-        Select the first item from an array of BAM files.
-      baseCommand: [echo, done]
-      inputs:
-        infile: { type: 'File', secondaryFiles: [{pattern: '.bai', required: true}]}
-      outputs:
-        out_alignments:
-          type: File
-          outputBinding:
-            outputEval: |
-              $(inputs.infile)
-          secondaryFiles:
-          - pattern: .bai
-            required: true
-$namespaces:
-  sbg: https://sevenbridges.com
-
diff --git a/workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl b/workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl
deleted file mode 100644
index 58010c7..0000000
--- a/workflows/kfdrc-pacbio-longreads-workflow-hifi-ubam-single-sample.cwl
+++ /dev/null
@@ -1,230 +0,0 @@
-cwlVersion: v1.2
-class: Workflow
-id: kfdrc-pacbio-longreads-workflow-hifi-single-sample-ubam
-label: Kids First DRC PacBio LongReads Workflow (HiFi single-sample uBAM)
-doc: |
-  # Kids First Data Resource Center Pacific Biosciences Long Reads Alignment and Variant Calling Workflow
-
-  <p align="center">
-    <img src="https://github.com/d3b-center/d3b-research-workflows/raw/master/doc/kfdrc-logo-sm.png">
-  </p>
-
-  The Kids First Data Resource Center (KFDRC) Pacific Biosciences (PacBio)
-  Long Reads Alignment and Variant Calling Workflow is a Common Workflow Language
-  (CWL) implementation of various softwares used to take reads information
-  generated by PacBio long reads sequencers and generate alignment and variant
-  information. This pipeline was made possible thanks to significant software and
-  support contributions from both Sentieon and Wang Genomics Lab. For more
-  information on our collaborators, check out their websites:
-  - Sentieon: https://www.sentieon.com/
-  - Wang Genomics Lab: https://wglab.org/
-
-  ## Relevant Softwares and Versions
-  - [samtools head](http://www.htslib.org/doc/samtools-head.html): `1.17`
-  - [samtools fastq](http://www.htslib.org/doc/samtools-fastq.html): `1.15.1`
-  - [Sentieon Minimap2](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#minimap2-binary): `202308.03`
-  - [Sentieon util sort](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#util-binary): `202308.03`
-  - [Sentieon DNAScope HiFi](https://support.sentieon.com/manual/): `202308.03`
-  - [Sentieon LongReadSV](https://support.sentieon.com/manual/): `202308.03`
-  - [LongReadSum](https://github.com/WGLab/LongReadSum#readme): `1.2.0`
-  - [Sniffles](https://github.com/fritzsedlazeck/Sniffles#readme): `2.0.7`
-  - [pbsv](https://github.com/PacificBiosciences/pbsv#readme): `2.9.0`
-
-  ## Input Files
-  - `input_unaligned_bam`: The primary input of the PacBio HiFi Long Reads Workflow is an unaligned BAM. RG fields are required. Only one SM (Sample name) is allowed.
-  - `indexed_reference_fasta`: Any suitable human reference genome. KFDRC uses `Homo_sapiens_assembly38.fasta` from Broad Institute.
-
-  ## Output Files
-  - `dnascope_small_variants`: BGZIP and TABIX indexed VCF containing small variant calls made by Sentieon DNAScope HiFi on `minimap2_aligned_bam`.
-  - `longreadsum_bam_metrics`: BGZIP TAR containing various metrics collected by LongReadSum from the `minimap2_aligned_bam`.
-  - `minimap2_aligned_bam`: Indexed BAM file containing reads from the `input_unaligned_bam` aligned to the `indexed_reference_fasta`.
-  - `pbsv_structural_variants`: BGZIP and TABIX indexed VCF containing structural variant calls made by pbsv on the `minimap2_aligned_bam`.
-  - `sniffles_structural_variants`: BGZIP and TABIX indexed VCF containing structural variant calls made by Sniffles on the `minimap2_aligned_bam`.
-  - `longreadsv_structural_variants`: BGZIP and TABIX indexed VCF containing structural variant calls made by Sentieon LongReadSV on the `minimap2_aligned_bam`.
-
-  ## Generalized Process
-  1. Read group information (`@RG`) is harvested from the `input_unaligned_bam` header using `samtools head` and `grep`.
-  1. Align `input_unaligned_bam` to `indexed_reference_fasta` with the above `@RG` information using samtools fastq, Sentieon Minimap2, and Sentieon sort.
-  1. Generate long reads alignment metrics from the `minimap2_aligned_bam` using LongReadSum.
-  1. Generate structural variant calls from the `minimap2_aligned_bam` using pbsv.
-  1. Generate structural variant calls from the `minimap2_aligned_bam` using Sniffles.
-  1. Generate structural variant calls from the `minimap2_aligned_bam` using Sentieon LongReadSV.
-  1. Generate small variant from the `minimap2_aligned_bam` using Sentieon DNAScope HiFi.
-
-  ## Basic Info
-  - [D3b dockerfiles](https://github.com/d3b-center/bixtools)
-  - Testing Tools:
-      - [Seven Bridges Cavatica Platform](https://cavatica.sbgenomics.com/)
-      - [Common Workflow Language reference implementation (cwltool)](https://github.com/common-workflow-language/cwltool/)
-
-  ## References
-  - KFDRC AWS s3 bucket: s3://kids-first-seq-data/broad-references/
-  - Cavatica: https://cavatica.sbgenomics.com/u/kfdrc-harmonization/kf-references/
-  - Broad Institute Goolge Cloud: https://console.cloud.google.com/storage/browser/genomics-public-data/resources/broad/hg38/v0/
-requirements:
-- class: InlineJavascriptRequirement
-- class: MultipleInputFeatureRequirement
-- class: ScatterFeatureRequirement
-- class: StepInputExpressionRequirement
-- class: SubworkflowFeatureRequirement
-inputs:
-  input_unaligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: false}],
-    doc: "Unaligned BAM file and index containing HiFi long reads generated by a PacBio sequencer.",
-    "sbg:fileTypes": "BAM"}
-  indexed_reference_fasta: {type: 'File', secondaryFiles: [{pattern: '.fai', required: true},
-      {pattern: '^.dict', required: true}], doc: "Reference fasta and fai index.",
-    "sbg:suggestedValue": {class: File, path: 60639014357c3a53540ca7a3, name: Homo_sapiens_assembly38.fasta,
-      secondaryFiles: [{class: File, path: 60639016357c3a53540ca7af, name: Homo_sapiens_assembly38.fasta.fai},
-        {class: File, path: 60639019357c3a53540ca7e7, name: Homo_sapiens_assembly38.dict}]},
-    "sbg:fileTypes": "FASTA, FA"}
-  output_basename: {type: 'string', doc: "String to use as basename for all workflow\
-      \ outputs."}
-  sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\
-      \ tools.", default: "10.5.64.221:8990"}
-  longreadsum_cpu: {type: 'int?', doc: "CPU Cores for longreadsum to use."}
-  dnascope_cpu: {type: 'int?', doc: "CPU Cores for dnascope to use."}
-  dnascope_ram: {type: 'int?', doc: "RAM (in GB) for dnascope to use."}
-  pbsv_cpu: {type: 'int?', doc: "CPU Cores for pbsv to use."}
-  pbsv_ram: {type: 'int?', doc: "RAM (in GB) for pbsv to use."}
-  sniffles_cpu: {type: 'int?', doc: "CPU Cores for sniffles to use."}
-  sniffles_ram: {type: 'int?', doc: "RAM (in GB) for sniffles to use."}
-outputs:
-  minimap2_aligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: true}],
-    outputSource: dnascope_longread/minimap2_aligned_bam, doc: "Aligned BAM file from Minimap2."}
-  longreadsum_bam_metrics: {type: 'File', outputSource: tar_longreadsum_dir/output,
-    doc: "TAR.GZ file containing longreadsum-generated metrics."}
-  dnascope_small_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}],
-    outputSource: dnascope_longread/dnascope_small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\
-      \ small variant calls."}
-  pbsv_strucutural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}],
-    outputSource: bgzip_tabix_index_pbsv_vcf/output, doc: "VCF.GZ file and index containing\
-      \ pbsv-generated SV calls."}
-  sniffles_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi',
-        required: true}], outputSource: sniffles/output_vcf, doc: "VCF.GZ file and\
-      \ index containing sniffles-generated SV calls."}
-  longreadsv_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi',
-        required: true}], outputSource: dnascope_longread/longreadsv_structural_variants, doc: "VCF.GZ\
-      \ file and index containing Sentieon LongReadSV-generated SV calls."}
-steps:
-  dnascope_longread:
-    run: ../subworkflows/sentieon-pacbio-hifi-longreads-workflow.cwl
-    in: 
-      input_unaligned_bam: input_unaligned_bam
-      indexed_reference_fasta: indexed_reference_fasta
-      output_basename: output_basename
-      sentieon_license: sentieon_license
-      dnascope_cpu: dnascope_cpu
-      dnascope_ram: dnascope_ram
-    out: [minimap2_aligned_bam, dnascope_small_variants, longreadsv_structural_variants]
-  longreadsum:
-    hints:
-    - class: "sbg:AWSInstanceType"
-      value: c5.9xlarge
-    run: ../tools/longreadsum.cwl
-    in:
-      input_type:
-        valueFrom: "bam"
-      input_file: dnascope_longread/minimap2_aligned_bam
-      output_dir: output_basename
-      output_basename: output_basename
-      log:
-        valueFrom: "test.log"
-      log_level:
-        valueFrom: "2"
-      cpu: longreadsum_cpu
-    out: [outputs]
-  tar_longreadsum_dir:
-    run: ../tools/tar.cwl
-    in:
-      output_filename:
-        source: output_basename
-        valueFrom: $(self).longreadsum.tar.gz
-      input_dir: longreadsum/outputs
-    out: [output]
-  pbsv_discover:
-    hints:
-    - class: "sbg:AWSInstanceType"
-      value: c5.9xlarge
-    run: ../tools/pbsv_discover.cwl
-    in:
-      input_bam: dnascope_longread/minimap2_aligned_bam
-      output_filename:
-        source: output_basename
-        valueFrom: $(self).pbsv.svsig.gz
-      hifi_preset:
-        valueFrom: |
-          $(1 == 1)
-      cpu: pbsv_cpu
-      ram: pbsv_ram
-    out: [output_svsig]
-  pbsv_call:
-    hints:
-    - class: "sbg:AWSInstanceType"
-      value: c5.9xlarge
-    run: ../tools/pbsv_call.cwl
-    in:
-      reference_fasta: indexed_reference_fasta
-      input_svsig: pbsv_discover/output_svsig
-      output_filename:
-        source: output_basename
-        valueFrom: $(self).pbsv.vcf
-      hifi_preset:
-        valueFrom: |
-          $(1 == 1)
-      cpu: pbsv_cpu
-      ram: pbsv_ram
-    out: [output_vcf]
-  bgzip_tabix_index_pbsv_vcf:
-    run: ../tools/bgzip_tabix_index.cwl
-    in:
-      input_vcf: pbsv_call/output_vcf
-      cpu: pbsv_cpu
-    out: [output]
-  sniffles:
-    hints:
-    - class: "sbg:AWSInstanceType"
-      value: c5.9xlarge
-    run: ../tools/sniffles.cwl
-    in:
-      input_bam:
-        source: dnascope_longread/minimap2_aligned_bam
-        valueFrom: $([self])
-      vcf_output_filename:
-        source: output_basename
-        valueFrom: $(self).sniffles.vcf.gz
-      reference_fasta: indexed_reference_fasta
-      cpu: sniffles_cpu
-      ram: sniffles_ram
-    out: [output_vcf, output_snf]
-$namespaces:
-  sbg: https://sevenbridges.com
-hints:
-- class: "sbg:maxNumberOfParallelInstances"
-  value: 2
-"sbg:license": Apache License 2.0
-"sbg:publisher": KFDRC
-"sbg:categories":
-- ALIGNMENT
-- DNA
-- INDEL
-- LONG
-- LONGREADS
-- LONGREADSUM
-- METRICS
-- NANOCALLER
-- PACBIO
-- PACIFIC
-- PBMM2
-- PBSV
-- SENTIEON
-- SNIFFLES
-- SNP
-- SOMATIC
-- STRUCTURAL
-- SV
-- VARIANT
-- WGS
-- WXS
-"sbg:links":
-- id: 'https://github.com/kids-first/kf-longreads-workflow/releases/tag/v2.0.2'
-  label: github-release
diff --git a/workflows/kfdrc-pacbio-longreads-workflow.cwl b/workflows/kfdrc-pacbio-longreads-workflow.cwl
index 42e0e57..d178457 100644
--- a/workflows/kfdrc-pacbio-longreads-workflow.cwl
+++ b/workflows/kfdrc-pacbio-longreads-workflow.cwl
@@ -117,7 +117,7 @@ outputs:
     outputSource: clt_pickvalue/outfile, doc: "Aligned BAM file from Minimap2."}
   longreadsum_bam_metrics: {type: 'File', outputSource: tar_longreadsum_dir/output,
     doc: "TAR.GZ file containing longreadsum-generated metrics."}
-  dnascope_small_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}],
+  dnascope_small_variants: {type: 'File?', secondaryFiles: [{pattern: '.tbi', required: true}],
     outputSource: dnascope/small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\
       \ small variant calls."}
   pbsv_strucutural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi', required: true}],
@@ -234,9 +234,7 @@ steps:
     out: [model_bundle]
   dnascope:
     run: ../tools/sentieon_DNAscope_LongRead_CLI.cwl
-    when: $(inputs.minimap2_preset != "map-pb")
     in:
-      minimap2_preset: minimap2_preset
       sentieon_license: sentieon_license
       reference: indexed_reference_fasta
       input_bam: 
@@ -250,6 +248,9 @@ steps:
         valueFrom: $(self).dnascope.vcf.gz
       skip-mosdepth:
         default: true
+      skip-small-variants:
+        source: minimap2_preset
+        valueFrom: $(inputs.minimap2_preset == "map-pb")
       cpu_per_job: dnascope_cpu
       mem_per_job: dnascope_ram
     out: [small_variants, structural_variants]

From 425873bd2d9c5b4d4984b1c23e7f4fa1eca30bdb Mon Sep 17 00:00:00 2001
From: Haodong Chen <chen.hdong@gmail.com>
Date: Fri, 13 Dec 2024 12:00:19 -0800
Subject: [PATCH 07/11] Use self in skip-small-variants valueFrom expression

---
 workflows/kfdrc-pacbio-longreads-workflow.cwl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/workflows/kfdrc-pacbio-longreads-workflow.cwl b/workflows/kfdrc-pacbio-longreads-workflow.cwl
index d178457..6cee14c 100644
--- a/workflows/kfdrc-pacbio-longreads-workflow.cwl
+++ b/workflows/kfdrc-pacbio-longreads-workflow.cwl
@@ -250,7 +250,7 @@ steps:
         default: true
       skip-small-variants:
         source: minimap2_preset
-        valueFrom: $(inputs.minimap2_preset == "map-pb")
+        valueFrom: $(self == "map-pb")
       cpu_per_job: dnascope_cpu
       mem_per_job: dnascope_ram
     out: [small_variants, structural_variants]

From b021bcf96656e9a51cfdf14b2468c3438cb667c1 Mon Sep 17 00:00:00 2001
From: Haodong Chen <chen.hdong@gmail.com>
Date: Thu, 16 Jan 2025 14:05:56 -0800
Subject: [PATCH 08/11] Add sentieon-cli to the ONT workflow

---
 workflows/kfdrc-ont-longreads-workflow.cwl | 38 +++++++++++++++-------
 1 file changed, 26 insertions(+), 12 deletions(-)

diff --git a/workflows/kfdrc-ont-longreads-workflow.cwl b/workflows/kfdrc-ont-longreads-workflow.cwl
index ecb1d9d..7733d9b 100644
--- a/workflows/kfdrc-ont-longreads-workflow.cwl
+++ b/workflows/kfdrc-ont-longreads-workflow.cwl
@@ -190,8 +190,8 @@ inputs:
   cutesv_ram: {type: 'int?', doc: "RAM (in GB) for cutesv to use."}
   sniffles_cpu: {type: 'int?', doc: "CPU Cores for sniffles to use."}
   sniffles_ram: {type: 'int?', doc: "RAM (in GB) for sniffles to use."}
-  longreadsv_cpu: {type: 'int?', doc: "CPU Cores for Sentieon LongReadSV to use."}
-  longreadsv_ram: {type: 'int?', doc: "RAM (in GB) for Sentieon LongReadSV to use."}
+  dnascope_cpu: {type: 'int?', doc: "CPU Cores for Sentieon DNAscope to use."}
+  dnascope_ram: {type: 'int?', doc: "RAM (in GB) for Sentieon DNAscope to use."}
 outputs:
   minimap2_aligned_bam: {type: 'File', secondaryFiles: [{pattern: '.bai', required: true}],
     outputSource: clt_pickvalue/outfile, doc: "Aligned BAM file from Minimap2."}
@@ -206,8 +206,11 @@ outputs:
   sniffles_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi',
         required: true}], outputSource: sniffles/output_vcf, doc: "VCF.GZ file and\
       \ index containing sniffles-generated SV calls."}
+  dnascope_small_variants: {type: 'File?', secondaryFiles: [{pattern: '.tbi', required: true}],
+    outputSource: dnascope/small_variants, doc: "VCF.GZ file and index containing DNAscope-generated\
+      \ small variant calls."}
   longreadsv_structural_variants: {type: 'File', secondaryFiles: [{pattern: '.tbi',
-        required: true}], outputSource: sentieon_longreadsv/output_vcf, doc: "VCF.GZ\
+        required: true}], outputSource: dnascope/structural_variants, doc: "VCF.GZ\
       \ file and index containing Sentieon LongReadSV-generated SV calls."}
 steps:
   samtools_split:
@@ -281,20 +284,31 @@ steps:
           $(self[0] == null ? self[1][0] : self[0])
       cpu: minimap2_cpu
     out: [outfile]
-  sentieon_longreadsv:
-    run: ../tools/sentieon_LongReadSV.cwl
+  download_model:
+    run: ../tools/download_DNAscope_model.cwl
+    in:
+      model_name:
+        valueFrom: "Oxford_Nanopore-WGS"
+    out: [model_bundle]
+  dnascope:
+    run: ../tools/sentieon_DNAscope_LongRead_CLI.cwl
     in:
       sentieon_license: sentieon_license
       reference: indexed_reference_fasta
-      input_bam: clt_pickvalue/outfile
-      platform:
+      input_bam: 
+        source: [clt_pickvalue/outfile]
+        linkMerge: merge_flattened
+      model_bundle: download_model/model_bundle
+      tech:
         valueFrom: "ONT"
-      output_file_name:
+      output_vcf:
         source: output_basename
-        valueFrom: $(self).longreadsv.vcf.gz
-      cpu: longreadsv_cpu
-      ram: longreadsv_ram
-    out: [output_vcf]
+        valueFrom: $(self).dnascope.vcf.gz
+      skip-mosdepth:
+        default: true
+      cpu_per_job: dnascope_cpu
+      mem_per_job: dnascope_ram
+    out: [small_variants, structural_variants]
   longreadsum:
     run: ../tools/longreadsum.cwl
     hints:

From 359f1f55c1f855a6840dc7b63c8913b0e69f33d3 Mon Sep 17 00:00:00 2001
From: dmiller15 <dmiller5191@gmail.com>
Date: Fri, 14 Feb 2025 11:17:00 -0500
Subject: [PATCH 09/11] :wrench: cleanup changes from Sentieon

---
 docs/dockers_ont.md                           |  6 ++--
 docs/dockers_pacbio.md                        |  2 ++
 scripts/get_dnascope_model.py                 | 28 +++++++++++++++++
 tools/download_DNAscope_model.cwl             | 31 ++-----------------
 tools/sentieon_DNAscope_LongRead_CLI.cwl      | 30 ++++--------------
 workflows/kfdrc-ont-longreads-workflow.cwl    | 17 ++++------
 workflows/kfdrc-pacbio-longreads-workflow.cwl |  9 ++----
 7 files changed, 49 insertions(+), 74 deletions(-)
 create mode 100644 scripts/get_dnascope_model.py

diff --git a/docs/dockers_ont.md b/docs/dockers_ont.md
index 834b635..80eec55 100644
--- a/docs/dockers_ont.md
+++ b/docs/dockers_ont.md
@@ -12,8 +12,8 @@ nanocaller_merge.cwl|genomicslab/nanocaller:3.2.0
 samtools_coverage.cwl|staphb/samtools:1.17
 samtools_head.cwl|staphb/samtools:1.17
 samtools_split.cwl|staphb/samtools:1.17
-sentieon_LongReadSV.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202112.06
-sentieon_ReadWriter.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202112.01_hifi
-sentieon_minimap2.cwl|pgc-images.sbgenomics.com/d3b-bixu/sentieon:202112.01_hifi
+sentieon_DNAscope_LongRead_CLI.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202308.03
+sentieon_ReadWriter.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202308.03
+sentieon_minimap2.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202308.03
 sniffles.cwl|pgc-images.sbgenomics.com/d3b-bixu/sniffles:2.0.7
 tar.cwl|None
diff --git a/docs/dockers_pacbio.md b/docs/dockers_pacbio.md
index c262759..73343fd 100644
--- a/docs/dockers_pacbio.md
+++ b/docs/dockers_pacbio.md
@@ -11,5 +11,7 @@ pbsv_discover.cwl|quay.io/biocontainers/pbsv:2.9.0--h9ee0642_0
 samtools_head.cwl|staphb/samtools:1.17
 samtools_split.cwl|staphb/samtools:1.17
 sentieon_DNAscope_LongRead_CLI.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202308.03
+sentieon_ReadWriter.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202308.03
+sentieon_minimap2.cwl|pgc-images.sbgenomics.com/hdchen/sentieon:202308.03
 sniffles.cwl|pgc-images.sbgenomics.com/d3b-bixu/sniffles:2.0.7
 tar.cwl|None
diff --git a/scripts/get_dnascope_model.py b/scripts/get_dnascope_model.py
new file mode 100644
index 0000000..54ae516
--- /dev/null
+++ b/scripts/get_dnascope_model.py
@@ -0,0 +1,28 @@
+#!/usr/bin/env python3
+"""Download the bundle at DNAscope_bundles[A][B] for a model name "A-B"; write empty.bundle on failure."""
+import argparse
+import yaml
+import requests
+import sys
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Download DNAscope model bundle")
+    parser.add_argument("model_name", help="the name of the model bundle, e.g. PacBio_HiFi-WGS")
+    model_name = parser.parse_args().model_name.split("-")
+    sentieon_models_yaml = "https://github.com/Sentieon/sentieon-models/raw/refs/heads/main/sentieon_models.yaml"
+    response = requests.get(sentieon_models_yaml, allow_redirects=True)
+    content = yaml.safe_load(response.content.decode("utf-8"))
+    try:
+        url = content["DNAscope_bundles"][model_name[0]][model_name[1]]
+        r = requests.get(url, allow_redirects=True)
+        r.raise_for_status()  # an HTTP error page must never be saved as a model bundle
+        with open(url.split("/")[-1], 'wb') as bundle:
+            bundle.write(r.content)
+    except Exception:  # best effort: an unknown model or failed download leaves a placeholder
+        open('empty.bundle', 'wb').close()
+    print(f'Models updated on: {content["Updated on"]}', file=sys.stderr)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/download_DNAscope_model.cwl b/tools/download_DNAscope_model.cwl
index 73b5571..1eb3bc4 100644
--- a/tools/download_DNAscope_model.cwl
+++ b/tools/download_DNAscope_model.cwl
@@ -12,35 +12,8 @@ requirements:
   - class: InitialWorkDirRequirement
     listing:
       - entryname: get_dnascope_model.py
-        entry: |
-          #!/usr/bin/env python3
-          
-          import argparse
-          import yaml
-          import requests
-          import sys
-          
-          def main():
-              parser = argparse.ArgumentParser(description="Download DNAscope model bundle")
-              parser.add_argument("model_name", help="the name of the model bundle, e.g. Illumina_WGS")
-              args = parser.parse_args()
-              model_name = args.model_name.split("-")
-              sentieon_models_yaml = "https://github.com/Sentieon/sentieon-models/raw/refs/heads/main/sentieon_models.yaml"
-              response = requests.get(sentieon_models_yaml, allow_redirects=True)
-              content = response.content.decode("utf-8")
-              content = yaml.safe_load(content)
-              try:
-                  url = content["DNAscope_bundles"][model_name[0]][model_name[1]]
-                  r = requests.get(url, allow_redirects=True)
-                  open(url.split("/")[-1], 'wb').write(r.content)
-              except:
-                  open('empty.bundle', 'wb')
-              print('Models updated on: ' + content["Updated on"], file=sys.stderr)
-          
-          if __name__ == '__main__':
-              main()
-
-
+        entry:
+          $include: ../scripts/get_dnascope_model.py
 arguments:
   - position: 0
     valueFrom: 'pip install pyyaml requests;'
diff --git a/tools/sentieon_DNAscope_LongRead_CLI.cwl b/tools/sentieon_DNAscope_LongRead_CLI.cwl
index 350506d..cdd383b 100644
--- a/tools/sentieon_DNAscope_LongRead_CLI.cwl
+++ b/tools/sentieon_DNAscope_LongRead_CLI.cwl
@@ -1,6 +1,6 @@
 cwlVersion: v1.2
 class: CommandLineTool
-label: Sentieon_DNAscope_LongRead
+id: sentieon_DNAscope_LongRead_CLI
 doc: |-
   This tool uses **Sentieon DNAscope** to call germline variants from PacBio HiFi reads [1].
 
@@ -11,28 +11,8 @@ doc: |-
 requirements:
 - class: ShellCommandRequirement
 - class: ResourceRequirement
-  coresMin: |-
-    ${
-        if (inputs.cpu_per_job)
-        {
-            return inputs.cpu_per_job
-        }
-        else
-        {
-            return 36
-        }
-    }
-  ramMin: |-
-    ${
-        if (inputs.mem_per_job)
-        {
-            return inputs.mem_per_job
-        }
-        else
-        {
-            return 71000
-        }
-    }
+  coresMin: $(inputs.cpu_per_job)
+  ramMin: $(inputs.mem_per_job * 1000)
 - class: DockerRequirement
   dockerPull: pgc-images.sbgenomics.com/hdchen/sentieon:202308.03
 - class: EnvVarRequirement
@@ -204,10 +184,12 @@ inputs:
     label: CPU per job
     doc: CPU per job
     type: int?
+    default: 36
   mem_per_job:
     label: Memory per job
-    doc: Memory per job[MB].
+    doc: Memory per job[GB].
     type: int?
+    default: 71
 
 outputs:
   small_variants:
diff --git a/workflows/kfdrc-ont-longreads-workflow.cwl b/workflows/kfdrc-ont-longreads-workflow.cwl
index 7733d9b..fd87cd4 100644
--- a/workflows/kfdrc-ont-longreads-workflow.cwl
+++ b/workflows/kfdrc-ont-longreads-workflow.cwl
@@ -22,9 +22,9 @@ doc: |
   ## Relevant Softwares and Versions
   - [samtools head](http://www.htslib.org/doc/samtools-head.html): `1.17`
   - [samtools fastq](http://www.htslib.org/doc/samtools-fastq.html): `1.15.1`
-  - [Sentieon Minimap2](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#minimap2-binary): `202112.01`
-  - [Sentieon util sort](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#util-binary): `202112.01`
-  - [Sentieon LongReadSV](https://support.sentieon.com/manual/): `202112.06`
+  - [Sentieon Minimap2](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#minimap2-binary): `202308.03`
+  - [Sentieon util sort](https://support.sentieon.com/manual/usages/general/?highlight=minimap2#util-binary): `202308.03`
+  - [Sentieon LongReadSV](https://support.sentieon.com/manual/): `202308.03`
   - [LongReadSum](https://github.com/WGLab/LongReadSum#readme): `1.2.0`
   - [Sniffles](https://github.com/fritzsedlazeck/Sniffles#readme): `2.0.7`
   - [CuteSV](https://github.com/tjiangHIT/cuteSV#readme): `2.0.3`
@@ -90,6 +90,7 @@ inputs:
       \ header."}
   sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\
       \ tools.", default: "10.5.64.221:8990"}
+  sentieon_dnascope_model: { type: 'File', doc: "Sentieon DNAscoep model bundle." }
   minimap2_preset:
     type:
     - name: minimap2_preset
@@ -284,21 +285,15 @@ steps:
           $(self[0] == null ? self[1][0] : self[0])
       cpu: minimap2_cpu
     out: [outfile]
-  download_model:
-    run: ../tools/download_DNAscope_model.cwl
-    in:
-      model_name:
-        valueFrom: "Oxford_Nanopore-WGS"
-    out: [model_bundle]
   dnascope:
     run: ../tools/sentieon_DNAscope_LongRead_CLI.cwl
     in:
       sentieon_license: sentieon_license
       reference: indexed_reference_fasta
-      input_bam: 
+      input_bam:
         source: [clt_pickvalue/outfile]
         linkMerge: merge_flattened
-      model_bundle: download_model/model_bundle
+      model_bundle: sentieon_dnascope_model
       tech:
         valueFrom: "ONT"
       output_vcf:
diff --git a/workflows/kfdrc-pacbio-longreads-workflow.cwl b/workflows/kfdrc-pacbio-longreads-workflow.cwl
index 6cee14c..0ffee9b 100644
--- a/workflows/kfdrc-pacbio-longreads-workflow.cwl
+++ b/workflows/kfdrc-pacbio-longreads-workflow.cwl
@@ -84,6 +84,7 @@ inputs:
       \ this value will override the SM value provided in the input_unaligned_bam."}
   sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\
       \ tools.", default: "10.5.64.221:8990"}
+  sentieon_dnascope_model: { type: 'File', doc: "Sentieon DNAscoep model bundle." }
   minimap2_preset:
     type:
     - name: minimap2_preset
@@ -226,12 +227,6 @@ steps:
         valueFrom: $(self).longreadsum.tar.gz
       input_dir: longreadsum/outputs
     out: [output]
-  download_model:
-    run: ../tools/download_DNAscope_model.cwl
-    in:
-      model_name:
-        valueFrom: "PacBio_HiFi-WGS"
-    out: [model_bundle]
   dnascope:
     run: ../tools/sentieon_DNAscope_LongRead_CLI.cwl
     in:
@@ -240,7 +235,7 @@ steps:
       input_bam: 
         source: [clt_pickvalue/outfile]
         linkMerge: merge_flattened
-      model_bundle: download_model/model_bundle
+      model_bundle: sentieon_dnascope_model
       tech:
         valueFrom: "HiFi"
       output_vcf:

From 0eb1d0afdc3b958c7b051de2bc79ae5d7821e3fd Mon Sep 17 00:00:00 2001
From: dmiller15 <dmiller5191@gmail.com>
Date: Mon, 17 Feb 2025 10:12:42 -0500
Subject: [PATCH 10/11] :bug: optional longreadsv

---
 workflows/kfdrc-pacbio-longreads-workflow.cwl | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/workflows/kfdrc-pacbio-longreads-workflow.cwl b/workflows/kfdrc-pacbio-longreads-workflow.cwl
index 0ffee9b..6e3e6b0 100644
--- a/workflows/kfdrc-pacbio-longreads-workflow.cwl
+++ b/workflows/kfdrc-pacbio-longreads-workflow.cwl
@@ -245,7 +245,10 @@ steps:
         default: true
       skip-small-variants:
         source: minimap2_preset
-        valueFrom: $(self == "map-pb")
+        valueFrom: $(self != "map-hifi")
+      skip-svs:
+        source: minimap2_preset
+        valueFrom: $(self != "map-hifi")
       cpu_per_job: dnascope_cpu
       mem_per_job: dnascope_ram
     out: [small_variants, structural_variants]

From 6c25e0d70cc84dcff3d605b1450d5c25089d7386 Mon Sep 17 00:00:00 2001
From: Dan Miller <dmiller15@users.noreply.github.com>
Date: Thu, 27 Feb 2025 11:26:12 -0500
Subject: [PATCH 11/11] :broom: Cleanup typos

Co-authored-by: Miguel Brown <miguel.a.brown@gmail.com>
---
 tools/sentieon_DNAscope_LongRead_CLI.cwl      | 2 +-
 workflows/kfdrc-ont-longreads-workflow.cwl    | 2 +-
 workflows/kfdrc-pacbio-longreads-workflow.cwl | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/sentieon_DNAscope_LongRead_CLI.cwl b/tools/sentieon_DNAscope_LongRead_CLI.cwl
index cdd383b..d6ec424 100644
--- a/tools/sentieon_DNAscope_LongRead_CLI.cwl
+++ b/tools/sentieon_DNAscope_LongRead_CLI.cwl
@@ -80,7 +80,7 @@ inputs:
       prefix: -m
   tech:
     type: string?
-    doc: "{HiFi,ONT}     Sequencing technology used to generate the reads. (default: 'HiFi')"
+    doc: "{HiFi,ONT} Sequencing technology used to generate the reads. (default: 'HiFi')"
     inputBinding:
       position: 7
       prefix: --tech
diff --git a/workflows/kfdrc-ont-longreads-workflow.cwl b/workflows/kfdrc-ont-longreads-workflow.cwl
index fd87cd4..a5fffc4 100644
--- a/workflows/kfdrc-ont-longreads-workflow.cwl
+++ b/workflows/kfdrc-ont-longreads-workflow.cwl
@@ -90,7 +90,7 @@ inputs:
       \ header."}
   sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\
       \ tools.", default: "10.5.64.221:8990"}
-  sentieon_dnascope_model: { type: 'File', doc: "Sentieon DNAscoep model bundle." }
+  sentieon_dnascope_model: { type: 'File', doc: "Sentieon DNAscope model bundle." }
   minimap2_preset:
     type:
     - name: minimap2_preset
diff --git a/workflows/kfdrc-pacbio-longreads-workflow.cwl b/workflows/kfdrc-pacbio-longreads-workflow.cwl
index 6e3e6b0..d457571 100644
--- a/workflows/kfdrc-pacbio-longreads-workflow.cwl
+++ b/workflows/kfdrc-pacbio-longreads-workflow.cwl
@@ -84,7 +84,7 @@ inputs:
       \ this value will override the SM value provided in the input_unaligned_bam."}
   sentieon_license: {type: 'string?', doc: "License server host and port for Sentieon\
       \ tools.", default: "10.5.64.221:8990"}
-  sentieon_dnascope_model: { type: 'File', doc: "Sentieon DNAscoep model bundle." }
+  sentieon_dnascope_model: { type: 'File', doc: "Sentieon DNAscope model bundle." }
   minimap2_preset:
     type:
     - name: minimap2_preset