From 21fa9afcdfc5f9e41b869f1ad7f31498ea0cdea9 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 29 Apr 2021 15:11:38 -0400
Subject: [PATCH 1/3] start to annotate

---
 pipes/WDL/workflows/sarscov2_illumina_full.wdl | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pipes/WDL/workflows/sarscov2_illumina_full.wdl b/pipes/WDL/workflows/sarscov2_illumina_full.wdl
index c58e4f994..b100632c4 100644
--- a/pipes/WDL/workflows/sarscov2_illumina_full.wdl
+++ b/pipes/WDL/workflows/sarscov2_illumina_full.wdl
@@ -258,6 +258,7 @@ workflow sarscov2_illumina_full {
 
     ### filter and concatenate final sets for delivery ("passing" and "submittable")
     call sarscov2.sc2_meta_final {
+      # this decorates assembly_meta_tsv with collab/internal IDs, genome_status, and many other columns
       input:
         assembly_stats_tsv = assembly_meta_tsv.combined,
         collab_ids_tsv = collab_ids_tsv,
@@ -266,17 +267,20 @@ workflow sarscov2_illumina_full {
         genome_status_json = filter_bad_ntc_batches.fail_meta_json
     }
     call utils.concatenate as passing_cat_prefilter {
+      # this emits a fasta of only genomes that pass min_unambig
       input:
         infiles     = select_all(passing_assemblies),
         output_name = "assemblies_passing-~{flowcell_id}.prefilter.fasta"
     }
     call nextstrain.filter_sequences_to_list as passing_cat {
+      # this drops all genomes that are failed_NTC
       input:
         sequences = passing_cat_prefilter.combined,
         keep_list = [filter_bad_ntc_batches.seqids_kept],
         out_fname = "assemblies_passing-~{flowcell_id}.fasta"
     }
     call nextstrain.filter_sequences_to_list as submittable_filter {
+      # this drops all failed_annotation (aka VADR fails)
       input:
         sequences = passing_cat.filtered_fasta,
         keep_list = [write_lines(select_all(submittable_id))]

From 9d1699a3c58a6fce187cd4d16cff28a259fd2610 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 29 Apr 2021 18:11:32 -0400
Subject: [PATCH 2/3] filter cdc delivery to real biosamples

---
 pipes/WDL/tasks/tasks_utils.wdl               | 37 +++++++++++++++++++
 .../WDL/workflows/sarscov2_illumina_full.wdl  | 25 ++++++++-----
 2 files changed, 53 insertions(+), 9 deletions(-)

diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl
index 30f3fbd7d..61ad006da 100644
--- a/pipes/WDL/tasks/tasks_utils.wdl
+++ b/pipes/WDL/tasks/tasks_utils.wdl
@@ -132,6 +132,43 @@ task fetch_row_from_tsv {
   }
 }
 
+task fetch_col_from_tsv {
+  input {
+    File          tsv
+    String        col
+    Boolean       drop_empty = true
+    Boolean       drop_header = true
+    String        out_name = "~{tsv}-~{col}.txt"
+  }
+  command <<<
+    python3 << CODE
+    import csv, gzip
+    col = "~{col}"
+    drop_empty = ~{true="True" false="False" drop_empty}
+    drop_header = ~{true="True" false="False" drop_header}
+    opener = gzip.open if '~{tsv}'.endswith('.gz') else open  # gzip is detected by file name suffix only
+    with opener('~{tsv}', 'rt') as inf, open('~{out_name}', 'wt') as outf:
+      if not drop_header:
+        outf.write(col+'\n')
+      for row in csv.DictReader(inf, delimiter='\t'):
+        x = row.get(col) or ''  # DictReader yields None for cells missing from short rows; treat as empty
+        if x or not drop_empty:
+          outf.write(x+'\n')
+    CODE
+  >>>
+  meta { description: "Extract one named column from a tab-delimited file with a header row (input may be gzipped, detected by a .gz suffix) and write its values one per line. If drop_header is false the column name is written as the first line. If drop_empty is true, empty values are skipped. A column absent from the header is treated as empty in every row." }
+  output {
+    File  out_txt  = "~{out_name}"
+  }
+  runtime {
+    docker: "python:slim"
+    memory: "1 GB"
+    cpu: 1
+    disks: "local-disk 50 HDD"
+    dx_instance_type: "mem1_ssd1_v2_x2"
+  }
+}
+
 task tsv_join {
   meta {
       description: "Perform a full left outer join on multiple TSV tables. Each input tsv must have a header row, and each must must contain the value of id_col in its header. Inputs may or may not be gzipped. Unix/Mac/Win line endings are tolerated on input, Unix line endings are emitted as output. Unicode text safe."
diff --git a/pipes/WDL/workflows/sarscov2_illumina_full.wdl b/pipes/WDL/workflows/sarscov2_illumina_full.wdl
index b100632c4..d7bdfd450 100644
--- a/pipes/WDL/workflows/sarscov2_illumina_full.wdl
+++ b/pipes/WDL/workflows/sarscov2_illumina_full.wdl
@@ -69,6 +69,11 @@ workflow sarscov2_illumina_full {
             id_col       = 'accession',
             out_basename = "biosample_attributes-merged"
     }
+    call utils.fetch_col_from_tsv as accessioned_samples {
+      input:
+        tsv = biosample_merge.out_tsv,
+        col = 'sample_name'
+    }
 
     ### demux, deplete, SRA submission prep, fastqc/multiqc
     call demux_deplete.demux_deplete {
@@ -223,7 +228,7 @@ workflow sarscov2_illumina_full {
         output_name = "assembly_metadata-~{flowcell_id}.tsv"
     }
 
-    ### filter out batches where NTCs assemble
+    ### mark up the bad batches or lanes where NTCs assemble
     call assembly.filter_bad_ntc_batches {
       input:
         seqid_list = write_lines(select_all(passing_assembly_ids)),
@@ -272,11 +277,17 @@ workflow sarscov2_illumina_full {
         infiles     = select_all(passing_assemblies),
         output_name = "assemblies_passing-~{flowcell_id}.prefilter.fasta"
     }
-    call nextstrain.filter_sequences_to_list as passing_cat {
+    call nextstrain.filter_sequences_to_list as passing_ntc {
       # this drops all genomes that are failed_NTC
       input:
         sequences = passing_cat_prefilter.combined,
-        keep_list = [filter_bad_ntc_batches.seqids_kept],
+        keep_list = [filter_bad_ntc_batches.seqids_kept]
+    }
+    call nextstrain.filter_sequences_to_list as passing_cat {
+      # this drops all genomes that don't have BioSample accessions (e.g. control libraries)
+      input:
+        sequences = passing_ntc.filtered_fasta,
+        keep_list = [accessioned_samples.out_txt],
         out_fname = "assemblies_passing-~{flowcell_id}.fasta"
     }
     call nextstrain.filter_sequences_to_list as submittable_filter {
@@ -288,6 +299,7 @@ workflow sarscov2_illumina_full {
 
     ### prep genbank submission
     call ncbi.biosample_to_genbank {
+      # this takes a BioSample attributes file and emits a Genbank Source Modifier Table
       input:
         biosample_attributes = biosample_merge.out_tsv,
         num_segments         = 1,
@@ -299,14 +311,9 @@ workflow sarscov2_illumina_full {
         assembly_stats_tsv = write_tsv(flatten([[['SeqID','Assembly Method','Coverage','Sequencing Technology']],select_all(assembly_cmt)])),
         filter_to_ids      = biosample_to_genbank.sample_ids
     }
-    call utils.concatenate as passing_genomes {
-      input:
-        infiles     = select_all(submittable_genomes),
-        output_name = "assemblies.fasta"
-    }
     call nextstrain.filter_sequences_to_list as submit_genomes {
       input:
-        sequences = passing_genomes.combined,
+        sequences = submittable_filter.filtered_fasta,
         keep_list = [biosample_to_genbank.sample_ids]
     }
     call ncbi.package_genbank_ftp_submission {

From 454ffdb539c21e1e7d648b9e2eff4ed89cd94107 Mon Sep 17 00:00:00 2001
From: Danny Park <dpark@broadinstitute.org>
Date: Thu, 29 Apr 2021 20:00:01 -0400
Subject: [PATCH 3/3] fix fetch_col_from_tsv default out_name to use the tsv basename

---
 pipes/WDL/tasks/tasks_utils.wdl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl
index 61ad006da..cf4973cf4 100644
--- a/pipes/WDL/tasks/tasks_utils.wdl
+++ b/pipes/WDL/tasks/tasks_utils.wdl
@@ -138,7 +138,7 @@ task fetch_col_from_tsv {
     String        col
     Boolean       drop_empty = true
     Boolean       drop_header = true
-    String        out_name = "~{tsv}-~{col}.txt"
+    String        out_name = "~{basename(basename(tsv, '.txt'), '.tsv')}-~{col}.txt"
   }
   command <<<
     python3 << CODE