From 21fa9afcdfc5f9e41b869f1ad7f31498ea0cdea9 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 29 Apr 2021 15:11:38 -0400 Subject: [PATCH 1/3] start to annotate --- pipes/WDL/workflows/sarscov2_illumina_full.wdl | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pipes/WDL/workflows/sarscov2_illumina_full.wdl b/pipes/WDL/workflows/sarscov2_illumina_full.wdl index c58e4f994..b100632c4 100644 --- a/pipes/WDL/workflows/sarscov2_illumina_full.wdl +++ b/pipes/WDL/workflows/sarscov2_illumina_full.wdl @@ -258,6 +258,7 @@ workflow sarscov2_illumina_full { ### filter and concatenate final sets for delivery ("passing" and "submittable") call sarscov2.sc2_meta_final { + # this decorates assembly_meta_tsv with collab/internal IDs, genome_status, and many other columns input: assembly_stats_tsv = assembly_meta_tsv.combined, collab_ids_tsv = collab_ids_tsv, @@ -266,17 +267,20 @@ workflow sarscov2_illumina_full { genome_status_json = filter_bad_ntc_batches.fail_meta_json } call utils.concatenate as passing_cat_prefilter { + # this emits a fasta of only genomes that pass min_unambig input: infiles = select_all(passing_assemblies), output_name = "assemblies_passing-~{flowcell_id}.prefilter.fasta" } call nextstrain.filter_sequences_to_list as passing_cat { + # this drops all genomes that are failed_NTC input: sequences = passing_cat_prefilter.combined, keep_list = [filter_bad_ntc_batches.seqids_kept], out_fname = "assemblies_passing-~{flowcell_id}.fasta" } call nextstrain.filter_sequences_to_list as submittable_filter { + # this drops all failed_annotation (aka VADR fails) input: sequences = passing_cat.filtered_fasta, keep_list = [write_lines(select_all(submittable_id))] From 9d1699a3c58a6fce187cd4d16cff28a259fd2610 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 29 Apr 2021 18:11:32 -0400 Subject: [PATCH 2/3] filter cdc delivery to real biosamples --- pipes/WDL/tasks/tasks_utils.wdl | 37 +++++++++++++++++++ .../WDL/workflows/sarscov2_illumina_full.wdl | 25 ++++++++----- 2 files changed, 53 insertions(+), 9 deletions(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 30f3fbd7d..61ad006da 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -132,6 +132,43 @@ task fetch_row_from_tsv { } } +task fetch_col_from_tsv { + input { + File tsv + String col + Boolean drop_empty = true + Boolean drop_header = true + String out_name = "~{tsv}-~{col}.txt" + } + command <<< + python3 << CODE + import csv, gzip + col = "~{col}" + drop_empty = ~{true="True" false="False" drop_empty} + drop_header = ~{true="True" false="False" drop_header} + open_or_gzopen = lambda *args, **kwargs: gzip.open(*args, **kwargs) if args[0].endswith('.gz') else open(*args, **kwargs) + with open_or_gzopen('~{tsv}', 'rt') as inf: + with open('~{out_name}', 'wt') as outf: + if not drop_header: + outf.write(col+'\n') + for row in csv.DictReader(inf, delimiter='\t'): + x = row.get(col, '') + if x or not drop_empty: + outf.write(x+'\n') + CODE + >>> + output { + File out_txt = "~{out_name}" + } + runtime { + docker: "python:slim" + memory: "1 GB" + cpu: 1 + disks: "local-disk 50 HDD" + dx_instance_type: "mem1_ssd1_v2_x2" + } +} + task tsv_join { meta { description: "Perform a full left outer join on multiple TSV tables. Each input tsv must have a header row, and each must must contain the value of id_col in its header. Inputs may or may not be gzipped. Unix/Mac/Win line endings are tolerated on input, Unix line endings are emitted as output. Unicode text safe." diff --git a/pipes/WDL/workflows/sarscov2_illumina_full.wdl b/pipes/WDL/workflows/sarscov2_illumina_full.wdl index b100632c4..d7bdfd450 100644 --- a/pipes/WDL/workflows/sarscov2_illumina_full.wdl +++ b/pipes/WDL/workflows/sarscov2_illumina_full.wdl @@ -69,6 +69,11 @@ workflow sarscov2_illumina_full { id_col = 'accession', out_basename = "biosample_attributes-merged" } + call utils.fetch_col_from_tsv as accessioned_samples { + input: + tsv = biosample_merge.out_tsv, + col = 'sample_name' + } ### demux, deplete, SRA submission prep, fastqc/multiqc call demux_deplete.demux_deplete { @@ -223,7 +228,7 @@ workflow sarscov2_illumina_full { output_name = "assembly_metadata-~{flowcell_id}.tsv" } - ### filter out batches where NTCs assemble + ### mark up the bad batches or lanes where NTCs assemble call assembly.filter_bad_ntc_batches { input: seqid_list = write_lines(select_all(passing_assembly_ids)), @@ -272,11 +277,17 @@ workflow sarscov2_illumina_full { infiles = select_all(passing_assemblies), output_name = "assemblies_passing-~{flowcell_id}.prefilter.fasta" } - call nextstrain.filter_sequences_to_list as passing_cat { + call nextstrain.filter_sequences_to_list as passing_ntc { # this drops all genomes that are failed_NTC input: sequences = passing_cat_prefilter.combined, - keep_list = [filter_bad_ntc_batches.seqids_kept], + keep_list = [filter_bad_ntc_batches.seqids_kept] + } + call nextstrain.filter_sequences_to_list as passing_cat { + # this drops all genomes that don't have BioSample accessions (e.g. control libraries) + input: + sequences = passing_ntc.filtered_fasta, + keep_list = [accessioned_samples.out_txt], out_fname = "assemblies_passing-~{flowcell_id}.fasta" } call nextstrain.filter_sequences_to_list as submittable_filter { @@ -288,6 +299,7 @@ workflow sarscov2_illumina_full { ### prep genbank submission call ncbi.biosample_to_genbank { + # this takes a BioSample attributes file and emits a Genbank Source Modifier Table input: biosample_attributes = biosample_merge.out_tsv, num_segments = 1, @@ -299,14 +311,9 @@ workflow sarscov2_illumina_full { assembly_stats_tsv = write_tsv(flatten([[['SeqID','Assembly Method','Coverage','Sequencing Technology']],select_all(assembly_cmt)])), filter_to_ids = biosample_to_genbank.sample_ids } - call utils.concatenate as passing_genomes { - input: - infiles = select_all(submittable_genomes), - output_name = "assemblies.fasta" - } call nextstrain.filter_sequences_to_list as submit_genomes { input: - sequences = passing_genomes.combined, + sequences = submittable_filter.filtered_fasta, keep_list = [biosample_to_genbank.sample_ids] } call ncbi.package_genbank_ftp_submission { From 454ffdb539c21e1e7d648b9e2eff4ed89cd94107 Mon Sep 17 00:00:00 2001 From: Danny Park Date: Thu, 29 Apr 2021 20:00:01 -0400 Subject: [PATCH 3/3] fix --- pipes/WDL/tasks/tasks_utils.wdl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pipes/WDL/tasks/tasks_utils.wdl b/pipes/WDL/tasks/tasks_utils.wdl index 61ad006da..cf4973cf4 100644 --- a/pipes/WDL/tasks/tasks_utils.wdl +++ b/pipes/WDL/tasks/tasks_utils.wdl @@ -138,7 +138,7 @@ task fetch_col_from_tsv { String col Boolean drop_empty = true Boolean drop_header = true - String out_name = "~{tsv}-~{col}.txt" + String out_name = "~{basename(basename(tsv, '.txt'), '.tsv')}-~{col}.txt" } command <<< python3 << CODE