
Commit

Merge pull request #278 from broadinstitute/dp-cdc-delivery
cdc delivery refine
dpark01 authored Apr 30, 2021
2 parents 69f019c + 4b49a7d commit c1d835f
Showing 2 changed files with 57 additions and 9 deletions.
37 changes: 37 additions & 0 deletions pipes/WDL/tasks/tasks_utils.wdl
@@ -132,6 +132,43 @@ task fetch_row_from_tsv {
  }
}
+task fetch_col_from_tsv {
+  input {
+    File    tsv
+    String  col
+    Boolean drop_empty = true
+    Boolean drop_header = true
+    String  out_name = "~{basename(basename(tsv, '.txt'), '.tsv')}-~{col}.txt"
+  }
+  command <<<
+    python3 << CODE
+    import csv, gzip
+    col = "~{col}"
+    drop_empty = ~{true="True" false="False" drop_empty}
+    drop_header = ~{true="True" false="False" drop_header}
+    open_or_gzopen = lambda *args, **kwargs: gzip.open(*args, **kwargs) if args[0].endswith('.gz') else open(*args, **kwargs)
+    with open_or_gzopen('~{tsv}', 'rt') as inf:
+        with open('~{out_name}', 'wt') as outf:
+            if not drop_header:
+                outf.write(col+'\n')
+            for row in csv.DictReader(inf, delimiter='\t'):
+                x = row.get(col, '')
+                if x or not drop_empty:
+                    outf.write(x+'\n')
+    CODE
+  >>>
+  output {
+    File out_txt = "~{out_name}"
+  }
+  runtime {
+    docker: "python:slim"
+    memory: "1 GB"
+    cpu: 1
+    disks: "local-disk 50 HDD"
+    dx_instance_type: "mem1_ssd1_v2_x2"
+  }
+}
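For a sense of what the new task emits, here is a minimal sketch of the same column-extraction loop on a made-up two-row TSV (the accession and sample_name values below are invented, not from any real submission):

import csv, io

tsv_text = "accession\tsample_name\nSAMN0000001\tMA-DPH-00123\nSAMN0000002\t\n"  # invented rows
col = "sample_name"
drop_empty = True

values = []
for row in csv.DictReader(io.StringIO(tsv_text), delimiter='\t'):
    x = row.get(col, '')
    if x or not drop_empty:
        values.append(x)

print(values)  # ['MA-DPH-00123'] -- the row with an empty sample_name is dropped, as in the task's default

With drop_empty left at its default, rows lacking a value contribute nothing, which is what lets the workflow below use the resulting list to keep only accessioned samples.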
task tsv_join {
  meta {
    description: "Perform a full left outer join on multiple TSV tables. Each input tsv must have a header row, and each must contain the value of id_col in its header. Inputs may or may not be gzipped. Unix/Mac/Win line endings are tolerated on input, Unix line endings are emitted as output. Unicode text safe."
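As a rough sketch of the left-outer-join semantics described above (my own reading, assuming plain-text headered inputs and ignoring the task's gzip, line-ending, and unicode handling): every row of the first table is kept, and matching columns from the remaining tables are merged in by id_col.

import csv

def tsv_left_join(paths, id_col):
    # Read each TSV into a list of dicts (assumes uncompressed, headered files).
    tables = []
    for p in paths:
        with open(p, newline='') as f:
            tables.append(list(csv.DictReader(f, delimiter='\t')))
    # Index every table after the first by its id_col value.
    lookups = [{row[id_col]: row for row in t} for t in tables[1:]]
    # Keep all rows of the first table; merge in matching columns from the rest.
    merged_rows = []
    for row in tables[0]:
        merged = dict(row)
        for lut in lookups:
            merged.update(lut.get(row[id_col], {}))
        merged_rows.append(merged)
    return merged_rows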
29 changes: 20 additions & 9 deletions pipes/WDL/workflows/sarscov2_illumina_full.wdl
@@ -68,6 +68,11 @@ workflow sarscov2_illumina_full {
            id_col = 'accession',
            out_basename = "biosample_attributes-merged"
    }
+    call utils.fetch_col_from_tsv as accessioned_samples {
+        input:
+            tsv = biosample_merge.out_tsv,
+            col = 'sample_name'
+    }

    ### demux, deplete, SRA submission prep, fastqc/multiqc
    call demux_deplete.demux_deplete {
@@ -223,7 +228,7 @@ workflow sarscov2_illumina_full {
output_name = "assembly_metadata-~{flowcell_id}.tsv"
}

-    ### filter out batches where NTCs assemble
+    ### mark up the bad batches or lanes where NTCs assemble
    call assembly.filter_bad_ntc_batches {
        input:
            seqid_list = write_lines(select_all(passing_assembly_ids)),
@@ -264,6 +269,7 @@

    ### filter and concatenate final sets for delivery ("passing" and "submittable")
    call sarscov2.sc2_meta_final {
+        # this decorates assembly_meta_tsv with collab/internal IDs, genome_status, and many other columns
        input:
            assembly_stats_tsv = assembly_meta_tsv.combined,
            collab_ids_tsv = collab_ids_tsv,
@@ -272,24 +278,34 @@
            genome_status_json = filter_bad_ntc_batches.fail_meta_json
    }
    call utils.concatenate as passing_cat_prefilter {
+        # this emits a fasta of only genomes that pass min_unambig
        input:
            infiles = select_all(passing_assemblies),
            output_name = "assemblies_passing-~{flowcell_id}.prefilter.fasta"
    }
-    call nextstrain.filter_sequences_to_list as passing_cat {
+    call nextstrain.filter_sequences_to_list as passing_ntc {
+        # this drops all genomes that are failed_NTC
        input:
            sequences = passing_cat_prefilter.combined,
-            keep_list = [filter_bad_ntc_batches.seqids_kept],
+            keep_list = [filter_bad_ntc_batches.seqids_kept]
+    }
+    call nextstrain.filter_sequences_to_list as passing_cat {
+        # this drops all genomes that don't have BioSample accessions (e.g. control libraries)
+        input:
+            sequences = passing_ntc.filtered_fasta,
+            keep_list = [accessioned_samples.out_txt],
            out_fname = "assemblies_passing-~{flowcell_id}.fasta"
    }
    call nextstrain.filter_sequences_to_list as submittable_filter {
+        # this drops all failed_annotation (aka VADR fails)
        input:
            sequences = passing_cat.filtered_fasta,
            keep_list = [write_lines(select_all(submittable_id))]
    }
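Taken together, the chain above is a series of intersections. A toy sketch with invented genome IDs (none of this is repo code) of which genomes survive each delivery filter:

# Invented genome IDs purely for illustration.
passing_prefilter = {"G1", "G2", "G3", "G4"}   # assemblies meeting min_unambig (passing_cat_prefilter)
ntc_kept          = {"G1", "G2", "G4"}         # filter_bad_ntc_batches.seqids_kept (passing_ntc)
accessioned       = {"G1", "G2", "G3"}         # samples with BioSample accessions (passing_cat)
not_vadr_failed   = {"G1"}                     # submittable_id, i.e. not failed_annotation (submittable_filter)

passing     = passing_prefilter & ntc_kept & accessioned
submittable = passing & not_vadr_failed
print(sorted(passing))      # ['G1', 'G2']
print(sorted(submittable))  # ['G1']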

    ### prep genbank submission
    call ncbi.biosample_to_genbank {
+        # this takes a BioSample attributes file and emits a Genbank Source Modifier Table
        input:
            biosample_attributes = biosample_merge.out_tsv,
            num_segments = 1,
@@ -301,14 +317,9 @@
            assembly_stats_tsv = write_tsv(flatten([[['SeqID','Assembly Method','Coverage','Sequencing Technology']],select_all(assembly_cmt)])),
            filter_to_ids = biosample_to_genbank.sample_ids
    }
-    call utils.concatenate as passing_genomes {
-        input:
-            infiles = select_all(submittable_genomes),
-            output_name = "assemblies.fasta"
-    }
    call nextstrain.filter_sequences_to_list as submit_genomes {
        input:
-            sequences = passing_genomes.combined,
+            sequences = submittable_filter.filtered_fasta,
            keep_list = [biosample_to_genbank.sample_ids]
    }
    call ncbi.package_genbank_ftp_submission {
