Skip to content

Commit ad145e5

Browse files
authored
Provenance (#11)
* Add hashing of inputs and unicycler version * Collect provenance * Describe provenance output in README * newline * newline
1 parent 48b3e3d commit ad145e5

12 files changed

+138
-7
lines changed

README.md

+26
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ If we have `sample-01_R{1,2}.fastq.gz`, the output directory will be:
5151

5252
```
5353
sample-01
54+
├── sample-01_20211125165316_provenance.yml
5455
├── sample-01_fastp.csv
5556
├── sample-01_fastp.json
5657
├── sample-01_prokka.gbk
@@ -65,6 +66,8 @@ Including the tool name suffixes to output files allows re-analysis of the same
6566

6667
```
6768
sample-01
69+
├── sample-01_20211125165316_provenance.yml
70+
├── sample-01_20211128122118_provenance.yml
6871
├── sample-01_bakta.gbk
6972
├── sample-01_bakta.gff
7073
├── sample-01_bakta.json
@@ -81,3 +84,26 @@ sample-01
8184
├── sample-01_unicycler.gfa
8285
└── sample-01_unicycler.log
8386
```
87+
88+
### Provenance files
89+
For each pipeline invocation, a provenance file named `<sample-id>_<timestamp>_provenance.yml` is produced for each sample, with contents like the following:
90+
91+
```yml
92+
- tool_name: fastp
93+
tool_version: 0.23.1
94+
- tool_name: shovill
95+
tool_version: 1.1.0
96+
- tool_name: prokka
97+
tool_version: 1.14.5
98+
- tool_name: quast
99+
tool_version: v5.0.2
100+
- input_filename: sample-01_R1.fastq.gz
101+
sha256: 4ac3055ac5f03114a005aff033e7018ea98486cbebdae669880e3f0511ed21bb
102+
- input_filename: sample-01_R2.fastq.gz
103+
sha256: 8db388f56a51920752319c67b5308c7e99f2a566ca83311037a425f8d6bb1ecc
104+
- pipeline_name: BCCDC-PHL/routine-assembly
105+
pipeline_version: 0.1.0
106+
- timestamp_analysis_start: 2021-11-25T16:53:10.549863
107+
```
108+
109+
The provenance filename includes a timestamp in the format `YYYYMMDDHHMMSS`, so each re-analysis of the same sample writes a new, uniquely named provenance file rather than overwriting an earlier one.

environments/environment.yml

+1
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ channels:
55
- defaults
66
dependencies:
77
- python=3
8+
- perl-digest-sha=5.88
89
- fastp=0.23.1
910
- shovill=1.1.0
1011
- prokka=1.14.5

main.nf

+39-2
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
11
#!/usr/bin/env nextflow
22

3+
import java.time.LocalDateTime
4+
35
nextflow.enable.dsl = 2
46

7+
include { hash_files } from './modules/hash_files.nf'
58
include { fastp } from './modules/fastp.nf'
69
include { fastp_json_to_csv } from './modules/fastp.nf'
710
include { shovill } from './modules/shovill.nf'
@@ -10,32 +13,66 @@ include { prokka } from './modules/prokka.nf'
1013
include { bakta } from './modules/bakta.nf'
1114
include { quast } from './modules/quast.nf'
1215
include { parse_quast_report } from './modules/quast.nf'
16+
include { pipeline_provenance } from './modules/provenance.nf'
17+
include { collect_provenance } from './modules/provenance.nf'
1318

1419

1520
workflow {
21+
ch_start_time = Channel.of(LocalDateTime.now())
22+
ch_pipeline_name = Channel.of(workflow.manifest.name)
23+
ch_pipeline_version = Channel.of(workflow.manifest.version)
24+
25+
ch_pipeline_provenance = pipeline_provenance(ch_pipeline_name.combine(ch_pipeline_version).combine(ch_start_time))
26+
1627
if (params.samplesheet_input != 'NO_FILE') {
17-
ch_fastq = Channel.fromPath(params.samplesheet_input).splitCsv(header: true)
28+
ch_fastq = Channel.fromPath(params.samplesheet_input).splitCsv(header: true).map{ it -> [it['ID'], it['R1'], it['R2']] }
1829
} else {
1930
ch_fastq = Channel.fromFilePairs( params.fastq_search_path, flat: true ).map{ it -> [it[0].split('_')[0], it[1], it[2]] }.unique{ it -> it[0] }
2031
}
32+
2133
run_shovill = params.unicycler ? false : true
2234
run_unicycler = run_shovill ? false : true
2335
run_prokka = params.bakta ? false : true
2436
run_bakta = run_prokka ? false : true
2537

2638
main:
39+
hash_files(ch_fastq.map{ it -> [it[0], [it[1], it[2]]] }.combine(Channel.of("fastq-input")))
40+
2741
fastp(ch_fastq)
2842
fastp_json_to_csv(fastp.out.json)
43+
2944
if (run_shovill) {
3045
ch_assembly = shovill(fastp.out.trimmed_reads).assembly
3146
} else {
3247
ch_assembly = unicycler(fastp.out.trimmed_reads).assembly
3348
}
49+
3450
if (run_prokka) {
3551
prokka(ch_assembly)
3652
} else if (run_bakta) {
3753
bakta(ch_assembly)
3854
}
55+
3956
quast(ch_assembly)
40-
parse_quast_report(quast.out)
57+
parse_quast_report(quast.out.tsv)
58+
59+
ch_provenance = fastp.out.provenance
60+
61+
if (run_shovill) {
62+
ch_provenance = ch_provenance.join(shovill.out.provenance).map{ it -> [it[0], [it[1], it[2]]] }
63+
}
64+
if (run_unicycler) {
65+
ch_provenance = ch_provenance.join(unicycler.out.provenance).map{ it -> [it[0], [it[1], it[2]]] }
66+
}
67+
if (run_prokka) {
68+
ch_provenance = ch_provenance.join(prokka.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
69+
}
70+
if (run_bakta) {
71+
ch_provenance = ch_provenance.join(bakta.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
72+
}
73+
ch_provenance = ch_provenance.join(quast.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
74+
75+
ch_provenance = ch_provenance.join(hash_files.out.provenance).map{ it -> [it[0], it[1] << it[2]] }
76+
ch_provenance = ch_provenance.join(ch_fastq.map{ it -> it[0] }.combine(ch_pipeline_provenance)).map{ it -> [it[0], it[1] << it[2]] }
77+
collect_provenance(ch_provenance)
4178
}

modules/bakta.nf

+3-1
Original file line numberDiff line numberDiff line change
@@ -8,13 +8,15 @@ process bakta {
88
tuple val(sample_id), path(assembly)
99

1010
output:
11-
tuple val(sample_id), path("${sample_id}_bakta.gbk"), emit: genbank
11+
tuple val(sample_id), path("${sample_id}_bakta.gbk"), emit: gbk
1212
tuple val(sample_id), path("${sample_id}_bakta.gff"), emit: gff
1313
tuple val(sample_id), path("${sample_id}_bakta.json"), emit: json
1414
tuple val(sample_id), path("${sample_id}_bakta.log"), emit: log
15+
tuple val(sample_id), path("${sample_id}_bakta_provenance.yml"), emit: provenance
1516

1617
script:
1718
"""
19+
printf -- "- tool_name: bakta\\n tool_version: \$(bakta --version | cut -d ' ' -f 2)\\n" > ${sample_id}_bakta_provenance.yml
1820
bakta --db ${params.bakta_db} --threads ${task.cpus} --compliant --keep-contig-headers --locus-tag ${sample_id} --prefix "${sample_id}" ${assembly}
1921
cp ${sample_id}.gff3 ${sample_id}_bakta.gff
2022
cp ${sample_id}.gbff ${sample_id}_bakta.gbk

modules/fastp.nf

+2
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,12 @@ process fastp {
99
output:
1010
tuple val(sample_id), path("${sample_id}_R1.trim.fastq.gz"), path("${sample_id}_R2.trim.fastq.gz"), emit: trimmed_reads
1111
tuple val(sample_id), path("${sample_id}_fastp.json"), emit: json
12+
tuple val(sample_id), path("${sample_id}_fastp_provenance.yml"), emit: provenance
1213

1314

1415
script:
1516
"""
17+
printf -- "- tool_name: fastp\\n tool_version: \$(fastp --version 2>&1 | cut -d ' ' -f 2)\\n" > ${sample_id}_fastp_provenance.yml
1618
fastp \
1719
-t ${task.cpus} \
1820
-i ${reads_1} \

modules/hash_files.nf

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
// Compute SHA-256 checksums for a sample's files and record them both as a
// CSV file and as a YAML provenance fragment.
process hash_files {
2+
3+
// Label each task with the sample ID and the file type being hashed.
tag { sample_id + " / " + file_type }
4+
5+
// sample_id:     sample identifier, used as the output filename prefix
// files_to_hash: file(s) passed to `shasum`
// file_type:     label included in the task tag and in the output filenames
input:
6+
tuple val(sample_id), path(files_to_hash), val(file_type)
7+
8+
// csv:        "<sample_id>_<file_type>.sha256.csv", one "hash,filename" line per file
// provenance: "<sample_id>_<file_type>_provenance.yml", one YAML list entry per file
output:
9+
tuple val(sample_id), path("${sample_id}_${file_type}.sha256.csv"), emit: csv
10+
tuple val(sample_id), path("${sample_id}_${file_type}_provenance.yml"), emit: provenance
11+
12+
// `tr -s ' ' ','` squeezes the spaces between hash and filename in the shasum
// output into a single comma. The loop then reads the CSV back and appends an
// `input_filename` / `sha256` entry to the provenance file for each line.
// NOTE(review): any space inside a filename would also be turned into a comma
// by `tr` -- confirm input filenames never contain spaces.
script:
13+
"""
14+
shasum -a 256 ${files_to_hash} | tr -s ' ' ',' > ${sample_id}_${file_type}.sha256.csv
15+
while IFS=',' read -r hash filename; do
16+
printf -- "- input_filename: \$filename\\n sha256: \$hash\\n" >> ${sample_id}_${file_type}_provenance.yml
17+
done < ${sample_id}_${file_type}.sha256.csv
18+
"""
19+
}

modules/prokka.nf

+4-1
Original file line numberDiff line numberDiff line change
@@ -8,10 +8,13 @@ process prokka {
88
tuple val(sample_id), path(assembly)
99

1010
output:
11-
tuple val(sample_id), path("${sample_id}_prokka.gbk"), path("${sample_id}_prokka.gff")
11+
tuple val(sample_id), path("${sample_id}_prokka.gbk"), emit: gbk
12+
tuple val(sample_id), path("${sample_id}_prokka.gff"), emit: gff
13+
tuple val(sample_id), path("${sample_id}_prokka_provenance.yml"), emit: provenance
1214

1315
script:
1416
"""
17+
printf -- "- tool_name: prokka\\n tool_version: \$(prokka --version 2>&1 | cut -d ' ' -f 2)\\n" > ${sample_id}_prokka_provenance.yml
1518
prokka --cpus ${task.cpus} --compliant --locustag ${sample_id} --centre "BCCDC-PHL" --prefix "${sample_id}" ${assembly}
1619
cp ${sample_id}/${sample_id}.gbk ${sample_id}_prokka.gbk
1720
cp ${sample_id}/${sample_id}.gff ${sample_id}_prokka.gff

modules/provenance.nf

+35
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
// Concatenate all provenance fragments for one sample into a single
// timestamped provenance file and publish it to the sample's output directory.
process collect_provenance {
2+
tag { sample_id }
3+
4+
// Run on the local executor.
executor 'local'
5+
6+
// Copy files matching "<sample_id>_*_provenance.yml" into <outdir>/<sample_id>.
publishDir "${params.outdir}/${sample_id}", pattern: "${sample_id}_*_provenance.yml", mode: 'copy'
7+
8+
// sample_id:        sample identifier, used as the output filename prefix
// provenance_files: provenance fragment file(s) to concatenate
input:
9+
tuple val(sample_id), path(provenance_files)
10+
11+
// The output is captured with a glob because the timestamp in the filename
// is only known once the script runs.
output:
12+
tuple val(sample_id), file("${sample_id}_*_provenance.yml")
13+
14+
// The timestamp in the filename comes from `date` at the time this task
// executes, formatted as YYYYMMDDHHMMSS.
script:
15+
"""
16+
cat ${provenance_files} > ${sample_id}_\$(date +%Y%m%d%H%M%S)_provenance.yml
17+
"""
18+
}
19+
20+
// Write a YAML provenance fragment describing the pipeline itself: its name,
// its version, and the analysis start timestamp.
process pipeline_provenance {
21+
// Label the task with the pipeline name and version.
tag { pipeline_name + " / " + pipeline_version }
22+
23+
// Run on the local executor.
executor 'local'
24+
25+
// pipeline_name:    written as `pipeline_name`
// pipeline_version: written as `pipeline_version`
// analysis_start:   written as `timestamp_analysis_start`
input:
26+
tuple val(pipeline_name), val(pipeline_version), val(analysis_start)
27+
28+
// A single file with a fixed name, "pipeline_provenance.yml".
output:
29+
file("pipeline_provenance.yml")
30+
31+
// Emits two YAML list entries: one holding the pipeline name and version,
// and one holding the analysis start timestamp.
script:
32+
"""
33+
printf -- "- pipeline_name: ${pipeline_name}\\n pipeline_version: ${pipeline_version}\\n- timestamp_analysis_start: ${analysis_start}\\n" > pipeline_provenance.yml
34+
"""
35+
}

modules/quast.nf

+3-1
Original file line numberDiff line numberDiff line change
@@ -10,10 +10,12 @@ process quast {
1010
tuple val(sample_id), path(assembly)
1111

1212
output:
13-
tuple val(sample_id), path("${sample_id}_quast.tsv")
13+
tuple val(sample_id), path("${sample_id}_quast.tsv"), emit: tsv
14+
tuple val(sample_id), path("${sample_id}_quast_provenance.yml"), emit: provenance
1415

1516
script:
1617
"""
18+
printf -- "- tool_name: quast\\n tool_version: \$(quast --version | cut -d ' ' -f 2)\\n" > ${sample_id}_quast_provenance.yml
1719
quast --threads ${task.cpus} ${assembly} --space-efficient --fast --output-dir ${sample_id}
1820
mv ${sample_id}/transposed_report.tsv ${sample_id}_quast.tsv
1921
"""

modules/shovill.nf

+2-1
Original file line numberDiff line numberDiff line change
@@ -4,17 +4,18 @@ process shovill {
44

55
publishDir "${params.outdir}/${sample_id}", pattern: "${sample_id}_shovill.{fa,log}", mode: 'copy'
66

7-
cpus 8
87

98
input:
109
tuple val(sample_id), path(reads_1), path(reads_2)
1110

1211
output:
1312
tuple val(sample_id), path("${sample_id}_shovill.fa"), emit: assembly
1413
tuple val(sample_id), path("${sample_id}_shovill.log"), emit: log
14+
tuple val(sample_id), path("${sample_id}_shovill_provenance.yml"), emit: provenance
1515

1616
script:
1717
"""
18+
printf -- "- tool_name: shovill\\n tool_version: \$(shovill --version | cut -d ' ' -f 2)\\n" > ${sample_id}_shovill_provenance.yml
1819
shovill --cpus ${task.cpus} --trim --namefmt \"${sample_id}_contig%0d\" --outdir ${sample_id}_assembly --R1 ${reads_1} --R2 ${reads_2}
1920
cp ${sample_id}_assembly/contigs.fa ${sample_id}_shovill.fa
2021
cp ${sample_id}_assembly/shovill.log ${sample_id}_shovill.log

modules/unicycler.nf

+2
Original file line numberDiff line numberDiff line change
@@ -11,9 +11,11 @@ process unicycler {
1111
tuple val(sample_id), path("${sample_id}_unicycler.fa"), emit: assembly
1212
tuple val(sample_id), path("${sample_id}_unicycler.gfa"), emit: assembly_graph
1313
tuple val(sample_id), path("${sample_id}_unicycler.log"), emit: log
14+
tuple val(sample_id), path("${sample_id}_unicycler_provenance.yml"), emit: provenance
1415

1516
script:
1617
"""
18+
printf -- "- tool_name: unicycler\\n tool_version: \$(unicycler --version | cut -d ' ' -f 2)\\n" > ${sample_id}_unicycler_provenance.yml
1719
unicycler --threads ${task.cpus} -1 ${reads_1} -2 ${reads_2} -o ${sample_id}_assembly
1820
sed 's/^>/>${sample_id}_/' ${sample_id}_assembly/assembly.fasta > ${sample_id}_unicycler.fa
1921
cp ${sample_id}_assembly/assembly.gfa ${sample_id}_unicycler.gfa

nextflow.config

+2-1
Original file line numberDiff line numberDiff line change
@@ -56,9 +56,10 @@ process {
5656

5757
manifest {
5858
author = 'Dan Fornika'
59+
name = 'BCCDC-PHL/routine-assembly'
60+
version = '0.2.0'
5961
description = 'BCCDC-PHL Routine Assembly'
6062
mainScript = 'main.nf'
6163
nextflowVersion = '>=20.01.0'
62-
version = '0.1.0'
6364
}
6465

0 commit comments

Comments
 (0)