Skip to content

Commit

Permalink
Merge pull request #18 from remiolsen/dev
Browse files Browse the repository at this point in the history
Dev -> version 0.4.1
  • Loading branch information
remiolsen authored Nov 16, 2021
2 parents c8f7d66 + f6510c6 commit 37551be
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 58 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.pyc
*~
*.egg-info
.DS_Store
11 changes: 4 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ Python modules:
Software:

* minimap2 v. 2.17
* fastqc v. 0.11.9 (optional)
* multiqc v. 1.8 (optional)

### Manually using pip

Expand Down Expand Up @@ -84,7 +82,7 @@ P12345_101,truseq,CAGGACGT,/path/to/ONTreads.fastq.gz
Then run:

```
anglerfish.py -o /path/to/samples.csv
anglerfish.py -s /path/to/samples.csv
```

### Optional
Expand All @@ -97,9 +95,8 @@ anglerfish.py -o /path/to/samples.csv
--threads THREADS, -t THREADS
Number of threads to use (default: 4)
--skip_demux, -c Only do BC counting and not demuxing
--skip_fastqc, -f After demuxing, skip running FastQC+MultiQC
--max-distance MAX_DISTANCE, -m MAX_DISTANCE
Manually adjust maximum edit distance for BC matching
                        Manually set the maximum edit distance for BC matching; if not given, it is automatically set to either 1 or 2
```

### Output files
Expand All @@ -108,8 +105,8 @@ In folder `anglerfish_????_??_??_?????/`

* `*.fastq.gz` Demultiplexed reads (if any)
* `anglerfish_stats.txt` Barcode statistics from anglerfish run
* `fastqc/` raw output from fastqc (if run)
* `multiqc/anglerfish_results_multiqc_report.html` Summary of demultiplexed reads
* `anglerfish_stats.json` Machine readable anglerfish statistics


## Credits

Expand Down
35 changes: 20 additions & 15 deletions anglerfish.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from datetime import datetime as dt
from itertools import groupby
from collections import Counter
from demux.demux import run_minimap2, parse_paf_lines, layout_matches, cluster_matches, write_demuxedfastq, run_fastqc
from demux.demux import run_minimap2, parse_paf_lines, layout_matches, cluster_matches, write_demuxedfastq
from demux.samplesheet import SampleSheet
import gzip
logging.basicConfig(level=logging.INFO)
Expand All @@ -30,7 +30,10 @@ def run_demux(args):
log.error("There is one or more identical barcodes in the input samplesheet. Aborting!")
exit()
if args.max_distance == None:
args.max_distance = bc_dist - 1
if bc_dist > 1:
args.max_distance = 2
else:
args.max_distance = 1
log.info("Using maximum edit distance of {}".format(args.max_distance))
if args.max_distance >= bc_dist:
log.error(" Edit distance of barcodes in samplesheet are less than the minimum specified {}>={}".format(args.max_distance, bc_dist))
Expand Down Expand Up @@ -74,14 +77,14 @@ def run_demux(args):
fragments, singletons, concats, unknowns = layout_matches(adaptor_name+"_i5",adaptor_name+"_i7",paf_entries)
total = len(fragments)+len(singletons)+len(concats)+len(unknowns)

paf_stats[aln_name] = []
paf_stats[aln_name].append(aln_name+":")
paf_stats[aln_name].append("{}\tinput reads".format(fq_entries))
paf_stats[aln_name].append("{}\treads aligning to adaptor sequences ({:.2f}%)".format(total, (total/float(fq_entries)*100)))
paf_stats[aln_name].append("{}\taligned reads matching both I7 and I5 adaptor ({:.2f}%)".format(len(fragments), (len(fragments)/float(total)*100)))
paf_stats[aln_name].append("{}\taligned reads matching only I7 or I5 adaptor ({:.2f}%)".format(len(singletons), (len(singletons)/float(total)*100)))
paf_stats[aln_name].append("{}\taligned reads matching multiple I7/I5 adaptor pairs ({:.2f}%)".format(len(concats), (len(concats)/float(total)*100)))
paf_stats[aln_name].append("{}\taligned reads with uncategorized alignments ({:.2f}%)".format(len(unknowns), (len(unknowns)/float(total)*100)))
paf_stats[aln_name] = {}
paf_stats[aln_name]["input_reads"] = [fq_entries, 1.0]
paf_stats[aln_name]["reads aligning to adaptor sequences"] = [total, total/float(fq_entries)]
paf_stats[aln_name]["aligned reads matching both I7 and I5 adaptor"] = [len(fragments), len(fragments)/float(total)]
paf_stats[aln_name]["aligned reads matching only I7 or I5 adaptor"] = [len(singletons), len(singletons)/float(total)]
paf_stats[aln_name]["aligned reads matching multiple I7/I5 adaptor pairs"] = [len(concats), len(concats)/float(total)]
paf_stats[aln_name]["aligned reads with uncategorized alignments"] = [len(unknowns), len(unknowns)/float(total)]

no_matches, matches = cluster_matches(adaptors_sorted[key], adaptor_name, fragments, args.max_distance)

aligned_samples = []
Expand Down Expand Up @@ -119,7 +122,9 @@ def run_demux(args):
with open(os.path.join(args.out_fastq,"anglerfish_stats.txt"), "w") as f:
f.write("Anglerfish v. "+version+"\n===================\n")
for key, line in paf_stats.items():
f.write("\n".join(line)+"\n")
f.write(f"{key}:\n")
for i,j in line.items():
f.write(f"{j[0]}\t{i} ({j[1]*100:.2f}%)\n")
json_out["paf_stats"].append(line)
f.write("\n{}\n".format("\t".join(header1)))
for sample in sample_stats:
Expand All @@ -133,8 +138,8 @@ def run_demux(args):

with open(os.path.join(args.out_fastq,"anglerfish_stats.json"), "w") as f:
f.write(json.dumps(json_out,indent=2, sort_keys=True))
if not args.skip_fastqc and not args.skip_demux:
fastqc = run_fastqc(out_fastqs, args.out_fastq, args.threads)
if args.skip_fastqc:
log.warning(" As of version 0.4.1, built in support for FastQC + MultiQC is removed. The '-f' flag is redundant.")


if __name__ == "__main__":
Expand All @@ -143,8 +148,8 @@ def run_demux(args):
parser.add_argument('--out_fastq', '-o', default='.', help='Analysis output folder (default: Current dir)')
parser.add_argument('--threads', '-t', default=4, help='Number of threads to use (default: 4)')
parser.add_argument('--skip_demux', '-c', action='store_true', help='Only do BC counting and not demuxing')
parser.add_argument('--skip_fastqc', '-f', action='store_true', help='After demuxing, skip running FastQC+MultiQC')
parser.add_argument('--max-distance', '-m', type=int, help='Manually adjust maximum edit distance for BC matching')
parser.add_argument('--skip_fastqc', '-f', action='store_true', help=argparse.SUPPRESS)
parser.add_argument('--max-distance', '-m', type=int, help='Manually set maximum edit distance for BC matching, automatically set this is set to either 1 or 2')
parser.add_argument('--debug', '-d', action='store_true', help='Extra commandline output')
args = parser.parse_args()
utcnow = dt.utcnow()
Expand Down
35 changes: 2 additions & 33 deletions demux/demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,44 +17,14 @@
def parse_cs(cs_string, index, max_distance):
    """
    Parses the CS string of a paf alignment and matches it to the given index
    using a max Levenshtein distance.

    Parameters:
        cs_string: the minimap2 ``cs`` tag value for one alignment.
        index: the expected barcode sequence to compare against.
        max_distance: maximum allowed edit distance. NOTE(review): currently
            unused inside this function — callers appear to filter on the
            returned distance instead; confirm before removing.

    Returns:
        A tuple ``(nts, distance)`` where ``nts`` is the concatenation of the
        bases that were aligned against ``n`` positions in the reference
        (extracted from ``*n<base>`` substitution operations in the cs string),
        and ``distance`` is the Levenshtein distance between the lower-cased
        ``index`` and ``nts``.

    TODO: Grab the alignment context and do Smith-Waterman,
    or do some clever stuff when parsing the cs string
    TODO / idea: Do something big-brained with ONT squigglies
    """
    # Raw string: "\*" is an invalid escape in a plain string literal and
    # raises a SyntaxWarning on modern Python; the pattern itself is unchanged.
    nt = re.compile(r"\*n([atcg])")
    nts = "".join(re.findall(nt, cs_string))

    # Allow for mismatches
    return nts, lev.distance(index.lower(), nts)

def run_fastqc(fastqs, out_path, threads):
    """
    Run FastQC on the given fastq files, then aggregate the reports
    with MultiQC.

    Parameters:
        fastqs: iterable of fastq file paths to pass to fastqc.
        out_path: directory under which ``fastqc/`` and ``multiqc/``
            output folders are created.
        threads: thread count forwarded to fastqc's ``-t`` option.

    Returns:
        The sum of the two subprocess return codes. Since both calls use
        ``check=True``, a non-zero exit raises CalledProcessError, so a
        normal return is always 0.
    """
    fastqc_dir = os.path.join(out_path, "fastqc/")
    multiqc_dir = os.path.join(out_path, "multiqc/")
    for directory in (fastqc_dir, multiqc_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    fastqc_cmd = ["fastqc", "-q", "-t", str(threads), "-o", fastqc_dir]
    fastqc_cmd.extend(fastqs)
    fastqc_proc = subprocess.run(fastqc_cmd, check=True)

    # -f forces overwrite of any previous report; -m restricts to the
    # fastqc module so only those results are aggregated.
    multiqc_cmd = [
        "multiqc",
        "-i", "anglerfish_results",
        "-o", multiqc_dir,
        "-m", "fastqc",
        "-f",
        fastqc_dir,
    ]
    multiqc_proc = subprocess.run(multiqc_cmd, check=True)
    return fastqc_proc.returncode + multiqc_proc.returncode

def run_minimap2(fastq_in, indexfile, output_paf, threads):
"""
Expand All @@ -79,8 +49,7 @@ def run_minimap2(fastq_in, indexfile, output_paf, threads):
proc = subprocess.run(cmd, check=True)
return proc.returncode

#from memory_profiler import profile
#@profile

def parse_paf_lines(paf, min_qual=10):
"""
Read and parse one paf alignment lines.
Expand Down
2 changes: 0 additions & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,3 @@ dependencies:
- conda-forge::python-levenshtein=0.12.0
- conda-forge::numpy=1.19.2
- bioconda::minimap2=2.17
- bioconda::fastqc=0.11.9
- bioconda::multiqc=1.9
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

setup(
name='anglerfish',
version='0.4.0',
version='0.4.1',
description='Anglerfish, a tool to demultiplex Illumina libraries from ONT data',
author='Remi-Andre Olsen',
author_email='[email protected]',
Expand Down

0 comments on commit 37551be

Please sign in to comment.