Skip to content

Commit

Permalink
Merge pull request #18 from remiolsen/dev
Browse files Browse the repository at this point in the history
Dev -> version 0.4.1
  • Loading branch information
remiolsen authored Nov 16, 2021
2 parents c8f7d66 + f6510c6 commit 37551be
Show file tree
Hide file tree
Showing 6 changed files with 28 additions and 58 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
*.pyc
*~
*.egg-info
.DS_Store
11 changes: 4 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,6 @@ Python modules:
Software:

* minimap2 v. 2.17
* fastqc v. 0.11.9 (optional)
* multiqc v. 1.8 (optional)

### Manually using pip

Expand Down Expand Up @@ -84,7 +82,7 @@ P12345_101,truseq,CAGGACGT,/path/to/ONTreads.fastq.gz
Then run:

```
anglerfish.py -o /path/to/samples.csv
anglerfish.py -s /path/to/samples.csv
```

### Optional
Expand All @@ -97,9 +95,8 @@ anglerfish.py -o /path/to/samples.csv
--threads THREADS, -t THREADS
Number of threads to use (default: 4)
--skip_demux, -c Only do BC counting and not demuxing
--skip_fastqc, -f After demuxing, skip running FastQC+MultiQC
--max-distance MAX_DISTANCE, -m MAX_DISTANCE
Manually adjust maximum edit distance for BC matching
                        Manually set the maximum edit distance for BC matching; if not given, it is automatically set to either 1 or 2
```

### Output files
Expand All @@ -108,8 +105,8 @@ In folder `anglerfish_????_??_??_?????/`

* `*.fastq.gz` Demultiplexed reads (if any)
* `anglerfish_stats.txt` Barcode statistics from anglerfish run
* `fastqc/` raw output from fastqc (if run)
* `multiqc/anglerfish_results_multiqc_report.html` Summary of demultiplexed reads
* `anglerfish_stats.json` Machine readable anglerfish statistics


## Credits

Expand Down
35 changes: 20 additions & 15 deletions anglerfish.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from datetime import datetime as dt
from itertools import groupby
from collections import Counter
from demux.demux import run_minimap2, parse_paf_lines, layout_matches, cluster_matches, write_demuxedfastq, run_fastqc
from demux.demux import run_minimap2, parse_paf_lines, layout_matches, cluster_matches, write_demuxedfastq
from demux.samplesheet import SampleSheet
import gzip
logging.basicConfig(level=logging.INFO)
Expand All @@ -30,7 +30,10 @@ def run_demux(args):
log.error("There is one or more identical barcodes in the input samplesheet. Aborting!")
exit()
if args.max_distance == None:
args.max_distance = bc_dist - 1
if bc_dist > 1:
args.max_distance = 2
else:
args.max_distance = 1
log.info("Using maximum edit distance of {}".format(args.max_distance))
if args.max_distance >= bc_dist:
log.error(" Edit distance of barcodes in samplesheet are less than the minimum specified {}>={}".format(args.max_distance, bc_dist))
Expand Down Expand Up @@ -74,14 +77,14 @@ def run_demux(args):
fragments, singletons, concats, unknowns = layout_matches(adaptor_name+"_i5",adaptor_name+"_i7",paf_entries)
total = len(fragments)+len(singletons)+len(concats)+len(unknowns)

paf_stats[aln_name] = []
paf_stats[aln_name].append(aln_name+":")
paf_stats[aln_name].append("{}\tinput reads".format(fq_entries))
paf_stats[aln_name].append("{}\treads aligning to adaptor sequences ({:.2f}%)".format(total, (total/float(fq_entries)*100)))
paf_stats[aln_name].append("{}\taligned reads matching both I7 and I5 adaptor ({:.2f}%)".format(len(fragments), (len(fragments)/float(total)*100)))
paf_stats[aln_name].append("{}\taligned reads matching only I7 or I5 adaptor ({:.2f}%)".format(len(singletons), (len(singletons)/float(total)*100)))
paf_stats[aln_name].append("{}\taligned reads matching multiple I7/I5 adaptor pairs ({:.2f}%)".format(len(concats), (len(concats)/float(total)*100)))
paf_stats[aln_name].append("{}\taligned reads with uncategorized alignments ({:.2f}%)".format(len(unknowns), (len(unknowns)/float(total)*100)))
paf_stats[aln_name] = {}
paf_stats[aln_name]["input_reads"] = [fq_entries, 1.0]
paf_stats[aln_name]["reads aligning to adaptor sequences"] = [total, total/float(fq_entries)]
paf_stats[aln_name]["aligned reads matching both I7 and I5 adaptor"] = [len(fragments), len(fragments)/float(total)]
paf_stats[aln_name]["aligned reads matching only I7 or I5 adaptor"] = [len(singletons), len(singletons)/float(total)]
paf_stats[aln_name]["aligned reads matching multiple I7/I5 adaptor pairs"] = [len(concats), len(concats)/float(total)]
paf_stats[aln_name]["aligned reads with uncategorized alignments"] = [len(unknowns), len(unknowns)/float(total)]

no_matches, matches = cluster_matches(adaptors_sorted[key], adaptor_name, fragments, args.max_distance)

aligned_samples = []
Expand Down Expand Up @@ -119,7 +122,9 @@ def run_demux(args):
with open(os.path.join(args.out_fastq,"anglerfish_stats.txt"), "w") as f:
f.write("Anglerfish v. "+version+"\n===================\n")
for key, line in paf_stats.items():
f.write("\n".join(line)+"\n")
f.write(f"{key}:\n")
for i,j in line.items():
f.write(f"{j[0]}\t{i} ({j[1]*100:.2f}%)\n")
json_out["paf_stats"].append(line)
f.write("\n{}\n".format("\t".join(header1)))
for sample in sample_stats:
Expand All @@ -133,8 +138,8 @@ def run_demux(args):

with open(os.path.join(args.out_fastq,"anglerfish_stats.json"), "w") as f:
f.write(json.dumps(json_out,indent=2, sort_keys=True))
if not args.skip_fastqc and not args.skip_demux:
fastqc = run_fastqc(out_fastqs, args.out_fastq, args.threads)
if args.skip_fastqc:
log.warning(" As of version 0.4.1, built in support for FastQC + MultiQC is removed. The '-f' flag is redundant.")


if __name__ == "__main__":
Expand All @@ -143,8 +148,8 @@ def run_demux(args):
parser.add_argument('--out_fastq', '-o', default='.', help='Analysis output folder (default: Current dir)')
parser.add_argument('--threads', '-t', default=4, help='Number of threads to use (default: 4)')
parser.add_argument('--skip_demux', '-c', action='store_true', help='Only do BC counting and not demuxing')
parser.add_argument('--skip_fastqc', '-f', action='store_true', help='After demuxing, skip running FastQC+MultiQC')
parser.add_argument('--max-distance', '-m', type=int, help='Manually adjust maximum edit distance for BC matching')
parser.add_argument('--skip_fastqc', '-f', action='store_true', help=argparse.SUPPRESS)
parser.add_argument('--max-distance', '-m', type=int, help='Manually set maximum edit distance for BC matching, automatically set this is set to either 1 or 2')
parser.add_argument('--debug', '-d', action='store_true', help='Extra commandline output')
args = parser.parse_args()
utcnow = dt.utcnow()
Expand Down
35 changes: 2 additions & 33 deletions demux/demux.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,44 +17,14 @@
def parse_cs(cs_string, index, max_distance):
    """
    Parses the CS string of a paf alignment and matches it to the given index
    using a max Levenshtein distance.

    Parameters:
        cs_string: the minimap2 ``cs`` tag value for one alignment.
        index: the expected barcode sequence to compare against.
        max_distance: maximum allowed edit distance. NOTE(review): currently
            unused inside this function — callers appear to filter on the
            returned distance instead; confirm before removing.

    Returns:
        A tuple ``(nts, distance)`` where ``nts`` is the concatenation of the
        bases that were aligned against ``n`` positions in the reference
        (extracted from ``*n<base>`` substitution operations in the cs string),
        and ``distance`` is the Levenshtein distance between the lower-cased
        ``index`` and ``nts``.

    TODO: Grab the alignment context and do Smith-Waterman,
    or do some clever stuff when parsing the cs string
    TODO / idea: Do something big-brained with ONT squigglies
    """
    # Raw string: "\*" is an invalid escape in a plain string literal and
    # raises a SyntaxWarning on modern Python; the pattern itself is unchanged.
    nt = re.compile(r"\*n([atcg])")
    nts = "".join(re.findall(nt, cs_string))

    # Allow for mismatches
    return nts, lev.distance(index.lower(), nts)

def run_fastqc(fastqs, out_path, threads):
    """
    Run FastQC on the given fastq files, then aggregate the reports
    with MultiQC.

    Parameters:
        fastqs: iterable of fastq file paths to pass to fastqc.
        out_path: directory under which ``fastqc/`` and ``multiqc/``
            output folders are created.
        threads: thread count forwarded to fastqc's ``-t`` option.

    Returns:
        The sum of the two subprocess return codes. Since both calls use
        ``check=True``, a non-zero exit raises CalledProcessError, so a
        normal return is always 0.
    """
    fastqc_dir = os.path.join(out_path, "fastqc/")
    multiqc_dir = os.path.join(out_path, "multiqc/")
    for directory in (fastqc_dir, multiqc_dir):
        if not os.path.exists(directory):
            os.mkdir(directory)

    fastqc_cmd = ["fastqc", "-q", "-t", str(threads), "-o", fastqc_dir]
    fastqc_cmd.extend(fastqs)
    fastqc_proc = subprocess.run(fastqc_cmd, check=True)

    # -f forces overwrite of any previous report; -m restricts to the
    # fastqc module so only those results are aggregated.
    multiqc_cmd = [
        "multiqc",
        "-i", "anglerfish_results",
        "-o", multiqc_dir,
        "-m", "fastqc",
        "-f",
        fastqc_dir,
    ]
    multiqc_proc = subprocess.run(multiqc_cmd, check=True)
    return fastqc_proc.returncode + multiqc_proc.returncode

def run_minimap2(fastq_in, indexfile, output_paf, threads):
"""
Expand All @@ -79,8 +49,7 @@ def run_minimap2(fastq_in, indexfile, output_paf, threads):
proc = subprocess.run(cmd, check=True)
return proc.returncode

#from memory_profiler import profile
#@profile

def parse_paf_lines(paf, min_qual=10):
"""
Read and parse one paf alignment lines.
Expand Down
2 changes: 0 additions & 2 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,3 @@ dependencies:
- conda-forge::python-levenshtein=0.12.0
- conda-forge::numpy=1.19.2
- bioconda::minimap2=2.17
- bioconda::fastqc=0.11.9
- bioconda::multiqc=1.9
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

setup(
name='anglerfish',
version='0.4.0',
version='0.4.1',
description='Anglerfish, a tool to demultiplex Illumina libraries from ONT data',
author='Remi-Andre Olsen',
author_email='[email protected]',
Expand Down

0 comments on commit 37551be

Please sign in to comment.