From 387060661f1f9792ca99285b7b127adb092df811 Mon Sep 17 00:00:00 2001 From: Francesco Vezzi Date: Thu, 13 Mar 2014 16:10:07 +0100 Subject: [PATCH] Major changes in the project: the project is now a regular python package and extensive variation has been done on the MP pipeline in order to make it more general and flexibile --- .../config_de_novo.yaml | 0 .../config_de_novo_amanita.yaml | 0 .../config_de_novo_picea.yaml | 0 .../config_de_novo_uppmax.yaml | 19 +- de_novo_scilife/MP.py | 379 +++++++++++++++++ de_novo_scilife/MP.pyc | Bin 0 -> 14674 bytes de_novo_scilife/QCcontrol.py | 184 ++++++++ de_novo_scilife/QCcontrol.pyc | Bin 0 -> 7018 bytes de_novo_scilife/__init__.py | 0 de_novo_scilife/__init__.pyc | Bin 0 -> 162 bytes {script => de_novo_scilife}/align.py | 178 +++----- de_novo_scilife/align.pyc | Bin 0 -> 14125 bytes {script => de_novo_scilife}/assemble.py | 12 +- de_novo_scilife/assemble.pyc | Bin 0 -> 19602 bytes {script => de_novo_scilife}/common.py | 77 +++- de_novo_scilife/common.pyc | Bin 0 -> 7034 bytes .../deNovo_pipeline.py | 33 +- {script => de_novo_scilife}/evaluete.py | 4 +- de_novo_scilife/evaluete.pyc | Bin 0 -> 11251 bytes de_novo_scilife/pdf/__init__.py | 93 +++++ de_novo_scilife/pdf/__init__.pyc | Bin 0 -> 4843 bytes de_novo_scilife/pdf/common.py | 17 + de_novo_scilife/pdf/common.pyc | Bin 0 -> 559 bytes de_novo_scilife/pdf/theme.py | 57 +++ de_novo_scilife/pdf/theme.pyc | Bin 0 -> 2512 bytes de_novo_scilife/pdf/util.py | 18 + de_novo_scilife/pdf/util.pyc | Bin 0 -> 811 bytes de_novo_scilife/util.pyc | Bin 0 -> 803 bytes pictures/NGI.jpeg | Bin 0 -> 9957 bytes pictures/SciLifeLab.jpeg | Bin 0 -> 1539 bytes script/MP.py | 102 ----- script/QCcontrol.py | 128 ------ setup.py | 14 + utils/OpGen_util.py | 395 ++++++++++++++++++ ...aluation.py => produce_assembly_report.py} | 156 +++---- utils/reverse_complement.py | 51 +++ utils/run_MP_analysis.py | 117 ++++++ 37 files changed, 1575 insertions(+), 459 deletions(-) rename config_de_novo.yaml => config_files/config_de_novo.yaml (100%) rename config_de_novo_amanita.yaml => config_files/config_de_novo_amanita.yaml (100%) rename config_de_novo_picea.yaml => config_files/config_de_novo_picea.yaml (100%) rename config_de_novo_uppmax.yaml => config_files/config_de_novo_uppmax.yaml (81%) create mode 100644 de_novo_scilife/MP.py create mode 100644 de_novo_scilife/MP.pyc create mode 100644 de_novo_scilife/QCcontrol.py create mode 100644 de_novo_scilife/QCcontrol.pyc create mode 100644 de_novo_scilife/__init__.py create mode 100644 de_novo_scilife/__init__.pyc rename {script => de_novo_scilife}/align.py (75%) create mode 100644 de_novo_scilife/align.pyc rename {script => de_novo_scilife}/assemble.py (99%) create mode 100644 de_novo_scilife/assemble.pyc rename {script => de_novo_scilife}/common.py (55%) create mode 100644 de_novo_scilife/common.pyc rename {script => de_novo_scilife}/deNovo_pipeline.py (74%) rename {script => de_novo_scilife}/evaluete.py (96%) create mode 100644 de_novo_scilife/evaluete.pyc create mode 100644 de_novo_scilife/pdf/__init__.py create mode 100644 de_novo_scilife/pdf/__init__.pyc create mode 100644 de_novo_scilife/pdf/common.py create mode 100644 de_novo_scilife/pdf/common.pyc create mode 100644 de_novo_scilife/pdf/theme.py create mode 100644 de_novo_scilife/pdf/theme.pyc create mode 100644 de_novo_scilife/pdf/util.py create mode 100644 de_novo_scilife/pdf/util.pyc create mode 100644 de_novo_scilife/util.pyc create mode 100644 pictures/NGI.jpeg create mode 100644 pictures/SciLifeLab.jpeg delete mode 
100644 script/MP.py delete mode 100644 script/QCcontrol.py create mode 100644 setup.py create mode 100644 utils/OpGen_util.py rename utils/{run_assembly_evaluation.py => produce_assembly_report.py} (73%) create mode 100644 utils/reverse_complement.py create mode 100644 utils/run_MP_analysis.py diff --git a/config_de_novo.yaml b/config_files/config_de_novo.yaml similarity index 100% rename from config_de_novo.yaml rename to config_files/config_de_novo.yaml diff --git a/config_de_novo_amanita.yaml b/config_files/config_de_novo_amanita.yaml similarity index 100% rename from config_de_novo_amanita.yaml rename to config_files/config_de_novo_amanita.yaml diff --git a/config_de_novo_picea.yaml b/config_files/config_de_novo_picea.yaml similarity index 100% rename from config_de_novo_picea.yaml rename to config_files/config_de_novo_picea.yaml diff --git a/config_de_novo_uppmax.yaml b/config_files/config_de_novo_uppmax.yaml similarity index 81% rename from config_de_novo_uppmax.yaml rename to config_files/config_de_novo_uppmax.yaml index dd22149..5424818 100644 --- a/config_de_novo_uppmax.yaml +++ b/config_files/config_de_novo_uppmax.yaml @@ -1,10 +1,9 @@ Pipelines: - MP - QCcontrol - assemble - evaluete - align - user_define + QCcontrol: ["fastqc", "abyss", "trimmomatic"] + MP: ["fastqc", "abyss", "trimmomatic", "align"] + assemble: ["soapdenovo", "abyss"] + evaluete: ["qaTools", "FRC"] + align: [] Tools: jellyfish: bin: /home/vezzi/DE_NOVO_PIPELINE/assembly_pipeline/tools/jellyfish-1.1.11/bin/jellyfish @@ -14,7 +13,7 @@ Tools: options: [--threads, "16" , --outdir, fastqc] trimmomatic: bin: /home/vezzi/DE_NOVO_PIPELINE/tools/Trimmomatic-0.30/trimmomatic-0.30.jar - options: [] + options: [""] masurca: bin: /home/vezzi/DE_NOVO_PIPELINE/tools/MaSuRCA-2.0.4/ options: [""] @@ -23,7 +22,7 @@ Tools: options: [""] abyss: bin: /sw/apps/bioinfo/abyss/1.3.5/milou/bin/ - options: ["k=54", "np=16"] + options: [""] abyss_mergePairs: bin: /sw/apps/bioinfo/abyss/1.3.5/milou/bin/abyss-mergepairs options: ["-m 20", "-1 230", "-2 230"] @@ -52,8 +51,8 @@ Tools: bin: /sw/apps/bioinfo/samtools/0.1.19/milou/bin/samtools options: "" qaTools: - bin: /home/vezzi/DE_NOVO_PIPELINE/tools/qaTools/ + bin: /home/vezzi/DE_NOVO_PIPELINE/tools/qaTools/qaCompute options: "" trinity: - bin: ~/DE_NOVO_PIPELINE/tools/trinityrnaseq_r20131110/ + bin: $HOME/DE_NOVO_PIPELINE/tools/trinityrnaseq_r20131110/ options: [] diff --git a/de_novo_scilife/MP.py b/de_novo_scilife/MP.py new file mode 100644 index 0000000..c5aeef8 --- /dev/null +++ b/de_novo_scilife/MP.py @@ -0,0 +1,379 @@ +import sys, os, yaml, glob +import gzip +import subprocess +import pandas as pd +import re +import string +from de_novo_scilife import align +from de_novo_scilife import common +from de_novo_scilife import QCcontrol + +from de_novo_scilife import pdf +from de_novo_scilife.pdf.theme import colors, DefaultTheme + + +def run(global_config, sample_config): + sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config) + + + if "tools" in sample_config and len(sample_config["tools"]) > 0: + """If so, execute them one after the other in the specified order (might not work)""" + for command in sample_config["tools"]: + """with this I pick up at run time the correct function in the current module""" + command_fn = getattr(sys.modules[__name__] , "_run_{}".format(command)) + """Update sample config, each command return sample_config and if necessary it modifies it""" + sample_config = command_fn(global_config, sample_config, sorted_libraries_by_insert) + else: 
+ #run default pipeline for QC + sample_config = _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert) + sample_config = _run_fastqc(global_config, sample_config, sorted_libraries_by_insert) + sample_config = _run_abyss(global_config, sample_config, sorted_libraries_by_insert) + sample_config = _run_align(global_config, sample_config, sorted_libraries_by_insert) + + #sample_config = _run_FindTranslocations(global_config, sample_config, sorted_libraries_by_insert) + #now reverse complement + #sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config) + sample_config = _run_revcomp(global_config, sample_config, sorted_libraries_by_insert) + _run_report(global_config, sample_config,sorted_libraries_by_insert) + +def _run_fastqc(global_config, sample_config,sorted_libraries_by_insert): + return QCcontrol._run_fastqc(global_config, sample_config,sorted_libraries_by_insert) + +def _run_abyss(global_config, sample_config,sorted_libraries_by_insert): + return QCcontrol._run_abyss(global_config, sample_config,sorted_libraries_by_insert) + + + +def _run_align(global_config, sample_config,sorted_libraries_by_insert): + + if "reference" not in sample_config: + print "reference sequence not provided, skipping alignment step. Please provide a reference if you are interested in aligning the reads against a reference" + return sample_config + if not os.path.exists("alignments"): + os.makedirs("alignments") + os.chdir("alignments") + + sorted_libraries_by_insert = align._align_reads(global_config, sample_config, sorted_libraries_by_insert) # align reads + sorted_alignments_by_insert = align._merge_bam_files(global_config, sample_config, sorted_libraries_by_insert) # merge alignments + sorted_alignments_by_insert = align.picard_CGbias(global_config, sample_config,sorted_alignments_by_insert) # compute picard stats + sorted_alignments_by_insert = align.picard_collectInsertSizeMetrics(global_config, sample_config,sorted_alignments_by_insert) + sorted_alignments_by_insert = align.picard_markDuplicates(global_config, sample_config,sorted_alignments_by_insert) + + os.chdir("..") + sample_config["alignments"] = sorted_alignments_by_insert + + return sample_config + + +def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert): + program = global_config["Tools"]["trimmomatic"]["bin"] + program_folder = os.path.dirname(program) + if "adapters" not in sample_config: + sys.exit("running MP pipeline, adapter sequences to be used in trimming are needed for Trimmomatic. 
Please specify them\ + in the sample configuration file and rerun") + adapterFile = sample_config["adapters"] + if not os.path.exists(adapterFile): + sys.exit("Trimmomatic cannot be run as adapter file is not specified or points to unknown position: {}".format(adapterFile)) + + mainDirectory = os.getcwd() + trimmomaticDir = os.path.join(mainDirectory, "Trimmomatic") + if not os.path.exists(trimmomaticDir): + os.makedirs(trimmomaticDir) + os.chdir(trimmomaticDir) + #now I am in running dir, I need to process one by one the libraries + threads = 8 + if "threads" in sample_config: + threads = sample_config["threads"] + + for library, libraryInfo in sorted_libraries_by_insert: + read1=libraryInfo["pair1"] + read2=libraryInfo["pair2"] + orientation = libraryInfo["orientation"] + if orientation=="outtie": # this must be done only with MP + if read2 is not None: + read1_baseName = os.path.split(read1)[1].split(".")[0] + read2_baseName = os.path.split(read2)[1].split(".")[0] + output_read1_pair = os.path.join(trimmomaticDir, "{}.fastq.gz".format(read1_baseName)) + output_read1_sing = os.path.join(trimmomaticDir, "{}_u.fastq.gz".format(read1_baseName)) + output_read2_pair = os.path.join(trimmomaticDir, "{}.fastq.gz".format(read2_baseName)) + output_read2_sing = os.path.join(trimmomaticDir, "{}_u.fastq.gz".format(read2_baseName)) + command = ["java", "-jar", program, "PE", "-threads", "{}".format(threads), "-phred33", read1, read2, output_read1_pair ,output_read1_sing , output_read2_pair, output_read2_sing ,"ILLUMINACLIP:{}:2:30:10".format(adapterFile), "LEADING:3", "TRAILING:3", "SLIDINGWINDOW:4:15", "MINLEN:30" ] + common.print_command(command) + if not common.check_dryrun(sample_config) and not os.path.exists("{}.fastq.gz".format(read1_baseName)): # do not execute is files have been already gennerated + stdOut = open("{}_trimmomatic.stdOut".format(read1_baseName), "w") + stdErr = open("{}_trimmomatic.stdErr".format(read1_baseName), "w") + returnValue = subprocess.call(command, stdout=stdOut, stderr=stdErr) # run the program + if returnValue != 0: + print "error while running command: {}".format(command) + libraryInfo["pair1"] = output_read1_pair + libraryInfo["pair2"] = output_read2_pair + libraryInfo["trimmomatic"] = os.path.join(trimmomaticDir, "{}_trimmomatic.stdErr".format(read1_baseName)) + os.chdir(mainDirectory) + return sample_config + + +def _run_revcomp(global_config, sample_config, sorted_libraries_by_insert): + currentDir = os.getcwd() + workingDir = os.path.join(currentDir, "reverse_complemented") + if not os.path.exists(workingDir): + os.makedirs(workingDir) + os.chdir(workingDir) + + common.print_command("reversing complementing reads, internal function") + for library, libraryInfo in sorted_libraries_by_insert: + pairs = [libraryInfo["pair1"] , libraryInfo["pair2"]] + for pair in pairs: + base_name = os.path.split(pair)[1].split(".fastq")[0] + output_name = "{}_rc.fastq".format(base_name) + if not os.path.exists("{}.gz".format(output_name)): + rev_compl_file = _rev_comp_file(pair, output_name) + libraryInfo["pair1_rc"] = os.path.join(workingDir, rev_compl_file) + os.chdir(currentDir) + return sample_config + + +def _rev_comp_file(input_fastq, output_fastq): + proc = subprocess.Popen(["zcat", "{}".format(input_fastq)], stdout=subprocess.PIPE, bufsize=1) + with gzip.open('{}.gz'.format(output_fastq), 'wb') as output_f: + for header in iter(proc.stdout.readline, b''): + header = header.rstrip() + sequence = proc.stdout.readline().rstrip() + comment = proc.stdout.readline().rstrip() + 
quality = proc.stdout.readline().rstrip() + + sequence_rc = rc(sequence) + quality_rev = reverse(quality) + + output_f.write("{}\n".format(header)) + output_f.write("{}\n".format(sequence_rc)) + output_f.write("{}\n".format(comment)) + output_f.write("{}\n".format(quality_rev)) + + proc.communicate() + return "{}.gz".format(output_fastq) + +def reverse(quality): + seq_rev = quality[::-1] + return seq_rev + + +def rc(dna): + complements = string.maketrans('acgtrymkbdhvnACGTRYMKBDHVN', 'tgcayrkmvhdbnTGCAYRKMVHDBN') + rcseq = dna.translate(complements)[::-1] + return rcseq + + + + +def _run_report(global_config, sample_config, sorted_libraries_by_insert): + """This function produces a pdf report and stores the important resutls in a single folder""" + + ### retrive all info needed to write the report + sampleName = "sample" + if "output" in sample_config: + sampleName = sample_config["output"] + projectName = "anonymous_project" + if "projectName" in sample_config: + projectName = sample_config["projectName"] + + + currentDir = os.getcwd() + workingDir = os.path.join(currentDir, "report") + if not os.path.exists(workingDir): + os.makedirs(workingDir) + os.chdir(workingDir) + + PDFtitle = "{}.pdf".format(sample_config["output"]) + + TABLE_WIDTH = 540 # this you cannot do in rLab which is why I wrote the helper initially + class MyTheme(DefaultTheme): + doc = { + 'leftMargin': 25, + 'rightMargin': 25, + 'topMargin': 20, + 'bottomMargin': 25, + 'allowSplitting': False + } + # let's create the doc and specify title and author + doc = pdf.Pdf('{} {}'.format(projectName, sampleName), 'NGI-Stockholm, Science for Life Laboratory') + + # now we apply our theme + doc.set_theme(MyTheme) + + # give me some space + doc.add_spacer() + + # this header defaults to H1 + scriptDirectory = os.path.split(os.path.abspath(__file__))[0] + logo_path = os.path.join(scriptDirectory, '../pictures/SciLifeLab.jpeg') + doc.add_image(logo_path, 180, 67, pdf.CENTER) + logo_path = os.path.join(scriptDirectory, '../pictures/NGI.jpeg') + doc.add_image(logo_path, 180, 67, pdf.CENTER) + # give me some space + doc.add_spacer() + + doc.add_header('NGI-Stockholm -- Science For Life Laboratory') + doc.add_header('Mate Pair Best Practice Analysis Report') + doc.add_header('{} -- {}'.format(projectName, sampleName)) + # give me some space + #doc.add_spacer() + #doc.add_paragraph("NGI-Stockholm and Science For Life Laboratory bla bla bla [Mission statement]") + + doc.add_spacer() + doc.add_paragraph("For sample {} belonging to project {} NGI-Stockholm best-practice-analysis for Mate Pairs produce with the Nextera MP \ +protcol has been run. 
For more informations about the protocols and the general guidelines for analysis refer to \ +http://res.illumina.com/documents/products/technotes/technote_nextera_matepair_data_processing.pdf".format(sampleName, projectName)) + doc.add_spacer() + tools = ["trimmomatic", "fastqc", "abyss", "align"] + if "tools" in sample_config and len(sample_config["tools"]) > 0: + tools = sample_config["tools"] + doc.add_paragraph("The following tools have been employed (tools are listed in order of execution):") + bollet_list = [] + for tool in tools: + if tool != "align": + program_path = global_config["Tools"][tool]["bin"] + bollet_list.append("{} : {}".format(tool, program_path)) + else: + bollet_list.append("{} : {}".format(tool, global_config["Tools"]["bwa"]["bin"])) + bollet_list.append("{} : {}".format(tool, global_config["Tools"]["samtools"]["bin"])) + bollet_list.append("{} : {}".format(tool, global_config["Tools"]["picard"]["bin"])) + doc.add_list(bollet_list) + doc.add_spacer() + doc.add_paragraph("For each tool results are reported (tables, pictures). Moreover you will find all the results and commands that have been\ +in the result delivery folder on Uppmax") + + + for tool in tools: + doc.add_header(tool , pdf.H2) + if tool == "trimmomatic": + doc.add_paragraph("As a result of the Illumina Nextera Mate Pair protocol many reads sequenced will contain the adapter sequence. Illumina reccomends\ +to remove the adaptor before use the reads in any downstream analysis.") + doc.add_paragraph("Adapter sequences removed are:") + adapter_file = sample_config["adapters"] + adapters = [] + with open(adapter_file) as file: + lines = file.readlines() + adapters = [lines[1].rstrip(),lines[3].rstrip(),lines[5].rstrip()] + doc.add_list(adapters) + doc.add_spacer() + + trimmomatic_table_part1 = [[sampleName, "#orig_pairs", "#survived_pairs"]] # this is the header row + trimmomatic_table_part2 = [[sampleName,"#survived_fw_only", "#survived_rv_only", "#discarded"]] + + total_orig_pairs = 0 + total_survived_pairs = 0 + total_survived_fw_only = 0 + total_survived_rv_only = 0 + total_discarded = 0 + + for library, libraryInfo in sorted_libraries_by_insert: + runName = os.path.basename(libraryInfo["trimmomatic"]).split("_1_trimmomatic.stdErr")[0] + with open(libraryInfo["trimmomatic"]) as trimmomatic_output: + lines = trimmomatic_output.readlines() + result_line = lines[-2].rstrip() + match_string = re.compile('Input Read Pairs: (\d+) Both Surviving: (\d+) \(.+\) Forward Only Surviving: (\d+) \(.+\) Reverse Only Surviving: (\d+) \(.+\) Dropped: (\d+) \(.+\)') + read_pairs = int(match_string.match(result_line).group(1)) + survived_pairs = int(match_string.match(result_line).group(2)) + survived_fw_only = int(match_string.match(result_line).group(3)) + survived_rv_only = int(match_string.match(result_line).group(4)) + discarded = int(match_string.match(result_line).group(5)) + read_pairs_perc = "({0:.0f}%)".format((float(survived_pairs)/read_pairs) * 100) + survived_fw_only_perc = "({0:.0f}%)".format((float(survived_fw_only)/read_pairs) * 100) + survived_rv_only_perc = "({0:.0f}%)".format((float(survived_rv_only)/read_pairs) * 100) + survived_discarded_perc = "({0:.0f}%)".format((float(discarded)/read_pairs) * 100) + + total_orig_pairs += read_pairs + total_survived_pairs += survived_pairs + total_survived_fw_only += survived_fw_only + total_survived_rv_only += survived_rv_only + total_discarded += discarded + trimmomatic_table_part1.append([runName,read_pairs, "{} {}".format(survived_pairs, read_pairs_perc)]) # 
these are the other rows + trimmomatic_table_part2.append([runName, "{} {}".format(survived_fw_only, survived_fw_only_perc), \ + "{} {}".format(survived_rv_only, survived_rv_only_perc), "{} {}".format(discarded, survived_discarded_perc)]) # these are the other rows + + survived_pairs_perc = "({0:.0f}%)".format((float(total_survived_pairs)/total_orig_pairs) * 100) + survived_survived_fw_only_perc = "({0:.0f}%)".format((float(total_survived_fw_only)/total_orig_pairs) * 100) + survived_survived_rv_only_perc = "({0:.0f}%)".format((float(total_survived_rv_only)/total_orig_pairs) * 100) + survived_discarded_perc = "({0:.0f}%)".format((float(total_discarded)/total_orig_pairs) * 100) + + + trimmomatic_table_part1.append(["total", total_orig_pairs, "{} {}".format(total_survived_pairs, survived_pairs_perc)]) # last row is the sum + trimmomatic_table_part2.append(["total", "{} {}".format(survived_fw_only, survived_fw_only_perc), \ + "{} {}".format(survived_rv_only, survived_rv_only_perc), "{} {}".format(discarded, survived_discarded_perc)]) # last row is the sum + + doc.add_table(trimmomatic_table_part1, TABLE_WIDTH) + doc.add_spacer() + doc.add_table(trimmomatic_table_part2, TABLE_WIDTH) + + + if tool == "fastqc": + fastqc_dir = sample_config["fastqc"] + for fastqc_run in [dir for dir in os.listdir(fastqc_dir) if os.path.isdir(os.path.join(fastqc_dir, dir))]: + doc.add_header("{} -- Per Base Quality".format(fastqc_run) , pdf.H3) + fastqc_run_dir = os.path.join(fastqc_dir, fastqc_run, "Images") + doc.add_image(os.path.join(fastqc_run_dir,"per_base_quality.png"), 400, 180, pdf.CENTER) + doc.add_header("{} -- Sequence Length Distribution".format(fastqc_run) , pdf.H3) + fastqc_run_dir = os.path.join(fastqc_dir, fastqc_run, "Images") + doc.add_image(os.path.join(fastqc_run_dir,"sequence_length_distribution.png"), 400, 180, pdf.CENTER) + if tool == "abyss": + doc.add_paragraph("kmer profile with k={}.".format(sample_config["kmer"])) + kmer_1_200 = os.path.join(sample_config["abyss"], "kmer_coverage_1_200.png") + doc.add_image(kmer_1_200, 500, 300, pdf.CENTER) + if tool == "align": + alignments = sample_config["alignments"][0] + alignment_path = alignments[1] + alignment_prefix = alignments[2] + align_dir = os.path.split(alignment_path)[0] + doc.add_header("{} -- Collect Insert Size Metrics".format(sampleName) , pdf.H3) + with open(os.path.join(align_dir, "{}.collectInsertSize.txt".format(alignment_prefix) )) as collectInsertSize: + lines = collectInsertSize.readlines() + line = lines[6].rstrip().split("\t") + insertSize_table = [[line[7], line[6], line[4], line[5]]] # this is the header row + line = lines[7].rstrip().split("\t") + insertSize_table.append([line[7], line[6], line[4], line[5]]) # this is the header row + line = lines[8].rstrip().split("\t") + insertSize_table.append([line[7], line[6], line[4], line[5]]) # this is the header row + line = lines[9].rstrip().split("\t") + insertSize_table.append([line[7], line[6], line[4], line[5]]) # this is the header row + doc.add_table(insertSize_table, TABLE_WIDTH) + doc.add_spacer() + full_path_to_pdf = os.path.join(align_dir, "{}.collectInsertSize.pdf".format(alignment_prefix)) + doc.add_paragraph("Insert size plot can be find in the result directory: {}".format(os.path.join(full_path_to_pdf.split(os.sep)[-3], full_path_to_pdf.split(os.sep)[-2], full_path_to_pdf.split(os.sep)[-1]))) + doc.add_spacer() + + doc.add_header("{} -- Duplicate Metrics".format(sampleName) , pdf.H3) + with open(os.path.join(align_dir, 
"{}.markDuplicates.txt".format(alignment_prefix) )) as collectInsertSize: + lines = collectInsertSize.readlines() + line = lines[6].rstrip().split("\t") + duplication_table_part1 = [line[0:3]] # this is the header row + duplication_table_part2 = [line[4:6]] # this is the header row + duplication_table_part3 = [line[7:9]] # this is the header row + line = lines[7].rstrip().split("\t") + duplication_table_part1.append(line[0:3]) # + duplication_table_part2.append(line[4:6]) # + duplication_table_part3.append(line[7:9]) # + doc.add_table(duplication_table_part1, TABLE_WIDTH) + doc.add_spacer() + doc.add_table(duplication_table_part2, TABLE_WIDTH) + doc.add_spacer() + doc.add_table(duplication_table_part3, TABLE_WIDTH) + doc.add_spacer() + full_path_to_bam = os.path.join(align_dir, "{}_noDup.bam".format(alignment_prefix)) + doc.add_paragraph("Bam file with makeked duplicated reads can be faound at: {}".format(os.path.join(full_path_to_bam.split(os.sep)[-3], full_path_to_bam.split(os.sep)[-2], full_path_to_bam.split(os.sep)[-1]))) + doc.add_spacer() + + + doc.render(PDFtitle) + + os.chdir(currentDir) + + + + + + + + + + + diff --git a/de_novo_scilife/MP.pyc b/de_novo_scilife/MP.pyc new file mode 100644 index 0000000000000000000000000000000000000000..701e20e37644db72e2526c912e388c4e8512f7f0 GIT binary patch literal 14674 zcmcgzOLH4ncD@bpAyRyak|@!V<<=XrB~YSl*`{TWB~q3+mMFTZ5p9+gR0G`vngq~b zcS9s4AgPQdnZ=~yR3*DCGF90mtJGAbDp@3{%FaI_o7tqYNYyNpZ64ouZg&G9WyOi@ z2t?!dxvz82J+FK2QTE?PhiAV&y;4%~zYKmq#y2z#r5yYjs;bmlS~k?0k-#%*Et8b9 zYAu_T`_x)rQtnr4{YiO1tqmsS18VI+QXW!kLsHJD>aeOERBH#7!MU^YKBCq}^n0JY zkE*p%{oXI{V`^u6pIVejUJT z$*Jo&s3Bl9FS?$IT^LtN9fKUCeVc^@2YlIMfmGe8~vPo z8U)`MRIMzBdy!G?KGnjXVwYA%y~wJzp|)hLd-@xEqFDuVf9`S{Wj6pMpCYo$i6#0Ny!nb2N%ax#}0~Y5@$h?v(`+ z+O=+&i%!uJ2Q@3sQXX-LRj*u-nzT*{lRj+u`6T*WyUdJIK>H!@$8w*q0AeZ z>FlH-pZ-as#2w(5TPlov7=@UTyI5^S8+c)e$%uA*Yu#jp(0!dB|QYkiYhLe+fHcOWgFTi>h3P` zm(CW-{^avHDh?=WrTh?D7?!|}Dxz)N9WRWu7OL5suH(TZP%V@yC|W2etwv}k-z8!TE}Xrv&4?h{XyiDmUo8pp2S0CMvQ)=4?1ST7|k3t;!kGO z7|w{kIgU0|Goo+u&>>(_nnticjovg_QcNiXK$}0Ot)vc4Ks|=L6l&A zQ5r!BW_OgH&t^sizfr9bdSshJDwxj3OGMLyL<>&Q^aiTOu2JR=1Kn2iei? z*CV=L;<0o)K#(X^gjzg+U)!-8ksE~JpW+4nK%L6*!Lr%#8gA9AyO+(Rg-I1Mb`m*&^? 
zbZ~~Crh!-ZCGhJS)-2g|nh79-TGNIe*DKNeyikmYp3?(XK&s(m%^_FNtZ&x+?K&W# z$GPUr=WU@<*90D^BTi;i(Hff?y$DFza2;5Vblb3Jiy`N(Vk zm)7HXJ1&pE_R<15CEj(PEdwcHMJJJVZ-M_PhK|hA=BkONIg z5uSulbb}zIO~Ws?WV=G1O4u&;c)1%n>VJ#Y!g?JK*$3ECdiWNXAhd$njjU5_Lt4rV zI3$#|9ZgAWx5cR}leMJjl3#aa^Fmn2NFPANaZ8&8C)fpU($#NZ zk2!PLEW$N`@1lKxl3lIlXt4I0S-SIRSPtx3G}b8>*8M7oAdC}p>?fdsSpFIukp(Y+ zjrW6H>C{DXR9KfWj*vp1m3C#6xV%_j_hoHlR?VU;&04g(GMH4r*$CYwkTC_#c0e@K z*sDe}5+~(aflLn7wg(zQ;`c&lU)pxIz_u3IGhqO-P0S1A5K2YhM$Mr9*seC+AH@40 z`4{jF#TApY&4@8z4C6CtOv4*F8v7yx#t1x;!%{Ph@`!O3pHuN?$rv>*8^`fEftn-I z>M~&Bvyf@1AjD@m1G&&DobH46Rx)Y>{xIxXa#&{r0OiyUixkdF7kCredxm9T3%Kq2 z9s3T?9IBtDZ0A=z;x9b;OGRg)pDqeL6Zl5_fvZyncu?*9KF#;Sli#N25zhzzLtE2+ z_yq{$h~0osGYy{uE(%S_5a)pZkzxp^h45j*iRv#D_6@@jf{6|$EwMl>2W|(<6z~{^ znfiso-|Cj5?G(_&QGsUz+9~e+Qgj4QoULJlcv9wpnJ&KTgjdrFT9{51-$Wx{rf=^; zDOAn%X1zoOY+XSoE02fh-;kW96>PT>E(9eF47p`c%H?Nb3`M)6GRNK2+UI?isDi9H zK1AE3N<}n?^&u;6@*!%7+y?P^DvJ@nUJu80c8z@p^{`8DxS5Qs#GMvcP|j!_8C zI0R+fI7boTsfpT?mP|xsJ{1uN1h|^6ydrCWbHT@CJ;+Mz#AkS1mADt!l`Vx0wIlx3 zKU2bX{)q|(GJzv4#RM!)`S`~;Dh5WUB!W4U7I=+;khuN8uCY2T;qpTY z^2N@9^$wt7D2mPX5GO?~3>{|Yc5D|xWb9uSkLT@S9h3VQFT!%mG_f!b3o3dyw$|1H`1mvX#|m%Y2!LdZw_O8 z?=tfkWB#1TQH?0$3Jl9hksKU>VTMLBQwBd~X4ojvwc<8$we$Fjj|UNP^q--Ak0)UX zA+Jc-ZAn7g8gF*AD4nuoEb|9j64fe?4&t*ah`u0Bo=Op)B_0qr$hn9*O$P7=B$Z^n zrmS8QpqVs?%%G`VDo4R?ZL{c9p4M;8-+g3#@!-?j3-=x`MXZj>C3`p6tUawb#rmVW z^S8dRK7H``-ooveB19&3IpF7&MSU_%69qQnK2)Yssv=?{p+5*$IcB z`Xi}k_@j^2dM92$8|r8X^a{*2ZhfHFurWH;0U&gxD5vmur3^7z8NYz01ij%(z%^he%mH@*3SV3X9r4(JS{40gz`KA_ zML&Z$i~frXh+ycD0S0F4)-i8X{8sz_7dFS%EeOxmc32Vn1^#*mHQNeJP2>ZyL=)8Pl;rn z)^|iBaz}I`6NAteI}^|Gnv7e<9*?P=zhR_k;mKdAHi94H3PUp%(acbnoPfhPj-c8} z5D}CbQyYj~ye^ACW1$Q&+zZ+~|#O)5z>;gc(=4m? zrv<$8)sU?2=R;hb>d_L_F;yZHm1dazz!h9a~_^ENDOl0_Iol!@f zx786Mb+N1R9D8(E=DI3n8QqoGpLAHkv%B(5hV}q>mq9*o%vqH11BpF{5`JD&t@9}1 z2b*yLr4tE99HpSOK*uwQDm<%NSWJe@00a$04|R)JQ=U_;9INBk^Qv{J>-B}BHy*)t=S(fvTggn9Y&~u>2 zTlz4^{=xyO$SE*>UhNDRkZ@WN&|4&8 z;mH-%o=JoTET~Nl^mAUdW;hQB6f6y*{975dwKTZ!n(nrV6wb8w%ocGwld7l=lW1hG$n3=inIQ9oG8++?_23KS9924^V*#S-m zRd`dT1F1d~ok1?_ECk?VP|zQ7$o6c!lGdzpex#g_yHJmZhK<3!#NjL{e+&JncCM?| zbTP!fsgqVDme6E^9Fa^|JcJM67@>#Ji%`(4#~ zw^yMpBnpkn@D5wI-&3vkl=BH}AbwEKpIQ9_MILM5_eIO2kyiOVz26rhef1cmBkjc9 z?>xf2?=(Wk7C}<5sJ$}O*=x{rv4^JZIoS~K$d3u{v;>Ea%B$!`=Tp#b4+v&tFX%ql zOSV)f(<8%R`Ul@;%a?l4!#H34O8sxP93J5Xb(;3|17QL1@q&K88MB0QPrrZoirxPp z*?m2nEC)ga{Um(EAqDnf0{cX9Si*WX132v6j9AqHn{TKAUVCdg+ky>z*h8T<&H!4- zmwPwxqx~DW2gD8B1Nw#y+&Y}apLlH-PwIr_owBT4Q;EBTkj z^ESTWH9TA-t{KWp(&ujQoSsoPa2i7tOWyj`4HwFOK;1y{owbr1L@$r|PZIpAy6e#c zJ18ThDKT-cT#4VkBWxq*RKF27VSuveN0DEP0b>B*D#G7h;nhY2lIY8SHk9!T1)VH~ z1S!UWKC!Ezt53RqsdJu{()X8TWc>_N{XM=R@fa|&#$f!3Bd@sX3ExE3(%r=?E0JH? 
ztoYU1Wpky3D>Wp7@GkK_a+u8fcG1VZAnrvNa8MzW81i{$LgDJb4QJ4S?a(a0(QwP& zU!&feXG;>VyCr6lCgEjN^h|4Bxsp!&$^MDMi|FtGK>~9bw<6|kq-&VVfsL>;5WR&U z^lk`_wuE!#cJ6tbBVvJzh~;p=_?i>&qFeRr*lOgV_~d0J47MciRjbXKSGV(s zzRfs(smTyrID^3<=_o{Qse&6^Y+h0-)HUV>pyVZNN759Jr(<@s|n%aqQT$A#c)ni6eUxriYy*La4GEvh$EW* zy6Nt?r6%cfY0l#QTbJ;VR7UKUTpLpf6t`_1bi`es+_)17oskp%5=9c+wM*C@)Qa+| z;v$;KNm|yGK$mgxu37q0-h6;P@^LXDsZ86TC+@bP5h3QXO;mz14q%LMLR(T# z+EHhj@%_4PX>#i@+-|HYOu(E#<|$1Xpjx$pFcGVLURZ7r1j4RgT2OJ#HVAHdH}KwR;wq`DMAv3`1rx zo(-G9QxMIGVZ00Nz}B}5ejRe(3k{xXC`0pS9WN{)oeS4lyhSfu+k1P= z4@QFm2%R?IfD$j?M1ma^q@=Ehg53NNR$cBcSeG`!WgR2~7-ENJuU-|5{v8;3m+`P9 zW$tu}+q_Q1Gj%4O$y7Wu&czFv3V<2*`J?Wm zQy!U$;RnFxHUzFiQwGzK^=LXt;vsW(s1OnyZNEvq6-j0X_4l{ATV>4KqGVc>QE^q4 zoptep;JFl_pK#ep8M7Uy5H{=*uA*+TB}r=+0YP>qDO`|CQ20HhG#h(lMKK}sca|RA zvC>iE`yv8w{G9RQ&?SG zcyv#^I_nSEj5;zb@uCm6)^XwrQLXxAAM}eVVvl8jBBh3D{va9E32mWV?m<48B_quR z5E~2GHc{2mp8O_@71~CUQ_@K@D+pn@Wzn@))XhrXtthNQN7YLXf(G{L`-nUB~3Ev`$u_ z;k*`2;b|d8{#8wSW+rX4ZwRBH|Gx--yTDB9BFW-bc$JoHL93u`fM|VbmZnK-79|n8 zK=6IN9W9^y4asK59(&4;urLk2PtW!TTtBkD@>BMlW|>_n$)vR((X`p zG?P0C%1Oqfm`q`zUAtD50{_F2MK8?y7!T3+4IBdSzXoUt8jkhntlMQB_4JYz4_$e&wiea4Hpy%!|LevgFI(hyixMSiLDplIJU%LXkPkWxCZ+E$ s#2}XHrjBiV#MOkH;40{!e1yu_y{YWb@uBQc|IpE)nV}>2^bZ~QZ|c85%K!iX literal 0 HcmV?d00001 diff --git a/de_novo_scilife/QCcontrol.py b/de_novo_scilife/QCcontrol.py new file mode 100644 index 0000000..6c8980b --- /dev/null +++ b/de_novo_scilife/QCcontrol.py @@ -0,0 +1,184 @@ +import sys, os, yaml, glob +import subprocess +import pandas as pd +from matplotlib import pyplot as plt +from de_novo_scilife import common + + +def run(global_config, sample_config): + sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config) + if "tools" in sample_config: + """If so, execute them one after the other in the specified order (might not work)""" + for command in sample_config["tools"]: + """with this I pick up at run time the correct function in the current module""" + command_fn = getattr(sys.modules[__name__] , "_run_{}".format(command)) + """Update sample config, each command return sample_config and if necessary it modifies it""" + sample_config = command_fn(global_config, sample_config, sorted_libraries_by_insert) + else: + #run default pipeline for QC + sample_config = _run_fastqc(global_config, sample_config, sorted_libraries_by_insert) + sample_config = _run_abyss(global_config, sample_config, sorted_libraries_by_insert) + sample_config = _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert) + + + +def _run_fastqc(global_config, sample_config, sorted_libraries_by_insert): + mainDir = os.getcwd() + FastqcFolder = os.path.join(os.getcwd(), "fastqc") + if not os.path.exists(FastqcFolder): + os.makedirs(FastqcFolder) + + program=global_config["Tools"]["fastqc"]["bin"] + program_options=global_config["Tools"]["fastqc"]["options"] + for library, libraryInfo in sorted_libraries_by_insert: + command = [program] + for option in program_options: + command.append(option) + read1=libraryInfo["pair1"] + read2=libraryInfo["pair2"] + command.append(read1) + if read2 is not None: + command.append(read2) + common.print_command(command) + folder_output_name = os.path.join(FastqcFolder, os.path.basename(read1).split(".fastq.gz")[0]) + if not common.check_dryrun(sample_config) and not os.path.exists("{}_fastqc.zip".format(folder_output_name)): + fastq_stdOut = open(os.path.join(FastqcFolder , "{}_fastqc.stdout".format(library)), "a") + fastq_stdErr = open(os.path.join(FastqcFolder , "{}_fastqc.stderr".format(library)), "a") + 
subprocess.call(command, stdout=fastq_stdOut, stderr=fastq_stdErr) + sample_config["fastqc"] = FastqcFolder + return sample_config + + +def _run_abyss(global_config, sample_config, sorted_libraries_by_insert): + mainDir = os.getcwd() + ABySS_Kmer_Folder = os.path.join(os.getcwd(), "abyss_kmer") + if "kmer" not in sample_config: + sys.exit("error in _run_abyss QCcontrol: kmer must be present in sample_config.yaml") + + kmer = sample_config["kmer"] + if not os.path.exists(ABySS_Kmer_Folder): + os.makedirs(ABySS_Kmer_Folder) + + os.chdir(ABySS_Kmer_Folder) + + program = global_config["Tools"]["abyss"]["bin"] + program = os.path.join(os.path.dirname(program), "ABYSS-P") + program_options=global_config["Tools"]["abyss"]["options"] + if "abyss" in sample_config: + program_options=sample_config["abyss"] + + threads = 16 # default for UPPMAX + if "threads" in sample_config : + threads = sample_config["threads"] + + command = "mpirun -np {} {} ".format(threads, program) + command += "-k {} ".format(kmer) + command += "--coverage-hist=histogram.hist -o preUnitgs.fa" + for library, libraryInfo in sorted_libraries_by_insert: + read1=libraryInfo["pair1"] + read2=libraryInfo["pair2"] + orientation = libraryInfo["orientation"] + if orientation=="innie" or orientation=="outtie": + command += " {} ".format(read1) + if read2 is not None: + command += " {} ".format(read2) + if orientation == "none": + command += " {} ".format(read1) + + common.print_command(command) + if not common.check_dryrun(sample_config) and not os.path.exists("histogram.hist"): + ABySS_Kmer_stdOut = open("ABySS_Kmer_Folder.stdOut", "a") + ABySS_Kmer_stdErr = open("ABySS_Kmer_Folder.stdErr", "a") + returnValue = subprocess.call(command, shell=True, stdout=ABySS_Kmer_stdOut, stderr=ABySS_Kmer_stdErr) + if returnValue > 0: + print "ABySS kmer plotting failed: unkwnown reason" + else : + subprocess.call(("rm", "preUnitgs.fa")) + _plotKmerPlot(1,200, kmer, "kmer_coverage_1_200.png") + _plotKmerPlot(1,500, kmer, "kmer_coverage_1_500.png") + _plotKmerPlot(15,200, kmer, "kmer_coverage_15_200.png") + _plotKmerPlot(15,500, kmer, "kmer_coverage_15_500.png") + + os.chdir("..") + sample_config["abyss"] = ABySS_Kmer_Folder + return sample_config + +def _plotKmerPlot(min_limit, max_limit,kmer, output_name): + Kmer_histogram = pd.io.parsers.read_csv("histogram.hist", sep='\t', header=None) + Kmer_coverage = Kmer_histogram[Kmer_histogram.columns[0]].tolist() + Kmer_count = Kmer_histogram[Kmer_histogram.columns[1]].tolist() + Kmer_freq = [Kmer_coverage[i]*Kmer_count[i] for i in range(len(Kmer_coverage))] + kmer_freq_peak = Kmer_freq.index(max(Kmer_freq[min_limit:max_limit])) #coverage peak, disregarding initial peak + kmer_freq_peak_value=max(Kmer_freq[min_limit:max_limit]) + + xmax = max_limit + ymax = kmer_freq_peak_value + (kmer_freq_peak_value*0.30) + + plt.plot(Kmer_coverage, Kmer_freq) + plt.title("K-mer length = {}".format(kmer)) + plt.xlim((0,xmax)) + plt.ylim((0,ymax)) + plt.vlines(kmer_freq_peak, 0, kmer_freq_peak_value, colors='r', linestyles='--') + plt.text(kmer_freq_peak, kmer_freq_peak_value+2000, str(kmer_freq_peak)) + plotname = "{}".format(output_name) + plt.savefig(plotname) + plt.clf() + return 0 + + + +def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert): + print "running trimmomatic ..." 
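+ # Note on the Trimmomatic step below: each innie (paired-end) library is trimmed in PE mode; ILLUMINACLIP:<adapters>:2:30:10 clips adapter read-through (seed mismatches : palindrome clip threshold : simple clip threshold, per the Trimmomatic manual) and MINLEN:30 drops reads shorter than 30 bp after trimming. + # Surviving pairs and orphaned single mates go to separate gzipped FASTQ files, and each library entry is updated to point at the trimmed pairs.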
+ + program = global_config["Tools"]["trimmomatic"]["bin"] + program_folder = os.path.dirname(program) + adapterFile = sample_config["adapters"] + + if not os.path.exists(adapterFile): + print "Trimmomatic cannot be run as adapter file is not specified or points to unknown position: {}".format(adapterFile) + return sample_config + + rootDir = os.getcwd() + trimmomaticDir = os.path.join(rootDir, "Trimmomatic") + if not os.path.exists(trimmomaticDir): + os.makedirs(trimmomaticDir) + else: + print "trimmomatic assumed already run" + return sample_config # + + os.chdir(trimmomaticDir) + #now I am in running dir, I need to process one by one the libraries + threads = 8 + if "threads" in sample_config: + threads = sample_config["threads"] + + for library, libraryInfo in sorted_libraries_by_insert: + read1=libraryInfo["pair1"] + read2=libraryInfo["pair2"] + orientation = libraryInfo["orientation"] + if orientation=="innie": + if read2 is not None: + read1_baseName = os.path.split(read1)[1].split(".")[0] + read2_baseName = os.path.split(read2)[1].split(".")[0] + output_read1_pair = os.path.join(trimmomaticDir, "{}.fastq.gz".format(read1_baseName)) + output_read1_sing = os.path.join(trimmomaticDir, "{}_u.fastq.gz".format(read1_baseName)) + output_read2_pair = os.path.join(trimmomaticDir, "{}.fastq.gz".format(read2_baseName)) + output_read2_sing = os.path.join(trimmomaticDir, "{}_u.fastq.gz".format(read2_baseName)) + command = ["java", "-jar", program, "PE", "-threads", "{}".format(threads), "-phred33", read1, read2, output_read1_pair ,output_read1_sing , output_read2_pair, output_read2_sing ,"ILLUMINACLIP:{}:2:30:10".format(adapterFile), "MINLEN:30" ] + #print ' '.join(map(str,command)) + stdOut = open("{}_trimmomatic.stdOut".format(read1_baseName), "w") + stdErr = open("{}_trimmomatic.stdErr".format(read1_baseName), "w") + + returnValue = subprocess.call(command, stdout=stdOut, stderr=stdErr) # run the program + if returnValue != 0: + print "error while running command: {}".format(command) + else: + libraryInfo["pair1"] = output_read1_pair + libraryInfo["pair2"] = output_read2_pair + + os.chdir(rootDir) + print "trimmomatic done" + print sample_config + return sample_config + + diff --git a/de_novo_scilife/QCcontrol.pyc b/de_novo_scilife/QCcontrol.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6fc82921b2ea9144c7784bba5c00e27270e42ebb GIT binary patch literal 7018 zcmcIp%W~Yt6>VVn=I|v^4@$NLTYiiab0|@0x4L>;t=nC>qSh;2c|ffXbmc*{J|wcO z;$hVsQR}11V!IU;kE!OkS|7*zfcgzOsj6U5gb5W4i7=^xVG*WOFe1VIn>g{_K;+8I#ApqdM8wKJsJ6;^y=tImLGv#OZ;Z&c`|>SY2HNXaq8LErmu^=M#28% z-QPIWx|r_17I}pEMUALY6H6zMvlXWse(W{UWHZ_-sL_5ii^Hzw0yVHW44!K9>@<&7_8kN)mks(n1zB{DtkS3;W@tN72IVWpDM<&p-9(dNCGo%hy zEV$S02RE!Q;B>>P15X-MTX5b{b#PI&hmdz#b;eYCRQz*H8sGb^y5An*uA1wND@sF< zs@+24QmBn>r_}DK%KzETsCG?*lkC7lN9Da7Nrk4DGWwH7cE-6;F&^yFIgWcmu_}6! 
z9ahbYZVdnfESym72^Pm!@+Q04(s(c?C8R+#7N83>i89BERZl85rHU!FGfkrdbOQvL zRP8DBWLtZM!uLKmtZeY>_*tt>VNqAj1~3j!XTHa6CyucZC9-YOtccP?%cjiyD8Jg1 z7B$hRF59KP^(exN(scdDopKM=A4QpX){#aRLE0(`?qmPitT4~TqOugx4I;F)8HfNR zrBW>=u)`Yx2i9F3e7E`nzEe<))IKWg&JLE3KwUXwZZfXE|zO zubLf5ci-@JNQ{ut>MV|mq6&3mJ8bNFL4JsR?#wTe;uFlF~Slz3xg7;0*8`3TTz`O0t={ zEH6`F3XAo5=m%!iC@mJJP-auQ@~{+Hs}L|0J7g_AO!=o+Bih_kEMpDu{(_*$r6&>A zb)B79RqG<~XWAwTowi1;32Vk0!)F3H(@5*qJJwtHIM4}7aZGGOQ0lMnlO7;3kUbxz z06^FbyTdBK#iiUCWPtAC6E+L*3Aj`?fcPiirBl%$%=NI0B>+w4oiGOYBK!#O40Rjm znF4GXVmAa2Y;iQ)R|QZ4q(g05hS2;E*|x;8VFlO(jsOLL6=n8+Bzp|mBa%I?c8~{S z&;!~Q7z+RyP!6)s_e<4;NC$sp{J#C*_lf{JrihmSwMjGw6oWJc33vi&8WJ!C$TFjL zZKd}t%BLvjUt7eT)6$EesPqCpV-U7KTL6&RdtXZwKpV5t#==Q$>@8UM-iI?1JWD{v z3k)e?^iOtgd4e+>U^Ryd#ObUG&dC%AGzRCpuhR;_d}l_Ga!z481Q&>u0m8{N>Q_3* zno+?^YIi{8t{uFrKKghMO-iNUZFXT5r&Zvn;8o6P@S1uu&{JI&)$2Xg8{JZG_Nz>( zw?u_CHSQDCr0o)?2bIC;@4>BC1zm)Qc)Lwt1kr^8!Ybe{&(qwA66c7nIK4o>4T zm@bVGFpT9*O?>SibHtz+{RN5Uz`1M)NoLNxtKQ<3EA=edivEpaCl+|OEWj8bBRST> zyWJ*E&Md2IT34^v_#^TOme%MP?nNdqF?pHED@=qCb1@Kt3-=OL4icHW#x?@6I1YXp zTP1+Hudxt^?p|i{CX=_AFy6RtBaubDm$yP8)*k1V3wW2O9s>Wd=<0IE6m-gG7TbNF z#ou8v&*XzOv2^_0_Urcs@8-(Bq_t_m^=)l=OIwR&$7B{F(igt@bwL03hTXh{zx(C(u_>317sN<652! zkb$5EVT|#1_8bBmR*EDY9gHwSSbRsYLc@qFIL=CQqD5Rlsd2R_T2bLB4pp^ZWr7K` z=mK#NDdHvKV8xairxduT4fAGf@EUqwHn`-Xk5&^$2}Xt@ zjRE_D3%Q?w%~zm=Q6;xVgn$eKZv8I2!|Bd!M+&)J-9$) zW}0v%fI;AFz8~_O&ORIQW{oahj3Q}93C`zDkbQac56qjuhV1Ts!pvF{NP1yL@JUo% z+?J^>GJFA`5@j_X!}pSddq=hSMxD39rBzL$*;Kuz?#m zL$HA_2DH$K9Iy44q5;UrtCe8aj?l8QEg*}@IL@}eeVT0_d~5C}zLEP0FmnSl<1kz> z2OuCsTK2vWX~4{+(B1EdApzDIHxOS5hlSCoE*A@=-FN}#1{Q&DlX;gzKoMcs&lUmC zHKP>71T*I(O$wTU+k^J@=Yp&E=gvHxYwGg)Kp6C?r*lo+UM{%stOEDO+8NRfuFb}Q ztuTI$LxRa5*PzvHU?Iw1I9fpP?yV`+nI-yxboZ{RrK0jDHb^?#oTV`!@?Y9CL=gHqje%5E*Z01%_r6p3-sdEl!xLye zsJ-}vag5q!c=4)mFL2$26tC|x(5ct!hCq`B^koQ*mVN36epUb<^%p2{@8}-UY4}N! 
z3O^@__O(-%bT+{tok%;()mhkxHlr|b!1goVb!ewZ$u5OlW~q)yke3MQG@-8pO(xc7 z61vc>CR*}i!t|koZX+UzVMIOhEfmQ85i%r@NO-7AVA1U1ai@3t)4lj<^)v!eJN~{e zWy1>R_-o6?kp`XUAwY$ z^@`j%L4Rj?6$;G*2`-oVhF0=3m^iC?f=&|h4byZ2L8MqCXc0Yz#!}FqUoan#5#-m+!Vhze&(-_i8Yrw8rXM{)( zLhe#%5@>#^_*};4mGZM}pSRB27onX2{hkGJ9>hm3Eaf;P9%AA(j@TmbzXfu<5vG#i z*5B9eD$EeNfPXD8pyOIkOmoTbHq)VgAsIc e-TYJK$E1y#N3kn1J=4~7Wprlr;^=#$=l=&Ur|l*H literal 0 HcmV?d00001 diff --git a/de_novo_scilife/__init__.py b/de_novo_scilife/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/de_novo_scilife/__init__.pyc b/de_novo_scilife/__init__.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d8841b23e174b48b18b888c018fbc7bd2b417f4 GIT binary patch literal 162 zcmZSn%**v=jX-cR0~9auj+xkvdqiJVY=n?!n*-!73p z<#$Nrr1Co@(y#n3iJVe?w?qb%-y@Mh&cZ*TzJ%~+7|)ONVs_zwa`V{FlwcD1n3?$5I>=IdesXvT9>-L zepWq(NcoX&NV^cXBdmL2G@ivK;uFZLMPX2ly*R8@;|`?5st%$UePf=nA9BudCxg`} z@O&M!H?JI)sueUFX=BN01j~)E>=&Y7C5VD*F(@o=deU97R;i$8u4ZvHC_XFr(RS3R z##~S7hBXV7pfbW7$u5;^%U-!qtW{USlH_QwQZEN-&KszxYf&8dg>twYd6SQ z$thqahlp-!ZOe3%_*`|RCJVtynI&Ot!sYIEIeC{;bGd3V4B8And^Q~sMa=W1zilw;Yx5IEekMK-Ph!+C9I(!y@z-t16YKR4U_cV(M*DsE() zOzuf+WW>4<>ycPDV%U<%>p`qnVkZ#mlUOfeC%IvPHxHy_Dn)ueGrb#X;U<&a_``{i}rz)*z+A;rnNchp$)d;)I`6Yd8ux5I95w%Jlcl zR?)b{)#=CZ19FHu)moQYgYW|~@w|mCvxXT+k`red^Dt@uR7T7mz_G6UhN6zD@EVw5T#>6$iN?faz zwE)4A#BhYvGJpg=4iE9qP%W=|{w;Z6CQ?12LJqk1c(Fvrt&9MCEO4a)6Ko z7GwL+N${ElnlV6I;rXthRjm=%8?h#gnB1{tuOgM6?@E*%jg4V@ph8vi*QnNS;+IVN zS~6>0TZx@=SbfI1qiaGA%hqlBZE+|XTi1}A>g`Hn=Un) zD>{pl{AhAsW-gn{9YT(bSZ!41M4xK6pY=~M7+^5S;E>OWBh>6|o@?!k-ZIKDWKEJ8N*oO+aC`yP6J?rd^;T`l^~20Z@as4f z9~T7LAJZ&AqJpfUhy+Rr<=0{L*d5j>YY1xZ6k@HautyYOpFM>9;Y2MCqHdo}6?xVk zw9Z0NcF1q$ILPm_25jV^Hgn!YjR9%>wo#}2ru?K+_|bocg9R-Lb!s#tIGIt7(dhmh z)Tv&fex;U$LZy~<(*)Hn)aWoPDYZ3e2~Yg7^(i1%cp{V>M?mWifh#83sMA0o0bfvK z09!;v%XU)5g1$njg0Ci3h5z}`br7+S`iv^>q#9-;0Hu&k#-Y3r@08A?*ib3%)INa+0YX%g}p1%WY! 
z)edIx23hM1%O-iiNS&g#zOeiYYWCjx7uNb$qJU*_{5NH|^-mI3O9WbgkT_{PkBf6t zlkRlk-h=rWIlN?Mk?)>Gk$Vn-Cgu?O1X2Zo*1Qc*6DvlaRJO+7FKHgp(I-!CEe25~ zAATO(nJ7uK(KRp9)E!f`aycl*v&D}>PtON&6c+Uu3whw(nY%OY%)-=6AwTo8hm72L zzAJd3i4b!Oiw~EOqq)5wJX|uVv0`hNWARq3-{C;_Cf%jNQMKy+iY9(c=JOBdG28=p zX=eIP95oCsKbpKhH$AyD_h6xrUvlRbW+BTbcka(D%*`w;X`Z;H`NhKYoO|cGE=FEl zT-9fg3C*xq7!JdNy}2+79yDUuC`xS}UOW>;20vpTdMJ%Tv53|H0xy??3iW7}3M1`( zifbyGXl!gmaFPolSQS(^!U#qLv8WWp?iKuY-(@Kmb{U$q>Qw^KGrC@eTH&`|tA$n9 zLBzepfHmFUU?BL;8c?xSm%!$}hd>Oz7HIzw3vM&uwh%cIkeB_oS&OI@3)DA+45M)i zt1u#h$o(PP-DEo|=48o|D3*tV9tWCI5VP?MQ?tuh&jLs1tO=!qfKfszVez7M%esoZ z3y2d8`w{E4PFe#72-#5vgnxcKKsalF@Tad0ggY>Ay3`pWR*HwfKtKY~(?DXMKq5dV zgFrX|jz&RN6I5%$p`a2hnxxxfe>L&$325S;=J;lcnM(qOS^ zK>-~N9R4DK!(UkIU%vzn-COu>?oSxV&OHMCLg5{EQ0>*>kKAy|V3lDs1jtqh-z3AC=13aPV z#et!3UC7R`i{8u(ec-=CT``$?uI6VzsHXA>e%;s|tL*J8MW*kLnQ@ao-)#|t)F=un zwT*y+wW830lD+8jHX80npS^2wteC@zyej7WI$?&?J%O3MAM+a&9KoI52U|LnJ5jN6 zqNI8PEmO#$F-C8!*pILR_j+T0c)%MAJ8_>kcIkjO78Dl-B0ZequSMTU4tS68;C_>~ z>5xM%nK6D8)c92V#k5Gx05J2!eati8nPy@PK^OgRk^}2cl4;j) zTbguJB(Y!hjAaN9hz+0(1H#~i3lmZ$ZscyU|4_~No_h=~%%mM-!l^tom0>@(OdHlP zAuye%?zCfoAFfY?1Ja;H5_dr+iTUC~z)@(6>(f}$@MIFRltr=!AEU8p(UGBPAZT!Z zf~=t)MYH!v-Ih9A2lvQft+(0hoi_c*W<@dgeUeV3EyaVIP8O>uOG71+7jIhPxy)g2 z!Ry#!+J1=~m{&(RE9s&PtA4O`+}=tdI3mb3C%b_Wk02a^j$eagkgihWymRSF6T}(S zVw3BMBd3x1AzQlD!Fq!?O_^77GZx#H*GV$l1K7By1lTABqf~9-Xz!;&q&`roKtuK5 zj5b3!d9&3`M>lKjtaxy*`6JXO#`vUKb4jCJl2n)0tr)TH_ZXvg5&k9?Bdpxwrr*Ns zF0)v8U&i7Lsmn~-{r0PPx#`^i=9eOPy6Gh;HX>Zl#bUM64yKE=AjUIOw4_answ7*k z!b+`7qxL2;#rMdP#HQ^@Dk4d3L%a8w_@K+Ve2X7V4y^1RAaLp!9E5E`2jHzw)E6U6Z8j4Pn|7236s-*n)YOzf$Yg#v*8BlZ&MHf z?#s9^bAhCQELgz2RHj&&X2@y?jwUsd;?&U9Ig2@Rch{mF0FQutE!o}6ebXY8dAimzTVSJ+#cplf# z3Q7WNYT|O`$ZF6x{WRI(iS*wR7JN#QZ>Pxt@e)9(^otgO=0v|xJ8MuF@H^n!z|rc+ z{oOm}LNRBpZ&Ei<%y*T4i3~fNiho)8S12IXl(Ru&BavJZQkHZP&D!aq9HskSDvsO? z`d8DWxvkqH9U2^z4xV60cLq~{bVE5vw{M*2L9Ur2%f}q)$RDA+B~ikuB74b)9C;7t zZ^2ze*rL+DK=LIj4Q~dHTcOEC&i|FWK(UC#m0TDfTk&+f9%7Nq)t^aAA&iB%hI>HK zi<0kUDiU{qbY{ZS(WshBAF*uq=izoUF$KisiiI|MdfxnsUYG=I~l*>)+EF)5IGscJJS4FM}$7f}3 zkiEbfNzVuJ*fUvrRCyKEOub;Vu;}Ne7Is)5id!5ncbODboT+D#bqPNC5h@vD`803- zPBO2Z0SSV5m*x5mYL?YSStn0bK7v-3bOkr1AR>~cx`sR2d_*ZlD*dQ(mztlPpXb%8 zAM^84W#c=hmz*&NNiVSr{Dx_XF}Z_C>Um^`2dR91X=BeWp;7qS7Wbg}$!QJ56(N#i9B)i#(6KC!hrs_?O|)X$z! z*JF0SvYVo`h)7XT1W~|5UMh-pv?%}h3Zkt4N~}^02JU7C>$3Px3-bMbj`JSojRZIc z=Z(#Q4I$E(y2+rhYYh?d!GA82*N|C(lwjGwcTnM+#7R~xJjhD&!GpjqKo&qVl2x?V zKsMlvKr6{#gHI9Xy}BWb|86=wRvwi4*D!GQrIH(C*uT>XDVhlEIs845&23f_~ji$Vy0zIt6UL~<}T zmbM-r(_WY+CYX>LAK^t$ z>1fgX((Z3FpY&=pznbu>yE5UM?x)B#?eIFcn)aXr*JB$>k5>o0_Z^YjKU)1S@jp=C z8It2Lc8Nig=?KXYe$%8j6nl}^=R$J$+*RgoTnuz0upF|E-i{)|oD8;fVtB*^S`UH+)3st%& z_w&c(qG-O2=A-5_*oC@|!4IV__C*kq>mVQWMHu3e&=yb@b01IY(raBoy$h(v_y~S? 
zSnt5kao)O&2T?;w?5uqjZHDnYO2`a1K?i;^&&J;%0YC?Qr!)pwWfI6Xo(Q|e6Jcn~ zbn1!7Xbe;XX$`OrOaSZwn`^&5rUQ8MF65C-R^W`4N6YeFH&{gFkqm!?%IiD@>;jS6id* z)T)l>(4raTJ2I(iwpQ~V{EyJb(RO9Zh;KE!{%+rPrUT%)sCnR-JxkKOlLfcl2qLc( zj1~2UreO@Ys3fyPbjR^FpE*~b%Y#Md^29ag^3C0=Vpq8&$xTbi;38eDQ?*9fJVV8c zQdzd7LqW*}d3JvoK!{Be1@*F548$?%%J~QTRqir_1_E(b=9i{%UnWqn>EZPadAoUf zSgiTM2zR0}f9P8^<_)WuX~`qRX0lMM`0g%BkL!pM4-0sER4~IY@sRR@Uv;F`1)s@W zwR&)%+U$0#m>wp&AE)r6{}TarJ6_A;2(8J%g1YCy+O>pDiz$HTd^jpRWH==0nZaoh zCSc>@0%ry%2h#RhzLr$f&XcRxdp=m~nEwPe@v+E*aGwjnxiOTulas;-^-f~cu=$466MaE%_7q%Qpdm91% zGmAM3U$KZ=o|+$TllgXyyUbrPGM1%@w^1S-BqL88*Lfi9aJm`v;QxRmC!(zbuV`UI zTW?r5pKC5Kz;cBSSw%E1R%5~Upk cH=u;Gzq#&#?&3&`3N`0VCi6p&|q?2nlHff(OLYAb6=zh307l-}hf@ zpNnsk4Czp6XM5e(+H38-*6;tVwVV8+H#PXhe|iOF|4ZQSd-$inkBg6AN0pSi*Kj-P zo)d>B)V)OPPO5v!*xjM-b;RyYb+0pax0chT?sX|Yp}vX0UCK|2W4H1<#L=z%PI07^ z-zAPc%HJi99_8;AN3Zg`#j#iUDRJy$Yv+dcpykE?#(&cV-c6_dcLRU*C%8mPJ#f^T zqn2(MN~#APD(X~eH1|Q5T1ztH(}Y^@P;2;A z%(9kHPm^lhQS~$GZsnkgc5RG6U(O9BQT;TELL;ixlGeN6%7x37+>`a8-3;x*rBE%G ztCc9tI9&~+T&Z|J%!kE5=kBlOiWMD%5z1F;F^KX}6rzE+vsEo&<*L723UtKMa=A*r z9OQCR>OwUvBlSLpvqoLb*ZiPTeN-Ki_VlvAQuThmlq*y#3&q8VAv#~Km4Z0r2t&}K zz~8n_Y<#}rM=2z=?%YCU2tFkjeI2ghTCtEHUb-~=DEQ=);_%pbZhGd!ncVEe?D*uw z^!TtJ*;Rwq*^_D#Iy1bY{}Tif2Y2I#;4T!9<_!Srw4`ih^rT_ zUUBV(Yp=NW!L?6Z`{CNJ*6?B}^>DWe@5S{VXszm?dXQ3UJq%*g>i?nE@CNp&hY1z_ zL7d~z4ms@29NwIyx1~_OAH&7&Mg@0mmVwqR(&bKs`x{h#T3*Nliryo!_aHm**6pZA zJ-|I3m7$d`^pB~v1Im9%tsPYUaTV=FxykiDc>-isl0M%slqbwkZWtf8wMi#iJzI?2 z%HNpr`HL4YKHVz(cB0wyaGM8abIBkT zRO=0&ZuPtpd!DENH@-b*w(Rn=anXD~e#Tw?<7BhT;r`il`Hz$JA1B%6&(oJ?mhy+B zv+OHyPWk7R|Ej#e9gJ9%wajToUr>H#hv=;Ghc`!?`LNRr3s>Nan0KdjAFi09E5*v9 z8->M6Fw1Tx0MM(?nnkwsP-s^O3JdItV;n!+)JRjL}xOkU?Gy_JCg~Ei%XGy7s2+f-g0%d94+y_ABR<3tD(DG z307)B0ZXS_thmuq;F_h?HH)pgP()p(V$&abZ?nLxj{F&Pk%OSe8zaWUP$FEp6{JM| z*cl*>tGiwbN+o>|uH~p$8qSxBixn@(`x^afYmM?DctUg0kNbQfucLes0m}BbsU;cf$`*B%1f;WJpqERL zmZuZRH-(dN;tC|^Vo(X1+4dtE+1dh!7$^;_YUvwvWZ|(7G*GIRkCw#opjxbW9KeXF z{bGoRp&G8{f|a6@IAEFPdB|ivrItrHy(K33OJ)f`D0uUBUpRQH{ zk0>aZ`b*)s4f|Y3K5b}C_nLNI19$$`#-Wat) zDbGBS(|}27d|^2ZQPWs_57B{Uj@4!;OGGmk!+iPr#I%s7_2%qr?2+8$y*7xuDsQKC zPgEAF(k&q#RPzSYb=hMFd8A`c#){q9XfC=+-jq#&d`ozD=)>i(@?pNT90-rfrZh}PN&o5q?{h-FgUft>BY6*IS7AmvKv10(}xuC4`Q4&S49s zuh#n213+t^@o%Gy&+q80& z2FuvTeQJF`t;tBP8RMY-9l$99Ogcb6SRbGd%4ZiCr1wu4>kK{UyF|#vF10?W)&>b! 
z`_uyr@F2T)J+0Q#3h+zQ=7XevO8KXi|BCX@DF3ViTA6%XXGy9spY+eM$OE(&L0&Rq z;G;@EA#95X%WTb*Ll^M=O>555F-c)A=bEAgrTBWr0UqZ(k5jS7h2A56$#RN?c`>Xm z*L1d^A9=?Z%$X3um}wRDL>c3YZtKcR?Ec#Ci;9B zU2tPBR>V}1%d5vp*BjuaeWCD(DnbdlOF{{C<9XM5*nP9(PuAt``RB=cN{=8A_e-zT zgr>)Tc?)GMgBRROOl$sKaCtkA^C52cB7Re{Vw1?gN~~dOc0&#CJYx+kIj#lewN#b&Jj+>XgxnzTCWjgMrdAaDHUxHmqH!lp;37}#jv z+@0%eVphZ-@6Pl{{2GYWU}N#l==&&ebaHZbbpH07+g6uLSS?lnp1%9I7%icfTmW-5 z%(}CsAg=?ruoM&?y78N5V9W(Zy5(vZpl^+)CI$)dG@I4DtwwF_y=*%9MOHc_@A)ZI z>G1{jUg6~oFY+RsrHg#y@d5ETl?YEg4#9rLEFcC7f8lv3nf zhVdq=g&~5$259syG3+ugS9p1im#e(Ij*EOCUUGfslv?bW3yr8M_kgP&kM*uL?+}oR@)zZT>`eB88j>oK$B9 zzzIPV@w|2-bQch^SI|@7j$Om%3EU0CmiX$!9moz$2b#0W>1TAnn?@*E@+*b|p9%-|5(NoW zNM%J)XrBks1mK~lpn`{R9YPrJ5U#^;0T1EohYJK|vJXg;4zeoY0uu_NLPHzkDqLvB zHKX3LQs|tNgCSlXA(iOA68BLdly?)h=wq!VzSQbDZan*Ag?E3P<%HBLH_i&`za2-o z?Gd(ylM*T*E6I;bGYlxfZSKX2gT`aULF2LFpz%Cw4fW&BU6zc_!Ry131BRKAJ0t@M zBs1Q_BINeA2>QS;2w_$+7P-m_W02dyXhs$DA%2+$#fZci0h-_6Y#$oqU$yNERfj3m ze|?Jt{x#|OHt|lHurc);EY5#(yVP$f|7$|9))qM4;lB+cC7F&i$<(&JvFG781BS>g z)0*>hygts&jLw=JC1F#V|IJONt?0BUYXfzRhvgs?p(j%lE$a;<5-ySci~{szT@!7M zTDBaDc9)G-BEz%X)hg##av?URg38FHix&+Hidgsc$%*S9r?%xnf#Lk4MeeMPkogGJ z(4QWPhooH2*QD4h7gGX}vW8S3y$F2dLAwj3{GuMYk`jrHxSOjimve>XsJgJgU}||` z@CY{<&|+?6v%{PE+Qffg{!Chx^`h}sp&^^`9yPmkt3r6*afEs&7-b2GwhY@9 zC(G(Weqk=C3tW^R$t^=*(}e{q<%u2BngQ*Kii>f=LbPZSN&!&#SWhO@Uz2*ESheV| zLU}EZY33cbQl$cm@M&r|DG)3ZA zbbL9K|v9zK&_1sli8|kkkf>@ zj0^+95<)TRT2IRo)+-xFW@*LVc|r|fxgKaqyzhAux2~S8C4m=ym`@pdA=&06hb=gKH2jpdDOkxS;$A zp`{`5(RCDI00!{EF=2olB|0C=9yW15O+cP9JL;gDH3*C#2#9>xnnt3hP^Myqiy9W> zL(#e97SP5MBL3AsP1HYgSiURef962cVp@S!D3i}#CR9w8xlf@${EC(a)Y`rWUvBMA zi+rYUE%F(UMLy#>)0)R_=b9L^U1r-}E(b*f2HBa% zG~Im0AhTkxla$mg2Eh?!g*==e$Wj3W`a%8!%*EO*M)D$?q<@eE%-Og2vYgl0KJz-H zN~n;BWnlh1hCiHCFcgSGQ&$wYilZe|hX$dF@cSL6HQ>S>QiA##8RNZN5imDq12(o) z;J?xIwr6`&`9Q-f@y2lduRFkr;6n5j`LH?X02AdI`iaz&(qzK-$0sL0 zx;Zg-JIBBgVFQpP&7 z$%(0n`5Y_B-I)1s+#9{cw!IEI<&fTBcf@rdy!xm#6@+upZ%3|NbPJ1*fsxhF11*hQ z%wE1~;-QSKj=Xm3x~V`~b7SVt^n7j}YT(@MnaMGfgz7mu(|4vMcyw%T#C;f-x1 zP3wX1_P>-E8DlfK+cQ%$von()@eb#`i78Sv!P-7IG3agxLL(&=25^|Xi^mfC%q?8X zzKkEOtX|Sovh4HwmfI2mRE*pLED}7hS@%xmVWs-G;x6QiC2Y)!Fqv(hP}58D$fCm0 z>#K8f68&U7W6>4B>b;C=HZiPp3(}E^DK~1OEoB-Z;=Dy-?;i77&ZXpq;|7~KH+~0q zf?QHwobCk@uCy(ieLm1_GA@xI&Bz0J^Q>VO+&=$&2|wWWkn-jw`9?H{y!mZ>h-mwyo$c zhYOM|iWO#*;ljv+S#iadHLjpo(?2+4+>_+J96__)Mwi$S&OTu&jv#w0> zEUoPVqq&0j#y-9(7SJ)Zj>EIS+LxsfA~+9_bW!&y8?*-H89jsM)UV?7A`+j*aZaRZ zJe)^~kZe0!VEPU%*x!LI_Vwc}EkJKKSP^AqPw>T9f@1d7jc8w3GPoe)G7%N!l{hjVqn@g>N+-pnzX!J_e@F@P4NikTWf&!={w5 zaT?E0Yy%i2c|$fF_>7-3Q~(R8;cd@$i5fUGc^g{Sovjr49!IhqdK zBAgbWk4*>RB@vJ@N2~n(Rjs$O2n3xZw)JpizCD=i$2nipWpGdwZe>y1Eiy)QE>Vdq zHOQ{6NYbK!i9kyY*xi}wy?GQP+M^D{LeRm;S#O#Z(HJp|Z^X@-G^T=MbSze^kh4+^ z*kRN_z|mw_&xO}Qmlz}eE8b+~tqN=#Zc?(} z5;x7b?F5_D6-}xah?+kIV~V0miZqNV-?Ip@VOyMA<7MrA9kDHVIZUrGCMn{%7}IM= zpkm`%+BAqp`j%&{g%`2={VaIMd%*KC%_EtlSh&q6LUZHoe$Z)K{M4*Go3j0nOo*l| zEH2q>@s{C@W5KRAb_!=1uod&=D7aFI*(c}g{%tblAEFsMX}RRS!=TMRFeFb&_Q4(h zZQU{-L@2`#CwT1wRUU9oiMvPKr?J6qD6|J#>r`{QoKq*CHsL@GZp(F~ zSpt?xijyX%5QkIGCh9*__`PcvN&zHd_na8F)0!r3yil4&4I8J6I4h1lAXzhc#7rV` z5;4n&t(MPi*EEr_hUN1~apq)!da{l$6lG;G0@YDeV+4q*C``@Dkj^g1UdTj;aM=_~ zBek(H=zYX=X07CoZY&;9A@WY~a+;S{csauh)jRJjFGIYXoBg*mtynnQf@>4~?>z@L2B2Ob6LV{y;{1vEjmOV1f5Jt}9rGt#JbsS( z6D}S9slZr91m@2; zg3;J@>5#jD2hHTM^ZI@%xnE3_f0xp5unCzk9m+)Ck_mG?Gwo$2Op#0_Q!@^C@0X)J zm@OG-P1-#)KwA+Thz9>8|Q6nl)5CQT(df&)XG*3m!3hu8NuZ5|W^s}j>U#vuIr zgeeVXGbdMbnm$edv$#=lY}82I8&l(JLzqCu=#^%+rY0VXLVi6`#7-6@8mco8t+y!xFEiXZkuqaRW^e;(`tq!ae-y=3`eBJ zpb-ls1Cxnjv`dqO=Ll@TB?egcWxCB_6_cV|Ll&Dt+>%1yMD~_r+{i>Z-JVs3ppO6C 
z$4jg7Wt@d%*Dv}^M2Q`HL_t0*6@&0ySt>GBskga^xG2AA`ZRTR9255l?Q%m{n;OSO zL~3u-*&|~;Zfp8*(%;hbnVTIQ8=rG`vVA<{waH$f@$*wHm9i{0SINc#={2n!&dblY zR5q*}W7&KQ1&C2o3nq<9F1~Wj`kr9{{gO-#8bh+$nRg_EMMwVv!gg5WI$&$a>elNF z$a=TYzDSErfKw$0<3j?W6%ToFZk2%mb;flH&8M_5O+QvuZ zOQHm9t+Td7xf>j`H3kbJjBwdjXwn#6cv!e5|8CM2+Im*!T>J^TG!unBl@qApZzfI2 z&@&rNX`IY8_hb_D^Yu7yy!uL;w#b=jCPg*;vVH5ESvlr=uCZ=^FA15`tlK>2!3IH* zy-biW*4P!i%I&gcA%DNR*pMy7x7DiMw4r1{l63L%P2;XTl=$C`P=3=~pReM(8_pqH zecU8l+-?Rs4rHKJN{p_olUOBTkqIlf8S`lsKfrS3({(PP__eW7P*|!|OV!0ySH7Wf zaa3cO-^bX*8Gg*=$~gvu2%O7UD84XqMcH)m9XLcHBU4JrCJDdoMkv=DgkER>18>dE znD3HYAr$u=R}QKo84byWaM6{+qatvMDg{ouF{-)lyKcsJ#e03`mi;y-Q)9K~kx%SP zF}T{3NC&@}i5KB&xCso5JxVidTN2n-1idkew(JN36hTd@xCt-r>!YCqDEygC<_;zn zqxcz!K%P5}*E63YrJXF>#qx~D-(|B~8v%sVh9zxxyygEMlxsflZ?euUKIXw}!PDR- zxeW1_1Pl~F1sJ5+W%9x!1ntBNR1~RC_Tg*qrbSFW(fpIJ#GveyBLrtk`K32Q(~=_X z>`fd-Zg4~*~i zy&5mS#LKVp^6R{O7ndRYYq`9rkDyrR+va~u^$w*Q&Cka!;)dm?v%9Ce*ge%f*Zr`& K-2Hm@d;bGPlZ49v literal 0 HcmV?d00001 diff --git a/script/common.py b/de_novo_scilife/common.py similarity index 55% rename from script/common.py rename to de_novo_scilife/common.py index 8b38b69..7658ca5 100644 --- a/script/common.py +++ b/de_novo_scilife/common.py @@ -1,6 +1,20 @@ import sys, os, yaml, glob import subprocess import align +import datetime +import time +import types + + +# Header levels +H1, H2, H3, H4, H5, H6 = 1, 2, 3, 4, 5, 6 + +# List styles +UL, OL = 0, 1 + +# Alignment +CENTER, LEFT, RIGHT = 'CENTER', 'LEFT', 'RIGHT' + def prepare_folder_structure(sorted_libraries_by_insert): mainDir = os.getcwd() @@ -98,13 +112,68 @@ def is_exe(fpath): def _sort_libraries_by_insert(sample_config): sorted_libraries_by_insert = sorted(sample_config["libraries"].iteritems(), key=lambda (k,v): v["insert"]) - if os.path.exists(os.path.join(os.getcwd(), "DATA")): - sorted_libraries_by_insert = update_sample_config(sorted_libraries_by_insert) - else: - sorted_libraries_by_insert = prepare_folder_structure(sorted_libraries_by_insert) + #if os.path.exists(os.path.join(os.getcwd(), "DATA")): + # sorted_libraries_by_insert = update_sample_config(sorted_libraries_by_insert) + #else: + # sorted_libraries_by_insert = prepare_folder_structure(sorted_libraries_by_insert) return sorted_libraries_by_insert +def check_dryrun(sample_config): + if "dryrun" in sample_config: + return 1 + return 0 + +def print_command(command): + """ prinnts in ahuman readable way a command stored in a list""" + ts = time.time() + st = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S') + print st + if type(command) is list: + print ' '.join(command) + else: + print command + + +def _check_pipeline(sample_config, global_config): + """ check that user specified commands are supported by this pipeline""" + pipeline = sample_config["pipeline"] + user_tools = sample_config["tools"] #might be empty + global_tools = global_config["Pipelines"][pipeline] + """be sure that pipeline I am going to run tries to execute only supported programs""" + for tool in user_tools: + if tool not in global_tools: + print "tool {} not available in pipeline {}. 
Only these methods are available: {}".format(tool, pipeline, global_tools) + sys.exit("Error: invalid configuration file(s)") + + """check that programs to be executed are supported""" + tools_to_check = [] + if user_tools is not []: # check that the suer wants to run only some (or all) the programs + tools_to_check = user_tools + else: + tools_to_check = global_config["Pipelines"][pipeline] # otherwise check all the tools supported + + for tool in tools_to_check: + if tool == "align": + tool = "bwa" + tool_bin = global_config["Tools"][tool]["bin"] + """this step is a case to case step as several special case are present""" + special_tools = ["abyss", "masurca", "cabog", "allpaths", "picard", "trinity"] + if tool in special_tools: + if not os.path.isdir(tool_bin): + print "Error: tool {} requires bin directory in config file. Directory {} does not exists.\ + (N.B., not the binary as that one is guessed at running time for this tools). Please modify the global config.\ + If you are not planning to run this tool specify the list of to be run tools in the sample_config section.".format(tool, tool_bin) + else: + if not os.path.exists(tool_bin): + print "Error: tool {} requires full path to binaries in config file. Path {} does not exists.\ + Please modify the global config.\ + If you are not planning to run this tool specify the list of to be run tools in the sample_config section.".format(tool, tool_bin) + + + + + diff --git a/de_novo_scilife/common.pyc b/de_novo_scilife/common.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9ce3c0c780a4374cd9ceb3e7d912371b13e226b3 GIT binary patch literal 7034 zcmd5>TW=fJ5uPP=A!SRlW!aGuC*54s(nTUCaW1H07)>n2QX|vCQjQ%WF0te;rInVu z)Sjgz##T|lL7O6d^B?F#AA&yg&-AC{xoE#{4n@mO5EO-7RMMWEbJ=s|%*;14=cxGo znqjG6)$J^`pT!9Ifr;_H&!=WoZu9maXi=Y zm@%~EB0G*e6ghH~F33*taU*zmSdgQ_ao31ED9A%cvNgfPJCLMSoy1NwKInEhYw2Xg zE&=TnO>jWu?lXCL~7%Jt!yS z!KgeKk%y!DZAm!Jel#vuIYK|biXm9~L#(~`t=zHf1rywggQaT~%B8A8TmE?SV{-#l z5^fM>t{-QiAN1YuFfzGWasxlv3NtrP-E=?9-a5#lJVayeFS&V`b)v*aJv1oC*DXMawC)E7T~1A=JJ{<-da(cKMS~ zBcgz-lZYCT!`vb&R#c1_m;zDuldJ5*->?s` zv3(^VOvio*4)!vOE8y+aV_coazN?;E;uVlXOa*CZ+$7D1pgIU$vz_+hKwF!kyB%iX ztN2m;cr+AGT^W7cQzoHa@fazz5s$L>2qljUc#|xqSkTtJGc0CMz>kPDlJKCB_?=Ke z=FwKY^DHi~pjCME!XDA!y~N^wWRT>f*M(yV8^-T+$W5|LybND?2UvSQ8QOL9-?)Ew!^A8jd7ll-k>t~3DjveuujwtDfzv+nbQzO7B<{#JL10G=s&^Yv|eGLB?QGz-NDyvQmKvXg`D>&-= zM-A0q0{t0Ep|-N9;U!8EL=2AFSZUlpTGZf__VRqtdUJ!VR^={6S=ds*`nl=G2wMES zon{?Bw=gEEE0nhlm2t3v_(xmN=y1YT$S!SV5Nc>s@ak$c8e}cKiqL*M6ZG z>Xa+$kJZ(xmsJOQ7E^i@T7@jDgGG)Jw(>OVKQCG4bm+rX8aDm8jW$E(1mSwp87WLz zNz$I^h-j$hI}{KVW`9+TCwJxW zw(uJbiu3r66@}ySy(K2T@V7fie6C9=^NUjuG=yNEZ0;mRqKisX_QA~+EE-T)cjT}G&X0v+qm_a z)go7JGtbnj!nA@WdBz!-Wd-^R8=Rc>E{?jEjgL21)#{>69XHnO>4i{r9T*MB_(yh$zcm#p>9$R&r30;XwhAWX5%0{>Qc=ykE<6QV) zJ8Es;17R5H=n$rhbIusv;{_LP;XJ5PK{ZdoQ1j{xUI$@AI)jQm%Q2!9;fSgliai8ax$s*(oh-hs zTH4ie-*%$!SsMy9b2aum&A`8waeA^<6y;8tv!@o_Ee>ZqpmmJxWh(eYHl+#QbK{hm zV}soM*a>Qnyik$i{A_fHY3@ZeOC><3(8KN+^9X3QMlWtXxb8E^`f$hgYJl_DebDoI ztHUZgp0U9aAFb`MwcD^9+~*QRmJMW27?keiF1lc)@J|ozRRdBCn^z+J1Obn!gtfIB zt3bW=VgMzxs9W4Uk`mi^V>N-F6tPiF zUEk$#&rbr^ARZy6ahx0TSkJh%@WopTowpVOcVTsD;pWl;vo=gx)eJ@bP#z)Yr1dc? 
z*9z^aouwUCOpY{NyK!)AvqjSp{gR8Q7t*X)9ngRm1x{}CW%B1{iDZ=G$#IHM8T=j% zHk&Q*OgoA&dA87^^)b6*x}=)>2r_`E>0GEBPz6wkfryhunTKbXM{k zK{?1*$0fVOSO=x^P=JpS_li&&#ERI6G!~k7_AUwj?rgy2Ebr~7#$h68+AjRlC0$o zD-0TWn#M-m+{R#xF~1qKyU6Ky!uI!((kcXC?Z;oEbJ4w(#9RS+o(r?yP6Nxp@Pj3c zGH+nIl`KoMC7@;Bk0Y|nV`wka8!A`Bqq(Hf6Wv>hhLz@luPvL(VG)xwVSD7+Z}yR3 z(7rmp>18cDuH`q=Eya!>#~Q!Y!SQw{3v@K%coOA(^BD|sz!103ZD!$K4@stRu?DZ~ zY`RHwTUM3FMHg35gC2bI)D6U<`_xaRy7VF*nX# z_~9f8A=ae<=aD1FL-TX-Vuxdx)Z1>>TTpO9y}7soEOp>t9MNE?12 z*9{K%4@CO^D+KG}_1tpeif95hYEwblZDe3sdW)pTPYtj+y{gsZ;BvFjHWB}6Mk&xdT#N7H)EEA#<7$57ER`tD5Ss} z&ZzjQn=Uf0*Hbq}+M$7I0cuDpvZlqq_5Pp`DzCUuR)8jH_Z@C@qh zqIj$|rJiv%R>BHb;aYlec#TY|&wd*^8T^jS6;BtZi{%UDnQ}>=+44krw0!E{`{V_{ literal 0 HcmV?d00001 diff --git a/script/deNovo_pipeline.py b/de_novo_scilife/deNovo_pipeline.py similarity index 74% rename from script/deNovo_pipeline.py rename to de_novo_scilife/deNovo_pipeline.py index 1932bc2..a1e75f8 100644 --- a/script/deNovo_pipeline.py +++ b/de_novo_scilife/deNovo_pipeline.py @@ -1,9 +1,11 @@ import sys, os, yaml, glob import argparse -import evaluete -import QCcontrol -import assemble -import align +from de_novo_scilife import evaluete +from de_novo_scilife import MP +from de_novo_scilife import assemble +from de_novo_scilife import QCcontrol +from de_novo_scilife import align +from de_novo_scilife import common def main(args): with open(args.global_config) as in_handle: @@ -13,18 +15,25 @@ def main(args): sample_config = yaml.load(sample_config_handle) check_consistency(global_config,sample_config) - - if sample_config["pipeline"] == "user_define": - sys.exit("user_define pipeline (i.e. a pipeline that execute one after the other the commands specified by the user is not yet supported)") - run_analys(global_config, sample_config) - + + if common.check_dryrun(sample_config): + print "Option dryrun idenitfied: commands will only be printed, not executed" + + + if sample_config["pipeline"] in global_config["Pipelines"]: + run_analys(global_config, sample_config) + else: + sys.exit("Error: pipeline {} is not one of the supported ones:{}".format(sample_config["pipeline"], global_config["Pipelines"])) + + return 0 def run_analys(global_config, sample_config): - analysis = sample_config["pipeline"] # analysis to be executed - command = analysis - command_fn = getattr(__import__(command), "run") + """ check that user specified commands are supported by this pipeline and that all commands I am going to run are available either via PATH or via global config""" + common._check_pipeline(sample_config, global_config) + pipeline = sample_config["pipeline"] # pipeline/analysis to be executed + command_fn = getattr(globals()[pipeline], "run") # this stopped to --> workgetattr(__import__(command), "run") command_fn(global_config, sample_config) diff --git a/script/evaluete.py b/de_novo_scilife/evaluete.py similarity index 96% rename from script/evaluete.py rename to de_novo_scilife/evaluete.py index 507e1cf..4c1922b 100644 --- a/script/evaluete.py +++ b/de_novo_scilife/evaluete.py @@ -57,6 +57,8 @@ def _check_libraries(sorted_libraries_by_insert): if current_insert != libraryInfo["insert"]: current_insert = libraryInfo["insert"] different_inserts += 1 + if libraryInfo["orientation"] == "outtie": + sys.exit("error: in valiadation only innie libraries can be used, please reverse complement your MP libs (N.B. if you employed MP module of this pipeline then the reverse complemented reads are already available). 
This is needed in order to run FRCurve, please complain with FRCurve author!!!!") if different_inserts > 2: sys.exit("error: in valiadation only two libraries are admitted (usually a PE and a MP, sometimes 2 PE)") return @@ -183,7 +185,7 @@ def _run_qaTools(global_config, sorted_alignments_by_insert, sample_config): os.chdir("QAstats") BAMfile = sorted_alignments_by_insert[0][1] - cl = ["{}/qaCompute".format(qaTools), "-m", "-q", "0", "-i", BAMfile, "{}.cov".format(os.path.basename(BAMfile))] + cl = ["{}".format(qaTools), "-m", "-q", "0", "-i", BAMfile, "{}.cov".format(os.path.basename(BAMfile))] print cl stdOut = open("QAtools.stdOut", "a") stdErr = open("QAtools.stdErr", "a") diff --git a/de_novo_scilife/evaluete.pyc b/de_novo_scilife/evaluete.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9d86548289fad1eb35142843d82c70934b48f941 GIT binary patch literal 11251 zcmcgy&2JoOT7RqCU+vhj^Wh{t$xJHq*&|{nnb{$b3}HRC<76jxoN{(D@{ZM1cU8B` zuC8j=TkY71`+%|%AR!G1E-V+25El+ed*Bb)%N~$etyThY;lK$FoZtjP@cTV)mAmc9 zWMEliyUXwQ`@A2|@A*AXb@5-vNB<*yeYLK#p921W8z248czF1;RH)S2LE2JlR$g9E zYlS>rRBOdNJ)qVG^7Nou8_d%~YHcV_539A|JUyb;N_l!zt&J*wRBfJ6UO~OEl$8lF zrZ&gb+DYXV)%PLLgz^R?F{!*kNt{yNkR(nkZ&(s%ls6)Yv&t(;VoG_Vk~pWl6OuU3 z6|Ppsu-f3i@Uxs2k(_Nd>)eSUd3_rSm)xsw4^pi)xP2=9J0vau~t)a;&2UidLXMA%WI0# z2Q`CP&G=VphO(NIM{6>1hRuGDP=>7`mHdje`zh1d#q#|6P1PMx>4Zur)#fRco>paS z>gE}>KfrD;3aUG(_VI@$n7zOfEVcdb>PdtfFp9ZxlB<;y*vjQ}6i+t{no-M-Qk`OF z)eU)mAkPow`7L>VqQhJut#;^hB-`^FZYNADXW5pg zSC3n*I7-Xxq2na2hCwZHlfc*2+Fmt?w4bC{5o+rjetoM~opOk3trLV^HS(WU6Tjgn zepL5y=1RpHd!|&|O*dp3e>c!r#|X+=?w0Qbi57Y58%Rn@R-5vkRV9f5T~C{-TYl2? zt2MV(Z3LmO({T(7AWHYE3 zdJaJYhh?O4hD%UXBIb5k;qM7Ml+@OMO8z3RAC-Fa@2O${!432Q+cKpwY7uaeDfKmm zzC^e_0W^uLjyq`z4Z4U1ev-t=Ejx&8Y;E9r(#VdZa1Uis;M)iKX4l=wuK9LH``%R> zm&etuu%oss%QK(M*g?ZasqLdpyyttYHCNY;8+N)8XuBP> z{V<4p#qI0&J__iA|y=S{SZVf;88ux9HNanSI6q;mLI=bVA z*htr2S+JoSNGz>fwRPO`)1c*R`vX+GU*R;)1v~_jfJ8=doi|uSc=UpXU{{*qolZ}p zwBAX8XL*_P4qH}4suH&OAv9gBb_lz}WjasF`B%b3?$+O5$ zTB3Mt&ub})_t&_D2Z{$}qv5(ut%H`8p)Aw_m@~yvW4&QjMN4jrBB7yCLE!pkQEh^` zh(Uw)W+kv`(nIe_$xxORJB~iXdH!I4ECD+z+yIl&a+={Wm<*r%lnIJCpgv|D=)s`K zg|bz!0n|dnVOGEV8^Q)l0246LDETYZg%m?%3@?=GqU(t2j;LM0#SqY@uDWPl%ARCW zC61glE!dxZ!3gZi)}Ts0w*VyIRr|*DXBmScLgbffvJo^l zsI42onUAhtueGn*Vf+*&0{R+i8iodF#4HL*Y{oTWNI!N7#SnF)B?n|^5DINmiGn(k zgo7YT{iL1v>5P3hPVANo+XeKC?4}<}wtdIG@oBA1WYK97M@^`-?lhY)yuOEX z#nsHYMNV~yyibMwB+gq%OY%;IYAc%VyvG#DghQ?%dh6CS>4+nZXw@9j8nNv)KrOAi z07Tq|*3;q>0EwAma&_t^sV0(%VZ_!7Yk8UksV`^Zyv`wMD4l7J%Xtsvu9D@PM#&+j zB#c5v&x)%JPwWT!F-iFwAUAGx1EPS4Q7Y|kcZ6oqv{Jld1RjFOBIlgPgTO<`8MDsd zc{<|(Z{R-xXrRE7F|--SX98)0(hvaZq@*uND{&eqI5An^>1`yaVPO0#KT#wklilJ4&)?74q$F!acNeLq!vW7^JYIdiF1h$o{Qm{eiNjPd+2KIw`$?u z@ok-)s!(tdu!TuAnStVV73)Q$cpWH=Y{P%zMialV4QA1T&HzA^aZ7{;Ch`| z6q^GgC91K(SUP;)^Wv|;mqYeKo}HwTp%aR6%4#1=eNF98Vva%=a=^U?nJg&%j~1xG zCd9ue`vM!)b8BHq%*kZObcvM~4tcf0Yij&HgOT)H43cDwvfW8FVGC3%3Zf!pnavO7=V!mJdGn^H+x|2h6&Y4CfaPgJZUt%KK5|A= zl1ev};OmCJQ^wtE%>1ngbX^mB8vVb&miL`GJd`73PfS?IY@#*>Nk-TCVAC? 
zxdGV>V!DGZn-ao-dY}5|Y`W7AvMuNYCMR76`EJy*j_1^u(d)2vzpH;K6vw7L_LuaYKfiy-#2M3Lh7H6%=*i7q{+2wrT!-ZK(vb+ZmpY)4$CM!BB zE&*xvv9mCDFCVe5#0)18^}PGMJCp9Fg00$b3pUxJK8Bz5a8}b>yz9HrB;R&?9U~ZJ zi+5~KPusKL*;%F5M3}E_3Am!7$jt4iS)pKp7i~}e1+l?fkvNS*MC4>C?J$*gge^g#G)P0!L%TaZBU*&Rdf1SFmjNxp4k@z% zK_Sna9k;297>6rYikXJl1bt9Yj}g&uo5zSP&I$ziQ+zbTZmRGhsOYqH9>0^oi^;+{ z{4WDdgoT|)3*Jn@j6cUZjUs*>3nNtpF$I{E$*f3ja*Uww2}HVoY90@rK`xf!HWW%KVfFjasiI3UWfS08VG z*Lv~Z{6njITJ4|C zr5543*se1o;u~@-kf2v2Qd6j2(Pe4H^8~rXZcVA(zvGzm>pwZP=xhUEH>5{0QG`dh zKPB66PVM7(#?;oZO8%gb9f{2lh7ji`wBJI#x7(F+MNp4`4XX!amc2jbX-Jj(Qj;dOfN@3MdLzV1#A{D=;nyGgv50CZHIG`_;=iJPC|C!z5-Yg~HD> z;l*#55QY!053r<5>iaO3;!nZq_Z^W~zWj*9`sPPuY%EOctJice#hWjk0$-az1nB5- zedDZeee ztNu3LD*)JL=uWh#yJDjR2E~#J78+|LFt~&l`)!%6AaxiJyUQi*XaYixbUw z*W#Sm9vYh+ZJXXUdW(agffMQ-AiE$(UP#9fP`e`{O1l|2Mgc)M8?wI+Z@UrU2m?4r z(u}AlcBA3eeYit-CC?=_rR;8ELhE~)-xSEju;<|?B|)vjx4YRM83ADvdNmwg>{a4M zO+Of*Ja6|6bExqVu>>P22Tj2$dveJ{M1a()yuVxy{t4S*j09VX*t;f=tont&{ig&U zhi9bI zC>I#e8_z}ipinbR9ElJ?}2_~e`a@$g{iyHavn)ADCO#RXunL{9P3g`D( zP9}M9elm~2KKUNNk`GAh)iMNwFlt{3RfWt`f*@4RNan`-DDy2MymiEPr}Q!2#^1)I z|ALPOW&lOV$FAVR-%G;I#tUb_#rR9sHj3P#eDTVm+Ufq>z~%m2;qu|0hieVn2t zG>4DoHGua9yy9Cc;dyY-K_q&GV04I~O5u6%!CnQ(R$kF7lo)K@8s9KPBYD397C1#R zScFmI?8};Rd~b1X-6Wdzb@Av>@_05&HtVUoUST_P2YEjb{-$5h2qKS>{Xaq4v=Fk+ zs%CvI?nJ3vjxgb>41OoAig;4;AmcW= z;kJxY@+)e_>0%d$PJqL}mt6NYzCGh=(OEI%cu&e3jjy+eUg9fB@@tfOYfFSrC%zYD%~zE|2Nr6eF^{o literal 0 HcmV?d00001 diff --git a/de_novo_scilife/pdf/__init__.py b/de_novo_scilife/pdf/__init__.py new file mode 100644 index 0000000..c8747bf --- /dev/null +++ b/de_novo_scilife/pdf/__init__.py @@ -0,0 +1,93 @@ +""" + __init__.py " + + Author: H.C. v. Stockhausen < hc at vst.io > + Date: 2012-10-14 + +""" + +import cStringIO +import urllib +from reportlab.platypus.doctemplate import SimpleDocTemplate +from reportlab.platypus.flowables import Image +from reportlab.platypus import Paragraph, Spacer, KeepTogether +from reportlab.lib import colors +from reportlab.platypus.tables import Table, TableStyle +from reportlab.platypus import ListFlowable, ListItem +from reportlab.lib.styles import getSampleStyleSheet + +from theme import DefaultTheme +from util import calc_table_col_widths +from common import * + +class Pdf(object): + + story = [] + theme = DefaultTheme + + def __init__(self, title, author): + self.title = title + self.author = author + + def set_theme(self, theme): + self.theme = theme + + def add(self, flowable): + self.story.append(flowable) + + def add_header(self, text, level=H1): + p = Paragraph(text, self.theme.header_for_level(level)) + self.add(p) + + def add_spacer(self, height_inch=None): + height_inch = height_inch or self.theme.spacer_height + self.add(Spacer(1, height_inch)) # magic 1? 
no, first param not yet implemented by rLab guys + + def add_paragraph(self, text, style=None): + style = style or self.theme.paragraph + p = Paragraph(text, style) + self.add(p) + + def add_list(self, items, list_style=UL): + styles = getSampleStyleSheet() + style = styles["Normal"] + list_to_print = [] + for item in items: + list_to_print.append(Paragraph(item, style)) + t = ListFlowable(list_to_print , bulletType='i') + self.add(t) + + def add_table(self, rows, width=None, col_widths=None, align=CENTER, + extra_style=[]): + style = self.theme.table_style + extra_style + if width and col_widths is None: # one cannot spec table width in rLab only col widths + col_widths = calc_table_col_widths(rows, width) # this helper calcs it for us + table = Table(rows, col_widths, style=style, hAlign=align) + self.add(table) + + def add_image(self, src, width, height, align=CENTER): + img = Image(src, width, height) + img.hAlign = align + self.add(img) + + def add_qrcode(self, data, size=150, align=CENTER): + "FIXME: ReportLib also supports QR-Codes. Check it out." + + src = "http://chart.googleapis.com/chart?" + src += "chs=%sx%s&" % (size, size) + src += "cht=qr&" + src += "chl=" + urllib.quote(data) + self.add_image(src, size, size, align) + + def render(self, pdfTitle): +# buffer = cStringIO.StringIO() + doc_template_args = self.theme.doc_template_args() + doc = SimpleDocTemplate("{}".format(pdfTitle), title=self.title, author=self.author, + **doc_template_args) + doc.build(self.story) +# pdf = buffer.getvalue() +# buffer.close() +# return pdf + + + diff --git a/de_novo_scilife/pdf/__init__.pyc b/de_novo_scilife/pdf/__init__.pyc new file mode 100644 index 0000000000000000000000000000000000000000..960507494edb1022e31931574d0f38e0753967b1 GIT binary patch literal 4843 zcmcgv+j1Mn5$#=ENPyr))PWi`s{)T_I8jj}n)=czYOd4qZl z%A3?{y0JQC3zWC0*K+MS$`&bKqTUka%hX$Tz4Nle3iVbfzeK%DuD3ziD&?1{ciFX@ zlwG0xD)p|q_5x+sC|{%AnrpWx`-t|EMbg*lfc0FWw?wBzf9c(Tal7qh*!(xHngRcW zVLD807dNj6!qP!n`)7k68&E((=6}`H&o-HojmI6Ht?_w zuAW6@)Gwpaz}B&D6veuXPq zp~apds}sIV6FV?&5ghP^g>lc4gP8xx4;1}97xOQ!G?UMxKG#0!3K+(8WYl^!Fb_f< z&_S3ZJx@u6-ZDoUl(fJ@$O|IZL~fB@q@*tLlJs-WGWJjA=q)pN1qL2k&1ST3oB^GyQP}tNm6aXm_PC1OL z*hFUs3QSZhT(!Bl$~DzqYjyFKI}U6>7*jYa$WhcrC6^7B;aGgv4M^fU4{om z8D{!OXX+#FY|BIdX(3tt%1R8885W?AF4zf(Q|yG$sdtN86Lx+)!xy|e^9I8@TArbN zA{UX}!Xbu(1wa@7INafaZVEe`rQ!{6+(=aAQ`7#yiYpX+e~t(X+;qd7A>zRYa)Z;Sg@|5HQ!fO|EX<{HPAcR8VM@C@ODK@w4j z9T)5eU;{dw>DA#5g9r*&EIVh+bL)%~F*o>zoRZuJA;I;NDx&>Kjb8l%KH2)~O**O5 zn;OCb+{AKq%99rX{fW85gPdYGvz&U~$(-=3N^j<77Yp9+O3dgMWgcacVfV)w2l&0? zk(SGnwxN~+rxmhCnnW z#Zo!ZUN9ce7Ss1R-8l3M_c@L2)A$mR! 
z8BRaP*C89r9_Jo+ktO8|O#TFq+Bivi5fm6a%F_O@4N-EE#r<_J66aXKuNZ2D zSB#Q*!ggpasZ1PO7B7u3UY;-l&g?-W0};-V-8op6k%P_Y0Z`@0HVYS+&v9`9ik$U; z8wQLce}&Ol^*C@s5mx#_QnpM)tkmC?{{y3`3>+6S;bMJ8p3X&w@ zq@>`Fo5@#O7?pjaxYJRdpHq2cXDh|g`{OiA6uyhjy|1t(Kg{>=NuVBKJ+26EeZg)9 zD@%Uk>JnNTs6qa+_X*#SZu<&1CZ7kxDA!>q5ibn$A{q0yTqs5;gi;BrY84U|Ni5)0 zO7zW!`WZUzpbAiZ#+3|x|3uw}3>7xFyL;P8eTfO6p!yosW3FcQi*ylRKnhuFjf=U- zRo$<9b-z(^=c<&; zokjZ{e9Rw@#-_vbrz~*M`qNqN=>Uo-Tmg9p%x3YGR+4S5zjtv* zGJ|nh4#p_Ju8=n@(0ga2&g8ivuNH7#@RoQmwrM8$QXu3Qlovx0i~U0#TXE|LfKM3u gz;W^`m}BrTt@|yn<*o4VO)l5?cZD + Date: 2012-10-14 + +""" + +# Header levels +H1, H2, H3, H4, H5, H6 = 1, 2, 3, 4, 5, 6 + +# List styles +UL, OL = 0, 1 + +# Alignment +CENTER, LEFT, RIGHT = 'CENTER', 'LEFT', 'RIGHT' + diff --git a/de_novo_scilife/pdf/common.pyc b/de_novo_scilife/pdf/common.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e7ecd9fc427d77387a486f84a26f821c894a88fb GIT binary patch literal 559 zcmaKp&riZI6vrQoUk=c#$9mg^jUjpp+%p7;( rITnq`mSv2jVX{tE(VuunSJv + Date: 2012-10-14 + +""" + + +from reportlab.lib.styles import getSampleStyleSheet +from reportlab.lib.units import inch +from reportlab.lib import colors + +from common import * + + +class DefaultTheme(object): + + _s = getSampleStyleSheet() + doc = { + 'leftMargin': None, + 'rightMargin': None, + 'topMargin': None, + 'bottomMargin': None + } + headers = { + H1: _s['Heading1'], + H2: _s['Heading2'], + H3: _s['Heading3'], + H4: _s['Heading4'], + H5: _s['Heading5'], + H6: _s['Heading6'], + } + paragraph = _s['Normal'] + spacer_height = 0.25 * inch + table_style = [ + ('ALIGN', (0,0), (-1,-1), 'LEFT'), + ('VALIGN', (0,0), (-1,-1), 'TOP'), + ('FONT', (0,0), (-1,0), 'Helvetica-Bold'), + ('LINEBELOW', (0,0), (-1,0), 1, colors.black), + ('BACKGROUND', (0,0), (-1,0), colors.HexColor('#C0C0C0')), + ('ROWBACKGROUNDS', (0,1), (-1, -1), [colors.white, colors.HexColor('#E0E0E0')]) + ] + + @classmethod + def doc_template_args(cls): + return dict([(k, v) for k, v in cls.doc.items() if v is not None]) + + @classmethod + def header_for_level(cls, level): + return cls.headers[level] + + def __new__(cls, *args, **kwargs): + raise TypeError("Theme classes may not be instantiated.") + + + diff --git a/de_novo_scilife/pdf/theme.pyc b/de_novo_scilife/pdf/theme.pyc new file mode 100644 index 0000000000000000000000000000000000000000..5b90fa15856e42b5497d0654379598271b8660af GIT binary patch literal 2512 zcmcImdruoj5T8368_Y8W5@^!q@@V2l!e9baiXvJbPC%tLI@3h8(w@$@Yy05N=iV-+ zMC6~QAF$u0AE2F?#o(xawGy|w+4;@vYaTmynZG6r7v6(<0GCe&`|t2o4I}~pJ7Iya zA+jL0p<#nXI0I1z;w&_>24^7}fj9?^oWUay<>5%=K#oES$LHY>fR6w_HO4@ULcKUf zd%QzZaPZL`8Ou`dyufJdRCki}g|k=MDLMU;Q`bpw((zA~>^d)OI`QlUL@=K97#t?ZIoG5hTWiJP!3U$ zL`kX)ruonG1kzpE@=qguM5n#-*f=V-@m1g8K#3l+K-y$$BLiU;#24ho4eBh@*`~|D zMq>m*bWsLG78Yd=Hi#WTHgDJ*vZIF0BRgi;QDh5-9Yb~;*O}l&3b5Fi1ThYB3g#Nq zASOV33F1D8X=vd|W-W|X@Anp<0SOB+Pe1;Cx z1tgJd=^y;G9daYr|lD z7;I=tf@+e+e$++|pFh9VBiL;n?7yytB;e%IlsX5MS4TYI4Qn;sp`+R%>t5BWM`{WM zdot=v9R~i=b`l9q#vJTdE8CTW+S`y*l+54W+WG#qTl=xPtMUjR?W|DKw18WCd#$Y- zb!CNGK+|bcI^YhzY7>dZga^0)IJV%J6LR4iN^L4HHSgjtI4k6|1UAR~Bh|)(4T&be(Zv+(Qp6 zcgo9s`Tl*lyj$_AwKp~IaR0DE-dGmW>n8ogQ$ZMoE%`6p6RqVzvCy}0K|GF!C+S64 z0zAwcwS+=tBLO?0Zt${SD5c0FNG(_#7|GrzcXu#-Bwl-(uj6rzmST&N<1Pm`06b#kxSY81GM%t=+f|To%6GXmJQaQ1I?sOCFG^G=E zmG-+jL|=)LDQ=X9N9R3RNz)|dC>p~l@Z?U;>ytBLZ_)+ieb4L4GtXN`)>v*;*(h_{ zPznZ8Q8$&HO#UC*>!pa}1%_7sPg)Ai&|LoBETW8y2^!?b((|~md0w1|(+HtUJ7_w) zr`)^Pxjcgedu5jIt(xw+>DElQZn_QE!I4}*dVcD+Q@_`7IfW-tu6lkTQ?DbbaJsZ6 z=WsLfgOke&_uCGA+*o|p!E4NgR{Kpno1zaRW6qvf;)htc_i+{XArZ>1>omBTYwc3% 
z(bZNfNOJa=m9g^N3UGSGbPn|2O|@TJM5O)^rE Nxw*`Yh0QKx{sAYXD&_zH literal 0 HcmV?d00001 diff --git a/de_novo_scilife/pdf/util.py b/de_novo_scilife/pdf/util.py new file mode 100644 index 0000000..81e3873 --- /dev/null +++ b/de_novo_scilife/pdf/util.py @@ -0,0 +1,18 @@ +""" + util.py + + Author: H.C. v. Stockhausen < hc at vst.io > + Date: 2012-10-14 + +""" + +def calc_table_col_widths(rows, table_width): + max_chars_per_col = [0] * len(rows[0]) + for row in rows: + for idx, col in enumerate(row): + for line in str(col).split('\n'): + max_chars_per_col[idx] = max(len(line), + max_chars_per_col[idx]) + sum_chars = sum(max_chars_per_col) + return [(x * table_width / sum_chars) for x in max_chars_per_col] + diff --git a/de_novo_scilife/pdf/util.pyc b/de_novo_scilife/pdf/util.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53fc74f7af6ee586095aac4d64de003d55a41952 GIT binary patch literal 811 zcmcIiU2oGc6unNrMqvWPCR7Pf5HEdMvrar!)dX5AQGvRNN>l=d$sBJnq)i(;U73=n z@edF`gI~uV;MkqS1J6Xc$M@s-`uNI~-`mX}{OLr{&Bw65fawQ-i0D0#(RVnEB9A8R ziYbkJhnG3{F2&S)uim z45Ok#C=O0X^wFcTN@b1KJu)5@HCy;nu~%`7XH?e7R9)A)O%)(jsdbITdHv;trpqqu z2~ADSX2B{ObO9Y6&e)-HiwgWuQ#5GzlSj*+H**>*$R=%P+HBW>#!fGhb7Li$ochnT6|3C7-n|A2Nxu^irGKHVV%Y$uCu8Vsjrc*j&V6rujl4B^K` zjaT3;;OTe)j@?RJaEq1i`1|w8@h6vlZ`XgGKOYO)EQb9hOg{pIL?3{RzQbV@dNgj9 zZ1E$Y?qg*}#jW|bx(i-qCQ8z-|E6`^@|P`tY!Y!51(}vJzvoB74~)Omrlk`9$Q|eh zMs~q+u-`e@>+J7!4qdt~Jn-Lv)nLak{R;pcmy8Ti?)=6rqn*N|Rb&(Wv(HqNXf+|j zC@&F;{j(u`@~9|NQK40rj7NFJ7QUA3Rb0mlDyn44j*Hnrx(kav6qvxY${WX zz-6^bL67cvtc=S{gVnl{O`EnKGKtdUTARBz3NB*FuT^Lw^AIt*xrl;P^SMm9NMd&u z?QU1$n#<-bO{`|piO|_}eGFqGS~UUWK);69o+}ZwqfUD%Z*Ek(Kj5R2_b2>xcsh7H z91Yr`O>@nMB4fcfHV^3kDrhNm( iJ^;?cL~#D@KK#G>{=djA@b{)kn8otQ{#RQCRqqeOOtM!1 literal 0 HcmV?d00001 diff --git a/pictures/NGI.jpeg b/pictures/NGI.jpeg new file mode 100644 index 0000000000000000000000000000000000000000..e71176ef0860711fad0156c253dd808398e31690 GIT binary patch literal 9957 zcmcI~byOVBv+v**2<{Nv-7P^E*hP174-g1$!94+z;IO#c;uhRBIE3H?4;tJh0Rorr z@Atdso%_zaf4=TMGpDLQQ&n?LRZmyfJpOq63E)E%!HNJ95&$4Qb>MLc2nDdvF^Gu> ziHV7bNJvS@h{-9)$;l|ls7T02$tX!*P*ReRQPQw6kW(|$5R)dv6378@opdb+d$OK3z1W1p40M(O?X#ZIHe;P6hDjGTx#?z!4K7jNOg@%oVgNpJG z=!pbS3D5}X`O%4Fp{@+(@8k1`Nd;h9+VF&$NemJ}Az`<7JyT?iaysrGYJ1l=PMIu2 z@-5z+Ge3!=eA0u0^gon62}edkf0~ngvI!Xp2^Ad!6XnUGf2{dOHNlf6C>jy5tZO{U zlNR&$rwoEZc{M$g>x^=4!b}Pv%i709;MtRH$OI?^fHZJJ4g_N&w>r&XLcw66U%?c= zFTFoi$Y2Z>^u4qIUcIEHKfp&i+dX@(AI#|(AfQ4PBv5Ety@v$R%E;!~FmI1`_q8qF z(HS1mwk?~7b)84#i0TzdKQq3R>S)-~xlF%S$)cPx7b!%DKwVHG$_=v#w0N;L5li#1 zaS>QceR4V_vPHOqvi>R^nhbRXkU|Tw^^wuzh~w}nr!8sfp%qCXa9!UGmB6);EkX~a zqz#xnh&0w9(g+sY*ILIHi(~vcsc;lbaXfIJ|C0VB81K+|gmH@YH=w;~k>pCgJwkpx z2n{Xlp>ARB&ph5~&wt5uw^>ydIC?I1!1(Ps{p*MinBoM9F4IP7i@v7}Ep`G)CR*RZ zoH*2cU3;0d4d=>uHIPLeh$ibHux=72SW?x9bA?jDynKgWPeT|s;)7@ibRIq3moMFK zvUlH(2>7&=}ugrs*Wu6Qf#p|OcvD{Js;RuRi+~W)Mes0zxw;;uc3M` z8iNKu|G%wjm#c`>R9AKJ=qo=nG{ZK;BU0%=Qp8Y^G}JcHb3ZkQ+6Myk9t4+=I=d^$2jl zf}NtqE!Rt0o!4h?{0N)(yyU!@Rr*XrE%u_j7 z8lf>v_>!vj?%Ih~;;**ex<`2y5Vs_^(%Yr{AnSzIxxW4djo+im?RVIhDED&ln|#H* zV&(U{a_m!H7pLT%(z1o|BLJ)Z!`OFKamKLct9{pscryxdHTH5XZWL7M--%Vj*w$2+ zf`_@1)7HeQB2fk@F~g@LZ~-}DMW0~XMxc69h2WFbR1+086jn26&WHAOQII1yt_UV4 zi}f7o_(6!+YzwH$9hs9Bu?t&6GW=T=HNTw!7M{Qy%P9tuL2y=I34K&@!Yq zQUb58u2u~UUqD6!BD4GZeY?=4i8bDCtp6NNH~sVM+!|rd%@(>(wV~n#jB~a-FF%O&+fl>RQLT*gyzz3Psm_7<~0!kUNBJ^+|D1w zMZb~0DX##}Gub-<)s``*U<@^7<-K51N~&BmW}y+_@L!%v@oSr^y{toC3+leU1c)w2 z`?&7r_yb;yrpCUkYXI}GCk%@>0swDiep2-#ZxKo%^B>~7t*zLLNr&`7x~&qm06>8f47M&2`eAnbam(( zFV4hp{ikdz^b)2@YIF=d;{9Nux#FR=Yx@TxKxbF!8_`wDy&}*V%zyj*e7ej2rcBsZ 
z>3IKC@U+8EN@4G3!PJ}Wio0nVcjuKKofv7W9nKpL1$2{AJ1Yaeo*7Kb)n!WUiVq@( zb`94lk3hlGOLWyenC(Kc;`Ll0s!iunB4MtIB>UgPEF_Lhibme{s{MhhH!Z9?c=z)% zsAvrb27+g{ib%aFx$wR56%_Y9ES4~q=MOzAbl>PPucjWA+KR61&2fCTlG>?ad*@WL z9~Gh;EofKIn(k9L!-m@Oo?_iN$ZInA#sA_`f`WBXz-cfhQd(B#Ym8iU;u=4i>>ns$ z7N-or?&&QZ#!qB`?q4G>yDs{Qu%4wTmq;GqKMT66ywyaeNrZs3@7SqEx@l@==-E3M z8pIO|NGh^J!A0pX(s*HR9cKA$qI!Z(E*;QB=9_iyZ?B%G;U>?si5+ntqzjrH7Jp}c z`8s-a`R~kqRQ6EEmjBm|#6Mfb75GQhD;h(dpcBR&Y4*H*m&ukikN9vblTCGGu^i zX*QT-$cn<$X66ZP_^8kFLd5zF>Zj(oic=^=amnr&)|9z=_-Bd` z+vViE7rydnXigm^G&OGlCz592xb_tg^VBm$h+D9Sxef4K7RF^3e3@QZ*4h27&S3%H`t=dSjJ0k%#x4HtS;+CPE>m; z>0T}*oEoO7(bgNMxL$FnJABgfh8E6XXYPw{YH*~Qkb?|xN7Ds&TXja%dX3gtY#a}y z#zzo^9F9GpJWgUwy@rpf-V(6HsO zuo>;tHy?DFQ;5By8O$z^JA-|dz z1*gEEo8e?n6Y-RMgxm*hEfBpn335^u@y$TiAHF8s4&AvUy}%vKqHw*0qFA+Cx1Tok zrV9IWnr_X+9nr3WPDmVU7d4e%^d2pLIN8fcC%EXcEu9^kw!|P8rW` zlA@7!E(iE%9&NANH}y0V6TDwwIz%>;DQ0*Uf0e5;ajWe}O*_6|xmUQgjjjemzx|;I z_;~Uu)hkVOgLPW>Gb_5+eSP4g18*FlVrBGIhr_Yd#j_l489#^iDXZNqtK;Q;+0`ONt#{W#;!7> zZf%%uTJ-Xy@C>9f%Wn?hv0O2)zEOI_4K{>-eJ<@j32~H)?>>s;-@VSyf8X?EbtCR*)IpaX6-fK;GCPQXPB(p8TBMF z*_oPbt(Mzxprpn(5^qQn`GIY}?Q#$4)HfQPg8U5a7Rx=t!?aL5Nb-IaK_z#G2crsh z=x^Xb!R!9zQ#-O zi9+gr1o(qV!{*-rB~J+p5G+=a`&@J?gZXN|s&=D<%#bPFn_{2~v|{4t=g9M-C2fjC z-AR(uRJsPVdig!0Z8vr9DACVm{IKzkOL8^s!YhCXZ$YW?lU37#`F{YOBQNVExjRSk zZi@U@))c-rLXYwm9Ip-bTO}3~I7YNFH64&K^Dr`%#C*ubdoityD=le`{S|QQ2u7(urjzZaK%e zM(oG@(AS=3-=K9=Ed;aOL|WYfxftEpiL${8s(9QTmL-PWK>oX>he{bV6KS(Ri&7vo z3}1ZGSfV3T=_S3PqZuCIUk#U{DG!yXAi1b<(!xsQ1i8}4;J=pg3;HdyOZOjznw ztODqs_k0ub{I6;G!9lnd>%G^2eOK)KJTh%WmV9mGU#w&ZNt^zM^5PmsL|;qSc3AvL z$o$q&P0%3Qt3B>lHj0T^t{HxpMp~)`c`_AOgZe?6I`gYn&ZDk7dP_?=+=PF>%Ukwn z%+e0^q>wuBplirHz}_8oUMc2)eUp9@^99)cPN-kCDGUzX+358Zly-?KkZGz#z(k64QUH=q2*xi}l`Fu-Rh<^Ucw9WW?p*pINhlv{Q(p&8{9=9G)!=wT^p&dPKv zWHgc_^-`%SKVo@n@kX)aO4rF|G3)~eE{GGFXenGwr0QgB{j`%o2CU7T$&q&d<(QZ# zqwdKTZ`7pli>uapkM8PoN8&G=;|eEf5Dn8m&EJfyHX2FnU2u%etVQ$BZR!6Q+bVrK%ZEX&gA`2ZbV?zEo2boAHKn8<%vMjZD6+|lme;Gi zBnMoF0P0iaAP1o&5gF#!7M$-ZkW`AoaeNinkcn$F-RKf^g9+FmtjA^6a+CH z57TYjtEuwM=%DfE(Fwz?EUv&Nn2;riOd_K~x5N8NWT-Y+=hIvH#$( zRP(cVKlY8~XrwS13TtiTEAn$-^FxR94AL}(;``i$ zR6Kf4a0yHnA7ug+M|aTLT*w1B7H4zO)=&o}cDx>{EF!IBifV}a#PlmP*T9H}E-@tC zg~>$v z_D*Ah0-8&O?9B
{=!pqQ26k<+h8N9=>T5Eu?x*@qQ7sw>$T(tqI4G9U(6kIIc%K za!;z9N6SlJ?Rq{7dgD3TQSb@!Z^Wn9gV#z9ZHsufMfa^;2gm;#Ep&}mf4+dN6`Ed7 zEka8WmSb$+GQbIHp8r~_RSMDYDXYnWO+cd6HYC_m5r(WBz@}|30%?!{3a~R`B5^as zAt$DHKG;SY2Uu9SFqeE83R>@TabbC93vPC9vFwpZ=DG&%y@T$ytR$7n z(uRmaFx_Rp#LhZ1+ZiaWN1cbay?8QPH=U%WrnRKG zsR|*&1#Yn)PV%UoWD0G?VyoIz=tyrm@nj1nb z-$aNkalVqCwvwj>Jc`p0M5q(BvAlDYz~w-J+}GwvNUX^Y4Hiqd>G#jjX&kY2WzKKX zv3uTZ4|b>%=&|L%Vg?H{6u@N{dL{Zgpw>%<*>9$fs;yq48p7Tw6l>m6DJ!G0Bk4%) zTb5PivChdhZRTyqwKUW)bs8ql1xAkx=o+Ovkd&Qwi5g~S)FAMe@#ZwOXJ!He$~>5k;#Q9<_d(w#ZW$6r!OEZ7?7EqMYn)%-Z9ko0UDH zgrP9NR5U|Y8Ng6k%8seVQ!*7%wW!r7Sp^YLt5mnnh*bUi z)I97>d@WGW0+uEOMZRXoD>J_lu8x$G z&m%w(tM}J@cfVzH>>n0m)^F(z{{2s#v@zwG?SoED@N>MowKMW7`L*XtI^(96O-92$ zz|^lNQJObjCuiTKq@DZ&5&_pI3aD7#Sne1(hquFxvF5aIliLtX*hIkyZ=BC4(he*h z%>DHTvh|1(ru?#860u9rKCK``Fmm=DlEOLd7L}ZhW)&tXEimE1D?q3X!shS30A;41 zv1Naayr9eX5gD`v`{3E~z+1S*rub$_Cki7hM_@g47pNhV)Enk`{2i>U&GNR3%W2 zn`j9}b$BQXcXY$1y~hU!mHyy!c^zSWgzuZaLDNHya$yRXe0kbeG>AdgwgFPDb^MSH zKj%IiU?ptrJb-}ku+(rpElGHi1!I!>i#PPEL`wr2(@)tBBjp0$8X9>ZLdCQ|j4u~J zOMCqVRuEp_9i6hVNRI{kcUu}Y-1BXPusX1eTM{JRkPX&$qkTyp!bsX#XrkilwXdQM z%6hBglzxxn=jjLQos*(t%c*!uCoUCTy~Fa8YuDtyCeG1oi)o=G)3_kl=l{^ck~0Ka z_j?J9-%Dy38-E>)u5}8m(5haX*?+pf@qd+&_7fFZ!LY_{FAry$B$GPL$Kd)amH!vb~GlBqLR>M|s9rHR?eHmY??!_>yNj>m~a^j>}6 z_y0ImABDHkWZ#;Z3GhstwZ_g5Np8Ho@^8|+{#vteQh0SiQ`F`0qB`!s!JjPa)>IC5 zB-H}iW?{{O_f^7?5Jk<}Zy(G7QyBto=Qyv! zlzSHD;uFS=E|_Eh*jpgs=gE%k?ZI{bc;dbMDaa}IF6n}P3C|33OoXDgMMuBJOV?!x3PylW`|w8YF=LVc&=`F*olULW<=08OKhn)%_(S$y0=chjGMM8nl?w08`IZDeSFIcoLcQK zmMz_N*X;CFF?eTMm0RI1gMx9<4$CUMEypml$?zE^Cj6zbdj(Pt;Z76mrsFCJJp1)71g57JY$G$)&}HScu~^$&+ZjEqMe zX%db5riwGw`iLhHM{N+(mn4_@jXLo~Nzr8GC#-{^6%A|5`Nu^0Inp_|o4-?@t$o#6 zzoU=*9U0gH55#}4d$sVwUuiNZ;QXx#{f(Irzuoz-LZerhGI|N;s9iP^kgkN;D)L-@8u` zqHNv?ZCie0ERqbggL)m|tadqV@ww(!V8~~7CS=($LIJ4_wqct^&`^hNLHzKZ0fMwU z$zC6kgwrF0iW2R08cF`+_|GA09@=B2|FX)B#+=f&Qi4+HrV z5kbl3KzplC3xMH|l=t9M-+hI2SxxGO9mU@*Q}NF9yKV7$1&jN7V~2xNM-RjI$8yra zL05N-!-4!k-;alU9XX1ORhKMb$wV3Ay@!Ll{0@1D*;6N5L-iaKHW5$cWDpw%Fg{TRzd-L)l zDH0@0lkNzWa-d83Td&TD6im>?qL89^Ovv}m@EC{M^q&T*v6AhWk&EeycV9VhSd!xL zIOlF}m-)AL>q=8Fx;?+g2|Beu)Hp`o5JJOXagqX06%c#n3XsqbZ>i0&29#0Z+>$=` zDPUS`a%cQwyQ1!B? 
z@$gqZ)^zdn#|{cfEP=R{yEo@Y{BE}TW_aD-ndEwK<2vnQRxrKqb+PO@cMCDv&-wLK z%==FmTM7{x@CXWo1;dK9F?Se*TNT~H;u5_P*6Sb$kw)4IRhl-U7m>xX&Qx0vUnNxi z>LuODM`Kcu3Ilb}(!qhtU`ElPLbDgm7h_yQEgOSY7{rdGf`>_xl&_9*@A_{dmw#4t<+>9la2X zCSKvR$4u0Rm_^T+tg~5MtI3(sxmc8p6zir+)A9{o4HInDe7G#w8(&d@(vtep!Pq4H zY~nz?D0g1akcDWIwL;OfuZ5CI*^m@ZftFxX&GqFx6#A31a+IEk`{SH8+RrLAPN>Pf zg};9|l)EHu4bmvdm3N;r`bWuk`Z#hmxe3m3E7Iv8&3gU-U9G>%+q3tf-(CFk<-Z}H z3M;oh71-~5JrCmDra91PrG0vYHo7|F`S5*LqF3dsU0=ghW#9jYh?)T*f>3r2JaO$f z*EUrN#z+bEQM#W~s~*p=*O{4CH`X>3yhjeBrcMtV5);|jh@hQusK1eugFsfM2Cf6u z+y)uPc-^@SD?T^#B$%Uazj=R_ThoZB*r^0XJ}9Dxf!~(Cs#zN(^wnTpyiYlOw|gJ- z_9AH5!C~&I=dLRgvzbK4n}j9BUUhgNDWb#uwzJw~@!*2XLvv|qIxyY;(_3cEed5Lb zEjG<4S87GZ&oSoWauDc7Ofi~R^JoaU^{6dZGGV$943W4B-70Ge+pw=cirEf;*HtSR=&|!*6jK{97MaQBReD#H`xl`$PfVVqA|gvBz|U9s3&J`&cKVv|7LqdH@RA{hu4JpIR#JhK9e zSlQ+&ZMUmQHZQ{1I@ zVg0_bR|X3J*nlBCoBSnbT_Z4aKI6-`BE_)%1U-DV1ReA2Zku=FPj$xR*O;D;$13RD0#t5Z!>?w?p#^_WFMsoXu;^tm$6|oeJ*`o!NoOr(osg4B*ct?E z+RqJwP;(}Fcse!?T6L`94hmb$8!f5?4f@#CBV*Fxzr$c8Q99kIjk`*c+$yiBp{}W=h0@l-p||O1qj4B427}W#!szQ`46s5JaUyM2Kudl(n{YR7f^aAS#Ut*~%7H z5Y?IrS<5&LqCr$5fh27QP>O&ha2^gC0WcH@M}g8VKo0<5aPXg3e+EKMUI7mK2imIx zFc1!dD}eGa<$pN{g98YZoQAxiwZc{#y5~`@rV%AIuj9?ZyNcVi@TPXVcv1XL*{P09 z^Z&8{2uHwV1dEcn!9WE3ry&5sP;i8Xq32d>x^x0ify+1wjskW9<7C_1hh>eK2?PA* z7f~shB^=gl{!^A|(WAHI>ZVN6F*b#E!S2BQ=A1mItWA7>eMEAm(0$=>=%Qz_=$)-c z`zhs7tb)CPq~b+bQE{n2wdJy#Au+kMHvBSfon5rOsn z1?VEDvlgZP@xsKB`oX*pAFjxeQtuUHA(J8OuhLNpF<<>5jnwY ztbDIwn*~GaWY>jvB36q=c7`I2)?;J;Jz-4bpK%1$E?L=_H`vS-FMN~&E5Z#?(_Czp z=+u?&qM_rl#g^^rHFTQOH}Q4H!-n7UmIy;bojsf8hQUYOJ6Wdna}tM@n$6A@H>;lm|21QDdI>MqgRRG+{J)Vt<8GoB0?% z^T7I6%<@XS==U9k%5&B)ox(!0QF?hp^f@_iauef3Y|ETO6fRWLvWwgo0H%25Z0UA( zv#77BFX%h(mHOs&iEy7dx2*g$sp*mzyn4L#HxnsfSJ(5=Q~&D8*;m7z ztU}h>#W}6;eA@@lkPr@WvKbxw#$^2Sd{^y5Eg9PMC-5K$xB^4>qyjq!5 z%)&G`E^;$EX5wxfQHu1%@l-gHigLE&mFnpl>(T)K41 znLAu5*$YyE5EV7AbzO0ZuTyybprERBF=FpzLGTO%Y0>jNIp=sh?>#LyTevQ2=8Pi1 zG51Ye95&G>dLjn#IxNQOleHV>P`6(ZF>#&uG4OfV)Iyw(5c#sM{kp`yTx*hJw-#Aw z_z=b4f98%V<3vt;ahS;QhftDR`)PcEg(6F zafQ^|o8HOZ-Hg}i)tUKqUdZ(DZ9&FC!0syO)LApnsk3Gll~*UZz2X6?u4|3*kATb+ zW>7#JOQ&Y^?v+csZ%nEPf|}Xqp0ghydy}H$kK78xFm9qtOrOrSb-Q$bie7unoo>_K zt7_a;bJ&e#Jc8mc#tXBQaNP=@sY literal 0 HcmV?d00001 diff --git a/script/MP.py b/script/MP.py deleted file mode 100644 index 07f893f..0000000 --- a/script/MP.py +++ /dev/null @@ -1,102 +0,0 @@ -import sys, os, yaml, glob -import subprocess -import pandas as pd -import align -import common - - - -def run(global_config, sample_config): - sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config) - print sorted_libraries_by_insert - run_default(global_config, sample_config,sorted_libraries_by_insert) - - - -def run_default(global_config, sample_config,sorted_libraries_by_insert): - print "running the entire pipeline" - print " remove adator" - sample_config = _run_trimmomatic(global_config, sample_config,sorted_libraries_by_insert, "01_trimmomatic") - - sorted_libraries_by_insert = sorted(sample_config["libraries"].iteritems(), key=lambda (k,v): v["insert"]) # recompute sorted libraries by insert - - print " align sequences" - if not os.path.exists("02_alignments"): - os.makedirs("02_alignments") - os.chdir("02_alignments") - print sorted_libraries_by_insert - sorted_libraries_by_insert = align._align_reads(global_config, sample_config, sorted_libraries_by_insert) # align reads - print sorted_libraries_by_insert - sorted_alignments_by_insert = align._merge_bam_files(global_config, sample_config, sorted_libraries_by_insert) # merge alignments - print sorted_libraries_by_insert - 
sorted_alignments_by_insert = align.picard_CGbias(global_config, sample_config,sorted_alignments_by_insert) - print sorted_libraries_by_insert - sorted_alignments_by_insert = align.picard_collectInsertSizeMetrics(global_config, sample_config,sorted_alignments_by_insert) - print sorted_libraries_by_insert - sorted_alignments_by_insert = align.picard_markDuplicates(global_config, sample_config,sorted_alignments_by_insert) - print sorted_libraries_by_insert - - os.chdir("..") - return 0 - - -def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert, step): - print " running trimmomatic ..." - mainDir = os.getcwd() - program = global_config["Tools"]["trimmomatic"]["bin"] - program_folder = os.path.dirname(program) - adapterFile = os.path.join(program_folder, "adapters", "NexteraMP.fa") - if not os.path.exists(adapterFile): - print "file {} does not exists: it must be present: you must provide it!!!".format(adapterFile) - return sample_config - - runningDir = os.path.join(mainDir, step) - if not os.path.exists(runningDir): - os.makedirs(runningDir) - os.chdir(runningDir) - #now I am in running dir, I need to process one by one the libraries - - for library, libraryInfo in sorted_libraries_by_insert: - read1=libraryInfo["pair1"] - read2=libraryInfo["pair2"] - orientation = libraryInfo["orientation"] - insert = libraryInfo["insert"] - workingOnLibraryDir = os.path.join(runningDir, "{}_MP_{}".format(library,insert)) - if not os.path.exists(workingOnLibraryDir): - os.makedirs(workingOnLibraryDir) - os.chdir(workingOnLibraryDir) - output_read1_pair = os.path.join(workingOnLibraryDir, "{}_MP_1.fastq.gz".format(library)) - output_read1_sing = os.path.join(workingOnLibraryDir, "{}_MP_u_1.fastq.gz".format(library)) - output_read2_pair = os.path.join(workingOnLibraryDir, "{}_MP_2.fastq.gz".format(library)) - output_read2_sing = os.path.join(workingOnLibraryDir, "{}_MP_u_2.fastq.gz".format(library)) - - if os.path.exists(output_read1_pair): - print "library {} already computed: skyp this".format(library) - libraryInfo["pair1"] = output_read1_pair - libraryInfo["pair2"] = output_read2_pair - os.chdir("..") - continue - - threads = 8 - if "threads" in sample_config: - threads = sample_config["threads"] - - command = ["java", "-jar", program, "PE", "-threads", "{}".format(threads), "-phred33", read1, read2, output_read1_pair ,output_read1_sing , output_read2_pair, output_read2_sing ,"ILLUMINACLIP:{}:2:30:10".format(adapterFile), "MINLEN:30" ] - #print ' '.join(map(str,command)) - stdOut = open("trimmomatic.stdOut", "w") - stdErr = open("trimmomatic.stdErr", "w") - returnValue = subprocess.call(command, stdout=stdOut, stderr=stdErr) - returnValue = 0 - if returnValue != 0: - print "error while running command: {}".format(command) - else: - libraryInfo["pair1"] = output_read1_pair - libraryInfo["pair2"] = output_read2_pair - - os.chdir(runningDir) - - os.chdir(mainDir) - print "trimmomatic done" - return sample_config - - diff --git a/script/QCcontrol.py b/script/QCcontrol.py deleted file mode 100644 index d6236a6..0000000 --- a/script/QCcontrol.py +++ /dev/null @@ -1,128 +0,0 @@ -import sys, os, yaml, glob -import subprocess -import pandas as pd -from matplotlib import pyplot as plt -import common - - -def run(global_config, sample_config): - sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config) - if "tools" in sample_config: - #need to follow the commands listed here - for command in sample_config["tools"]: - command_fn = getattr( sys.modules[__name__] , 
"_run_{}".format(command)) - sample_config = command_fn(global_config, sample_config, sorted_libraries_by_insert) - else: - #run default pipeline for QC - sample_config = _run_fastqc(global_config, sample_config, sorted_libraries_by_insert) - sample_config = _run_abyss(global_config, sample_config, sorted_libraries_by_insert) - - - - -def _run_fastqc(global_config, sample_config, sorted_libraries_by_insert): - print "running fastqc ..." - mainDir = os.getcwd() - FastqcFolder = os.path.join(os.getcwd(), "fastqc") - if not os.path.exists(FastqcFolder): - os.makedirs(FastqcFolder) - else: - print "done (fastqc folder already present, assumed already run)" - return sample_config - fastq_stdOut = open("fastqc.stdOut", "a") - fastq_stdErr = open("fastqc.stdErr", "a") - program=global_config["Tools"]["fastqc"]["bin"] - program_options=global_config["Tools"]["fastqc"]["options"] - for library, libraryInfo in sorted_libraries_by_insert: - command = [program] - for option in program_options: - command.append(option) - read1=libraryInfo["pair1"] - read2=libraryInfo["pair2"] - command.append(read1) - if read2 is not None: - command.append(read2) - print command - subprocess.call(command, stdout=fastq_stdOut, stderr=fastq_stdErr) - print "fastqc succesfully exectued" - return sample_config - - -def _run_abyss(global_config, sample_config, sorted_libraries_by_insert): - print "I am running abyss to check kmer content" - mainDir = os.getcwd() - ABySS_Kmer_Folder = os.path.join(os.getcwd(), "abyss_kmer") - if not os.path.exists(ABySS_Kmer_Folder): - os.makedirs(ABySS_Kmer_Folder) - else: - print "done (ABySS_Kmer_Folder folder already present, assumed already run)" - os.chdir(ABySS_Kmer_Folder) - _plotKmerPlot(1 , 50) - os.chdir("..") - return - - os.chdir(ABySS_Kmer_Folder) - ABySS_Kmer_stdOut = open("ABySS_Kmer_Folder.stdOut", "a") - ABySS_Kmer_stdErr = open("ABySS_Kmer_Folder.stdErr", "a") - - program = global_config["Tools"]["abyss"]["bin"] - program = os.path.join(os.path.dirname(program), "ABYSS-P") - program_options=global_config["Tools"]["abyss"]["options"] - if "abyss" in sample_config: - program_options=sample_config["abyss"] - - threads = 8 # default for UPPMAX - if "threads" in sample_config : - threads = sample_config["threads"] - - command = "mpirun -np {} {} ".format(threads, program) - kmer = sample_config["kmer"] - command += "-k {} ".format(kmer) - command += "--coverage-hist=histogram.hist -o preUnitgs.fa" - for library, libraryInfo in sorted_libraries_by_insert: - read1=libraryInfo["pair1"] - read2=libraryInfo["pair2"] - orientation = libraryInfo["orientation"] - if orientation=="innie": - command += " {} ".format(read1) - if read2 is not None: - command += " {} ".format(read2) - if orientation == "none": - command += " {} ".format(read1) - print command - subprocess.call(command, shell=True, stdout=ABySS_Kmer_stdOut, stderr=ABySS_Kmer_stdErr) - subprocess.call(("rm", "preUnitgs.fa")) - _plotKmerPlot(15,200) - - os.chdir("..") - return - -def _plotKmerPlot(min_limit, max_limit): - Kmer_histogram = pd.io.parsers.read_csv("histogram.hist", sep='\t', header=None) - Kmer_coverage = Kmer_histogram[Kmer_histogram.columns[0]].tolist() - Kmer_count = Kmer_histogram[Kmer_histogram.columns[1]].tolist() - Kmer_freq = [Kmer_coverage[i]*Kmer_count[i] for i in range(len(Kmer_coverage))] - kmer_freq_peak = Kmer_freq.index(max(Kmer_freq[min_limit:max_limit])) #coverage peak, disregarding initial peak - kmer_freq_peak_value=max(Kmer_freq[min_limit:max_limit]) - - xmax = max_limit - ymax = 
kmer_freq_peak_value + (kmer_freq_peak_value*0.30) - - plt.plot(Kmer_coverage, Kmer_freq) - plt.title('K-mer length = %s' % 54) - plt.xlim((0,xmax)) - plt.ylim((0,ymax)) - plt.vlines(kmer_freq_peak, 0, kmer_freq_peak_value, colors='r', linestyles='--') - plt.text(kmer_freq_peak, kmer_freq_peak_value+2000, str(kmer_freq_peak)) - plotname = "kmer_coverage.png" - plt.savefig(plotname) - plt.clf() - return 0 - - -def _run_jellyfish(global_config, sample_config, sorted_libraries_by_insert): - print "Jellyfish still to be implemented" - - - - diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..f54dca0 --- /dev/null +++ b/setup.py @@ -0,0 +1,14 @@ +#!/usr/bin/env python + +from setuptools import setup + +setup(name='de_novo_scilife', + version='0.5', + description= 'A same automated analysis pipeline for de novo assembly analysis', + author='Francesco Vezzi', + author_email='francesco.vezzi@scilifelab.se', + url='https://github.com/vezzi/de_novo_scilife', + scripts=['de_novo_scilife/deNovo_pipeline.py'], + packages=['de_novo_scilife'], + namespace_packages=["de_novo_scilife"], +) diff --git a/utils/OpGen_util.py b/utils/OpGen_util.py new file mode 100644 index 0000000..2bb0c15 --- /dev/null +++ b/utils/OpGen_util.py @@ -0,0 +1,395 @@ +import sys, os, yaml, glob +import subprocess +import argparse +from collections import Counter +import pandas as pd +import csv + + +def main(args): + workingDir = os.getcwd() + assembly = args.assembly + minCtgLength = args.min_contig + + assembly = _build_new_reference(assembly, minCtgLength) + if args.only_reference > 0: + return + ## now generate stats + (contigsLengthDict, contigsSequencesDict) = _compute_assembly_stats(assembly, args.genome_size) + if args.only_stats > 0: + return + + #problems = find_problems_in_maps(args.opgen_report, contigsLengthDict, contigsSequencesDict) + + produce_consensus(args.opgen_report, contigsLengthDict, contigsSequencesDict, args.output) + + return + + + +def produce_consensus(opgen_report, contigsLengthDict, contigsSequencesDict, output): + print opgen_report + # starting to parse opgen report, since the csv file is unstructured I need to parse it + placement = "{}.placement".format(opgen_report) + stats = "{}.stats".format(opgen_report) + gaps_overlaps = "{}.gaps".format(opgen_report) + unplaced_contigs = "{}.unplaced".format(opgen_report) + with open(opgen_report, 'rb') as unstructured_file: + placement_file = open(placement, 'w') + stats_file = open(stats, 'w') + gaps_overlaps_file = open(gaps_overlaps, 'w') + unplaced_contigs_file = open(unplaced_contigs, 'w') + current_writing_file = placement_file + for row in unstructured_file: + if "N50" in row: + current_writing_file = stats_file + if "Gaps" in row: + current_writing_file = gaps_overlaps_file + if "Unplaced" in row: + current_writing_file = unplaced_contigs_file + current_writing_file.write(row) + current_writing_file.close() + placement_file.close() + stats_file.close() + gaps_overlaps_file.close() + + # I need a list with one element for each base in my map. 
The problem is that the OpGen report does not tell the length of the maps, therefore I need to find the maximum limit between the placements part and the gapped part + Maps = {} + Maps_length = {} + ContigsToMaps = {} + MapsToContigs = {} + with open(placement, 'rb') as csvfile: + opgenCSV = csv.reader(csvfile, delimiter='\t') + #Chromosome, Start, End, Contig, Start, End, Orientation + header = opgenCSV.next() + for row in opgenCSV: + if len(row) > 0: + #print ', '.join(row) + (Map, Map_Start, Map_End, Contig, Ctg_Start, Ctg_End, Orientation) = (row[0], int(row[1]), int(row[2]), row[3], int(row[4]), int(row[5]), row[6]) + Contig = Contig.split(" ")[0] + if not ContigsToMaps.has_key(Contig): + ContigsToMaps[Contig] = [[Map, Map_Start, Map_End, Ctg_Start, Ctg_End, Orientation]] + else: + ContigsToMaps[Contig].append([Map, Map_Start, Map_End, Ctg_Start, Ctg_End, Orientation]) + if not Maps_length.has_key(Map): + Maps_length[Map] = Map_End + MapsToContigs[Map] = [[Map_Start, Map_End, Contig, Ctg_Start, Ctg_End, Orientation]] + else: + if Map_End > Maps_length[Map]: + Maps_length[Map] = Map_End + MapsToContigs[Map].append([Map_Start, Map_End, Contig, Ctg_Start, Ctg_End, Orientation]) + else: + print "{} {}".format(Map_End, Maps_length[Map]) + + with open(gaps_overlaps, 'rb') as csvfile: + opgenCSV = csv.reader(csvfile, delimiter='\t') + #Map Type Map_Start Map_End Length + header = opgenCSV.next() + header = opgenCSV.next() + for row in opgenCSV: + if len(row) > 0: + (Map, Type, Map_Start, Map_End, Length) = (row[0], row[1], int(row[2]), int(row[3]), int(row[4])) + if not Maps_length.has_key(Map): + Maps_length[Map] = Map_End + else: + if Map_End > Maps_length[Map]: + Maps_length[Map] = Map_End + + # now Maps_length contains the length of the Maps... thank you so much OpGen!!!!!
+ multipleHittedPositions = 0 + rescuedBases = 0 + conflictingBases = 0 + for Map in Maps_length: + Maps[Map] = ["n"] * Maps_length[Map] + print "now working with Map {}".format(Map) + if MapsToContigs[Map] is not None: # if this map has some contigs aligning + for hit in MapsToContigs[Map]: + start_on_Map = hit[0] + end_on_Map = hit[1] + Ctg = hit[2] + start_on_Ctg = hit[3] + end_on_Ctg = hit[4] + orientation = hit[5] + #extract the sequence + sequence = contigsSequencesDict[Ctg][start_on_Ctg:end_on_Ctg] + if orientation == '-1': + sequence = revcom(sequence) + letters = list(sequence) + index = start_on_Map + + for base in letters: + if len(Maps[Map]) <= index: + Maps[Map].append("n") + if Maps[Map][index] != "n": + multipleHittedPositions += 1 + if Maps[Map][index] == "N" and base != "N": + Maps[Map][index] = base + rescuedBases += 1 + elif Maps[Map][index] != "N" and base != "N": + conflictingBases += 1 + else: + Maps[Map][index] = base + index += 1 + #print hit + print "positions in the map covered more than once: {}".format(multipleHittedPositions) + print "rescued bases: {}".format(rescuedBases) + print "conflicting bases: {}".format(conflictingBases) + print_contigs(Maps, output) + + +def revcom(s): + return complement(s[::-1]) + +def complement(s): + letters = list(s) + complemented = [] + for base in letters: + if base == "A" or base == "a": + complemented.append("T") + elif base == 'C' or base == 'c': + complemented.append("G") + elif base == 'G' or base == 'g': + complemented.append("C") + elif base == 'T' or base == 't': + complemented.append("A") + elif base == 'N' or base == 'n': + complemented.append("N") + else: + complemented.append("N") + + return ''.join(complemented) + + +def print_contigs(Maps, output): + with open("{}.fasta".format(output), "w") as final_assembly: + for Map, sequence in Maps.iteritems(): + final_assembly.write(">{}\n".format(Map)) + final_assembly.write("{}\n".format("".join(sequence))) + + + +def find_problems_in_maps(opgen_report, contigsLengthDict, contigsSequencesDict): + ###TODO: this function is still to be defined. + print opgen_report + # starting to parse opgen report, since the csv file is unstructured I need to parse it + placement = "{}.placement".format(opgen_report) + stats = "{}.stats".format(opgen_report) + gaps_overlaps = "{}.gaps".format(opgen_report) + unplaced_contigs = "{}.unplaced".format(opgen_report) + with open(opgen_report, 'rb') as unstructured_file: + placement_file = open(placement, 'w') + stats_file = open(stats, 'w') + gaps_overlaps_file = open(gaps_overlaps, 'w') + unplaced_contigs_file = open(unplaced_contigs, 'w') + current_writing_file = placement_file + for row in unstructured_file: + if "N50" in row: + current_writing_file = stats_file + if "Gaps" in row: + current_writing_file = gaps_overlaps_file + if "Unplaced" in row: + current_writing_file = unplaced_contigs_file + current_writing_file.write(row) + current_writing_file.close() + placement_file.close() + stats_file.close() + gaps_overlaps_file.close() + + # I need a list with one element for each base in my map.
The problem is that the OpGen report does not tell the length of the maps, therefore I need to find the maximum limit between the placements part and the gapped part + Maps = {} + Maps_length = {} + ContigsToMaps = {} + with open(placement, 'rb') as csvfile: + opgenCSV = csv.reader(csvfile, delimiter='\t') + #Chromosome, Start, End, Contig, Start, End, Orientation + header = opgenCSV.next() + for row in opgenCSV: + if len(row) > 0: + #print ', '.join(row) + (Map, Map_Start, Map_End, Contig, Ctg_Start, Ctg_End, Orientation) = (row[0], int(row[1]), int(row[2]), row[3], int(row[4]), int(row[5]), row[6]) + Contig = Contig.split(" ")[0] + if not ContigsToMaps.has_key(Contig): + ContigsToMaps[Contig] = [[Map, Map_Start, Map_End, Ctg_Start, Ctg_End, Orientation]] + else: + ContigsToMaps[Contig].append([Map, Map_Start, Map_End, Ctg_Start, Ctg_End, Orientation]) + if not Maps_length.has_key(Map): + Maps_length[Map] = Map_End + else: + if Map_End > Maps_length[Map]: + Maps_length[Map] = Map_End + else: + print "{} {}".format(Map_End, Maps_length[Map]) + + with open(gaps_overlaps, 'rb') as csvfile: + opgenCSV = csv.reader(csvfile, delimiter='\t') + #Map Type Map_Start Map_End Length + header = opgenCSV.next() + header = opgenCSV.next() + for row in opgenCSV: + if len(row) > 0: + (Map, Type, Map_Start, Map_End, Length) = (row[0], row[1], int(row[2]), int(row[3]), int(row[4])) + if not Maps_length.has_key(Map): + Maps_length[Map] = Map_End + else: + if Map_End > Maps_length[Map]: + Maps_length[Map] = Map_End + # now Maps_length contains the length of the Maps... thank you so much OpGen!!!!! + + # print contigs that align in more than one map + ##### now decide if these contigs are suspicious (need to be broken or they are ok) + ##### if a contig maps in two different places in the same way, then this is a real repeat.
If there are two clear alignments then I must suggest to breack the contig + contigsInMultipleMaps = 0 + contigsWithErrors = 0 + for Contig in ContigsToMaps: + length = len(ContigsToMaps[Contig]) + if length > 1: + contigsInMultipleMaps += 1 + contigsCoordinates = [] + print Contig + " --> " + for entry in ContigsToMaps[Contig]: + contigsCoordinates.append([entry[3],entry[4], (entry[3] - entry[3])/contigsLengthDict[Contig] ]) # save only start, end, and length of alignment + print entry + ### check if this contig is an erroneus contig or a real repeat + print "number of contigs aligning in multiple maps is {}".format(contigsInMultipleMaps) + + + + + + + for Map in Maps_length: + Maps[Map] = [0] * Maps_length[Map] + + + + + + +def _compute_assembly_stats(assembly, genomeSize): + stats_file_name = ".".join([assembly, "statistics", "txt"]) + contigsLengthDict = {} + contigsSequencesDict= {} + contigsLength = [] + totalLength = 0 + numContigs = 0 + maxContigLength = 0 + N50 = 0 + N80 = 0 + numNs = 0 + + with open(assembly, "r") as ref_fd: + fasta_header = ref_fd.readline() + header = (fasta_header.split(">")[1]).rstrip() + sequence = "" + for line in ref_fd: + if line.startswith(">"): + contigsLength.append(len(sequence)) + contigsLengthDict[header] = len(sequence) + contigsSequencesDict[header] = sequence + totalLength += len(sequence) + numContigs += 1 + counter = Counter(sequence) + numNs += (counter['n'] + counter['N']) + sequence = "" + header = (line.split(">")[1]).rstrip() + else: + sequence+=line.rstrip() + contigsLength.append(len(sequence)) + contigsLengthDict[header] = len(sequence) + contigsSequencesDict[header] = sequence + totalLength += len(sequence) + numContigs += 1 + counter = Counter(sequence) + numNs += (counter['n'] + counter['N']) + + if os.path.exists(stats_file_name): + print "assembly stast file {} already created".format(stats_file_name) + return (contigsLengthDict, contigsSequencesDict) + + + percentageNs = float(numNs)/totalLength + contigsLength.sort() + contigsLength.reverse() + + + teoN50 = genomeSize * 0.5 + teoN80 = genomeSize * 0.8 + testSum = 0 + N50 = 0 + N80 = 0 + maxContigLength = contigsLength[0] + for con in contigsLength: + testSum += con + if teoN50 < testSum: + if N50 == 0: + N50 = con + if teoN80 < testSum: + N80 = con + break + + with open(stats_file_name, "w") as stats_file_name_fd: + stats_file_name_fd.write("#scaffolds : {}\n".format(numContigs)) + stats_file_name_fd.write("totalLength : {}\n".format(totalLength)) + stats_file_name_fd.write("maxContigLength : {}\n".format(maxContigLength)) + stats_file_name_fd.write("N50 : {}\n".format(N50)) + stats_file_name_fd.write("N80 : {}\n".format(N80)) + stats_file_name_fd.write("percentageNs : {}\n".format(percentageNs)) + + return (contigsLengthDict, contigsSequencesDict) + + + +def _build_new_reference(assembly, minCtgLength): + new_assembly_name = os.path.basename(assembly) + (basename, type, extention) = new_assembly_name.split(os.extsep) + new_assembly_name = basename + ".{}.{}.{}".format(type, minCtgLength, extention) + new_assembly_name = os.path.abspath(os.path.basename(new_assembly_name)) + + if os.path.exists(new_assembly_name): + print "assembly {} already created".format(new_assembly_name) + return new_assembly_name + + contigID = 1 + with open(new_assembly_name, "w") as new_ref_fd: + with open(assembly, "r") as ref_fd: + fasta_header = ref_fd.readline() + sequence = "" + for line in ref_fd: + line = line + if line.startswith(">"): + if len(sequence) >= minCtgLength: + 
new_ref_fd.write(">contig{}\n".format(contigID)) + new_ref_fd.write(sequence) + sequence = "" + contigID += 1 + else: + sequence+=line + if len(sequence) >= minCtgLength: + new_ref_fd.write(">contig{}".format(contigID)) + new_ref_fd.write(sequence) + return new_assembly_name + + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--assembly', help="assembly sequence", type=str) + parser.add_argument('--genome-size', help="expected genome size", default=0, type=int) + parser.add_argument('--min-contig', help="minimum contig length", default=2000, type=int) + parser.add_argument('--only-reference', help="generates only the new refernce (i.e., only scaffolds longer than min-contig)", default=0, type=int) + parser.add_argument('--only-stats', help="generates only assembly stats", default=0, type=int) + + parser.add_argument('--opgen-report', help="op gen placment report", type=str) + parser.add_argument('--find-problems-in-maps', help="generates only the new refernce (i.e., only scaffolds longer than min-contig)", default=0, type=int) + parser.add_argument('--produce-consensus', help="generates only the new refernce (i.e., only scaffolds longer than min-contig)", default=0, type=int) + + parser.add_argument('--output', help="aoutput header name", default="opgen_scaffolded_assembly", type=str) + args = parser.parse_args() + if args.genome_size == 0: + print "genome size must be specified" + sys.exit("error") + + main(args) + + + diff --git a/utils/run_assembly_evaluation.py b/utils/produce_assembly_report.py similarity index 73% rename from utils/run_assembly_evaluation.py rename to utils/produce_assembly_report.py index 5a96255..9f5d6f5 100644 --- a/utils/run_assembly_evaluation.py +++ b/utils/produce_assembly_report.py @@ -11,54 +11,18 @@ def main(args): workingDir = os.getcwd() assemblers = sum(args.assemblers, []) - if not os.path.exists(args.sample_config): - sys.exit("Error: file {} does not exists".format(args.sample_config)) - - with open(os.path.abspath(args.sample_config)) as sample_config_handle: - sample_config = yaml.load(sample_config_handle) - global_config = os.path.abspath(args.global_config) + if not os.path.exists(args.validation_dir): + sys.exit("Error: directory {} does not exists".format(args.validation_dir)) if not os.path.exists(args.assemblies_dir): - sys.exit("Error: directory {} does not exists".format(assemblies_dir)) - assemblies_dir = os.path.abspath(args.assemblies_dir) - - processed = 0 - - for assembler in assemblers: - assemblerValidationDir = os.path.join(workingDir, assembler) - if not os.path.exists(assemblerValidationDir): - os.makedirs(assemblerValidationDir) - os.chdir(assemblerValidationDir) - #I am in the assembler specific validation directory - if "output" not in sample_config: - sys.exit("Error: sample cofig must contain output field") - outputName = sample_config["output"] - assemblerReference = os.path.join(assemblies_dir, assembler, "{}.scf.fasta".format(outputName)) - print assemblerReference - if not os.path.exists(assemblerReference): - print "for assembler {} no assembly is available, skypping this validation".format(assembler) - os.chdir("..") - continue - processed += 1 - #otherwise I have everyhitng I need - sample_config["reference"] = assemblerReference - - stream = file('{}_{}.yaml'.format(outputName, assembler), 'w') - yaml.dump(sample_config, stream) # Write a YAML representation of data to 'document.yaml'. 
- stream.close() - - command = "python ~/DE_NOVO_PIPELINE/de_novo_scilife/script/deNovo_pipeline.py --global-config {} --sample-config {}_{}.yaml".format(global_config, outputName, assembler) - subprocess.call(command, shell=True) - os.chdir("..") - if processed == 0: - for assembler in assemblers: - command = ["rm", "-r", assembler] - subprocess.call(command) - sys.exit("Error: no assembly available, probably wrong path specification") - - + sys.exit("Error: directory {} does not exists".format(args.assemblies_dir)) + + validation_dir = os.path.abspath(args.validation_dir) # save valiadation directory + assemblies_dir = os.path.abspath(args.assemblies_dir) # save assemblies directory + outputName = args.output + minContigLength = args.minContigLength + genomeSize = args.genomeSize + processed = 0 # count how many assembler I am going to process - ##Now everything is done - validationDirectory = os.getcwd() if not os.path.exists("LaTeX"): os.makedirs("LaTeX") os.chdir("LaTeX") @@ -67,29 +31,29 @@ def main(args): assemblyStats = [] for assembler in assemblers: #compute assembly statistics - if os.path.exists(os.path.join(assemblies_dir, assembler, "{}.scf.fasta".format(outputName))): - assemblySeq = os.path.join(assemblies_dir, assembler, "{}.scf.fasta".format(outputName)) - assemblyStats.append(computeAssemblyStats(assembler, assemblySeq, sample_config["minCtgLength"], sample_config["genomeSize"])) + assemblySeq = os.path.join(assemblies_dir, assembler, "{}.scf.fasta".format(outputName)) + if os.path.exists(assemblySeq): + assemblyStats.append(computeAssemblyStats(assembler, assemblySeq, minContigLength, genomeSize)) latex_document = _insert_stat_table(latex_document, assemblyStats) #now copy QAstast - if not os.path.exists("pictures"): - os.makedirs("pictures") + if not os.path.exists("QA_pictures"): + os.makedirs("QA_pictures") for assembler in assemblers: - if not os.path.exists(os.path.join("pictures", assembler)): - os.makedirs(os.path.join("pictures", assembler)) + if not os.path.exists(os.path.join("QA_pictures", assembler)): + os.makedirs(os.path.join("QA_pictures", assembler)) #directory structure created for assembler in assemblers: - if os.path.exists(os.path.join(assemblies_dir, assembler, "{}.scf.fasta".format(outputName))): + if os.path.exists(os.path.join(validation_dir, assembler, "reference", "{}.scf.fasta".format(outputName))): latex_document = _new_section(latex_document, assembler) - Original_CoverageDistribution200 = os.path.join(validationDirectory, assembler, "QAstats", "Coverage_distribution_noOutliers.png") - Original_GC_vs_Coverage = os.path.join(validationDirectory, assembler, "QAstats", "GC_vs_Coverage_noOutliers.png") - Original_GC_vs_CtgLength = os.path.join(validationDirectory, assembler, "QAstats", "GC_vs_CtgLength.png") - Original_MedianCov_vs_CtgLength = os.path.join(validationDirectory, assembler, "QAstats", "MedianCov_vs_CtgLength_noOutliers.png") - Copied_CoverageDistribution200 = os.path.join("pictures", assembler, "Coverage_distribution_noOutliers.png") - Copied_GC_vs_Coverage = os.path.join("pictures", assembler, "GC_vs_Coverage_noOutliers.png") - Copied_GC_vs_CtgLength = os.path.join("pictures", assembler, "GC_vs_CtgLength.png") - Copied_MedianCov_vs_CtgLength = os.path.join("pictures", assembler, "MedianCov_vs_CtgLength_noOutliers.png") + Original_CoverageDistribution200 = os.path.join(validation_dir, assembler, "QAstats", "Coverage_distribution_noOutliers.png") + Original_GC_vs_Coverage = os.path.join(validation_dir, assembler, "QAstats", 
"GC_vs_Coverage_noOutliers.png") + Original_GC_vs_CtgLength = os.path.join(validation_dir, assembler, "QAstats", "GC_vs_CtgLength.png") + Original_MedianCov_vs_CtgLength = os.path.join(validation_dir, assembler, "QAstats", "MedianCov_vs_CtgLength_noOutliers.png") + Copied_CoverageDistribution200 = os.path.join("QA_pictures", assembler, "Coverage_distribution_noOutliers.png") + Copied_GC_vs_Coverage = os.path.join("QA_pictures", assembler, "GC_vs_Coverage_noOutliers.png") + Copied_GC_vs_CtgLength = os.path.join("QA_pictures", assembler, "GC_vs_CtgLength.png") + Copied_MedianCov_vs_CtgLength = os.path.join("QA_pictures", assembler, "MedianCov_vs_CtgLength_noOutliers.png") sh.copy(Original_CoverageDistribution200, Copied_CoverageDistribution200) sh.copy(Original_GC_vs_Coverage , Copied_GC_vs_Coverage ) sh.copy(Original_GC_vs_CtgLength , Copied_GC_vs_CtgLength ) @@ -102,52 +66,59 @@ def main(args): latex_document = _insert_QA_figure(latex_document, pictures, "QA pictures") # now FRCurve latex_document = _new_section(latex_document, "FRCurve") - FRCurves = [] + + if not os.path.exists("FRCurves"): + os.makedirs("FRCurves") + os.chdir("FRCurves") + names = ["_FRC" , "COMPR_MP_FRC" , "COMPR_PE_FRC" , "HIGH_COV_PE_FRC" , "HIGH_NORM_COV_PE_FRC" ,"HIGH_OUTIE_MP_FRC" , "HIGH_OUTIE_PE_FRC" , "HIGH_SINGLE_MP_FRC" , "HIGH_SINGLE_PE_FRC" , "HIGH_SPAN_MP_FRC" , "HIGH_SPAN_PE_FRC" ,"LOW_COV_PE_FRC" , "LOW_NORM_COV_PE_FRC" , "STRECH_MP_FRC" , "STRECH_PE_FRC"] + for name in names: + FRCurves = [] + for assembler in assemblers: + FRCurve_Orig_name = os.path.join(validation_dir, assembler, "FRCurve", "{}{}.txt".format(outputName, name)) + FRCurves.append([assembler, FRCurve_Orig_name]) + FRCname = _plotFRCurve("{}_{}".format(outputName, name),FRCurves) + FRCurves = [] + #plot last FRCurve for assembler in assemblers: - if os.path.exists(os.path.join(assemblies_dir, assembler, "{}.scf.fasta".format(outputName))): - FRCurves.append([assembler, os.path.join(validationDirectory, assembler, "FRCurve", "{}_FRC.txt".format(outputName))]) + FRCurves.append([assembler, os.path.join(validation_dir, assembler, "FRCurve", "{}_FRC.txt".format(outputName))]) FRCname = _plotFRCurve(outputName,FRCurves) - latex_document = _new_figure(latex_document, FRCname, "Feature Response Curve compute on all Features") + print FRCname + os.chdir("..") + latex_document = _new_figure(latex_document, "FRCurves/{}".format(FRCname), "Feature Response Curve compute on all Features") latex_document = _latexFooter(latex_document) latex_document = _latexFooter(latex_document) - if args.generatePDF == 1: - with open("{0}.tex".format(outputName),'w') as f: - f.write(latex_document) - command = ["pdflatex", "{0}.tex".format(outputName)] - latex_stdOut = open("latex.stdOut", "a") - latex_stdErr = open("latex.stdErr", "a") - subprocess.call(command, stdout=latex_stdOut , stderr=latex_stdErr) + with open("{0}.tex".format(outputName),'w') as f: + f.write(latex_document) os.chdir("..") # now prepare delivery folder if not os.path.exists("{}_delivery_report".format(outputName)): os.makedirs("{}_delivery_report".format(outputName)) os.chdir("{}_delivery_report".format(outputName)) - #copy pdf report - if args.generatePDF == 1: - sh.copy(os.path.join(validationDirectory, "LaTeX", "{}.pdf".format(outputName)), "{}.pdf".format(outputName)) + #now copy QA tables for assembler in assemblers: - if os.path.exists(os.path.join(assemblies_dir, assembler, "{}.scf.fasta".format(outputName) )): + if os.path.exists(os.path.join(assemblies_dir, assembler, 
"{}.scf.fasta".format(outputName) )): if not os.path.exists(assembler): os.makedirs(assembler) if not os.path.exists(os.path.join(assembler, "assembly")): os.makedirs(os.path.join(assembler, "assembly")) - if os.path.exists(os.path.join(assemblies_dir, assembler, "{}.scf.fasta".format(outputName))): - sh.copy(os.path.join(assemblies_dir, assembler, "{}.scf.fasta".format(outputName)), os.path.join(assembler, "assembly", "{}.scf.fasta".format(outputName))) - if os.path.exists(os.path.join(assemblies_dir, assembler, "{}.ctg.fasta".format(outputName))): - sh.copy(os.path.join(assemblies_dir, assembler, "{}.ctg.fasta".format(outputName)), os.path.join(assembler, "assembly", "{}.ctg.fasta".format(outputName))) + + sh.copy(os.path.join(assemblies_dir, assembler, "{}.scf.fasta".format(outputName)), os.path.join(assembler, "assembly", "{}.scf.fasta".format(outputName))) + sh.copy(os.path.join(assemblies_dir, assembler, "{}.ctg.fasta".format(outputName)), os.path.join(assembler, "assembly", "{}.ctg.fasta".format(outputName))) if not os.path.exists(os.path.join(assembler, "QA_table")): os.makedirs(os.path.join(assembler, "QA_table")) - sh.copy(os.path.join(validationDirectory, assembler, "QAstats", "Contigs_Cov_SeqLen_GC.csv"), os.path.join(assembler, "QA_table", "Contigs_Cov_SeqLen_GC.csv")) + sh.copy(os.path.join(validation_dir, assembler, "QAstats", "Contigs_Cov_SeqLen_GC.csv"), os.path.join(assembler, "QA_table", "Contigs_Cov_SeqLen_GC.csv")) if not os.path.exists(os.path.join(assembler, "FRCurves")): os.makedirs(os.path.join(assembler, "FRCurves")) names = ["_FRC" , "COMPR_MP_FRC" , "COMPR_PE_FRC" , "HIGH_COV_PE_FRC" , "HIGH_NORM_COV_PE_FRC" ,"HIGH_OUTIE_MP_FRC" , "HIGH_OUTIE_PE_FRC" , "HIGH_SINGLE_MP_FRC" , "HIGH_SINGLE_PE_FRC" , "HIGH_SPAN_MP_FRC" , "HIGH_SPAN_PE_FRC" ,"LOW_COV_PE_FRC" , "LOW_NORM_COV_PE_FRC" , "STRECH_MP_FRC" , "STRECH_PE_FRC"] for name in names: - FRCurve_Orig_name = os.path.join(validationDirectory, assembler, "FRCurve", "{}{}.txt".format(outputName, name)) + FRCurve_Orig_name = os.path.join(validation_dir, assembler, "FRCurve", "{}{}.txt".format(outputName, name)) FRCurve_Dest_name = os.path.join(assembler, "FRCurves", "{}{}.txt".format(outputName, name)) sh.copy(FRCurve_Orig_name,FRCurve_Dest_name) + # now copy FRCurve pictures + sh.copytree("../LaTeX/FRCurves", "FRCurves") os.chdir("..") return @@ -372,11 +343,18 @@ def _latexHeader(sampleName, assemblers): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--assemblies-dir', help="Directory where assemblies are stored (one per folder, e.g., assembler/NAME.scf.fasta)", type=str) - parser.add_argument('--assemblers', action='append', nargs='+', help="List of assemblers to be evalueted") - parser.add_argument('--sample-config', help="Sample config. 
diff --git a/utils/reverse_complement.py b/utils/reverse_complement.py
new file mode 100644
index 0000000..ebd95cb
--- /dev/null
+++ b/utils/reverse_complement.py
@@ -0,0 +1,51 @@
+import sys, os, yaml, glob
+import subprocess
+import argparse
+import string
+import gzip
+from subprocess import Popen, PIPE
+
+
+
+
+def main(args):
+    workingDir = os.getcwd()
+
+    proc = Popen(["zcat", "{}".format(args.fastq)], stdout=PIPE, bufsize=1)
+    with gzip.open('{}.gz'.format(args.output), 'wb') as output_f:
+        for header in iter(proc.stdout.readline, b''):
+            header = header.rstrip()
+            sequence = proc.stdout.readline().rstrip()
+            comment = proc.stdout.readline().rstrip()
+            quality = proc.stdout.readline().rstrip()
+
+            sequence_rc = rc(sequence)
+            quality_rev = reverse(quality)
+
+            output_f.write("{}\n".format(header))
+            output_f.write("{}\n".format(sequence_rc))
+            output_f.write("{}\n".format(comment))
+            output_f.write("{}\n".format(quality_rev))
+
+    proc.communicate()
+
+    return
+
+def reverse(quality):
+    seq_rev = quality[::-1]
+    return seq_rev
+
+
+def rc(dna):
+    complements = string.maketrans('acgtrymkbdhvnACGTRYMKBDHVN', 'tgcayrkmvhdbnTGCAYRKMVHDBN')
+    rcseq = dna.translate(complements)[::-1]
+    return rcseq
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='Reverse complement the fastq file given as input')
+    parser.add_argument('--fastq', help="fastq file to reverse complement", type=str)
+    parser.add_argument('--output', help="name of the output file (a .gz extension is appended)", type=str)
+    args = parser.parse_args()
+
+    main(args)
\ No newline at end of file
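As a quick sanity check, the translate-based reverse complement used above behaves as in this minimal, self-contained sketch (Python 2 string.maketrans; on Python 3 the equivalent table would be built with str.maketrans):

    import string

    # Build the complement table once (IUPAC ambiguity codes included), then
    # complement and reverse the sequence in a single expression.
    complements = string.maketrans('acgtrymkbdhvnACGTRYMKBDHVN', 'tgcayrkmvhdbnTGCAYRKMVHDBN')

    def rc(dna):
        return dna.translate(complements)[::-1]

    print(rc("ACGTN"))  # expected output: NACGT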
diff --git a/utils/run_MP_analysis.py b/utils/run_MP_analysis.py
new file mode 100644
index 0000000..dcd1302
--- /dev/null
+++ b/utils/run_MP_analysis.py
@@ -0,0 +1,117 @@
+import sys, os, yaml, glob
+import subprocess
+import argparse
+import re
+
+
+def main(args):
+    projectFolder = os.getcwd()
+    samples_data_dir = args.sample_data_dir
+    projectName = os.path.basename(os.path.normpath(samples_data_dir))
+    for sample_dir_name in [dir for dir in os.listdir(samples_data_dir) if os.path.isdir(os.path.join(samples_data_dir, dir))]:
+        sample_folder = os.path.join(os.getcwd(), sample_dir_name)
+        if not os.path.exists(sample_folder):
+            os.makedirs(sample_folder)
+        os.chdir(sample_folder)
+        ## now that I am in the sample folder I can run the QC and MP analyses at the same time
+
+        pipeline = "MP"
+        tools = ["trimmomatic", "fastqc", "abyss", "align"]
+
+
+        sample_YAML_name = os.path.join(sample_folder, "{}_{}.yaml".format(sample_dir_name, pipeline))
+        sample_YAML = open(sample_YAML_name, 'w')
+
+        sample_YAML.write("pipeline:\n")
+        sample_YAML.write(" {}\n".format(pipeline))
+        sample_YAML.write("tools:\n")
+        sample_YAML.write(" {}\n".format(tools))
+        sample_YAML.write("output: {}\n".format(sample_dir_name)) ##TODO: output must become sampleName
+        sample_YAML.write("projectName: {}\n".format(projectName))
+        sample_YAML.write("kmer: 35\n")
+        sample_YAML.write("threads: 16\n")
+        sample_YAML.write("genomeSize: \n")
+        sample_YAML.write("adapters: {}\n".format(args.adapter))
+
+        if args.reference != "":
+            sample_YAML.write("reference: {}\n".format(args.reference))
+        sample_YAML.write("libraries:\n")
+
+        sample_data_dir = os.path.join(samples_data_dir,sample_dir_name)
+        flowcells_dirs = [os.path.join(sample_data_dir,flowcell) for flowcell in os.listdir(sample_data_dir) if os.path.isdir(os.path.join(sample_data_dir,flowcell))] # full path to flowcell
+
+        sample_files = []
+        for flowcell in flowcells_dirs:
+            sample_files.extend([os.path.join(flowcell, f) for f in os.listdir(flowcell) if (os.path.isfile(os.path.join(flowcell,f)) and re.search('.gz$',f))])
+        # now sample_files contains all the files sequenced for this sample
+
+        pair1_file = ""
+        pair2_file = ""
+        single = ""
+        library = 1
+        for file in [f for f in sample_files if "_1.fastq.gz" in f]:
+            sample_YAML.write("     lib{}:\n".format(library))
+            if "_1.fastq.gz" in file:
+                pair1_file = file
+                pair2_file = re.sub("_1.fastq.gz", "_2.fastq.gz", file)
+            elif "_2.fastq.gz" in file:
+                pair2_file = file
+                pair1_file = re.sub("_2.fastq.gz", "_1.fastq.gz", file)
+
+            sample_YAML.write("      pair1: {}\n".format(pair1_file))
+            sample_YAML.write("      pair2: {}\n".format(pair2_file))
+            sample_YAML.write("      orientation: {}\n".format(args.orientation))
+            sample_YAML.write("      insert: {}\n".format(args.insert))
+            sample_YAML.write("      std: {}\n".format(args.std))
+            sample_files.remove(pair1_file)
+            sample_files.remove(pair2_file)
+            library += 1
+
+        sample_YAML.close()
+        submit_job(sample_YAML_name, args.global_config, sample_dir_name , pipeline, args.env) # now I can submit the job to slurm
+        os.chdir(projectFolder)
+
+def submit_job(sample_config, global_config, output, pipeline, env):
+
+    workingDir = os.getcwd()
+    slurm_file = os.path.join(workingDir, "{}_{}.slurm".format(output,pipeline))
+    slurm_handle = open(slurm_file, "w")
+
+    slurm_handle.write("#! /bin/bash -l\n")
+    slurm_handle.write("set -e\n")
+    slurm_handle.write("#SBATCH -A b2013064\n")
+    slurm_handle.write("#SBATCH -o {}_{}.out\n".format(output,pipeline))
+    slurm_handle.write("#SBATCH -e {}_{}.err\n".format(output,pipeline))
+    slurm_handle.write("#SBATCH -J {}_{}.job\n".format(output,pipeline))
+    slurm_handle.write("#SBATCH -p node -n 16\n")
+    slurm_handle.write("#SBATCH -t 1-00:00:00\n")
+    slurm_handle.write("#SBATCH --mail-user francesco.vezzi@scilifelab.se\n")
+    slurm_handle.write("#SBATCH --mail-type=ALL\n")
+
+    slurm_handle.write("\n\n")
+    slurm_handle.write("source activate {}\n".format(env))
+    slurm_handle.write("module load abyss/1.3.5\n")
+    slurm_handle.write("load_modules\n")
+    slurm_handle.write("deNovo_pipeline.py --global-config {} --sample-config {}\n\n".format(global_config,sample_config))
+    slurm_handle.close()
+
+    command=("sbatch", slurm_file)
+    print(command)
+    #subprocess.call(command)
+
+
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--reference' , type=str, default="", help="path to the reference file")
+    parser.add_argument('--adapter' , type=str, required=True, help="path to the file containing the adapter sequences to be removed")
+    parser.add_argument('--global-config' , type=str, required=True, help="global configuration file")
+    parser.add_argument('--sample-data-dir', type=str, required=True, help="Path to directory (usually INBOX) containing the project (one dir per sample, scilife structure project/sample/flowcell/)")
+    parser.add_argument('--orientation' , type=str, required=True, help="orientation of the libraries")
+    parser.add_argument('--insert' , type=str, required=True, help="expected insert size of the libraries")
+    parser.add_argument('--std' , type=str, required=True, help="expected standard deviation of the insert size of the libraries")
+    parser.add_argument('--env' , type=str, default="DeNovoPipeline", help="name of the virtual environment (default is DeNovoPipeline)")
+    args = parser.parse_args()
+
+    main(args)
\ No newline at end of file
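For clarity, the mate-pairing convention the loop above relies on (read files ending in _1.fastq.gz / _2.fastq.gz within each flowcell directory) can be summarised with this small, hypothetical helper; the file name in the example is made up:

    import re

    def mate_of(fastq_path):
        # Derive the path of the mate file under the _1/_2 naming convention.
        if "_1.fastq.gz" in fastq_path:
            return re.sub("_1.fastq.gz", "_2.fastq.gz", fastq_path)
        if "_2.fastq.gz" in fastq_path:
            return re.sub("_2.fastq.gz", "_1.fastq.gz", fastq_path)
        return None

    print(mate_of("P123_101_140313_AC1234ACXX_L001_1.fastq.gz"))
    # -> P123_101_140313_AC1234ACXX_L001_2.fastq.gz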