Major changes in the project: the project is now a regular python pac…

…kage and extensive variation has been done on the MP pipeline in order to make it more general and flexibile
SciLifeLab · Mar 13, 2014 · 3870606 · 3870606
1 parent 25cec53
commit 3870606
Show file tree

Hide file tree

Showing 37 changed files with 1,575 additions and 459 deletions.
diff --git a/config_de_novo.yaml → config_files/config_de_novo.yaml b/config_de_novo.yaml → config_files/config_de_novo.yaml
diff --git a/config_de_novo_amanita.yaml → config_files/config_de_novo_amanita.yaml b/config_de_novo_amanita.yaml → config_files/config_de_novo_amanita.yaml
diff --git a/config_de_novo_picea.yaml → config_files/config_de_novo_picea.yaml b/config_de_novo_picea.yaml → config_files/config_de_novo_picea.yaml
diff --git a/config_de_novo_uppmax.yaml → config_files/config_de_novo_uppmax.yaml b/config_de_novo_uppmax.yaml → config_files/config_de_novo_uppmax.yaml
@@ -1,10 +1,9 @@
 Pipelines:
- MP
- QCcontrol
- assemble
- evaluete
- align
- user_define
+ QCcontrol: ["fastqc", "abyss", "trimmomatic"]
+ MP: ["fastqc", "abyss", "trimmomatic", "align"]
+ assemble: ["soapdenovo", "abyss"]
+ evaluete: ["qaTools", "FRC"]
+ align: []
 Tools:
  jellyfish:
   bin: /home/vezzi/DE_NOVO_PIPELINE/assembly_pipeline/tools/jellyfish-1.1.11/bin/jellyfish
@@ -14,7 +13,7 @@ Tools:
   options: [--threads,  "16" ,  --outdir,  fastqc]
  trimmomatic:
   bin: /home/vezzi/DE_NOVO_PIPELINE/tools/Trimmomatic-0.30/trimmomatic-0.30.jar
-  options: []
+  options: [""]
  masurca:
   bin: /home/vezzi/DE_NOVO_PIPELINE/tools/MaSuRCA-2.0.4/
   options: [""]
@@ -23,7 +22,7 @@ Tools:
   options: [""]
  abyss:
   bin: /sw/apps/bioinfo/abyss/1.3.5/milou/bin/
-  options: ["k=54", "np=16"]
+  options: [""]
  abyss_mergePairs:
   bin: /sw/apps/bioinfo/abyss/1.3.5/milou/bin/abyss-mergepairs
   options: ["-m 20", "-1 230", "-2 230"]
@@ -52,8 +51,8 @@ Tools:
   bin: /sw/apps/bioinfo/samtools/0.1.19/milou/bin/samtools
   options: ""
  qaTools:
-  bin: /home/vezzi/DE_NOVO_PIPELINE/tools/qaTools/
+  bin: /home/vezzi/DE_NOVO_PIPELINE/tools/qaTools/qaCompute
   options: ""
  trinity:
-  bin: ~/DE_NOVO_PIPELINE/tools/trinityrnaseq_r20131110/
+  bin: $HOME/DE_NOVO_PIPELINE/tools/trinityrnaseq_r20131110/
   options: []
diff --git a/de_novo_scilife/MP.py b/de_novo_scilife/MP.py
diff --git a/de_novo_scilife/MP.pyc b/de_novo_scilife/MP.pyc
diff --git a/de_novo_scilife/QCcontrol.py b/de_novo_scilife/QCcontrol.py
@@ -0,0 +1,184 @@
+import sys, os, yaml, glob
+import subprocess
+import pandas as pd
+from matplotlib import pyplot as plt
+from de_novo_scilife import common
+
+
+def run(global_config, sample_config):
+    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
+    if "tools" in sample_config:
+        """If so, execute them one after the other in the specified order (might not work)"""
+        for command in sample_config["tools"]:
+            """with this I pick up at run time the correct function in the current module"""
+            command_fn    = getattr(sys.modules[__name__] , "_run_{}".format(command))
+            """Update sample config, each command return sample_config and if necessary it modifies it"""
+            sample_config = command_fn(global_config, sample_config, sorted_libraries_by_insert)
+    else:
+        #run default pipeline for QC
+        sample_config = _run_fastqc(global_config, sample_config, sorted_libraries_by_insert)
+        sample_config = _run_abyss(global_config, sample_config, sorted_libraries_by_insert)
+        sample_config = _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert)
+
+
+
+def _run_fastqc(global_config, sample_config, sorted_libraries_by_insert):
+    mainDir = os.getcwd()
+    FastqcFolder = os.path.join(os.getcwd(), "fastqc")
+    if not os.path.exists(FastqcFolder):
+        os.makedirs(FastqcFolder)
+
+    program=global_config["Tools"]["fastqc"]["bin"]
+    program_options=global_config["Tools"]["fastqc"]["options"]
+    for library, libraryInfo in sorted_libraries_by_insert:
+        command = [program]
+        for option in program_options:
+            command.append(option)
+        read1=libraryInfo["pair1"]
+        read2=libraryInfo["pair2"]
+        command.append(read1)
+        if read2 is not None:
+            command.append(read2)
+        common.print_command(command)
+        folder_output_name = os.path.join(FastqcFolder, os.path.basename(read1).split(".fastq.gz")[0])
+        if not common.check_dryrun(sample_config) and not os.path.exists("{}_fastqc.zip".format(folder_output_name)):
+            fastq_stdOut = open(os.path.join(FastqcFolder , "{}_fastqc.stdout".format(library)), "a")
+            fastq_stdErr = open(os.path.join(FastqcFolder , "{}_fastqc.stderr".format(library)), "a")
+            subprocess.call(command, stdout=fastq_stdOut, stderr=fastq_stdErr)
+    sample_config["fastqc"] = FastqcFolder
+    return sample_config
+
+
+def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
+    mainDir = os.getcwd()
+    ABySS_Kmer_Folder = os.path.join(os.getcwd(), "abyss_kmer")
+    if "kmer" not in sample_config:
+        sys.exit("error in _run_abyss QCcontrol: kmer must be present in sample_config.yaml")
+
+    kmer = sample_config["kmer"]
+    if not os.path.exists(ABySS_Kmer_Folder):
+        os.makedirs(ABySS_Kmer_Folder)
+
+    os.chdir(ABySS_Kmer_Folder)
+
+    program = global_config["Tools"]["abyss"]["bin"]
+    program = os.path.join(os.path.dirname(program), "ABYSS-P")
+    program_options=global_config["Tools"]["abyss"]["options"]
+    if "abyss" in sample_config:
+        program_options=sample_config["abyss"]
+
+    threads = 16 # default for UPPMAX
+    if "threads" in sample_config :
+        threads = sample_config["threads"]
+
+    command = "mpirun -np {} {} ".format(threads, program)
+    command += "-k {} ".format(kmer)
+    command += "--coverage-hist=histogram.hist -o preUnitgs.fa"
+    for library, libraryInfo in sorted_libraries_by_insert:
+        read1=libraryInfo["pair1"]
+        read2=libraryInfo["pair2"]
+        orientation = libraryInfo["orientation"]
+        if orientation=="innie" or orientation=="outtie":
+            command += " {} ".format(read1)
+            if read2 is not None:
+                command += " {} ".format(read2)
+        if orientation == "none":
+            command += " {} ".format(read1)
+
+    common.print_command(command)
+    if not common.check_dryrun(sample_config) and not os.path.exists("histogram.hist"):
+        ABySS_Kmer_stdOut = open("ABySS_Kmer_Folder.stdOut", "a")
+        ABySS_Kmer_stdErr = open("ABySS_Kmer_Folder.stdErr", "a")
+        returnValue = subprocess.call(command, shell=True, stdout=ABySS_Kmer_stdOut, stderr=ABySS_Kmer_stdErr)
+        if returnValue > 0:
+            print "ABySS kmer plotting failed: unkwnown reason"
+        else :
+            subprocess.call(("rm", "preUnitgs.fa"))
+            _plotKmerPlot(1,200, kmer, "kmer_coverage_1_200.png")
+            _plotKmerPlot(1,500, kmer, "kmer_coverage_1_500.png")
+            _plotKmerPlot(15,200, kmer, "kmer_coverage_15_200.png")
+            _plotKmerPlot(15,500, kmer, "kmer_coverage_15_500.png")
+
+    os.chdir("..")
+    sample_config["abyss"] = ABySS_Kmer_Folder
+    return sample_config
+
+def _plotKmerPlot(min_limit, max_limit,kmer, output_name):
+    Kmer_histogram = pd.io.parsers.read_csv("histogram.hist", sep='\t', header=None)
+    Kmer_coverage  = Kmer_histogram[Kmer_histogram.columns[0]].tolist()
+    Kmer_count     = Kmer_histogram[Kmer_histogram.columns[1]].tolist()
+    Kmer_freq      = [Kmer_coverage[i]*Kmer_count[i] for i in range(len(Kmer_coverage))]
+    kmer_freq_peak = Kmer_freq.index(max(Kmer_freq[min_limit:max_limit]))	#coverage peak, disregarding initial peak
+    kmer_freq_peak_value=max(Kmer_freq[min_limit:max_limit])
+
+    xmax = max_limit
+    ymax = kmer_freq_peak_value + (kmer_freq_peak_value*0.30)
+
+    plt.plot(Kmer_coverage, Kmer_freq)
+    plt.title("K-mer length = {}".format(kmer))
+    plt.xlim((0,xmax))
+    plt.ylim((0,ymax))
+    plt.vlines(kmer_freq_peak, 0, kmer_freq_peak_value, colors='r', linestyles='--')
+    plt.text(kmer_freq_peak, kmer_freq_peak_value+2000, str(kmer_freq_peak))
+    plotname = "{}".format(output_name)
+    plt.savefig(plotname)
+    plt.clf()
+    return 0
+
+
+
+def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
+    print "running trimmomatic ..."
+
+    program        = global_config["Tools"]["trimmomatic"]["bin"]
+    program_folder = os.path.dirname(program)
+    adapterFile    = sample_config["adapters"]
+
+    if not os.path.exists(adapterFile):
+        print "Trimmomatic cannot be run as adapter file is not specified or points to unknown position: {}".format(adapterFile)
+        return sample_config
+
+    rootDir       = os.getcwd()
+    trimmomaticDir = os.path.join(rootDir, "Trimmomatic")
+    if not os.path.exists(trimmomaticDir):
+        os.makedirs(trimmomaticDir)
+    else:
+        print "trimmomatic assumed already run"
+        return sample_config #
+
+    os.chdir(trimmomaticDir)
+    #now I am in running dir, I need to process one by one the libraries
+    threads = 8
+    if "threads" in sample_config:
+        threads = sample_config["threads"]
+
+    for library, libraryInfo in sorted_libraries_by_insert:
+        read1=libraryInfo["pair1"]
+        read2=libraryInfo["pair2"]
+        orientation = libraryInfo["orientation"]
+        if orientation=="innie":
+            if read2 is not None:
+                read1_baseName = os.path.split(read1)[1].split(".")[0]
+                read2_baseName = os.path.split(read2)[1].split(".")[0]
+                output_read1_pair = os.path.join(trimmomaticDir,  "{}.fastq.gz".format(read1_baseName))
+                output_read1_sing = os.path.join(trimmomaticDir, "{}_u.fastq.gz".format(read1_baseName))
+                output_read2_pair = os.path.join(trimmomaticDir, "{}.fastq.gz".format(read2_baseName))
+                output_read2_sing = os.path.join(trimmomaticDir,  "{}_u.fastq.gz".format(read2_baseName))
+                command = ["java",  "-jar", program, "PE", "-threads", "{}".format(threads),  "-phred33",  read1, read2,  output_read1_pair ,output_read1_sing , output_read2_pair, output_read2_sing ,"ILLUMINACLIP:{}:2:30:10".format(adapterFile), "MINLEN:30" ]
+                #print ' '.join(map(str,command))
+                stdOut = open("{}_trimmomatic.stdOut".format(read1_baseName), "w")
+                stdErr = open("{}_trimmomatic.stdErr".format(read1_baseName), "w")
+
+                returnValue = subprocess.call(command, stdout=stdOut, stderr=stdErr) # run the program
+                if returnValue != 0:
+                    print "error while running command: {}".format(command)
+                else:
+                    libraryInfo["pair1"] = output_read1_pair
+                    libraryInfo["pair2"] = output_read2_pair
+
+    os.chdir(rootDir)
+    print "trimmomatic done"
+    print sample_config
+    return sample_config
+
+
diff --git a/de_novo_scilife/QCcontrol.pyc b/de_novo_scilife/QCcontrol.pyc
diff --git a/de_novo_scilife/__init__.py b/de_novo_scilife/__init__.py
diff --git a/de_novo_scilife/__init__.pyc b/de_novo_scilife/__init__.pyc