added trinity to assemble, improved pipeline logic

SciLifeLab · Jan 27, 2014 · 25cec53 · 25cec53
1 parent 9025ea2
commit 25cec53
Show file tree

Hide file tree

Showing 9 changed files with 251 additions and 148 deletions.
diff --git a/config_de_novo_uppmax.yaml b/config_de_novo_uppmax.yaml
@@ -14,10 +14,13 @@ Tools:
   options: [--threads,  "16" ,  --outdir,  fastqc]
  trimmomatic:
   bin: /home/vezzi/DE_NOVO_PIPELINE/tools/Trimmomatic-0.30/trimmomatic-0.30.jar
-  options: [""]
+  options: []
  masurca:
   bin: /home/vezzi/DE_NOVO_PIPELINE/tools/MaSuRCA-2.0.4/
-  options: 
+  options: [""]
+ allpaths:
+  bin: /home/vezzi/DE_NOVO_PIPELINE/tools/allpathslg-47918/bin/bin/
+  options: [""]
  abyss:
   bin: /sw/apps/bioinfo/abyss/1.3.5/milou/bin/
   options: ["k=54", "np=16"]
@@ -51,3 +54,6 @@ Tools:
  qaTools:
   bin: /home/vezzi/DE_NOVO_PIPELINE/tools/qaTools/
   options: ""
+ trinity:
+  bin: ~/DE_NOVO_PIPELINE/tools/trinityrnaseq_r20131110/
+  options: []
diff --git a/script/MP.py b/script/MP.py
@@ -1,29 +1,29 @@
 import sys, os, yaml, glob
 import subprocess
 import pandas as pd
-from matplotlib import pyplot as plt
 import align
 import common
 
 
 
 def run(global_config, sample_config):
-    sorted_libraries_by_insert = sorted(sample_config["libraries"].iteritems(), key=lambda (k,v): v["insert"])
-    if os.path.exists(os.path.join(os.getcwd(), "DATA")):
-        sorted_libraries_by_insert = common.update_sample_config(sorted_libraries_by_insert)
-    else:
-        sorted_libraries_by_insert = common.prepare_folder_structure(sorted_libraries_by_insert)
-
-    print "remove adator"
-    sample_config = _run_trimmomatic(global_config, sample_config,sorted_libraries_by_insert)
-    print "align sequences"
+    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
+    print sorted_libraries_by_insert
+    run_default(global_config, sample_config,sorted_libraries_by_insert)
+
+
+
+def run_default(global_config, sample_config,sorted_libraries_by_insert):
+    print "running the entire pipeline"
+    print " remove adator"
+    sample_config = _run_trimmomatic(global_config, sample_config,sorted_libraries_by_insert, "01_trimmomatic")
+
     sorted_libraries_by_insert = sorted(sample_config["libraries"].iteritems(), key=lambda (k,v): v["insert"]) # recompute sorted libraries by insert
-    ## TO DO: create a reference
 
-
-    if not os.path.exists("alignments"):
-        os.makedirs("alignments")
-    os.chdir("alignments")
+    print " align sequences"
+    if not os.path.exists("02_alignments"):
+        os.makedirs("02_alignments")
+    os.chdir("02_alignments")
     print sorted_libraries_by_insert
     sorted_libraries_by_insert =  align._align_reads(global_config, sample_config,  sorted_libraries_by_insert) # align reads
     print sorted_libraries_by_insert
@@ -40,31 +40,35 @@ def run(global_config, sample_config):
     return 0
 
 
-def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
-    print "running trimmomatic ..."
+def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert, step):
+    print "     running trimmomatic ..."
     mainDir = os.getcwd()
     program        = global_config["Tools"]["trimmomatic"]["bin"]
     program_folder = os.path.dirname(program)
-    adaprterFile = os.path.join(program_folder, "adapters", "NexteraMP.fa")
-    if not os.path.exists(adaprterFile):
-        print "file {} does not exists: it must be present: you must provide it!!!".format(adaprterFile)
+    adapterFile = os.path.join(program_folder, "adapters", "NexteraMP.fa")
+    if not os.path.exists(adapterFile):
+        print "file {} does not exists: it must be present: you must provide it!!!".format(adapterFile)
         return sample_config
 
+    runningDir = os.path.join(mainDir, step)
+    if not os.path.exists(runningDir):
+        os.makedirs(runningDir)
+    os.chdir(runningDir)
+    #now I am in running dir, I need to process one by one the libraries
+
     for library, libraryInfo in sorted_libraries_by_insert:
         read1=libraryInfo["pair1"]
         read2=libraryInfo["pair2"]
         orientation = libraryInfo["orientation"]
         insert = libraryInfo["insert"]
-        workingDir = "{}_MP_{}".format(library,insert)
-        if not os.path.exists(workingDir):
-            os.makedirs(workingDir)
-
-        os.chdir(workingDir)
-        currentDir = os.getcwd()
-        output_read1_pair = os.path.join(currentDir, "{}_MP_1.fastq.gz".format(library))
-        output_read1_sing = os.path.join(currentDir, "{}_MP_u_1.fastq.gz".format(library))
-        output_read2_pair = os.path.join(currentDir, "{}_MP_2.fastq.gz".format(library))
-        output_read2_sing = os.path.join(currentDir, "{}_MP_u_2.fastq.gz".format(library))
+        workingOnLibraryDir = os.path.join(runningDir, "{}_MP_{}".format(library,insert))
+        if not os.path.exists(workingOnLibraryDir):
+            os.makedirs(workingOnLibraryDir)
+        os.chdir(workingOnLibraryDir)
+        output_read1_pair = os.path.join(workingOnLibraryDir, "{}_MP_1.fastq.gz".format(library))
+        output_read1_sing = os.path.join(workingOnLibraryDir, "{}_MP_u_1.fastq.gz".format(library))
+        output_read2_pair = os.path.join(workingOnLibraryDir, "{}_MP_2.fastq.gz".format(library))
+        output_read2_sing = os.path.join(workingOnLibraryDir, "{}_MP_u_2.fastq.gz".format(library))
 
         if os.path.exists(output_read1_pair):
             print "library {} already computed: skyp this".format(library)
@@ -77,7 +81,7 @@ def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
         if "threads" in sample_config:
             threads = sample_config["threads"]
 
-        command = ["java",  "-jar", program, "PE", "-threads", "{}".format(threads),  "-phred33",  read1, read2,  output_read1_pair ,output_read1_sing , output_read2_pair, output_read2_sing ,"ILLUMINACLIP:{}:2:30:10".format(adaprterFile), "MINLEN:30" ]
+        command = ["java",  "-jar", program, "PE", "-threads", "{}".format(threads),  "-phred33",  read1, read2,  output_read1_pair ,output_read1_sing , output_read2_pair, output_read2_sing ,"ILLUMINACLIP:{}:2:30:10".format(adapterFile), "MINLEN:30" ]
         #print ' '.join(map(str,command))
         stdOut = open("trimmomatic.stdOut", "w")
         stdErr = open("trimmomatic.stdErr", "w")
@@ -89,86 +93,10 @@ def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
             libraryInfo["pair1"] = output_read1_pair
             libraryInfo["pair2"] = output_read2_pair
 
-
-        os.chdir("..")
-
+        os.chdir(runningDir)
 
+    os.chdir(mainDir)
     print "trimmomatic done"
     return sample_config
 
 
-def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
-    print "I am running abyss to check kmer content"
-    mainDir = os.getcwd()
-    ABySS_Kmer_Folder = os.path.join(os.getcwd(), "abyss_kmer")
-    if not os.path.exists(ABySS_Kmer_Folder):
-        os.makedirs(ABySS_Kmer_Folder)
-    else:
-        print "done (ABySS_Kmer_Folder folder already present, assumed already run)"
-        #return
-
-    os.chdir(ABySS_Kmer_Folder)
-    ABySS_Kmer_stdOut = open("ABySS_Kmer_Folder.stdOut", "a")
-    ABySS_Kmer_stdErr = open("ABySS_Kmer_Folder.stdErr", "a")
-
-    program = global_config["Tools"]["abyss"]["bin"]
-    program = os.path.join(os.path.dirname(program), "ABYSS-P")
-    program_options=global_config["Tools"]["abyss"]["options"]
-    if "abyss" in sample_config:
-        program_options=sample_config["abyss"]
-
-    threads = 8 # default for UPPMAX
-    if "threads" in sample_config :
-        threads = sample_config["threads"]
-
-    command = "mpirun -np {} {} ".format(threads, program)
-    kmer = sample_config["kmer"]
-    command += "-k {} ".format(kmer)
-    command += "--coverage-hist=histogram.hist -o preUnitgs.fa"
-    for library, libraryInfo in sorted_libraries_by_insert:
-        read1=libraryInfo["pair1"]
-        read2=libraryInfo["pair2"]
-        orientation = libraryInfo["orientation"]
-        if orientation=="innie":
-            command += " {} ".format(read1)
-            if read2 is not None:
-                command += " {} ".format(read2)
-        if orientation == "none":
-            command += " {} ".format(read1)
-    print command
-    subprocess.call(command, shell=True, stdout=ABySS_Kmer_stdOut, stderr=ABySS_Kmer_stdErr)
-    subprocess.call(("rm", "preUnitgs.fa"))
-    _plotKmerPlot()
-
-    os.chdir("..")
-    return
-
-def _plotKmerPlot():
-    Kmer_histogram = pd.io.parsers.read_csv("histogram.hist", sep='\t', header=None)
-    Kmer_coverage  = Kmer_histogram[Kmer_histogram.columns[0]].tolist()
-    Kmer_count     = Kmer_histogram[Kmer_histogram.columns[1]].tolist()
-    Kmer_freq      = [Kmer_coverage[i]*Kmer_count[i] for i in range(len(Kmer_coverage))]
-    kmer_freq_peak = Kmer_freq.index(max(Kmer_freq[15:400]))	#coverage peak, disregarding initial peak
-    kmer_freq_peak_value=max(Kmer_freq[15:400])
-
-    xmax = 200
-    ymax = kmer_freq_peak_value + (kmer_freq_peak_value*0.30)
-
-    plt.plot(Kmer_coverage, Kmer_freq)
-    plt.title('K-mer length = %s' % 54)
-    plt.xlim((0,xmax))
-    plt.ylim((0,ymax))
-    plt.vlines(kmer_freq_peak, 0, kmer_freq_peak_value, colors='r', linestyles='--')
-    plt.text(kmer_freq_peak, kmer_freq_peak_value+2000, str(kmer_freq_peak))
-    plotname = "kmer_coverage.png"
-    plt.savefig(plotname)
-    plt.clf()
-    return 0
-
-
-def _run_jellyfish(global_config, sample_config, sorted_libraries_by_insert):
-    print "Jellyfish still to be implemented"
-
-
-
-
diff --git a/script/QCcontrol.py b/script/QCcontrol.py
@@ -5,14 +5,8 @@
 import common
 
 
-
 def run(global_config, sample_config):
-    sorted_libraries_by_insert = sorted(sample_config["libraries"].iteritems(), key=lambda (k,v): v["insert"])
-    if os.path.exists(os.path.join(os.getcwd(), "DATA")):
-        sorted_libraries_by_insert = common.update_sample_config(sorted_libraries_by_insert)
-    else:
-        sorted_libraries_by_insert = common.prepare_folder_structure(sorted_libraries_by_insert)
-
+    sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
     if "tools" in sample_config:
         #need to follow the commands listed here
         for command in sample_config["tools"]:
@@ -34,7 +28,7 @@ def _run_fastqc(global_config, sample_config, sorted_libraries_by_insert):
         os.makedirs(FastqcFolder)
     else:
         print "done (fastqc folder already present, assumed already run)"
-        return
+        return sample_config
     fastq_stdOut = open("fastqc.stdOut", "a")
     fastq_stdErr = open("fastqc.stdErr", "a")
     program=global_config["Tools"]["fastqc"]["bin"]
@@ -62,7 +56,10 @@ def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
         os.makedirs(ABySS_Kmer_Folder)
     else:
         print "done (ABySS_Kmer_Folder folder already present, assumed already run)"
-        #return
+        os.chdir(ABySS_Kmer_Folder)
+        _plotKmerPlot(1 , 50)
+        os.chdir("..")
+        return
 
     os.chdir(ABySS_Kmer_Folder)
     ABySS_Kmer_stdOut = open("ABySS_Kmer_Folder.stdOut", "a")
@@ -95,20 +92,20 @@ def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
     print command
     subprocess.call(command, shell=True, stdout=ABySS_Kmer_stdOut, stderr=ABySS_Kmer_stdErr)
     subprocess.call(("rm", "preUnitgs.fa"))
-    _plotKmerPlot()
+    _plotKmerPlot(15,200)
 
     os.chdir("..")
     return
 
-def _plotKmerPlot():
+def _plotKmerPlot(min_limit, max_limit):
     Kmer_histogram = pd.io.parsers.read_csv("histogram.hist", sep='\t', header=None)
     Kmer_coverage  = Kmer_histogram[Kmer_histogram.columns[0]].tolist()
     Kmer_count     = Kmer_histogram[Kmer_histogram.columns[1]].tolist()
     Kmer_freq      = [Kmer_coverage[i]*Kmer_count[i] for i in range(len(Kmer_coverage))]
-    kmer_freq_peak = Kmer_freq.index(max(Kmer_freq[15:400]))	#coverage peak, disregarding initial peak
-    kmer_freq_peak_value=max(Kmer_freq[15:400])
+    kmer_freq_peak = Kmer_freq.index(max(Kmer_freq[min_limit:max_limit]))	#coverage peak, disregarding initial peak
+    kmer_freq_peak_value=max(Kmer_freq[min_limit:max_limit])
 
-    xmax = 200
+    xmax = max_limit
     ymax = kmer_freq_peak_value + (kmer_freq_peak_value*0.30)
 
     plt.plot(Kmer_coverage, Kmer_freq)

diff --git a/script/align.py b/script/align.py
@@ -4,6 +4,8 @@
 import string
 import sys
 import gzip
+import pandas as pd
+
 
 
 def _align_reads(global_config, sample_config, sorted_libraries_by_insert):
@@ -233,7 +235,6 @@ def align_bwa_mem(global_config, read1, read2, reference, threads):
         os.chdir("..")
         return BAMsorted
 
-
     #if not os.path.exists(SAMMapped) and not os.path.exists(BAMunsorted):
     #    with open(SAMMapped, "w") as fh:
     #            stdErr = open("bwa.stdErr", "w")
@@ -250,7 +251,6 @@ def align_bwa_mem(global_config, read1, read2, reference, threads):
 
     bwa_mem_command       = [aligner, "mem", "-M", "-t", "{}".format(threads), reference, read1, read2]
     samtools_view_command = [samtools, "view", "-b", "-S",  "-u",  "-"]
-    print "aligning..."
 
     if not os.path.exists(BAMunsorted):
         command = "{} | {} > {}".format(" ".join(bwa_mem_command), " ".join(samtools_view_command), BAMunsorted)
@@ -375,3 +375,51 @@ def align_bwa(read1, read2, reference):
     BAMsorted = os.path.abspath(BAMsorted)
     os.chdir("..")
     return BAMsorted
+
+
+def _run_pileup(global_config, bamfile):
+    """
+    Perform samtools pileup on a .bam-file
+    """
+
+
+    if "samtools" in global_config["Tools"]:
+        samtools = global_config["Tools"]["samtools"]["bin"]
+    elif not common.which("samtools"):
+        sys.exit("error while trying to run samtools: samtools not present in the path and not in global config, please make sure to install samtools properly")
+
+    pileupfile = bamfile.replace('.bam', '_coverage.csv')
+    pileup_cmd = "{} mpileup {} | awk '{print $2, $4}' > {}".format(samtools, bamfile, pileupfile)
+    p1 = subprocess.Popen(pileup_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
+    p1.wait()
+    if p1.returncode == 0:
+        return pileupfile
+    else:
+        print "Could not perform mpileup"
+        return 1
+
+
+def plot_coverage(pileupfile, samplename=None):
+    """
+    Plot coverage from samtools pileup output
+    """
+    if not samplename:
+        samplename = pileupfile.split('.')[0]
+
+    df = pd.io.parsers.read_csv(pileupfile, sep=' ', names=['pos', 'cov'])
+    pl = df.plot(x='pos', y='cov')
+    avg_cov = sum(df['cov'])/len(df)        #Calculate average coverage
+    pl2 = plt.plot(range(len(df)), [avg_cov]*len(df), 'r--', linewidth=2)
+    plt.xlim([0,len(df)])
+    plt.ylabel('Coverage')
+    plt.xlabel('Position')
+    plt.title(samplename)
+    plt.savefig(samplename)     #Save plot as .png
+    return 0
+
+
+
+
+
+
+