Skip to content

Commit

Permalink
added trinity to assemble, improved pipeline logic
Browse files Browse the repository at this point in the history
  • Loading branch information
Francesco Vezzi committed Jan 27, 2014
1 parent 9025ea2 commit 25cec53
Show file tree
Hide file tree
Showing 9 changed files with 251 additions and 148 deletions.
10 changes: 8 additions & 2 deletions config_de_novo_uppmax.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,13 @@ Tools:
options: [--threads, "16" , --outdir, fastqc]
trimmomatic:
bin: /home/vezzi/DE_NOVO_PIPELINE/tools/Trimmomatic-0.30/trimmomatic-0.30.jar
options: [""]
options: []
masurca:
bin: /home/vezzi/DE_NOVO_PIPELINE/tools/MaSuRCA-2.0.4/
options:
options: [""]
allpaths:
bin: /home/vezzi/DE_NOVO_PIPELINE/tools/allpathslg-47918/bin/bin/
options: [""]
abyss:
bin: /sw/apps/bioinfo/abyss/1.3.5/milou/bin/
options: ["k=54", "np=16"]
Expand Down Expand Up @@ -51,3 +54,6 @@ Tools:
qaTools:
bin: /home/vezzi/DE_NOVO_PIPELINE/tools/qaTools/
options: ""
trinity:
bin: ~/DE_NOVO_PIPELINE/tools/trinityrnaseq_r20131110/
options: []
146 changes: 37 additions & 109 deletions script/MP.py
Original file line number Diff line number Diff line change
@@ -1,29 +1,29 @@
import sys, os, yaml, glob
import subprocess
import pandas as pd
from matplotlib import pyplot as plt
import align
import common



def run(global_config, sample_config):
sorted_libraries_by_insert = sorted(sample_config["libraries"].iteritems(), key=lambda (k,v): v["insert"])
if os.path.exists(os.path.join(os.getcwd(), "DATA")):
sorted_libraries_by_insert = common.update_sample_config(sorted_libraries_by_insert)
else:
sorted_libraries_by_insert = common.prepare_folder_structure(sorted_libraries_by_insert)

print "remove adator"
sample_config = _run_trimmomatic(global_config, sample_config,sorted_libraries_by_insert)
print "align sequences"
sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
print sorted_libraries_by_insert
run_default(global_config, sample_config,sorted_libraries_by_insert)



def run_default(global_config, sample_config,sorted_libraries_by_insert):
print "running the entire pipeline"
print " remove adator"
sample_config = _run_trimmomatic(global_config, sample_config,sorted_libraries_by_insert, "01_trimmomatic")

sorted_libraries_by_insert = sorted(sample_config["libraries"].iteritems(), key=lambda (k,v): v["insert"]) # recompute sorted libraries by insert
## TO DO: create a reference


if not os.path.exists("alignments"):
os.makedirs("alignments")
os.chdir("alignments")
print " align sequences"
if not os.path.exists("02_alignments"):
os.makedirs("02_alignments")
os.chdir("02_alignments")
print sorted_libraries_by_insert
sorted_libraries_by_insert = align._align_reads(global_config, sample_config, sorted_libraries_by_insert) # align reads
print sorted_libraries_by_insert
Expand All @@ -40,31 +40,35 @@ def run(global_config, sample_config):
return 0


def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
print "running trimmomatic ..."
def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert, step):
print " running trimmomatic ..."
mainDir = os.getcwd()
program = global_config["Tools"]["trimmomatic"]["bin"]
program_folder = os.path.dirname(program)
adaprterFile = os.path.join(program_folder, "adapters", "NexteraMP.fa")
if not os.path.exists(adaprterFile):
print "file {} does not exists: it must be present: you must provide it!!!".format(adaprterFile)
adapterFile = os.path.join(program_folder, "adapters", "NexteraMP.fa")
if not os.path.exists(adapterFile):
print "file {} does not exists: it must be present: you must provide it!!!".format(adapterFile)
return sample_config

runningDir = os.path.join(mainDir, step)
if not os.path.exists(runningDir):
os.makedirs(runningDir)
os.chdir(runningDir)
#now I am in running dir, I need to process one by one the libraries

for library, libraryInfo in sorted_libraries_by_insert:
read1=libraryInfo["pair1"]
read2=libraryInfo["pair2"]
orientation = libraryInfo["orientation"]
insert = libraryInfo["insert"]
workingDir = "{}_MP_{}".format(library,insert)
if not os.path.exists(workingDir):
os.makedirs(workingDir)

os.chdir(workingDir)
currentDir = os.getcwd()
output_read1_pair = os.path.join(currentDir, "{}_MP_1.fastq.gz".format(library))
output_read1_sing = os.path.join(currentDir, "{}_MP_u_1.fastq.gz".format(library))
output_read2_pair = os.path.join(currentDir, "{}_MP_2.fastq.gz".format(library))
output_read2_sing = os.path.join(currentDir, "{}_MP_u_2.fastq.gz".format(library))
workingOnLibraryDir = os.path.join(runningDir, "{}_MP_{}".format(library,insert))
if not os.path.exists(workingOnLibraryDir):
os.makedirs(workingOnLibraryDir)
os.chdir(workingOnLibraryDir)
output_read1_pair = os.path.join(workingOnLibraryDir, "{}_MP_1.fastq.gz".format(library))
output_read1_sing = os.path.join(workingOnLibraryDir, "{}_MP_u_1.fastq.gz".format(library))
output_read2_pair = os.path.join(workingOnLibraryDir, "{}_MP_2.fastq.gz".format(library))
output_read2_sing = os.path.join(workingOnLibraryDir, "{}_MP_u_2.fastq.gz".format(library))

if os.path.exists(output_read1_pair):
print "library {} already computed: skyp this".format(library)
Expand All @@ -77,7 +81,7 @@ def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
if "threads" in sample_config:
threads = sample_config["threads"]

command = ["java", "-jar", program, "PE", "-threads", "{}".format(threads), "-phred33", read1, read2, output_read1_pair ,output_read1_sing , output_read2_pair, output_read2_sing ,"ILLUMINACLIP:{}:2:30:10".format(adaprterFile), "MINLEN:30" ]
command = ["java", "-jar", program, "PE", "-threads", "{}".format(threads), "-phred33", read1, read2, output_read1_pair ,output_read1_sing , output_read2_pair, output_read2_sing ,"ILLUMINACLIP:{}:2:30:10".format(adapterFile), "MINLEN:30" ]
#print ' '.join(map(str,command))
stdOut = open("trimmomatic.stdOut", "w")
stdErr = open("trimmomatic.stdErr", "w")
Expand All @@ -89,86 +93,10 @@ def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
libraryInfo["pair1"] = output_read1_pair
libraryInfo["pair2"] = output_read2_pair


os.chdir("..")

os.chdir(runningDir)

os.chdir(mainDir)
print "trimmomatic done"
return sample_config


def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
print "I am running abyss to check kmer content"
mainDir = os.getcwd()
ABySS_Kmer_Folder = os.path.join(os.getcwd(), "abyss_kmer")
if not os.path.exists(ABySS_Kmer_Folder):
os.makedirs(ABySS_Kmer_Folder)
else:
print "done (ABySS_Kmer_Folder folder already present, assumed already run)"
#return

os.chdir(ABySS_Kmer_Folder)
ABySS_Kmer_stdOut = open("ABySS_Kmer_Folder.stdOut", "a")
ABySS_Kmer_stdErr = open("ABySS_Kmer_Folder.stdErr", "a")

program = global_config["Tools"]["abyss"]["bin"]
program = os.path.join(os.path.dirname(program), "ABYSS-P")
program_options=global_config["Tools"]["abyss"]["options"]
if "abyss" in sample_config:
program_options=sample_config["abyss"]

threads = 8 # default for UPPMAX
if "threads" in sample_config :
threads = sample_config["threads"]

command = "mpirun -np {} {} ".format(threads, program)
kmer = sample_config["kmer"]
command += "-k {} ".format(kmer)
command += "--coverage-hist=histogram.hist -o preUnitgs.fa"
for library, libraryInfo in sorted_libraries_by_insert:
read1=libraryInfo["pair1"]
read2=libraryInfo["pair2"]
orientation = libraryInfo["orientation"]
if orientation=="innie":
command += " {} ".format(read1)
if read2 is not None:
command += " {} ".format(read2)
if orientation == "none":
command += " {} ".format(read1)
print command
subprocess.call(command, shell=True, stdout=ABySS_Kmer_stdOut, stderr=ABySS_Kmer_stdErr)
subprocess.call(("rm", "preUnitgs.fa"))
_plotKmerPlot()

os.chdir("..")
return

def _plotKmerPlot():
Kmer_histogram = pd.io.parsers.read_csv("histogram.hist", sep='\t', header=None)
Kmer_coverage = Kmer_histogram[Kmer_histogram.columns[0]].tolist()
Kmer_count = Kmer_histogram[Kmer_histogram.columns[1]].tolist()
Kmer_freq = [Kmer_coverage[i]*Kmer_count[i] for i in range(len(Kmer_coverage))]
kmer_freq_peak = Kmer_freq.index(max(Kmer_freq[15:400])) #coverage peak, disregarding initial peak
kmer_freq_peak_value=max(Kmer_freq[15:400])

xmax = 200
ymax = kmer_freq_peak_value + (kmer_freq_peak_value*0.30)

plt.plot(Kmer_coverage, Kmer_freq)
plt.title('K-mer length = %s' % 54)
plt.xlim((0,xmax))
plt.ylim((0,ymax))
plt.vlines(kmer_freq_peak, 0, kmer_freq_peak_value, colors='r', linestyles='--')
plt.text(kmer_freq_peak, kmer_freq_peak_value+2000, str(kmer_freq_peak))
plotname = "kmer_coverage.png"
plt.savefig(plotname)
plt.clf()
return 0


def _run_jellyfish(global_config, sample_config, sorted_libraries_by_insert):
print "Jellyfish still to be implemented"




25 changes: 11 additions & 14 deletions script/QCcontrol.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,14 +5,8 @@
import common



def run(global_config, sample_config):
sorted_libraries_by_insert = sorted(sample_config["libraries"].iteritems(), key=lambda (k,v): v["insert"])
if os.path.exists(os.path.join(os.getcwd(), "DATA")):
sorted_libraries_by_insert = common.update_sample_config(sorted_libraries_by_insert)
else:
sorted_libraries_by_insert = common.prepare_folder_structure(sorted_libraries_by_insert)

sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
if "tools" in sample_config:
#need to follow the commands listed here
for command in sample_config["tools"]:
Expand All @@ -34,7 +28,7 @@ def _run_fastqc(global_config, sample_config, sorted_libraries_by_insert):
os.makedirs(FastqcFolder)
else:
print "done (fastqc folder already present, assumed already run)"
return
return sample_config
fastq_stdOut = open("fastqc.stdOut", "a")
fastq_stdErr = open("fastqc.stdErr", "a")
program=global_config["Tools"]["fastqc"]["bin"]
Expand Down Expand Up @@ -62,7 +56,10 @@ def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
os.makedirs(ABySS_Kmer_Folder)
else:
print "done (ABySS_Kmer_Folder folder already present, assumed already run)"
#return
os.chdir(ABySS_Kmer_Folder)
_plotKmerPlot(1 , 50)
os.chdir("..")
return

os.chdir(ABySS_Kmer_Folder)
ABySS_Kmer_stdOut = open("ABySS_Kmer_Folder.stdOut", "a")
Expand Down Expand Up @@ -95,20 +92,20 @@ def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
print command
subprocess.call(command, shell=True, stdout=ABySS_Kmer_stdOut, stderr=ABySS_Kmer_stdErr)
subprocess.call(("rm", "preUnitgs.fa"))
_plotKmerPlot()
_plotKmerPlot(15,200)

os.chdir("..")
return

def _plotKmerPlot():
def _plotKmerPlot(min_limit, max_limit):
Kmer_histogram = pd.io.parsers.read_csv("histogram.hist", sep='\t', header=None)
Kmer_coverage = Kmer_histogram[Kmer_histogram.columns[0]].tolist()
Kmer_count = Kmer_histogram[Kmer_histogram.columns[1]].tolist()
Kmer_freq = [Kmer_coverage[i]*Kmer_count[i] for i in range(len(Kmer_coverage))]
kmer_freq_peak = Kmer_freq.index(max(Kmer_freq[15:400])) #coverage peak, disregarding initial peak
kmer_freq_peak_value=max(Kmer_freq[15:400])
kmer_freq_peak = Kmer_freq.index(max(Kmer_freq[min_limit:max_limit])) #coverage peak, disregarding initial peak
kmer_freq_peak_value=max(Kmer_freq[min_limit:max_limit])

xmax = 200
xmax = max_limit
ymax = kmer_freq_peak_value + (kmer_freq_peak_value*0.30)

plt.plot(Kmer_coverage, Kmer_freq)
Expand Down
52 changes: 50 additions & 2 deletions script/align.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
import string
import sys
import gzip
import pandas as pd



def _align_reads(global_config, sample_config, sorted_libraries_by_insert):
Expand Down Expand Up @@ -233,7 +235,6 @@ def align_bwa_mem(global_config, read1, read2, reference, threads):
os.chdir("..")
return BAMsorted


#if not os.path.exists(SAMMapped) and not os.path.exists(BAMunsorted):
# with open(SAMMapped, "w") as fh:
# stdErr = open("bwa.stdErr", "w")
Expand All @@ -250,7 +251,6 @@ def align_bwa_mem(global_config, read1, read2, reference, threads):

bwa_mem_command = [aligner, "mem", "-M", "-t", "{}".format(threads), reference, read1, read2]
samtools_view_command = [samtools, "view", "-b", "-S", "-u", "-"]
print "aligning..."

if not os.path.exists(BAMunsorted):
command = "{} | {} > {}".format(" ".join(bwa_mem_command), " ".join(samtools_view_command), BAMunsorted)
Expand Down Expand Up @@ -375,3 +375,51 @@ def align_bwa(read1, read2, reference):
BAMsorted = os.path.abspath(BAMsorted)
os.chdir("..")
return BAMsorted


def _run_pileup(global_config, bamfile):
"""
Perform samtools pileup on a .bam-file
"""


if "samtools" in global_config["Tools"]:
samtools = global_config["Tools"]["samtools"]["bin"]
elif not common.which("samtools"):
sys.exit("error while trying to run samtools: samtools not present in the path and not in global config, please make sure to install samtools properly")

pileupfile = bamfile.replace('.bam', '_coverage.csv')
pileup_cmd = "{} mpileup {} | awk '{print $2, $4}' > {}".format(samtools, bamfile, pileupfile)
p1 = subprocess.Popen(pileup_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT)
p1.wait()
if p1.returncode == 0:
return pileupfile
else:
print "Could not perform mpileup"
return 1


def plot_coverage(pileupfile, samplename=None):
"""
Plot coverage from samtools pileup output
"""
if not samplename:
samplename = pileupfile.split('.')[0]

df = pd.io.parsers.read_csv(pileupfile, sep=' ', names=['pos', 'cov'])
pl = df.plot(x='pos', y='cov')
avg_cov = sum(df['cov'])/len(df) #Calculate average coverage
pl2 = plt.plot(range(len(df)), [avg_cov]*len(df), 'r--', linewidth=2)
plt.xlim([0,len(df)])
plt.ylabel('Coverage')
plt.xlabel('Position')
plt.title(samplename)
plt.savefig(samplename) #Save plot as .png
return 0







Loading

0 comments on commit 25cec53

Please sign in to comment.