Skip to content

Commit

Permalink
Major changes in the project: the project is now a regular python pac…
Browse files Browse the repository at this point in the history
…kage and extensive variation has been done on the MP pipeline in order to make it more general and flexibile
  • Loading branch information
Francesco Vezzi committed Mar 13, 2014
1 parent 25cec53 commit 3870606
Show file tree
Hide file tree
Showing 37 changed files with 1,575 additions and 459 deletions.
File renamed without changes.
File renamed without changes.
File renamed without changes.
Original file line number Diff line number Diff line change
@@ -1,10 +1,9 @@
Pipelines:
MP
QCcontrol
assemble
evaluete
align
user_define
QCcontrol: ["fastqc", "abyss", "trimmomatic"]
MP: ["fastqc", "abyss", "trimmomatic", "align"]
assemble: ["soapdenovo", "abyss"]
evaluete: ["qaTools", "FRC"]
align: []
Tools:
jellyfish:
bin: /home/vezzi/DE_NOVO_PIPELINE/assembly_pipeline/tools/jellyfish-1.1.11/bin/jellyfish
Expand All @@ -14,7 +13,7 @@ Tools:
options: [--threads, "16" , --outdir, fastqc]
trimmomatic:
bin: /home/vezzi/DE_NOVO_PIPELINE/tools/Trimmomatic-0.30/trimmomatic-0.30.jar
options: []
options: [""]
masurca:
bin: /home/vezzi/DE_NOVO_PIPELINE/tools/MaSuRCA-2.0.4/
options: [""]
Expand All @@ -23,7 +22,7 @@ Tools:
options: [""]
abyss:
bin: /sw/apps/bioinfo/abyss/1.3.5/milou/bin/
options: ["k=54", "np=16"]
options: [""]
abyss_mergePairs:
bin: /sw/apps/bioinfo/abyss/1.3.5/milou/bin/abyss-mergepairs
options: ["-m 20", "-1 230", "-2 230"]
Expand Down Expand Up @@ -52,8 +51,8 @@ Tools:
bin: /sw/apps/bioinfo/samtools/0.1.19/milou/bin/samtools
options: ""
qaTools:
bin: /home/vezzi/DE_NOVO_PIPELINE/tools/qaTools/
bin: /home/vezzi/DE_NOVO_PIPELINE/tools/qaTools/qaCompute
options: ""
trinity:
bin: ~/DE_NOVO_PIPELINE/tools/trinityrnaseq_r20131110/
bin: $HOME/DE_NOVO_PIPELINE/tools/trinityrnaseq_r20131110/
options: []
379 changes: 379 additions & 0 deletions de_novo_scilife/MP.py

Large diffs are not rendered by default.

Binary file added de_novo_scilife/MP.pyc
Binary file not shown.
184 changes: 184 additions & 0 deletions de_novo_scilife/QCcontrol.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
import sys, os, yaml, glob
import subprocess
import pandas as pd
from matplotlib import pyplot as plt
from de_novo_scilife import common


def run(global_config, sample_config):
sorted_libraries_by_insert = common._sort_libraries_by_insert(sample_config)
if "tools" in sample_config:
"""If so, execute them one after the other in the specified order (might not work)"""
for command in sample_config["tools"]:
"""with this I pick up at run time the correct function in the current module"""
command_fn = getattr(sys.modules[__name__] , "_run_{}".format(command))
"""Update sample config, each command return sample_config and if necessary it modifies it"""
sample_config = command_fn(global_config, sample_config, sorted_libraries_by_insert)
else:
#run default pipeline for QC
sample_config = _run_fastqc(global_config, sample_config, sorted_libraries_by_insert)
sample_config = _run_abyss(global_config, sample_config, sorted_libraries_by_insert)
sample_config = _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert)



def _run_fastqc(global_config, sample_config, sorted_libraries_by_insert):
mainDir = os.getcwd()
FastqcFolder = os.path.join(os.getcwd(), "fastqc")
if not os.path.exists(FastqcFolder):
os.makedirs(FastqcFolder)

program=global_config["Tools"]["fastqc"]["bin"]
program_options=global_config["Tools"]["fastqc"]["options"]
for library, libraryInfo in sorted_libraries_by_insert:
command = [program]
for option in program_options:
command.append(option)
read1=libraryInfo["pair1"]
read2=libraryInfo["pair2"]
command.append(read1)
if read2 is not None:
command.append(read2)
common.print_command(command)
folder_output_name = os.path.join(FastqcFolder, os.path.basename(read1).split(".fastq.gz")[0])
if not common.check_dryrun(sample_config) and not os.path.exists("{}_fastqc.zip".format(folder_output_name)):
fastq_stdOut = open(os.path.join(FastqcFolder , "{}_fastqc.stdout".format(library)), "a")
fastq_stdErr = open(os.path.join(FastqcFolder , "{}_fastqc.stderr".format(library)), "a")
subprocess.call(command, stdout=fastq_stdOut, stderr=fastq_stdErr)
sample_config["fastqc"] = FastqcFolder
return sample_config


def _run_abyss(global_config, sample_config, sorted_libraries_by_insert):
mainDir = os.getcwd()
ABySS_Kmer_Folder = os.path.join(os.getcwd(), "abyss_kmer")
if "kmer" not in sample_config:
sys.exit("error in _run_abyss QCcontrol: kmer must be present in sample_config.yaml")

kmer = sample_config["kmer"]
if not os.path.exists(ABySS_Kmer_Folder):
os.makedirs(ABySS_Kmer_Folder)

os.chdir(ABySS_Kmer_Folder)

program = global_config["Tools"]["abyss"]["bin"]
program = os.path.join(os.path.dirname(program), "ABYSS-P")
program_options=global_config["Tools"]["abyss"]["options"]
if "abyss" in sample_config:
program_options=sample_config["abyss"]

threads = 16 # default for UPPMAX
if "threads" in sample_config :
threads = sample_config["threads"]

command = "mpirun -np {} {} ".format(threads, program)
command += "-k {} ".format(kmer)
command += "--coverage-hist=histogram.hist -o preUnitgs.fa"
for library, libraryInfo in sorted_libraries_by_insert:
read1=libraryInfo["pair1"]
read2=libraryInfo["pair2"]
orientation = libraryInfo["orientation"]
if orientation=="innie" or orientation=="outtie":
command += " {} ".format(read1)
if read2 is not None:
command += " {} ".format(read2)
if orientation == "none":
command += " {} ".format(read1)

common.print_command(command)
if not common.check_dryrun(sample_config) and not os.path.exists("histogram.hist"):
ABySS_Kmer_stdOut = open("ABySS_Kmer_Folder.stdOut", "a")
ABySS_Kmer_stdErr = open("ABySS_Kmer_Folder.stdErr", "a")
returnValue = subprocess.call(command, shell=True, stdout=ABySS_Kmer_stdOut, stderr=ABySS_Kmer_stdErr)
if returnValue > 0:
print "ABySS kmer plotting failed: unkwnown reason"
else :
subprocess.call(("rm", "preUnitgs.fa"))
_plotKmerPlot(1,200, kmer, "kmer_coverage_1_200.png")
_plotKmerPlot(1,500, kmer, "kmer_coverage_1_500.png")
_plotKmerPlot(15,200, kmer, "kmer_coverage_15_200.png")
_plotKmerPlot(15,500, kmer, "kmer_coverage_15_500.png")

os.chdir("..")
sample_config["abyss"] = ABySS_Kmer_Folder
return sample_config

def _plotKmerPlot(min_limit, max_limit,kmer, output_name):
Kmer_histogram = pd.io.parsers.read_csv("histogram.hist", sep='\t', header=None)
Kmer_coverage = Kmer_histogram[Kmer_histogram.columns[0]].tolist()
Kmer_count = Kmer_histogram[Kmer_histogram.columns[1]].tolist()
Kmer_freq = [Kmer_coverage[i]*Kmer_count[i] for i in range(len(Kmer_coverage))]
kmer_freq_peak = Kmer_freq.index(max(Kmer_freq[min_limit:max_limit])) #coverage peak, disregarding initial peak
kmer_freq_peak_value=max(Kmer_freq[min_limit:max_limit])

xmax = max_limit
ymax = kmer_freq_peak_value + (kmer_freq_peak_value*0.30)

plt.plot(Kmer_coverage, Kmer_freq)
plt.title("K-mer length = {}".format(kmer))
plt.xlim((0,xmax))
plt.ylim((0,ymax))
plt.vlines(kmer_freq_peak, 0, kmer_freq_peak_value, colors='r', linestyles='--')
plt.text(kmer_freq_peak, kmer_freq_peak_value+2000, str(kmer_freq_peak))
plotname = "{}".format(output_name)
plt.savefig(plotname)
plt.clf()
return 0



def _run_trimmomatic(global_config, sample_config, sorted_libraries_by_insert):
print "running trimmomatic ..."

program = global_config["Tools"]["trimmomatic"]["bin"]
program_folder = os.path.dirname(program)
adapterFile = sample_config["adapters"]

if not os.path.exists(adapterFile):
print "Trimmomatic cannot be run as adapter file is not specified or points to unknown position: {}".format(adapterFile)
return sample_config

rootDir = os.getcwd()
trimmomaticDir = os.path.join(rootDir, "Trimmomatic")
if not os.path.exists(trimmomaticDir):
os.makedirs(trimmomaticDir)
else:
print "trimmomatic assumed already run"
return sample_config #

os.chdir(trimmomaticDir)
#now I am in running dir, I need to process one by one the libraries
threads = 8
if "threads" in sample_config:
threads = sample_config["threads"]

for library, libraryInfo in sorted_libraries_by_insert:
read1=libraryInfo["pair1"]
read2=libraryInfo["pair2"]
orientation = libraryInfo["orientation"]
if orientation=="innie":
if read2 is not None:
read1_baseName = os.path.split(read1)[1].split(".")[0]
read2_baseName = os.path.split(read2)[1].split(".")[0]
output_read1_pair = os.path.join(trimmomaticDir, "{}.fastq.gz".format(read1_baseName))
output_read1_sing = os.path.join(trimmomaticDir, "{}_u.fastq.gz".format(read1_baseName))
output_read2_pair = os.path.join(trimmomaticDir, "{}.fastq.gz".format(read2_baseName))
output_read2_sing = os.path.join(trimmomaticDir, "{}_u.fastq.gz".format(read2_baseName))
command = ["java", "-jar", program, "PE", "-threads", "{}".format(threads), "-phred33", read1, read2, output_read1_pair ,output_read1_sing , output_read2_pair, output_read2_sing ,"ILLUMINACLIP:{}:2:30:10".format(adapterFile), "MINLEN:30" ]
#print ' '.join(map(str,command))
stdOut = open("{}_trimmomatic.stdOut".format(read1_baseName), "w")
stdErr = open("{}_trimmomatic.stdErr".format(read1_baseName), "w")

returnValue = subprocess.call(command, stdout=stdOut, stderr=stdErr) # run the program
if returnValue != 0:
print "error while running command: {}".format(command)
else:
libraryInfo["pair1"] = output_read1_pair
libraryInfo["pair2"] = output_read2_pair

os.chdir(rootDir)
print "trimmomatic done"
print sample_config
return sample_config


Binary file added de_novo_scilife/QCcontrol.pyc
Binary file not shown.
Empty file added de_novo_scilife/__init__.py
Empty file.
Binary file added de_novo_scilife/__init__.pyc
Binary file not shown.
Loading

0 comments on commit 3870606

Please sign in to comment.