forked from vezzi/NouGAT
-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathprepare_de_novo_assembly.py
123 lines (102 loc) · 6.19 KB
/
prepare_de_novo_assembly.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import sys, os, yaml, glob
import subprocess
import argparse
def main(args):
    """Write one assembly YAML config per (sample, assembler) and optionally submit it.

    Every sub-directory of ``args.sample_data_dir`` is treated as one sample.
    For each sample a folder with the same name is created under the current
    working directory, and for each requested assembler a file
    ``<sample>_<assembler>.yaml`` is written into it. When
    ``args.global_config`` is given, the requested assemblers are validated
    against it first and every generated config is handed to ``submit_job``.

    Args:
        args: parsed command line namespace; the attributes read here are
            ``assemblers``, ``sample_data_dir``, ``global_config``,
            ``genomeSize``, ``kmer``, ``orientation``, ``insert``, ``std``
            and ``project``.

    Raises:
        SystemExit: if a sample directory holds more than one read-1 or more
            than one read-2 file.
    """
    workingDir = os.getcwd()
    # --assemblers is declared with action='append' and nargs='+', so it
    # arrives as a list of lists (or None when omitted); flatten it.
    assemblers = sum(args.assemblers or [], [])
    # Resolve once, before any chdir below, so that a relative path keeps
    # pointing at the same directory.
    samples_data_dir = os.path.abspath(args.sample_data_dir)

    # Validation needs the global configuration; without one we only
    # generate the YAML files and submit nothing.
    if args.global_config is not None:
        checkSupportedAssemblers(assemblers, args.global_config)

    sample_dir_names = sorted(
        name for name in os.listdir(samples_data_dir)
        if os.path.isdir(os.path.join(samples_data_dir, name))
    )
    for sample_dir_name in sample_dir_names:
        assembly_folder = os.path.join(workingDir, sample_dir_name)
        if not os.path.exists(assembly_folder):
            os.makedirs(assembly_folder)

        # The input files do not depend on the assembler: look them up once.
        sample_data_dir = os.path.join(samples_data_dir, sample_dir_name)
        pair1_file, pair2_file, single = _find_read_files(sample_data_dir, sample_dir_name)

        # submit_job() places the slurm script in the current directory, so
        # run the per-assembler work from inside the sample's folder.
        os.chdir(assembly_folder)
        try:
            for assembler in assemblers:
                sample_YAML_name = os.path.join(
                    assembly_folder, "{}_{}.yaml".format(sample_dir_name, assembler))
                _write_sample_config(sample_YAML_name, assembler, sample_dir_name,
                                     args, pair1_file, pair2_file, single)
                if args.global_config is not None:
                    submit_job(sample_YAML_name, args.global_config,
                               sample_dir_name, args.project, assembler)
        finally:
            # Always go back, also on error, so the next sample folder is
            # created next to (not inside) this one.
            os.chdir(workingDir)


def _find_read_files(sample_data_dir, sample_dir_name):
    """Return ``(pair1, pair2, single)`` full paths found in *sample_data_dir*.

    A file is read 1 if its name contains ``_R1_`` or ``_1.fastq.gz``, read 2
    if it contains ``_R2_`` or ``_2.fastq.gz``, and a single-end/merged file
    if it contains ``merged`` or ``single``. Entries that were not found are
    returned as empty strings. Exits when read 1 or read 2 matches twice.
    """
    pair1_file = ""
    pair2_file = ""
    single = ""
    for file_name in sorted(os.listdir(sample_data_dir)):
        file_path = os.path.join(sample_data_dir, file_name)
        if not os.path.isfile(file_path):
            continue
        if "_R1_" in file_name or "_1.fastq.gz" in file_name:
            if pair1_file:
                sys.exit("Error: processing sample {} found more that one library/run for read 1".format(sample_dir_name))
            pair1_file = file_path
        elif "_R2_" in file_name or "_2.fastq.gz" in file_name:
            if pair2_file:
                sys.exit("Error: processing sample {} found more that one library/run for read 2".format(sample_dir_name))
            pair2_file = file_path
        elif "merged" in file_name or "single" in file_name:
            # Several matches are tolerated; the last one (in sorted order) wins.
            single = file_path
    return pair1_file, pair2_file, single


def _write_sample_config(path, assembler, sample_dir_name, args, pair1_file, pair2_file, single):
    """Write the sample configuration for one assembler to *path*.

    The fields of each library are indented one level deeper than the
    ``lib1:``/``lib2:`` key so that they are parsed as children of that key.
    """
    lines = [
        "pipeline:\n",
        " assemble\n",
        "tools:\n",
        " [{}]\n".format(assembler),
        "genomeSize: {}\n".format(args.genomeSize),
        "kmer: {}\n".format(args.kmer),
        "output: {}\n".format(sample_dir_name),
        "libraries:\n",
        " lib1:\n",
    ]
    if pair1_file:
        lines.append("  pair1: {}\n".format(pair1_file))
    if pair2_file:
        lines.append("  pair2: {}\n".format(pair2_file))
    lines.append("  orientation: {}\n".format(args.orientation))
    lines.append("  insert: {}\n".format(args.insert))
    lines.append("  std: {}\n".format(args.std))
    if single != "":
        # The single-end/merged reads go into a second library with no mate.
        lines.extend([
            " lib2:\n",
            "  pair1: {}\n".format(single),
            "  pair2:\n",
            "  orientation: none\n",
            "  insert: 0\n",
            "  std: 0\n",
        ])
    # The context manager closes (and flushes) the file before the job that
    # reads it is submitted.
    with open(path, 'w') as sample_YAML:
        sample_YAML.writelines(lines)
def checkSupportedAssemblers(assemblers, global_config):
    """Exit unless every requested assembler is listed in the global configuration.

    Args:
        assemblers: iterable of assembler names requested on the command line.
        global_config: path to the global configuration YAML file; its
            top-level ``Tools`` mapping is taken as the set of supported tools.

    Raises:
        SystemExit: at the first assembler missing from ``Tools``, after the
            supported names have been printed.
    """
    with open(global_config) as in_handle:
        # safe_load builds plain Python objects only; yaml.load without an
        # explicit Loader can construct arbitrary objects and is rejected by
        # recent PyYAML releases.
        config = yaml.safe_load(in_handle)
    # An empty file parses to None and "Tools" may be absent: treat both as
    # "nothing supported" so the user gets the message below, not a traceback.
    supported_tools = (config or {}).get("Tools") or {}
    for assembler in assemblers:
        if assembler not in supported_tools:
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print("assembler {} not supported. Supported assemblers specified in the global configuration are:".format(assembler))
            # List the same section the membership test uses.
            # NOTE(review): this used to iterate over a top-level "assemble"
            # key while testing against "Tools" -- confirm "Tools" is the
            # intended section.
            for supported_assembler in supported_tools:
                print(supported_assembler)
            sys.exit("Error: assembler {} not supported".format(assembler))
def submit_job(sample_config, global_config, sample_name, project, assembler):
    """Write a SLURM batch script for one sample/assembler and submit it with sbatch.

    The script is written to ``<sample_name>_<assembler>.slurm`` in the
    current working directory and runs ``deNovo_pipeline.py`` with the given
    global and sample configuration files.

    Args:
        sample_config: path to the sample YAML configuration.
        global_config: path to the global YAML configuration.
        sample_name: sample identifier, used in the script, log and job names.
        project: account passed to ``#SBATCH -A``.
        assembler: assembler name, used in the script, log and job names.
    """
    workingDir = os.getcwd()
    job_name = "{}_{}".format(sample_name, assembler)
    slurm_file = os.path.join(workingDir, "{}.slurm".format(job_name))
    script_lines = [
        "#! /bin/bash -l\n",
        # All #SBATCH directives must precede the first executable line:
        # sbatch stops reading directives as soon as it meets a command.
        "#SBATCH -A {}\n".format(project),
        "#SBATCH -o {}.out\n".format(job_name),
        "#SBATCH -e {}.err\n".format(job_name),
        "#SBATCH -J {}.job\n".format(job_name),
        "#SBATCH -p node -n 8\n",
        "#SBATCH -t 05:00:00\n",
        # NOTE(review): this address looks like a redacted placeholder --
        # confirm the real recipient before relying on mail notifications.
        "#SBATCH --mail-user [email protected]\n",
        "#SBATCH --mail-type=ALL\n",
        # First executable line, deliberately after the directives.
        "set -e\n",
        "\n\n",
        # NOTE(review): the abyss module is loaded whatever the assembler is.
        "module load abyss/1.3.5\n",
        "python ~/DE_NOVO_PIPELINE/de_novo_scilife/script/deNovo_pipeline.py --global-config {} --sample-config {}\n\n".format(global_config, sample_config),
    ]
    # Close the script before handing it to sbatch.
    with open(slurm_file, "w") as slurm_handle:
        slurm_handle.writelines(script_lines)
    command = ("sbatch", slurm_file)
    return_code = subprocess.call(command)
    if return_code != 0:
        # Keep going with the remaining jobs, but do not hide the failure.
        sys.stderr.write("Warning: sbatch exited with status {} for {}\n".format(return_code, slurm_file))
if __name__ == '__main__':
    # Command line entry point: collect the options and hand them to main().
    parser = argparse.ArgumentParser()
    # main() cannot do anything without the sample directory and at least one
    # assembler, so let argparse report a missing one instead of a traceback.
    parser.add_argument('--sample-data-dir', required=True, help="full path to directory containing one folder per sample. Each sample contaains only one library (i.e., one PE lib)", type=str)
    parser.add_argument('--genomeSize', help="genome size", type=str)
    parser.add_argument('--orientation', help="I assume I am working only with PE (if not manual editing is needed)", type=str)
    parser.add_argument('--insert', help="I assume that all samples have the same insert (if not manual editing is needed)", type=str)
    parser.add_argument('--std', help="I assume tha all sample have same std (if not manual editing is needed)", type=str)
    parser.add_argument('--kmer', help="kmer to use", type=str)
    parser.add_argument('--global-config', help="path to the global configuration YAML file listing the supported tools; it is passed to every submitted job")
    # action='append' combined with nargs='+' yields a list of lists, which
    # main() flattens; the flag may therefore be repeated.
    parser.add_argument('--assemblers', required=True, action='append', nargs='+', help="List of assemblers to be employed on the datasets specified")
    parser.add_argument('--project', help="UPPMAX project to use", default="b2010029")
    args = parser.parse_args()
    main(args)