forked from xenobiolab/xenomorph
-
Notifications
You must be signed in to change notification settings - Fork 0
/
xenomorph.py
354 lines (244 loc) · 16.1 KB
/
xenomorph.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
##############################
##Xombo - An XNA toolkit ###
##############################
import argparse, textwrap
import os
import sys
from lib.xm_tools import *
from lib.xm_params import *
parser = argparse.ArgumentParser(
usage='"python xenomorph.py [-h] {preprocess, morph, extract, models, fasta2x}',
formatter_class=argparse.RawTextHelpFormatter,
description= textwrap.dedent('''
_
| |
__ __ ___ _ __ ___ _ __ ___ ___ _ __ _ __ | |__
\ \/ / / _ \| '_ \ / _ \ | '_ ` _ \ / _ \ | '__|| '_ \ | '_ \
> < | __/| | | || (_) || | | | | || (_) || | | |_) || | | |
/_/\_\ \___||_| |_| \___/ |_| |_| |_| \___/ |_| | .__/ |_| |_|
| |
|_|
********** Xenomorph : An XNA sequencing toolkit *********
Xenomorph is a suite of tools used for nanopore sequencing of alternative basepairs (XNAs). This toolkit incorporates ONT-workflows to preprocess fast5 files and extract signal levels for kmers in a sequence. Models parameterized on XNA basepairs can then be used to test if signal levels match XNA pairs. Xenomorph relies on kmer models that were parameterized using libraries of XNA-containing DNA. The general pipeline consists of two steps: 1) preprocessing fast5 reads and extracting level information and 2) basecalling using a selected kmer model.
Xenomorph command groups (additional help available within each command group):
preprocess [Preprocess] fast5 reads with reference fasta containing XNAs for 'morph' or 'de-novo'
morph [Basecall] Use kmer levels identified with preprocess to basecall XNA position based on alternative hypothesis testing (per read)
extract [Utility] Extracts raw signal in region associated with XNA bases.
stats [Utility] Calculate global concensus basecalls from per-read output file and generate summary output
fasta2x [Utility] Converts fasta file with XNAs in sequence (e.g. BSPZKXJV) to fasta file with XNA positional information in header
models [Utility] View summary of active and inactive kmer models that can be used for basecalling or activate models
'''))
subparsers = parser.add_subparsers(dest='subparsers')
#Preprocess
parser_preprocess = subparsers.add_parser('preprocess', help='[-w working_dir] [-f fast5_dir] [-r reference_fasta]')
parser_preprocess.add_argument('-f',metavar ='[fast5_dir]', type=str,required = True, help='Input directory containing multi-fast5 folders.')
parser_preprocess.add_argument('-w',metavar = '[working_dir]', type=str,required = True, help='Working directory for storing analysis pipeline temp files and outputs.')
parser_preprocess.add_argument('-r',metavar = '[reference_fasta]', type=str, required = True, help='Fasta (.fa, .fasta) file of sequence or sequences with XNAs (e.g. BSPZKXJV) in sequence.')
parser_preprocess.add_argument('-t',metavar = '[tombo_index_id]', type=str, required = False, help='Store resquiggle results with the following ID (default 000)')
parser_preprocess.add_argument('-o',metavar = '[output_file]', type=str, required = False, help='Specify path of output file (.csv) for level extraction (optional; default = working_dir/level_output_summary.csv)')
parser_preprocess.add_argument('-b',action = 'store_true', help='Use guppy to add basecall reads to file (optional).')
parser_preprocess.add_argument('-x',action = 'store_true', help='Overwrite all temp analysis files and force re-analysis (optional).')
#Morph
parser_morph = subparsers.add_parser('morph', help='[-l level_summary_file] [-m model_input] [-o output_summary_file] [-g global_basecalling]')
parser_morph.add_argument('-l',metavar = '[level_summary_file]', type=str,required = True, help='Working directory for storing analysis pipeline temp files and outputs.')
parser_morph.add_argument('-m',metavar = '[model_file]', type=str, required = True, help='Single letter abbreviations (e.g. ATGCBS) or specify kmer model file (.csv).')
parser_morph.add_argument('-o',metavar = '[output_file]', type=str, help='Output file name (optional; default output_summary_basecalls.csv)')
parser_morph.add_argument('-g',action = 'store_true', help='Perform global basecalling (across all reads) instead of per-read base calling.')
parser_morph.add_argument('-n',action = 'store_true', help='Perform global null basecalling using reads without XNAs.')
#Extract
parser_extract = subparsers.add_parser('extract', help='[-w working_dir] [-f fast5_dir] [-r reference_fasta] [-l level_summary_file] [-o output_raw_file]')
parser_extract.add_argument('-w',metavar = '[working_dir]', type=str,required = True, help='Working directory for storing analysis pipeline temp files and outputs.')
parser_extract.add_argument('-f',metavar ='[fast5_dir]', type=str,required = True, help='Input directory containing multi-fast5 folders.')
parser_extract.add_argument('-r',metavar = '[reference_fasta]', type=str, required = True, help='Fasta (.fa, .fasta) file of sequence or sequences with XNAs (e.g. BSPZKXJV) in sequence.')
parser_extract.add_argument('-l',metavar = '[level_summary_file]', type=str,required = True, help='Output file (.csv) from preprocess that contains normalized levels of each read.')
parser_extract.add_argument('-o',metavar = '[output_file]', type=str, required = True, help='Output file name for raw reads (optional; default output_levels_raw.csv)')
#Stats
parser_stats = subparsers.add_parser('stats', help='[-i input_bc_file]')
parser_stats.add_argument('-i',metavar ='[input_bc_file]', type=str,required = True, help='Calculate global concensus basecall from per-read output file (generated by morph).')
#Fasta2x
parser_fasta2x = subparsers.add_parser('fasta2x', help='[-i fasta_input] [-o xfasta_output]')
parser_fasta2x.add_argument('fasta_input',metavar ='[fasta_input]', type=str, help='Input fasta file containing XNAs in sequence.')
parser_fasta2x.add_argument('xfasta_output', metavar ='[xfasta_output]', type=str, help='Output fasta file with XNA positional information in header and gapped sequences.')
#Models
parser_models = subparsers.add_parser('models', help='[-s active/inactive/all] [-a base_abbreviation]')
parser_models.add_argument('-a',metavar ='[abbreviation]', type=str,required = False, help='Activate model params for XNA with the given abbreviation (e.g. -a Bn to activate Bn, deactivate Bc).')
parser_models.add_argument('-s',metavar = '[active/inactive/all]', type=str, required = False, help='Show model descriptions of active, inactive, or all kmer models.')
args = parser.parse_args()
args.subparsers
#Print help if no arguments provided
if args.subparsers==None:
parser.print_help()
sys.exit(0)
#Parse arguments
if args.subparsers == 'fasta2x' :
os.system('python lib/xm_fasta2x_rc.py '+args.fasta_input+' '+args.xfasta_output)
if args.subparsers == 'morph':
if os.path.exists(args.l)==False:
print("Xenomorph Status - [Error] Level-extracted input file not found. Check path or run 'xenomorph preprocess' to generate")
sys.exit()
if sum(1 for line in open(args.l)) <=1:
print("Xenomorph Status - [Error] No reads found in level extraction input file. Ensure 'xenomorph preprocess' ran successfully")
sys.exit()
level_summary_file = args.l
#load model fn
if '.csv' in args.m:
model_fn = args.m
elif all(elem in 'ATGCBSPZJVKX' for elem in args.m):
print('Xenomorph Status - [Morph] Compiling model from input model base abbreviations.')
#Parse model, get paths of each model file component
mf = parse_model_files(args.m,True)
#Compile model from file paths into a master kmer model
km = compile_model(mf, args.m)
#Save model
model_fn=os.path.normpath(args.l).replace(os.path.basename(args.l),'')+'active_kmer_model.csv'
km.to_csv(model_fn, index=False)
if args.o:
output_fn=args.o
else:
output_fn=os.path.normpath(args.l).replace(os.path.basename(args.l),'')+'/output_summary_basecalled.csv'
if args.g:
if args.n:
print("Xenomorph Status - [Morph Global Null] Testing null hypothesis of XNA basecalls by grouping reads that map to the same region (global).")
cmd = 'python lib/global_null_morph.py '+model_fn+' '+level_summary_file+' '+output_fn
os.system(cmd)
else:
print("Xenomorph Status - [Global Morph] Testing alternative hypothesis of XNA basecalls by grouping reads that map to the same region (global).")
cmd = 'python lib/global_morph.py '+model_fn+' '+level_summary_file+' '+output_fn
os.system(cmd)
elif args.n:
print("Xenomorph Status - [Morph Null] Testing null hypothesis of XNA basecalls by grouping reads that map to the same region (global).")
cmd = 'python lib/null_morph.py '+model_fn+' '+level_summary_file+' '+output_fn
os.system(cmd)
else:
print("Xenomorph Status - [Morph] Testing alternative hypothesis of XNA basecalls on individual reads (per-read).")
cmd = 'python lib/morph.py '+model_fn+' '+level_summary_file+' '+output_fn
os.system(cmd)
elif args.subparsers == 'extract':
print("Xenomorph Status - [Extract] Extracting raw signal in region associated with XNA level measurements.")
working_dir = os.path.normpath(args.w)
CHECK_FOLDER = os.path.isdir(args.w)
if not CHECK_FOLDER:
print("Xenomorph Status - [Error] Working directory "+args.w+" not found.")
print("Xenomorph Status - [Error] Run xenomorph.py preprocess first to generate directories.")
if os.path.exists(args.l)==False:
print("Xenomorph Status - [Error] Could not locate file: "+args.l)
print("Xenomorph Status - [Error] Level-extracted input file not found. Check path or run 'xenomorph preprocess' to generate.")
sys.exit()
level_summary_file = args.l
if not check_xfasta_format(args.r, standard_bases):
xfasta=os.path.normpath(args.w)+'/xfasta_'+os.path.basename(args.r)
if args.o:
output_fn=args.o
fast5_sing_dir=os.path.normpath(working_dir)+'/'+os.path.basename(args.f)+'_single'
xfasta=os.path.normpath(args.w)+'/xfasta_'+os.path.basename(args.r)
cmd = 'python lib/xm_get_raw_signal.py '+fast5_sing_dir+' '+xfasta+' '+level_summary_file+' '+output_fn
os.system(cmd)
elif args.subparsers == 'preprocess':
level_summary_file = args.l
working_dir = os.path.normpath(args.w)
#Create a working directory if it does not existW
CHECK_FOLDER = os.path.isdir(args.w)
if not CHECK_FOLDER:
os.makedirs(working_dir)
print("Xenomorph Status - [Preprocess] Creating output working directory : "+working_dir)
#Convert reference fasta to xfasta
if os.path.exists(args.r)==False:
print("Xenomorph Status - [Error] Fasta file input not found.")
sys.exit()
if not check_xfasta_format(args.r, standard_bases):
xfasta=os.path.normpath(args.w)+'/xfasta_'+os.path.basename(args.r)
if not os.path.exists(xfasta) or args.x:
print("Xenomorph Status - [Preprocess] Performing fasta to xfasta conversion on input reference fasta. Saving output "+xfasta+'.')
print("Xenomorph Status - [Preprocess] Using fasta2x_rcf.")
cmd = 'python lib/xm_fasta2x_rcf.py '+os.path.normpath(args.r)+' '+xfasta
#cmd = 'python lib/xm_fasta2x_rc.py '+os.path.normpath(args.r)+' '+xfasta
os.system(cmd)
else:
print("Xenomorph Status - [Preprocess] xfasta file found in working directory. Skipping xfasta conversion.")
else:
xfasta=args.r
print("Xenomorph Status - [Preprocess] Input reference fasta already in xfasta format. Skipping xfasta conversion.")
#check for creation of a reverse fasta file - required for some XNAs for segmentation
if os.path.exists(xfasta[0:xfasta.find('.fa')]+'_rc.fa')==True:
print("Xenomorph Status - [Preprocess] xfasta reverse complement created for handling special bases.")
xfasta_rc = xfasta[0:xfasta.find('.fa')]+'_rc.fa'
###Multi to single and merge single folders
fast5_sing_dir=os.path.normpath(working_dir)+'/'+os.path.basename(args.f)+'_single'
CHECK_FAST5_SINGLE = os.path.isdir(fast5_sing_dir)
if not CHECK_FAST5_SINGLE:
print("Xenomorph Status - [Preprocess] Converting multi fast5 to single fast5 using ONT tools.")
cmd = 'python xombo.py split '+os.path.normpath(args.f)+' '+fast5_sing_dir
os.system(cmd)
else:
print("Xenomorph Status - [Preprocess] Single fast5 directory found in working directory. Skipping multi-to-single conversion.")
#Check that fast5 multi-to-single worked properly. If not, exit with error message to check paths.
CHECK_FAST5_SINGLE_N = sum('.fast5' in s for s in os.listdir(fast5_sing_dir))
if CHECK_FAST5_SINGLE_N > 0:
print("Xenomorph Status - [Preprocess] Found "+str(CHECK_FAST5_SINGLE_N)+' single fast5 files in directory')
else:
print("Xenomorph Status - [Error] Single fast5 files not found. Check paths to ensure multi-to-single fast5 conversion is working properly.")
sys.exit()
###Rebasecall - required for resquiggling and assignment
if args.b == True:
fastq_dir=os.path.normpath(args.w)+'/'+os.path.basename(args.f)+'_fastq'
print("Xenomorph Status - [Preprocess] Using guppy to basecall single fast5 files.")
if use_reference_to_basecall==True:
cmd = 'python xombo.py basecall '+fast5_sing_dir+' '+fastq_dir+' -r '+xfasta
else:
cmd = 'python xombo.py basecall '+fast5_sing_dir+' '+fastq_dir
os.system(cmd)
###Assign reads
print("Xenomorph Status - [Preprocess] Using tombo preprocess to assign fastq basecalls to fast5 files.")
cmd = 'python xombo.py preprocess '+fast5_sing_dir+' '+fastq_dir
os.system(cmd)
else:
print("Xenomorph Status - [Preprocess] No basecall flag (-b) entered. Proceeding with resquiggle.")
#Set up index group from -t flag
if args.t:
index_group = args.t
rc_index_group = args.t+'r'
else:
index_group = ''
rc_index_group = 'r'
#Resquiggle if needed
tombo_index= os.path.normpath(args.w)+'/.'+os.path.basename(fast5_sing_dir)+'.RawGenomeCorrected_000'+index_group+'.tombo.index'
if not os.path.exists(tombo_index) or args.x:
print("Xenomorph Status - [Preprocess] Using tombo to resquiggle fast5 reads using xfasta reference.")
cmd = 'python xombo.py resquiggle '+fast5_sing_dir+' '+os.path.normpath(xfasta)+' -g "'+index_group+'"'
os.system(cmd)
if os.path.exists(xfasta[0:xfasta.find('.fa')]+'_rc.fa')==True:
print("Xenomorph Status - [Preprocess] Using tombo to resquiggle fast5 reads using rc-xfasta reference.")
cmd = 'python xombo.py resquiggle '+fast5_sing_dir+' '+os.path.normpath(xfasta_rc)+' -g '+rc_index_group
os.system(cmd)
else:
print("Xenomorph Status - [Preprocess] Resquiggle file found. Skipping and proceeding to level extraction. Use (-x) flag to force resquiggle with new xfasta file.")
if args.o:
level_output_fn=os.path.normpath(args.w)+'/'+args.o
else:
level_output_fn= os.path.normpath(args.w)+'/output_levels_summary.csv'
print("Xenomorph Status - [Preprocess] Performing level extraction surrounding XNA locations.")
cmd = 'python lib/xm_get_levels.py '+fast5_sing_dir+' '+os.path.normpath(xfasta)+' '+level_output_fn+' '+index_group
os.system(cmd)
if os.path.exists(xfasta[0:xfasta.find('.fa')]+'_rc.fa')==True:
print("Xenomorph Status - [Preprocess] Performing level extraction on surrounding XNA locations with reverse set.")
cmd = 'python lib/xm_get_levels.py '+fast5_sing_dir+' '+os.path.normpath(xfasta_rc)+' '+level_output_fn+' '+rc_index_group
os.system(cmd)
print("Xenomorph Status - [Preprocess] Saving output level file to "+level_output_fn)
print("Xenomorph Status - [Preprocess Complete] Use 'xenomorph.py morph' for alternative hypothesis testing on XNA positions.")
#Model handling
elif args.subparsers == 'models':
if args.a:
try:
activate_model(args.a)
model_summary('all')
print("Xenomorph Status - [Models] "+args.a+" successfully set as active model within group. Other models sharing same single letter code inactivated.")
except:
print("Xenomorph Status - [Error] Error activating model. Check proper abbreivation was used, or manually activate by editing models/config_models.csv.")
if args.s:
model_summary(args.s)
elif args.subparsers == 'stats':
try:
cmd = 'python lib/xm_stats.py '+args.i
os.system(cmd)
except:
print("Xenomorph Status - [Error] Stats failed. Ensure file path to basecalled output.csv is correct, or run xenomorph morph to generate new basecall file.")