-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgene_parser_path.py
177 lines (161 loc) · 6.79 KB
/
gene_parser_path.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#!/usr/bin/env python
'''These functions can be used together to make lists of sequences for all genes
in a master file of blast outputs. The master file should be created using the
script makemasterDB.py, which concatenates a bunch of blast datasets into one
csv file. The file MUST be csv. The gene_parser_csv2.py script
must be executed from within a folder that contains the blastDatabase.txt file
and all trinity files that correspond to that master blast file. As is, this
program has to be executed from the python interpreter. If you rerun this file
it will replace any fasta files with the same output names.'''
#import pylauncher
import sys
import re
import os
import csv
import time
print (time.strftime("%d/%m/%Y"))
print (time.strftime("%H:%M:%S"))
def parsefile(filename):
'''This function parses a tab-delimited file from R into a list of lists.'''
with open(filename,'rU') as f:
reader=csv.reader(f)
d=list(reader)
return d
#parsefile('testfile.txt') #to test
def makelist(filename,column):
'''This function takes a parsed database and a column number and returns a
list of unique items found in that column. It is intended to be used to
make a list of unique genes in the database.'''
genelist=[]
for line in filename:
#print line
if line[column] not in genelist:
genelist.append(line[column])
length=len(genelist)
#print genelist
date=time.strftime("%Y-%m-%d")
output_file=date+'_genelist.txt'
with open(output_file, 'w') as f:
for item in genelist:
if '\t' in item:
item.strip('\t')
f.writelines('%s\n' %item)
return genelist, length
#filelist=parsefile('testfile.txt')
#makelist(filelist,12)
#filelist=parsefile('2014-03-14_tcdb_blast.csv')
#(genelist,length)=makelist(filelist,12)
def rowfinder(genelist,database,column):
'''This function takes a genelist (from makelist()), the parsed database that
genelist was made from, and the column where the genename was and
returns a dictionary containing subsetted databases (value) for each
gene (key)'''
allDB={}
for gene in genelist:
#print gene
newDB=[]
for line in database:
if gene == line[column]:
newDB.append(line)
#print newDB
allDB[gene]=newDB
print 'The genes in this file are:\n'
for item in allDB:
print item
return allDB
#filelist=parsefile('testfile.txt')
#(genelist,length)=makelist(filelist,12)
#rowfinder(genelist,filelist,12)
def makedict(allDB,assemtype):
'''This function takes a dictionary of genes (keys) and subsetted databases
corresponding to each gene (values). It returns a dictionary that pairs
each gene with a list of sequences that need to be pulled from trinity files.'''
trinities={}
for gene in allDB: #for each gene in the file
filePairs=[]
for line in allDB[gene]: #for line in each subsetted database, find query and filename
if assemtype == "SOAP":
try:
assembly=line[13].strip('"')+'_'+line[14][line[14].index('P')+1:line[14].index('_')].strip("'")+'_out.contig'
except:
print "unable to pull from %s, %s" %(line[13],line[14])
elif assemtype == 'Trinity':
assembly='Trinity_'+line[13].strip('"')+'.fasta' #writes filename of trinity file
sequence=line[0].strip('"')
print assembly
#print sequence,gene
filePairs.append((assembly,sequence)) #make list of trinity and sequence pairs for each gene
#print 'The sequences found for the gene %s are %s' %(gene,filePairs)
#print
trinities[gene]=filePairs #add list of pairs to dictionary, with gene name as key
#print 'The trinities for the gene %s are %s' %(gene,trinities)
print '\n***I have determined which sequences should be pulled for each gene.***'
print 'The sequences found for each of the following genes are: \n'
for item in trinities: #prints dictionary
print item+':'
for seqs in trinities[item]:
print seqs
print '%d sequences total' %(len(trinities[item]))
print
return trinities
#filelist=parsefile('testfile.txt')
#(genelist,length)=makelist(filelist,12)
#allDB=rowfinder(genelist,filelist,12)
#newDict=makedict(allDB)
from Bio import SeqIO
def pullSequences(dictionary, directory,assemtype):
'''This function takes the dictionary which lists the filename and sequences
to be pulled for each gene. It creates a new fasta file for each gene with
lists of sequences pulled from all blast files that aligned to that gene.'''
print '***Now pulling sequences.***'
for gene in dictionary:
fastaList=[]
print '--------------------------------------------------'
print 'Pulling sequences for %s: \n' %(gene)
for item in dictionary[gene]: #for each sequence listed for a gene
filename=item[0]
sequence=item[1]
if assemtype == "Trinity":
tplace=filename.index('_') #finds location of _ in file name
eplace=filename.index('.') #finds location of . in file name
tissue=filename[(tplace+1):eplace] #finds tissue name in file name
elif assemtype == "SOAP":
tissue=filename.split('_')[0]
#print ' Pulling %s from %s' %(sequence,filename)
#print
for record in SeqIO.parse(open(filename, 'rU'),'fasta'): #all files must be in folder
if record.id == sequence: #find the sequence in the right file
record.description = tissue+'_'+record.description
fastaList.append(record) #copy to a list
break
print 'The %d sequences pulled for the gene %s are:\n' %(len(fastaList),gene)
for item in fastaList:
print item
print
while '/' in gene: #this removes '/' from file names
loc=gene.index('/')
gene1=gene[:loc]
gene2=gene[(loc+1):]
gene=gene1+'--'+gene2
if len(gene) > 220:
gene=gene[:220]
gene=re.sub(' ', '_', gene)
output_file=gene+'.fasta'
fullpath=os.path.join(directory,output_file)
with open(fullpath, 'w') as f:
SeqIO.write(fastaList, f, "fasta") #export to a file
f.close()
#if __name__ == '__main__':
def main():
print 'It seems to be working'
filename=sys.argv[1]
column=int(sys.argv[2])
assemtype=sys.argv[4]
filelist=parsefile(filename)
(genelist,length)=makelist(filelist,column)
allDB=rowfinder(genelist,filelist,column)
newDict=makedict(allDB,assemtype)
pullSequences(newDict,sys.argv[3],assemtype)
print (time.strftime("%d/%m/%Y"))
print (time.strftime("%H:%M:%S"))
main()