-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpullseqs_byname.py
54 lines (46 loc) · 1.41 KB
/
pullseqs_byname.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 7 14:27:49 2014
@author: RDT
"""
from Bio import SeqIO
import csv
import sys
def get_seqs(seqfile, seqs_to_pull, outfile):
'''Open fasta infile and return iterator of SeqRecords with protein sequences.'''
records = SeqIO.parse(seqfile, 'fasta')
seqlist=[]
matchlist=[]
organism=seqfile.split('_')[0]
# with open(seqs_to_pull,'rU') as f:
# for line in f:
# matchlist.append(line)
# print matchlist
matchlist=seqs_to_pull
for rec in records:
# for item in matchlist:
# print item
if matchlist == rec.id:
if len(rec) > 160:
rec.description+=' '
rec.description+=seqfile
rec.id = organism+'_'+rec.id
seqlist.append(rec)
print "-- %s appended" %matchlist
else:
print "%s not appended because length (%d) was too short" %(matchlist, len(rec))
else:
continue
with open(outfile, 'a') as f:
SeqIO.write(seqlist, f, 'fasta')
def main():
assert len(sys.argv) == 4, "usage: python pullseqs_byname.py <sequences.fasta> <sequence_names.txt> <genename>"
infile = sys.argv[1]
print "infile is", infile
wanted_sequences = sys.argv[2]
print "sequences that will be pulled are", wanted_sequences
genename = sys.argv[3]
outfile = genename+'.fasta'
get_seqs(infile,wanted_sequences,outfile)
print "appending to ", outfile
main()