-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathclean_seq_names.py
41 lines (36 loc) · 1.26 KB
/
clean_seq_names.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
# -*- coding: utf-8 -*-
"""
Created on Mon Apr 14 20:27:07 2014
@author: RDT
"""
'''This script reorganizes the rec.id and rec.description of a fasta file.'''
import os
import sys

from Bio import SeqIO
def _rewrite_header(description, genename, blasthit, database):
    '''Return the (new_id, new_description) pair for one FASTA description.

    The description is split on single spaces.  Field 1 is appended to
    genename to form the new ID; the new description is built from
    blasthit, database, field 1 (minus one leading underscore), field 2
    and every field from index 3 onwards.  Field 0 is not used
    (presumably the original record ID -- confirm against your input).

    Raises IndexError if there are fewer than three fields or field 1 is
    empty (e.g. two consecutive spaces after field 0).
    '''
    fields = description.split(' ')
    if len(fields) < 3 or not fields[1]:
        # Same exception type the bare indexing used to raise, but with a
        # message that identifies the offending header.
        raise IndexError(
            'expected at least 3 space-separated fields with a non-empty '
            'second field in description: %r' % (description,))

    name = fields[1]
    new_id = genename + name  # the ID keeps any leading underscore
    if name.startswith('_'):
        name = name[1:]  # only the description drops it

    # Every trailing field is wrapped in single spaces.  This reproduces the
    # established output format exactly (leading space, doubled separators,
    # trailing space) while building the string in one pass.
    trailing = ''.join(' %s ' % field for field in fields[3:])
    new_description = "'%s' %s %s %s %s" % (
        blasthit, database, name, fields[2], trailing)
    return new_id, new_description


def clean_seqs(infile, genename, database):
    '''Rewrite the ID and description of every record in a FASTA file.

    Reads infile with Bio.SeqIO, gives each record the ID
    genename + <field 1 of its description> and the description

        '<blasthit>' <database> <field 1> <field 2> <remaining fields>

    where <blasthit> is infile with its extension removed, then writes all
    records in FASTA format to genename + '_clean.fasta'.

    Args:
        infile: Path of the FASTA file to read.
        genename: Prefix for the new record IDs and for the output filename.
        database: Label placed second in every rewritten description.

    Returns:
        None.  The result is the file written to disk.

    Raises:
        IndexError: If a record's description has fewer than three
            space-separated fields, or an empty second field.
    '''
    # Strip the real extension instead of a fixed six characters, so inputs
    # that do not end in ".fasta" (".fa", ".fas", ...) still give a sensible
    # label.  For "*.fasta" paths the result is unchanged.
    blasthit = os.path.splitext(infile)[0]

    newrecords = []
    for rec in SeqIO.parse(infile, 'fasta'):
        rec.id, rec.description = _rewrite_header(
            rec.description, genename, blasthit, database)
        newrecords.append(rec)

    # All records are collected before the output file is opened, so reading
    # has finished by the time anything is written.
    outfile = genename + '_clean.fasta'
    with open(outfile, 'w') as f:
        SeqIO.write(newrecords, f, 'fasta')
if __name__ == '__main__':
    # Command-line entry point: <infile> <genename> <database>.
    # Fail with a usage line instead of a bare IndexError traceback when
    # arguments are missing; extra arguments are ignored, as before.
    if len(sys.argv) < 4:
        sys.exit('usage: %s <infile.fasta> <genename> <database>'
                 % sys.argv[0])
    infile = sys.argv[1]
    gene = sys.argv[2]
    database = sys.argv[3]
    clean_seqs(infile, gene, database)