-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFreqWordMismatch.py
145 lines (117 loc) · 4.17 KB
/
FreqWordMismatch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
__author__ = 'Cullin'
import itertools
from ApproxPatternCount import count
"""
Solution
Iterate text
For each word generate_kmers(word)
for each kmer if not in dict approxpatterncount(text,kmer,dist)
"""
def reverse_complement(seq):
"""
:param
seq(str): dna sequence
:return:
reverse complement of seq
Raises:
Raises Value Error if an improper seq is passed
"""
seq = seq[::-1]
seq = list(seq)
for i,base in enumerate(seq):
if base is 'A':
seq[i] = 'T'
elif base is 'T':
seq[i] = 'A'
elif base is 'G':
seq[i] = 'C'
elif base is 'C':
seq[i] = 'G'
else:
raise ValueError("%s is not a valid base" % base)
return ''.join(seq)
def check_equal(iterator):
"""
# source : http://stackoverflow.com/questions/3844801/check-if-all-elements-in-a-list-are-identical
Check to see if all values in the iterable container are equal i.e. list, tuple
:param
iterator: iterable object
:return:
Bool
"""
try:
iterator = iter(iterator)
first = next(iterator)
return all(first == rest for rest in iterator)
except StopIteration:
return True
def generate_kmers(pattern,d):
"""
returns all possible kmers for given pattern with up to d mismatches
args:
pattern(str) - pattern of bases
d(int) - max number of mismatches allowed in kmers
return:
list()
"""
# key = index of base in pattern, value = list of the other possible bases for that position
index_possibilities = dict()
bases = set(['A','G','C','T'])
# return list, initialize with pattern
kmers = [pattern]
if d < 1:
return kmers
for i, base in enumerate(pattern):
temp = bases
index_possibilities[i] = list(temp - set(base))
# get all combinations of indices to change
index_combos = list(itertools.combinations_with_replacement(range(0,len(pattern)),d))
for combo in index_combos:
temp = list(pattern)
# case where all indexes are the same
if check_equal(combo):
index = combo[0]
for base in index_possibilities[index]:
temp[index] = base
kmers.append(''.join(temp))
else:
# We need to build the expression statement
arg_list = []
for i in range(0,d):
arg_list.append('index_possibilities[%d]' % i)
arg = ','.join(arg_list)
# Forward declaration
cartesian_product = None
# get cross product of the list of possible bases for each index
exec 'cartesian_product = list(itertools.product(%s))' % arg
for base_tuple in cartesian_product:
for i, ind in enumerate(combo):
# set the index to new value
temp[ind] = base_tuple[i]
kmers.append(''.join(temp))
return kmers
def freq_words_mismatch(text, k, d):
"""
Finds the most frequent kmer (including occurrences of its reverse complement) of length k with up to d mismatches
in text.
Args:
:param text(str): text to search through
:param k(int): length of kmer
:param d(int): max number of mismatches allowed for possible kmer
:return:
list of most freq kmers of length k and with up to d mismatches found in text
"""
freq_words = {}
for i in range(0, len(text) - k + 1):
word = text[i:i+k]
kmers = generate_kmers(word,d)
for kmer in kmers:
if not freq_words.has_key(kmer):
reverse = reverse_complement(kmer)
found = count(text,kmer,d) + count(text,reverse,d)
freq_words[kmer] = found
maximum = max(freq_words.values())
max_keys = [key for key,val in freq_words.items() if val == maximum]
max_keys.append(maximum)
return max_keys
#print freq_words_mismatch('ATATGCTGGTGGATTATAGCATTATAGCAGGCATGCGCTATGGATATAGATGCTAGCGCTAGCGCTGGAGGCTGGTAAGTGGGCGCGCGCTATGGATTATATAAGAGAGATATAGTGGATATGCTGGATATATATATTAATTGGGCAGGCATTATAAGAGATGCATATAGAGATTAGCAGAGAGATAGTGGGCATATTGGGC',7,3)