-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathread.py
141 lines (100 loc) · 4.04 KB
/
read.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
"""
read.py
Class representing a single sequence read in the .sam format.
"""
import sys
#DD63XKN1:432:C7RRYACXX:1:1307:18100:35594 113 pseudo1 7 1 5M1D7M1D88M pseudo2 8795984 0 CTCAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAACCCTAAGCCCTAAACCCTAAACCCTAAACCCTAAACCCTGGGCTGCTCATTTGCAGGAAA B7'07BB<<7<7FFFFBBBBBBB<70BBBBB<'FFB<<B<FF<FFBBFFBBFBBFFFF<B<IFFFFBBIIFFFF<IIIIFIIIIIFIFFFBFFFFFF<BB PG:Z:MarkDuplicates RG:Z:1 NM:i:2 SM:i:67 MQ:i:0 PQ:i:237 UQ:i:90 XQ:i:22
class read():
def __init__(self,read_str):
"""
"""
self.raw_str = read_str
sline = read_str.split('\t')
self.read_id = sline[0]
self.flag = sline[1]
self.chrom = sline[2]
self.read_start = int(sline[3])
self.MAPQ = sline[4]
self.CIGAR = sline[5]
self.mate_chrom = sline[6]
self.mate_pos = sline[7]
self.template_len = sline[8]
self.seq = sline[9]
self.base_qual = sline[10]
self.rev_comp = None
if len(self.seq) != 100:
print "Seq len:", len(self.seq), "\n", "Temp len:", self.template_len, \
"MQ: ", self.MAPQ
# self.base_range = range(int(self.read_start), int(self.read_start) + len(self.template_len))
self.range_end = self.read_start + (len(self.seq) - 1)
def set_rev_comp(self, rev_comp):
"""
"""
self.rev_comp = rev_comp
if self.rev_comp:
self.seq = self.get_rev_comp()
def get_rev_comp(self):
"""
"""
base_comp = {"A":"T", "C":"G", "G":"C", "T":"A", "a":"t", "t":"a",\
"g":"c", "c":"g", "N":"N","n":"n"}
rev_seq = ""
for base in self.seq[::-1]:
rev_seq = rev_seq + base_comp[base]
return rev_seq
def get_base_idx(self, pos):
"""Returns the index of the base in this reads sequence string.
Args:
pos: reference relative index
Returns:
idx: Int index of pos in read
"""
idx = int(pos) - self.read_start
if self.rev_comp:
idx = (len(self.seq) - idx) - 1
return idx
def read_maps_pos(self, pos):
# print "read maps", self.read_start, pos, self.range_end
# print self.raw_str
return ((self.read_start <= int(pos)) and (int(pos) <= self.range_end))
def get_base_at_pos(self, pos):
"""Gets base pair at given position.
Args:
pos: Integer bp position on a chromosome
Returns:
Single character representing base pair
Raises:
ValueError if specified position is not covered by this read
"""
if not ((self.read_start <= pos) and (pos <= self.range_end)):
sys.stderr.write("Err\n")
sys.stderr.write(self.raw_str +"\n")
sys.stderr.write(str(self.read_start) + " " + str(pos) + " " + str(self.range_end) + "\n")
raise ValueError("Read does not cover specified position")
base_idx = self.get_base_idx(pos)
try:
base = self.seq[base_idx]
except:
print "Base index fail"
print "target pos", pos
print "Start pos", self.read_start
print self.raw_str
print base_idx
print len(self.seq)
return self.seq[base_idx]
def get_base_qual_at_pos(self, pos):
"""Gets base pair quality of a base at given position.
Args:
pos: Integer bp position on a chromosome
Returns:
Int representing base pair quality. ord(base_str) - 33
Raises:
ValueError if specified position is not covered by this read
"""
if not ((self.read_start <= pos) and (pos <= self.range_end)):
print self.read_start
print pos
print self.range_end
raise ValueError("Read does not cover specified position")
base_idx = self.get_base_idx(pos)
return ord(self.base_qual[base_idx]) - 33