lexer.py
import re, ast, sys, sympy
from errors import *
from utils import *

class Token:
    # type, content and values are read directly as attributes; extra
    # metadata passed as keyword arguments is looked up through getValue.
    def __init__(self, type, content, **kwargs):
        self.type = type
        self.content = content
        self.values = kwargs
    def getValue(self, key):
        return self.values[key]
    def __str__(self):
        return '<Token type=%s content=%s>' % (str(self.type), str(self.content))
    def __repr__(self):
        return str(self)
    def __eq__(self, other):
        return isinstance(other, Token) and other.type == self.type and other.content == self.content
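
# A minimal sketch of token behaviour (illustration only, not part of the
# module): tokens compare by type and content, so repeated lexes of the same
# text produce equal tokens.
#   Token('literal:expression', 5) == Token('literal:expression', 5)     # True
#   Token('literal:expression', 5) == Token('identifier:expression', 5)  # False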

class LexerMatcher:
    def __init__(self, matcher, getter, getlast = False, skip = lambda *a: 0):
        # skip is stored as an instance attribute, so rule.skip(code, match)
        # invokes the provided callable directly (default: never skip).
        self.matcher = matcher
        self.getter = getter
        self.skip = skip
        self.getlast = getlast
    def match(self, *code_or_last):
        return self.matcher(*code_or_last)
    def get(self, code, match):
        return self.getter(code, match)

class RegexMatcher(LexerMatcher):
    def __init__(self, regex, group, tokentype, processor = lambda x: x, offset = 0, **kwargs):
        self.regex = regex
        self.group = group
        self.tokentype = tokentype
        self.processor = processor
        self.offset = offset
        self.values = kwargs
        self.getlast = False
    def match(self, code):
        return re.match(self.regex, code)
    def get(self, code, match):
        return (match.span()[1] + self.offset, Token(self.tokentype, self.processor(match.group(self.group)), **self.values))
    def skip(self, code, match):
        # group == -1 marks a skip-only rule: consume the match, emit no token.
        return match.span()[1] * (self.group == -1)

class ErrorMatcher(LexerMatcher):
    def __init__(self, matcher, errortype):
        self.matcher = matcher
        self.errortype = errortype
        self.getlast = False
    def match(self, code):
        return self.matcher.match(code)
    def get(self, code, match):
        raise self.errortype()
    def skip(self, code, match):
        return self.matcher.skip(code, match)
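
# Illustrative sketch: a RegexMatcher turns the match at the front of the
# source into a (characters consumed, Token) pair. Using the integer rule
# defined further down:
#   rule = RegexMatcher(r'\d+', 0, 'literal:expression', sympy.Integer)
#   rule.get('42 + x', rule.match('42 + x'))
#   # -> (2, <Token type=literal:expression content=42>)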
identifier_chars = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789'

def findv(operator, values):
    for value in values:
        if operator in value[1]: return value[0]

def oper_matcher(array, values, counter = []):
    # Match the longest operator that prefixes the code, skipping any listed in counter.
    def matcher(code):
        candidates = [operator for operator in array if code.startswith(operator)]
        longest = candidates and max(candidates, key = len)
        return longest if longest not in counter else ''
    return LexerMatcher(matcher, lambda code, match: (len(match), Token(findv(match, values), match)))
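
# Sketch of the longest-match behaviour, given the operator tables below:
# oper_matcher prefers '**' over two '*' tokens, and findv classifies it by
# the first table that contains it (binary_RTL is checked before
# binary_operator in the matcher built at the bottom of this file):
#   rule = oper_matcher(operators, [('unifix_operator', unifix_operators),
#                                   ('binary_RTL', sum(binary_RTL, ())),
#                                   ('binary_operator', sum(binary_operators, ()))])
#   rule.get('**x', rule.match('**x'))
#   # -> (2, <Token type=binary_RTL content=**>)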

class Lexer:
    def __init__(self, rules, code):
        self.rules = rules
        self.code = code
        self.index = 0
        self.last = None
    def __iter__(self):
        return self
    def __next__(self):
        if self.index >= len(self.code): raise StopIteration
        for rule in self.rules:
            code = self.code[self.index:]
            match = rule.match(code, self.last) if rule.getlast else rule.match(code)
            if match:
                skip = rule.skip(code, match)
                if skip:
                    # Skip-only rule (whitespace, comments): consume it and
                    # recurse to produce the next real token.
                    self.index += skip
                    self.last = self.__next__()
                    return self.last
                else:
                    token = rule.get(code, match)
                    if token is not None:
                        self.index += token[0]
                        self.last = token[1]
                        return self.last
        raise RuntimeError('Unknown token at index %d: "...%s..."' % (self.index, self.code[self.index:][:10].replace('\n', '\\n')))
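
# Iteration sketch (hypothetical input; matchers is defined below): each call
# to __next__ tries the rules in order at the current position, silently
# recursing past skip-only matches such as whitespace and comments.
#   lx = Lexer(matchers, 'a # note\n+ 1')
#   next(lx)  # -> <Token type=identifier:expression content=a>
#   next(lx)  # -> <Token type=binary_operator content=+>
#   next(lx)  # -> <Token type=literal:expression content=1>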

binary_RTL = [
    ('**',),
    ('=',)
]
binary_operators = [
    ('.',),
    ('**',),
    ('*', '/', '//', '||', '%'),
    ('+', '-'),
    ('>>', '<<'),
    ('&',),
    ('^',),
    ('|',),
    ('>', '<', '<=', '>='),
    ('==', '!=', ':=', '=', '=:'),
    ('..',),
    ('in', 'not in', 'is', 'are', 'is not', 'are not', 'inside', 'not inside'),
    ('and', 'nand'),
    ('or', 'nor'),
    ('**=', '*=', '/=', '//=', '+=', '-=', '>>=', '<<=', '%=', '&=', '|=', '&&=', '||='),
]
prefix_operators = ['!', '++', '--', '~', '@', '$', '$$', '!!', '%%']
postfix_operators = ['!', '++', '--', '??']
unifix_operators = prefix_operators + postfix_operators

def recurstr(array):
    if isinstance(array, map):
        array = list(array)
    if isinstance(array, list):
        return str(list(map(recurstr, array)))
    return str(array)
keywords = ['if', 'else', 'unless', 'while', 'for', 'try', 'except', 'exist not', 'exist', 'exists not', 'exists', 'break', 'continue', 'import', 'include', 'as', 'from', 'to', 'by', 'timeof', 'sizeof', 'del', 'return', 'repeat', 'const']
ignore = ('not',)
operators = sum(binary_operators, ()) + sum(binary_RTL, ()) + tuple(unifix_operators)

def flags(key):
    flag = 0
    if 'a' in key:
        flag |= re.ASCII
    if 'i' in key:
        flag |= re.IGNORECASE
    if 'l' in key:
        flag |= re.LOCALE
    if 'm' in key:
        flag |= re.MULTILINE
    if 's' in key:
        flag |= re.DOTALL
    if 'x' in key:
        flag |= re.VERBOSE
    return flag
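
# For instance, flags('im') == re.IGNORECASE | re.MULTILINE, so a literal
# written /abc/im compiles its pattern case-insensitively in multiline mode.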

def intify(base):
    # Parses literals like '0b101' or '3xFF': the digits before the base
    # letter scale the value by base to that power.
    def inner(string):
        left, right = re.split('[^0-9]', string, maxsplit = 1)
        return sympy.Integer(int(right, base) * (base ** int(left)))
    return inner
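
# Sketch of the scaling behaviour:
#   intify(2)('0b101')  # -> Integer(5)        (5 * 2**0)
#   intify(16)('3xFF')  # -> Integer(1044480)  (255 * 16**3)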

matchers = [
    # Comments
    RegexMatcher(r'#.+', -1, 'comment'),
    RegexMatcher(r'/\*([^*]|\*[^/])*\*/', -1, 'comment'),
    # Operators wrapped in parentheses, lexed as function values
    RegexMatcher('(%s)' % '|'.join('\\(\\s*%s\\s*\\)' % re.escape(operator) for operator in sum(binary_operators, ())), 1, 'binopfunc/expression', lambda x: x[1:-1].strip()),
    RegexMatcher('(%s)' % '|'.join('\\(\\s*%s\\s*\\)' % re.escape(operator) for operator in unifix_operators), 1, 'unopfunc/expression', lambda x: x[1:-1].strip()),
    # Regex literals; suppressed directly after an expression or a closing
    # parenthesis, where / must be division instead
    LexerMatcher(lambda code, last: None if last and ('expression' in last.type or 'bracket' in last.type and ')' == last.content) else re.match(r'/((\s*([^)/]|\\.)([^/\\]|\\.)*)?)/([ailmsx]*)', code), lambda code, match: (match.end(), Token('literal:expression', re.compile(match.group(1), flags(match.groups()[-1])))), getlast = True),
    LexerMatcher(lambda code, last: None if last and ('expression' in last.type or 'bracket' in last.type and ')' == last.content) else re.match(r'/((\s*([^)/]|\\.)([^/\\]|\\.)*)?)/', code), lambda code, match: (match.end(), Token('literal:expression', re.compile(match.group(1)))), getlast = True),
    # Numeric literals
    RegexMatcher(r'\d+b[01]+', 0, 'literal:expression', intify(2)),
    RegexMatcher(r'\d+o[0-7]+', 0, 'literal:expression', intify(8)),
    RegexMatcher(r'\d+x[0-9a-fA-F]+', 0, 'literal:expression', intify(16)),
    RegexMatcher(r'\d+e\d+', 0, 'literal:expression', lambda x: (lambda y: sympy.Integer(y[0]) * 10 ** sympy.Integer(y[1]))(x.split('e'))),
    RegexMatcher(r'\d*\.\d+j', 0, 'literal:expression', lambda x: sympy.I * sympy.Rational(x[:-1])),
    RegexMatcher(r'\d+j', 0, 'literal:expression', lambda x: sympy.I * int(x[:-1])),
    RegexMatcher(r'\d*\.\d+', 0, 'literal:expression', sympy.Rational),
    RegexMatcher(r'\d+', 0, 'literal:expression', sympy.Integer),
    # String literals
    RegexMatcher(r'"([^"\\]|\\.)*"', 0, 'literal:expression', lambda x: x[1:-1]),
    RegexMatcher(r"'([^'\\]|\\.)*'", 0, 'literal:expression', lambda x: x[1:-1]),
    ErrorMatcher(RegexMatcher(r'"([^"\\]|\\.)*', 0, ''), UnclosedStringError),
    ErrorMatcher(RegexMatcher(r"'([^'\\]|\\.)*", 0, ''), UnclosedStringError),
    # Keywords and identifiers
    RegexMatcher('(%s)' % '|'.join(['(%s)[^A-Za-z_]' % keyword for keyword in keywords]), 1, 'keyword', lambda x: x[:-1], -1),
    LexerMatcher(lambda code: re.match('[A-Za-z_][A-Za-z_0-9]*', code), lambda code, match: None if match.group() in operators + ignore else (match.end(), Token('keyword' if match.group() in keywords else 'identifier:expression', match.group()))),
    # Punctuation and operators
    RegexMatcher(r';', 0, 'semicolon'),
    RegexMatcher(r',', 0, 'comma'),
    RegexMatcher(r'(\?)[^?]', 1, 'ternary', offset = -1),
    RegexMatcher(r':>', 0, 'maparrow'),
    RegexMatcher(r'->', 0, 'arrow'),
    RegexMatcher(r'=>', 0, 'lambda'),
    oper_matcher(operators, [('unifix_operator', unifix_operators), ('binary_RTL', sum(binary_RTL, ())), ('binary_operator', sum(binary_operators, ()))]),
    RegexMatcher(r':', 0, 'colon'),
    RegexMatcher(r'[\(\)\[\]\{\}]', 0, 'bracket'),
    RegexMatcher(r'\s+', -1, 'whitespace'),
]

def tokens(code, matchers = matchers):
    return Lexer(matchers, code)

def tokenize(code, matchers = matchers):
    return list(tokens(code, matchers))
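
# Usage sketch:
#   tokenize('x = 1')
#   # -> [<Token type=identifier:expression content=x>,
#   #     <Token type=binary_RTL content==>,
#   #     <Token type=literal:expression content=1>]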

if __name__ == '__main__':
    # Lex a source file given as a command-line argument, or a line from stdin.
    for i in tokens(open(sys.argv[1], 'r').read() if len(sys.argv) > 1 else input()): print(i)