| # |
| # Licensed to the Apache Software Foundation (ASF) under one |
| # or more contributor license agreements. See the NOTICE file |
| # distributed with this work for additional information |
| # regarding copyright ownership. The ASF licenses this file |
| # to you under the Apache License, Version 2.0 (the |
| # "License"); you may not use this file except in compliance |
| # with the License. You may obtain a copy of the License at |
| # |
| # http://www.apache.org/licenses/LICENSE-2.0 |
| # |
| # Unless required by applicable law or agreed to in writing, |
| # software distributed under the License is distributed on an |
| # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| # KIND, either express or implied. See the License for the |
| # specific language governing permissions and limitations |
| # under the License. |
| # |
| import re |
| |
| class Type: |
| |
| def __init__(self, name, pattern=None): |
| self.name = name |
| self.pattern = pattern |
| |
| def __repr__(self): |
| return self.name |
| |
| class Lexicon: |
| |
| def __init__(self): |
| self.types = [] |
| self._eof = None |
| |
| def define(self, name, pattern): |
| t = Type(name, pattern) |
| self.types.append(t) |
| return t |
| |
| def eof(self, name): |
| t = Type(name) |
| self._eof = t |
| return t |
| |
| def compile(self): |
| types = self.types[:] |
| joined = "|".join(["(%s)" % t.pattern for t in types]) |
| rexp = re.compile(joined) |
| return Lexer(types, self._eof, rexp) |
| |
| class Token: |
| |
| def __init__(self, type, value, input, position): |
| self.type = type |
| self.value = value |
| self.input = input |
| self.position = position |
| |
| def line_info(self): |
| return line_info(self.input, self.position) |
| |
| def __repr__(self): |
| if self.value is None: |
| return repr(self.type) |
| else: |
| return "%s(%s)" % (self.type, self.value) |
| |
| |
| class LexError(Exception): |
| pass |
| |
| def line_info(st, pos): |
| idx = 0 |
| lineno = 1 |
| column = 0 |
| line_pos = 0 |
| while idx < pos: |
| if st[idx] == "\n": |
| lineno += 1 |
| column = 0 |
| line_pos = idx |
| column += 1 |
| idx += 1 |
| |
| end = st.find("\n", line_pos) |
| if end < 0: |
| end = len(st) |
| line = st[line_pos:end] |
| |
| return line, lineno, column |
| |
| class Lexer: |
| |
| def __init__(self, types, eof, rexp): |
| self.types = types |
| self.eof = eof |
| self.rexp = rexp |
| self.byname = {} |
| for t in self.types + [eof]: |
| self.byname[t.name] = t |
| |
| def type(self, name): |
| return self.byname[name] |
| |
| def lex(self, st): |
| pos = 0 |
| while pos < len(st): |
| m = self.rexp.match(st, pos) |
| if m is None: |
| line, ln, col = line_info(st, pos) |
| raise LexError("unrecognized characters line:%s,%s: %s" % (ln, col, line)) |
| else: |
| idx = m.lastindex |
| t = Token(self.types[idx - 1], m.group(idx), st, pos) |
| yield t |
| pos = m.end() |
| yield Token(self.eof, None, st, pos) |