""" Lexical analysis is the breaking of a string into tokens. """
import re
import linecache
from builtins import SyntaxError as BaseSyntaxError
class SyntaxError(BaseSyntaxError):
    """Lexer/parser error for invalid input.

    Deliberately shadows the builtin of the same name so that callers can
    catch this module's errors while still getting Python's nice
    SyntaxError formatting (filename/line/column/caret) when printed.
    """
class UnexpectedEndError(SyntaxError):
    """Raised when the input ends where more tokens were expected."""
class LexicalGrammar:
    """Quick and dirty lexer implementation.

    In order to support multi-part lexing (multiple calls to .write()),
    both 1. the `ignore` regular expression; and 2. the union of the family of
    regular expressions given by `tokens` and `regexps`; must have the
    following property: if they match a string s, they also match every prefix
    of that string.

    This requirement is not enforced by assertions; if it's not met, the
    tokenizer will just have bugs when sent multiple chunks of data.
    """

    def __init__(self, tokens, ignore=r'[ \t]*', **regexps):
        """Compile the lexing tables.

        tokens  -- space-separated string of literal tokens (keywords and
                   punctuators); each literal is its own terminal name.
        ignore  -- regex for skippable text between tokens (default: spaces
                   and tabs).
        regexps -- named regexes for token families (e.g. NUM=r'[0-9]+');
                   the keyword name is the terminal reported on a match.
        """
        def token_to_re(token):
            s = re.escape(token)
            if s.isalpha():
                # Keyword-like tokens get a word boundary so that e.g. `if`
                # does not match the prefix of the identifier `ifx`.
                s += r'\b'
            return s

        # Longest literals first, so that e.g. `==` beats `=` in the
        # alternation.
        token_list = sorted(tokens.split(), key=len, reverse=True)
        self.ignore_re = re.compile(ignore)
        self.token_re = re.compile(
            "|".join(token_to_re(token) for token in token_list))
        self.parser_pairs = [(k, re.compile(v)) for k, v in regexps.items()]
# NOTE(review): tail of a buffer-draining method whose `def` line is missing
# from this chunk — the statements below drop the already-parsed prefix of
# the source text and reset the position counters relative to the new,
# shorter `self.src`.
#
# Drop the parsed text and reset counters. Note that setting
# self.previous_token_end to 0 really is correct. Setting
# self.current_token_start to 0 is as good as anything else, because
# there is no current token.
self.src = self.src[self.point:]
self.point = 0
self.previous_token_end = 0
self.current_token_start = 0
def current_line(self):
    """Return the text of the line containing the current token, with a
    trailing '\\n', when that line is still available in the buffer.

    Falls through (returning None) when the start of the line has already
    been dropped from `self.src` and can't be recovered here.
    NOTE(review): the "try loading it from disk" comment suggests a
    linecache fallback that appears truncated from this chunk — confirm
    against the full file.
    """
    # OK, this is gruesome, but we return the current line if we have the
    # whole thing and otherwise we ... try loading it from disk.
    if '\n' in self.src[:self.current_token_start]:
        # The line start is still buffered: just past the last newline
        # before the current token.
        line_start = self.src.rindex('\n', 0, self.current_token_start) + 1
    elif self.start_column == 0:
        # The buffer starts exactly at a line boundary.
        line_start = 0
    else:
        # The start of the current line was already dropped.
        line_start = -1

    if line_start != -1:
        line_end = self.src.find('\n', line_start)
        if line_end == -1:
            if self.closed:
                # Unterminated final line: everything that's left is it.
                return self.src[line_start:] + '\n'
        else:
            return self.src[line_start:line_end] + '\n'
def throw(self, msg_or_exception):
    """Raise a syntax error positioned at the current token.

    msg_or_exception -- either a message string (wrapped in SyntaxError
        with full file/line/column context) or an Exception instance,
        which is annotated with filename/lineno/offset and re-raised.
    """
    lineno, column = self.current_token_position()
    if isinstance(msg_or_exception, Exception):
        e = msg_or_exception
        e.filename = self.filename
        e.lineno = lineno
        # SyntaxError offsets are 1-based columns.
        e.offset = column + 1
    else:
        # Apparently this is the secret handshake to create a Python
        # SyntaxError and get a good error message when Python prints it.
        line = self.current_line()
        args = (self.filename, lineno, column + 1, line)
        e = SyntaxError(msg_or_exception, args)
    raise e
def throw_unexpected_end(self):
    """Raise UnexpectedEndError located at the current position."""
    self.throw(UnexpectedEndError("unexpected end of input"))
def saw_line_terminator(self):
    """True if there's a LineTerminator before the current token."""
    i = self.previous_token_end
    j = self.current_token_start
    ws_between = self.src[i:j]
    # CR, LF, U+2028 LINE SEPARATOR, and U+2029 PARAGRAPH SEPARATOR all
    # count as line terminators.
    return any(c in ws_between for c in '\r\n\u2028\u2029')
def _match(self, closing): # Advance over text matching ignore_re.
ignore_match = self.ignore_re.match(self.src, self.point) if ignore_match isNone: raise ValueError("ignore_re should always match")
point = ignore_match.end() if point == len(self.src): if closing:
self.point = point
self._current_match = None returnNone
# Try the token_re.
token_match = self.token_re.match(self.src, point)
# Try all the parser_pairs. for name, pattern in self.parser_pairs:
match = pattern.match(self.src, point) if match isnotNone: break else:
name = match = None
if match isnotNoneand token_match isnotNoneand match.end() > token_match.end(): pass elif token_match isnotNone:
name, match = token_match.group(0), token_match elif match isnotNone: pass else:
self.throw("unexpected characters {!r}"
.format(self.src[point:point + 12]))
# But how do we know subsequent .write() calls won't provide more text, # extending this token? Here we take advantage of the odd requirement # LexicalGrammar imposes on its users. Every prefix of a match is a # match. So if this hypothetical "extended" token would match, then the # entire remainder of self.src is a match. ifnot closing and match.end() == len(self.src): # This token might be extensible. Refuse to match.
self._current_match = None returnNone
# This token definitely is not extensible.
self.previous_token_end = self.point
self.current_token_start = match.start()
self.point = match.end()
self._current_match = match return name
Messung V0.5
¤ Dauer der Verarbeitung: 0.11 Sekunden
(vorverarbeitet)
¤
Die Informationen auf dieser Webseite wurden
nach bestem Wissen sorgfältig zusammengestellt. Es wird jedoch weder Vollständigkeit noch Richtigkeit
noch Qualität der bereitgestellten Informationen zugesichert.
Bemerkung:
Die farbliche Syntaxdarstellung und die Messung sind noch experimentell.