| @@ -12,6 +12,7 @@ _T = TypeVar('_T') | |||||
| class Pattern(ABC): | class Pattern(ABC): | ||||
| value: str | value: str | ||||
| flags: Collection[str] | flags: Collection[str] | ||||
| raw: str | |||||
| def __init__(self, value: str, flags: Collection[str] = ...): | def __init__(self, value: str, flags: Collection[str] = ...): | ||||
| ... | ... | ||||
| @@ -73,6 +74,8 @@ class TerminalDef: | |||||
| def __init__(self, name: str, pattern: Pattern, priority: int = ...): | def __init__(self, name: str, pattern: Pattern, priority: int = ...): | ||||
| ... | ... | ||||
| def user_repr(self) -> str: ... | |||||
| class Token(str): | class Token(str): | ||||
| @@ -1,3 +1,5 @@ | |||||
| from warnings import warn | |||||
| from .utils import Serialize | from .utils import Serialize | ||||
| from .lexer import TerminalDef | from .lexer import TerminalDef | ||||
| @@ -5,11 +7,13 @@ from .lexer import TerminalDef | |||||
| class LexerConf(Serialize): | class LexerConf(Serialize): | ||||
| __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type' | |||||
| __serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type' | |||||
| __serialize_namespace__ = TerminalDef, | __serialize_namespace__ = TerminalDef, | ||||
| def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): | |||||
| self.tokens = tokens # TODO should be terminals | |||||
| def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False): | |||||
| self.terminals = terminals | |||||
| self.terminals_by_name = {t.name: t for t in self.terminals} | |||||
| assert len(self.terminals) == len(self.terminals_by_name) | |||||
| self.ignore = ignore | self.ignore = ignore | ||||
| self.postlex = postlex | self.postlex = postlex | ||||
| self.callbacks = callbacks or {} | self.callbacks = callbacks or {} | ||||
| @@ -17,9 +21,17 @@ class LexerConf(Serialize): | |||||
| self.re_module = re_module | self.re_module = re_module | ||||
| self.skip_validation = skip_validation | self.skip_validation = skip_validation | ||||
| self.use_bytes = use_bytes | self.use_bytes = use_bytes | ||||
| self.lexer_type = None | self.lexer_type = None | ||||
@property
def tokens(self):
    """Deprecated alias for ``terminals``.

    Emits a ``DeprecationWarning`` and returns ``self.terminals`` unchanged.
    """
    # stacklevel=2 attributes the warning to the code that accessed
    # ``.tokens`` rather than to this property body, so the report points at
    # the line that actually needs to be migrated.
    warn("LexerConf.tokens is deprecated. Use LexerConf.terminals instead", DeprecationWarning, stacklevel=2)
    return self.terminals
def _deserialize(self):
    """Rebuild the name -> terminal index from ``self.terminals``.

    Stores the resulting dict on ``self.terminals_by_name``; if two terminals
    share a name, the later one wins.
    """
    index = {}
    for terminal in self.terminals:
        index[terminal.name] = terminal
    self.terminals_by_name = index
| class ParserConf(Serialize): | class ParserConf(Serialize): | ||||
| __serialize_fields__ = 'rules', 'start', 'parser_type' | __serialize_fields__ = 'rules', 'start', 'parser_type' | ||||
| @@ -1,4 +1,5 @@ | |||||
| from .utils import STRING_TYPE, logger | |||||
| from .utils import STRING_TYPE, logger, NO_VALUE | |||||
| ###{standalone | ###{standalone | ||||
| @@ -39,6 +40,7 @@ class UnexpectedInput(LarkError): | |||||
| After catching one of these exceptions, you may call the following helper methods to create a nicer error message. | After catching one of these exceptions, you may call the following helper methods to create a nicer error message. | ||||
| """ | """ | ||||
| pos_in_stream = None | pos_in_stream = None | ||||
| _terminals_by_name = None | |||||
| def get_context(self, text, span=40): | def get_context(self, text, span=40): | ||||
| """Returns a pretty string pinpointing the error in the text, | """Returns a pretty string pinpointing the error in the text, | ||||
| @@ -95,7 +97,7 @@ class UnexpectedInput(LarkError): | |||||
| if ut.state == self.state: | if ut.state == self.state: | ||||
| if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts: | if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts: | ||||
| logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % | logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % | ||||
| (self.state, self.accepts, ut.accepts, i, j)) | |||||
| (self.state, self.accepts, ut.accepts, i, j)) | |||||
| continue | continue | ||||
| try: | try: | ||||
| if ut.token == self.token: # Try exact match first | if ut.token == self.token: # Try exact match first | ||||
| @@ -116,44 +118,61 @@ class UnexpectedInput(LarkError): | |||||
| return candidate[0] | return candidate[0] | ||||
def _format_expected(self, expected):
    """Render ``expected`` as an "Expected one of" bullet list.

    When ``self._terminals_by_name`` is set, each name that has an entry in
    it is replaced by that terminal's ``user_repr()``. Names without an
    entry are printed unchanged instead of raising ``KeyError``: the lexer
    can report the placeholder ``"<END-OF-FILE>"`` in its allowed set, and
    that placeholder is not a defined terminal.

    :param expected: iterable of terminal names (strings).
    :return: the formatted, newline-terminated text.
    """
    terminals_by_name = self._terminals_by_name
    if terminals_by_name:
        expected = [
            terminals_by_name[t_name].user_repr() if t_name in terminals_by_name else t_name
            for t_name in expected
        ]
    return "Expected one of: \n\t* %s\n" % '\n\t* '.join(expected)
| class UnexpectedEOF(ParseError, UnexpectedInput): | class UnexpectedEOF(ParseError, UnexpectedInput): | ||||
| def __init__(self, expected, state=None): | |||||
| def __init__(self, expected, state=None, terminals_by_name=None): | |||||
| self.expected = expected | self.expected = expected | ||||
| self.state = state | self.state = state | ||||
| from .lexer import Token | from .lexer import Token | ||||
| self.token = Token("<EOF>", "") #, line=-1, column=-1, pos_in_stream=-1) | |||||
| self.token = Token("<EOF>", "") # , line=-1, column=-1, pos_in_stream=-1) | |||||
| self.pos_in_stream = -1 | self.pos_in_stream = -1 | ||||
| self.line = -1 | self.line = -1 | ||||
| self.column = -1 | self.column = -1 | ||||
| self._terminals_by_name = terminals_by_name | |||||
| message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected)) | |||||
| super(UnexpectedEOF, self).__init__(message) | |||||
| super(UnexpectedEOF, self).__init__() | |||||
def __str__(self):
    """Return the end-of-input message followed by the expected-terminals list."""
    return "Unexpected end-of-input. " + self._format_expected(self.expected)
| class UnexpectedCharacters(LexError, UnexpectedInput): | class UnexpectedCharacters(LexError, UnexpectedInput): | ||||
| def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): | |||||
| def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, | |||||
| terminals_by_name=None): | |||||
| # TODO considered_tokens and allowed can be figured out using state | # TODO considered_tokens and allowed can be figured out using state | ||||
| self.line = line | self.line = line | ||||
| self.column = column | self.column = column | ||||
| self.pos_in_stream = lex_pos | self.pos_in_stream = lex_pos | ||||
| self.state = state | self.state = state | ||||
| self._terminals_by_name = terminals_by_name | |||||
| self.allowed = allowed | self.allowed = allowed | ||||
| self.considered_tokens = considered_tokens | self.considered_tokens = considered_tokens | ||||
| self.token_history = token_history | |||||
| if isinstance(seq, bytes): | if isinstance(seq, bytes): | ||||
| _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace") | |||||
| self.char = seq[lex_pos:lex_pos + 1].decode("ascii", "backslashreplace") | |||||
| else: | else: | ||||
| _s = seq[lex_pos] | |||||
| self.char = seq[lex_pos] | |||||
| self._context = self.get_context(seq) | |||||
| message = "No terminal defined for %r at line %d col %d" % (_s, line, column) | |||||
| message += '\n\n' + self.get_context(seq) | |||||
| if allowed: | |||||
| message += '\nExpecting: %s\n' % allowed | |||||
| if token_history: | |||||
| message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history) | |||||
| super(UnexpectedCharacters, self).__init__() | |||||
| super(UnexpectedCharacters, self).__init__(message) | |||||
def __str__(self):
    """Build the lexer error text on demand.

    The text is the headline (offending character, line, column), then the
    stored context snippet, then, only when present, the expected-terminals
    list and the previous tokens.
    """
    pieces = [
        "No terminal defined for '%s' at line %d col %d" % (self.char, self.line, self.column),
        '\n\n' + self._context,
    ]
    if self.allowed:
        pieces.append(self._format_expected(self.allowed))
    if self.token_history:
        history = ', '.join(repr(t) for t in self.token_history)
        pieces.append('\nPrevious tokens: %s\n' % history)
    return ''.join(pieces)
| class UnexpectedToken(ParseError, UnexpectedInput): | class UnexpectedToken(ParseError, UnexpectedInput): | ||||
| @@ -163,7 +182,8 @@ class UnexpectedToken(ParseError, UnexpectedInput): | |||||
| see: :ref:`ParserPuppet`. | see: :ref:`ParserPuppet`. | ||||
| """ | """ | ||||
| def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None): | |||||
| def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, terminals_by_name=None, token_history=None): | |||||
| # TODO considered_rules and expected can be figured out using state | # TODO considered_rules and expected can be figured out using state | ||||
| self.line = getattr(token, 'line', '?') | self.line = getattr(token, 'line', '?') | ||||
| self.column = getattr(token, 'column', '?') | self.column = getattr(token, 'column', '?') | ||||
| @@ -171,23 +191,28 @@ class UnexpectedToken(ParseError, UnexpectedInput): | |||||
| self.state = state | self.state = state | ||||
| self.token = token | self.token = token | ||||
| self.expected = expected # XXX deprecate? `accepts` is better | |||||
| self.expected = expected # XXX deprecate? `accepts` is better | |||||
| self._accepts = NO_VALUE | |||||
| self.considered_rules = considered_rules | self.considered_rules = considered_rules | ||||
| self.puppet = puppet | self.puppet = puppet | ||||
| self._terminals_by_name = terminals_by_name | |||||
| self.token_history = token_history | self.token_history = token_history | ||||
| # TODO Only calculate `accepts()` when we need to display it to the user | |||||
| # This will improve performance when doing automatic error handling | |||||
| self.accepts = puppet and puppet.accepts() | |||||
| super(UnexpectedToken, self).__init__() | |||||
| message = ("Unexpected token %r at line %s, column %s.\n" | |||||
| "Expected one of: \n\t* %s\n" | |||||
| % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) | |||||
@property
def accepts(self):
    """Return the puppet's accepted set, computed on first access.

    ``self._accepts`` holds ``NO_VALUE`` until the first read; after that it
    caches ``self.puppet and self.puppet.accepts()``, so the result is the
    puppet itself when it is falsy (e.g. ``None``).
    """
    cached = self._accepts
    if cached is NO_VALUE:
        puppet = self.puppet
        cached = self._accepts = puppet and puppet.accepts()
    return cached
| def __str__(self): | |||||
| message = ("Unexpected token %r at line %s, column %s.\n%s" | |||||
| % (self.token, self.line, self.column, self._format_expected(self.accepts or self.expected))) | |||||
| if self.token_history: | if self.token_history: | ||||
| message += "Previous tokens: %r\n" % token_history | |||||
| message += "Previous tokens: %r\n" % self.token_history | |||||
| super(UnexpectedToken, self).__init__(message) | |||||
| return message | |||||
| class VisitError(LarkError): | class VisitError(LarkError): | ||||
| @@ -197,6 +222,7 @@ class VisitError(LarkError): | |||||
| - obj: the tree node or token it was processing when the exception was raised | - obj: the tree node or token it was processing when the exception was raised | ||||
| - orig_exc: the exception that cause it to fail | - orig_exc: the exception that cause it to fail | ||||
| """ | """ | ||||
| def __init__(self, rule, obj, orig_exc): | def __init__(self, rule, obj, orig_exc): | ||||
| self.obj = obj | self.obj = obj | ||||
| self.orig_exc = orig_exc | self.orig_exc = orig_exc | ||||
| @@ -204,5 +230,4 @@ class VisitError(LarkError): | |||||
| message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) | message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) | ||||
| super(VisitError, self).__init__(message) | super(VisitError, self).__init__(message) | ||||
| ###} | ###} | ||||
| @@ -416,7 +416,7 @@ class Lark(Serialize): | |||||
| self._callbacks, | self._callbacks, | ||||
| self.options, # Not all, but multiple attributes are used | self.options, # Not all, but multiple attributes are used | ||||
| ) | ) | ||||
| self.terminals = self.parser.lexer_conf.tokens | |||||
| self.terminals = self.parser.lexer_conf.terminals | |||||
| self._terminals_dict = {t.name: t for t in self.terminals} | self._terminals_dict = {t.name: t for t in self.terminals} | ||||
| return self | return self | ||||
| @@ -11,9 +11,10 @@ from copy import copy | |||||
| class Pattern(Serialize): | class Pattern(Serialize): | ||||
| def __init__(self, value, flags=()): | |||||
| def __init__(self, value, flags=(), raw=None): | |||||
| self.value = value | self.value = value | ||||
| self.flags = frozenset(flags) | self.flags = frozenset(flags) | ||||
| self.raw = raw | |||||
| def __repr__(self): | def __repr__(self): | ||||
| return repr(self.to_regexp()) | return repr(self.to_regexp()) | ||||
| @@ -92,6 +93,12 @@ class TerminalDef(Serialize): | |||||
| def __repr__(self): | def __repr__(self): | ||||
| return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) | return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) | ||||
def user_repr(self):
    """Return the text used to show this terminal to the user.

    A name starting with ``__`` marks a generated terminal; for those the
    pattern's ``raw`` text is returned when it is truthy. In every other
    case the terminal's own name is returned.
    """
    name = self.name
    if not name.startswith('__'):
        return name
    return self.pattern.raw or name
| class Token(Str): | class Token(Str): | ||||
| """A string with meta-information, that is produced by the lexer. | """A string with meta-information, that is produced by the lexer. | ||||
| @@ -283,7 +290,7 @@ class Lexer(object): | |||||
| class TraditionalLexer(Lexer): | class TraditionalLexer(Lexer): | ||||
| def __init__(self, conf): | def __init__(self, conf): | ||||
| terminals = list(conf.tokens) | |||||
| terminals = list(conf.terminals) | |||||
| assert all(isinstance(t, TerminalDef) for t in terminals), terminals | assert all(isinstance(t, TerminalDef) for t in terminals), terminals | ||||
| self.re = conf.re_module | self.re = conf.re_module | ||||
| @@ -310,6 +317,7 @@ class TraditionalLexer(Lexer): | |||||
| self.user_callbacks = conf.callbacks | self.user_callbacks = conf.callbacks | ||||
| self.g_regex_flags = conf.g_regex_flags | self.g_regex_flags = conf.g_regex_flags | ||||
| self.use_bytes = conf.use_bytes | self.use_bytes = conf.use_bytes | ||||
| self.terminals_by_name = conf.terminals_by_name | |||||
| self._mres = None | self._mres = None | ||||
| @@ -353,7 +361,7 @@ class TraditionalLexer(Lexer): | |||||
| allowed = {"<END-OF-FILE>"} | allowed = {"<END-OF-FILE>"} | ||||
| raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, | raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, | ||||
| allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], | allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], | ||||
| state=parser_state) | |||||
| state=parser_state, terminals_by_name=self.terminals_by_name) | |||||
| value, type_ = res | value, type_ = res | ||||
| @@ -394,14 +402,11 @@ class LexerState: | |||||
| class ContextualLexer(Lexer): | class ContextualLexer(Lexer): | ||||
| def __init__(self, conf, states, always_accept=()): | def __init__(self, conf, states, always_accept=()): | ||||
| terminals = list(conf.tokens) | |||||
| tokens_by_name = {} | |||||
| for t in terminals: | |||||
| assert t.name not in tokens_by_name, t | |||||
| tokens_by_name[t.name] = t | |||||
| terminals = list(conf.terminals) | |||||
| terminals_by_name = conf.terminals_by_name | |||||
| trad_conf = copy(conf) | trad_conf = copy(conf) | ||||
| trad_conf.tokens = terminals | |||||
| trad_conf.terminals = terminals | |||||
| lexer_by_tokens = {} | lexer_by_tokens = {} | ||||
| self.lexers = {} | self.lexers = {} | ||||
| @@ -411,15 +416,14 @@ class ContextualLexer(Lexer): | |||||
| lexer = lexer_by_tokens[key] | lexer = lexer_by_tokens[key] | ||||
| except KeyError: | except KeyError: | ||||
| accepts = set(accepts) | set(conf.ignore) | set(always_accept) | accepts = set(accepts) | set(conf.ignore) | set(always_accept) | ||||
| state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] | |||||
| lexer_conf = copy(trad_conf) | lexer_conf = copy(trad_conf) | ||||
| lexer_conf.tokens = state_tokens | |||||
| lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name] | |||||
| lexer = TraditionalLexer(lexer_conf) | lexer = TraditionalLexer(lexer_conf) | ||||
| lexer_by_tokens[key] = lexer | lexer_by_tokens[key] = lexer | ||||
| self.lexers[state] = lexer | self.lexers[state] = lexer | ||||
| assert trad_conf.tokens is terminals | |||||
| assert trad_conf.terminals is terminals | |||||
| self.root_lexer = TraditionalLexer(trad_conf) | self.root_lexer = TraditionalLexer(trad_conf) | ||||
| def make_lexer_state(self, text): | def make_lexer_state(self, text): | ||||
| @@ -435,9 +439,12 @@ class ContextualLexer(Lexer): | |||||
| except UnexpectedCharacters as e: | except UnexpectedCharacters as e: | ||||
| # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. | # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. | ||||
| # This tests the input against the global context, to provide a nicer error. | # This tests the input against the global context, to provide a nicer error. | ||||
| token = self.root_lexer.next_token(lexer_state, parser_state) | |||||
| raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[lexer_state.last_token]) | |||||
| try: | |||||
| last_token = lexer_state.last_token # Save last_token. Calling root_lexer.next_token will change this to the wrong token | |||||
| token = self.root_lexer.next_token(lexer_state, parser_state) | |||||
| raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_name) | |||||
| except UnexpectedCharacters: | |||||
| raise e # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set. | |||||
| class LexerThread: | class LexerThread: | ||||
| """A thread that ties a lexer instance and a lexer state, to be used by the parser""" | """A thread that ties a lexer instance and a lexer state, to be used by the parser""" | ||||
| @@ -454,9 +454,9 @@ def _literal_to_pattern(literal): | |||||
| if literal.type == 'STRING': | if literal.type == 'STRING': | ||||
| s = s.replace('\\\\', '\\') | s = s.replace('\\\\', '\\') | ||||
| return PatternStr(s, flags) | |||||
| return PatternStr(s, flags, raw=literal.value) | |||||
| elif literal.type == 'REGEXP': | elif literal.type == 'REGEXP': | ||||
| return PatternRE(s, flags) | |||||
| return PatternRE(s, flags, raw=literal.value) | |||||
| else: | else: | ||||
| assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]' | assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]' | ||||
| @@ -170,7 +170,7 @@ CYK_FrontEnd = NotImplemented | |||||
| class EarleyRegexpMatcher: | class EarleyRegexpMatcher: | ||||
| def __init__(self, lexer_conf): | def __init__(self, lexer_conf): | ||||
| self.regexps = {} | self.regexps = {} | ||||
| for t in lexer_conf.tokens: | |||||
| for t in lexer_conf.terminals: | |||||
| if t.priority != 1: | if t.priority != 1: | ||||
| raise GrammarError("Dynamic Earley doesn't support weights on terminals", t, t.priority) | raise GrammarError("Dynamic Earley doesn't support weights on terminals", t, t.priority) | ||||
| regexp = t.pattern.to_regexp() | regexp = t.pattern.to_regexp() | ||||
| @@ -302,7 +302,7 @@ class Parser: | |||||
| # this column. Find the item for the start_symbol, which is the root of the SPPF tree. | # this column. Find the item for the start_symbol, which is the root of the SPPF tree. | ||||
| solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] | solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] | ||||
| if not solutions: | if not solutions: | ||||
| expected_terminals = [t.expect for t in to_scan] | |||||
| expected_terminals = [t.expect.name for t in to_scan] | |||||
| raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan)) | raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan)) | ||||
| if self.debug: | if self.debug: | ||||
| @@ -36,7 +36,7 @@ class LALR_Parser(Serialize): | |||||
| return self.parser.parse(*args) | return self.parser.parse(*args) | ||||
| class ParseConf: | |||||
| class ParseConf(object): | |||||
| __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states' | __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states' | ||||
| def __init__(self, parse_table, callbacks, start): | def __init__(self, parse_table, callbacks, start): | ||||
| @@ -50,7 +50,7 @@ class ParseConf: | |||||
| self.start = start | self.start = start | ||||
| class ParserState: | |||||
| class ParserState(object): | |||||
| __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack' | __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack' | ||||
| def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None): | def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None): | ||||
| @@ -124,7 +124,7 @@ class ParserState: | |||||
| if is_end and state_stack[-1] == end_state: | if is_end and state_stack[-1] == end_state: | ||||
| return value_stack[-1] | return value_stack[-1] | ||||
| class _Parser: | |||||
| class _Parser(object): | |||||
| def __init__(self, parse_table, callbacks, debug=False): | def __init__(self, parse_table, callbacks, debug=False): | ||||
| self.parse_table = parse_table | self.parse_table = parse_table | ||||
| self.callbacks = callbacks | self.callbacks = callbacks | ||||
| @@ -13,6 +13,7 @@ logger.setLevel(logging.CRITICAL) | |||||
| Py36 = (sys.version_info[:2] >= (3, 6)) | Py36 = (sys.version_info[:2] >= (3, 6)) | ||||
| NO_VALUE = object() | |||||
| def classify(seq, key=None, value=None): | def classify(seq, key=None, value=None): | ||||
| d = {} | d = {} | ||||