Merge branch 'MegaIng-better-terminals'

5 years ago · 36a7b050c1
--- a/lark-stubs/lexer.pyi
+++ b/lark-stubs/lexer.pyi
@@ -12,6 +12,7 @@ _T = TypeVar('_T')
 class Pattern(ABC):
    value: str
    flags: Collection[str]
    raw: str
    def __init__(self, value: str, flags: Collection[str] = ...):
        ...
@@ -73,6 +74,8 @@ class TerminalDef:
    def __init__(self, name: str, pattern: Pattern, priority: int = ...):
        ...
    def user_repr(self) -> str: ...
 class Token(str):
--- a/lark/common.py
+++ b/lark/common.py
@@ -1,3 +1,5 @@
 from warnings import warn
 from .utils import Serialize
 from .lexer import TerminalDef
@@ -5,11 +7,13 @@ from .lexer import TerminalDef
 class LexerConf(Serialize):
    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type'
    __serialize_fields__ = 'terminals', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type'
    __serialize_namespace__ = TerminalDef,
    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
        self.tokens = tokens    # TODO should be terminals
    def __init__(self, terminals, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
        self.terminals = terminals
        self.terminals_by_name = {t.name: t for t in self.terminals}
        assert len(self.terminals) == len(self.terminals_by_name)
        self.ignore = ignore
        self.postlex = postlex
        self.callbacks = callbacks or {}
@@ -17,9 +21,17 @@ class LexerConf(Serialize):
        self.re_module = re_module
        self.skip_validation = skip_validation
        self.use_bytes = use_bytes
        self.lexer_type = None
    @property
    def tokens(self):
        """Deprecated alias for ``terminals``.

        Emits a ``DeprecationWarning`` and returns ``self.terminals``.
        """
        # stacklevel=2 attributes the warning to the code that accessed
        # ``.tokens`` instead of to this property body, so users can locate
        # the call site they need to migrate.
        warn("LexerConf.tokens is deprecated. Use LexerConf.terminals instead", DeprecationWarning, stacklevel=2)
        return self.terminals
    def _deserialize(self):
        """Rebuild the name -> terminal lookup table from ``self.terminals``."""
        by_name = {}
        for term in self.terminals:
            # A later terminal with the same name overwrites an earlier one.
            by_name[term.name] = term
        self.terminals_by_name = by_name
 class ParserConf(Serialize):
    __serialize_fields__ = 'rules', 'start', 'parser_type'
--- a/lark/exceptions.py
+++ b/lark/exceptions.py
@@ -1,4 +1,5 @@
 from .utils import STRING_TYPE, logger
 from .utils import STRING_TYPE, logger, NO_VALUE
 ###{standalone
@@ -39,6 +40,7 @@ class UnexpectedInput(LarkError):
    After catching one of these exceptions, you may call the following helper methods to create a nicer error message.
    """
    pos_in_stream = None
    _terminals_by_name = None
    def get_context(self, text, span=40):
        """Returns a pretty string pinpointing the error in the text,
@@ -95,7 +97,7 @@ class UnexpectedInput(LarkError):
                    if ut.state == self.state:
                        if use_accepts and hasattr(self, 'accepts') and ut.accepts != self.accepts:
                            logger.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" %
                                        (self.state, self.accepts, ut.accepts, i, j))
                                         (self.state, self.accepts, ut.accepts, i, j))
                            continue
                        try:
                            if ut.token == self.token:  # Try exact match first
@@ -116,44 +118,61 @@ class UnexpectedInput(LarkError):
        return candidate[0]
    def _format_expected(self, expected):
        """Render the "Expected one of" section of an error message.

        When a name -> terminal mapping is available, each expected name is
        replaced by that terminal's ``user_repr()``. Names that have no entry
        in the mapping (for example a placeholder such as ``"<END-OF-FILE>"``)
        are shown as-is instead of raising ``KeyError`` while the error
        message is being built.

        :param expected: iterable of expected terminal names (strings)
        :return: the formatted, newline-terminated string
        """
        by_name = self._terminals_by_name
        if by_name:
            expected = [by_name[t_name].user_repr() if t_name in by_name else t_name
                        for t_name in expected]
        return "Expected one of: \n\t* %s\n" % '\n\t* '.join(expected)
 class UnexpectedEOF(ParseError, UnexpectedInput):
    def __init__(self, expected, state=None):
    def __init__(self, expected, state=None, terminals_by_name=None):
        self.expected = expected
        self.state = state
        from .lexer import Token
        self.token = Token("<EOF>", "") #, line=-1, column=-1, pos_in_stream=-1)
        self.token = Token("<EOF>", "")  # , line=-1, column=-1, pos_in_stream=-1)
        self.pos_in_stream = -1
        self.line = -1
        self.column = -1
        self._terminals_by_name = terminals_by_name
        message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected))
        super(UnexpectedEOF, self).__init__(message)
        super(UnexpectedEOF, self).__init__()
    def __str__(self):
        """Return the end-of-input message followed by the formatted expected list."""
        return "Unexpected end-of-input. " + self._format_expected(self.expected)
 class UnexpectedCharacters(LexError, UnexpectedInput):
    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
    def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None,
                 terminals_by_name=None):
        # TODO considered_tokens and allowed can be figured out using state
        self.line = line
        self.column = column
        self.pos_in_stream = lex_pos
        self.state = state
        self._terminals_by_name = terminals_by_name
        self.allowed = allowed
        self.considered_tokens = considered_tokens
        self.token_history = token_history
        if isinstance(seq, bytes):
            _s = seq[lex_pos:lex_pos+1].decode("ascii", "backslashreplace")
            self.char = seq[lex_pos:lex_pos + 1].decode("ascii", "backslashreplace")
        else:
            _s = seq[lex_pos]
            self.char = seq[lex_pos]
        self._context = self.get_context(seq)
        message = "No terminal defined for %r at line %d col %d" % (_s, line, column)
        message += '\n\n' + self.get_context(seq)
        if allowed:
            message += '\nExpecting: %s\n' % allowed
        if token_history:
            message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history)
        super(UnexpectedCharacters, self).__init__()
        super(UnexpectedCharacters, self).__init__(message)
    def __str__(self):
        """Build the error message from the stored character, position and context.

        The "expected" section and the "Previous tokens" section are appended
        only when ``allowed`` / ``token_history`` are truthy.
        """
        pieces = [
            "No terminal defined for '%s' at line %d col %d" % (self.char, self.line, self.column),
            '\n\n' + self._context,
        ]
        if self.allowed:
            pieces.append(self._format_expected(self.allowed))
        if self.token_history:
            history = ', '.join(repr(t) for t in self.token_history)
            pieces.append('\nPrevious tokens: %s\n' % history)
        return ''.join(pieces)
 class UnexpectedToken(ParseError, UnexpectedInput):
@@ -163,7 +182,8 @@ class UnexpectedToken(ParseError, UnexpectedInput):
    see: :ref:`ParserPuppet`.
    """
    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None):
    def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, terminals_by_name=None, token_history=None):
        # TODO considered_rules and expected can be figured out using state
        self.line = getattr(token, 'line', '?')
        self.column = getattr(token, 'column', '?')
@@ -171,23 +191,28 @@ class UnexpectedToken(ParseError, UnexpectedInput):
        self.state = state
        self.token = token
        self.expected = expected     # XXX deprecate? `accepts` is better
        self.expected = expected  # XXX deprecate? `accepts` is better
        self._accepts = NO_VALUE
        self.considered_rules = considered_rules
        self.puppet = puppet
        self._terminals_by_name = terminals_by_name
        self.token_history = token_history
        # TODO Only calculate `accepts()` when we need to display it to the user
        # This will improve performance when doing automatic error handling
        self.accepts = puppet and puppet.accepts()
        super(UnexpectedToken, self).__init__()
        message = ("Unexpected token %r at line %s, column %s.\n"
                   "Expected one of: \n\t* %s\n"
                   % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected)))
    @property
    def accepts(self):
        """Lazily computed result of ``self.puppet.accepts()``.

        The value is computed on first access and stored in ``self._accepts``;
        when ``self.puppet`` is falsy, the puppet itself is stored and returned.
        """
        cached = self._accepts
        if cached is NO_VALUE:
            # First access: compute once and remember the result.
            cached = self._accepts = self.puppet and self.puppet.accepts()
        return cached
    def __str__(self):
        message = ("Unexpected token %r at line %s, column %s.\n%s"
                   % (self.token, self.line, self.column, self._format_expected(self.accepts or self.expected)))
        if self.token_history:
            message += "Previous tokens: %r\n" % token_history
            message += "Previous tokens: %r\n" % self.token_history
        super(UnexpectedToken, self).__init__(message)
        return message
 class VisitError(LarkError):
@@ -197,6 +222,7 @@ class VisitError(LarkError):
    - obj: the tree node or token it was processing when the exception was raised
    - orig_exc: the exception that cause it to fail
    """
    def __init__(self, rule, obj, orig_exc):
        self.obj = obj
        self.orig_exc = orig_exc
@@ -204,5 +230,4 @@ class VisitError(LarkError):
        message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
        super(VisitError, self).__init__(message)
 ###}
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -416,7 +416,7 @@ class Lark(Serialize):
            self._callbacks,
            self.options,  # Not all, but multiple attributes are used
        )
        self.terminals = self.parser.lexer_conf.tokens
        self.terminals = self.parser.lexer_conf.terminals
        self._terminals_dict = {t.name: t for t in self.terminals}
        return self
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -11,9 +11,10 @@ from copy import copy
 class Pattern(Serialize):
    def __init__(self, value, flags=()):
    def __init__(self, value, flags=(), raw=None):
        self.value = value
        self.flags = frozenset(flags)
        self.raw = raw
    def __repr__(self):
        return repr(self.to_regexp())
@@ -92,6 +93,12 @@ class TerminalDef(Serialize):
    def __repr__(self):
        return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
    def user_repr(self):
        """Return the user-facing representation of this terminal.

        Terminals whose name starts with ``__`` are treated as generated: for
        those, the pattern's ``raw`` text is returned when it is truthy,
        otherwise the name. All other terminals are shown by name.
        """
        if not self.name.startswith('__'):
            return self.name
        # Generated terminal: prefer the pattern's raw text when available.
        return self.pattern.raw or self.name
 class Token(Str):
    """A string with meta-information, that is produced by the lexer.
@@ -283,7 +290,7 @@ class Lexer(object):
 class TraditionalLexer(Lexer):
    def __init__(self, conf):
        terminals = list(conf.tokens)
        terminals = list(conf.terminals)
        assert all(isinstance(t, TerminalDef) for t in terminals), terminals
        self.re = conf.re_module
@@ -310,6 +317,7 @@ class TraditionalLexer(Lexer):
        self.user_callbacks = conf.callbacks
        self.g_regex_flags = conf.g_regex_flags
        self.use_bytes = conf.use_bytes
        self.terminals_by_name = conf.terminals_by_name
        self._mres = None
@@ -353,7 +361,7 @@ class TraditionalLexer(Lexer):
                    allowed = {"<END-OF-FILE>"}
                raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                           allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
                                           state=parser_state)
                                           state=parser_state, terminals_by_name=self.terminals_by_name)
            value, type_ = res
@@ -394,14 +402,11 @@ class LexerState:
 class ContextualLexer(Lexer):
    def __init__(self, conf, states, always_accept=()):
        terminals = list(conf.tokens)
        tokens_by_name = {}
        for t in terminals:
            assert t.name not in tokens_by_name, t
            tokens_by_name[t.name] = t
        terminals = list(conf.terminals)
        terminals_by_name = conf.terminals_by_name
        trad_conf = copy(conf)
        trad_conf.tokens = terminals
        trad_conf.terminals = terminals
        lexer_by_tokens = {}
        self.lexers = {}
@@ -411,15 +416,14 @@ class ContextualLexer(Lexer):
                lexer = lexer_by_tokens[key]
            except KeyError:
                accepts = set(accepts) | set(conf.ignore) | set(always_accept)
                state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
                lexer_conf = copy(trad_conf)
                lexer_conf.tokens = state_tokens
                lexer_conf.terminals = [terminals_by_name[n] for n in accepts if n in terminals_by_name]
                lexer = TraditionalLexer(lexer_conf)
                lexer_by_tokens[key] = lexer
            self.lexers[state] = lexer
        assert trad_conf.tokens is terminals
        assert trad_conf.terminals is terminals
        self.root_lexer = TraditionalLexer(trad_conf)
    def make_lexer_state(self, text):
@@ -435,9 +439,12 @@ class ContextualLexer(Lexer):
        except UnexpectedCharacters as e:
            # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
            # This tests the input against the global context, to provide a nicer error.
            token = self.root_lexer.next_token(lexer_state, parser_state)
            raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[lexer_state.last_token])
            try:
                last_token = lexer_state.last_token  # Save last_token. Calling root_lexer.next_token will change this to the wrong token
                token = self.root_lexer.next_token(lexer_state, parser_state)
                raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[last_token], terminals_by_name=self.root_lexer.terminals_by_name)
            except UnexpectedCharacters:
                raise e  # Raise the original UnexpectedCharacters. The root lexer raises it with the wrong expected set.
 class LexerThread:
    """A thread that ties a lexer instance and a lexer state, to be used by the parser"""
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -454,9 +454,9 @@ def _literal_to_pattern(literal):
    if literal.type == 'STRING':
        s = s.replace('\\\\', '\\')
        return PatternStr(s, flags)
        return PatternStr(s, flags, raw=literal.value)
    elif literal.type == 'REGEXP':
        return PatternRE(s, flags)
        return PatternRE(s, flags, raw=literal.value)
    else:
        assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]'
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -170,7 +170,7 @@ CYK_FrontEnd = NotImplemented
 class EarleyRegexpMatcher:
    def __init__(self, lexer_conf):
        self.regexps = {}
        for t in lexer_conf.tokens:
        for t in lexer_conf.terminals:
            if t.priority != 1:
                raise GrammarError("Dynamic Earley doesn't support weights on terminals", t, t.priority)
            regexp = t.pattern.to_regexp()
--- a/lark/parsers/earley.py
+++ b/lark/parsers/earley.py
@@ -302,7 +302,7 @@ class Parser:
        # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
        solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
        if not solutions:
            expected_terminals = [t.expect for t in to_scan]
            expected_terminals = [t.expect.name for t in to_scan]
            raise UnexpectedEOF(expected_terminals, state=frozenset(i.s for i in to_scan))
        if self.debug:
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -36,7 +36,7 @@ class LALR_Parser(Serialize):
        return self.parser.parse(*args)
 class ParseConf:
 class ParseConf(object):
    __slots__ = 'parse_table', 'callbacks', 'start', 'start_state', 'end_state', 'states'
    def __init__(self, parse_table, callbacks, start):
@@ -50,7 +50,7 @@ class ParseConf:
        self.start = start
 class ParserState:
 class ParserState(object):
    __slots__ = 'parse_conf', 'lexer', 'state_stack', 'value_stack'
    def __init__(self, parse_conf, lexer, state_stack=None, value_stack=None):
@@ -124,7 +124,7 @@ class ParserState:
                if is_end and state_stack[-1] == end_state:
                    return value_stack[-1]
 class _Parser:
 class _Parser(object):
    def __init__(self, parse_table, callbacks, debug=False):
        self.parse_table = parse_table
        self.callbacks = callbacks
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -13,6 +13,7 @@ logger.setLevel(logging.CRITICAL)
 Py36 = (sys.version_info[:2] >= (3, 6))
 NO_VALUE = object()
 def classify(seq, key=None, value=None):
    d = {}