Note: This refactor opens the door for implementing a ContextualLexer for Earley. But unlike the existing one for LALR, it will have to be computed at runtime, rather than ahead of time.
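
The core idea of the refactor: parsers no longer receive a raw token stream, they receive any object exposing a lex(state) method that returns an iterable of tokens (see lexer.lex(expects) in the Earley parser's _parse, and ChildrenLexer in the tree matcher, both below). A minimal sketch of an adapter satisfying that contract, inferred from the diff rather than copied from lark:

class TokenListLexer:
    "Hypothetical adapter: serves a pre-made token list through the new interface."
    def __init__(self, tokens):
        self.tokens = tokens

    def lex(self, parser_state):
        # The parser passes whatever state it has (None, a ParserState, or the
        # set of currently expected terminals); a smarter lexer could
        # specialize on it, but a plain token list can just ignore it.
        return iter(self.tokens)
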
@@ -123,6 +123,7 @@ class UnexpectedEOF(ParseError, UnexpectedInput):
 class UnexpectedCharacters(LexError, UnexpectedInput):
     def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
+        # TODO considered_tokens and allowed can be figured out using state
         self.line = line
         self.column = column
         self.pos_in_stream = lex_pos
@@ -154,6 +155,7 @@ class UnexpectedToken(ParseError, UnexpectedInput):
     see: :ref:`ParserPuppet`.
     """
     def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None):
+        # TODO considered_rules and expected can be figured out using state
         self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')
         self.pos_in_stream = getattr(token, 'pos_in_stream', None)
@@ -353,7 +353,7 @@ class TraditionalLexer(Lexer):
                     allowed = {"<END-OF-FILE>"}
                 raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                            allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
-                                           state=(parser_state and parser_state.position))
+                                           state=parser_state)

             value, type_ = res
@@ -436,7 +436,7 @@ class ContextualLexer(Lexer):
             # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
             # This tests the input against the global context, to provide a nicer error.
             token = self.root_lexer.next_token(lexer_state, parser_state)
-            raise UnexpectedToken(token, e.allowed, state=parser_state.position, token_history=[lexer_state.last_token])
+            raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[lexer_state.last_token])


 class LexerThread:
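
For context, the hunk above sits inside ContextualLexer's token loop, which reads roughly as follows (a sketch, not verbatim lark code):

def lex(self, lexer_state, parser_state):
    try:
        while True:
            # Pick the lexer specialized for the current parser state...
            lexer = self.lexers[parser_state.position]
            yield lexer.next_token(lexer_state, parser_state)
    except UnexpectedCharacters as e:
        # ...and on failure, retry with the root (global) lexer: if it *can*
        # match a token, the terminal exists but is not valid in this context,
        # so raise UnexpectedToken instead, now carrying the full parser_state.
        token = self.root_lexer.next_token(lexer_state, parser_state)
        raise UnexpectedToken(token, e.allowed, state=parser_state,
                              token_history=[lexer_state.last_token])
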
@@ -173,9 +173,6 @@ class Earley(WithLexer):
         tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
         self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)

-    def make_lexer(self, text):
-        return WithLexer.make_lexer(self, text).lex(None)
-
     def match(self, term, token):
         return term.name == token.type
@@ -146,7 +146,7 @@ class Parser:
                 column.add(new_item)
                 items.append(new_item)

-    def _parse(self, stream, columns, to_scan, start_symbol=None):
+    def _parse(self, lexer, columns, to_scan, start_symbol=None):
         def is_quasi_complete(item):
             if item.is_complete:
                 return True
@@ -245,7 +245,7 @@ class Parser:
             if not next_set and not next_to_scan:
                 expect = {i.expect.name for i in to_scan}
-                raise UnexpectedToken(token, expect, considered_rules = set(to_scan))
+                raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.expect for i in to_scan))

             return next_to_scan
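
Here state is built as a frozenset of the expected terminals rather than a plain set, presumably because frozensets are hashable and compare by content, which is exactly what match_examples() needs when deciding whether two failures happened in "the same place". For example:

a = frozenset(['NAME', 'NUMBER'])
b = frozenset(['NUMBER', 'NAME'])
assert a == b                    # content equality, order-independent
assert hash(a) == hash(b)        # hashable, unlike a plain set
cache = {a: 'same error state'}  # usable as a dict key
assert cache[b] == 'same error state'
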
@@ -261,20 +261,24 @@ class Parser:
         # Completions will be added to the SPPF tree, and predictions will be recursively
         # processed down to terminals/empty nodes to be added to the scanner for the next
         # step.
+        expects = {i.expect for i in to_scan}
         i = 0
-        for token in stream:
+        for token in lexer.lex(expects):
             self.predict_and_complete(i, to_scan, columns, transitives)

             to_scan = scan(i, token, to_scan)
             i += 1
+            expects.clear()
+            expects |= {i.expect for i in to_scan}

         self.predict_and_complete(i, to_scan, columns, transitives)

         ## Column is now the final column in the parse.
         assert i == len(columns)-1
         return to_scan

-    def parse(self, stream, start):
+    def parse(self, lexer, start):
         assert start, start
         start_symbol = NonTerminal(start)
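
Two details worth noting here. First, expects is updated in place (clear() plus |=) rather than rebound, because lexer.lex(expects) is called only once: the generator keeps a reference to the set object, so in-place mutation is what lets each lexing step see the current expectations. Second, this is the hook the note at the top refers to: a contextual lexer for Earley could filter its terminals against expects at every step, computed at runtime. A purely illustrative sketch (next_token_from is a hypothetical method, not a lark API):

class RuntimeContextualLexer:
    def __init__(self, scanner):
        self.scanner = scanner

    def lex(self, expects):
        # `expects` is the same set object the parser mutates between steps,
        # so each iteration sees the fresh expectations.
        while True:
            allowed = {t.name for t in expects}
            token = self.scanner.next_token_from(allowed)  # hypothetical call
            if token is None:
                return
            yield token
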
@@ -291,7 +295,7 @@ class Parser:
             else:
                 columns[0].add(item)

-        to_scan = self._parse(stream, columns, to_scan, start_symbol)
+        to_scan = self._parse(lexer, columns, to_scan, start_symbol)

         # If the parse was successful, the start
         # symbol should have been completed in the last step of the Earley cycle, and will be in
@@ -299,7 +303,7 @@ class Parser:
         solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
         if not solutions:
             expected_terminals = [t.expect for t in to_scan]
-            raise UnexpectedEOF(expected_terminals, state={i.s for i in to_scan})
+            raise UnexpectedEOF(expected_terminals, state=frozenset(i.expect for i in to_scan))

         if self.debug:
             from .earley_forest import ForestToPyDotVisitor
@@ -3,7 +3,7 @@
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com

 from copy import deepcopy, copy
-from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
+from ..exceptions import UnexpectedInput, UnexpectedToken
 from ..lexer import Token
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
@@ -62,6 +62,12 @@ class ParserState:
     def position(self):
         return self.state_stack[-1]

+    # Necessary for match_examples() to work
+    def __eq__(self, other):
+        if not isinstance(other, ParserState):
+            return False
+        return self.position == other.position
+
     def __copy__(self):
         return type(self)(
             self.parse_conf,
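
Comparing by position means two distinct ParserState objects count as equal whenever the parser sits in the same LALR state, which is what lets match_examples() decide that an example input fails in the same place as the original error. A simplified sketch of that comparison (the real method is UnexpectedInput.match_examples; this is not verbatim):

from lark.exceptions import UnexpectedInput

def match_examples_sketch(parse_fn, caught, examples):
    # caught: the UnexpectedInput we want to classify.
    # examples: maps a label to sample inputs known to trigger that error.
    for label, strings in examples.items():
        for s in strings:
            try:
                parse_fn(s)
            except UnexpectedInput as e:
                # ParserState.__eq__ (LALR) and frozenset equality (Earley)
                # make independently raised errors comparable.
                if e.state == caught.state:
                    return label
    return None
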
@@ -86,7 +92,7 @@ class ParserState:
             action, arg = states[state][token.type]
         except KeyError:
             expected = {s for s in states[state].keys() if s.isupper()}
-            raise UnexpectedToken(token, expected, state=state, puppet=None)
+            raise UnexpectedToken(token, expected, state=self, puppet=None)

         assert arg != end_state
@@ -113,7 +113,8 @@ class Parser:
             del delayed_matches[i+1]    # No longer needed, so unburden memory

             if not next_set and not delayed_matches and not next_to_scan:
-                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, set(to_scan), state={i.s for i in next_to_scan})
+                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan},
+                                           set(to_scan), state=frozenset(i.expect for i in to_scan))

             return next_to_scan
@@ -69,6 +69,14 @@ def parse_rulename(s):
     return name, args

+
+class ChildrenLexer:
+    def __init__(self, children):
+        self.children = children
+
+    def lex(self, parser_state):
+        return self.children
+
 class TreeMatcher:
     """Match the elements of a tree node, based on an ontology
     provided by a Lark grammar.
@@ -173,6 +181,6 @@ class TreeMatcher:
             self._parser_cache[rulename] = parser

         # find a full derivation
-        unreduced_tree = parser.parse(tree.children, rulename)
+        unreduced_tree = parser.parse(ChildrenLexer(tree.children), rulename)
         assert unreduced_tree.data == rulename
         return unreduced_tree
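
Note that ChildrenLexer.lex ignores its argument, so it satisfies the new interface regardless of what the parser passes: None, a ParserState, or Earley's expects set. Any iterable-returning lex method will do, e.g.:

from lark.lexer import Token

lexer = ChildrenLexer([Token('A', 'a'), Token('B', 'b')])
assert list(lexer.lex(None)) == list(lexer.lex({'ignored'}))  # argument unused
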
@@ -2342,7 +2342,7 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(a.line, 1)
             self.assertEqual(b.line, 2)

-        @unittest.skipIf(LEXER=='standard' and PARSER!='lalr', "Puppet error handling only works with LALR for now")
+        @unittest.skipIf(PARSER=='cyk', "match_examples() not supported for CYK")
         def test_match_examples(self):
             p = _Lark(r"""
                 start: "a" "b" "c"
@@ -2355,11 +2355,15 @@ def _make_parser_test(LEXER, PARSER):
                     return u.match_examples(p.parse, {
                         0: ['abe'],
                         1: ['ab'],
+                        2: ['cbc'],
                     })
                 assert False

             assert match_error("abe") == 0
             assert match_error("ab") == 1
+            assert match_error("bbc") == 2
+            assert match_error("cbc") == 2
+            self.assertEqual( match_error("dbc"), 2 )

         @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
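
Outside the test suite, the same pattern now works for Earley parsers too, since their errors carry a comparable state. Roughly (the grammar and inputs here just mirror the test above):

from lark import Lark, UnexpectedInput

parser = Lark('start: "a" "b" "c"', parser='earley')
try:
    parser.parse('ab')
except UnexpectedInput as u:
    label = u.match_examples(parser.parse, {
        'missing "c"': ['ab'],
        'wrong start': ['bbc', 'cbc'],
    })
    print('error class:', label)  # expected: 'missing "c"'
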