diff --git a/lark/exceptions.py b/lark/exceptions.py index 92ac019..1d63561 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -123,6 +123,7 @@ class UnexpectedEOF(ParseError, UnexpectedInput): class UnexpectedCharacters(LexError, UnexpectedInput): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): + # TODO considered_tokens and allowed can be figured out using state self.line = line self.column = column self.pos_in_stream = lex_pos @@ -154,6 +155,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): see: :ref:`ParserPuppet`. """ def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None): + # TODO considered_rules and expected can be figured out using state self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') self.pos_in_stream = getattr(token, 'pos_in_stream', None) diff --git a/lark/lexer.py b/lark/lexer.py index 8be8acd..bda8497 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -353,7 +353,7 @@ class TraditionalLexer(Lexer): allowed = {""} raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token], - state=(parser_state and parser_state.position)) + state=parser_state) value, type_ = res @@ -436,7 +436,7 @@ class ContextualLexer(Lexer): # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context. # This tests the input against the global context, to provide a nicer error. 
token = self.root_lexer.next_token(lexer_state, parser_state) - raise UnexpectedToken(token, e.allowed, state=parser_state.position, token_history=[lexer_state.last_token]) + raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[lexer_state.last_token]) class LexerThread: diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 337ddeb..abc0fba 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -173,9 +173,6 @@ class Earley(WithLexer): tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class) - def make_lexer(self, text): - return WithLexer.make_lexer(self, text).lex(None) - def match(self, term, token): return term.name == token.type diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index aa18371..e4a220a 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -146,7 +146,7 @@ class Parser: column.add(new_item) items.append(new_item) - def _parse(self, stream, columns, to_scan, start_symbol=None): + def _parse(self, lexer, columns, to_scan, start_symbol=None): def is_quasi_complete(item): if item.is_complete: return True @@ -245,7 +245,7 @@ class Parser: if not next_set and not next_to_scan: expect = {i.expect.name for i in to_scan} - raise UnexpectedToken(token, expect, considered_rules = set(to_scan)) + raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.expect for i in to_scan)) return next_to_scan @@ -261,20 +261,24 @@ class Parser: # Completions will be added to the SPPF tree, and predictions will be recursively # processed down to terminals/empty nodes to be added to the scanner for the next # step. 
+ expects = {i.expect for i in to_scan} i = 0 - for token in stream: + for token in lexer.lex(expects): self.predict_and_complete(i, to_scan, columns, transitives) to_scan = scan(i, token, to_scan) i += 1 + expects.clear() + expects |= {i.expect for i in to_scan} + self.predict_and_complete(i, to_scan, columns, transitives) ## Column is now the final column in the parse. assert i == len(columns)-1 return to_scan - def parse(self, stream, start): + def parse(self, lexer, start): assert start, start start_symbol = NonTerminal(start) @@ -291,7 +295,7 @@ class Parser: else: columns[0].add(item) - to_scan = self._parse(stream, columns, to_scan, start_symbol) + to_scan = self._parse(lexer, columns, to_scan, start_symbol) # If the parse was successful, the start # symbol should have been completed in the last step of the Earley cycle, and will be in @@ -299,7 +303,7 @@ class Parser: solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] if not solutions: expected_terminals = [t.expect for t in to_scan] - raise UnexpectedEOF(expected_terminals, state={i.s for i in to_scan}) + raise UnexpectedEOF(expected_terminals, state=frozenset(i.expect for i in to_scan)) if self.debug: from .earley_forest import ForestToPyDotVisitor diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index e8c4432..3d006e7 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -3,7 +3,7 @@ # Author: Erez Shinan (2017) # Email : erezshin@gmail.com from copy import deepcopy, copy -from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken +from ..exceptions import UnexpectedInput, UnexpectedToken from ..lexer import Token from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable @@ -62,6 +62,12 @@ class ParserState: def position(self): return self.state_stack[-1] + # Necessary for match_examples() to work + def __eq__(self, other): + if not isinstance(other, 
ParserState): + return False + return self.position == other.position + def __copy__(self): return type(self)( self.parse_conf, @@ -86,7 +92,7 @@ class ParserState: action, arg = states[state][token.type] except KeyError: expected = {s for s in states[state].keys() if s.isupper()} - raise UnexpectedToken(token, expected, state=state, puppet=None) + raise UnexpectedToken(token, expected, state=self, puppet=None) assert arg != end_state diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index ae98f0f..cf9b6ec 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -113,7 +113,8 @@ class Parser(BaseParser): del delayed_matches[i+1] # No longer needed, so unburden memory if not next_set and not delayed_matches and not next_to_scan: - raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, set(to_scan), state={i.s for i in next_to_scan}) + raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, + set(to_scan), state=frozenset(i.expect for i in to_scan)) return next_to_scan diff --git a/lark/tree_matcher.py b/lark/tree_matcher.py index 8c1f17a..c9d9fde 100644 --- a/lark/tree_matcher.py +++ b/lark/tree_matcher.py @@ -69,6 +69,14 @@ def parse_rulename(s): return name, args + +class ChildrenLexer: + def __init__(self, children): + self.children = children + + def lex(self, parser_state): + return self.children + class TreeMatcher: """Match the elements of a tree node, based on an ontology provided by a Lark grammar. 
@@ -173,6 +181,6 @@ class TreeMatcher: self._parser_cache[rulename] = parser # find a full derivation - unreduced_tree = parser.parse(tree.children, rulename) + unreduced_tree = parser.parse(ChildrenLexer(tree.children), rulename) assert unreduced_tree.data == rulename return unreduced_tree diff --git a/tests/test_parser.py b/tests/test_parser.py index edb4b26..863bf5d 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2342,7 +2342,7 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(a.line, 1) self.assertEqual(b.line, 2) - @unittest.skipIf(LEXER=='standard' and PARSER!='lalr', "Puppet error handling only works with LALR for now") + @unittest.skipIf(PARSER=='cyk', "match_examples() not supported for CYK") def test_match_examples(self): p = _Lark(r""" start: "a" "b" "c" @@ -2355,11 +2355,15 @@ def _make_parser_test(LEXER, PARSER): return u.match_examples(p.parse, { 0: ['abe'], 1: ['ab'], + 2: ['cbc'], }) assert False assert match_error("abe") == 0 assert match_error("ab") == 1 + assert match_error("bbc") == 2 + assert match_error("cbc") == 2 + self.assertEqual( match_error("dbc"), 2 ) @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')