Note: This refactor opens the door for implementing a ContextualLexer for Earley. But unlike the existing one for LALR, it will have to be computed at runtime, rather than ahead of time.
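
The core idea of the refactor: parsers no longer receive a raw token stream, they receive any object exposing a lex(state) method that returns an iterable of tokens (see lexer.lex(expects) in the Earley parser's _parse, and ChildrenLexer in the tree matcher, both below). A minimal sketch of an adapter satisfying that contract, inferred from the diff rather than copied from lark:

class TokenListLexer:
    "Hypothetical adapter: serves a pre-made token list through the new interface."
    def __init__(self, tokens):
        self.tokens = tokens

    def lex(self, parser_state):
        # The parser passes whatever state it has (None, a ParserState, or the
        # set of currently expected terminals); a smarter lexer could
        # specialize on it, but a plain token list can just ignore it.
        return iter(self.tokens)
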
@@ -123,6 +123,7 @@ class UnexpectedEOF(ParseError, UnexpectedInput):
 class UnexpectedCharacters(LexError, UnexpectedInput):
     def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None):
+        # TODO considered_tokens and allowed can be figured out using state
         self.line = line
         self.column = column
         self.pos_in_stream = lex_pos
@@ -154,6 +155,7 @@ class UnexpectedToken(ParseError, UnexpectedInput):
     see: :ref:`ParserPuppet`.
     """
     def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, token_history=None):
+        # TODO considered_rules and expected can be figured out using state
         self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')
         self.pos_in_stream = getattr(token, 'pos_in_stream', None)
@@ -353,7 +353,7 @@ class TraditionalLexer(Lexer):
                     allowed = {"<END-OF-FILE>"}
                 raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column,
                                            allowed=allowed, token_history=lex_state.last_token and [lex_state.last_token],
-                                           state=(parser_state and parser_state.position))
+                                           state=parser_state)

             value, type_ = res
@@ -436,7 +436,7 @@ class ContextualLexer(Lexer):
             # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, but not in the current context.
             # This tests the input against the global context, to provide a nicer error.
             token = self.root_lexer.next_token(lexer_state, parser_state)
-            raise UnexpectedToken(token, e.allowed, state=parser_state.position, token_history=[lexer_state.last_token])
+            raise UnexpectedToken(token, e.allowed, state=parser_state, token_history=[lexer_state.last_token])


 class LexerThread:
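
For context, the hunk above sits inside ContextualLexer's token loop, which reads roughly as follows (a sketch, not verbatim lark code):

def lex(self, lexer_state, parser_state):
    try:
        while True:
            # Pick the lexer specialized for the current parser state...
            lexer = self.lexers[parser_state.position]
            yield lexer.next_token(lexer_state, parser_state)
    except UnexpectedCharacters as e:
        # ...and on failure, retry with the root (global) lexer: if it *can*
        # match a token, the terminal exists but is not valid in this context,
        # so raise UnexpectedToken instead, now carrying the full parser_state.
        token = self.root_lexer.next_token(lexer_state, parser_state)
        raise UnexpectedToken(token, e.allowed, state=parser_state,
                              token_history=[lexer_state.last_token])
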
@@ -173,9 +173,6 @@ class Earley(WithLexer):
         tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
         self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)

-    def make_lexer(self, text):
-        return WithLexer.make_lexer(self, text).lex(None)
-
     def match(self, term, token):
         return term.name == token.type
@@ -146,7 +146,7 @@ class Parser:
                 column.add(new_item)
                 items.append(new_item)

-    def _parse(self, stream, columns, to_scan, start_symbol=None):
+    def _parse(self, lexer, columns, to_scan, start_symbol=None):
         def is_quasi_complete(item):
             if item.is_complete:
                 return True
@@ -245,7 +245,7 @@ class Parser:
             if not next_set and not next_to_scan:
                 expect = {i.expect.name for i in to_scan}
-                raise UnexpectedToken(token, expect, considered_rules = set(to_scan))
+                raise UnexpectedToken(token, expect, considered_rules=set(to_scan), state=frozenset(i.expect for i in to_scan))

             return next_to_scan
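
Here state is built as a frozenset of the expected terminals rather than a plain set, presumably because frozensets are hashable and compare by content, which is exactly what match_examples() needs when deciding whether two failures happened in "the same place". For example:

a = frozenset(['NAME', 'NUMBER'])
b = frozenset(['NUMBER', 'NAME'])
assert a == b                    # content equality, order-independent
assert hash(a) == hash(b)        # hashable, unlike a plain set
cache = {a: 'same error state'}  # usable as a dict key
assert cache[b] == 'same error state'
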
@@ -261,20 +261,24 @@ class Parser:
         # Completions will be added to the SPPF tree, and predictions will be recursively
         # processed down to terminals/empty nodes to be added to the scanner for the next
         # step.
+        expects = {i.expect for i in to_scan}
         i = 0
-        for token in stream:
+        for token in lexer.lex(expects):
             self.predict_and_complete(i, to_scan, columns, transitives)

             to_scan = scan(i, token, to_scan)
             i += 1
+            expects.clear()
+            expects |= {i.expect for i in to_scan}

         self.predict_and_complete(i, to_scan, columns, transitives)

         ## Column is now the final column in the parse.
         assert i == len(columns)-1
         return to_scan

-    def parse(self, stream, start):
+    def parse(self, lexer, start):
         assert start, start
         start_symbol = NonTerminal(start)
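
Two details worth noting here. First, expects is updated in place (clear() plus |=) rather than rebound, because lexer.lex(expects) is called only once: the generator keeps a reference to the set object, so in-place mutation is what lets each lexing step see the current expectations. Second, this is the hook the note at the top refers to: a contextual lexer for Earley could filter its terminals against expects at every step, computed at runtime. A purely illustrative sketch (next_token_from is a hypothetical method, not a lark API):

class RuntimeContextualLexer:
    def __init__(self, scanner):
        self.scanner = scanner

    def lex(self, expects):
        # `expects` is the same set object the parser mutates between steps,
        # so each iteration sees the fresh expectations.
        while True:
            allowed = {t.name for t in expects}
            token = self.scanner.next_token_from(allowed)  # hypothetical call
            if token is None:
                return
            yield token
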
@@ -291,7 +295,7 @@ class Parser:
             else:
                 columns[0].add(item)

-        to_scan = self._parse(stream, columns, to_scan, start_symbol)
+        to_scan = self._parse(lexer, columns, to_scan, start_symbol)

         # If the parse was successful, the start
         # symbol should have been completed in the last step of the Earley cycle, and will be in
@@ -299,7 +303,7 @@ class Parser:
         solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0]
         if not solutions:
             expected_terminals = [t.expect for t in to_scan]
-            raise UnexpectedEOF(expected_terminals, state={i.s for i in to_scan})
+            raise UnexpectedEOF(expected_terminals, state=frozenset(i.expect for i in to_scan))

         if self.debug:
             from .earley_forest import ForestToPyDotVisitor
@@ -3,7 +3,7 @@
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com

 from copy import deepcopy, copy
-from ..exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken
+from ..exceptions import UnexpectedInput, UnexpectedToken
 from ..lexer import Token
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
@@ -62,6 +62,12 @@ class ParserState:
     def position(self):
         return self.state_stack[-1]

+    # Necessary for match_examples() to work
+    def __eq__(self, other):
+        if not isinstance(other, ParserState):
+            return False
+        return self.position == other.position
+
     def __copy__(self):
         return type(self)(
             self.parse_conf,
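
Comparing by position means two distinct ParserState objects count as equal whenever the parser sits in the same LALR state, which is what lets match_examples() decide that an example input fails in the same place as the original error. A simplified sketch of that comparison (the real method is UnexpectedInput.match_examples; this is not verbatim):

from lark.exceptions import UnexpectedInput

def match_examples_sketch(parse_fn, caught, examples):
    # caught: the UnexpectedInput we want to classify.
    # examples: maps a label to sample inputs known to trigger that error.
    for label, strings in examples.items():
        for s in strings:
            try:
                parse_fn(s)
            except UnexpectedInput as e:
                # ParserState.__eq__ (LALR) and frozenset equality (Earley)
                # make independently raised errors comparable.
                if e.state == caught.state:
                    return label
    return None
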
@@ -86,7 +92,7 @@ class ParserState:
             action, arg = states[state][token.type]
         except KeyError:
             expected = {s for s in states[state].keys() if s.isupper()}
-            raise UnexpectedToken(token, expected, state=state, puppet=None)
+            raise UnexpectedToken(token, expected, state=self, puppet=None)

         assert arg != end_state
@@ -113,7 +113,8 @@ class Parser:
             del delayed_matches[i+1]    # No longer needed, so unburden memory

             if not next_set and not delayed_matches and not next_to_scan:
-                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan}, set(to_scan), state={i.s for i in next_to_scan})
+                raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect.name for item in to_scan},
+                                           set(to_scan), state=frozenset(i.expect for i in to_scan))

             return next_to_scan
@@ -69,6 +69,14 @@ def parse_rulename(s):
     return name, args

+
+class ChildrenLexer:
+    def __init__(self, children):
+        self.children = children
+
+    def lex(self, parser_state):
+        return self.children
+
 class TreeMatcher:
     """Match the elements of a tree node, based on an ontology
     provided by a Lark grammar.
@@ -173,6 +181,6 @@ class TreeMatcher:
             self._parser_cache[rulename] = parser

         # find a full derivation
-        unreduced_tree = parser.parse(tree.children, rulename)
+        unreduced_tree = parser.parse(ChildrenLexer(tree.children), rulename)
         assert unreduced_tree.data == rulename
         return unreduced_tree
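
Note that ChildrenLexer.lex ignores its argument, so it satisfies the new interface regardless of what the parser passes: None, a ParserState, or Earley's expects set. Any iterable-returning lex method will do, e.g.:

from lark.lexer import Token

lexer = ChildrenLexer([Token('A', 'a'), Token('B', 'b')])
assert list(lexer.lex(None)) == list(lexer.lex({'ignored'}))  # argument unused
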
@@ -2342,7 +2342,7 @@ def _make_parser_test(LEXER, PARSER):
             self.assertEqual(a.line, 1)
             self.assertEqual(b.line, 2)

-        @unittest.skipIf(LEXER=='standard' and PARSER!='lalr', "Puppet error handling only works with LALR for now")
+        @unittest.skipIf(PARSER=='cyk', "match_examples() not supported for CYK")
         def test_match_examples(self):
             p = _Lark(r"""
                 start: "a" "b" "c"
@@ -2355,11 +2355,15 @@ def _make_parser_test(LEXER, PARSER):
                     return u.match_examples(p.parse, {
                         0: ['abe'],
                         1: ['ab'],
+                        2: ['cbc'],
                     })
                 assert False

             assert match_error("abe") == 0
             assert match_error("ab") == 1
+            assert match_error("bbc") == 2
+            assert match_error("cbc") == 2
+            self.assertEqual( match_error("dbc"), 2 )

         @unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not place nicely together.')
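
Outside the test suite, the same pattern now works for Earley parsers too, since their errors carry a comparable state. Roughly (the grammar and inputs here just mirror the test above):

from lark import Lark, UnexpectedInput

parser = Lark('start: "a" "b" "c"', parser='earley')
try:
    parser.parse('ab')
except UnexpectedInput as u:
    label = u.match_examples(parser.parse, {
        'missing "c"': ['ab'],
        'wrong start': ['bbc', 'cbc'],
    })
    print('error class:', label)  # expected: 'missing "c"'
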