diff --git a/lark-stubs/load_grammar.pyi b/lark-stubs/load_grammar.pyi index bbd5751..0521e33 100644 --- a/lark-stubs/load_grammar.pyi +++ b/lark-stubs/load_grammar.pyi @@ -2,6 +2,7 @@ from typing import List, Tuple, Union, Callable, Dict, Optional from lark import Tree from lark.grammar import RuleOptions +from lark.exceptions import UnexpectedInput class Grammar: @@ -24,3 +25,6 @@ class GrammarBuilder: def validate(self) -> None: ... def build(self) -> Grammar: ... + + +def find_grammar_errors(text: str, start: str='start') -> List[Tuple[UnexpectedInput, str]]: ... \ No newline at end of file diff --git a/lark/lark.py b/lark/lark.py index 5d1f241..426099e 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -1,5 +1,5 @@ from __future__ import absolute_import -from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken, ConfigurationError, assert_config +from lark.exceptions import ConfigurationError, assert_config import sys, os, pickle, hashlib from io import open @@ -518,35 +518,7 @@ class Lark(Serialize): result of the transformation. Otherwise, returns a Tree instance. """ - - try: - return self.parser.parse(text, start=start) - except UnexpectedInput as e: - if on_error is None: - raise - - while True: - if isinstance(e, UnexpectedCharacters): - s = e.puppet.lexer_state.state - p = s.line_ctr.char_pos - - if not on_error(e): - raise e - - if isinstance(e, UnexpectedCharacters): - # If user didn't change the character position, then we should - if p == s.line_ctr.char_pos: - s.line_ctr.feed(s.text[p:p+1]) - - try: - return e.puppet.resume_parse() - except UnexpectedToken as e2: - if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet: - # Prevent infinite loop - raise e2 - e = e2 - except UnexpectedCharacters as e2: - e = e2 + return self.parser.parse(text, start=start, on_error=on_error) @property def source(self): diff --git a/lark/load_grammar.py b/lark/load_grammar.py index b633a96..44ed774 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -8,7 +8,7 @@ import pkgutil from ast import literal_eval from numbers import Integral -from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start +from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder @@ -16,7 +16,7 @@ from .parser_frontends import ParsingFrontend from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify, suppress, dedup_list, Str -from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken +from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError from .tree import Tree, SlottedTree as ST from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive @@ -853,6 +853,54 @@ def _parse_grammar(text, name, start='start'): return PrepareGrammar().transform(tree) +def _error_repr(error): + if isinstance(error, UnexpectedToken): + error2 = _translate_parser_exception(_get_parser().parse, error) + if error2: + return error2 + expected = ', '.join(error.accepts or error.expected) + return "Unexpected token %r. Expected one of: {%s}" % (str(error.token), expected) + else: + return str(error) + +def _search_puppet(puppet, predicate): + def expand(node): + path, p = node + for choice in p.choices(): + t = Token(choice, '') + try: + new_p = p.feed_token(t) + except ParseError: # Illegal + pass + else: + yield path + (choice,), new_p + + for path, p in bfs_all_unique([((), puppet)], expand): + if predicate(p): + return path, p + +def find_grammar_errors(text, start='start'): + errors = [] + def on_error(e): + errors.append((e, _error_repr(e))) + + # recover to a new line + token_path, _ = _search_puppet(e.puppet.as_immutable(), lambda p: '_NL' in p.choices()) + for token_type in token_path: + e.puppet.feed_token(Token(token_type, '')) + e.puppet.feed_token(Token('_NL', '\n')) + return True + + _tree = _get_parser().parse(text + '\n', start, on_error=on_error) + + errors_by_line = classify(errors, lambda e: e[0].line) + errors = [el[0] for el in errors_by_line.values()] # already sorted + + for e in errors: + e[0].puppet = None + return errors + + def _get_mangle(prefix, aliases, base_mangle=None): def mangle(s): if s in aliases: diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 0fab159..5acbbeb 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -101,18 +101,16 @@ class ParsingFrontend(Serialize): self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex) - def parse(self, text, start=None): + def parse(self, text, start=None, on_error=None): if start is None: start = self.parser_conf.start if len(start) > 1: raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start) start ,= start - if self.skip_lexer: - return self.parser.parse(text, start) - - lexer_thread = LexerThread(self.lexer, text) - return self.parser.parse(lexer_thread, start) + stream = text if self.skip_lexer else LexerThread(self.lexer, text) + kw = {} if on_error is None else {'on_error': on_error} + return self.parser.parse(stream, start, **kw) def get_frontend(parser, lexer): diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 9f08b81..9ca36f0 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -9,6 +9,7 @@ from ..utils import Serialize from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable from .lalr_puppet import ParserPuppet +from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken ###{standalone @@ -32,8 +33,35 @@ class LALR_Parser(Serialize): def serialize(self, memo): return self._parse_table.serialize(memo) - def parse(self, *args): - return self.parser.parse(*args) + def parse(self, lexer, start, on_error=None): + try: + return self.parser.parse(lexer, start) + except UnexpectedInput as e: + if on_error is None: + raise + + while True: + if isinstance(e, UnexpectedCharacters): + s = e.puppet.lexer_state.state + p = s.line_ctr.char_pos + + if not on_error(e): + raise e + + if isinstance(e, UnexpectedCharacters): + # If user didn't change the character position, then we should + if p == s.line_ctr.char_pos: + s.line_ctr.feed(s.text[p:p+1]) + + try: + return e.puppet.resume_parse() + except UnexpectedToken as e2: + if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet: + # Prevent infinite loop + raise e2 + e = e2 + except UnexpectedCharacters as e2: + e = e2 class ParseConf(object): diff --git a/lark/utils.py b/lark/utils.py index 7f1b971..023b118 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -318,6 +318,14 @@ def bfs(initial, expand): visited.add(next_node) open_q.append(next_node) +def bfs_all_unique(initial, expand): + "bfs, but doesn't keep track of visited (aka seen), because there can be no repetitions" + open_q = deque(list(initial)) + while open_q: + node = open_q.popleft() + yield node + open_q += expand(node) + def _serialize(value, memo): if isinstance(value, Serialize): diff --git a/tests/test_grammar.py b/tests/test_grammar.py index 130b47e..ea8652f 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -4,7 +4,7 @@ import sys from unittest import TestCase, main from lark import Lark, Token, Tree -from lark.load_grammar import GrammarError, GRAMMAR_ERRORS +from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors from lark.load_grammar import FromPackageLoader @@ -160,6 +160,41 @@ class TestGrammar(TestCase): x = p.parse('12 capybaras') self.assertEqual(x.children, ['12', 'capybaras']) + def test_find_grammar_errors(self): + text = """ + a: rule + b rule + c: rule + B.: "hello" f + D: "okay" + """ + + assert [e.line for e, _s in find_grammar_errors(text)] == [3, 5] + + text = """ + a: rule + b rule + | ok + c: rule + B.: "hello" f + D: "okay" + """ + + assert [e.line for e, _s in find_grammar_errors(text)] == [3, 4, 6] + + text = """ + a: rule @#$#@$@&& + b: rule + | ok + c: rule + B: "hello" f @ + D: "okay" + """ + + x = find_grammar_errors(text) + assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6] + + if __name__ == '__main__': main()