| @@ -2,6 +2,7 @@ from typing import List, Tuple, Union, Callable, Dict, Optional | |||
| from lark import Tree | |||
| from lark.grammar import RuleOptions | |||
| from lark.exceptions import UnexpectedInput | |||
| class Grammar: | |||
| @@ -24,3 +25,6 @@ class GrammarBuilder: | |||
| def validate(self) -> None: ... | |||
| def build(self) -> Grammar: ... | |||
| def find_grammar_errors(text: str, start: str='start') -> List[Tuple[UnexpectedInput, str]]: ... | |||
| @@ -1,5 +1,5 @@ | |||
| from __future__ import absolute_import | |||
| from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken, ConfigurationError, assert_config | |||
| from lark.exceptions import ConfigurationError, assert_config | |||
| import sys, os, pickle, hashlib | |||
| from io import open | |||
| @@ -518,35 +518,7 @@ class Lark(Serialize): | |||
| result of the transformation. Otherwise, returns a Tree instance. | |||
| """ | |||
| try: | |||
| return self.parser.parse(text, start=start) | |||
| except UnexpectedInput as e: | |||
| if on_error is None: | |||
| raise | |||
| while True: | |||
| if isinstance(e, UnexpectedCharacters): | |||
| s = e.puppet.lexer_state.state | |||
| p = s.line_ctr.char_pos | |||
| if not on_error(e): | |||
| raise e | |||
| if isinstance(e, UnexpectedCharacters): | |||
| # If user didn't change the character position, then we should | |||
| if p == s.line_ctr.char_pos: | |||
| s.line_ctr.feed(s.text[p:p+1]) | |||
| try: | |||
| return e.puppet.resume_parse() | |||
| except UnexpectedToken as e2: | |||
| if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet: | |||
| # Prevent infinite loop | |||
| raise e2 | |||
| e = e2 | |||
| except UnexpectedCharacters as e2: | |||
| e = e2 | |||
| return self.parser.parse(text, start=start, on_error=on_error) | |||
| @property | |||
| def source(self): | |||
| @@ -8,7 +8,7 @@ import pkgutil | |||
| from ast import literal_eval | |||
| from numbers import Integral | |||
| from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start | |||
| from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique | |||
| from .lexer import Token, TerminalDef, PatternStr, PatternRE | |||
| from .parse_tree_builder import ParseTreeBuilder | |||
| @@ -16,7 +16,7 @@ from .parser_frontends import ParsingFrontend | |||
| from .common import LexerConf, ParserConf | |||
| from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol | |||
| from .utils import classify, suppress, dedup_list, Str | |||
| from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken | |||
| from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken, ParseError | |||
| from .tree import Tree, SlottedTree as ST | |||
| from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive | |||
| @@ -853,6 +853,54 @@ def _parse_grammar(text, name, start='start'): | |||
| return PrepareGrammar().transform(tree) | |||
def _error_repr(error):
    """Render a grammar-parse exception as a short human-readable message."""
    if not isinstance(error, UnexpectedToken):
        return str(error)
    # Prefer the curated message produced by the example-based translator,
    # when it recognizes this error.
    translated = _translate_parser_exception(_get_parser().parse, error)
    if translated:
        return translated
    options = ', '.join(error.accepts or error.expected)
    return "Unexpected token %r. Expected one of: {%s}" % (str(error.token), options)
def _search_puppet(puppet, predicate):
    """Breadth-first search over parser-puppet states.

    Returns a ``(token_path, puppet)`` pair for the first reachable state
    satisfying ``predicate`` (implicitly ``None`` if none is found).
    """
    def successors(node):
        # Try feeding every currently-legal token type; skip ones the
        # parser rejects.
        prefix, state = node
        for token_type in state.choices():
            token = Token(token_type, '')
            try:
                following = state.feed_token(token)
            except ParseError:  # Illegal
                continue
            yield prefix + (token_type,), following

    for prefix, state in bfs_all_unique([((), puppet)], successors):
        if predicate(state):
            return prefix, state
def find_grammar_errors(text, start='start'):
    """Parse ``text`` as a Lark grammar and collect every syntax error.

    Returns a list of ``(exception, message)`` pairs, at most one per
    offending line, in source order.
    """
    collected = []

    def on_error(e):
        collected.append((e, _error_repr(e)))

        # Recover by steering the puppet to a state that accepts a newline,
        # then feeding one, so parsing restarts at the next grammar line.
        token_path, _ = _search_puppet(e.puppet.as_immutable(), lambda p: '_NL' in p.choices())
        for token_type in token_path:
            e.puppet.feed_token(Token(token_type, ''))
        e.puppet.feed_token(Token('_NL', '\n'))
        return True

    _tree = _get_parser().parse(text + '\n', start, on_error=on_error)

    # Keep only the first error reported for each line; insertion order is
    # already sorted by position.
    by_line = classify(collected, lambda item: item[0].line)
    result = [group[0] for group in by_line.values()]

    # Detach the puppet (internal recovery machinery) from the exceptions
    # before handing them to callers.
    for pair in result:
        pair[0].puppet = None
    return result
| def _get_mangle(prefix, aliases, base_mangle=None): | |||
| def mangle(s): | |||
| if s in aliases: | |||
| @@ -101,18 +101,16 @@ class ParsingFrontend(Serialize): | |||
| self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex) | |||
| # NOTE(review): diff residue — this span interleaves the pre- and post-change | |||
| # images of a hunk (the +/- markers were stripped). The first `def parse` | |||
| # header and the skip_lexer / lexer_thread branch are the removed version; | |||
| # the second header and the `stream` / `kw` lines are its replacement, | |||
| # which adds optional `on_error` pass-through to the underlying parser. | |||
| def parse(self, text, start=None): | |||
| def parse(self, text, start=None, on_error=None): | |||
| if start is None: | |||
| start = self.parser_conf.start | |||
| if len(start) > 1: | |||
| raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start) | |||
| start ,= start | |||
| # Removed image: explicit branch on skip_lexer. | |||
| if self.skip_lexer: | |||
| return self.parser.parse(text, start) | |||
| lexer_thread = LexerThread(self.lexer, text) | |||
| return self.parser.parse(lexer_thread, start) | |||
| # Replacement image: build the stream once, forward on_error only when given. | |||
| stream = text if self.skip_lexer else LexerThread(self.lexer, text) | |||
| kw = {} if on_error is None else {'on_error': on_error} | |||
| return self.parser.parse(stream, start, **kw) | |||
| def get_frontend(parser, lexer): | |||
| @@ -9,6 +9,7 @@ from ..utils import Serialize | |||
| from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable | |||
| from .lalr_puppet import ParserPuppet | |||
| from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken | |||
| ###{standalone | |||
| @@ -32,8 +33,35 @@ class LALR_Parser(Serialize): | |||
| # Serialize only the parse table; the rest is reconstructed on deserialize. | |||
| def serialize(self, memo): | |||
| return self._parse_table.serialize(memo) | |||
| # NOTE(review): diff residue — the two-line `parse` below is the removed | |||
| # image of a hunk; its replacement (with `on_error` recovery) follows it. | |||
| def parse(self, *args): | |||
| return self.parser.parse(*args) | |||
| # Run the LALR parser over `lexer`. When `on_error` is given, each | |||
| # UnexpectedInput is passed to it; a truthy return means "try to resume". | |||
| def parse(self, lexer, start, on_error=None): | |||
| try: | |||
| return self.parser.parse(lexer, start) | |||
| except UnexpectedInput as e: | |||
| if on_error is None: | |||
| raise | |||
| # Recovery loop: alternate between asking on_error and resuming the | |||
| # parser puppet until parsing completes or on_error gives up. | |||
| while True: | |||
| if isinstance(e, UnexpectedCharacters): | |||
| # Remember the lexer position so we can detect a no-progress callback. | |||
| s = e.puppet.lexer_state.state | |||
| p = s.line_ctr.char_pos | |||
| if not on_error(e): | |||
| raise e | |||
| if isinstance(e, UnexpectedCharacters): | |||
| # If user didn't change the character position, then we should | |||
| if p == s.line_ctr.char_pos: | |||
| s.line_ctr.feed(s.text[p:p+1]) | |||
| try: | |||
| return e.puppet.resume_parse() | |||
| except UnexpectedToken as e2: | |||
| if isinstance(e, UnexpectedToken) and e.token.type == e2.token.type == '$END' and e.puppet == e2.puppet: | |||
| # Prevent infinite loop | |||
| raise e2 | |||
| e = e2 | |||
| except UnexpectedCharacters as e2: | |||
| e = e2 | |||
| class ParseConf(object): | |||
| @@ -318,6 +318,14 @@ def bfs(initial, expand): | |||
| visited.add(next_node) | |||
| open_q.append(next_node) | |||
def bfs_all_unique(initial, expand):
    """Breadth-first traversal without a visited set.

    Same contract as ``bfs``, but skips the ``visited`` bookkeeping: the
    caller guarantees ``expand`` never produces the same node twice, so no
    de-duplication is needed.

    Parameters:
        initial: iterable of starting nodes.
        expand: callable; ``expand(node)`` yields the successors of ``node``.

    Yields:
        Nodes in breadth-first order.
    """
    # deque() accepts any iterable directly — the intermediate list() copy
    # in the original was redundant.
    open_q = deque(initial)
    while open_q:
        node = open_q.popleft()
        yield node
        open_q += expand(node)
| def _serialize(value, memo): | |||
| if isinstance(value, Serialize): | |||
| @@ -4,7 +4,7 @@ import sys | |||
| from unittest import TestCase, main | |||
| from lark import Lark, Token, Tree | |||
| from lark.load_grammar import GrammarError, GRAMMAR_ERRORS | |||
| from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors | |||
| from lark.load_grammar import FromPackageLoader | |||
| @@ -160,6 +160,41 @@ class TestGrammar(TestCase): | |||
| x = p.parse('12 capybaras') | |||
| self.assertEqual(x.children, ['12', 'capybaras']) | |||
def test_find_grammar_errors(self):
    """find_grammar_errors() reports each broken definition with its line number."""
    # One malformed rule ("b rule" lacks the colon) and one bad terminal
    # template ("f" used as a template on terminal B).
    text = """
a: rule
b rule
c: rule
B.: "hello" f
D: "okay"
"""
    assert [e.line for e, _s in find_grammar_errors(text)] == [3, 5]

    # The orphaned alternative line ("| ok") after the malformed rule
    # produces an additional error of its own.
    text = """
a: rule
b rule
| ok
c: rule
B.: "hello" f
D: "okay"
"""
    assert [e.line for e, _s in find_grammar_errors(text)] == [3, 4, 6]

    # Garbage characters inside otherwise well-formed definitions.
    text = """
a: rule @#$#@$@&&
b: rule
| ok
c: rule
B: "hello" f @
D: "okay"
"""
    x = find_grammar_errors(text)
    # Reuse the computed result instead of parsing the same text a second time.
    assert [e.line for e, _s in x] == [2, 6]
| # Script entry point: run this module's test suite under unittest. | |||
| if __name__ == '__main__': | |||
| main() | |||