diff --git a/lark-stubs/exceptions.pyi b/lark-stubs/exceptions.pyi
index f09bfbd..268844c 100644
--- a/lark-stubs/exceptions.pyi
+++ b/lark-stubs/exceptions.pyi
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
-from typing import Dict, Iterable, Callable, Union
+from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set
 from .tree import Tree
 from .lexer import Token
-
+from .parsers.lalr_puppet import ParserPuppet
 
 
 class LarkError(Exception):
     pass
@@ -21,27 +21,37 @@ class LexError(LarkError):
     pass
 
 
+T = TypeVar('T')
+
+
 class UnexpectedInput(LarkError):
+    line: int
+    column: int
     pos_in_stream: int
+    state: Any
 
     def get_context(self, text: str, span: int = ...):
         ...
 
     def match_examples(
-        self,
-        parse_fn: Callable[[str], Tree],
-        examples: Dict[str, Iterable[str]]
-    ):
+        self,
+        parse_fn: Callable[[str], Tree],
+        examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]],
+        token_type_match_fallback: bool = False,
+        use_accepts: bool = False,
+    ) -> T:
         ...
 
 
 class UnexpectedToken(ParseError, UnexpectedInput):
-    pass
-
+    expected: Set[str]
+    considered_rules: Set[str]
+    puppet: ParserPuppet
+    accepts: Set[str]
 
 
 class UnexpectedCharacters(LexError, UnexpectedInput):
-    line: int
-    column: int
+    allowed: Set[str]
+    considered_tokens: Set[Any]
 
 
 class VisitError(LarkError):
diff --git a/lark-stubs/parsers/__init__.pyi b/lark-stubs/parsers/__init__.pyi
new file mode 100644
index 0000000..e69de29
diff --git a/lark-stubs/parsers/lalr_puppet.pyi b/lark-stubs/parsers/lalr_puppet.pyi
new file mode 100644
index 0000000..c138c32
--- /dev/null
+++ b/lark-stubs/parsers/lalr_puppet.pyi
@@ -0,0 +1,21 @@
+from typing import Set, Dict, Any
+
+from lark import Token, Tree
+
+
+class ParserPuppet(object):
+    """
+    Represents a LALR parser that can be stepped through.
+    Shouldn't be instantiated by hand, but is accessible as `UnexpectedToken.puppet`.
+    """
+    def feed_token(self, token: Token): ...
+
+    def copy(self) -> ParserPuppet: ...
+
+    def pretty(self) -> str: ...
+
+    def choices(self) -> Dict[str, Any]: ...
+
+    def accepts(self) -> Set[str]: ...
+
+    def resume_parse(self) -> Tree: ...
diff --git a/lark/exceptions.py b/lark/exceptions.py
index a844dd4..03f3da4 100644
--- a/lark/exceptions.py
+++ b/lark/exceptions.py
@@ -1,3 +1,5 @@
+import logging
+
 from .utils import STRING_TYPE
 
 ###{standalone
@@ -37,36 +39,44 @@ class UnexpectedInput(LarkError):
         after = text[pos:end].split(b'\n', 1)[0]
         return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace")
 
-    def match_examples(self, parse_fn, examples, token_type_match_fallback=False):
+    def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False):
         """ Given a parser instance and a dictionary mapping some label with
             some malformed syntax examples, it'll return the label for the
             example that best matches the current error.
""" assert self.state is not None, "Not supported for this exception" + + if isinstance(examples, dict): + examples = examples.items() candidate = (None, False) - for label, example in examples.items(): + for i, (label, example) in enumerate(examples): assert not isinstance(example, STRING_TYPE) - for malformed in example: + for j, malformed in enumerate(example): try: parse_fn(malformed) except UnexpectedInput as ut: - if ut.state == self.state: + if ut.state == self.state and (not use_accepts or ut.accepts == self.accepts): try: if ut.token == self.token: # Try exact match first + logging.debug("Exact Match at example [%s][%s]" % (i, j)) return label if token_type_match_fallback: # Fallback to token types match if (ut.token.type == self.token.type) and not candidate[-1]: + logging.debug("Token Type Fallback at example [%s][%s]" % (i, j)) candidate = label, True except AttributeError: pass if not candidate[0]: + logging.debug("Same State match at example [%s][%s]" % (i, j)) candidate = label, False - + elif ut.state == self.state: + logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % + (self.state, self.accepts, ut.accepts, i, j)) return candidate[0] @@ -96,7 +106,7 @@ class UnexpectedCharacters(LexError, UnexpectedInput): class UnexpectedToken(ParseError, UnexpectedInput): - def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): + def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, accepts=None): self.token = token self.expected = expected # XXX str shouldn't necessary self.line = getattr(token, 'line', '?') @@ -105,10 +115,11 @@ class UnexpectedToken(ParseError, UnexpectedInput): self.state = state self.pos_in_stream = getattr(token, 'pos_in_stream', None) self.puppet = puppet + self.accepts = accepts message = ("Unexpected token %r at line %s, column %s.\n" "Expected one of: \n\t* %s\n" - % (token, self.line, self.column, '\n\t* '.join(self.expected))) + % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) super(UnexpectedToken, self).__init__(message) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index ae7ec32..0ee546c 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -85,7 +85,7 @@ TERMINALS = { 'RULE': '!?[_?]?[a-z][_a-z0-9]*', 'TERMINAL': '_?[A-Z][_A-Z0-9]*', 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', - 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS, + 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, '_NL': r'(\r?\n)+\s*', 'WS': r'[ \t]+', 'COMMENT': r'\s*//[^\n]*', @@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace): term_name = None elif isinstance(p, PatternRE): - if p in self.term_reverse: # Kind of a wierd placement.name + if p in self.term_reverse: # Kind of a weird placement.name term_name = self.term_reverse[p].name else: assert False, p @@ -409,6 +409,13 @@ def _literal_to_pattern(literal): flags = v[flag_start:] assert all(f in _RE_FLAGS for f in flags), flags + if literal.type == 'STRING' and '\n' in v: + raise GrammarError('You cannot put newlines in string literals') + + if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags: + raise GrammarError('You can only use newlines in regular expressions ' + 'with the `x` (verbose) flag') + v = v[:flag_start] assert v[0] == v[-1] and v[0] in '"/' x = v[1:-1] @@ -417,9 +424,11 @@ def _literal_to_pattern(literal): if literal.type == 'STRING': s = s.replace('\\\\', '\\') - - return { 'STRING': PatternStr, - 'REGEXP': PatternRE }[literal.type](s, 
+        return PatternStr(s, flags)
+    elif literal.type == 'REGEXP':
+        return PatternRE(s, flags)
+    else:
+        assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]'
 
 
 @inline_args
@@ -841,7 +850,7 @@ class GrammarLoader:
             if len(stmt.children) > 1:
                 path_node, arg1 = stmt.children
             else:
-                path_node, = stmt.children
+                path_node ,= stmt.children
                 arg1 = None
 
             if isinstance(arg1, Tree):  # Multi import
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index f26cbc5..ba75606 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -59,12 +59,13 @@ class _Parser:
             try:
                 return states[state][token.type]
             except KeyError:
-                expected = [s for s in states[state].keys() if s.isupper()]
+                expected = {s for s in states[state].keys() if s.isupper()}
                 try:
                     puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state)
+                    accepts = puppet.accepts()
                 except NameError:
-                    puppet = None
-                raise UnexpectedToken(token, expected, state=state, puppet=puppet)
+                    puppet = accepts = None
+                raise UnexpectedToken(token, expected, state=state, puppet=puppet, accepts=accepts)
 
         def reduce(rule):
             size = len(rule.expansion)
diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py
index 968783c..24c77a1 100644
--- a/lark/parsers/lalr_puppet.py
+++ b/lark/parsers/lalr_puppet.py
@@ -3,8 +3,10 @@ from copy import deepcopy
 
 from .lalr_analysis import Shift, Reduce
 
+from .. import Token
 
-class ParserPuppet:
+
+class ParserPuppet(object):
     def __init__(self, parser, state_stack, value_stack, start, stream, set_state):
         self.parser = parser
         self._state_stack = state_stack
@@ -16,7 +18,7 @@ class ParserPuppet:
         self.result = None
 
     def feed_token(self, token):
-        """Advance the parser state, as if it just recieved `token` from the lexer
+        """Advance the parser state, as if it just received `token` from the lexer
         """
         end_state = self.parser.parse_table.end_states[self._start]
@@ -66,14 +68,27 @@ class ParserPuppet:
             self._set_state,
         )
 
-    def pretty():
-        print("Puppet choices:")
-        for k, v in self.choices.items():
-            print('\t-', k, '->', v)
-        print('stack size:', len(self._state_stack))
+    def pretty(self):
+        out = ["Puppet choices:"]
+        for k, v in self.choices().items():
+            out.append('\t- %s -> %s' % (k, v))
+        out.append('stack size: %s' % len(self._state_stack))
+        return '\n'.join(out)
 
     def choices(self):
         return self.parser.parse_table.states[self._state_stack[-1]]
 
+    def accepts(self):
+        accepts = set()
+        for t in self.choices():
+            new_puppet = self.copy()
+            try:
+                new_puppet.feed_token(Token(t, ''))
+            except KeyError:
+                pass
+            else:
+                accepts.add(t)
+        return accepts
+
     def resume_parse(self):
         return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index cd3ea4d..48a4674 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1262,6 +1262,32 @@ def _make_parser_test(LEXER, PARSER):
             tree = l.parse('aA')
             self.assertEqual(tree.children, ['a', 'A'])
 
+        def test_token_flags_verbose(self):
+            g = _Lark(r"""start: NL | ABC
+                          ABC: / [a-z] /x
+                          NL: /\n/
+                      """)
+            x = g.parse('a')
+            self.assertEqual(x.children, ['a'])
+
+        def test_token_flags_verbose_multiline(self):
+            g = _Lark(r"""start: ABC
+                          ABC: / a b c
+                                   d
+                                   e f
+                               /x
+                       """)
+            x = g.parse('abcdef')
+            self.assertEqual(x.children, ['abcdef'])
+
+        def test_token_multiline_only_works_with_x_flag(self):
+            g = r"""start: ABC
+                    ABC: / a b c
+                             d
+                             e f
+                        /i
+                    """
+            self.assertRaises(GrammarError, _Lark, g)
+
         @unittest.skipIf(PARSER == 'cyk', "No empty rules")
         def test_twice_empty(self):
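
For reference, here is a minimal sketch (not part of the patch) of how the new `UnexpectedToken.accepts` attribute and the `use_accepts` flag of `match_examples()` could be used once this lands. The toy grammar, the example labels, and the printed output are all made up for illustration; only the `accepts`/`use_accepts` API is from the patch itself:

from lark import Lark, UnexpectedToken

# Hypothetical toy grammar, used only to trigger a LALR parse error.
parser = Lark(r"""
    start: "(" NUMBER ")"
    NUMBER: /[0-9]+/
    %ignore " "
""", parser='lalr')

# Labeled malformed inputs, in the dict form match_examples() accepts.
examples = {
    'missing closing paren': ['(1', '(42'],
    'missing number': ['()'],
}

try:
    parser.parse('(7')
except UnexpectedToken as e:
    # e.accepts (new in this patch) is computed via ParserPuppet.accepts()
    # and holds only the token types the parser can actually consume here.
    print('accepts:', e.accepts)
    # With use_accepts=True, an example whose error state matches but whose
    # accept-set differs is logged and skipped instead of being matched.
    label = e.match_examples(parser.parse, examples, use_accepts=True)
    print('best match:', label)

The practical difference from `expected` is that `accepts` is validated by actually feeding each candidate token to a copy of the puppet, so it excludes tokens that appear in the state table but could not really be consumed at the error position.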