From 9923987e94547ded8a17d7a03840c4cebce39188 Mon Sep 17 00:00:00 2001 From: decorator-factory <42166884+decorator-factory@users.noreply.github.com> Date: Mon, 10 Aug 2020 23:07:55 +0300 Subject: [PATCH 1/8] allow multiline regexes with 'x' (verbose) flag --- lark/load_grammar.py | 13 ++++++++++--- tests/test_parser.py | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index ae7ec32..d716ec1 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -13,7 +13,7 @@ from .parser_frontends import LALR_TraditionalLexer from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify, suppress, dedup_list, Str -from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken +from .exceptions import GrammarError, LarkError, UnexpectedCharacters, UnexpectedToken from .tree import Tree, SlottedTree as ST from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive @@ -85,7 +85,7 @@ TERMINALS = { 'RULE': '!?[_?]?[a-z][_a-z0-9]*', 'TERMINAL': '_?[A-Z][_A-Z0-9]*', 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', - 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS, + 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, '_NL': r'(\r?\n)+\s*', 'WS': r'[ \t]+', 'COMMENT': r'\s*//[^\n]*', @@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace): term_name = None elif isinstance(p, PatternRE): - if p in self.term_reverse: # Kind of a wierd placement.name + if p in self.term_reverse: # Kind of a weird placement.name term_name = self.term_reverse[p].name else: assert False, p @@ -409,6 +409,13 @@ def _literal_to_pattern(literal): flags = v[flag_start:] assert all(f in _RE_FLAGS for f in flags), flags + if literal.type == 'STRING' and '\n' in v: + raise GrammarError('You cannot put newlines in string literals') + + if literal.type == 'REGEXP' and '\n' in v and 
'x' not in flags: + raise GrammarError('You can only use newlines in regular expressions ' + 'with the `x` (verbose) flag') + v = v[:flag_start] assert v[0] == v[-1] and v[0] in '"/' x = v[1:-1] diff --git a/tests/test_parser.py b/tests/test_parser.py index cd3ea4d..48a4674 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1262,6 +1262,32 @@ def _make_parser_test(LEXER, PARSER): tree = l.parse('aA') self.assertEqual(tree.children, ['a', 'A']) + def test_token_flags_verbose(self): + g = _Lark(r"""start: NL | ABC + ABC: / [a-z] /x + NL: /\n/ + """) + x = g.parse('a') + self.assertEqual(x.children, ['a']) + + def test_token_flags_verbose_multiline(self): + g = _Lark(r"""start: ABC + ABC: / a b c + d + e f + /x + """) + x = g.parse('abcdef') + self.assertEqual(x.children, ['abcdef']) + + def test_token_multiline_only_works_with_x_flag(self): + g = r"""start: ABC + ABC: / a b c + d + e f + /i + """ + self.assertRaises( GrammarError, _Lark, g) @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_twice_empty(self): From 8b59a1642533f1f577b104c7be33f0511193050d Mon Sep 17 00:00:00 2001 From: decorator-factory <42166884+decorator-factory@users.noreply.github.com> Date: Tue, 11 Aug 2020 00:44:23 +0300 Subject: [PATCH 2/8] refactor: replace dict lookup with simple conditional --- lark/load_grammar.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index d716ec1..1a1a396 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -424,9 +424,11 @@ def _literal_to_pattern(literal): if literal.type == 'STRING': s = s.replace('\\\\', '\\') - - return { 'STRING': PatternStr, - 'REGEXP': PatternRE }[literal.type](s, flags) + return PatternStr(s, flags) + elif literal.type == 'REGEXP': + return PatternRE(s, flags) + else: + assert False, 'Invariant failed: literal.type not in ["STRING", "REGEXP"]' @inline_args From 2525e0ce9c594b81a79caa5ff57c66a12a79ca5a Mon Sep 17 00:00:00 2001 From: 
decorator-factory <42166884+decorator-factory@users.noreply.github.com> Date: Tue, 11 Aug 2020 00:46:54 +0300 Subject: [PATCH 3/8] formatting: fix pistol operator --- lark/load_grammar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 1a1a396..0ee546c 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -13,7 +13,7 @@ from .parser_frontends import LALR_TraditionalLexer from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify, suppress, dedup_list, Str -from .exceptions import GrammarError, LarkError, UnexpectedCharacters, UnexpectedToken +from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken from .tree import Tree, SlottedTree as ST from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive @@ -850,7 +850,7 @@ class GrammarLoader: if len(stmt.children) > 1: path_node, arg1 = stmt.children else: - path_node, = stmt.children + path_node ,= stmt.children arg1 = None if isinstance(arg1, Tree): # Multi import From 28e0a86f389c329a35091b7acb7b0afc5d57dc74 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Wed, 12 Aug 2020 14:48:55 +0200 Subject: [PATCH 4/8] Small improvements for debug info --- lark-stubs/exceptions.pyi | 15 ++++++++++----- lark/exceptions.py | 15 ++++++++++++--- lark/parsers/lalr_puppet.py | 6 +++--- 3 files changed, 25 insertions(+), 11 deletions(-) diff --git a/lark-stubs/exceptions.pyi b/lark-stubs/exceptions.pyi index f09bfbd..012ac51 100644 --- a/lark-stubs/exceptions.pyi +++ b/lark-stubs/exceptions.pyi @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from typing import Dict, Iterable, Callable, Union +from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple from .tree import Tree from .lexer import Token @@ -21,6 +21,9 @@ class LexError(LarkError): pass +T = TypeVar('T') + + class UnexpectedInput(LarkError): pos_in_stream: int 
@@ -28,10 +31,12 @@ class UnexpectedInput(LarkError): ... def match_examples( - self, - parse_fn: Callable[[str], Tree], - examples: Dict[str, Iterable[str]] - ): + self, + parse_fn: Callable[[str], Tree], + examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], + token_type_match_fallback: bool = False, + print_debug_info: bool = True + ) -> T: ... diff --git a/lark/exceptions.py b/lark/exceptions.py index 033275c..47670a6 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -37,34 +37,43 @@ class UnexpectedInput(LarkError): after = text[pos:end].split(b'\n', 1)[0] return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") - def match_examples(self, parse_fn, examples, token_type_match_fallback=False): + def match_examples(self, parse_fn, examples, token_type_match_fallback=False, print_debug_info=True): """ Given a parser instance and a dictionary mapping some label with some malformed syntax examples, it'll return the label for the example that bests matches the current error. 
""" assert self.state is not None, "Not supported for this exception" + + if isinstance(examples, dict): + examples = examples.items() candidate = (None, False) - for label, example in examples.items(): + for i, (label, example) in enumerate(examples): assert not isinstance(example, STRING_TYPE) - for malformed in example: + for j, malformed in enumerate(example): try: parse_fn(malformed) except UnexpectedInput as ut: if ut.state == self.state: try: if ut.token == self.token: # Try exact match first + if print_debug_info: + print("Exact Match at %d, with example %d" % (i, j), (ut.token, self.token, ut.state, self.state)) return label if token_type_match_fallback: # Fallback to token types match if (ut.token.type == self.token.type) and not candidate[-1]: + if print_debug_info: + print("Token Type Fallback at %d, with example %d" % (i, j)) candidate = label, True except AttributeError: pass if not candidate[0]: + if print_debug_info: + print("Defaulted at %d, with example %d" % (i, j)) candidate = label, False return candidate[0] diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py index 968783c..d5a4703 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -16,7 +16,7 @@ class ParserPuppet: self.result = None def feed_token(self, token): - """Advance the parser state, as if it just recieved `token` from the lexer + """Advance the parser state, as if it just received `token` from the lexer """ end_state = self.parser.parse_table.end_states[self._start] @@ -66,9 +66,9 @@ class ParserPuppet: self._set_state, ) - def pretty(): + def pretty(self): print("Puppet choices:") - for k, v in self.choices.items(): + for k, v in self.choices().items(): print('\t-', k, '->', v) print('stack size:', len(self._state_stack)) From a7bcd0bc2d3cb96030d9e77523c0007e8034ce49 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Wed, 12 Aug 2020 15:36:01 +0200 Subject: [PATCH 5/8] Added `accepts` attribute to `UnexpectedToken` and update stubs --- 
lark-stubs/exceptions.pyi | 15 ++++++++++----- lark/exceptions.py | 5 +++-- lark/parsers/lalr_parser.py | 13 +++++++++++-- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/lark-stubs/exceptions.pyi b/lark-stubs/exceptions.pyi index 012ac51..67c39fb 100644 --- a/lark-stubs/exceptions.pyi +++ b/lark-stubs/exceptions.pyi @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple +from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set from .tree import Tree from .lexer import Token @@ -25,7 +25,10 @@ T = TypeVar('T') class UnexpectedInput(LarkError): + line: int + column: int pos_in_stream: int + state: Any def get_context(self, text: str, span: int = ...): ... @@ -41,12 +44,14 @@ class UnexpectedInput(LarkError): class UnexpectedToken(ParseError, UnexpectedInput): - pass - + expected: List[str] + considered_rules: Set[str] + puppet: Any + accepts: List[str] class UnexpectedCharacters(LexError, UnexpectedInput): - line: int - column: int + allowed: Set[str] + considered_tokens: Set[Any] class VisitError(LarkError): diff --git a/lark/exceptions.py b/lark/exceptions.py index 47670a6..022a00f 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -105,7 +105,7 @@ class UnexpectedCharacters(LexError, UnexpectedInput): class UnexpectedToken(ParseError, UnexpectedInput): - def __init__(self, token, expected, considered_rules=None, state=None, puppet=None): + def __init__(self, token, expected, considered_rules=None, state=None, puppet=None, accepts=None): self.token = token self.expected = expected # XXX str shouldn't necessary self.line = getattr(token, 'line', '?') @@ -114,10 +114,11 @@ class UnexpectedToken(ParseError, UnexpectedInput): self.state = state self.pos_in_stream = getattr(token, 'pos_in_stream', None) self.puppet = puppet + self.accepts = accepts message = ("Unexpected token %r at line %s, column %s.\n" "Expected one of: \n\t* %s\n" - % (token, self.line, 
self.column, '\n\t* '.join(self.expected))) + % (token, self.line, self.column, '\n\t* '.join(self.accepts or self.expected))) super(UnexpectedToken, self).__init__(message) diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index f26cbc5..f61e093 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -62,9 +62,18 @@ class _Parser: expected = [s for s in states[state].keys() if s.isupper()] try: puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) + accepts = [] + for t in expected: + new_puppet = puppet.copy() + try: + new_puppet.feed_token(Token(t, '')) + except KeyError: + pass + else: + accepts.append(t) except NameError: - puppet = None - raise UnexpectedToken(token, expected, state=state, puppet=puppet) + puppet = accepts = None + raise UnexpectedToken(token, expected, state=state, puppet=puppet, accepts=accepts) def reduce(rule): size = len(rule.expansion) From d3b0449f714615b190699644650e41669a1510d4 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Wed, 12 Aug 2020 16:46:36 +0200 Subject: [PATCH 6/8] Improved `match_examples` with `UnexpectedToken.accepts` --- lark/exceptions.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 022a00f..497cf96 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -55,7 +55,7 @@ class UnexpectedInput(LarkError): try: parse_fn(malformed) except UnexpectedInput as ut: - if ut.state == self.state: + if ut.state == self.state and ut.accepts == self.accepts: try: if ut.token == self.token: # Try exact match first if print_debug_info: From 2e160c046e5de3d82b664d9867c1e9386ff4efb7 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Wed, 12 Aug 2020 16:52:21 +0200 Subject: [PATCH 7/8] Correction for python2.7 (LalrPuppet-> new style class) --- lark/parsers/lalr_puppet.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py index 
d5a4703..2b350bf 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -4,7 +4,7 @@ from copy import deepcopy from .lalr_analysis import Shift, Reduce -class ParserPuppet: +class ParserPuppet(object): def __init__(self, parser, state_stack, value_stack, start, stream, set_state): self.parser = parser self._state_stack = state_stack From cb2d9cded072e0f150b0d6d349fd431369b83a93 Mon Sep 17 00:00:00 2001 From: MegaIng1 Date: Thu, 13 Aug 2020 03:51:01 +0200 Subject: [PATCH 8/8] Refactored ParserPuppet, added stubs --- lark-stubs/exceptions.pyi | 10 +++++----- lark-stubs/parsers/__init__.pyi | 0 lark-stubs/parsers/lalr_puppet.pyi | 21 +++++++++++++++++++++ lark/exceptions.py | 19 ++++++++++--------- lark/parsers/lalr_parser.py | 12 ++---------- lark/parsers/lalr_puppet.py | 21 ++++++++++++++++++--- 6 files changed, 56 insertions(+), 27 deletions(-) create mode 100644 lark-stubs/parsers/__init__.pyi create mode 100644 lark-stubs/parsers/lalr_puppet.pyi diff --git a/lark-stubs/exceptions.pyi b/lark-stubs/exceptions.pyi index 67c39fb..268844c 100644 --- a/lark-stubs/exceptions.pyi +++ b/lark-stubs/exceptions.pyi @@ -3,7 +3,7 @@ from typing import Dict, Iterable, Callable, Union, TypeVar, Tuple, Any, List, Set from .tree import Tree from .lexer import Token - +from .parsers.lalr_puppet import ParserPuppet class LarkError(Exception): pass @@ -38,16 +38,16 @@ class UnexpectedInput(LarkError): parse_fn: Callable[[str], Tree], examples: Union[Dict[T, Iterable[str]], Iterable[Tuple[T, Iterable[str]]]], token_type_match_fallback: bool = False, - print_debug_info: bool = True + use_accepts: bool = False, ) -> T: ... 
class UnexpectedToken(ParseError, UnexpectedInput): - expected: List[str] + expected: Set[str] considered_rules: Set[str] - puppet: Any - accepts: List[str] + puppet: ParserPuppet + accepts: Set[str] class UnexpectedCharacters(LexError, UnexpectedInput): allowed: Set[str] diff --git a/lark-stubs/parsers/__init__.pyi b/lark-stubs/parsers/__init__.pyi new file mode 100644 index 0000000..e69de29 diff --git a/lark-stubs/parsers/lalr_puppet.pyi b/lark-stubs/parsers/lalr_puppet.pyi new file mode 100644 index 0000000..c138c32 --- /dev/null +++ b/lark-stubs/parsers/lalr_puppet.pyi @@ -0,0 +1,21 @@ +from typing import Set, Dict, Any + +from lark import Token, Tree + + +class ParserPuppet(object): + """ + Represents a LalrParser that can be stepped through. + Shouldn't be instantiated by hand, but is accessible as `UnexpectedToken.puppet` + """ + def feed_token(self, token: Token): ... + + def copy(self) -> ParserPuppet: ... + + def pretty(self) -> str: ... + + def choices(self) -> Dict[str, Any]: ... + + def accepts(self) -> Set[str]: ... + + def resume_parse(self) -> Tree: ... diff --git a/lark/exceptions.py b/lark/exceptions.py index 92ef64e..03f3da4 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -1,3 +1,5 @@ +import logging + from .utils import STRING_TYPE ###{standalone @@ -37,7 +39,7 @@ class UnexpectedInput(LarkError): after = text[pos:end].split(b'\n', 1)[0] return (before + after + b'\n' + b' ' * len(before) + b'^\n').decode("ascii", "backslashreplace") - def match_examples(self, parse_fn, examples, token_type_match_fallback=False, print_debug_info=True): + def match_examples(self, parse_fn, examples, token_type_match_fallback=False, use_accepts=False): """ Given a parser instance and a dictionary mapping some label with some malformed syntax examples, it'll return the label for the example that bests matches the current error. 
@@ -55,27 +57,26 @@ class UnexpectedInput(LarkError): try: parse_fn(malformed) except UnexpectedInput as ut: - if ut.state == self.state and ut.accepts == self.accepts: + if ut.state == self.state and (not use_accepts or ut.accepts == self.accepts): try: if ut.token == self.token: # Try exact match first - if print_debug_info: - print("Exact Match at %d, with example %d" % (i, j), (ut.token, self.token, ut.state, self.state)) + logging.debug("Exact Match at example [%s][%s]" % (i, j)) return label if token_type_match_fallback: # Fallback to token types match if (ut.token.type == self.token.type) and not candidate[-1]: - if print_debug_info: - print("Token Type Fallback at %d, with example %d" % (i, j)) + logging.debug("Token Type Fallback at example [%s][%s]" % (i, j)) candidate = label, True except AttributeError: pass if not candidate[0]: - if print_debug_info: - print("Defaulted at %d, with example %d" % (i, j)) + logging.debug("Same State match at example [%s][%s]" % (i, j)) candidate = label, False - + elif ut.state == self.state: + logging.debug("Different accepts with same state[%d]: %s != %s at example [%s][%s]" % + (self.state, self.accepts, ut.accepts, i, j)) return candidate[0] diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index f61e093..ba75606 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -59,18 +59,10 @@ class _Parser: try: return states[state][token.type] except KeyError: - expected = [s for s in states[state].keys() if s.isupper()] + expected = {s for s in states[state].keys() if s.isupper()} try: puppet = ParserPuppet(self, state_stack, value_stack, start, stream, set_state) - accepts = [] - for t in expected: - new_puppet = puppet.copy() - try: - new_puppet.feed_token(Token(t, '')) - except KeyError: - pass - else: - accepts.append(t) + accepts = puppet.accepts() except NameError: puppet = accepts = None raise UnexpectedToken(token, expected, state=state, puppet=puppet, accepts=accepts) diff 
--git a/lark/parsers/lalr_puppet.py b/lark/parsers/lalr_puppet.py index 2b350bf..24c77a1 100644 --- a/lark/parsers/lalr_puppet.py +++ b/lark/parsers/lalr_puppet.py @@ -3,6 +3,8 @@ from copy import deepcopy from .lalr_analysis import Shift, Reduce +from .. import Token + class ParserPuppet(object): def __init__(self, parser, state_stack, value_stack, start, stream, set_state): @@ -67,13 +69,26 @@ class ParserPuppet(object): ) def pretty(self): - print("Puppet choices:") + out = ["Puppet choices:"] for k, v in self.choices().items(): - print('\t-', k, '->', v) - print('stack size:', len(self._state_stack)) + out.append('\t- %s -> %s' % (k, v)) + out.append('stack size: %s' % len(self._state_stack)) + return '\n'.join(out) def choices(self): return self.parser.parse_table.states[self._state_stack[-1]] + def accepts(self): + accepts = set() + for t in self.choices(): + new_puppet = self.copy() + try: + new_puppet.feed_token(Token(t, '')) + except KeyError: + pass + else: + accepts.add(t) + return accepts + def resume_parse(self): return self.parser.parse(self._stream, self._start, self._set_state, self._value_stack, self._state_stack)