From ec2ba8826ea5f396abab063f47ceaf914333e04c Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 28 Jun 2021 12:11:07 +0300 Subject: [PATCH 01/31] Docs fix + cleanup --- lark/lexer.py | 2 +- lark/utils.py | 11 ----------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index a2aefd2..4062c2d 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -120,7 +120,7 @@ class Token(Str): Attributes: type: Name of the token (as specified in grammar) value: Value of the token (redundant, as ``token.value == token`` will always be true) - pos_in_stream: The index of the token in the text + start_pos: The index of the token in the text line: The line of the token in the text (starting with 1) column: The column of the token in the text (starting with 1) end_line: The line where the token ends diff --git a/lark/utils.py b/lark/utils.py index 70516e6..b9d7ac3 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -241,17 +241,6 @@ except ImportError: pass -try: - compare = cmp -except NameError: - def compare(a, b): - if a == b: - return 0 - elif a > b: - return 1 - return -1 - - class Enumerator(Serialize): def __init__(self): self.enums = {} From bdcd2e0011bc0cd4fa3c35f59f28c78a1fa61a78 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Tue, 29 Jun 2021 22:32:56 +0200 Subject: [PATCH 02/31] fix tree_matcher when keep_all_tokens=True by setting sym.filter_out correctly. --- lark/load_grammar.py | 5 ++++- tests/test_reconstructor.py | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index dcb4c81..c7b98a7 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -630,7 +630,10 @@ class Grammar: else: exp_options = options - assert all(isinstance(x, Symbol) for x in expansion), expansion + for sym in expansion: + assert isinstance(sym, Symbol) + if sym.is_term and exp_options and exp_options.keep_all_tokens: + sym.filter_out = False rule = Rule(NonTerminal(name), expansion, i, alias, exp_options) compiled_rules.append(rule) diff --git a/tests/test_reconstructor.py b/tests/test_reconstructor.py index f132312..e2f2dbe 100644 --- a/tests/test_reconstructor.py +++ b/tests/test_reconstructor.py @@ -3,6 +3,7 @@ import json import sys import unittest +from itertools import product from unittest import TestCase from lark import Lark @@ -20,8 +21,8 @@ def _remove_ws(s): class TestReconstructor(TestCase): - def assert_reconstruct(self, grammar, code): - parser = Lark(grammar, parser='lalr', maybe_placeholders=False) + def assert_reconstruct(self, grammar, code, **options): + parser = Lark(grammar, parser='lalr', maybe_placeholders=False, **options) tree = parser.parse(code) new = Reconstructor(parser).reconstruct(tree) self.assertEqual(_remove_ws(code), _remove_ws(new)) @@ -142,6 +143,17 @@ class TestReconstructor(TestCase): new_json = Reconstructor(json_parser).reconstruct(tree) self.assertEqual(json.loads(new_json), json.loads(test_json)) + def test_keep_all_tokens(self): + g = """ + start: "a"? _B? c? _d? 
+ _B: "b" + c: "c" + _d: "d" + """ + examples = list(map(''.join, product(('', 'a'), ('', 'b'), ('', 'c'), ('', 'd'), ))) + for code in examples: + self.assert_reconstruct(g, code, keep_all_tokens=True) + @unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.") def test_switch_grammar_unicode_terminal(self): """ From 389e7fbf5cc4ff8973ceb36e9823e6984df0941b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 6 Jun 2021 17:02:41 +0300 Subject: [PATCH 03/31] lexer.py: Refactored mres operations into a Scanner class. --- lark/lexer.py | 92 +++++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 40 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 4062c2d..7a30d6d 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -127,7 +127,7 @@ class Token(Str): end_column: The next column after the end of the token. For example, if the token is a single character with a column value of 4, end_column will be 5. - end_pos: the index where the token ends (basically ``pos_in_stream + len(token)``) + end_pos: the index where the token ends (basically ``start_pos + len(token)``) """ __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') @@ -214,15 +214,13 @@ class LineCounter: class UnlessCallback: - def __init__(self, mres): - self.mres = mres + def __init__(self, scanner): + self.scanner = scanner def __call__(self, t): - for mre, type_from_index in self.mres: - m = mre.match(t.value) - if m: - t.type = type_from_index[m.lastindex] - break + res = self.scanner.match(t.value, 0) + if res: + _value, t.type = res return t @@ -254,34 +252,51 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) if unless: - callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) + callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) terminals = [t for t in terminals if t not in embedded_strs] return terminals, callback -def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes): - # Python sets an unreasonable group limit (currently 100) in its re module - # Worse, the only way to know we reached it is by catching an AssertionError! - # This function recursively tries less and less groups until it's successful. - postfix = '$' if match_whole else '' - mres = [] - while terminals: - pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) - if use_bytes: - pattern = pattern.encode('latin-1') - try: - mre = re_.compile(pattern, g_regex_flags) - except AssertionError: # Yes, this is what Python provides us.. :/ - return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes) - mres.append((mre, {i: n for n, i in mre.groupindex.items()})) - terminals = terminals[max_size:] - return mres +class Scanner: + def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False): + self.terminals = terminals + self.g_regex_flags = g_regex_flags + self.re_ = re_ + self.use_bytes = use_bytes + self.match_whole = match_whole + + self._mres = self._build_mres(terminals, len(terminals)) + + def _build_mres(self, terminals, max_size): + # Python sets an unreasonable group limit (currently 100) in its re module + # Worse, the only way to know we reached it is by catching an AssertionError! 
+ # This function recursively tries less and less groups until it's successful. + postfix = '$' if self.match_whole else '' + mres = [] + while terminals: + pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) + if self.use_bytes: + pattern = pattern.encode('latin-1') + try: + mre = self.re_.compile(pattern, self.g_regex_flags) + except AssertionError: # Yes, this is what Python provides us.. :/ + return self._build_mres(terminals, max_size//2) + mres.append((mre, {i: n for n, i in mre.groupindex.items()})) + terminals = terminals[max_size:] + return mres -def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False): - return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes) + def match(self, text, pos): + for mre, type_from_index in self._mres: + m = mre.match(text, pos) + if m: + return m.group(0), type_from_index[m.lastindex] + + @property + def allowed_types(self): + return {v for m, tfi in self._mres for v in tfi.values()} def _regexp_has_newline(r): @@ -341,9 +356,9 @@ class TraditionalLexer(Lexer): self.use_bytes = conf.use_bytes self.terminals_by_name = conf.terminals_by_name - self._mres = None + self._scanner = None - def _build(self): + def _build_scanner(self): terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes) assert all(self.callback.values()) @@ -354,19 +369,16 @@ class TraditionalLexer(Lexer): else: self.callback[type_] = f - self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes) + self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes) @property - def mres(self): - if self._mres is None: - self._build() - return self._mres + def scanner(self): + if self._scanner is None: + self._build_scanner() + return self._scanner def match(self, text, pos): - for mre, type_from_index in self.mres: - m = mre.match(text, pos) - if m: - return m.group(0), type_from_index[m.lastindex] + return self.scanner.match(text, pos) def lex(self, state, parser_state): with suppress(EOFError): @@ -378,7 +390,7 @@ class TraditionalLexer(Lexer): while line_ctr.char_pos < len(lex_state.text): res = self.match(lex_state.text, line_ctr.char_pos) if not res: - allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types + allowed = self.scanner.allowed_types - self.ignore_types if not allowed: allowed = {""} raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, From e5991739ee5a1d5bd6f78b84a495a2d7e17ce406 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 6 Jun 2021 18:23:46 +0300 Subject: [PATCH 04/31] lexer.py: Small refactor --- lark/lexer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 7a30d6d..591943b 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -235,6 +235,11 @@ class CallChain: return self.callback2(t) if self.cond(t2) else t2 +def _get_match(re_, regexp, s, flags): + m = re_.match(regexp, s, flags) + if m: + return m.group(0) + def _create_unless(terminals, g_regex_flags, re_, use_bytes): tokens_by_type = classify(terminals, lambda t: type(t.pattern)) assert len(tokens_by_type) <= 2, tokens_by_type.keys() @@ -246,8 +251,7 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): if strtok.priority > retok.priority: continue s = strtok.pattern.value - m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags) - if m and m.group(0) == s: + if s == _get_match(re_, 
retok.pattern.to_regexp(), s, g_regex_flags): unless.append(strtok) if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) From da3a993d025d7f3463c5564ba6fed2c0f1146adf Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 6 Jun 2021 18:32:59 +0300 Subject: [PATCH 05/31] lexer.py: Small simplification --- lark/lexer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 591943b..2925c35 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -271,6 +271,8 @@ class Scanner: self.use_bytes = use_bytes self.match_whole = match_whole + self.allowed_types = {t.name for t in self.terminals} + self._mres = self._build_mres(terminals, len(terminals)) def _build_mres(self, terminals, max_size): @@ -298,10 +300,6 @@ class Scanner: if m: return m.group(0), type_from_index[m.lastindex] - @property - def allowed_types(self): - return {v for m, tfi in self._mres for v in tfi.values()} - def _regexp_has_newline(r): r"""Expressions that may indicate newlines in a regexp: From 3bc070bc1dcbaa91a04f178b985c5250bafc492c Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 15 Jun 2021 17:04:38 +0300 Subject: [PATCH 06/31] Change how propagate_positions work --- lark-stubs/lark.pyi | 4 ++-- lark/lark.py | 4 ++-- lark/lexer.py | 28 +++++++++++++------------- lark/parse_tree_builder.py | 40 +++++++++++++++----------------------- lark/parser_frontends.py | 16 +++++++-------- 5 files changed, 42 insertions(+), 50 deletions(-) diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index 27c6863..18748d1 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -33,7 +33,7 @@ class LarkOptions: regex: bool debug: bool keep_all_tokens: bool - propagate_positions: Union[bool, str] + propagate_positions: Union[bool, Callable] maybe_placeholders: bool lexer_callbacks: Dict[str, Callable[[Token], Token]] cache: Union[bool, str] @@ -77,7 +77,7 @@ class Lark: regex: bool = False, debug: bool = False, keep_all_tokens: bool = False, - propagate_positions: Union[bool, str] = False, + propagate_positions: Union[bool, Callable] = False, maybe_placeholders: bool = False, lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None, cache: Union[bool, str] = False, diff --git a/lark/lark.py b/lark/lark.py index 8e879cc..9863243 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -44,7 +44,7 @@ class LarkOptions(Serialize): Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster) propagate_positions Propagates (line, column, end_line, end_column) attributes into all tree branches. - Accepts ``False``, ``True``, or "ignore_ws", which will trim the whitespace around your trees. + Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating. maybe_placeholders When ``True``, the ``[]`` operator returns ``None`` when not matched. @@ -162,7 +162,7 @@ class LarkOptions(Serialize): assert_config(self.parser, ('earley', 'lalr', 'cyk', None)) if self.parser == 'earley' and self.transformer: - raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm.' + raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. ' 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. 
LALR)') if o: diff --git a/lark/lexer.py b/lark/lexer.py index 2925c35..7c2f979 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -133,20 +133,20 @@ class Token(Str): def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None, pos_in_stream=None): try: - self = super(Token, cls).__new__(cls, value) + inst = super(Token, cls).__new__(cls, value) except UnicodeDecodeError: value = value.decode('latin1') - self = super(Token, cls).__new__(cls, value) - - self.type = type_ - self.start_pos = start_pos if start_pos is not None else pos_in_stream - self.value = value - self.line = line - self.column = column - self.end_line = end_line - self.end_column = end_column - self.end_pos = end_pos - return self + inst = super(Token, cls).__new__(cls, value) + + inst.type = type_ + inst.start_pos = start_pos if start_pos is not None else pos_in_stream + inst.value = value + inst.line = line + inst.column = column + inst.end_line = end_line + inst.end_column = end_column + inst.end_pos = end_pos + return inst @property def pos_in_stream(self): @@ -258,8 +258,8 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): if unless: callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) - terminals = [t for t in terminals if t not in embedded_strs] - return terminals, callback + new_terminals = [t for t in terminals if t not in embedded_strs] + return new_terminals, callback diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 7a854bc..b4929c6 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -23,8 +23,9 @@ class ExpandSingleChild: class PropagatePositions: - def __init__(self, node_builder): + def __init__(self, node_builder, node_filter=None): self.node_builder = node_builder + self.node_filter = node_filter def __call__(self, children): res = self.node_builder(children) @@ -33,44 +34,35 @@ class PropagatePositions: if isinstance(res, Tree): res_meta = res.meta - src_meta = self._pp_get_meta(children) - if src_meta is not None: - res_meta.line = src_meta.line - res_meta.column = src_meta.column - res_meta.start_pos = src_meta.start_pos + first_meta = self._pp_get_meta(children) + if first_meta is not None: + res_meta.line = first_meta.line + res_meta.column = first_meta.column + res_meta.start_pos = first_meta.start_pos res_meta.empty = False - src_meta = self._pp_get_meta(reversed(children)) - if src_meta is not None: - res_meta.end_line = src_meta.end_line - res_meta.end_column = src_meta.end_column - res_meta.end_pos = src_meta.end_pos + last_meta = self._pp_get_meta(reversed(children)) + if last_meta is not None: + res_meta.end_line = last_meta.end_line + res_meta.end_column = last_meta.end_column + res_meta.end_pos = last_meta.end_pos res_meta.empty = False return res def _pp_get_meta(self, children): for c in children: + if self.node_filter is not None and not self.node_filter(c): + continue if isinstance(c, Tree): if not c.meta.empty: return c.meta elif isinstance(c, Token): return c -class PropagatePositions_IgnoreWs(PropagatePositions): - def _pp_get_meta(self, children): - for c in children: - if isinstance(c, Tree): - if not c.meta.empty: - return c.meta - elif isinstance(c, Token): - if c and not c.isspace(): # Disregard whitespace-only tokens - return c - - def make_propagate_positions(option): - if option == "ignore_ws": - return PropagatePositions_IgnoreWs + if callable(option): + return partial(PropagatePositions, 
node_filter=option) elif option is True: return PropagatePositions elif option is False: diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index e066d9a..1818ca7 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -92,26 +92,26 @@ class ParsingFrontend(Serialize): def _verify_start(self, start=None): if start is None: - start = self.parser_conf.start - if len(start) > 1: - raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start) - start ,= start + start_decls = self.parser_conf.start + if len(start_decls) > 1: + raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start_decls) + start ,= start_decls elif start not in self.parser_conf.start: raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start)) return start def parse(self, text, start=None, on_error=None): - start = self._verify_start(start) + chosen_start = self._verify_start(start) stream = text if self.skip_lexer else LexerThread(self.lexer, text) kw = {} if on_error is None else {'on_error': on_error} - return self.parser.parse(stream, start, **kw) + return self.parser.parse(stream, chosen_start, **kw) def parse_interactive(self, text=None, start=None): - start = self._verify_start(start) + chosen_start = self._verify_start(start) if self.parser_conf.parser_type != 'lalr': raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ") stream = text if self.skip_lexer else LexerThread(self.lexer, text) - return self.parser.parse_interactive(stream, start) + return self.parser.parse_interactive(stream, chosen_start) def get_frontend(parser, lexer): From 24f653080f1118471934dba1d2ebc133c992305b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 16 Jun 2021 10:48:56 +0300 Subject: [PATCH 07/31] More minor refactorings --- lark/exceptions.py | 15 +++++++++------ lark/lark.py | 12 ++++++------ lark/parsers/lalr_parser.py | 4 ++-- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 26ffce3..9d326b8 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -129,6 +129,8 @@ class UnexpectedInput(LarkError): class UnexpectedEOF(ParseError, UnexpectedInput): def __init__(self, expected, state=None, terminals_by_name=None): + super(UnexpectedEOF, self).__init__() + self.expected = expected self.state = state from .lexer import Token @@ -138,7 +140,6 @@ class UnexpectedEOF(ParseError, UnexpectedInput): self.column = -1 self._terminals_by_name = terminals_by_name - super(UnexpectedEOF, self).__init__() def __str__(self): message = "Unexpected end-of-input. 
" @@ -149,6 +150,8 @@ class UnexpectedEOF(ParseError, UnexpectedInput): class UnexpectedCharacters(LexError, UnexpectedInput): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, terminals_by_name=None, considered_rules=None): + super(UnexpectedCharacters, self).__init__() + # TODO considered_tokens and allowed can be figured out using state self.line = line self.column = column @@ -167,7 +170,6 @@ class UnexpectedCharacters(LexError, UnexpectedInput): self.char = seq[lex_pos] self._context = self.get_context(seq) - super(UnexpectedCharacters, self).__init__() def __str__(self): message = "No terminal matches '%s' in the current parser context, at line %d col %d" % (self.char, self.line, self.column) @@ -190,6 +192,8 @@ class UnexpectedToken(ParseError, UnexpectedInput): """ def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None): + super(UnexpectedToken, self).__init__() + # TODO considered_rules and expected can be figured out using state self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') @@ -204,7 +208,6 @@ class UnexpectedToken(ParseError, UnexpectedInput): self._terminals_by_name = terminals_by_name self.token_history = token_history - super(UnexpectedToken, self).__init__() @property def accepts(self): @@ -236,10 +239,10 @@ class VisitError(LarkError): """ def __init__(self, rule, obj, orig_exc): - self.obj = obj - self.orig_exc = orig_exc - message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) + self.obj = obj + self.orig_exc = orig_exc + ###} diff --git a/lark/lark.py b/lark/lark.py index 9863243..9a4b2d5 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -451,11 +451,11 @@ class Lark(Serialize): d = f else: d = pickle.load(f) - memo = d['memo'] + memo_json = d['memo'] data = d['data'] - assert memo - memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {}) + assert memo_json + memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {}) options = dict(data['options']) if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults): raise ConfigurationError("Some options are not allowed when loading a Parser: {}" @@ -512,11 +512,11 @@ class Lark(Serialize): Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...) 
""" - package = FromPackageLoader(package, search_paths) - full_path, text = package(None, grammar_path) + package_loader = FromPackageLoader(package, search_paths) + full_path, text = package_loader(None, grammar_path) options.setdefault('source_path', full_path) options.setdefault('import_paths', []) - options['import_paths'].append(package) + options['import_paths'].append(package_loader) return cls(text, **options) def __repr__(self): diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index fe40791..d916b46 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -178,8 +178,8 @@ class _Parser(object): for token in state.lexer.lex(state): state.feed_token(token) - token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) - return state.feed_token(token, True) + end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) + return state.feed_token(end_token, True) except UnexpectedInput as e: try: e.interactive_parser = InteractiveParser(self, state, state.lexer) From a13cfcef55f6460b9b8897e9c313b9bcb4c80b33 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 5 Jul 2021 12:08:38 +0300 Subject: [PATCH 08/31] Bugfix in propagate_positions: Corrected to account for 'container nodes' --- lark/parse_tree_builder.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index b4929c6..39d3510 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -30,23 +30,36 @@ class PropagatePositions: def __call__(self, children): res = self.node_builder(children) - # local reference to Tree.meta reduces number of presence checks if isinstance(res, Tree): + # Calculate positions while the tree is streaming, according to the rule: + # - nodes start at the start of their first child's container, + # and end at the end of their last child's container. + # Containers are nodes that take up space in text, but have been inlined in the tree. + res_meta = res.meta first_meta = self._pp_get_meta(children) if first_meta is not None: - res_meta.line = first_meta.line - res_meta.column = first_meta.column - res_meta.start_pos = first_meta.start_pos - res_meta.empty = False + # meta was already set, probably because the rule has been inlined (e.g. 
`?rule`) + if not hasattr(res_meta, 'line'): + res_meta.line = getattr(first_meta, 'container_line', first_meta.line) + res_meta.column = getattr(first_meta, 'container_column', first_meta.column) + res_meta.start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos) + res_meta.empty = False + + res_meta.container_line = getattr(first_meta, 'container_line', first_meta.line) + res_meta.container_column = getattr(first_meta, 'container_column', first_meta.column) last_meta = self._pp_get_meta(reversed(children)) if last_meta is not None: - res_meta.end_line = last_meta.end_line - res_meta.end_column = last_meta.end_column - res_meta.end_pos = last_meta.end_pos - res_meta.empty = False + if not hasattr(res_meta, 'end_line'): + res_meta.end_line = getattr(last_meta, 'container_end_line', last_meta.end_line) + res_meta.end_column = getattr(last_meta, 'container_end_column', last_meta.end_column) + res_meta.end_pos = getattr(last_meta, 'container_end_pos', last_meta.end_pos) + res_meta.empty = False + + res_meta.container_end_line = getattr(last_meta, 'container_end_line', last_meta.end_line) + res_meta.container_end_column = getattr(last_meta, 'container_end_column', last_meta.end_column) return res From d7d02e930899048a18b094d798080e59c5b9af9b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 5 Jul 2021 12:11:03 +0300 Subject: [PATCH 09/31] Tiny comment fix --- lark/parse_tree_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 39d3510..286038e 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -40,8 +40,8 @@ class PropagatePositions: first_meta = self._pp_get_meta(children) if first_meta is not None: - # meta was already set, probably because the rule has been inlined (e.g. `?rule`) if not hasattr(res_meta, 'line'): + # meta was already set, probably because the rule has been inlined (e.g. 
`?rule`) res_meta.line = getattr(first_meta, 'container_line', first_meta.line) res_meta.column = getattr(first_meta, 'container_column', first_meta.column) res_meta.start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos) From c953dd9505dbba1bd8fbded0077a040a1ce0e5b5 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 5 Jul 2021 13:15:29 +0300 Subject: [PATCH 10/31] Tests: Added a test case demonstrating the need for calculating containers --- tests/test_parser.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_parser.py b/tests/test_parser.py index ff4e064..40ed131 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -94,6 +94,26 @@ class TestParsers(unittest.TestCase): r = g.parse('a') self.assertEqual( r.children[0].meta.line, 1 ) + def test_propagate_positions2(self): + g = Lark("""start: a + a: b + ?b: "(" t ")" + !t: "t" + """, propagate_positions=True) + + start = g.parse("(t)") + a ,= start.children + t ,= a.children + assert t.children[0] == "t" + + assert t.meta.column == 2 + assert t.meta.end_column == 3 + + assert start.column == a.column == 1 + assert start.end_column == a.end_column == 4 + + + def test_expand1(self): g = Lark("""start: a From f14ff6d4d14b500410b8d0d5e14fd2908be95dd9 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 5 Jul 2021 14:33:28 +0300 Subject: [PATCH 11/31] Fixed tests to use meta (Tree.column is deprecated) --- tests/test_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index 40ed131..8fec82d 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -109,8 +109,8 @@ class TestParsers(unittest.TestCase): assert t.meta.column == 2 assert t.meta.end_column == 3 - assert start.column == a.column == 1 - assert start.end_column == a.end_column == 4 + assert start.meta.column == a.meta.column == 1 + assert start.meta.end_column == a.meta.end_column == 4 From b37519b7c882d3fbfbf44822d8f3e72898a2c2c3 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 9 Jul 2021 22:44:31 +0300 Subject: [PATCH 12/31] Bugfix for deepcopy + small unrelated refactor (issue #938) --- lark/common.py | 12 ++++++++++++ lark/utils.py | 14 +++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/lark/common.py b/lark/common.py index 467acf8..cb408d9 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,4 +1,5 @@ from warnings import warn +from copy import deepcopy from .utils import Serialize from .lexer import TerminalDef @@ -31,6 +32,17 @@ class LexerConf(Serialize): def _deserialize(self): self.terminals_by_name = {t.name: t for t in self.terminals} + def __deepcopy__(self, memo=None): + return type(self)( + deepcopy(self.terminals, memo), + self.re_module, + deepcopy(self.ignore, memo), + deepcopy(self.postlex, memo), + deepcopy(self.callbacks, memo), + deepcopy(self.g_regex_flags, memo), + deepcopy(self.skip_validation, memo), + deepcopy(self.use_bytes, memo), + ) class ParserConf(Serialize): diff --git a/lark/utils.py b/lark/utils.py index b9d7ac3..ea78801 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -73,14 +73,13 @@ class Serialize(object): fields = getattr(self, '__serialize_fields__') res = {f: _serialize(getattr(self, f), memo) for f in fields} res['__type__'] = type(self).__name__ - postprocess = getattr(self, '_serialize', None) - if postprocess: - postprocess(res, memo) + if hasattr(self, '_serialize'): + self._serialize(res, memo) return res @classmethod def deserialize(cls, data, memo): - namespace = 
getattr(cls, '__serialize_namespace__', {}) + namespace = getattr(cls, '__serialize_namespace__', []) namespace = {c.__name__:c for c in namespace} fields = getattr(cls, '__serialize_fields__') @@ -94,9 +93,10 @@ class Serialize(object): setattr(inst, f, _deserialize(data[f], namespace, memo)) except KeyError as e: raise KeyError("Cannot find key for class", cls, e) - postprocess = getattr(inst, '_deserialize', None) - if postprocess: - postprocess() + + if hasattr(inst, '_deserialize'): + inst._deserialize() + return inst From 688c581949b94eccd7ba30baa092a3e4189af008 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Tue, 13 Jul 2021 16:12:09 +0200 Subject: [PATCH 13/31] Updated a few links I believe that the changed link from `examples` to `/examples` isn't a problem on readthedocs, but we should check. If it works, this PR fixes #941 . --- docs/json_tutorial.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/json_tutorial.md b/docs/json_tutorial.md index 65c6c78..668d9de 100644 --- a/docs/json_tutorial.md +++ b/docs/json_tutorial.md @@ -427,9 +427,9 @@ I measured memory consumption using a little script called [memusg](https://gist | Lark - Earley *(with lexer)* | 42s | 4s | 1167M | 608M | | Lark - LALR(1) | 8s | 1.53s | 453M | 266M | | Lark - LALR(1) tree-less | 4.76s | 1.23s | 70M | 134M | -| PyParsing ([Parser](http://pyparsing.wikispaces.com/file/view/jsonParser.py)) | 32s | 3.53s | 443M | 225M | -| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py)) | 8.5s | 1.3s | 483M | 293M | -| Parsimonious ([Parser](https://gist.githubusercontent.com/reclosedev/5222560/raw/5e97cf7eb62c3a3671885ec170577285e891f7d5/parsimonious_json.py)) | ? | 5.7s | ? | 1545M | +| PyParsing ([Parser](https://github.com/pyparsing/pyparsing/blob/master/examples/jsonParser.py)) | 32s | 3.53s | 443M | 225M | +| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/tests/json.py)) | 8.5s | 1.3s | 483M | 293M | +| Parsimonious ([Parser](https://gist.github.com/reclosedev/5222560)) | ? | 5.7s | ? | 1545M | I added a few other parsers for comparison. PyParsing and funcparselib fair pretty well in their memory usage (they don't build a tree), but they can't compete with the run-time speed of LALR(1). @@ -442,7 +442,7 @@ Once again, shout-out to PyPy for being so effective. This is the end of the tutorial. I hoped you liked it and learned a little about Lark. -To see what else you can do with Lark, check out the [examples](examples). +To see what else you can do with Lark, check out the [examples](/examples). For questions or any other subject, feel free to email me at erezshin at gmail dot com. 
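
The `__deepcopy__` override introduced in patch 12 (and repeated in patch 14 below) works by leaving `re_module` out of the deep copy: `LexerConf` stores a module object (`re` or `regex`), and a default deep copy would try to duplicate it along with the rest of the instance's attributes. The sketch below illustrates that pattern in isolation. It is a simplified, hypothetical stand-in for `lark.common.LexerConf`, not code from the patches, and the motivation (that deep-copying a stored module attribute fails, since `deepcopy` falls back to pickling, which rejects modules) is an inference from issue #938 rather than something the diff states.

    # Simplified, hypothetical stand-in for LexerConf: keep the module
    # attribute by reference, deep-copy everything else.
    import re
    from copy import deepcopy

    class FakeLexerConf:
        def __init__(self, terminals, re_module, ignore=()):
            self.terminals = terminals      # per-instance data worth copying
            self.re_module = re_module      # the `re` or `regex` module itself
            self.ignore = ignore

        def __deepcopy__(self, memo=None):
            return type(self)(
                deepcopy(self.terminals, memo),
                self.re_module,             # passed through, not copied
                deepcopy(self.ignore, memo),
            )

    conf = FakeLexerConf([('NUMBER', r'\d+')], re)
    conf_copy = deepcopy(conf)
    assert conf_copy.terminals is not conf.terminals   # data really copied
    assert conf_copy.re_module is re                    # module shared by reference

Passing the module through by reference is reasonable because modules are effectively global singletons: there is nothing instance-specific in them to copy.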
From 7cb8acbe54eb108b6e99859adfd41717df43e032 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 9 Jul 2021 22:44:31 +0300 Subject: [PATCH 14/31] Bugfix for deepcopy + small unrelated refactor (issue #938) --- lark/common.py | 12 ++++++++++++ lark/utils.py | 14 +++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/lark/common.py b/lark/common.py index 467acf8..cb408d9 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,4 +1,5 @@ from warnings import warn +from copy import deepcopy from .utils import Serialize from .lexer import TerminalDef @@ -31,6 +32,17 @@ class LexerConf(Serialize): def _deserialize(self): self.terminals_by_name = {t.name: t for t in self.terminals} + def __deepcopy__(self, memo=None): + return type(self)( + deepcopy(self.terminals, memo), + self.re_module, + deepcopy(self.ignore, memo), + deepcopy(self.postlex, memo), + deepcopy(self.callbacks, memo), + deepcopy(self.g_regex_flags, memo), + deepcopy(self.skip_validation, memo), + deepcopy(self.use_bytes, memo), + ) class ParserConf(Serialize): diff --git a/lark/utils.py b/lark/utils.py index b9d7ac3..ea78801 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -73,14 +73,13 @@ class Serialize(object): fields = getattr(self, '__serialize_fields__') res = {f: _serialize(getattr(self, f), memo) for f in fields} res['__type__'] = type(self).__name__ - postprocess = getattr(self, '_serialize', None) - if postprocess: - postprocess(res, memo) + if hasattr(self, '_serialize'): + self._serialize(res, memo) return res @classmethod def deserialize(cls, data, memo): - namespace = getattr(cls, '__serialize_namespace__', {}) + namespace = getattr(cls, '__serialize_namespace__', []) namespace = {c.__name__:c for c in namespace} fields = getattr(cls, '__serialize_fields__') @@ -94,9 +93,10 @@ class Serialize(object): setattr(inst, f, _deserialize(data[f], namespace, memo)) except KeyError as e: raise KeyError("Cannot find key for class", cls, e) - postprocess = getattr(inst, '_deserialize', None) - if postprocess: - postprocess() + + if hasattr(inst, '_deserialize'): + inst._deserialize() + return inst From 87a18a098e306dbe0f4258732ad8944832dc4a39 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 15 Jul 2021 17:00:15 +0300 Subject: [PATCH 15/31] Tiny fix: MakeParsingFrontend is a regular method, not a classmethod --- lark/parser_frontends.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 1818ca7..0e53dd5 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -39,8 +39,7 @@ class MakeParsingFrontend: lexer_conf.lexer_type = self.lexer_type return ParsingFrontend(lexer_conf, parser_conf, options) - @classmethod - def deserialize(cls, data, memo, lexer_conf, callbacks, options): + def deserialize(self, data, memo, lexer_conf, callbacks, options): parser_conf = ParserConf.deserialize(data['parser_conf'], memo) parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug) parser_conf.callbacks = callbacks From 5e5bd187a6fed1d94ff253dbd4f7d908e1d72476 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 22 Jul 2021 11:21:13 +0300 Subject: [PATCH 16/31] Docs: Improved documentation of exceptions --- docs/classes.rst | 2 ++ docs/visitors.rst | 5 +++++ lark/ast_utils.py | 4 ++-- lark/exceptions.py | 31 +++++++++++++++++++++++++------ lark/lark.py | 8 +++++++- 5 files changed, 41 insertions(+), 9 deletions(-) diff --git a/docs/classes.rst b/docs/classes.rst index 7b18460..1287896 100644 --- a/docs/classes.rst 
+++ b/docs/classes.rst @@ -66,6 +66,8 @@ UnexpectedInput .. autoclass:: lark.exceptions.UnexpectedCharacters +.. autoclass:: lark.exceptions.UnexpectedEOF + InteractiveParser ----------------- diff --git a/docs/visitors.rst b/docs/visitors.rst index a0e1711..f263712 100644 --- a/docs/visitors.rst +++ b/docs/visitors.rst @@ -107,3 +107,8 @@ Discard ------- .. autoclass:: lark.visitors.Discard + +VisitError +------- + +.. autoclass:: lark.exceptions.VisitError \ No newline at end of file diff --git a/lark/ast_utils.py b/lark/ast_utils.py index 0f2e498..b5463a2 100644 --- a/lark/ast_utils.py +++ b/lark/ast_utils.py @@ -36,8 +36,8 @@ def create_transformer(ast_module, transformer=None): Classes starting with an underscore (`_`) will be skipped. Parameters: - ast_module - A Python module containing all the subclasses of `ast_utils.Ast` - transformer (Optional[Transformer]) - An initial transformer. Its attributes may be overwritten. + ast_module: A Python module containing all the subclasses of ``ast_utils.Ast`` + transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten. """ t = transformer or Transformer() diff --git a/lark/exceptions.py b/lark/exceptions.py index 9d326b8..fdcd52b 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -36,8 +36,9 @@ class UnexpectedInput(LarkError): Used as a base class for the following exceptions: - - ``UnexpectedToken``: The parser received an unexpected token - ``UnexpectedCharacters``: The lexer encountered an unexpected string + - ``UnexpectedToken``: The parser received an unexpected token + - ``UnexpectedEOF``: The parser expected a token, but the input ended After catching one of these exceptions, you may call the following helper methods to create a nicer error message. """ @@ -128,6 +129,9 @@ class UnexpectedInput(LarkError): class UnexpectedEOF(ParseError, UnexpectedInput): + """An exception that is raised by the parser, when the input ends while it still expects a token. + """ + def __init__(self, expected, state=None, terminals_by_name=None): super(UnexpectedEOF, self).__init__() @@ -148,6 +152,10 @@ class UnexpectedEOF(ParseError, UnexpectedInput): class UnexpectedCharacters(LexError, UnexpectedInput): + """An exception that is raised by the lexer, when it cannot match the next + string of characters to any of its terminals. + """ + def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, terminals_by_name=None, considered_rules=None): super(UnexpectedCharacters, self).__init__() @@ -185,10 +193,15 @@ class UnexpectedToken(ParseError, UnexpectedInput): """An exception that is raised by the parser, when the token it received doesn't match any valid step forward. - The parser provides an interactive instance through `interactive_parser`, - which is initialized to the point of failture, and can be used for debugging and error handling. + Parameters: + token: The mismatched token + expected: The set of expected tokens + considered_rules: Which rules were considered, to deduce the expected tokens + state: A value representing the parser state. Do not rely on its value or type. + interactive_parser: An instance of ``InteractiveParser``, that is initialized to the point of failture, + and can be used for debugging and error handling. - see: ``InteractiveParser``. + Note: These parameters are available as attributes of the instance. 
""" def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None): @@ -234,14 +247,20 @@ class VisitError(LarkError): """VisitError is raised when visitors are interrupted by an exception It provides the following attributes for inspection: - - obj: the tree node or token it was processing when the exception was raised - - orig_exc: the exception that cause it to fail + + Parameters: + rule: the name of the visit rule that failed + obj: the tree-node or token that was being processed + orig_exc: the exception that cause it to fail + + Note: These parameters are available as attributes """ def __init__(self, rule, obj, orig_exc): message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) + self.rule = rule self.obj = obj self.orig_exc = orig_exc diff --git a/lark/lark.py b/lark/lark.py index 9a4b2d5..45dec4d 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -102,7 +102,7 @@ class LarkOptions(Serialize): A List of either paths or loader functions to specify from where grammars are imported source_path Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading - **=== End Options ===** + **=== End of Options ===** """ if __doc__: __doc__ += OPTIONS_DOC @@ -527,6 +527,8 @@ class Lark(Serialize): """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard' When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. + + :raises UnexpectedCharacters: In case the lexer cannot find a suitable match. """ if not hasattr(self, 'lexer') or dont_ignore: lexer = self._build_lexer(dont_ignore) @@ -569,6 +571,10 @@ class Lark(Serialize): If a transformer is supplied to ``__init__``, returns whatever is the result of the transformation. Otherwise, returns a Tree instance. + :raises UnexpectedInput: On a parse error, one of these sub-exceptions will rise: + ``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``. + For convenience, these sub-exceptions also inherit from ``ParserError`` and ``LexerError``. 
+ """ return self.parser.parse(text, start=start, on_error=on_error) From 55642be13c1a5ac36a999124ae3c875492d574d1 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 25 Jul 2021 17:13:23 +0300 Subject: [PATCH 17/31] Tiny adjustments --- examples/standalone/json_parser_main.py | 4 +++- lark/parsers/lalr_interactive_parser.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/standalone/json_parser_main.py b/examples/standalone/json_parser_main.py index 503b249..3d9b5a6 100644 --- a/examples/standalone/json_parser_main.py +++ b/examples/standalone/json_parser_main.py @@ -10,7 +10,9 @@ Standalone Parser import sys -from json_parser import Lark_StandAlone, Transformer, inline_args +from json_parser import Lark_StandAlone, Transformer, v_args + +inline_args = v_args(inline=True) class TreeToJson(Transformer): @inline_args diff --git a/lark/parsers/lalr_interactive_parser.py b/lark/parsers/lalr_interactive_parser.py index ce596b5..d6780cb 100644 --- a/lark/parsers/lalr_interactive_parser.py +++ b/lark/parsers/lalr_interactive_parser.py @@ -65,7 +65,7 @@ class InteractiveParser(object): """Print the output of ``choices()`` in a way that's easier to read.""" out = ["Parser choices:"] for k, v in self.choices().items(): - out.append('\t- %s -> %s' % (k, v)) + out.append('\t- %s -> %r' % (k, v)) out.append('stack size: %s' % len(self.parser_state.state_stack)) return '\n'.join(out) From b0a9afb287eaaeb139140d088cccbd6167f92aa1 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Sun, 25 Jul 2021 23:07:08 +0200 Subject: [PATCH 18/31] Split up repeats from tilde into different rules. --- lark/load_grammar.py | 24 ++++++++++++++++++++++-- lark/utils.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index dbf4a1f..569e67d 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -9,7 +9,7 @@ import pkgutil from ast import literal_eval from numbers import Integral -from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique +from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder @@ -196,6 +196,26 @@ class EBNF_to_BNF(Transformer_InPlace): self.rules_by_expr[expr] = t return t + def _add_repeat_rule(self, a, b, target, atom): + if (a, b, target, atom) in self.rules_by_expr: + return self.rules_by_expr[(a, b, target, atom)] + new_name = '__%s_a%d_b%d_%d' % (self.prefix, a, b, self.i) + self.i += 1 + t = NonTerminal(new_name) + tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) + self.new_rules.append((new_name, tree, self.rule_options)) + self.rules_by_expr[(a, b, target, atom)] = t + return t + + def _generate_repeats(self, rule, mn, mx): + factors = small_factors(mn) + target = rule + for a, b in factors: + target = self._add_repeat_rule(a, b, target, rule) + + # return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) + return ST('expansions', [ST('expansion', [target] + [rule] * n) for n in range(0, mx - mn + 1)]) + def expr(self, rule, op, *args): if op.value == '?': empty = ST('expansion', []) @@ -220,7 +240,7 @@ class EBNF_to_BNF(Transformer_InPlace): mn, mx = map(int, args) if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) - return ST('expansions', [ST('expansion', [rule] * n) for n in 
range(mn, mx+1)]) + return self._generate_repeats(rule, mn, mx) assert False, op def maybe(self, rule): diff --git a/lark/utils.py b/lark/utils.py index ea78801..a3a077f 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -359,3 +359,33 @@ def _serialize(value, memo): return {key:_serialize(elem, memo) for key, elem in value.items()} # assert value is None or isinstance(value, (int, float, str, tuple)), value return value + + +def small_factors(n): + """ + Splits n up into smaller factors and summands <= 10. + Returns a list of [(a, b), ...] + so that the following code returns n: + + n = 1 + for a, b in values: + n = n * a + b + + Currently, we also keep a + b <= 10, but that might change + """ + assert n > 0 + if n < 10: + return [(n, 0)] + # TODO: Think of better algorithms (Prime factors should minimize the number of steps) + for a in range(10, 1, -1): + b = n % a + if a + b > 10: + continue + r = n // a + assert r * a + b == n # Sanity check + if r <= 10: + return [(r, 0), (a, b)] + else: + return [*small_factors(r), (a, b)] + # This should be unreachable, since 2 + 1 <= 10 + assert False, "Failed to factorize %s" % n From 845b6fa477827d6ee77a21eaced1c3f3a4a8d8b0 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 26 Jul 2021 01:14:46 +0200 Subject: [PATCH 19/31] Refactor + tests + additional splitting up. --- lark/load_grammar.py | 100 ++++++++++++++++++++++++++++++++----------- lark/utils.py | 2 +- tests/test_parser.py | 29 +++++++++++++ 3 files changed, 105 insertions(+), 26 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 569e67d..2f51ff6 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -179,42 +179,87 @@ RULES = { class EBNF_to_BNF(Transformer_InPlace): def __init__(self): self.new_rules = [] - self.rules_by_expr = {} + self.rules_cache = {} self.prefix = 'anon' self.i = 0 self.rule_options = None - def _add_recurse_rule(self, type_, expr): - if expr in self.rules_by_expr: - return self.rules_by_expr[expr] - - new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) + def _name_rule(self, inner): + new_name = '__%s_%s_%d' % (self.prefix, inner, self.i) self.i += 1 - t = NonTerminal(new_name) - tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])]) - self.new_rules.append((new_name, tree, self.rule_options)) - self.rules_by_expr[expr] = t + return new_name + + def _add_rule(self, key, name, expansions): + t = NonTerminal(name) + self.new_rules.append((name, expansions, self.rule_options)) + self.rules_cache[key] = t return t + def _add_recurse_rule(self, type_, expr): + try: + return self.rules_cache[expr] + except KeyError: + new_name = self._name_rule(type_) + t = NonTerminal(new_name) + tree = ST('expansions', [ + ST('expansion', [expr]), + ST('expansion', [t, expr]) + ]) + return self._add_rule(expr, new_name, tree) + def _add_repeat_rule(self, a, b, target, atom): - if (a, b, target, atom) in self.rules_by_expr: - return self.rules_by_expr[(a, b, target, atom)] - new_name = '__%s_a%d_b%d_%d' % (self.prefix, a, b, self.i) - self.i += 1 - t = NonTerminal(new_name) - tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) - self.new_rules.append((new_name, tree, self.rule_options)) - self.rules_by_expr[(a, b, target, atom)] = t - return t + """ + When target matches n times atom + This builds a rule that matches atom (a*n + b) times + """ + key = (a, b, target, atom) + try: + return self.rules_cache[key] + except KeyError: + new_name = self._name_rule('a%d_b%d' % (a, b)) + tree = ST('expansions', 
[ST('expansion', [target] * a + [atom] * b)]) + return self._add_rule(key, new_name, tree) + + def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): + """ + When target matches n times atom, and target_opt 0 to n-1 times target_opt, + This builds a rule that matches atom 0 to (a*n+b)-1 times + """ + key = (a, b, target, atom, "opt") + try: + return self.rules_cache[key] + except KeyError: + new_name = self._name_rule('a%d_b%d_opt' % (a, b)) + tree = ST('expansions', [ + ST('expansion', [target] * i + [target_opt]) + for i in range(a) + ] + [ + ST('expansion', [target] * a + [atom] * i) + for i in range(1, b) + ]) + return self._add_rule(key, new_name, tree) def _generate_repeats(self, rule, mn, mx): - factors = small_factors(mn) - target = rule - for a, b in factors: - target = self._add_repeat_rule(a, b, target, rule) + mn_factors = small_factors(mn) + mn_target = rule + for a, b in mn_factors: + mn_target = self._add_repeat_rule(a, b, mn_target, rule) + if mx == mn: + return mn_target + diff = mx - mn + 1 # We add one because _add_repeat_opt_rule needs it. + diff_factors = small_factors(diff) + diff_target = rule + diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. 1-1 times) + for a, b in diff_factors[:-1]: + new_diff_target = self._add_repeat_rule(a, b, diff_target, rule) + diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) + diff_target = new_diff_target + a, b = diff_factors[-1] + diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) # return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) - return ST('expansions', [ST('expansion', [target] + [rule] * n) for n in range(0, mx - mn + 1)]) + # return ST('expansions', [ST('expansion', [mn_target] + [rule] * n) for n in range(0, mx - mn + 1)]) + return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) def expr(self, rule, op, *args): if op.value == '?': @@ -240,7 +285,12 @@ class EBNF_to_BNF(Transformer_InPlace): mn, mx = map(int, args) if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) - return self._generate_repeats(rule, mn, mx) + # For small number of repeats, we don't need to build new rules. 
+ # Value 20 is arbitrarily chosen + if mx > 20: + return self._generate_repeats(rule, mn, mx) + else: + return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) assert False, op def maybe(self, rule): diff --git a/lark/utils.py b/lark/utils.py index a3a077f..2fa5f43 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -373,7 +373,7 @@ def small_factors(n): Currently, we also keep a + b <= 10, but that might change """ - assert n > 0 + assert n >= 0 if n < 10: return [(n, 0)] # TODO: Think of better algorithms (Prime factors should minimize the number of steps) diff --git a/tests/test_parser.py b/tests/test_parser.py index 8fec82d..6c00fbb 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2226,6 +2226,35 @@ def _make_parser_test(LEXER, PARSER): self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') + @unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated") + def test_ranged_repeat_large(self): + # Large is currently arbitrarily chosen to be large than 20 + g = u"""!start: "A"~30 + """ + l = _Lark(g) + self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") + self.assertEqual(l.parse(u'A'*30), Tree('start', ["A"]*30)) + self.assertRaises(ParseError, l.parse, u'A'*29) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A'*31) + + + g = u"""!start: "A"~0..100 + """ + l = _Lark(g) + self.assertEqual(l.parse(u''), Tree('start', [])) + self.assertEqual(l.parse(u'A'), Tree('start', ['A'])) + self.assertEqual(l.parse(u'A'*100), Tree('start', ['A']*100)) + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 101) + + # 8191 is a Mersenne prime + g = u"""start: "A"~8191 + """ + l = _Lark(g) + self.assertEqual(l.parse(u'A'*8191), Tree('start', [])) + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190) + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192) + + @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX def test_priority_vs_embedded(self): g = """ From b4fe22a27dd67bca414be767b92ab2960798f0d6 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 26 Jul 2021 10:50:37 +0200 Subject: [PATCH 20/31] Python2.7 + comments + Magic constants --- lark/load_grammar.py | 48 ++++++++++++++++++++++++++++++++++++-------- lark/utils.py | 23 ++++++++++++--------- tests/test_parser.py | 11 +++++----- 3 files changed, 58 insertions(+), 24 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 2f51ff6..2b1030f 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -174,6 +174,10 @@ RULES = { 'literal': ['REGEXP', 'STRING'], } +REPEAT_BREAK_THRESHOLD = 20 +# The Threshold whether repeat via ~ are split up into different rules +# For the moment 20 is arbitrarily chosen + @inline_args class EBNF_to_BNF(Transformer_InPlace): @@ -211,25 +215,50 @@ class EBNF_to_BNF(Transformer_InPlace): """ When target matches n times atom This builds a rule that matches atom (a*n + b) times + + The rule is of the form: + + The rules are of the form: (Example a = 3, b = 4) + + new_rule: target target target atom atom atom atom + + e.g. 
we use target * a and atom * b """ key = (a, b, target, atom) try: return self.rules_cache[key] except KeyError: - new_name = self._name_rule('a%d_b%d' % (a, b)) + new_name = self._name_rule('repeat_a%d_b%d' % (a, b)) tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) return self._add_rule(key, new_name, tree) def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): """ When target matches n times atom, and target_opt 0 to n-1 times target_opt, - This builds a rule that matches atom 0 to (a*n+b)-1 times + This builds a rule that matches atom 0 to (a*n+b)-1 times. + The created rule will not have any shift/reduce conflicts so that it can be used with lalr + + The rules are of the form: (Example a = 3, b = 4) + + new_rule: target_opt + | target target_opt + | target target target_opt + + | target target target atom + | target target target atom atom + | target target target atom atom atom + + First we generate target * i followed by target_opt for i from 0 to a-1 + These match 0 to n*a - 1 times atom + + Then we generate target * a followed by atom * i for i from 1 to b-1 + These match n*a to n*a + b-1 times atom """ key = (a, b, target, atom, "opt") try: return self.rules_cache[key] except KeyError: - new_name = self._name_rule('a%d_b%d_opt' % (a, b)) + new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b)) tree = ST('expansions', [ ST('expansion', [target] * i + [target_opt]) for i in range(a) @@ -240,13 +269,19 @@ class EBNF_to_BNF(Transformer_InPlace): return self._add_rule(key, new_name, tree) def _generate_repeats(self, rule, mn, mx): + """ + We treat rule~mn..mx as rule~mn rule~0..(diff=mx-mn). + We then use small_factors to split up mn and diff up into values [(a, b), ...] + This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt + to generate a complete rule/expression that matches the corresponding number of repeats + """ mn_factors = small_factors(mn) mn_target = rule for a, b in mn_factors: mn_target = self._add_repeat_rule(a, b, mn_target, rule) if mx == mn: return mn_target - diff = mx - mn + 1 # We add one because _add_repeat_opt_rule needs it. + diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less diff_factors = small_factors(diff) diff_target = rule diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. 1-1 times) @@ -257,8 +292,6 @@ class EBNF_to_BNF(Transformer_InPlace): a, b = diff_factors[-1] diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) - # return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) - # return ST('expansions', [ST('expansion', [mn_target] + [rule] * n) for n in range(0, mx - mn + 1)]) return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) def expr(self, rule, op, *args): @@ -286,8 +319,7 @@ class EBNF_to_BNF(Transformer_InPlace): if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) # For small number of repeats, we don't need to build new rules. 
- # Value 20 is arbitrarily chosen - if mx > 20: + if mx > REPEAT_BREAK_THRESHOLD: return self._generate_repeats(rule, mn, mx) else: return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) diff --git a/lark/utils.py b/lark/utils.py index 2fa5f43..1648720 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -187,7 +187,7 @@ def get_regexp_width(expr): return 1, sre_constants.MAXREPEAT else: return 0, sre_constants.MAXREPEAT - + ###} @@ -288,7 +288,7 @@ except ImportError: class FS: exists = os.path.exists - + @staticmethod def open(name, mode="r", **kwargs): if atomicwrites and "w" in mode: @@ -361,9 +361,13 @@ def _serialize(value, memo): return value +# 10 is arbitrarily chosen +SMALL_FACTOR_THRESHOLD = 10 + + def small_factors(n): """ - Splits n up into smaller factors and summands <= 10. + Splits n up into smaller factors and summands <= SMALL_FACTOR_THRESHOLD. Returns a list of [(a, b), ...] so that the following code returns n: @@ -371,21 +375,20 @@ def small_factors(n): for a, b in values: n = n * a + b - Currently, we also keep a + b <= 10, but that might change + Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change """ assert n >= 0 - if n < 10: + if n < SMALL_FACTOR_THRESHOLD: return [(n, 0)] # TODO: Think of better algorithms (Prime factors should minimize the number of steps) - for a in range(10, 1, -1): + for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): b = n % a - if a + b > 10: + if a + b > SMALL_FACTOR_THRESHOLD: continue r = n // a assert r * a + b == n # Sanity check - if r <= 10: + if r <= SMALL_FACTOR_THRESHOLD: return [(r, 0), (a, b)] else: - return [*small_factors(r), (a, b)] - # This should be unreachable, since 2 + 1 <= 10 + return small_factors(r) + [(a, b)] assert False, "Failed to factorize %s" % n diff --git a/tests/test_parser.py b/tests/test_parser.py index 6c00fbb..2247b46 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2233,24 +2233,23 @@ def _make_parser_test(LEXER, PARSER): """ l = _Lark(g) self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") - self.assertEqual(l.parse(u'A'*30), Tree('start', ["A"]*30)) - self.assertRaises(ParseError, l.parse, u'A'*29) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A'*31) - + self.assertEqual(l.parse(u'A' * 30), Tree('start', ["A"] * 30)) + self.assertRaises(ParseError, l.parse, u'A' * 29) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 31) g = u"""!start: "A"~0..100 """ l = _Lark(g) self.assertEqual(l.parse(u''), Tree('start', [])) self.assertEqual(l.parse(u'A'), Tree('start', ['A'])) - self.assertEqual(l.parse(u'A'*100), Tree('start', ['A']*100)) + self.assertEqual(l.parse(u'A' * 100), Tree('start', ['A'] * 100)) self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 101) # 8191 is a Mersenne prime g = u"""start: "A"~8191 """ l = _Lark(g) - self.assertEqual(l.parse(u'A'*8191), Tree('start', [])) + self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190) self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192) From 6872404f1123bc6dcabb4f1735622747999b2bdc Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 26 Jul 2021 12:20:15 +0300 Subject: [PATCH 21/31] Improvements to the Python3 grammar --- examples/advanced/python3.lark | 142 +++++++++++++++++++++------------ 1 file changed, 89 insertions(+), 53 deletions(-) diff --git a/examples/advanced/python3.lark 
b/examples/advanced/python3.lark index 0fc5949..e54eb69 100644 --- a/examples/advanced/python3.lark +++ b/examples/advanced/python3.lark @@ -21,7 +21,7 @@ decorators: decorator+ decorated: decorators (classdef | funcdef | async_funcdef) async_funcdef: "async" funcdef -funcdef: "def" NAME "(" parameters? ")" ["->" test] ":" suite +funcdef: "def" NAME "(" [parameters] ")" ["->" test] ":" suite parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams]] | starparams @@ -29,25 +29,36 @@ parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams SLASH: "/" // Otherwise the it will completely disappear and it will be undisguisable in the result starparams: "*" typedparam? ("," paramvalue)* ["," kwparams] -kwparams: "**" typedparam +kwparams: "**" typedparam ","? -?paramvalue: typedparam ["=" test] -?typedparam: NAME [":" test] +?paramvalue: typedparam ("=" test)? +?typedparam: NAME (":" test)? -varargslist: (vfpdef ["=" test] ("," vfpdef ["=" test])* ["," [ "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] | "**" vfpdef [","]]] - | "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] - | "**" vfpdef [","]) -vfpdef: NAME +lambdef: "lambda" [lambda_params] ":" test +lambdef_nocond: "lambda" [lambda_params] ":" test_nocond +lambda_params: lambda_paramvalue ("," lambda_paramvalue)* ["," [lambda_starparams | lambda_kwparams]] + | lambda_starparams + | lambda_kwparams +?lambda_paramvalue: NAME ("=" test)? +lambda_starparams: "*" [NAME] ("," lambda_paramvalue)* ["," [lambda_kwparams]] +lambda_kwparams: "**" NAME ","? + ?stmt: simple_stmt | compound_stmt ?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE -?small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) -?expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) - | ("=" (yield_expr|testlist_star_expr))*) -annassign: ":" test ["=" test] -?testlist_star_expr: (test|star_expr) ("," (test|star_expr))* [","] -!augassign: ("+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=") +?small_stmt: (expr_stmt | assign_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) +expr_stmt: testlist_star_expr +assign_stmt: annassign | augassign | assign + +annassign: testlist_star_expr ":" test ["=" test] +assign: testlist_star_expr ("=" (yield_expr|testlist_star_expr))+ +augassign: testlist_star_expr augassign_op (yield_expr|testlist) +!augassign_op: "+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=" +?testlist_star_expr: test_or_star_expr + | test_or_star_expr ("," test_or_star_expr)+ ","? 
-> tuple + | test_or_star_expr "," -> tuple + // For normal and annotated assignments, additional restrictions enforced by the interpreter del_stmt: "del" exprlist pass_stmt: "pass" @@ -71,43 +82,52 @@ global_stmt: "global" NAME ("," NAME)* nonlocal_stmt: "nonlocal" NAME ("," NAME)* assert_stmt: "assert" test ["," test] -compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt +?compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt async_stmt: "async" (funcdef | with_stmt | for_stmt) -if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite] +if_stmt: "if" test ":" suite elifs ["else" ":" suite] +elifs: elif_* +elif_: "elif" test ":" suite while_stmt: "while" test ":" suite ["else" ":" suite] for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite] -try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite)) -with_stmt: "with" with_item ("," with_item)* ":" suite +try_stmt: "try" ":" suite except_clauses ["else" ":" suite] [finally] + | "try" ":" suite finally -> try_finally +finally: "finally" ":" suite +except_clauses: except_clause+ +except_clause: "except" [test ["as" NAME]] ":" suite + +with_stmt: "with" with_items ":" suite +with_items: with_item ("," with_item)* with_item: test ["as" expr] // NB compile.c makes sure that the default except clause is last -except_clause: "except" [test ["as" NAME]] suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT -?test: or_test ("if" or_test "else" test)? | lambdef +?test: or_test ("if" or_test "else" test)? + | lambdef ?test_nocond: or_test | lambdef_nocond -lambdef: "lambda" [varargslist] ":" test -lambdef_nocond: "lambda" [varargslist] ":" test_nocond + ?or_test: and_test ("or" and_test)* ?and_test: not_test ("and" not_test)* -?not_test: "not" not_test -> not +?not_test: "not" not_test -> not_test | comparison -?comparison: expr (_comp_op expr)* +?comparison: expr (comp_op expr)* star_expr: "*" expr -?expr: xor_expr ("|" xor_expr)* + +?expr: or_expr +?or_expr: xor_expr ("|" xor_expr)* ?xor_expr: and_expr ("^" and_expr)* ?and_expr: shift_expr ("&" shift_expr)* ?shift_expr: arith_expr (_shift_op arith_expr)* ?arith_expr: term (_add_op term)* ?term: factor (_mul_op factor)* -?factor: _factor_op factor | power +?factor: _unary_op factor | power -!_factor_op: "+"|"-"|"~" +!_unary_op: "+"|"-"|"~" !_add_op: "+"|"-" !_shift_op: "<<"|">>" !_mul_op: "*"|"@"|"/"|"%"|"//" // <> isn't actually a valid comparison operator in Python. It's here for the // sake of a __future__ import described in PEP 401 (which really works :-) -!_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" +!comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" ?power: await_expr ("**" factor)? ?await_expr: AWAIT? atom_expr @@ -118,61 +138,76 @@ AWAIT: "await" | atom_expr "." NAME -> getattr | atom -?atom: "(" [yield_expr|tuplelist_comp] ")" -> tuple - | "[" [testlist_comp] "]" -> list - | "{" [dict_comp] "}" -> dict - | "{" set_comp "}" -> set +?atom: "(" yield_expr ")" + | "(" _tuple_inner? ")" -> tuple + | "(" comprehension{test_or_star_expr} ")" -> tuple_comprehension + | "[" _testlist_comp? "]" -> list + | "[" comprehension{test_or_star_expr} "]" -> list_comprehension + | "{" _dict_exprlist? 
"}" -> dict + | "{" comprehension{key_value} "}" -> dict_comprehension + | "{" _set_exprlist "}" -> set + | "{" comprehension{test} "}" -> set_comprehension | NAME -> var - | number | string+ + | TEMPLATE_NAME -> template_var + | number + | string_concat | "(" test ")" | "..." -> ellipsis | "None" -> const_none | "True" -> const_true | "False" -> const_false -?testlist_comp: test | tuplelist_comp -tuplelist_comp: (test|star_expr) (comp_for | ("," (test|star_expr))+ [","] | ",") + +?string_concat: string+ + +_testlist_comp: test | _tuple_inner +_tuple_inner: test_or_star_expr (("," test_or_star_expr)+ [","] | ",") + + +?test_or_star_expr: test + | star_expr + ?subscriptlist: subscript | subscript (("," subscript)+ [","] | ",") -> subscript_tuple -subscript: test | ([test] ":" [test] [sliceop]) -> slice +?subscript: test | ([test] ":" [test] [sliceop]) -> slice sliceop: ":" [test] -exprlist: (expr|star_expr) - | (expr|star_expr) (("," (expr|star_expr))+ [","]|",") -> exprlist_tuple -testlist: test | testlist_tuple +?exprlist: (expr|star_expr) + | (expr|star_expr) (("," (expr|star_expr))+ [","]|",") +?testlist: test | testlist_tuple testlist_tuple: test (("," test)+ [","] | ",") -dict_comp: key_value comp_for - | (key_value | "**" expr) ("," (key_value | "**" expr))* [","] +_dict_exprlist: (key_value | "**" expr) ("," (key_value | "**" expr))* [","] key_value: test ":" test -set_comp: test comp_for - | (test|star_expr) ("," (test | star_expr))* [","] +_set_exprlist: test_or_star_expr ("," test_or_star_expr)* [","] classdef: "class" NAME ["(" [arguments] ")"] ":" suite + + arguments: argvalue ("," argvalue)* ("," [ starargs | kwargs])? | starargs | kwargs - | test comp_for + | comprehension{test} -starargs: "*" test ("," "*" test)* ("," argvalue)* ["," kwargs] +starargs: stararg ("," stararg)* ("," argvalue)* ["," kwargs] +stararg: "*" test kwargs: "**" test ?argvalue: test ("=" test)? 
- -comp_iter: comp_for | comp_if | async_for -async_for: "async" "for" exprlist "in" or_test [comp_iter] -comp_for: "for" exprlist "in" or_test [comp_iter] -comp_if: "if" test_nocond [comp_iter] +comprehension{comp_result}: comp_result comp_fors [comp_if] +comp_fors: comp_for+ +comp_for: [ASYNC] "for" exprlist "in" or_test +ASYNC: "async" +?comp_if: "if" test_nocond // not used in grammar, but may appear in "node" passed from Parser to Compiler encoding_decl: NAME -yield_expr: "yield" [yield_arg] -yield_arg: "from" test | testlist - +yield_expr: "yield" [testlist] + | "yield" "from" test -> yield_from number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER string: STRING | LONG_STRING @@ -181,6 +216,7 @@ string: STRING | LONG_STRING %import python (NAME, COMMENT, STRING, LONG_STRING) %import python (DEC_NUMBER, HEX_NUMBER, OCT_NUMBER, BIN_NUMBER, FLOAT_NUMBER, IMAG_NUMBER) + // Other terminals _NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+ From 6a027982c7f8c014dc8403f9d22e52d1c9cb5a21 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 26 Jul 2021 14:54:41 +0300 Subject: [PATCH 22/31] Tiny fix to PR --- examples/advanced/python3.lark | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/advanced/python3.lark b/examples/advanced/python3.lark index e54eb69..7fb5ae5 100644 --- a/examples/advanced/python3.lark +++ b/examples/advanced/python3.lark @@ -148,7 +148,6 @@ AWAIT: "await" | "{" _set_exprlist "}" -> set | "{" comprehension{test} "}" -> set_comprehension | NAME -> var - | TEMPLATE_NAME -> template_var | number | string_concat | "(" test ")" From fa8565366be027df0f788cb1432d90b2b94aa264 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 26 Jul 2021 18:53:43 +0200 Subject: [PATCH 23/31] Off-by-one fix + Change of thresholds + fix tests --- lark/load_grammar.py | 29 ++++++++++++++++++++++------- lark/utils.py | 8 +++++--- tests/test_parser.py | 19 ++++++++++--------- 3 files changed, 37 insertions(+), 19 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 2b1030f..36f6e2c 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -174,9 +174,21 @@ RULES = { 'literal': ['REGEXP', 'STRING'], } -REPEAT_BREAK_THRESHOLD = 20 +REPEAT_BREAK_THRESHOLD = 50 # The Threshold whether repeat via ~ are split up into different rules -# For the moment 20 is arbitrarily chosen +# 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, +# while not being to overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. 
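The user-visible effect of this threshold: repeats below it stay within a single rule, while larger
repeats are compiled into extra helper rules. A rough check from the public API (exact rule counts
depend on the factorization):

    from lark import Lark

    small = Lark('start: "A"~10', parser='lalr')
    large = Lark('start: "A"~60', parser='lalr')
    assert len(small.rules) < len(large.rules)   # the ~60 repeat was split into helper rules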
+# For a grammar of the form start: "A"~0..N, these are the timing stats: +# N t +# 10 0.000 +# 20 0.004 +# 30 0.016 +# 40 0.049 +# 50 0.109 +# 60 0.215 +# 70 0.383 +# 80 0.631 +# (See PR #949) @inline_args @@ -244,6 +256,7 @@ class EBNF_to_BNF(Transformer_InPlace): | target target_opt | target target target_opt + | target target target | target target target atom | target target target atom atom | target target target atom atom atom @@ -251,7 +264,7 @@ class EBNF_to_BNF(Transformer_InPlace): First we generate target * i followed by target_opt for i from 0 to a-1 These match 0 to n*a - 1 times atom - Then we generate target * a followed by atom * i for i from 1 to b-1 + Then we generate target * a followed by atom * i for i from 0 to b-1 These match n*a to n*a + b-1 times atom """ key = (a, b, target, atom, "opt") @@ -264,7 +277,7 @@ class EBNF_to_BNF(Transformer_InPlace): for i in range(a) ] + [ ST('expansion', [target] * a + [atom] * i) - for i in range(1, b) + for i in range(b) ]) return self._add_rule(key, new_name, tree) @@ -281,15 +294,17 @@ class EBNF_to_BNF(Transformer_InPlace): mn_target = self._add_repeat_rule(a, b, mn_target, rule) if mx == mn: return mn_target + diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less diff_factors = small_factors(diff) - diff_target = rule - diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. 1-1 times) + diff_target = rule # Match rule 1 times + diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. up to 1 -1 times) for a, b in diff_factors[:-1]: new_diff_target = self._add_repeat_rule(a, b, diff_target, rule) diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) diff_target = new_diff_target - a, b = diff_factors[-1] + + a, b = diff_factors[-1] # We do the last on separately since we don't need to call self._add_repeat_rule diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) diff --git a/lark/utils.py b/lark/utils.py index 1648720..f447b9e 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -361,8 +361,9 @@ def _serialize(value, memo): return value -# 10 is arbitrarily chosen -SMALL_FACTOR_THRESHOLD = 10 +# Value 5 keeps the number of states in the lalr parser somewhat minimal +# It isn't optimal, but close to it. See PR #949 +SMALL_FACTOR_THRESHOLD = 5 def small_factors(n): @@ -380,7 +381,8 @@ def small_factors(n): assert n >= 0 if n < SMALL_FACTOR_THRESHOLD: return [(n, 0)] - # TODO: Think of better algorithms (Prime factors should minimize the number of steps) + # While this does not provide an optimal solution, it produces a pretty good one. 
+ # See above comment and PR #949 for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): b = n % a if a + b > SMALL_FACTOR_THRESHOLD: diff --git a/tests/test_parser.py b/tests/test_parser.py index 2247b46..b55f848 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2229,21 +2229,22 @@ def _make_parser_test(LEXER, PARSER): @unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated") def test_ranged_repeat_large(self): # Large is currently arbitrarily chosen to be large than 20 - g = u"""!start: "A"~30 + g = u"""!start: "A"~60 """ l = _Lark(g) self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") - self.assertEqual(l.parse(u'A' * 30), Tree('start', ["A"] * 30)) - self.assertRaises(ParseError, l.parse, u'A' * 29) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 31) + self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) + self.assertRaises(ParseError, l.parse, u'A' * 59) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61) - g = u"""!start: "A"~0..100 + g = u"""!start: "A"~15..100 """ l = _Lark(g) - self.assertEqual(l.parse(u''), Tree('start', [])) - self.assertEqual(l.parse(u'A'), Tree('start', ['A'])) - self.assertEqual(l.parse(u'A' * 100), Tree('start', ['A'] * 100)) - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 101) + for i in range(0, 110): + if 15 <= i <= 100: + self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) + else: + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * i) # 8191 is a Mersenne prime g = u"""start: "A"~8191 From 3436b3388546cfd8c802ab09d59d6e9e82cb1c7e Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 26 Jul 2021 21:13:29 +0200 Subject: [PATCH 24/31] Refactor small_factors --- lark/utils.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/lark/utils.py b/lark/utils.py index f447b9e..610d160 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -379,18 +379,12 @@ def small_factors(n): Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change """ assert n >= 0 - if n < SMALL_FACTOR_THRESHOLD: + if n <= SMALL_FACTOR_THRESHOLD: return [(n, 0)] # While this does not provide an optimal solution, it produces a pretty good one. # See above comment and PR #949 for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): - b = n % a - if a + b > SMALL_FACTOR_THRESHOLD: - continue - r = n // a - assert r * a + b == n # Sanity check - if r <= SMALL_FACTOR_THRESHOLD: - return [(r, 0), (a, b)] - else: + r, b = divmod(n, a) + if a + b <= SMALL_FACTOR_THRESHOLD: return small_factors(r) + [(a, b)] assert False, "Failed to factorize %s" % n From 90460f31d98da5a08ec14c0ad7062756dcc82668 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 27 Jul 2021 11:45:01 +0300 Subject: [PATCH 25/31] Refactored PR #949 and edited the comments/docstrings --- lark/load_grammar.py | 100 ++++++++++++++++++++----------------------- lark/utils.py | 21 ++++----- tests/test_parser.py | 8 ++-- 3 files changed, 60 insertions(+), 69 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 36f6e2c..d1d06cc 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -174,21 +174,15 @@ RULES = { 'literal': ['REGEXP', 'STRING'], } -REPEAT_BREAK_THRESHOLD = 50 + +# Value 5 keeps the number of states in the lalr parser somewhat minimal +# It isn't optimal, but close to it. 
See PR #949 +SMALL_FACTOR_THRESHOLD = 5 # The Threshold whether repeat via ~ are split up into different rules # 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, # while not being to overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. -# For a grammar of the form start: "A"~0..N, these are the timing stats: -# N t -# 10 0.000 -# 20 0.004 -# 30 0.016 -# 40 0.049 -# 50 0.109 -# 60 0.215 -# 70 0.383 -# 80 0.631 # (See PR #949) +REPEAT_BREAK_THRESHOLD = 50 @inline_args @@ -224,17 +218,16 @@ class EBNF_to_BNF(Transformer_InPlace): return self._add_rule(expr, new_name, tree) def _add_repeat_rule(self, a, b, target, atom): - """ - When target matches n times atom - This builds a rule that matches atom (a*n + b) times + """Generate a rule that repeats target ``a`` times, and repeats atom ``b`` times. - The rule is of the form: + When called recursively (into target), it repeats atom for x(n) times, where: + x(0) = 1 + x(n) = a(n) * x(n-1) + b - The rules are of the form: (Example a = 3, b = 4) + Example rule when a=3, b=4: - new_rule: target target target atom atom atom atom + new_rule: target target target atom atom atom atom - e.g. we use target * a and atom * b """ key = (a, b, target, atom) try: @@ -245,27 +238,29 @@ class EBNF_to_BNF(Transformer_InPlace): return self._add_rule(key, new_name, tree) def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): - """ + """Creates a rule that matches atom 0 to (a*n+b)-1 times. + When target matches n times atom, and target_opt 0 to n-1 times target_opt, - This builds a rule that matches atom 0 to (a*n+b)-1 times. - The created rule will not have any shift/reduce conflicts so that it can be used with lalr - The rules are of the form: (Example a = 3, b = 4) + First we generate target * i followed by target_opt, for i from 0 to a-1 + These match 0 to n*a - 1 times atom + + Then we generate target * a followed by atom * i, for i from 0 to b-1 + These match n*a to n*a + b-1 times atom - new_rule: target_opt - | target target_opt - | target target target_opt + The created rule will not have any shift/reduce conflicts so that it can be used with lalr - | target target target - | target target target atom - | target target target atom atom - | target target target atom atom atom + Example rule when a=3, b=4: - First we generate target * i followed by target_opt for i from 0 to a-1 - These match 0 to n*a - 1 times atom + new_rule: target_opt + | target target_opt + | target target target_opt + + | target target target + | target target target atom + | target target target atom atom + | target target target atom atom atom - Then we generate target * a followed by atom * i for i from 0 to b-1 - These match n*a to n*a + b-1 times atom """ key = (a, b, target, atom, "opt") try: @@ -273,38 +268,39 @@ class EBNF_to_BNF(Transformer_InPlace): except KeyError: new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b)) tree = ST('expansions', [ - ST('expansion', [target] * i + [target_opt]) - for i in range(a) + ST('expansion', [target]*i + [target_opt]) for i in range(a) ] + [ - ST('expansion', [target] * a + [atom] * i) - for i in range(b) + ST('expansion', [target]*a + [atom]*i) for i in range(b) ]) return self._add_rule(key, new_name, tree) def _generate_repeats(self, rule, mn, mx): + """Generates a rule tree that repeats ``rule`` exactly between ``mn`` to ``mx`` times. """ - We treat rule~mn..mx as rule~mn rule~0..(diff=mx-mn). 
- We then use small_factors to split up mn and diff up into values [(a, b), ...] - This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt - to generate a complete rule/expression that matches the corresponding number of repeats - """ - mn_factors = small_factors(mn) + # For a small number of repeats, we can take the naive approach + if mx < REPEAT_BREAK_THRESHOLD: + return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) + + # For large repeat values, we break the repetition into sub-rules. + # We treat ``rule~mn..mx`` as ``rule~mn rule~0..(diff=mx-mn)``. + # We then use small_factors to split up mn and diff up into values [(a, b), ...] + # This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt + # to generate a complete rule/expression that matches the corresponding number of repeats mn_target = rule - for a, b in mn_factors: + for a, b in small_factors(mn, SMALL_FACTOR_THRESHOLD): mn_target = self._add_repeat_rule(a, b, mn_target, rule) if mx == mn: return mn_target diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less - diff_factors = small_factors(diff) + diff_factors = small_factors(diff, SMALL_FACTOR_THRESHOLD) diff_target = rule # Match rule 1 times diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. up to 1 -1 times) for a, b in diff_factors[:-1]: - new_diff_target = self._add_repeat_rule(a, b, diff_target, rule) diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) - diff_target = new_diff_target + diff_target = self._add_repeat_rule(a, b, diff_target, rule) - a, b = diff_factors[-1] # We do the last on separately since we don't need to call self._add_repeat_rule + a, b = diff_factors[-1] diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) @@ -333,11 +329,9 @@ class EBNF_to_BNF(Transformer_InPlace): mn, mx = map(int, args) if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) - # For small number of repeats, we don't need to build new rules. - if mx > REPEAT_BREAK_THRESHOLD: - return self._generate_repeats(rule, mn, mx) - else: - return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) + + return self._generate_repeats(rule, mn, mx) + assert False, op def maybe(self, rule): diff --git a/lark/utils.py b/lark/utils.py index 610d160..2938591 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -361,14 +361,11 @@ def _serialize(value, memo): return value -# Value 5 keeps the number of states in the lalr parser somewhat minimal -# It isn't optimal, but close to it. See PR #949 -SMALL_FACTOR_THRESHOLD = 5 -def small_factors(n): +def small_factors(n, max_factor): """ - Splits n up into smaller factors and summands <= SMALL_FACTOR_THRESHOLD. + Splits n up into smaller factors and summands <= max_factor. Returns a list of [(a, b), ...] so that the following code returns n: @@ -376,15 +373,15 @@ def small_factors(n): for a, b in values: n = n * a + b - Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change + Currently, we also keep a + b <= max_factor, but that might change """ assert n >= 0 - if n <= SMALL_FACTOR_THRESHOLD: + assert max_factor > 2 + if n <= max_factor: return [(n, 0)] - # While this does not provide an optimal solution, it produces a pretty good one. 
- # See above comment and PR #949 - for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): + + for a in range(max_factor, 1, -1): r, b = divmod(n, a) - if a + b <= SMALL_FACTOR_THRESHOLD: - return small_factors(r) + [(a, b)] + if a + b <= max_factor: + return small_factors(r, max_factor) + [(a, b)] assert False, "Failed to factorize %s" % n diff --git a/tests/test_parser.py b/tests/test_parser.py index b55f848..ffb1d8f 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2226,7 +2226,7 @@ def _make_parser_test(LEXER, PARSER): self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') - @unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated") + @unittest.skipIf(PARSER != 'lalr', "We only need to test rule generation, we know BNF is solid on all parsers") def test_ranged_repeat_large(self): # Large is currently arbitrarily chosen to be large than 20 g = u"""!start: "A"~60 @@ -2244,15 +2244,15 @@ def _make_parser_test(LEXER, PARSER): if 15 <= i <= 100: self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) else: - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * i) + self.assertRaises(UnexpectedInput, l.parse, u'A' * i) # 8191 is a Mersenne prime g = u"""start: "A"~8191 """ l = _Lark(g) self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190) - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX From cf61f78509e52b44c4fbbaf40f42a688f754342b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 27 Jul 2021 15:08:29 +0300 Subject: [PATCH 26/31] Tests: Moved repeat operator tests to test_grammar --- tests/test_grammar.py | 49 ++++++++++++++++++++++++++++++++++++++++++- tests/test_parser.py | 49 ------------------------------------------- 2 files changed, 48 insertions(+), 50 deletions(-) diff --git a/tests/test_grammar.py b/tests/test_grammar.py index a643117..3ae65f2 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -3,7 +3,7 @@ from __future__ import absolute_import import sys from unittest import TestCase, main -from lark import Lark, Token, Tree +from lark import Lark, Token, Tree, ParseError, UnexpectedInput from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors from lark.load_grammar import FromPackageLoader @@ -198,6 +198,53 @@ class TestGrammar(TestCase): x = find_grammar_errors(text) assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6] + def test_ranged_repeat_terms(self): + g = u"""!start: AAA + AAA: "A"~3 + """ + l = Lark(g, parser='lalr') + self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA') + + g = u"""!start: AABB CC + AABB: "A"~0..2 "B"~2 + CC: "C"~1..2 + """ + l = Lark(g, parser='lalr') + self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC'])) + self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C'])) + self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC'])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB') + 
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') + + def test_ranged_repeat_large(self): + g = u"""!start: "A"~60 + """ + l = Lark(g, parser='lalr') + self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") + self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) + self.assertRaises(ParseError, l.parse, u'A' * 59) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61) + + g = u"""!start: "A"~15..100 + """ + l = Lark(g, parser='lalr') + for i in range(0, 110): + if 15 <= i <= 100: + self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) + else: + self.assertRaises(UnexpectedInput, l.parse, u'A' * i) + + # 8191 is a Mersenne prime + g = u"""start: "A"~8191 + """ + l = Lark(g, parser='lalr') + self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) if __name__ == '__main__': diff --git a/tests/test_parser.py b/tests/test_parser.py index ffb1d8f..9eb7b26 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2204,55 +2204,6 @@ def _make_parser_test(LEXER, PARSER): self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') - def test_ranged_repeat_terms(self): - g = u"""!start: AAA - AAA: "A"~3 - """ - l = _Lark(g) - self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"])) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA') - - g = u"""!start: AABB CC - AABB: "A"~0..2 "B"~2 - CC: "C"~1..2 - """ - l = _Lark(g) - self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC'])) - self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C'])) - self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC'])) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') - - @unittest.skipIf(PARSER != 'lalr', "We only need to test rule generation, we know BNF is solid on all parsers") - def test_ranged_repeat_large(self): - # Large is currently arbitrarily chosen to be large than 20 - g = u"""!start: "A"~60 - """ - l = _Lark(g) - self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") - self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) - self.assertRaises(ParseError, l.parse, u'A' * 59) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61) - - g = u"""!start: "A"~15..100 - """ - l = _Lark(g) - for i in range(0, 110): - if 15 <= i <= 100: - self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) - else: - self.assertRaises(UnexpectedInput, l.parse, u'A' * i) - - # 8191 is a Mersenne prime - g = u"""start: "A"~8191 - """ - l = _Lark(g) - self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) - self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) - self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX From e8d5e7e30db5e728cd4521308036aa55730f9957 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 7 Aug 2021 11:11:10 +0300 Subject: [PATCH 27/31] Docs: Updated IDE link --- README.md | 2 +- docs/index.rst | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/README.md b/README.md index 8ec22ed..f4335d0 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h - [Documentation @readthedocs](https://lark-parser.readthedocs.io/) - [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf) -- [Online IDE (very basic)](https://lark-parser.github.io/lark/ide/app.html) +- [Online IDE (very basic)](https://lark-parser.github.io/lark/ide) - [Tutorial](/docs/json_tutorial.md) for writing a JSON parser. - Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - [Gitter chat](https://gitter.im/lark-parser/Lobby) diff --git a/docs/index.rst b/docs/index.rst index 39ecd5a..c4e8be6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -113,7 +113,7 @@ Resources .. _Examples: https://github.com/lark-parser/lark/tree/master/examples .. _Third-party examples: https://github.com/ligurio/lark-grammars -.. _Online IDE: https://lark-parser.github.io/lark/ide/app.html +.. _Online IDE: https://lark-parser.github.io/lark/ide .. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/ .. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html .. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf From 41b2ba0d3a37757c30e3010763a516e822eaba87 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 7 Aug 2021 11:13:31 +0300 Subject: [PATCH 28/31] Docs: Updated IDE links again --- README.md | 2 +- docs/index.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f4335d0..82f6148 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h - [Documentation @readthedocs](https://lark-parser.readthedocs.io/) - [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf) -- [Online IDE (very basic)](https://lark-parser.github.io/lark/ide) +- [Online IDE](https://lark-parser.github.io/ide) - [Tutorial](/docs/json_tutorial.md) for writing a JSON parser. - Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - [Gitter chat](https://gitter.im/lark-parser/Lobby) diff --git a/docs/index.rst b/docs/index.rst index c4e8be6..e8bd6b2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -113,7 +113,7 @@ Resources .. _Examples: https://github.com/lark-parser/lark/tree/master/examples .. _Third-party examples: https://github.com/ligurio/lark-grammars -.. _Online IDE: https://lark-parser.github.io/lark/ide +.. _Online IDE: https://lark-parser.github.io/ide .. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/ .. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html .. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf From f3826ed3d16d98d1a6619ca8bc8a0d0b9493d596 Mon Sep 17 00:00:00 2001 From: Louis Sautier Date: Tue, 10 Aug 2021 00:02:08 +0200 Subject: [PATCH 29/31] Remove ineffective description-file key from setup.cfg Otherwise, setuptools warns that: "UserWarning: Usage of dash-separated 'description-file' will not be supported in future versions. Please use the underscore name 'description_file' instead" This key doesn't seem to do anything unless you use pbr. 
--- setup.cfg | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 6ddead9..6d71f28 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,6 +5,4 @@ zip_safe= universal = 1 [metadata] -description-file = README.md license_file = LICENSE - From 3269605211f92942296257e34722a979801c204c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20K=C3=A4ufl?= Date: Tue, 17 Aug 2021 10:46:37 +0200 Subject: [PATCH 30/31] Remove config for Travis CI --- .travis.yml | 15 --------------- README.md | 2 +- tox.ini | 11 ----------- 3 files changed, 1 insertion(+), 27 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 6448cc8..0000000 --- a/.travis.yml +++ /dev/null @@ -1,15 +0,0 @@ -dist: xenial -language: python -python: - - "2.7" - - "3.4" - - "3.5" - - "3.6" - - "3.7" - - "3.8" - - "3.9-dev" - - "pypy2.7-6.0" - - "pypy3.5-6.0" -install: pip install tox-travis -script: - - tox diff --git a/README.md b/README.md index 82f6148..70be4fe 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h Lark has no dependencies. -[![Build Status](https://travis-ci.org/lark-parser/lark.svg?branch=master)](https://travis-ci.org/lark-parser/lark) +[![Tests](https://github.com/lark-parser/lark/actions/workflows/tests.yml/badge.svg)](https://github.com/lark-parser/lark/actions/workflows/tests.yml) ### Syntax Highlighting diff --git a/tox.ini b/tox.ini index ef19e2c..cef423b 100644 --- a/tox.ini +++ b/tox.ini @@ -2,17 +2,6 @@ envlist = py27, py34, py35, py36, py37, py38, py39, pypy, pypy3 skip_missing_interpreters=true -[travis] -2.7 = py27 -3.4 = py34 -3.5 = py35 -3.6 = py36 -3.7 = py37 -3.8 = py38 -3.9 = py39 -pypy = pypy -pypy3 = pypy3 - [testenv] whitelist_externals = git deps = From 8f73a58a5446a2ffb078905af8acd11c358d3425 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20K=C3=A4ufl?= Date: Tue, 17 Aug 2021 10:49:20 +0200 Subject: [PATCH 31/31] Run tests against Python 3.10 --- .github/workflows/tests.yml | 2 +- tox.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1630c8b..c7b9286 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -6,7 +6,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9.0-rc - 3.9, pypy2, pypy3] + python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9, 3.10.0-rc - 3.10, pypy2, pypy3] steps: - uses: actions/checkout@v2 diff --git a/tox.ini b/tox.ini index ef19e2c..842ed2b 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py27, py34, py35, py36, py37, py38, py39, pypy, pypy3 +envlist = py27, py34, py35, py36, py37, py38, py39, py310, pypy, pypy3 skip_missing_interpreters=true [travis]