From 845b6fa477827d6ee77a21eaced1c3f3a4a8d8b0 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 26 Jul 2021 01:14:46 +0200 Subject: [PATCH] Refactor + tests + additional splitting up. --- lark/load_grammar.py | 100 ++++++++++++++++++++++++++++++++----------- lark/utils.py | 2 +- tests/test_parser.py | 29 +++++++++++++ 3 files changed, 105 insertions(+), 26 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 569e67d..2f51ff6 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -179,42 +179,87 @@ RULES = { class EBNF_to_BNF(Transformer_InPlace): def __init__(self): self.new_rules = [] - self.rules_by_expr = {} + self.rules_cache = {} self.prefix = 'anon' self.i = 0 self.rule_options = None - def _add_recurse_rule(self, type_, expr): - if expr in self.rules_by_expr: - return self.rules_by_expr[expr] - - new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) + def _name_rule(self, inner): + new_name = '__%s_%s_%d' % (self.prefix, inner, self.i) self.i += 1 - t = NonTerminal(new_name) - tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])]) - self.new_rules.append((new_name, tree, self.rule_options)) - self.rules_by_expr[expr] = t + return new_name + + def _add_rule(self, key, name, expansions): + t = NonTerminal(name) + self.new_rules.append((name, expansions, self.rule_options)) + self.rules_cache[key] = t return t + def _add_recurse_rule(self, type_, expr): + try: + return self.rules_cache[expr] + except KeyError: + new_name = self._name_rule(type_) + t = NonTerminal(new_name) + tree = ST('expansions', [ + ST('expansion', [expr]), + ST('expansion', [t, expr]) + ]) + return self._add_rule(expr, new_name, tree) + def _add_repeat_rule(self, a, b, target, atom): - if (a, b, target, atom) in self.rules_by_expr: - return self.rules_by_expr[(a, b, target, atom)] - new_name = '__%s_a%d_b%d_%d' % (self.prefix, a, b, self.i) - self.i += 1 - t = NonTerminal(new_name) - tree = ST('expansions', [ST('expansion', [target] * 
a + [atom] * b)]) - self.new_rules.append((new_name, tree, self.rule_options)) - self.rules_by_expr[(a, b, target, atom)] = t - return t + """ + When target matches n times atom + This builds a rule that matches atom (a*n + b) times + """ + key = (a, b, target, atom) + try: + return self.rules_cache[key] + except KeyError: + new_name = self._name_rule('a%d_b%d' % (a, b)) + tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) + return self._add_rule(key, new_name, tree) + + def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): + """ + When target matches n times atom, and target_opt matches atom 0 to n-1 times, + This builds a rule that matches atom 0 to (a*n+b)-1 times + """ + key = (a, b, target, atom, "opt") + try: + return self.rules_cache[key] + except KeyError: + new_name = self._name_rule('a%d_b%d_opt' % (a, b)) + tree = ST('expansions', [ + ST('expansion', [target] * i + [target_opt]) + for i in range(a) + ] + [ + ST('expansion', [target] * a + [atom] * i) + for i in range(1, b) + ]) + return self._add_rule(key, new_name, tree) def _generate_repeats(self, rule, mn, mx): - factors = small_factors(mn) - target = rule - for a, b in factors: - target = self._add_repeat_rule(a, b, target, rule) + mn_factors = small_factors(mn) + mn_target = rule + for a, b in mn_factors: + mn_target = self._add_repeat_rule(a, b, mn_target, rule) + if mx == mn: + return mn_target + diff = mx - mn + 1 # We add one because _add_repeat_opt_rule needs it. + diff_factors = small_factors(diff) + diff_target = rule + diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. 
1-1 times) + for a, b in diff_factors[:-1]: + new_diff_target = self._add_repeat_rule(a, b, diff_target, rule) + diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) + diff_target = new_diff_target + a, b = diff_factors[-1] + diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) # return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) - return ST('expansions', [ST('expansion', [target] + [rule] * n) for n in range(0, mx - mn + 1)]) + # return ST('expansions', [ST('expansion', [mn_target] + [rule] * n) for n in range(0, mx - mn + 1)]) + return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) def expr(self, rule, op, *args): if op.value == '?': @@ -240,7 +285,12 @@ class EBNF_to_BNF(Transformer_InPlace): mn, mx = map(int, args) if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) - return self._generate_repeats(rule, mn, mx) + # For small number of repeats, we don't need to build new rules. 
+ # Value 20 is arbitrarily chosen + if mx > 20: + return self._generate_repeats(rule, mn, mx) + else: + return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) assert False, op def maybe(self, rule): diff --git a/lark/utils.py b/lark/utils.py index a3a077f..2fa5f43 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -373,7 +373,7 @@ def small_factors(n): Currently, we also keep a + b <= 10, but that might change """ - assert n > 0 + assert n >= 0 if n < 10: return [(n, 0)] # TODO: Think of better algorithms (Prime factors should minimize the number of steps) diff --git a/tests/test_parser.py b/tests/test_parser.py index 8fec82d..6c00fbb 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2226,6 +2226,35 @@ def _make_parser_test(LEXER, PARSER): self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') + @unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated") + def test_ranged_repeat_large(self): + # Large is currently arbitrarily chosen to be larger than 20 + g = u"""!start: "A"~30 + """ + l = _Lark(g) + self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") + self.assertEqual(l.parse(u'A'*30), Tree('start', ["A"]*30)) + self.assertRaises(ParseError, l.parse, u'A'*29) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A'*31) + + + g = u"""!start: "A"~0..100 + """ + l = _Lark(g) + self.assertEqual(l.parse(u''), Tree('start', [])) + self.assertEqual(l.parse(u'A'), Tree('start', ['A'])) + self.assertEqual(l.parse(u'A'*100), Tree('start', ['A']*100)) + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 101) + + # 8191 is a Mersenne prime + g = u"""start: "A"~8191 + """ + l = _Lark(g) + self.assertEqual(l.parse(u'A'*8191), Tree('start', [])) + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190) + self.assertRaises((UnexpectedToken, 
UnexpectedInput), l.parse, u'A' * 8192) + + @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX def test_priority_vs_embedded(self): g = """