From 90460f31d98da5a08ec14c0ad7062756dcc82668 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 27 Jul 2021 11:45:01 +0300 Subject: [PATCH] Refactored PR #949 and edited the comments/docstrings --- lark/load_grammar.py | 100 ++++++++++++++++++++----------------------- lark/utils.py | 21 ++++----- tests/test_parser.py | 8 ++-- 3 files changed, 60 insertions(+), 69 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 36f6e2c..d1d06cc 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -174,21 +174,15 @@ RULES = { 'literal': ['REGEXP', 'STRING'], } -REPEAT_BREAK_THRESHOLD = 50 + +# Value 5 keeps the number of states in the lalr parser somewhat minimal +# It isn't optimal, but close to it. See PR #949 +SMALL_FACTOR_THRESHOLD = 5 # The Threshold whether repeat via ~ are split up into different rules # 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, # while not being to overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. -# For a grammar of the form start: "A"~0..N, these are the timing stats: -# N t -# 10 0.000 -# 20 0.004 -# 30 0.016 -# 40 0.049 -# 50 0.109 -# 60 0.215 -# 70 0.383 -# 80 0.631 # (See PR #949) +REPEAT_BREAK_THRESHOLD = 50 @inline_args @@ -224,17 +218,16 @@ class EBNF_to_BNF(Transformer_InPlace): return self._add_rule(expr, new_name, tree) def _add_repeat_rule(self, a, b, target, atom): - """ - When target matches n times atom - This builds a rule that matches atom (a*n + b) times + """Generate a rule that repeats target ``a`` times, and repeats atom ``b`` times. - The rule is of the form: + When called recursively (into target), it repeats atom for x(n) times, where: + x(0) = 1 + x(n) = a(n) * x(n-1) + b - The rules are of the form: (Example a = 3, b = 4) + Example rule when a=3, b=4: - new_rule: target target target atom atom atom atom + new_rule: target target target atom atom atom atom - e.g. we use target * a and atom * b """ key = (a, b, target, atom) try: @@ -245,27 +238,29 @@ class EBNF_to_BNF(Transformer_InPlace): return self._add_rule(key, new_name, tree) def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): - """ + """Creates a rule that matches atom 0 to (a*n+b)-1 times. + When target matches n times atom, and target_opt 0 to n-1 times target_opt, - This builds a rule that matches atom 0 to (a*n+b)-1 times. - The created rule will not have any shift/reduce conflicts so that it can be used with lalr - The rules are of the form: (Example a = 3, b = 4) + First we generate target * i followed by target_opt, for i from 0 to a-1 + These match 0 to n*a - 1 times atom + + Then we generate target * a followed by atom * i, for i from 0 to b-1 + These match n*a to n*a + b-1 times atom - new_rule: target_opt - | target target_opt - | target target target_opt + The created rule will not have any shift/reduce conflicts so that it can be used with lalr - | target target target - | target target target atom - | target target target atom atom - | target target target atom atom atom + Example rule when a=3, b=4: - First we generate target * i followed by target_opt for i from 0 to a-1 - These match 0 to n*a - 1 times atom + new_rule: target_opt + | target target_opt + | target target target_opt + + | target target target + | target target target atom + | target target target atom atom + | target target target atom atom atom - Then we generate target * a followed by atom * i for i from 0 to b-1 - These match n*a to n*a + b-1 times atom """ key = (a, b, target, atom, "opt") try: @@ -273,38 +268,39 @@ class EBNF_to_BNF(Transformer_InPlace): except KeyError: new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b)) tree = ST('expansions', [ - ST('expansion', [target] * i + [target_opt]) - for i in range(a) + ST('expansion', [target]*i + [target_opt]) for i in range(a) ] + [ - ST('expansion', [target] * a + [atom] * i) - for i in range(b) + ST('expansion', [target]*a + [atom]*i) for i in range(b) ]) return self._add_rule(key, new_name, tree) def _generate_repeats(self, rule, mn, mx): + """Generates a rule tree that repeats ``rule`` exactly between ``mn`` to ``mx`` times. """ - We treat rule~mn..mx as rule~mn rule~0..(diff=mx-mn). - We then use small_factors to split up mn and diff up into values [(a, b), ...] - This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt - to generate a complete rule/expression that matches the corresponding number of repeats - """ - mn_factors = small_factors(mn) + # For a small number of repeats, we can take the naive approach + if mx < REPEAT_BREAK_THRESHOLD: + return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) + + # For large repeat values, we break the repetition into sub-rules. + # We treat ``rule~mn..mx`` as ``rule~mn rule~0..(diff=mx-mn)``. + # We then use small_factors to split up mn and diff up into values [(a, b), ...] + # This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt + # to generate a complete rule/expression that matches the corresponding number of repeats mn_target = rule - for a, b in mn_factors: + for a, b in small_factors(mn, SMALL_FACTOR_THRESHOLD): mn_target = self._add_repeat_rule(a, b, mn_target, rule) if mx == mn: return mn_target diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less - diff_factors = small_factors(diff) + diff_factors = small_factors(diff, SMALL_FACTOR_THRESHOLD) diff_target = rule # Match rule 1 times diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. up to 1 -1 times) for a, b in diff_factors[:-1]: - new_diff_target = self._add_repeat_rule(a, b, diff_target, rule) diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) - diff_target = new_diff_target + diff_target = self._add_repeat_rule(a, b, diff_target, rule) - a, b = diff_factors[-1] # We do the last on separately since we don't need to call self._add_repeat_rule + a, b = diff_factors[-1] diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) @@ -333,11 +329,9 @@ class EBNF_to_BNF(Transformer_InPlace): mn, mx = map(int, args) if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) - # For small number of repeats, we don't need to build new rules. - if mx > REPEAT_BREAK_THRESHOLD: - return self._generate_repeats(rule, mn, mx) - else: - return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) + + return self._generate_repeats(rule, mn, mx) + assert False, op def maybe(self, rule): diff --git a/lark/utils.py b/lark/utils.py index 610d160..2938591 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -361,14 +361,11 @@ def _serialize(value, memo): return value -# Value 5 keeps the number of states in the lalr parser somewhat minimal -# It isn't optimal, but close to it. See PR #949 -SMALL_FACTOR_THRESHOLD = 5 -def small_factors(n): +def small_factors(n, max_factor): """ - Splits n up into smaller factors and summands <= SMALL_FACTOR_THRESHOLD. + Splits n up into smaller factors and summands <= max_factor. Returns a list of [(a, b), ...] so that the following code returns n: @@ -376,15 +373,15 @@ def small_factors(n): for a, b in values: n = n * a + b - Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change + Currently, we also keep a + b <= max_factor, but that might change """ assert n >= 0 - if n <= SMALL_FACTOR_THRESHOLD: + assert max_factor > 2 + if n <= max_factor: return [(n, 0)] - # While this does not provide an optimal solution, it produces a pretty good one. - # See above comment and PR #949 - for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): + + for a in range(max_factor, 1, -1): r, b = divmod(n, a) - if a + b <= SMALL_FACTOR_THRESHOLD: - return small_factors(r) + [(a, b)] + if a + b <= max_factor: + return small_factors(r, max_factor) + [(a, b)] assert False, "Failed to factorize %s" % n diff --git a/tests/test_parser.py b/tests/test_parser.py index b55f848..ffb1d8f 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2226,7 +2226,7 @@ def _make_parser_test(LEXER, PARSER): self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') - @unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated") + @unittest.skipIf(PARSER != 'lalr', "We only need to test rule generation, we know BNF is solid on all parsers") def test_ranged_repeat_large(self): # Large is currently arbitrarily chosen to be large than 20 g = u"""!start: "A"~60 @@ -2244,15 +2244,15 @@ def _make_parser_test(LEXER, PARSER): if 15 <= i <= 100: self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) else: - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * i) + self.assertRaises(UnexpectedInput, l.parse, u'A' * i) # 8191 is a Mersenne prime g = u"""start: "A"~8191 """ l = _Lark(g) self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190) - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX