From fa8565366be027df0f788cb1432d90b2b94aa264 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 26 Jul 2021 18:53:43 +0200 Subject: [PATCH] Off-by-one fix + Change of thresholds + fix tests --- lark/load_grammar.py | 29 ++++++++++++++++++++++------- lark/utils.py | 8 +++++--- tests/test_parser.py | 19 ++++++++++--------- 3 files changed, 37 insertions(+), 19 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 2b1030f..36f6e2c 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -174,9 +174,21 @@ RULES = { 'literal': ['REGEXP', 'STRING'], } -REPEAT_BREAK_THRESHOLD = 20 +REPEAT_BREAK_THRESHOLD = 50 # The Threshold whether repeat via ~ are split up into different rules -# For the moment 20 is arbitrarily chosen +# 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, +# while not being too overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. +# For a grammar of the form start: "A"~0..N, these are the timing stats: +# N t +# 10 0.000 +# 20 0.004 +# 30 0.016 +# 40 0.049 +# 50 0.109 +# 60 0.215 +# 70 0.383 +# 80 0.631 +# (See PR #949) @inline_args @@ -244,6 +256,7 @@ class EBNF_to_BNF(Transformer_InPlace): | target target_opt | target target target_opt + | target target target | target target target atom | target target target atom atom | target target target atom atom atom @@ -251,7 +264,7 @@ class EBNF_to_BNF(Transformer_InPlace): First we generate target * i followed by target_opt for i from 0 to a-1 These match 0 to n*a - 1 times atom - Then we generate target * a followed by atom * i for i from 1 to b-1 + Then we generate target * a followed by atom * i for i from 0 to b-1 These match n*a to n*a + b-1 times atom """ key = (a, b, target, atom, "opt") @@ -264,7 +277,7 @@ class EBNF_to_BNF(Transformer_InPlace): for i in range(a) ] + [ ST('expansion', [target] * a + [atom] * i) - for i in range(1, b) + for i in range(b) ]) return self._add_rule(key, new_name, 
tree) @@ -281,15 +294,17 @@ class EBNF_to_BNF(Transformer_InPlace): mn_target = self._add_repeat_rule(a, b, mn_target, rule) if mx == mn: return mn_target + diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less diff_factors = small_factors(diff) - diff_target = rule - diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. 1-1 times) + diff_target = rule # Match rule 1 times + diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. up to 1 - 1 times) for a, b in diff_factors[:-1]: new_diff_target = self._add_repeat_rule(a, b, diff_target, rule) diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) diff_target = new_diff_target - a, b = diff_factors[-1] + + a, b = diff_factors[-1] # We do the last one separately since we don't need to call self._add_repeat_rule diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) diff --git a/lark/utils.py b/lark/utils.py index 1648720..f447b9e 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -361,8 +361,9 @@ def _serialize(value, memo): return value -# 10 is arbitrarily chosen -SMALL_FACTOR_THRESHOLD = 10 +# Value 5 keeps the number of states in the lalr parser somewhat minimal +# It isn't optimal, but close to it. See PR #949 +SMALL_FACTOR_THRESHOLD = 5 def small_factors(n): @@ -380,7 +381,8 @@ def small_factors(n): assert n >= 0 if n < SMALL_FACTOR_THRESHOLD: return [(n, 0)] - # TODO: Think of better algorithms (Prime factors should minimize the number of steps) + # While this does not provide an optimal solution, it produces a pretty good one. 
+ # See above comment and PR #949 for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): b = n % a if a + b > SMALL_FACTOR_THRESHOLD: diff --git a/tests/test_parser.py b/tests/test_parser.py index 2247b46..b55f848 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2229,21 +2229,22 @@ def _make_parser_test(LEXER, PARSER): @unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated") def test_ranged_repeat_large(self): # Large is currently arbitrarily chosen to be large than 20 - g = u"""!start: "A"~30 + g = u"""!start: "A"~60 """ l = _Lark(g) self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") - self.assertEqual(l.parse(u'A' * 30), Tree('start', ["A"] * 30)) - self.assertRaises(ParseError, l.parse, u'A' * 29) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 31) + self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) + self.assertRaises(ParseError, l.parse, u'A' * 59) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61) - g = u"""!start: "A"~0..100 + g = u"""!start: "A"~15..100 """ l = _Lark(g) - self.assertEqual(l.parse(u''), Tree('start', [])) - self.assertEqual(l.parse(u'A'), Tree('start', ['A'])) - self.assertEqual(l.parse(u'A' * 100), Tree('start', ['A'] * 100)) - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 101) + for i in range(0, 110): + if 15 <= i <= 100: + self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) + else: + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * i) # 8191 is a Mersenne prime g = u"""start: "A"~8191