From b4fe22a27dd67bca414be767b92ab2960798f0d6 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 26 Jul 2021 10:50:37 +0200 Subject: [PATCH] Python2.7 + comments + Magic constants --- lark/load_grammar.py | 48 ++++++++++++++++++++++++++++++++++++-------- lark/utils.py | 23 ++++++++++++--------- tests/test_parser.py | 11 +++++----- 3 files changed, 58 insertions(+), 24 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 2f51ff6..2b1030f 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -174,6 +174,10 @@ RULES = { 'literal': ['REGEXP', 'STRING'], } +REPEAT_BREAK_THRESHOLD = 20 +# The Threshold whether repeat via ~ are split up into different rules +# For the moment 20 is arbitrarily chosen + @inline_args class EBNF_to_BNF(Transformer_InPlace): @@ -211,25 +215,50 @@ class EBNF_to_BNF(Transformer_InPlace): """ When target matches n times atom This builds a rule that matches atom (a*n + b) times + + The rule is of the form: + + The rules are of the form: (Example a = 3, b = 4) + + new_rule: target target target atom atom atom atom + + e.g. we use target * a and atom * b """ key = (a, b, target, atom) try: return self.rules_cache[key] except KeyError: - new_name = self._name_rule('a%d_b%d' % (a, b)) + new_name = self._name_rule('repeat_a%d_b%d' % (a, b)) tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) return self._add_rule(key, new_name, tree) def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): """ When target matches n times atom, and target_opt 0 to n-1 times target_opt, - This builds a rule that matches atom 0 to (a*n+b)-1 times + This builds a rule that matches atom 0 to (a*n+b)-1 times. 
+ The created rule will not have any shift/reduce conflicts so that it can be used with lalr + + The rules are of the form: (Example a = 3, b = 4) + + new_rule: target_opt + | target target_opt + | target target target_opt + + | target target target atom + | target target target atom atom + | target target target atom atom atom + + First we generate target * i followed by target_opt for i from 0 to a-1 + These match 0 to n*a - 1 times atom + + Then we generate target * a followed by atom * i for i from 1 to b-1 + These match n*a to n*a + b-1 times atom """ key = (a, b, target, atom, "opt") try: return self.rules_cache[key] except KeyError: - new_name = self._name_rule('a%d_b%d_opt' % (a, b)) + new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b)) tree = ST('expansions', [ ST('expansion', [target] * i + [target_opt]) for i in range(a) @@ -240,13 +269,19 @@ class EBNF_to_BNF(Transformer_InPlace): return self._add_rule(key, new_name, tree) def _generate_repeats(self, rule, mn, mx): + """ + We treat rule~mn..mx as rule~mn rule~0..(diff=mx-mn). + We then use small_factors to split mn and diff up into values [(a, b), ...]. + These values are used with the help of _add_repeat_rule and _add_repeat_opt_rule + to generate a complete rule/expression that matches the corresponding number of repeats + """ mn_factors = small_factors(mn) mn_target = rule for a, b in mn_factors: mn_target = self._add_repeat_rule(a, b, mn_target, rule) if mx == mn: return mn_target - diff = mx - mn + 1 # We add one because _add_repeat_opt_rule needs it. + diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less diff_factors = small_factors(diff) diff_target = rule diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. 
1-1 times) @@ -257,8 +292,6 @@ class EBNF_to_BNF(Transformer_InPlace): a, b = diff_factors[-1] diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) - # return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) - # return ST('expansions', [ST('expansion', [mn_target] + [rule] * n) for n in range(0, mx - mn + 1)]) return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) def expr(self, rule, op, *args): @@ -286,8 +319,7 @@ class EBNF_to_BNF(Transformer_InPlace): if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) # For small number of repeats, we don't need to build new rules. - # Value 20 is arbitrarily chosen - if mx > 20: + if mx > REPEAT_BREAK_THRESHOLD: return self._generate_repeats(rule, mn, mx) else: return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) diff --git a/lark/utils.py b/lark/utils.py index 2fa5f43..1648720 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -187,7 +187,7 @@ def get_regexp_width(expr): return 1, sre_constants.MAXREPEAT else: return 0, sre_constants.MAXREPEAT - + ###} @@ -288,7 +288,7 @@ except ImportError: class FS: exists = os.path.exists - + @staticmethod def open(name, mode="r", **kwargs): if atomicwrites and "w" in mode: @@ -361,9 +361,13 @@ def _serialize(value, memo): return value +# 10 is arbitrarily chosen +SMALL_FACTOR_THRESHOLD = 10 + + def small_factors(n): """ - Splits n up into smaller factors and summands <= 10. + Splits n up into smaller factors and summands <= SMALL_FACTOR_THRESHOLD. Returns a list of [(a, b), ...] 
so that the following code returns n: @@ -371,21 +375,20 @@ def small_factors(n): for a, b in values: n = n * a + b - Currently, we also keep a + b <= 10, but that might change + Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change """ assert n >= 0 - if n < 10: + if n < SMALL_FACTOR_THRESHOLD: return [(n, 0)] # TODO: Think of better algorithms (Prime factors should minimize the number of steps) - for a in range(10, 1, -1): + for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): b = n % a - if a + b > 10: + if a + b > SMALL_FACTOR_THRESHOLD: continue r = n // a assert r * a + b == n # Sanity check - if r <= 10: + if r <= SMALL_FACTOR_THRESHOLD: return [(r, 0), (a, b)] else: - return [*small_factors(r), (a, b)] - # This should be unreachable, since 2 + 1 <= 10 + return small_factors(r) + [(a, b)] assert False, "Failed to factorize %s" % n diff --git a/tests/test_parser.py b/tests/test_parser.py index 6c00fbb..2247b46 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2233,24 +2233,23 @@ def _make_parser_test(LEXER, PARSER): """ l = _Lark(g) self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") - self.assertEqual(l.parse(u'A'*30), Tree('start', ["A"]*30)) - self.assertRaises(ParseError, l.parse, u'A'*29) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A'*31) - + self.assertEqual(l.parse(u'A' * 30), Tree('start', ["A"] * 30)) + self.assertRaises(ParseError, l.parse, u'A' * 29) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 31) g = u"""!start: "A"~0..100 """ l = _Lark(g) self.assertEqual(l.parse(u''), Tree('start', [])) self.assertEqual(l.parse(u'A'), Tree('start', ['A'])) - self.assertEqual(l.parse(u'A'*100), Tree('start', ['A']*100)) + self.assertEqual(l.parse(u'A' * 100), Tree('start', ['A'] * 100)) self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 101) # 8191 is a Mersenne prime g = u"""start: "A"~8191 """ l = _Lark(g) - 
self.assertEqual(l.parse(u'A'*8191), Tree('start', [])) + self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190) self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192)