From b4fe22a27dd67bca414be767b92ab2960798f0d6 Mon Sep 17 00:00:00 2001
From: MegaIng <cornelius@krupp.hamburg>
Date: Mon, 26 Jul 2021 10:50:37 +0200
Subject: [PATCH] Python2.7 + comments + Magic constants

---
 lark/load_grammar.py | 48 ++++++++++++++++++++++++++++++++++++--------
 lark/utils.py        | 23 ++++++++++++---------
 tests/test_parser.py | 11 +++++-----
 3 files changed, 58 insertions(+), 24 deletions(-)

diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 2f51ff6..2b1030f 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -174,6 +174,10 @@ RULES = {
     'literal': ['REGEXP', 'STRING'],
 }
 
+REPEAT_BREAK_THRESHOLD = 20
+# Repeats via ~ whose upper bound exceeds this threshold are split up into separate rules.
+# For the moment, 20 is arbitrarily chosen.
+
 
 @inline_args
 class EBNF_to_BNF(Transformer_InPlace):
@@ -211,25 +215,50 @@ class EBNF_to_BNF(Transformer_InPlace):
         """
         When target matches n times atom
         This builds a rule that matches atom (a*n + b) times
+
+        A single rule is generated, of the form shown below.
+
+        Example for a = 3, b = 4:
+
+        new_rule: target target target atom atom atom atom
+
+        i.e. we use target * a followed by atom * b
         """
         key = (a, b, target, atom)
         try:
             return self.rules_cache[key]
         except KeyError:
-            new_name = self._name_rule('a%d_b%d' % (a, b))
+            new_name = self._name_rule('repeat_a%d_b%d' % (a, b))
             tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)])
             return self._add_rule(key, new_name, tree)
 
     def _add_repeat_opt_rule(self, a, b, target, target_opt, atom):
         """
         When target matches n times atom, and target_opt 0 to n-1 times target_opt,
-        This builds a rule that matches atom 0 to (a*n+b)-1 times
+        This builds a rule that matches atom 0 to (a*n+b)-1 times.
+        The created rule does not have any shift/reduce conflicts, so it can be used with LALR.
+
+        The rules are of the form: (Example a = 3, b = 4)
+
+        new_rule: target_opt
+                | target target_opt
+                | target target target_opt
+
+                | target target target atom
+                | target target target atom atom
+                | target target target atom atom atom
+
+        First we generate target * i followed by target_opt for i from 0 to a-1
+        These match 0 to n*a - 1 times atom
+
+        Then we generate target * a followed by atom * i for i from 1 to b-1
+        These match n*a to n*a + b-1 times atom
         """
         key = (a, b, target, atom, "opt")
         try:
             return self.rules_cache[key]
         except KeyError:
-            new_name = self._name_rule('a%d_b%d_opt' % (a, b))
+            new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b))
             tree = ST('expansions', [
                 ST('expansion', [target] * i + [target_opt])
                 for i in range(a)
@@ -240,13 +269,19 @@ class EBNF_to_BNF(Transformer_InPlace):
             return self._add_rule(key, new_name, tree)
 
     def _generate_repeats(self, rule, mn, mx):
+        """
+        We treat rule~mn..mx as rule~mn rule~0..(diff=mx-mn).
+        We then use small_factors to split mn and diff up into values [(a, b), ...]
+        These values are used with the help of _add_repeat_rule and _add_repeat_opt_rule
+        to generate a complete rule/expression that matches the corresponding number of repeats
+        """
         mn_factors = small_factors(mn)
         mn_target = rule
         for a, b in mn_factors:
             mn_target = self._add_repeat_rule(a, b, mn_target, rule)
         if mx == mn:
             return mn_target
-        diff = mx - mn + 1  # We add one because _add_repeat_opt_rule needs it.
+        diff = mx - mn + 1  # We add one because _add_repeat_opt_rule generates rules that match one less
         diff_factors = small_factors(diff)
         diff_target = rule
         diff_opt_target = ST('expansion', [])  # match rule 0 times (e.g. 1-1 times)
@@ -257,8 +292,6 @@ class EBNF_to_BNF(Transformer_InPlace):
         a, b = diff_factors[-1]
         diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule)
 
-        # return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)])
-        # return ST('expansions', [ST('expansion', [mn_target] + [rule] * n) for n in range(0, mx - mn + 1)])
         return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])])
 
     def expr(self, rule, op, *args):
@@ -286,8 +319,7 @@ class EBNF_to_BNF(Transformer_InPlace):
                 if mx < mn or mn < 0:
                     raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
             # For small number of repeats, we don't need to build new rules.
-            # Value 20 is arbitrarily chosen
-            if mx > 20:
+            if mx > REPEAT_BREAK_THRESHOLD:
                 return self._generate_repeats(rule, mn, mx)
             else:
                 return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)])
diff --git a/lark/utils.py b/lark/utils.py
index 2fa5f43..1648720 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -187,7 +187,7 @@ def get_regexp_width(expr):
                 return 1, sre_constants.MAXREPEAT
             else:
                 return 0, sre_constants.MAXREPEAT
-            
+
 ###}
 
 
@@ -288,7 +288,7 @@ except ImportError:
 
 class FS:
     exists = os.path.exists
-    
+
     @staticmethod
     def open(name, mode="r", **kwargs):
         if atomicwrites and "w" in mode:
@@ -361,9 +361,13 @@ def _serialize(value, memo):
     return value
 
 
+# 10 is arbitrarily chosen
+SMALL_FACTOR_THRESHOLD = 10
+
+
 def small_factors(n):
     """
-    Splits n up into smaller factors and summands <= 10.
+    Splits n up into smaller factors and summands <= SMALL_FACTOR_THRESHOLD.
     Returns a list of [(a, b), ...]
     so that the following code returns n:
 
@@ -371,21 +375,20 @@ def small_factors(n):
     for a, b in values:
         n = n * a + b
 
-    Currently, we also keep a + b <= 10, but that might change
+    Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change
     """
     assert n >= 0
-    if n < 10:
+    if n < SMALL_FACTOR_THRESHOLD:
         return [(n, 0)]
     # TODO: Think of better algorithms (Prime factors should minimize the number of steps)
-    for a in range(10, 1, -1):
+    for a in range(SMALL_FACTOR_THRESHOLD, 1, -1):
         b = n % a
-        if a + b > 10:
+        if a + b > SMALL_FACTOR_THRESHOLD:
             continue
         r = n // a
         assert r * a + b == n  # Sanity check
-        if r <= 10:
+        if r <= SMALL_FACTOR_THRESHOLD:
             return [(r, 0), (a, b)]
         else:
-            return [*small_factors(r), (a, b)]
-    # This should be unreachable, since 2 + 1 <= 10
+            return small_factors(r) + [(a, b)]
     assert False, "Failed to factorize %s" % n
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 6c00fbb..2247b46 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -2233,24 +2233,23 @@ def _make_parser_test(LEXER, PARSER):
                 """
             l = _Lark(g)
             self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated")
-            self.assertEqual(l.parse(u'A'*30), Tree('start', ["A"]*30))
-            self.assertRaises(ParseError, l.parse, u'A'*29)
-            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A'*31)
-
+            self.assertEqual(l.parse(u'A' * 30), Tree('start', ["A"] * 30))
+            self.assertRaises(ParseError, l.parse, u'A' * 29)
+            self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 31)
 
             g = u"""!start: "A"~0..100
                 """
             l = _Lark(g)
             self.assertEqual(l.parse(u''), Tree('start', []))
             self.assertEqual(l.parse(u'A'), Tree('start', ['A']))
-            self.assertEqual(l.parse(u'A'*100), Tree('start', ['A']*100))
+            self.assertEqual(l.parse(u'A' * 100), Tree('start', ['A'] * 100))
             self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 101)
 
             # 8191 is a Mersenne prime
             g = u"""start: "A"~8191
                 """
             l = _Lark(g)
-            self.assertEqual(l.parse(u'A'*8191), Tree('start', []))
+            self.assertEqual(l.parse(u'A' * 8191), Tree('start', []))
             self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190)
             self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192)