From ec2ba8826ea5f396abab063f47ceaf914333e04c Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 28 Jun 2021 12:11:07 +0300 Subject: [PATCH 01/31] Docs fix + cleanup --- lark/lexer.py | 2 +- lark/utils.py | 11 ----------- 2 files changed, 1 insertion(+), 12 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index a2aefd2..4062c2d 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -120,7 +120,7 @@ class Token(Str): Attributes: type: Name of the token (as specified in grammar) value: Value of the token (redundant, as ``token.value == token`` will always be true) - pos_in_stream: The index of the token in the text + start_pos: The index of the token in the text line: The line of the token in the text (starting with 1) column: The column of the token in the text (starting with 1) end_line: The line where the token ends diff --git a/lark/utils.py b/lark/utils.py index 70516e6..b9d7ac3 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -241,17 +241,6 @@ except ImportError: pass -try: - compare = cmp -except NameError: - def compare(a, b): - if a == b: - return 0 - elif a > b: - return 1 - return -1 - - class Enumerator(Serialize): def __init__(self): self.enums = {} From bdcd2e0011bc0cd4fa3c35f59f28c78a1fa61a78 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Tue, 29 Jun 2021 22:32:56 +0200 Subject: [PATCH 02/31] fix tree_matcher when keep_all_tokens=True by setting sym.filter_out correctly. --- lark/load_grammar.py | 5 ++++- tests/test_reconstructor.py | 16 ++++++++++++++-- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index dcb4c81..c7b98a7 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -630,7 +630,10 @@ class Grammar: else: exp_options = options - assert all(isinstance(x, Symbol) for x in expansion), expansion + for sym in expansion: + assert isinstance(sym, Symbol) + if sym.is_term and exp_options and exp_options.keep_all_tokens: + sym.filter_out = False rule = Rule(NonTerminal(name), expansion, i, alias, exp_options) compiled_rules.append(rule) diff --git a/tests/test_reconstructor.py b/tests/test_reconstructor.py index f132312..e2f2dbe 100644 --- a/tests/test_reconstructor.py +++ b/tests/test_reconstructor.py @@ -3,6 +3,7 @@ import json import sys import unittest +from itertools import product from unittest import TestCase from lark import Lark @@ -20,8 +21,8 @@ def _remove_ws(s): class TestReconstructor(TestCase): - def assert_reconstruct(self, grammar, code): - parser = Lark(grammar, parser='lalr', maybe_placeholders=False) + def assert_reconstruct(self, grammar, code, **options): + parser = Lark(grammar, parser='lalr', maybe_placeholders=False, **options) tree = parser.parse(code) new = Reconstructor(parser).reconstruct(tree) self.assertEqual(_remove_ws(code), _remove_ws(new)) @@ -142,6 +143,17 @@ class TestReconstructor(TestCase): new_json = Reconstructor(json_parser).reconstruct(tree) self.assertEqual(json.loads(new_json), json.loads(test_json)) + def test_keep_all_tokens(self): + g = """ + start: "a"? _B? c? _d? 
+ _B: "b" + c: "c" + _d: "d" + """ + examples = list(map(''.join, product(('', 'a'), ('', 'b'), ('', 'c'), ('', 'd'), ))) + for code in examples: + self.assert_reconstruct(g, code, keep_all_tokens=True) + @unittest.skipIf(sys.version_info < (3, 0), "Python 2 does not play well with Unicode.") def test_switch_grammar_unicode_terminal(self): """ From 389e7fbf5cc4ff8973ceb36e9823e6984df0941b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 6 Jun 2021 17:02:41 +0300 Subject: [PATCH 03/31] lexer.py: Refactored mres operations into a Scanner class. --- lark/lexer.py | 92 +++++++++++++++++++++++++++++---------------------- 1 file changed, 52 insertions(+), 40 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 4062c2d..7a30d6d 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -127,7 +127,7 @@ class Token(Str): end_column: The next column after the end of the token. For example, if the token is a single character with a column value of 4, end_column will be 5. - end_pos: the index where the token ends (basically ``pos_in_stream + len(token)``) + end_pos: the index where the token ends (basically ``start_pos + len(token)``) """ __slots__ = ('type', 'start_pos', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') @@ -214,15 +214,13 @@ class LineCounter: class UnlessCallback: - def __init__(self, mres): - self.mres = mres + def __init__(self, scanner): + self.scanner = scanner def __call__(self, t): - for mre, type_from_index in self.mres: - m = mre.match(t.value) - if m: - t.type = type_from_index[m.lastindex] - break + res = self.scanner.match(t.value, 0) + if res: + _value, t.type = res return t @@ -254,34 +252,51 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) if unless: - callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) + callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) terminals = [t for t in terminals if t not in embedded_strs] return terminals, callback -def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_, use_bytes): - # Python sets an unreasonable group limit (currently 100) in its re module - # Worse, the only way to know we reached it is by catching an AssertionError! - # This function recursively tries less and less groups until it's successful. - postfix = '$' if match_whole else '' - mres = [] - while terminals: - pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) - if use_bytes: - pattern = pattern.encode('latin-1') - try: - mre = re_.compile(pattern, g_regex_flags) - except AssertionError: # Yes, this is what Python provides us.. :/ - return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_, use_bytes) - mres.append((mre, {i: n for n, i in mre.groupindex.items()})) - terminals = terminals[max_size:] - return mres +class Scanner: + def __init__(self, terminals, g_regex_flags, re_, use_bytes, match_whole=False): + self.terminals = terminals + self.g_regex_flags = g_regex_flags + self.re_ = re_ + self.use_bytes = use_bytes + self.match_whole = match_whole + + self._mres = self._build_mres(terminals, len(terminals)) + + def _build_mres(self, terminals, max_size): + # Python sets an unreasonable group limit (currently 100) in its re module + # Worse, the only way to know we reached it is by catching an AssertionError! 
+ # This function recursively tries less and less groups until it's successful. + postfix = '$' if self.match_whole else '' + mres = [] + while terminals: + pattern = u'|'.join(u'(?P<%s>%s)' % (t.name, t.pattern.to_regexp() + postfix) for t in terminals[:max_size]) + if self.use_bytes: + pattern = pattern.encode('latin-1') + try: + mre = self.re_.compile(pattern, self.g_regex_flags) + except AssertionError: # Yes, this is what Python provides us.. :/ + return self._build_mres(terminals, max_size//2) + mres.append((mre, {i: n for n, i in mre.groupindex.items()})) + terminals = terminals[max_size:] + return mres -def build_mres(terminals, g_regex_flags, re_, use_bytes, match_whole=False): - return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_, use_bytes) + def match(self, text, pos): + for mre, type_from_index in self._mres: + m = mre.match(text, pos) + if m: + return m.group(0), type_from_index[m.lastindex] + + @property + def allowed_types(self): + return {v for m, tfi in self._mres for v in tfi.values()} def _regexp_has_newline(r): @@ -341,9 +356,9 @@ class TraditionalLexer(Lexer): self.use_bytes = conf.use_bytes self.terminals_by_name = conf.terminals_by_name - self._mres = None + self._scanner = None - def _build(self): + def _build_scanner(self): terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, self.re, self.use_bytes) assert all(self.callback.values()) @@ -354,19 +369,16 @@ class TraditionalLexer(Lexer): else: self.callback[type_] = f - self._mres = build_mres(terminals, self.g_regex_flags, self.re, self.use_bytes) + self._scanner = Scanner(terminals, self.g_regex_flags, self.re, self.use_bytes) @property - def mres(self): - if self._mres is None: - self._build() - return self._mres + def scanner(self): + if self._scanner is None: + self._build_scanner() + return self._scanner def match(self, text, pos): - for mre, type_from_index in self.mres: - m = mre.match(text, pos) - if m: - return m.group(0), type_from_index[m.lastindex] + return self.scanner.match(text, pos) def lex(self, state, parser_state): with suppress(EOFError): @@ -378,7 +390,7 @@ class TraditionalLexer(Lexer): while line_ctr.char_pos < len(lex_state.text): res = self.match(lex_state.text, line_ctr.char_pos) if not res: - allowed = {v for m, tfi in self.mres for v in tfi.values()} - self.ignore_types + allowed = self.scanner.allowed_types - self.ignore_types if not allowed: allowed = {""} raise UnexpectedCharacters(lex_state.text, line_ctr.char_pos, line_ctr.line, line_ctr.column, From e5991739ee5a1d5bd6f78b84a495a2d7e17ce406 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 6 Jun 2021 18:23:46 +0300 Subject: [PATCH 04/31] lexer.py: Small refactor --- lark/lexer.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 7a30d6d..591943b 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -235,6 +235,11 @@ class CallChain: return self.callback2(t) if self.cond(t2) else t2 +def _get_match(re_, regexp, s, flags): + m = re_.match(regexp, s, flags) + if m: + return m.group(0) + def _create_unless(terminals, g_regex_flags, re_, use_bytes): tokens_by_type = classify(terminals, lambda t: type(t.pattern)) assert len(tokens_by_type) <= 2, tokens_by_type.keys() @@ -246,8 +251,7 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): if strtok.priority > retok.priority: continue s = strtok.pattern.value - m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags) - if m and m.group(0) == s: + if s == _get_match(re_, 
retok.pattern.to_regexp(), s, g_regex_flags): unless.append(strtok) if strtok.pattern.flags <= retok.pattern.flags: embedded_strs.add(strtok) From da3a993d025d7f3463c5564ba6fed2c0f1146adf Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 6 Jun 2021 18:32:59 +0300 Subject: [PATCH 05/31] lexer.py: Small simplification --- lark/lexer.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 591943b..2925c35 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -271,6 +271,8 @@ class Scanner: self.use_bytes = use_bytes self.match_whole = match_whole + self.allowed_types = {t.name for t in self.terminals} + self._mres = self._build_mres(terminals, len(terminals)) def _build_mres(self, terminals, max_size): @@ -298,10 +300,6 @@ class Scanner: if m: return m.group(0), type_from_index[m.lastindex] - @property - def allowed_types(self): - return {v for m, tfi in self._mres for v in tfi.values()} - def _regexp_has_newline(r): r"""Expressions that may indicate newlines in a regexp: From 3bc070bc1dcbaa91a04f178b985c5250bafc492c Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 15 Jun 2021 17:04:38 +0300 Subject: [PATCH 06/31] Change how propagate_positions work --- lark-stubs/lark.pyi | 4 ++-- lark/lark.py | 4 ++-- lark/lexer.py | 28 +++++++++++++------------- lark/parse_tree_builder.py | 40 +++++++++++++++----------------------- lark/parser_frontends.py | 16 +++++++-------- 5 files changed, 42 insertions(+), 50 deletions(-) diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi index 27c6863..18748d1 100644 --- a/lark-stubs/lark.pyi +++ b/lark-stubs/lark.pyi @@ -33,7 +33,7 @@ class LarkOptions: regex: bool debug: bool keep_all_tokens: bool - propagate_positions: Union[bool, str] + propagate_positions: Union[bool, Callable] maybe_placeholders: bool lexer_callbacks: Dict[str, Callable[[Token], Token]] cache: Union[bool, str] @@ -77,7 +77,7 @@ class Lark: regex: bool = False, debug: bool = False, keep_all_tokens: bool = False, - propagate_positions: Union[bool, str] = False, + propagate_positions: Union[bool, Callable] = False, maybe_placeholders: bool = False, lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None, cache: Union[bool, str] = False, diff --git a/lark/lark.py b/lark/lark.py index 8e879cc..9863243 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -44,7 +44,7 @@ class LarkOptions(Serialize): Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster) propagate_positions Propagates (line, column, end_line, end_column) attributes into all tree branches. - Accepts ``False``, ``True``, or "ignore_ws", which will trim the whitespace around your trees. + Accepts ``False``, ``True``, or a callable, which will filter which nodes to ignore when propagating. maybe_placeholders When ``True``, the ``[]`` operator returns ``None`` when not matched. @@ -162,7 +162,7 @@ class LarkOptions(Serialize): assert_config(self.parser, ('earley', 'lalr', 'cyk', None)) if self.parser == 'earley' and self.transformer: - raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm.' + raise ConfigurationError('Cannot specify an embedded transformer when using the Earley algorithm. ' 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. 
LALR)') if o: diff --git a/lark/lexer.py b/lark/lexer.py index 2925c35..7c2f979 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -133,20 +133,20 @@ class Token(Str): def __new__(cls, type_, value, start_pos=None, line=None, column=None, end_line=None, end_column=None, end_pos=None, pos_in_stream=None): try: - self = super(Token, cls).__new__(cls, value) + inst = super(Token, cls).__new__(cls, value) except UnicodeDecodeError: value = value.decode('latin1') - self = super(Token, cls).__new__(cls, value) - - self.type = type_ - self.start_pos = start_pos if start_pos is not None else pos_in_stream - self.value = value - self.line = line - self.column = column - self.end_line = end_line - self.end_column = end_column - self.end_pos = end_pos - return self + inst = super(Token, cls).__new__(cls, value) + + inst.type = type_ + inst.start_pos = start_pos if start_pos is not None else pos_in_stream + inst.value = value + inst.line = line + inst.column = column + inst.end_line = end_line + inst.end_column = end_column + inst.end_pos = end_pos + return inst @property def pos_in_stream(self): @@ -258,8 +258,8 @@ def _create_unless(terminals, g_regex_flags, re_, use_bytes): if unless: callback[retok.name] = UnlessCallback(Scanner(unless, g_regex_flags, re_, match_whole=True, use_bytes=use_bytes)) - terminals = [t for t in terminals if t not in embedded_strs] - return terminals, callback + new_terminals = [t for t in terminals if t not in embedded_strs] + return new_terminals, callback diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 7a854bc..b4929c6 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -23,8 +23,9 @@ class ExpandSingleChild: class PropagatePositions: - def __init__(self, node_builder): + def __init__(self, node_builder, node_filter=None): self.node_builder = node_builder + self.node_filter = node_filter def __call__(self, children): res = self.node_builder(children) @@ -33,44 +34,35 @@ class PropagatePositions: if isinstance(res, Tree): res_meta = res.meta - src_meta = self._pp_get_meta(children) - if src_meta is not None: - res_meta.line = src_meta.line - res_meta.column = src_meta.column - res_meta.start_pos = src_meta.start_pos + first_meta = self._pp_get_meta(children) + if first_meta is not None: + res_meta.line = first_meta.line + res_meta.column = first_meta.column + res_meta.start_pos = first_meta.start_pos res_meta.empty = False - src_meta = self._pp_get_meta(reversed(children)) - if src_meta is not None: - res_meta.end_line = src_meta.end_line - res_meta.end_column = src_meta.end_column - res_meta.end_pos = src_meta.end_pos + last_meta = self._pp_get_meta(reversed(children)) + if last_meta is not None: + res_meta.end_line = last_meta.end_line + res_meta.end_column = last_meta.end_column + res_meta.end_pos = last_meta.end_pos res_meta.empty = False return res def _pp_get_meta(self, children): for c in children: + if self.node_filter is not None and not self.node_filter(c): + continue if isinstance(c, Tree): if not c.meta.empty: return c.meta elif isinstance(c, Token): return c -class PropagatePositions_IgnoreWs(PropagatePositions): - def _pp_get_meta(self, children): - for c in children: - if isinstance(c, Tree): - if not c.meta.empty: - return c.meta - elif isinstance(c, Token): - if c and not c.isspace(): # Disregard whitespace-only tokens - return c - - def make_propagate_positions(option): - if option == "ignore_ws": - return PropagatePositions_IgnoreWs + if callable(option): + return partial(PropagatePositions, 
node_filter=option) elif option is True: return PropagatePositions elif option is False: diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index e066d9a..1818ca7 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -92,26 +92,26 @@ class ParsingFrontend(Serialize): def _verify_start(self, start=None): if start is None: - start = self.parser_conf.start - if len(start) > 1: - raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start) - start ,= start + start_decls = self.parser_conf.start + if len(start_decls) > 1: + raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start_decls) + start ,= start_decls elif start not in self.parser_conf.start: raise ConfigurationError("Unknown start rule %s. Must be one of %r" % (start, self.parser_conf.start)) return start def parse(self, text, start=None, on_error=None): - start = self._verify_start(start) + chosen_start = self._verify_start(start) stream = text if self.skip_lexer else LexerThread(self.lexer, text) kw = {} if on_error is None else {'on_error': on_error} - return self.parser.parse(stream, start, **kw) + return self.parser.parse(stream, chosen_start, **kw) def parse_interactive(self, text=None, start=None): - start = self._verify_start(start) + chosen_start = self._verify_start(start) if self.parser_conf.parser_type != 'lalr': raise ConfigurationError("parse_interactive() currently only works with parser='lalr' ") stream = text if self.skip_lexer else LexerThread(self.lexer, text) - return self.parser.parse_interactive(stream, start) + return self.parser.parse_interactive(stream, chosen_start) def get_frontend(parser, lexer): From 24f653080f1118471934dba1d2ebc133c992305b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 16 Jun 2021 10:48:56 +0300 Subject: [PATCH 07/31] More minor refactorings --- lark/exceptions.py | 15 +++++++++------ lark/lark.py | 12 ++++++------ lark/parsers/lalr_parser.py | 4 ++-- 3 files changed, 17 insertions(+), 14 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 26ffce3..9d326b8 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -129,6 +129,8 @@ class UnexpectedInput(LarkError): class UnexpectedEOF(ParseError, UnexpectedInput): def __init__(self, expected, state=None, terminals_by_name=None): + super(UnexpectedEOF, self).__init__() + self.expected = expected self.state = state from .lexer import Token @@ -138,7 +140,6 @@ class UnexpectedEOF(ParseError, UnexpectedInput): self.column = -1 self._terminals_by_name = terminals_by_name - super(UnexpectedEOF, self).__init__() def __str__(self): message = "Unexpected end-of-input. 
" @@ -149,6 +150,8 @@ class UnexpectedEOF(ParseError, UnexpectedInput): class UnexpectedCharacters(LexError, UnexpectedInput): def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, terminals_by_name=None, considered_rules=None): + super(UnexpectedCharacters, self).__init__() + # TODO considered_tokens and allowed can be figured out using state self.line = line self.column = column @@ -167,7 +170,6 @@ class UnexpectedCharacters(LexError, UnexpectedInput): self.char = seq[lex_pos] self._context = self.get_context(seq) - super(UnexpectedCharacters, self).__init__() def __str__(self): message = "No terminal matches '%s' in the current parser context, at line %d col %d" % (self.char, self.line, self.column) @@ -190,6 +192,8 @@ class UnexpectedToken(ParseError, UnexpectedInput): """ def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None): + super(UnexpectedToken, self).__init__() + # TODO considered_rules and expected can be figured out using state self.line = getattr(token, 'line', '?') self.column = getattr(token, 'column', '?') @@ -204,7 +208,6 @@ class UnexpectedToken(ParseError, UnexpectedInput): self._terminals_by_name = terminals_by_name self.token_history = token_history - super(UnexpectedToken, self).__init__() @property def accepts(self): @@ -236,10 +239,10 @@ class VisitError(LarkError): """ def __init__(self, rule, obj, orig_exc): - self.obj = obj - self.orig_exc = orig_exc - message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) + self.obj = obj + self.orig_exc = orig_exc + ###} diff --git a/lark/lark.py b/lark/lark.py index 9863243..9a4b2d5 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -451,11 +451,11 @@ class Lark(Serialize): d = f else: d = pickle.load(f) - memo = d['memo'] + memo_json = d['memo'] data = d['data'] - assert memo - memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {}) + assert memo_json + memo = SerializeMemoizer.deserialize(memo_json, {'Rule': Rule, 'TerminalDef': TerminalDef}, {}) options = dict(data['options']) if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults): raise ConfigurationError("Some options are not allowed when loading a Parser: {}" @@ -512,11 +512,11 @@ class Lark(Serialize): Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...) 
""" - package = FromPackageLoader(package, search_paths) - full_path, text = package(None, grammar_path) + package_loader = FromPackageLoader(package, search_paths) + full_path, text = package_loader(None, grammar_path) options.setdefault('source_path', full_path) options.setdefault('import_paths', []) - options['import_paths'].append(package) + options['import_paths'].append(package_loader) return cls(text, **options) def __repr__(self): diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index fe40791..d916b46 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -178,8 +178,8 @@ class _Parser(object): for token in state.lexer.lex(state): state.feed_token(token) - token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) - return state.feed_token(token, True) + end_token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) + return state.feed_token(end_token, True) except UnexpectedInput as e: try: e.interactive_parser = InteractiveParser(self, state, state.lexer) From a13cfcef55f6460b9b8897e9c313b9bcb4c80b33 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 5 Jul 2021 12:08:38 +0300 Subject: [PATCH 08/31] Bugfix in propagate_positions: Corrected to account for 'container nodes' --- lark/parse_tree_builder.py | 31 ++++++++++++++++++++++--------- 1 file changed, 22 insertions(+), 9 deletions(-) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index b4929c6..39d3510 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -30,23 +30,36 @@ class PropagatePositions: def __call__(self, children): res = self.node_builder(children) - # local reference to Tree.meta reduces number of presence checks if isinstance(res, Tree): + # Calculate positions while the tree is streaming, according to the rule: + # - nodes start at the start of their first child's container, + # and end at the end of their last child's container. + # Containers are nodes that take up space in text, but have been inlined in the tree. + res_meta = res.meta first_meta = self._pp_get_meta(children) if first_meta is not None: - res_meta.line = first_meta.line - res_meta.column = first_meta.column - res_meta.start_pos = first_meta.start_pos - res_meta.empty = False + # meta was already set, probably because the rule has been inlined (e.g. 
`?rule`) + if not hasattr(res_meta, 'line'): + res_meta.line = getattr(first_meta, 'container_line', first_meta.line) + res_meta.column = getattr(first_meta, 'container_column', first_meta.column) + res_meta.start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos) + res_meta.empty = False + + res_meta.container_line = getattr(first_meta, 'container_line', first_meta.line) + res_meta.container_column = getattr(first_meta, 'container_column', first_meta.column) last_meta = self._pp_get_meta(reversed(children)) if last_meta is not None: - res_meta.end_line = last_meta.end_line - res_meta.end_column = last_meta.end_column - res_meta.end_pos = last_meta.end_pos - res_meta.empty = False + if not hasattr(res_meta, 'end_line'): + res_meta.end_line = getattr(last_meta, 'container_end_line', last_meta.end_line) + res_meta.end_column = getattr(last_meta, 'container_end_column', last_meta.end_column) + res_meta.end_pos = getattr(last_meta, 'container_end_pos', last_meta.end_pos) + res_meta.empty = False + + res_meta.container_end_line = getattr(last_meta, 'container_end_line', last_meta.end_line) + res_meta.container_end_column = getattr(last_meta, 'container_end_column', last_meta.end_column) return res From d7d02e930899048a18b094d798080e59c5b9af9b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 5 Jul 2021 12:11:03 +0300 Subject: [PATCH 09/31] Tiny comment fix --- lark/parse_tree_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 39d3510..286038e 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -40,8 +40,8 @@ class PropagatePositions: first_meta = self._pp_get_meta(children) if first_meta is not None: - # meta was already set, probably because the rule has been inlined (e.g. `?rule`) if not hasattr(res_meta, 'line'): + # meta was already set, probably because the rule has been inlined (e.g. 
`?rule`) res_meta.line = getattr(first_meta, 'container_line', first_meta.line) res_meta.column = getattr(first_meta, 'container_column', first_meta.column) res_meta.start_pos = getattr(first_meta, 'container_start_pos', first_meta.start_pos) From c953dd9505dbba1bd8fbded0077a040a1ce0e5b5 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 5 Jul 2021 13:15:29 +0300 Subject: [PATCH 10/31] Tests: Added a test case demonstrating the need for calculating containers --- tests/test_parser.py | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tests/test_parser.py b/tests/test_parser.py index ff4e064..40ed131 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -94,6 +94,26 @@ class TestParsers(unittest.TestCase): r = g.parse('a') self.assertEqual( r.children[0].meta.line, 1 ) + def test_propagate_positions2(self): + g = Lark("""start: a + a: b + ?b: "(" t ")" + !t: "t" + """, propagate_positions=True) + + start = g.parse("(t)") + a ,= start.children + t ,= a.children + assert t.children[0] == "t" + + assert t.meta.column == 2 + assert t.meta.end_column == 3 + + assert start.column == a.column == 1 + assert start.end_column == a.end_column == 4 + + + def test_expand1(self): g = Lark("""start: a From f14ff6d4d14b500410b8d0d5e14fd2908be95dd9 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 5 Jul 2021 14:33:28 +0300 Subject: [PATCH 11/31] Fixed tests to use meta (Tree.column is deprecated) --- tests/test_parser.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index 40ed131..8fec82d 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -109,8 +109,8 @@ class TestParsers(unittest.TestCase): assert t.meta.column == 2 assert t.meta.end_column == 3 - assert start.column == a.column == 1 - assert start.end_column == a.end_column == 4 + assert start.meta.column == a.meta.column == 1 + assert start.meta.end_column == a.meta.end_column == 4 From b37519b7c882d3fbfbf44822d8f3e72898a2c2c3 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 9 Jul 2021 22:44:31 +0300 Subject: [PATCH 12/31] Bugfix for deepcopy + small unrelated refactor (issue #938) --- lark/common.py | 12 ++++++++++++ lark/utils.py | 14 +++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/lark/common.py b/lark/common.py index 467acf8..cb408d9 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,4 +1,5 @@ from warnings import warn +from copy import deepcopy from .utils import Serialize from .lexer import TerminalDef @@ -31,6 +32,17 @@ class LexerConf(Serialize): def _deserialize(self): self.terminals_by_name = {t.name: t for t in self.terminals} + def __deepcopy__(self, memo=None): + return type(self)( + deepcopy(self.terminals, memo), + self.re_module, + deepcopy(self.ignore, memo), + deepcopy(self.postlex, memo), + deepcopy(self.callbacks, memo), + deepcopy(self.g_regex_flags, memo), + deepcopy(self.skip_validation, memo), + deepcopy(self.use_bytes, memo), + ) class ParserConf(Serialize): diff --git a/lark/utils.py b/lark/utils.py index b9d7ac3..ea78801 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -73,14 +73,13 @@ class Serialize(object): fields = getattr(self, '__serialize_fields__') res = {f: _serialize(getattr(self, f), memo) for f in fields} res['__type__'] = type(self).__name__ - postprocess = getattr(self, '_serialize', None) - if postprocess: - postprocess(res, memo) + if hasattr(self, '_serialize'): + self._serialize(res, memo) return res @classmethod def deserialize(cls, data, memo): - namespace = 
getattr(cls, '__serialize_namespace__', {}) + namespace = getattr(cls, '__serialize_namespace__', []) namespace = {c.__name__:c for c in namespace} fields = getattr(cls, '__serialize_fields__') @@ -94,9 +93,10 @@ class Serialize(object): setattr(inst, f, _deserialize(data[f], namespace, memo)) except KeyError as e: raise KeyError("Cannot find key for class", cls, e) - postprocess = getattr(inst, '_deserialize', None) - if postprocess: - postprocess() + + if hasattr(inst, '_deserialize'): + inst._deserialize() + return inst From 688c581949b94eccd7ba30baa092a3e4189af008 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Tue, 13 Jul 2021 16:12:09 +0200 Subject: [PATCH 13/31] Updated a few links I believe that the changed link from `examples` to `/examples` isn't a problem on readthedocs, but we should check. If it works, this PR fixes #941 . --- docs/json_tutorial.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/json_tutorial.md b/docs/json_tutorial.md index 65c6c78..668d9de 100644 --- a/docs/json_tutorial.md +++ b/docs/json_tutorial.md @@ -427,9 +427,9 @@ I measured memory consumption using a little script called [memusg](https://gist | Lark - Earley *(with lexer)* | 42s | 4s | 1167M | 608M | | Lark - LALR(1) | 8s | 1.53s | 453M | 266M | | Lark - LALR(1) tree-less | 4.76s | 1.23s | 70M | 134M | -| PyParsing ([Parser](http://pyparsing.wikispaces.com/file/view/jsonParser.py)) | 32s | 3.53s | 443M | 225M | -| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/funcparserlib/tests/json.py)) | 8.5s | 1.3s | 483M | 293M | -| Parsimonious ([Parser](https://gist.githubusercontent.com/reclosedev/5222560/raw/5e97cf7eb62c3a3671885ec170577285e891f7d5/parsimonious_json.py)) | ? | 5.7s | ? | 1545M | +| PyParsing ([Parser](https://github.com/pyparsing/pyparsing/blob/master/examples/jsonParser.py)) | 32s | 3.53s | 443M | 225M | +| funcparserlib ([Parser](https://github.com/vlasovskikh/funcparserlib/blob/master/tests/json.py)) | 8.5s | 1.3s | 483M | 293M | +| Parsimonious ([Parser](https://gist.github.com/reclosedev/5222560)) | ? | 5.7s | ? | 1545M | I added a few other parsers for comparison. PyParsing and funcparselib fair pretty well in their memory usage (they don't build a tree), but they can't compete with the run-time speed of LALR(1). @@ -442,7 +442,7 @@ Once again, shout-out to PyPy for being so effective. This is the end of the tutorial. I hoped you liked it and learned a little about Lark. -To see what else you can do with Lark, check out the [examples](examples). +To see what else you can do with Lark, check out the [examples](/examples). For questions or any other subject, feel free to email me at erezshin at gmail dot com. 
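
The `__deepcopy__` override introduced in patch 12 (and repeated in patch 14 below) works by leaving `re_module` out of the deep copy: `LexerConf` stores a module object (`re` or `regex`), and a default deep copy would try to duplicate it along with the rest of the instance's attributes. The sketch below illustrates that pattern in isolation. It is a simplified, hypothetical stand-in for `lark.common.LexerConf`, not code from the patches, and the motivation (that deep-copying a stored module attribute fails, since `deepcopy` falls back to pickling, which rejects modules) is an inference from issue #938 rather than something the diff states.

    # Simplified, hypothetical stand-in for LexerConf: keep the module
    # attribute by reference, deep-copy everything else.
    import re
    from copy import deepcopy

    class FakeLexerConf:
        def __init__(self, terminals, re_module, ignore=()):
            self.terminals = terminals      # per-instance data worth copying
            self.re_module = re_module      # the `re` or `regex` module itself
            self.ignore = ignore

        def __deepcopy__(self, memo=None):
            return type(self)(
                deepcopy(self.terminals, memo),
                self.re_module,             # passed through, not copied
                deepcopy(self.ignore, memo),
            )

    conf = FakeLexerConf([('NUMBER', r'\d+')], re)
    conf_copy = deepcopy(conf)
    assert conf_copy.terminals is not conf.terminals   # data really copied
    assert conf_copy.re_module is re                    # module shared by reference

Passing the module through by reference is reasonable because modules are effectively global singletons: there is nothing instance-specific in them to copy.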
From 7cb8acbe54eb108b6e99859adfd41717df43e032 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 9 Jul 2021 22:44:31 +0300 Subject: [PATCH 14/31] Bugfix for deepcopy + small unrelated refactor (issue #938) --- lark/common.py | 12 ++++++++++++ lark/utils.py | 14 +++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/lark/common.py b/lark/common.py index 467acf8..cb408d9 100644 --- a/lark/common.py +++ b/lark/common.py @@ -1,4 +1,5 @@ from warnings import warn +from copy import deepcopy from .utils import Serialize from .lexer import TerminalDef @@ -31,6 +32,17 @@ class LexerConf(Serialize): def _deserialize(self): self.terminals_by_name = {t.name: t for t in self.terminals} + def __deepcopy__(self, memo=None): + return type(self)( + deepcopy(self.terminals, memo), + self.re_module, + deepcopy(self.ignore, memo), + deepcopy(self.postlex, memo), + deepcopy(self.callbacks, memo), + deepcopy(self.g_regex_flags, memo), + deepcopy(self.skip_validation, memo), + deepcopy(self.use_bytes, memo), + ) class ParserConf(Serialize): diff --git a/lark/utils.py b/lark/utils.py index b9d7ac3..ea78801 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -73,14 +73,13 @@ class Serialize(object): fields = getattr(self, '__serialize_fields__') res = {f: _serialize(getattr(self, f), memo) for f in fields} res['__type__'] = type(self).__name__ - postprocess = getattr(self, '_serialize', None) - if postprocess: - postprocess(res, memo) + if hasattr(self, '_serialize'): + self._serialize(res, memo) return res @classmethod def deserialize(cls, data, memo): - namespace = getattr(cls, '__serialize_namespace__', {}) + namespace = getattr(cls, '__serialize_namespace__', []) namespace = {c.__name__:c for c in namespace} fields = getattr(cls, '__serialize_fields__') @@ -94,9 +93,10 @@ class Serialize(object): setattr(inst, f, _deserialize(data[f], namespace, memo)) except KeyError as e: raise KeyError("Cannot find key for class", cls, e) - postprocess = getattr(inst, '_deserialize', None) - if postprocess: - postprocess() + + if hasattr(inst, '_deserialize'): + inst._deserialize() + return inst From 87a18a098e306dbe0f4258732ad8944832dc4a39 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 15 Jul 2021 17:00:15 +0300 Subject: [PATCH 15/31] Tiny fix: MakeParsingFrontend is a regular method, not a classmethod --- lark/parser_frontends.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 1818ca7..0e53dd5 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -39,8 +39,7 @@ class MakeParsingFrontend: lexer_conf.lexer_type = self.lexer_type return ParsingFrontend(lexer_conf, parser_conf, options) - @classmethod - def deserialize(cls, data, memo, lexer_conf, callbacks, options): + def deserialize(self, data, memo, lexer_conf, callbacks, options): parser_conf = ParserConf.deserialize(data['parser_conf'], memo) parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug) parser_conf.callbacks = callbacks From 5e5bd187a6fed1d94ff253dbd4f7d908e1d72476 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 22 Jul 2021 11:21:13 +0300 Subject: [PATCH 16/31] Docs: Improved documentation of exceptions --- docs/classes.rst | 2 ++ docs/visitors.rst | 5 +++++ lark/ast_utils.py | 4 ++-- lark/exceptions.py | 31 +++++++++++++++++++++++++------ lark/lark.py | 8 +++++++- 5 files changed, 41 insertions(+), 9 deletions(-) diff --git a/docs/classes.rst b/docs/classes.rst index 7b18460..1287896 100644 --- a/docs/classes.rst 
+++ b/docs/classes.rst @@ -66,6 +66,8 @@ UnexpectedInput .. autoclass:: lark.exceptions.UnexpectedCharacters +.. autoclass:: lark.exceptions.UnexpectedEOF + InteractiveParser ----------------- diff --git a/docs/visitors.rst b/docs/visitors.rst index a0e1711..f263712 100644 --- a/docs/visitors.rst +++ b/docs/visitors.rst @@ -107,3 +107,8 @@ Discard ------- .. autoclass:: lark.visitors.Discard + +VisitError +------- + +.. autoclass:: lark.exceptions.VisitError \ No newline at end of file diff --git a/lark/ast_utils.py b/lark/ast_utils.py index 0f2e498..b5463a2 100644 --- a/lark/ast_utils.py +++ b/lark/ast_utils.py @@ -36,8 +36,8 @@ def create_transformer(ast_module, transformer=None): Classes starting with an underscore (`_`) will be skipped. Parameters: - ast_module - A Python module containing all the subclasses of `ast_utils.Ast` - transformer (Optional[Transformer]) - An initial transformer. Its attributes may be overwritten. + ast_module: A Python module containing all the subclasses of ``ast_utils.Ast`` + transformer (Optional[Transformer]): An initial transformer. Its attributes may be overwritten. """ t = transformer or Transformer() diff --git a/lark/exceptions.py b/lark/exceptions.py index 9d326b8..fdcd52b 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -36,8 +36,9 @@ class UnexpectedInput(LarkError): Used as a base class for the following exceptions: - - ``UnexpectedToken``: The parser received an unexpected token - ``UnexpectedCharacters``: The lexer encountered an unexpected string + - ``UnexpectedToken``: The parser received an unexpected token + - ``UnexpectedEOF``: The parser expected a token, but the input ended After catching one of these exceptions, you may call the following helper methods to create a nicer error message. """ @@ -128,6 +129,9 @@ class UnexpectedInput(LarkError): class UnexpectedEOF(ParseError, UnexpectedInput): + """An exception that is raised by the parser, when the input ends while it still expects a token. + """ + def __init__(self, expected, state=None, terminals_by_name=None): super(UnexpectedEOF, self).__init__() @@ -148,6 +152,10 @@ class UnexpectedEOF(ParseError, UnexpectedInput): class UnexpectedCharacters(LexError, UnexpectedInput): + """An exception that is raised by the lexer, when it cannot match the next + string of characters to any of its terminals. + """ + def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None, terminals_by_name=None, considered_rules=None): super(UnexpectedCharacters, self).__init__() @@ -185,10 +193,15 @@ class UnexpectedToken(ParseError, UnexpectedInput): """An exception that is raised by the parser, when the token it received doesn't match any valid step forward. - The parser provides an interactive instance through `interactive_parser`, - which is initialized to the point of failture, and can be used for debugging and error handling. + Parameters: + token: The mismatched token + expected: The set of expected tokens + considered_rules: Which rules were considered, to deduce the expected tokens + state: A value representing the parser state. Do not rely on its value or type. + interactive_parser: An instance of ``InteractiveParser``, that is initialized to the point of failture, + and can be used for debugging and error handling. - see: ``InteractiveParser``. + Note: These parameters are available as attributes of the instance. 
""" def __init__(self, token, expected, considered_rules=None, state=None, interactive_parser=None, terminals_by_name=None, token_history=None): @@ -234,14 +247,20 @@ class VisitError(LarkError): """VisitError is raised when visitors are interrupted by an exception It provides the following attributes for inspection: - - obj: the tree node or token it was processing when the exception was raised - - orig_exc: the exception that cause it to fail + + Parameters: + rule: the name of the visit rule that failed + obj: the tree-node or token that was being processed + orig_exc: the exception that cause it to fail + + Note: These parameters are available as attributes """ def __init__(self, rule, obj, orig_exc): message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) + self.rule = rule self.obj = obj self.orig_exc = orig_exc diff --git a/lark/lark.py b/lark/lark.py index 9a4b2d5..45dec4d 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -102,7 +102,7 @@ class LarkOptions(Serialize): A List of either paths or loader functions to specify from where grammars are imported source_path Override the source of from where the grammar was loaded. Useful for relative imports and unconventional grammar loading - **=== End Options ===** + **=== End of Options ===** """ if __doc__: __doc__ += OPTIONS_DOC @@ -527,6 +527,8 @@ class Lark(Serialize): """Only lex (and postlex) the text, without parsing it. Only relevant when lexer='standard' When dont_ignore=True, the lexer will return all tokens, even those marked for %ignore. + + :raises UnexpectedCharacters: In case the lexer cannot find a suitable match. """ if not hasattr(self, 'lexer') or dont_ignore: lexer = self._build_lexer(dont_ignore) @@ -569,6 +571,10 @@ class Lark(Serialize): If a transformer is supplied to ``__init__``, returns whatever is the result of the transformation. Otherwise, returns a Tree instance. + :raises UnexpectedInput: On a parse error, one of these sub-exceptions will rise: + ``UnexpectedCharacters``, ``UnexpectedToken``, or ``UnexpectedEOF``. + For convenience, these sub-exceptions also inherit from ``ParserError`` and ``LexerError``. 
+ """ return self.parser.parse(text, start=start, on_error=on_error) From 55642be13c1a5ac36a999124ae3c875492d574d1 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 25 Jul 2021 17:13:23 +0300 Subject: [PATCH 17/31] Tiny adjustments --- examples/standalone/json_parser_main.py | 4 +++- lark/parsers/lalr_interactive_parser.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/standalone/json_parser_main.py b/examples/standalone/json_parser_main.py index 503b249..3d9b5a6 100644 --- a/examples/standalone/json_parser_main.py +++ b/examples/standalone/json_parser_main.py @@ -10,7 +10,9 @@ Standalone Parser import sys -from json_parser import Lark_StandAlone, Transformer, inline_args +from json_parser import Lark_StandAlone, Transformer, v_args + +inline_args = v_args(inline=True) class TreeToJson(Transformer): @inline_args diff --git a/lark/parsers/lalr_interactive_parser.py b/lark/parsers/lalr_interactive_parser.py index ce596b5..d6780cb 100644 --- a/lark/parsers/lalr_interactive_parser.py +++ b/lark/parsers/lalr_interactive_parser.py @@ -65,7 +65,7 @@ class InteractiveParser(object): """Print the output of ``choices()`` in a way that's easier to read.""" out = ["Parser choices:"] for k, v in self.choices().items(): - out.append('\t- %s -> %s' % (k, v)) + out.append('\t- %s -> %r' % (k, v)) out.append('stack size: %s' % len(self.parser_state.state_stack)) return '\n'.join(out) From b0a9afb287eaaeb139140d088cccbd6167f92aa1 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Sun, 25 Jul 2021 23:07:08 +0200 Subject: [PATCH 18/31] Split up repeats from tilde into different rules. --- lark/load_grammar.py | 24 ++++++++++++++++++++++-- lark/utils.py | 30 ++++++++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 2 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index dbf4a1f..569e67d 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -9,7 +9,7 @@ import pkgutil from ast import literal_eval from numbers import Integral -from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique +from .utils import bfs, Py36, logger, classify_bool, is_id_continue, is_id_start, bfs_all_unique, small_factors from .lexer import Token, TerminalDef, PatternStr, PatternRE from .parse_tree_builder import ParseTreeBuilder @@ -196,6 +196,26 @@ class EBNF_to_BNF(Transformer_InPlace): self.rules_by_expr[expr] = t return t + def _add_repeat_rule(self, a, b, target, atom): + if (a, b, target, atom) in self.rules_by_expr: + return self.rules_by_expr[(a, b, target, atom)] + new_name = '__%s_a%d_b%d_%d' % (self.prefix, a, b, self.i) + self.i += 1 + t = NonTerminal(new_name) + tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) + self.new_rules.append((new_name, tree, self.rule_options)) + self.rules_by_expr[(a, b, target, atom)] = t + return t + + def _generate_repeats(self, rule, mn, mx): + factors = small_factors(mn) + target = rule + for a, b in factors: + target = self._add_repeat_rule(a, b, target, rule) + + # return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) + return ST('expansions', [ST('expansion', [target] + [rule] * n) for n in range(0, mx - mn + 1)]) + def expr(self, rule, op, *args): if op.value == '?': empty = ST('expansion', []) @@ -220,7 +240,7 @@ class EBNF_to_BNF(Transformer_InPlace): mn, mx = map(int, args) if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) - return ST('expansions', [ST('expansion', [rule] * n) for n in 
range(mn, mx+1)]) + return self._generate_repeats(rule, mn, mx) assert False, op def maybe(self, rule): diff --git a/lark/utils.py b/lark/utils.py index ea78801..a3a077f 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -359,3 +359,33 @@ def _serialize(value, memo): return {key:_serialize(elem, memo) for key, elem in value.items()} # assert value is None or isinstance(value, (int, float, str, tuple)), value return value + + +def small_factors(n): + """ + Splits n up into smaller factors and summands <= 10. + Returns a list of [(a, b), ...] + so that the following code returns n: + + n = 1 + for a, b in values: + n = n * a + b + + Currently, we also keep a + b <= 10, but that might change + """ + assert n > 0 + if n < 10: + return [(n, 0)] + # TODO: Think of better algorithms (Prime factors should minimize the number of steps) + for a in range(10, 1, -1): + b = n % a + if a + b > 10: + continue + r = n // a + assert r * a + b == n # Sanity check + if r <= 10: + return [(r, 0), (a, b)] + else: + return [*small_factors(r), (a, b)] + # This should be unreachable, since 2 + 1 <= 10 + assert False, "Failed to factorize %s" % n From 845b6fa477827d6ee77a21eaced1c3f3a4a8d8b0 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 26 Jul 2021 01:14:46 +0200 Subject: [PATCH 19/31] Refactor + tests + additional splitting up. --- lark/load_grammar.py | 100 ++++++++++++++++++++++++++++++++----------- lark/utils.py | 2 +- tests/test_parser.py | 29 +++++++++++++ 3 files changed, 105 insertions(+), 26 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 569e67d..2f51ff6 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -179,42 +179,87 @@ RULES = { class EBNF_to_BNF(Transformer_InPlace): def __init__(self): self.new_rules = [] - self.rules_by_expr = {} + self.rules_cache = {} self.prefix = 'anon' self.i = 0 self.rule_options = None - def _add_recurse_rule(self, type_, expr): - if expr in self.rules_by_expr: - return self.rules_by_expr[expr] - - new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) + def _name_rule(self, inner): + new_name = '__%s_%s_%d' % (self.prefix, inner, self.i) self.i += 1 - t = NonTerminal(new_name) - tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])]) - self.new_rules.append((new_name, tree, self.rule_options)) - self.rules_by_expr[expr] = t + return new_name + + def _add_rule(self, key, name, expansions): + t = NonTerminal(name) + self.new_rules.append((name, expansions, self.rule_options)) + self.rules_cache[key] = t return t + def _add_recurse_rule(self, type_, expr): + try: + return self.rules_cache[expr] + except KeyError: + new_name = self._name_rule(type_) + t = NonTerminal(new_name) + tree = ST('expansions', [ + ST('expansion', [expr]), + ST('expansion', [t, expr]) + ]) + return self._add_rule(expr, new_name, tree) + def _add_repeat_rule(self, a, b, target, atom): - if (a, b, target, atom) in self.rules_by_expr: - return self.rules_by_expr[(a, b, target, atom)] - new_name = '__%s_a%d_b%d_%d' % (self.prefix, a, b, self.i) - self.i += 1 - t = NonTerminal(new_name) - tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) - self.new_rules.append((new_name, tree, self.rule_options)) - self.rules_by_expr[(a, b, target, atom)] = t - return t + """ + When target matches n times atom + This builds a rule that matches atom (a*n + b) times + """ + key = (a, b, target, atom) + try: + return self.rules_cache[key] + except KeyError: + new_name = self._name_rule('a%d_b%d' % (a, b)) + tree = ST('expansions', 
[ST('expansion', [target] * a + [atom] * b)]) + return self._add_rule(key, new_name, tree) + + def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): + """ + When target matches n times atom, and target_opt 0 to n-1 times target_opt, + This builds a rule that matches atom 0 to (a*n+b)-1 times + """ + key = (a, b, target, atom, "opt") + try: + return self.rules_cache[key] + except KeyError: + new_name = self._name_rule('a%d_b%d_opt' % (a, b)) + tree = ST('expansions', [ + ST('expansion', [target] * i + [target_opt]) + for i in range(a) + ] + [ + ST('expansion', [target] * a + [atom] * i) + for i in range(1, b) + ]) + return self._add_rule(key, new_name, tree) def _generate_repeats(self, rule, mn, mx): - factors = small_factors(mn) - target = rule - for a, b in factors: - target = self._add_repeat_rule(a, b, target, rule) + mn_factors = small_factors(mn) + mn_target = rule + for a, b in mn_factors: + mn_target = self._add_repeat_rule(a, b, mn_target, rule) + if mx == mn: + return mn_target + diff = mx - mn + 1 # We add one because _add_repeat_opt_rule needs it. + diff_factors = small_factors(diff) + diff_target = rule + diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. 1-1 times) + for a, b in diff_factors[:-1]: + new_diff_target = self._add_repeat_rule(a, b, diff_target, rule) + diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) + diff_target = new_diff_target + a, b = diff_factors[-1] + diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) # return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) - return ST('expansions', [ST('expansion', [target] + [rule] * n) for n in range(0, mx - mn + 1)]) + # return ST('expansions', [ST('expansion', [mn_target] + [rule] * n) for n in range(0, mx - mn + 1)]) + return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) def expr(self, rule, op, *args): if op.value == '?': @@ -240,7 +285,12 @@ class EBNF_to_BNF(Transformer_InPlace): mn, mx = map(int, args) if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) - return self._generate_repeats(rule, mn, mx) + # For small number of repeats, we don't need to build new rules. 
+ # Value 20 is arbitrarily chosen + if mx > 20: + return self._generate_repeats(rule, mn, mx) + else: + return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) assert False, op def maybe(self, rule): diff --git a/lark/utils.py b/lark/utils.py index a3a077f..2fa5f43 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -373,7 +373,7 @@ def small_factors(n): Currently, we also keep a + b <= 10, but that might change """ - assert n > 0 + assert n >= 0 if n < 10: return [(n, 0)] # TODO: Think of better algorithms (Prime factors should minimize the number of steps) diff --git a/tests/test_parser.py b/tests/test_parser.py index 8fec82d..6c00fbb 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2226,6 +2226,35 @@ def _make_parser_test(LEXER, PARSER): self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') + @unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated") + def test_ranged_repeat_large(self): + # Large is currently arbitrarily chosen to be large than 20 + g = u"""!start: "A"~30 + """ + l = _Lark(g) + self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") + self.assertEqual(l.parse(u'A'*30), Tree('start', ["A"]*30)) + self.assertRaises(ParseError, l.parse, u'A'*29) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A'*31) + + + g = u"""!start: "A"~0..100 + """ + l = _Lark(g) + self.assertEqual(l.parse(u''), Tree('start', [])) + self.assertEqual(l.parse(u'A'), Tree('start', ['A'])) + self.assertEqual(l.parse(u'A'*100), Tree('start', ['A']*100)) + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 101) + + # 8191 is a Mersenne prime + g = u"""start: "A"~8191 + """ + l = _Lark(g) + self.assertEqual(l.parse(u'A'*8191), Tree('start', [])) + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190) + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192) + + @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX def test_priority_vs_embedded(self): g = """ From b4fe22a27dd67bca414be767b92ab2960798f0d6 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 26 Jul 2021 10:50:37 +0200 Subject: [PATCH 20/31] Python2.7 + comments + Magic constants --- lark/load_grammar.py | 48 ++++++++++++++++++++++++++++++++++++-------- lark/utils.py | 23 ++++++++++++--------- tests/test_parser.py | 11 +++++----- 3 files changed, 58 insertions(+), 24 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 2f51ff6..2b1030f 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -174,6 +174,10 @@ RULES = { 'literal': ['REGEXP', 'STRING'], } +REPEAT_BREAK_THRESHOLD = 20 +# The Threshold whether repeat via ~ are split up into different rules +# For the moment 20 is arbitrarily chosen + @inline_args class EBNF_to_BNF(Transformer_InPlace): @@ -211,25 +215,50 @@ class EBNF_to_BNF(Transformer_InPlace): """ When target matches n times atom This builds a rule that matches atom (a*n + b) times + + The rule is of the form: + + The rules are of the form: (Example a = 3, b = 4) + + new_rule: target target target atom atom atom atom + + e.g. 
we use target * a and atom * b """ key = (a, b, target, atom) try: return self.rules_cache[key] except KeyError: - new_name = self._name_rule('a%d_b%d' % (a, b)) + new_name = self._name_rule('repeat_a%d_b%d' % (a, b)) tree = ST('expansions', [ST('expansion', [target] * a + [atom] * b)]) return self._add_rule(key, new_name, tree) def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): """ When target matches n times atom, and target_opt 0 to n-1 times target_opt, - This builds a rule that matches atom 0 to (a*n+b)-1 times + This builds a rule that matches atom 0 to (a*n+b)-1 times. + The created rule will not have any shift/reduce conflicts so that it can be used with lalr + + The rules are of the form: (Example a = 3, b = 4) + + new_rule: target_opt + | target target_opt + | target target target_opt + + | target target target atom + | target target target atom atom + | target target target atom atom atom + + First we generate target * i followed by target_opt for i from 0 to a-1 + These match 0 to n*a - 1 times atom + + Then we generate target * a followed by atom * i for i from 1 to b-1 + These match n*a to n*a + b-1 times atom """ key = (a, b, target, atom, "opt") try: return self.rules_cache[key] except KeyError: - new_name = self._name_rule('a%d_b%d_opt' % (a, b)) + new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b)) tree = ST('expansions', [ ST('expansion', [target] * i + [target_opt]) for i in range(a) @@ -240,13 +269,19 @@ class EBNF_to_BNF(Transformer_InPlace): return self._add_rule(key, new_name, tree) def _generate_repeats(self, rule, mn, mx): + """ + We treat rule~mn..mx as rule~mn rule~0..(diff=mx-mn). + We then use small_factors to split up mn and diff up into values [(a, b), ...] + This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt + to generate a complete rule/expression that matches the corresponding number of repeats + """ mn_factors = small_factors(mn) mn_target = rule for a, b in mn_factors: mn_target = self._add_repeat_rule(a, b, mn_target, rule) if mx == mn: return mn_target - diff = mx - mn + 1 # We add one because _add_repeat_opt_rule needs it. + diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less diff_factors = small_factors(diff) diff_target = rule diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. 1-1 times) @@ -257,8 +292,6 @@ class EBNF_to_BNF(Transformer_InPlace): a, b = diff_factors[-1] diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) - # return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) - # return ST('expansions', [ST('expansion', [mn_target] + [rule] * n) for n in range(0, mx - mn + 1)]) return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) def expr(self, rule, op, *args): @@ -286,8 +319,7 @@ class EBNF_to_BNF(Transformer_InPlace): if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) # For small number of repeats, we don't need to build new rules. 
- # Value 20 is arbitrarily chosen - if mx > 20: + if mx > REPEAT_BREAK_THRESHOLD: return self._generate_repeats(rule, mn, mx) else: return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) diff --git a/lark/utils.py b/lark/utils.py index 2fa5f43..1648720 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -187,7 +187,7 @@ def get_regexp_width(expr): return 1, sre_constants.MAXREPEAT else: return 0, sre_constants.MAXREPEAT - + ###} @@ -288,7 +288,7 @@ except ImportError: class FS: exists = os.path.exists - + @staticmethod def open(name, mode="r", **kwargs): if atomicwrites and "w" in mode: @@ -361,9 +361,13 @@ def _serialize(value, memo): return value +# 10 is arbitrarily chosen +SMALL_FACTOR_THRESHOLD = 10 + + def small_factors(n): """ - Splits n up into smaller factors and summands <= 10. + Splits n up into smaller factors and summands <= SMALL_FACTOR_THRESHOLD. Returns a list of [(a, b), ...] so that the following code returns n: @@ -371,21 +375,20 @@ def small_factors(n): for a, b in values: n = n * a + b - Currently, we also keep a + b <= 10, but that might change + Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change """ assert n >= 0 - if n < 10: + if n < SMALL_FACTOR_THRESHOLD: return [(n, 0)] # TODO: Think of better algorithms (Prime factors should minimize the number of steps) - for a in range(10, 1, -1): + for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): b = n % a - if a + b > 10: + if a + b > SMALL_FACTOR_THRESHOLD: continue r = n // a assert r * a + b == n # Sanity check - if r <= 10: + if r <= SMALL_FACTOR_THRESHOLD: return [(r, 0), (a, b)] else: - return [*small_factors(r), (a, b)] - # This should be unreachable, since 2 + 1 <= 10 + return small_factors(r) + [(a, b)] assert False, "Failed to factorize %s" % n diff --git a/tests/test_parser.py b/tests/test_parser.py index 6c00fbb..2247b46 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2233,24 +2233,23 @@ def _make_parser_test(LEXER, PARSER): """ l = _Lark(g) self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") - self.assertEqual(l.parse(u'A'*30), Tree('start', ["A"]*30)) - self.assertRaises(ParseError, l.parse, u'A'*29) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A'*31) - + self.assertEqual(l.parse(u'A' * 30), Tree('start', ["A"] * 30)) + self.assertRaises(ParseError, l.parse, u'A' * 29) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 31) g = u"""!start: "A"~0..100 """ l = _Lark(g) self.assertEqual(l.parse(u''), Tree('start', [])) self.assertEqual(l.parse(u'A'), Tree('start', ['A'])) - self.assertEqual(l.parse(u'A'*100), Tree('start', ['A']*100)) + self.assertEqual(l.parse(u'A' * 100), Tree('start', ['A'] * 100)) self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 101) # 8191 is a Mersenne prime g = u"""start: "A"~8191 """ l = _Lark(g) - self.assertEqual(l.parse(u'A'*8191), Tree('start', [])) + self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190) self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192) From 6872404f1123bc6dcabb4f1735622747999b2bdc Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 26 Jul 2021 12:20:15 +0300 Subject: [PATCH 21/31] Improvements to the Python3 grammar --- examples/advanced/python3.lark | 142 +++++++++++++++++++++------------ 1 file changed, 89 insertions(+), 53 deletions(-) diff --git a/examples/advanced/python3.lark 
b/examples/advanced/python3.lark index 0fc5949..e54eb69 100644 --- a/examples/advanced/python3.lark +++ b/examples/advanced/python3.lark @@ -21,7 +21,7 @@ decorators: decorator+ decorated: decorators (classdef | funcdef | async_funcdef) async_funcdef: "async" funcdef -funcdef: "def" NAME "(" parameters? ")" ["->" test] ":" suite +funcdef: "def" NAME "(" [parameters] ")" ["->" test] ":" suite parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams]] | starparams @@ -29,25 +29,36 @@ parameters: paramvalue ("," paramvalue)* ["," SLASH] ["," [starparams | kwparams SLASH: "/" // Otherwise the it will completely disappear and it will be undisguisable in the result starparams: "*" typedparam? ("," paramvalue)* ["," kwparams] -kwparams: "**" typedparam +kwparams: "**" typedparam ","? -?paramvalue: typedparam ["=" test] -?typedparam: NAME [":" test] +?paramvalue: typedparam ("=" test)? +?typedparam: NAME (":" test)? -varargslist: (vfpdef ["=" test] ("," vfpdef ["=" test])* ["," [ "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] | "**" vfpdef [","]]] - | "*" [vfpdef] ("," vfpdef ["=" test])* ["," ["**" vfpdef [","]]] - | "**" vfpdef [","]) -vfpdef: NAME +lambdef: "lambda" [lambda_params] ":" test +lambdef_nocond: "lambda" [lambda_params] ":" test_nocond +lambda_params: lambda_paramvalue ("," lambda_paramvalue)* ["," [lambda_starparams | lambda_kwparams]] + | lambda_starparams + | lambda_kwparams +?lambda_paramvalue: NAME ("=" test)? +lambda_starparams: "*" [NAME] ("," lambda_paramvalue)* ["," [lambda_kwparams]] +lambda_kwparams: "**" NAME ","? + ?stmt: simple_stmt | compound_stmt ?simple_stmt: small_stmt (";" small_stmt)* [";"] _NEWLINE -?small_stmt: (expr_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) -?expr_stmt: testlist_star_expr (annassign | augassign (yield_expr|testlist) - | ("=" (yield_expr|testlist_star_expr))*) -annassign: ":" test ["=" test] -?testlist_star_expr: (test|star_expr) ("," (test|star_expr))* [","] -!augassign: ("+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=") +?small_stmt: (expr_stmt | assign_stmt | del_stmt | pass_stmt | flow_stmt | import_stmt | global_stmt | nonlocal_stmt | assert_stmt) +expr_stmt: testlist_star_expr +assign_stmt: annassign | augassign | assign + +annassign: testlist_star_expr ":" test ["=" test] +assign: testlist_star_expr ("=" (yield_expr|testlist_star_expr))+ +augassign: testlist_star_expr augassign_op (yield_expr|testlist) +!augassign_op: "+=" | "-=" | "*=" | "@=" | "/=" | "%=" | "&=" | "|=" | "^=" | "<<=" | ">>=" | "**=" | "//=" +?testlist_star_expr: test_or_star_expr + | test_or_star_expr ("," test_or_star_expr)+ ","? 
-> tuple + | test_or_star_expr "," -> tuple + // For normal and annotated assignments, additional restrictions enforced by the interpreter del_stmt: "del" exprlist pass_stmt: "pass" @@ -71,43 +82,52 @@ global_stmt: "global" NAME ("," NAME)* nonlocal_stmt: "nonlocal" NAME ("," NAME)* assert_stmt: "assert" test ["," test] -compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt +?compound_stmt: if_stmt | while_stmt | for_stmt | try_stmt | with_stmt | funcdef | classdef | decorated | async_stmt async_stmt: "async" (funcdef | with_stmt | for_stmt) -if_stmt: "if" test ":" suite ("elif" test ":" suite)* ["else" ":" suite] +if_stmt: "if" test ":" suite elifs ["else" ":" suite] +elifs: elif_* +elif_: "elif" test ":" suite while_stmt: "while" test ":" suite ["else" ":" suite] for_stmt: "for" exprlist "in" testlist ":" suite ["else" ":" suite] -try_stmt: ("try" ":" suite ((except_clause ":" suite)+ ["else" ":" suite] ["finally" ":" suite] | "finally" ":" suite)) -with_stmt: "with" with_item ("," with_item)* ":" suite +try_stmt: "try" ":" suite except_clauses ["else" ":" suite] [finally] + | "try" ":" suite finally -> try_finally +finally: "finally" ":" suite +except_clauses: except_clause+ +except_clause: "except" [test ["as" NAME]] ":" suite + +with_stmt: "with" with_items ":" suite +with_items: with_item ("," with_item)* with_item: test ["as" expr] // NB compile.c makes sure that the default except clause is last -except_clause: "except" [test ["as" NAME]] suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT -?test: or_test ("if" or_test "else" test)? | lambdef +?test: or_test ("if" or_test "else" test)? + | lambdef ?test_nocond: or_test | lambdef_nocond -lambdef: "lambda" [varargslist] ":" test -lambdef_nocond: "lambda" [varargslist] ":" test_nocond + ?or_test: and_test ("or" and_test)* ?and_test: not_test ("and" not_test)* -?not_test: "not" not_test -> not +?not_test: "not" not_test -> not_test | comparison -?comparison: expr (_comp_op expr)* +?comparison: expr (comp_op expr)* star_expr: "*" expr -?expr: xor_expr ("|" xor_expr)* + +?expr: or_expr +?or_expr: xor_expr ("|" xor_expr)* ?xor_expr: and_expr ("^" and_expr)* ?and_expr: shift_expr ("&" shift_expr)* ?shift_expr: arith_expr (_shift_op arith_expr)* ?arith_expr: term (_add_op term)* ?term: factor (_mul_op factor)* -?factor: _factor_op factor | power +?factor: _unary_op factor | power -!_factor_op: "+"|"-"|"~" +!_unary_op: "+"|"-"|"~" !_add_op: "+"|"-" !_shift_op: "<<"|">>" !_mul_op: "*"|"@"|"/"|"%"|"//" // <> isn't actually a valid comparison operator in Python. It's here for the // sake of a __future__ import described in PEP 401 (which really works :-) -!_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" +!comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" ?power: await_expr ("**" factor)? ?await_expr: AWAIT? atom_expr @@ -118,61 +138,76 @@ AWAIT: "await" | atom_expr "." NAME -> getattr | atom -?atom: "(" [yield_expr|tuplelist_comp] ")" -> tuple - | "[" [testlist_comp] "]" -> list - | "{" [dict_comp] "}" -> dict - | "{" set_comp "}" -> set +?atom: "(" yield_expr ")" + | "(" _tuple_inner? ")" -> tuple + | "(" comprehension{test_or_star_expr} ")" -> tuple_comprehension + | "[" _testlist_comp? "]" -> list + | "[" comprehension{test_or_star_expr} "]" -> list_comprehension + | "{" _dict_exprlist? 
"}" -> dict + | "{" comprehension{key_value} "}" -> dict_comprehension + | "{" _set_exprlist "}" -> set + | "{" comprehension{test} "}" -> set_comprehension | NAME -> var - | number | string+ + | TEMPLATE_NAME -> template_var + | number + | string_concat | "(" test ")" | "..." -> ellipsis | "None" -> const_none | "True" -> const_true | "False" -> const_false -?testlist_comp: test | tuplelist_comp -tuplelist_comp: (test|star_expr) (comp_for | ("," (test|star_expr))+ [","] | ",") + +?string_concat: string+ + +_testlist_comp: test | _tuple_inner +_tuple_inner: test_or_star_expr (("," test_or_star_expr)+ [","] | ",") + + +?test_or_star_expr: test + | star_expr + ?subscriptlist: subscript | subscript (("," subscript)+ [","] | ",") -> subscript_tuple -subscript: test | ([test] ":" [test] [sliceop]) -> slice +?subscript: test | ([test] ":" [test] [sliceop]) -> slice sliceop: ":" [test] -exprlist: (expr|star_expr) - | (expr|star_expr) (("," (expr|star_expr))+ [","]|",") -> exprlist_tuple -testlist: test | testlist_tuple +?exprlist: (expr|star_expr) + | (expr|star_expr) (("," (expr|star_expr))+ [","]|",") +?testlist: test | testlist_tuple testlist_tuple: test (("," test)+ [","] | ",") -dict_comp: key_value comp_for - | (key_value | "**" expr) ("," (key_value | "**" expr))* [","] +_dict_exprlist: (key_value | "**" expr) ("," (key_value | "**" expr))* [","] key_value: test ":" test -set_comp: test comp_for - | (test|star_expr) ("," (test | star_expr))* [","] +_set_exprlist: test_or_star_expr ("," test_or_star_expr)* [","] classdef: "class" NAME ["(" [arguments] ")"] ":" suite + + arguments: argvalue ("," argvalue)* ("," [ starargs | kwargs])? | starargs | kwargs - | test comp_for + | comprehension{test} -starargs: "*" test ("," "*" test)* ("," argvalue)* ["," kwargs] +starargs: stararg ("," stararg)* ("," argvalue)* ["," kwargs] +stararg: "*" test kwargs: "**" test ?argvalue: test ("=" test)? 
- -comp_iter: comp_for | comp_if | async_for -async_for: "async" "for" exprlist "in" or_test [comp_iter] -comp_for: "for" exprlist "in" or_test [comp_iter] -comp_if: "if" test_nocond [comp_iter] +comprehension{comp_result}: comp_result comp_fors [comp_if] +comp_fors: comp_for+ +comp_for: [ASYNC] "for" exprlist "in" or_test +ASYNC: "async" +?comp_if: "if" test_nocond // not used in grammar, but may appear in "node" passed from Parser to Compiler encoding_decl: NAME -yield_expr: "yield" [yield_arg] -yield_arg: "from" test | testlist - +yield_expr: "yield" [testlist] + | "yield" "from" test -> yield_from number: DEC_NUMBER | HEX_NUMBER | BIN_NUMBER | OCT_NUMBER | FLOAT_NUMBER | IMAG_NUMBER string: STRING | LONG_STRING @@ -181,6 +216,7 @@ string: STRING | LONG_STRING %import python (NAME, COMMENT, STRING, LONG_STRING) %import python (DEC_NUMBER, HEX_NUMBER, OCT_NUMBER, BIN_NUMBER, FLOAT_NUMBER, IMAG_NUMBER) + // Other terminals _NEWLINE: ( /\r?\n[\t ]*/ | COMMENT )+ From 6a027982c7f8c014dc8403f9d22e52d1c9cb5a21 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 26 Jul 2021 14:54:41 +0300 Subject: [PATCH 22/31] Tiny fix to PR --- examples/advanced/python3.lark | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/advanced/python3.lark b/examples/advanced/python3.lark index e54eb69..7fb5ae5 100644 --- a/examples/advanced/python3.lark +++ b/examples/advanced/python3.lark @@ -148,7 +148,6 @@ AWAIT: "await" | "{" _set_exprlist "}" -> set | "{" comprehension{test} "}" -> set_comprehension | NAME -> var - | TEMPLATE_NAME -> template_var | number | string_concat | "(" test ")" From fa8565366be027df0f788cb1432d90b2b94aa264 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 26 Jul 2021 18:53:43 +0200 Subject: [PATCH 23/31] Off-by-one fix + Change of thresholds + fix tests --- lark/load_grammar.py | 29 ++++++++++++++++++++++------- lark/utils.py | 8 +++++--- tests/test_parser.py | 19 ++++++++++--------- 3 files changed, 37 insertions(+), 19 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 2b1030f..36f6e2c 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -174,9 +174,21 @@ RULES = { 'literal': ['REGEXP', 'STRING'], } -REPEAT_BREAK_THRESHOLD = 20 +REPEAT_BREAK_THRESHOLD = 50 # The Threshold whether repeat via ~ are split up into different rules -# For the moment 20 is arbitrarily chosen +# 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, +# while not being to overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. 
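The user-visible effect of this threshold: repeats below it stay within a single rule, while larger
repeats are compiled into extra helper rules. A rough check from the public API (exact rule counts
depend on the factorization):

    from lark import Lark

    small = Lark('start: "A"~10', parser='lalr')
    large = Lark('start: "A"~60', parser='lalr')
    assert len(small.rules) < len(large.rules)   # the ~60 repeat was split into helper rules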
+# For a grammar of the form start: "A"~0..N, these are the timing stats: +# N t +# 10 0.000 +# 20 0.004 +# 30 0.016 +# 40 0.049 +# 50 0.109 +# 60 0.215 +# 70 0.383 +# 80 0.631 +# (See PR #949) @inline_args @@ -244,6 +256,7 @@ class EBNF_to_BNF(Transformer_InPlace): | target target_opt | target target target_opt + | target target target | target target target atom | target target target atom atom | target target target atom atom atom @@ -251,7 +264,7 @@ class EBNF_to_BNF(Transformer_InPlace): First we generate target * i followed by target_opt for i from 0 to a-1 These match 0 to n*a - 1 times atom - Then we generate target * a followed by atom * i for i from 1 to b-1 + Then we generate target * a followed by atom * i for i from 0 to b-1 These match n*a to n*a + b-1 times atom """ key = (a, b, target, atom, "opt") @@ -264,7 +277,7 @@ class EBNF_to_BNF(Transformer_InPlace): for i in range(a) ] + [ ST('expansion', [target] * a + [atom] * i) - for i in range(1, b) + for i in range(b) ]) return self._add_rule(key, new_name, tree) @@ -281,15 +294,17 @@ class EBNF_to_BNF(Transformer_InPlace): mn_target = self._add_repeat_rule(a, b, mn_target, rule) if mx == mn: return mn_target + diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less diff_factors = small_factors(diff) - diff_target = rule - diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. 1-1 times) + diff_target = rule # Match rule 1 times + diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. up to 1 -1 times) for a, b in diff_factors[:-1]: new_diff_target = self._add_repeat_rule(a, b, diff_target, rule) diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) diff_target = new_diff_target - a, b = diff_factors[-1] + + a, b = diff_factors[-1] # We do the last on separately since we don't need to call self._add_repeat_rule diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) diff --git a/lark/utils.py b/lark/utils.py index 1648720..f447b9e 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -361,8 +361,9 @@ def _serialize(value, memo): return value -# 10 is arbitrarily chosen -SMALL_FACTOR_THRESHOLD = 10 +# Value 5 keeps the number of states in the lalr parser somewhat minimal +# It isn't optimal, but close to it. See PR #949 +SMALL_FACTOR_THRESHOLD = 5 def small_factors(n): @@ -380,7 +381,8 @@ def small_factors(n): assert n >= 0 if n < SMALL_FACTOR_THRESHOLD: return [(n, 0)] - # TODO: Think of better algorithms (Prime factors should minimize the number of steps) + # While this does not provide an optimal solution, it produces a pretty good one. 
+ # See above comment and PR #949 for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): b = n % a if a + b > SMALL_FACTOR_THRESHOLD: diff --git a/tests/test_parser.py b/tests/test_parser.py index 2247b46..b55f848 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2229,21 +2229,22 @@ def _make_parser_test(LEXER, PARSER): @unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated") def test_ranged_repeat_large(self): # Large is currently arbitrarily chosen to be large than 20 - g = u"""!start: "A"~30 + g = u"""!start: "A"~60 """ l = _Lark(g) self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") - self.assertEqual(l.parse(u'A' * 30), Tree('start', ["A"] * 30)) - self.assertRaises(ParseError, l.parse, u'A' * 29) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 31) + self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) + self.assertRaises(ParseError, l.parse, u'A' * 59) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61) - g = u"""!start: "A"~0..100 + g = u"""!start: "A"~15..100 """ l = _Lark(g) - self.assertEqual(l.parse(u''), Tree('start', [])) - self.assertEqual(l.parse(u'A'), Tree('start', ['A'])) - self.assertEqual(l.parse(u'A' * 100), Tree('start', ['A'] * 100)) - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 101) + for i in range(0, 110): + if 15 <= i <= 100: + self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) + else: + self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * i) # 8191 is a Mersenne prime g = u"""start: "A"~8191 From 3436b3388546cfd8c802ab09d59d6e9e82cb1c7e Mon Sep 17 00:00:00 2001 From: MegaIng Date: Mon, 26 Jul 2021 21:13:29 +0200 Subject: [PATCH 24/31] Refactor small_factors --- lark/utils.py | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/lark/utils.py b/lark/utils.py index f447b9e..610d160 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -379,18 +379,12 @@ def small_factors(n): Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change """ assert n >= 0 - if n < SMALL_FACTOR_THRESHOLD: + if n <= SMALL_FACTOR_THRESHOLD: return [(n, 0)] # While this does not provide an optimal solution, it produces a pretty good one. # See above comment and PR #949 for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): - b = n % a - if a + b > SMALL_FACTOR_THRESHOLD: - continue - r = n // a - assert r * a + b == n # Sanity check - if r <= SMALL_FACTOR_THRESHOLD: - return [(r, 0), (a, b)] - else: + r, b = divmod(n, a) + if a + b <= SMALL_FACTOR_THRESHOLD: return small_factors(r) + [(a, b)] assert False, "Failed to factorize %s" % n From 90460f31d98da5a08ec14c0ad7062756dcc82668 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 27 Jul 2021 11:45:01 +0300 Subject: [PATCH 25/31] Refactored PR #949 and edited the comments/docstrings --- lark/load_grammar.py | 100 ++++++++++++++++++++----------------------- lark/utils.py | 21 ++++----- tests/test_parser.py | 8 ++-- 3 files changed, 60 insertions(+), 69 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 36f6e2c..d1d06cc 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -174,21 +174,15 @@ RULES = { 'literal': ['REGEXP', 'STRING'], } -REPEAT_BREAK_THRESHOLD = 50 + +# Value 5 keeps the number of states in the lalr parser somewhat minimal +# It isn't optimal, but close to it. 
See PR #949 +SMALL_FACTOR_THRESHOLD = 5 # The Threshold whether repeat via ~ are split up into different rules # 50 is chosen since it keeps the number of states low and therefore lalr analysis time low, # while not being to overaggressive and unnecessarily creating rules that might create shift/reduce conflicts. -# For a grammar of the form start: "A"~0..N, these are the timing stats: -# N t -# 10 0.000 -# 20 0.004 -# 30 0.016 -# 40 0.049 -# 50 0.109 -# 60 0.215 -# 70 0.383 -# 80 0.631 # (See PR #949) +REPEAT_BREAK_THRESHOLD = 50 @inline_args @@ -224,17 +218,16 @@ class EBNF_to_BNF(Transformer_InPlace): return self._add_rule(expr, new_name, tree) def _add_repeat_rule(self, a, b, target, atom): - """ - When target matches n times atom - This builds a rule that matches atom (a*n + b) times + """Generate a rule that repeats target ``a`` times, and repeats atom ``b`` times. - The rule is of the form: + When called recursively (into target), it repeats atom for x(n) times, where: + x(0) = 1 + x(n) = a(n) * x(n-1) + b - The rules are of the form: (Example a = 3, b = 4) + Example rule when a=3, b=4: - new_rule: target target target atom atom atom atom + new_rule: target target target atom atom atom atom - e.g. we use target * a and atom * b """ key = (a, b, target, atom) try: @@ -245,27 +238,29 @@ class EBNF_to_BNF(Transformer_InPlace): return self._add_rule(key, new_name, tree) def _add_repeat_opt_rule(self, a, b, target, target_opt, atom): - """ + """Creates a rule that matches atom 0 to (a*n+b)-1 times. + When target matches n times atom, and target_opt 0 to n-1 times target_opt, - This builds a rule that matches atom 0 to (a*n+b)-1 times. - The created rule will not have any shift/reduce conflicts so that it can be used with lalr - The rules are of the form: (Example a = 3, b = 4) + First we generate target * i followed by target_opt, for i from 0 to a-1 + These match 0 to n*a - 1 times atom + + Then we generate target * a followed by atom * i, for i from 0 to b-1 + These match n*a to n*a + b-1 times atom - new_rule: target_opt - | target target_opt - | target target target_opt + The created rule will not have any shift/reduce conflicts so that it can be used with lalr - | target target target - | target target target atom - | target target target atom atom - | target target target atom atom atom + Example rule when a=3, b=4: - First we generate target * i followed by target_opt for i from 0 to a-1 - These match 0 to n*a - 1 times atom + new_rule: target_opt + | target target_opt + | target target target_opt + + | target target target + | target target target atom + | target target target atom atom + | target target target atom atom atom - Then we generate target * a followed by atom * i for i from 0 to b-1 - These match n*a to n*a + b-1 times atom """ key = (a, b, target, atom, "opt") try: @@ -273,38 +268,39 @@ class EBNF_to_BNF(Transformer_InPlace): except KeyError: new_name = self._name_rule('repeat_a%d_b%d_opt' % (a, b)) tree = ST('expansions', [ - ST('expansion', [target] * i + [target_opt]) - for i in range(a) + ST('expansion', [target]*i + [target_opt]) for i in range(a) ] + [ - ST('expansion', [target] * a + [atom] * i) - for i in range(b) + ST('expansion', [target]*a + [atom]*i) for i in range(b) ]) return self._add_rule(key, new_name, tree) def _generate_repeats(self, rule, mn, mx): + """Generates a rule tree that repeats ``rule`` exactly between ``mn`` to ``mx`` times. """ - We treat rule~mn..mx as rule~mn rule~0..(diff=mx-mn). 
- We then use small_factors to split up mn and diff up into values [(a, b), ...] - This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt - to generate a complete rule/expression that matches the corresponding number of repeats - """ - mn_factors = small_factors(mn) + # For a small number of repeats, we can take the naive approach + if mx < REPEAT_BREAK_THRESHOLD: + return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) + + # For large repeat values, we break the repetition into sub-rules. + # We treat ``rule~mn..mx`` as ``rule~mn rule~0..(diff=mx-mn)``. + # We then use small_factors to split up mn and diff up into values [(a, b), ...] + # This values are used with the help of _add_repeat_rule and _add_repeat_rule_opt + # to generate a complete rule/expression that matches the corresponding number of repeats mn_target = rule - for a, b in mn_factors: + for a, b in small_factors(mn, SMALL_FACTOR_THRESHOLD): mn_target = self._add_repeat_rule(a, b, mn_target, rule) if mx == mn: return mn_target diff = mx - mn + 1 # We add one because _add_repeat_opt_rule generates rules that match one less - diff_factors = small_factors(diff) + diff_factors = small_factors(diff, SMALL_FACTOR_THRESHOLD) diff_target = rule # Match rule 1 times diff_opt_target = ST('expansion', []) # match rule 0 times (e.g. up to 1 -1 times) for a, b in diff_factors[:-1]: - new_diff_target = self._add_repeat_rule(a, b, diff_target, rule) diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) - diff_target = new_diff_target + diff_target = self._add_repeat_rule(a, b, diff_target, rule) - a, b = diff_factors[-1] # We do the last on separately since we don't need to call self._add_repeat_rule + a, b = diff_factors[-1] diff_opt_target = self._add_repeat_opt_rule(a, b, diff_target, diff_opt_target, rule) return ST('expansions', [ST('expansion', [mn_target] + [diff_opt_target])]) @@ -333,11 +329,9 @@ class EBNF_to_BNF(Transformer_InPlace): mn, mx = map(int, args) if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) - # For small number of repeats, we don't need to build new rules. - if mx > REPEAT_BREAK_THRESHOLD: - return self._generate_repeats(rule, mn, mx) - else: - return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx + 1)]) + + return self._generate_repeats(rule, mn, mx) + assert False, op def maybe(self, rule): diff --git a/lark/utils.py b/lark/utils.py index 610d160..2938591 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -361,14 +361,11 @@ def _serialize(value, memo): return value -# Value 5 keeps the number of states in the lalr parser somewhat minimal -# It isn't optimal, but close to it. See PR #949 -SMALL_FACTOR_THRESHOLD = 5 -def small_factors(n): +def small_factors(n, max_factor): """ - Splits n up into smaller factors and summands <= SMALL_FACTOR_THRESHOLD. + Splits n up into smaller factors and summands <= max_factor. Returns a list of [(a, b), ...] so that the following code returns n: @@ -376,15 +373,15 @@ def small_factors(n): for a, b in values: n = n * a + b - Currently, we also keep a + b <= SMALL_FACTOR_THRESHOLD, but that might change + Currently, we also keep a + b <= max_factor, but that might change """ assert n >= 0 - if n <= SMALL_FACTOR_THRESHOLD: + assert max_factor > 2 + if n <= max_factor: return [(n, 0)] - # While this does not provide an optimal solution, it produces a pretty good one. 
- # See above comment and PR #949 - for a in range(SMALL_FACTOR_THRESHOLD, 1, -1): + + for a in range(max_factor, 1, -1): r, b = divmod(n, a) - if a + b <= SMALL_FACTOR_THRESHOLD: - return small_factors(r) + [(a, b)] + if a + b <= max_factor: + return small_factors(r, max_factor) + [(a, b)] assert False, "Failed to factorize %s" % n diff --git a/tests/test_parser.py b/tests/test_parser.py index b55f848..ffb1d8f 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2226,7 +2226,7 @@ def _make_parser_test(LEXER, PARSER): self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') - @unittest.skipIf(PARSER == 'cyk', "For large number of repeats, empty rules might be generated") + @unittest.skipIf(PARSER != 'lalr', "We only need to test rule generation, we know BNF is solid on all parsers") def test_ranged_repeat_large(self): # Large is currently arbitrarily chosen to be large than 20 g = u"""!start: "A"~60 @@ -2244,15 +2244,15 @@ def _make_parser_test(LEXER, PARSER): if 15 <= i <= 100: self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) else: - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * i) + self.assertRaises(UnexpectedInput, l.parse, u'A' * i) # 8191 is a Mersenne prime g = u"""start: "A"~8191 """ l = _Lark(g) self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8190) - self.assertRaises((UnexpectedToken, UnexpectedInput), l.parse, u'A' * 8192) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX From cf61f78509e52b44c4fbbaf40f42a688f754342b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 27 Jul 2021 15:08:29 +0300 Subject: [PATCH 26/31] Tests: Moved repeat operator tests to test_grammar --- tests/test_grammar.py | 49 ++++++++++++++++++++++++++++++++++++++++++- tests/test_parser.py | 49 ------------------------------------------- 2 files changed, 48 insertions(+), 50 deletions(-) diff --git a/tests/test_grammar.py b/tests/test_grammar.py index a643117..3ae65f2 100644 --- a/tests/test_grammar.py +++ b/tests/test_grammar.py @@ -3,7 +3,7 @@ from __future__ import absolute_import import sys from unittest import TestCase, main -from lark import Lark, Token, Tree +from lark import Lark, Token, Tree, ParseError, UnexpectedInput from lark.load_grammar import GrammarError, GRAMMAR_ERRORS, find_grammar_errors from lark.load_grammar import FromPackageLoader @@ -198,6 +198,53 @@ class TestGrammar(TestCase): x = find_grammar_errors(text) assert [e.line for e, _s in find_grammar_errors(text)] == [2, 6] + def test_ranged_repeat_terms(self): + g = u"""!start: AAA + AAA: "A"~3 + """ + l = Lark(g, parser='lalr') + self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA') + + g = u"""!start: AABB CC + AABB: "A"~0..2 "B"~2 + CC: "C"~1..2 + """ + l = Lark(g, parser='lalr') + self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC'])) + self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C'])) + self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC'])) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB') + 
self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') + + def test_ranged_repeat_large(self): + g = u"""!start: "A"~60 + """ + l = Lark(g, parser='lalr') + self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") + self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) + self.assertRaises(ParseError, l.parse, u'A' * 59) + self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61) + + g = u"""!start: "A"~15..100 + """ + l = Lark(g, parser='lalr') + for i in range(0, 110): + if 15 <= i <= 100: + self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) + else: + self.assertRaises(UnexpectedInput, l.parse, u'A' * i) + + # 8191 is a Mersenne prime + g = u"""start: "A"~8191 + """ + l = Lark(g, parser='lalr') + self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) + self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) if __name__ == '__main__': diff --git a/tests/test_parser.py b/tests/test_parser.py index ffb1d8f..9eb7b26 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -2204,55 +2204,6 @@ def _make_parser_test(LEXER, PARSER): self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') - def test_ranged_repeat_terms(self): - g = u"""!start: AAA - AAA: "A"~3 - """ - l = _Lark(g) - self.assertEqual(l.parse(u'AAA'), Tree('start', ["AAA"])) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AA') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAA') - - g = u"""!start: AABB CC - AABB: "A"~0..2 "B"~2 - CC: "C"~1..2 - """ - l = _Lark(g) - self.assertEqual(l.parse(u'AABBCC'), Tree('start', ['AABB', 'CC'])) - self.assertEqual(l.parse(u'BBC'), Tree('start', ['BB', 'C'])) - self.assertEqual(l.parse(u'ABBCC'), Tree('start', ['ABB', 'CC'])) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAABBB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB') - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB') - - @unittest.skipIf(PARSER != 'lalr', "We only need to test rule generation, we know BNF is solid on all parsers") - def test_ranged_repeat_large(self): - # Large is currently arbitrarily chosen to be large than 20 - g = u"""!start: "A"~60 - """ - l = _Lark(g) - self.assertGreater(len(l.rules), 1, "Expected that more than one rule will be generated") - self.assertEqual(l.parse(u'A' * 60), Tree('start', ["A"] * 60)) - self.assertRaises(ParseError, l.parse, u'A' * 59) - self.assertRaises((ParseError, UnexpectedInput), l.parse, u'A' * 61) - - g = u"""!start: "A"~15..100 - """ - l = _Lark(g) - for i in range(0, 110): - if 15 <= i <= 100: - self.assertEqual(l.parse(u'A' * i), Tree('start', ['A']*i)) - else: - self.assertRaises(UnexpectedInput, l.parse, u'A' * i) - - # 8191 is a Mersenne prime - g = u"""start: "A"~8191 - """ - l = _Lark(g) - self.assertEqual(l.parse(u'A' * 8191), Tree('start', [])) - self.assertRaises(UnexpectedInput, l.parse, u'A' * 8190) - self.assertRaises(UnexpectedInput, l.parse, u'A' * 8192) @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now") # TODO XXX From e8d5e7e30db5e728cd4521308036aa55730f9957 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 7 Aug 2021 11:11:10 +0300 Subject: [PATCH 27/31] Docs: Updated IDE link --- README.md | 2 +- docs/index.rst | 2 +- 2 files changed, 2 insertions(+), 2 
deletions(-) diff --git a/README.md b/README.md index 8ec22ed..f4335d0 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h - [Documentation @readthedocs](https://lark-parser.readthedocs.io/) - [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf) -- [Online IDE (very basic)](https://lark-parser.github.io/lark/ide/app.html) +- [Online IDE (very basic)](https://lark-parser.github.io/lark/ide) - [Tutorial](/docs/json_tutorial.md) for writing a JSON parser. - Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - [Gitter chat](https://gitter.im/lark-parser/Lobby) diff --git a/docs/index.rst b/docs/index.rst index 39ecd5a..c4e8be6 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -113,7 +113,7 @@ Resources .. _Examples: https://github.com/lark-parser/lark/tree/master/examples .. _Third-party examples: https://github.com/ligurio/lark-grammars -.. _Online IDE: https://lark-parser.github.io/lark/ide/app.html +.. _Online IDE: https://lark-parser.github.io/lark/ide .. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/ .. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html .. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf From 41b2ba0d3a37757c30e3010763a516e822eaba87 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 7 Aug 2021 11:13:31 +0300 Subject: [PATCH 28/31] Docs: Updated IDE links again --- README.md | 2 +- docs/index.rst | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index f4335d0..82f6148 100644 --- a/README.md +++ b/README.md @@ -26,7 +26,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h - [Documentation @readthedocs](https://lark-parser.readthedocs.io/) - [Cheatsheet (PDF)](/docs/_static/lark_cheatsheet.pdf) -- [Online IDE (very basic)](https://lark-parser.github.io/lark/ide) +- [Online IDE](https://lark-parser.github.io/ide) - [Tutorial](/docs/json_tutorial.md) for writing a JSON parser. - Blog post: [How to write a DSL with Lark](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - [Gitter chat](https://gitter.im/lark-parser/Lobby) diff --git a/docs/index.rst b/docs/index.rst index c4e8be6..e8bd6b2 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -113,7 +113,7 @@ Resources .. _Examples: https://github.com/lark-parser/lark/tree/master/examples .. _Third-party examples: https://github.com/ligurio/lark-grammars -.. _Online IDE: https://lark-parser.github.io/lark/ide +.. _Online IDE: https://lark-parser.github.io/ide .. _How to write a DSL: http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/ .. _Program Synthesis is Possible: https://www.cs.cornell.edu/~asampson/blog/minisynth.html .. _Cheatsheet (PDF): _static/lark_cheatsheet.pdf From f3826ed3d16d98d1a6619ca8bc8a0d0b9493d596 Mon Sep 17 00:00:00 2001 From: Louis Sautier Date: Tue, 10 Aug 2021 00:02:08 +0200 Subject: [PATCH 29/31] Remove ineffective description-file key from setup.cfg Otherwise, setuptools warns that: "UserWarning: Usage of dash-separated 'description-file' will not be supported in future versions. Please use the underscore name 'description_file' instead" This key doesn't seem to do anything unless you use pbr. 
--- setup.cfg | 2 -- 1 file changed, 2 deletions(-) diff --git a/setup.cfg b/setup.cfg index 6ddead9..6d71f28 100644 --- a/setup.cfg +++ b/setup.cfg @@ -5,6 +5,4 @@ zip_safe= universal = 1 [metadata] -description-file = README.md license_file = LICENSE - From 3269605211f92942296257e34722a979801c204c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20K=C3=A4ufl?= Date: Tue, 17 Aug 2021 10:46:37 +0200 Subject: [PATCH 30/31] Remove config for Travis CI --- .travis.yml | 15 --------------- README.md | 2 +- tox.ini | 11 ----------- 3 files changed, 1 insertion(+), 27 deletions(-) delete mode 100644 .travis.yml diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 6448cc8..0000000 --- a/.travis.yml +++ /dev/null @@ -1,15 +0,0 @@ -dist: xenial -language: python -python: - - "2.7" - - "3.4" - - "3.5" - - "3.6" - - "3.7" - - "3.8" - - "3.9-dev" - - "pypy2.7-6.0" - - "pypy3.5-6.0" -install: pip install tox-travis -script: - - tox diff --git a/README.md b/README.md index 82f6148..70be4fe 100644 --- a/README.md +++ b/README.md @@ -37,7 +37,7 @@ Most importantly, Lark will save you time and prevent you from getting parsing h Lark has no dependencies. -[![Build Status](https://travis-ci.org/lark-parser/lark.svg?branch=master)](https://travis-ci.org/lark-parser/lark) +[![Tests](https://github.com/lark-parser/lark/actions/workflows/tests.yml/badge.svg)](https://github.com/lark-parser/lark/actions/workflows/tests.yml) ### Syntax Highlighting diff --git a/tox.ini b/tox.ini index ef19e2c..cef423b 100644 --- a/tox.ini +++ b/tox.ini @@ -2,17 +2,6 @@ envlist = py27, py34, py35, py36, py37, py38, py39, pypy, pypy3 skip_missing_interpreters=true -[travis] -2.7 = py27 -3.4 = py34 -3.5 = py35 -3.6 = py36 -3.7 = py37 -3.8 = py38 -3.9 = py39 -pypy = pypy -pypy3 = pypy3 - [testenv] whitelist_externals = git deps = From 8f73a58a5446a2ffb078905af8acd11c358d3425 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michael=20K=C3=A4ufl?= Date: Tue, 17 Aug 2021 10:49:20 +0200 Subject: [PATCH 31/31] Run tests against Python 3.10 --- .github/workflows/tests.yml | 2 +- tox.ini | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 1630c8b..c7b9286 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -6,7 +6,7 @@ jobs: runs-on: ubuntu-latest strategy: matrix: - python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9.0-rc - 3.9, pypy2, pypy3] + python-version: [2.7, 3.5, 3.6, 3.7, 3.8, 3.9, 3.10.0-rc - 3.10, pypy2, pypy3] steps: - uses: actions/checkout@v2 diff --git a/tox.ini b/tox.ini index ef19e2c..842ed2b 100644 --- a/tox.ini +++ b/tox.ini @@ -1,5 +1,5 @@ [tox] -envlist = py27, py34, py35, py36, py37, py38, py39, pypy, pypy3 +envlist = py27, py34, py35, py36, py37, py38, py39, py310, pypy, pypy3 skip_missing_interpreters=true [travis]