From 4d8301f73cbea06a6182e02aa120ec7c563a88db Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 4 May 2019 20:13:39 +0300 Subject: [PATCH 001/132] Version Bump --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index 6328784..7fd92ee 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une from .lexer import Token from .lark import Lark -__version__ = "0.7.0" +__version__ = "0.7.1" From 28e571f1c68984c72b582b12c00a11168f0b9d94 Mon Sep 17 00:00:00 2001 From: Paul Vinciguerra Date: Mon, 6 May 2019 09:20:21 -0400 Subject: [PATCH 002/132] Fix DeprecationWarning in lalr_analysis.py Under python 3.3+, logging.warn is deprecated. Use logging.warning instead. Fixes: /Library/Frameworks/Python.framework/Versions/3.6/lib/python3.6/site-packages/lark/parsers/lalr_analysis.py:87: DeprecationWarning: The 'warn' function is deprecated, use 'warning' instead --- lark/parsers/lalr_analysis.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index cceaa45..54a4041 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -112,9 +112,9 @@ class LALR_Analyzer(GrammarAnalyzer): for k, v in lookahead.items(): if len(v) > 1: if self.debug: - logging.warn("Shift/reduce conflict for terminal %s: (resolving as shift)", k.name) + logging.warning("Shift/reduce conflict for terminal %s: (resolving as shift)", k.name) for act, arg in v: - logging.warn(' * %s: %s', act, arg) + logging.warning(' * %s: %s', act, arg) for x in v: # XXX resolving shift/reduce into shift, like PLY # Give a proper warning From 09afcfcfc7b77efa7e6001641def38454afcace4 Mon Sep 17 00:00:00 2001 From: fbindel Date: Thu, 9 May 2019 14:41:41 +0200 Subject: [PATCH 003/132] Allow any graph attribute in `pydot__tree_to_png`. Keeping the explicit `rankdir="LR"` as default, add `kwargs` to `pydot__tree_to_png` and `pydot.Dot` so that all graphviz attributes are available for the graph. --- lark/tree.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lark/tree.py b/lark/tree.py index 3b845d6..fd0038e 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -141,17 +141,19 @@ class SlottedTree(Tree): __slots__ = 'data', 'children', 'rule', '_meta' -def pydot__tree_to_png(tree, filename, rankdir="LR"): +def pydot__tree_to_png(tree, filename, rankdir="LR", **kwargs): """Creates a colorful image that represents the tree (data+children, without meta) Possible values for `rankdir` are "TB", "LR", "BT", "RL", corresponding to directed graphs drawn from top to bottom, from left to right, from bottom to - top, and from right to left, respectively. See: - https://www.graphviz.org/doc/info/attrs.html#k:rankdir + top, and from right to left, respectively. + + `kwargs` can be any graph attribute (e. g. `dpi=200`). For a list of + possible attributes, see https://www.graphviz.org/doc/info/attrs.html. 
""" import pydot - graph = pydot.Dot(graph_type='digraph', rankdir=rankdir) + graph = pydot.Dot(graph_type='digraph', rankdir=rankdir, **kwargs) i = [0] From 0f9dfdd6237ea093ce038fa18de3e1764b89a6b1 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 11 May 2019 09:42:16 +0300 Subject: [PATCH 004/132] Re-implemented CustomLexer after regression (Issue #377) --- lark/parser_frontends.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index f81001c..ab69d01 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -65,7 +65,7 @@ class WithLexer(Serialize): inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) inst.init_lexer() return inst - + def _serialize(self, data, memo): data['parser'] = data['parser'].serialize(memo) @@ -107,11 +107,12 @@ class LALR_ContextualLexer(LALR_WithLexer): ###} class LALR_CustomLexer(LALR_WithLexer): - def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None): - pass # TODO - - def init_lexer(self): + def __init__(self, lexer_cls, lexer_conf, parser_conf, *, options=None): self.lexer = lexer_cls(self.lexer_conf) + debug = options.debug if options else False + self.parser = LALR_Parser(parser_conf, debug=debug) + WithLexer.__init__(self, lexer_conf, parser_conf, options) + def tokenize_text(text): line = 1 From e5868415ebc9f9a985549f82ff8137806d78450d Mon Sep 17 00:00:00 2001 From: Mostafa Razavi Date: Sun, 12 May 2019 19:32:50 +0200 Subject: [PATCH 005/132] Implement embedded in-place transformers. See #378. As discussed in issue #378, when an embedded transformer (that is, one passed to the Lark class using the transformer argument), is an inplace transformer (either a subclass of Transformer_InPlace, or with the @v_args(tree=True) decorator), the in-place transformer was not working correctly and in-fact Lark used it like a normal non-in-place transformer, expecting it to return the transformed value. --- lark/parse_tree_builder.py | 12 ++++++++++++ tests/test_parser.py | 27 ++++++++++++++++++++++++++- 2 files changed, 38 insertions(+), 1 deletion(-) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 977c371..550bc17 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -2,6 +2,7 @@ from .exceptions import GrammarError from .lexer import Token from .tree import Tree from .visitors import InlineTransformer # XXX Deprecated +from .visitors import Transformer_InPlace ###{standalone from functools import partial, wraps @@ -193,6 +194,15 @@ def ptb_inline_args(func): return func(*children) return f +def inplace_transformer(func): + @wraps(func) + def f(children): + # function name in a Transformer is a rule name. + tree = Tree(func.__name__, children) + func(tree) + return tree + return f + class ParseTreeBuilder: def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False): self.tree_class = tree_class @@ -231,6 +241,8 @@ class ParseTreeBuilder: # XXX InlineTransformer is deprecated! 
if getattr(f, 'inline', False) or isinstance(transformer, InlineTransformer): f = ptb_inline_args(f) + elif hasattr(f, 'whole_tree') or isinstance(transformer, Transformer_InPlace): + f = inplace_transformer(f) except AttributeError: f = partial(self.tree_class, user_callback_name) diff --git a/tests/test_parser.py b/tests/test_parser.py index ce8b7d6..0fddf14 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -20,7 +20,7 @@ logging.basicConfig(level=logging.INFO) from lark.lark import Lark from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters from lark.tree import Tree -from lark.visitors import Transformer +from lark.visitors import Transformer, Transformer_InPlace, v_args from lark.grammar import Rule from lark.lexer import TerminalDef @@ -150,6 +150,31 @@ class TestParsers(unittest.TestCase): r = g.parse("xx") self.assertEqual( r.children, [""] ) + def test_embedded_transformer_inplace(self): + class T1(Transformer_InPlace): + def a(self, tree): + assert isinstance(tree, Tree) + tree.children.append("tested") + + @v_args(tree=True) + class T2(Transformer): + def a(self, tree): + assert isinstance(tree, Tree) + tree.children.append("tested") + + class T3(Transformer): + @v_args(tree=True) + def a(self, tree): + assert isinstance(tree, Tree) + tree.children.append("tested") + + for t in [T1(), T2(), T3()]: + g = Lark("""start: a + a : "x" + """, parser='lalr', transformer=t) + r = g.parse("x") + first, = r.children + self.assertEqual(first.children, ["tested"]) def test_alias(self): Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """) From f71df240b65c8425b6b10b4beb60fdee92f74cf6 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 12 May 2019 21:34:21 +0300 Subject: [PATCH 006/132] Removed Python2 incompatibility --- lark/parser_frontends.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index ab69d01..0634814 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -107,7 +107,7 @@ class LALR_ContextualLexer(LALR_WithLexer): ###} class LALR_CustomLexer(LALR_WithLexer): - def __init__(self, lexer_cls, lexer_conf, parser_conf, *, options=None): + def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None): self.lexer = lexer_cls(self.lexer_conf) debug = options.debug if options else False self.parser = LALR_Parser(parser_conf, debug=debug) From a9106df824133f748e33544c6128cc355bc03dab Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 12 May 2019 22:11:13 +0300 Subject: [PATCH 007/132] Corrected thee Transformer's whole_tree interface, for both internal and external use --- lark/parse_tree_builder.py | 3 +-- lark/visitors.py | 2 +- tests/test_parser.py | 34 +++++++++++++++++++++++++++------- 3 files changed, 29 insertions(+), 10 deletions(-) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 550bc17..b54b6e8 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -199,8 +199,7 @@ def inplace_transformer(func): def f(children): # function name in a Transformer is a rule name. 
tree = Tree(func.__name__, children) - func(tree) - return tree + return func(tree) return f class ParseTreeBuilder: diff --git a/lark/visitors.py b/lark/visitors.py index 53847f9..4a0f639 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -36,7 +36,7 @@ class Transformer: return f(*children) elif getattr(f, 'whole_tree', False): if new_children is not None: - raise NotImplementedError("Doesn't work with the base Transformer class") + tree.children = new_children return f(tree) else: return f(children) diff --git a/tests/test_parser.py b/tests/test_parser.py index 0fddf14..1cf702d 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -151,30 +151,50 @@ class TestParsers(unittest.TestCase): self.assertEqual( r.children, [""] ) def test_embedded_transformer_inplace(self): + @v_args(tree=True) class T1(Transformer_InPlace): def a(self, tree): - assert isinstance(tree, Tree) + assert isinstance(tree, Tree), tree tree.children.append("tested") + return tree + + def b(self, tree): + return Tree(tree.data, tree.children + ['tested2']) @v_args(tree=True) class T2(Transformer): def a(self, tree): assert isinstance(tree, Tree) tree.children.append("tested") + return tree + + def b(self, tree): + return Tree(tree.data, tree.children + ['tested2']) class T3(Transformer): @v_args(tree=True) def a(self, tree): assert isinstance(tree, Tree) tree.children.append("tested") + return tree + + @v_args(tree=True) + def b(self, tree): + return Tree(tree.data, tree.children + ['tested2']) for t in [T1(), T2(), T3()]: - g = Lark("""start: a - a : "x" - """, parser='lalr', transformer=t) - r = g.parse("x") - first, = r.children - self.assertEqual(first.children, ["tested"]) + for internal in [False, True]: + g = Lark("""start: a b + a : "x" + b : "y" + """, parser='lalr', transformer=t if internal else None) + r = g.parse("xy") + if not internal: + r = t.transform(r) + + a, b = r.children + self.assertEqual(a.children, ["tested"]) + self.assertEqual(b.children, ["tested2"]) def test_alias(self): Lark("""start: ["a"] "b" ["c"] "e" ["f"] ["g"] ["h"] "x" -> d """) From e79689dce7eabd4fcaaedc1d50927725e76a53a4 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 19 May 2019 13:30:25 +0300 Subject: [PATCH 008/132] Remove unused rules (Issue #384) --- lark/lark.py | 2 +- lark/load_grammar.py | 16 ++++++++++++++-- lark/reconstruct.py | 2 +- tests/test_parser.py | 13 +++++++++++++ 4 files changed, 29 insertions(+), 4 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index 9bb49a3..87f7137 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -200,7 +200,7 @@ class Lark(Serialize): self.grammar = load_grammar(grammar, self.source) # Compile the EBNF grammar into BNF - self.terminals, self.rules, self.ignore_tokens = self.grammar.compile() + self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start) # If the user asked to invert the priorities, negate them all here. # This replaces the old 'resolve__antiscore_sum' option. 
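Passing `self.options.start` into `Grammar.compile` lets the grammar compiler drop rules that are unreachable from the start symbol; the load_grammar.py hunk below implements this as a fixed-point filter. A minimal, self-contained sketch of the same idea, using plain tuples instead of Lark's Rule objects (the names here are illustrative, not Lark's API):

```python
def filter_unused_rules(rules, start):
    """rules: list of (origin, expansion) pairs, where expansion is a list of rule names."""
    while True:
        used = {sym for origin, expansion in rules
                    for sym in expansion
                    if sym != origin}           # symbols referenced by a rule other than themselves
        kept = [(o, e) for o, e in rules if o == start or o in used]
        if len(kept) == len(rules):             # fixed point reached: nothing more to drop
            return kept
        rules = kept

# e.g. filter_unused_rules([('start', ['x']), ('x', []), ('unused', ['x'])], 'start')
# keeps ('start', ...) and ('x', ...); the 'unused' rule is removed.
```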
diff --git a/lark/load_grammar.py b/lark/load_grammar.py index bfd8585..281dc5b 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -205,7 +205,7 @@ class EBNF_to_BNF(Transformer_InPlace): keep_all_tokens = self.rule_options and self.rule_options.keep_all_tokens def will_not_get_removed(sym): - if isinstance(sym, NonTerminal): + if isinstance(sym, NonTerminal): return not sym.name.startswith('_') if isinstance(sym, Terminal): return keep_all_tokens or not sym.filter_out @@ -465,7 +465,7 @@ class Grammar: self.rule_defs = rule_defs self.ignore = ignore - def compile(self): + def compile(self, start): # We change the trees in-place (to support huge grammars) # So deepcopy allows calling compile more than once. term_defs = deepcopy(list(self.term_defs)) @@ -546,6 +546,18 @@ class Grammar: # Remove duplicates compiled_rules = list(set(compiled_rules)) + + # Filter out unused rules + while True: + c = len(compiled_rules) + used_rules = {s for r in compiled_rules + for s in r.expansion + if isinstance(s, NonTerminal) + and s != r.origin} + compiled_rules = [r for r in compiled_rules if r.origin.name==start or r.origin in used_rules] + if len(compiled_rules) == c: + break + # Filter out unused terminals used_terms = {t.name for r in compiled_rules for t in r.expansion diff --git a/lark/reconstruct.py b/lark/reconstruct.py index 2800840..c446913 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -69,7 +69,7 @@ class MakeMatchTree: class Reconstructor: def __init__(self, parser): # XXX TODO calling compile twice returns different results! - tokens, rules, _grammar_extra = parser.grammar.compile() + tokens, rules, _grammar_extra = parser.grammar.compile(parser.options.start) self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}) self.rules = list(self._build_recons_rules(rules)) diff --git a/tests/test_parser.py b/tests/test_parser.py index 1cf702d..d582878 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1493,6 +1493,19 @@ def _make_parser_test(LEXER, PARSER): parser.parse(r'"That" "And a \"b"') + + def test_meddling_unused(self): + "Unless 'unused' is removed, LALR analysis will fail on reduce-reduce collision" + + grammar = """ + start: EKS* x + x: EKS + unused: x* + EKS: "x" + """ + parser = _Lark(grammar) + + @unittest.skipIf(PARSER!='lalr', "Serialize currently only works for LALR parsers (though it should be easy to extend)") def test_serialize(self): grammar = """ From 7b43742afd2c4953f34ad68f7e6aae417c24b22f Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Thu, 23 May 2019 01:05:07 +0300 Subject: [PATCH 009/132] Fixed IMAG_NUMBER in the Python3 grammar (Issue #387) --- examples/python3.lark | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/python3.lark b/examples/python3.lark index bfd2a4c..3f39f9f 100644 --- a/examples/python3.lark +++ b/examples/python3.lark @@ -178,7 +178,7 @@ HEX_NUMBER.2: /0x[\da-f]*/i OCT_NUMBER.2: /0o[0-7]*/i BIN_NUMBER.2 : /0b[0-1]*/i FLOAT_NUMBER.2: /((\d+\.\d*|\.\d+)(e[-+]?\d+)?|\d+(e[-+]?\d+))/i -IMAG_NUMBER.2: /\d+j|${FLOAT_NUMBER}j/i +IMAG_NUMBER.2: /\d+j/i | FLOAT_NUMBER "j"i %ignore /[\t \f]+/ // WS %ignore /\\[\t \f]*\r?\n/ // LINE_CONT From 8e81dc00619c594922cf305f9d56c54cd5c15275 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 27 May 2019 14:40:11 +0300 Subject: [PATCH 010/132] Update "Projects using Lark" --- README.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index bc48b48..b9a07cf 100644 --- a/README.md +++ 
b/README.md @@ -132,9 +132,17 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail ### Projects using Lark + - [storyscript](https://github.com/storyscript/storyscript) - The programming language for Application Storytelling + - [tartiflette](https://github.com/dailymotion/tartiflette) - a GraphQL engine by Dailymotion. Lark is used to parse the GraphQL schemas definitions. - [mappyfile](https://github.com/geographika/mappyfile) - a MapFile parser for working with MapServer configuration + - [synapse](https://github.com/vertexproject/synapse) - an intelligence analysis platform + - [Command-Block-Assembly](https://github.com/simon816/Command-Block-Assembly) - An assembly language, and C compiler, for Minecraft commands + - [SPFlow](https://github.com/SPFlow/SPFlow) - Library for Sum-Product Networks + - [https://github.com/aiqm/torchani](Accurate Neural Network Potential on PyTorch) - [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer - - [tartiflette](https://github.com/dailymotion/tartiflette) - a GraphQL engine by Dailymotion (Lark is used to parse the GraphQL schemas definitions) + - [required](https://github.com/shezadkhan137/required) - multi-field validation using docstrings + - [miniwdl](https://github.com/chanzuckerberg/miniwdl) - A static analysis toolkit for the Workflow Description Language + Using Lark? Send me a message and I'll add your project! From 335be9d289ebbc6c94adc634e9f7a15ab611942f Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 27 May 2019 14:44:40 +0300 Subject: [PATCH 011/132] Update README.md --- README.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b9a07cf..975b9a4 100644 --- a/README.md +++ b/README.md @@ -134,14 +134,15 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail - [storyscript](https://github.com/storyscript/storyscript) - The programming language for Application Storytelling - [tartiflette](https://github.com/dailymotion/tartiflette) - a GraphQL engine by Dailymotion. Lark is used to parse the GraphQL schemas definitions. + - [Hypothesis](https://github.com/HypothesisWorks/hypothesis) - Library for property-based testing - [mappyfile](https://github.com/geographika/mappyfile) - a MapFile parser for working with MapServer configuration - [synapse](https://github.com/vertexproject/synapse) - an intelligence analysis platform - [Command-Block-Assembly](https://github.com/simon816/Command-Block-Assembly) - An assembly language, and C compiler, for Minecraft commands - [SPFlow](https://github.com/SPFlow/SPFlow) - Library for Sum-Product Networks - - [https://github.com/aiqm/torchani](Accurate Neural Network Potential on PyTorch) - - [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer + - [Torchani](https://github.com/aiqm/torchani) - Accurate Neural Network Potential on PyTorch - [required](https://github.com/shezadkhan137/required) - multi-field validation using docstrings - [miniwdl](https://github.com/chanzuckerberg/miniwdl) - A static analysis toolkit for the Workflow Description Language + - [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer Using Lark? Send me a message and I'll add your project! 
From 9a64d2124be6affe6664ef222e484b95a69aa808 Mon Sep 17 00:00:00 2001 From: David Kemp <19152940+davaya@users.noreply.github.com> Date: Wed, 29 May 2019 13:21:33 -0400 Subject: [PATCH 012/132] Clarify handling of filtered terminals --- docs/tree_construction.md | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/docs/tree_construction.md b/docs/tree_construction.md index 47deab2..f80c743 100644 --- a/docs/tree_construction.md +++ b/docs/tree_construction.md @@ -22,6 +22,23 @@ Lark filters out certain types of terminals by default, considering them punctua - Unnamed regular expressions (like `/[0-9]/`) - Named terminals whose name starts with a letter (like `DIGIT`) + - All terminals concatenated within a terminal + +**Example:** +``` +start: PNAME pname + +PNAME: "(" NAME ")" +pname: "(" NAME ")" + +NAME: /\w+/ +%ignore /\s+/ +``` +Lark will parse "(Hello) (World)" as: + + start + (Hello) + pname World Rules prefixed with `!` will retain all their literals regardless. From 6a14e25f407b81490d5dc4a5a701a23c386e652d Mon Sep 17 00:00:00 2001 From: David Kemp <19152940+davaya@users.noreply.github.com> Date: Wed, 29 May 2019 15:05:10 -0400 Subject: [PATCH 013/132] Update description of terminals --- docs/tree_construction.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/tree_construction.md b/docs/tree_construction.md index f80c743..6b581e0 100644 --- a/docs/tree_construction.md +++ b/docs/tree_construction.md @@ -22,7 +22,8 @@ Lark filters out certain types of terminals by default, considering them punctua - Unnamed regular expressions (like `/[0-9]/`) - Named terminals whose name starts with a letter (like `DIGIT`) - - All terminals concatenated within a terminal + +Note: Terminals composed of literals and other terminals always include the entire match without filtering any part. **Example:** ``` From f814d91f9dc0d9989a9ef413cca0f2622427eb74 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 11 Jun 2019 11:09:16 +0300 Subject: [PATCH 014/132] Removed possibly problematic code (Issue #372) --- lark/load_grammar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 281dc5b..8bda118 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -520,7 +520,7 @@ class Grammar: if alias and name.startswith('_'): raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) - empty_indices = [x==_EMPTY for i, x in enumerate(expansion)] + empty_indices = [x==_EMPTY for x in expansion] if any(empty_indices): exp_options = copy(options) if options else RuleOptions() exp_options.empty_indices = empty_indices From 39a17f1d56fe067d5afd72c114bedd9cbb7eba20 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Fri, 21 Jun 2019 10:04:57 +0300 Subject: [PATCH 015/132] Fixed broken link in docs (Issue #399) --- docs/how_to_use.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/how_to_use.md b/docs/how_to_use.md index c2987df..886b440 100644 --- a/docs/how_to_use.md +++ b/docs/how_to_use.md @@ -10,7 +10,7 @@ This is the recommended process for working with Lark: 3. Try your grammar in Lark against each input sample. Make sure the resulting parse-trees make sense. -4. Use Lark's grammar features to [[shape the tree|Tree Construction]]: Get rid of superfluous rules by inlining them, and use aliases when specific cases need clarification. +4. 
Use Lark's grammar features to [shape the tree](tree_construction.md): Get rid of superfluous rules by inlining them, and use aliases when specific cases need clarification. - You can perform steps 1-4 repeatedly, gradually growing your grammar to include more sentences. From f1e844accdb0dc544e3c92d1571d0c9a7e832765 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 1 Jul 2019 17:07:23 +0300 Subject: [PATCH 016/132] Mid work. Not promising --- lark/common.py | 1 + lark/exceptions.py | 4 +++- lark/lark.py | 3 +++ lark/lexer.py | 4 +++- lark/load_grammar.py | 5 +++-- lark/parsers/cyk.py | 2 +- lark/parsers/earley.py | 2 +- lark/parsers/grammar_analysis.py | 2 +- lark/parsers/lalr_analysis.py | 16 +++++++--------- lark/parsers/lalr_parser.py | 6 +++--- tests/test_parser.py | 9 +++++++++ 11 files changed, 35 insertions(+), 19 deletions(-) diff --git a/lark/common.py b/lark/common.py index e1ec220..7103d14 100644 --- a/lark/common.py +++ b/lark/common.py @@ -20,6 +20,7 @@ class LexerConf(Serialize): class ParserConf: def __init__(self, rules, callbacks, start): + assert isinstance(start, list) self.rules = rules self.callbacks = callbacks self.start = start diff --git a/lark/exceptions.py b/lark/exceptions.py index f781968..4207589 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -52,7 +52,7 @@ class UnexpectedInput(LarkError): class UnexpectedCharacters(LexError, UnexpectedInput): - def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None): + def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) self.line = line @@ -65,6 +65,8 @@ class UnexpectedCharacters(LexError, UnexpectedInput): message += '\n\n' + self.get_context(seq) if allowed: message += '\nExpecting: %s\n' % allowed + if token_history: + message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history) super(UnexpectedCharacters, self).__init__(message) diff --git a/lark/lark.py b/lark/lark.py index 87f7137..e096c55 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -85,6 +85,9 @@ class LarkOptions(Serialize): options[name] = value + if isinstance(options['start'], str): + options['start'] = [options['start']] + self.__dict__['options'] = options assert self.parser in ('earley', 'lalr', 'cyk', None) diff --git a/lark/lexer.py b/lark/lexer.py index bdf635d..3e881f8 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -149,6 +149,7 @@ class _Lex: newline_types = frozenset(newline_types) ignore_types = frozenset(ignore_types) line_ctr = LineCounter() + last_token = None while line_ctr.char_pos < len(stream): lexer = self.lexer @@ -166,6 +167,7 @@ class _Lex: t = lexer.callback[t.type](t) if not isinstance(t, Token): raise ValueError("Callbacks must return a token (returned %r)" % t) + last_token = t yield t else: if type_ in lexer.callback: @@ -180,7 +182,7 @@ class _Lex: break else: allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state) + raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token]) class UnlessCallback: diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 8bda118..f7b1011 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -554,7 +554,8 @@ class Grammar: 
for s in r.expansion if isinstance(s, NonTerminal) and s != r.origin} - compiled_rules = [r for r in compiled_rules if r.origin.name==start or r.origin in used_rules] + used_rules |= {NonTerminal(s) for s in start} + compiled_rules = [r for r in compiled_rules if r.origin in used_rules] if len(compiled_rules) == c: break @@ -690,7 +691,7 @@ class GrammarLoader: callback = ParseTreeBuilder(rules, ST).create_callback() lexer_conf = LexerConf(terminals, ['WS', 'COMMENT']) - parser_conf = ParserConf(rules, callback, 'start') + parser_conf = ParserConf(rules, callback, ['start']) self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf) self.canonize_tree = CanonizeTree() diff --git a/lark/parsers/cyk.py b/lark/parsers/cyk.py index 2121449..52584a7 100644 --- a/lark/parsers/cyk.py +++ b/lark/parsers/cyk.py @@ -89,7 +89,7 @@ class Parser(object): self.orig_rules = {rule: rule for rule in rules} rules = [self._to_rule(rule) for rule in rules] self.grammar = to_cnf(Grammar(rules)) - self.start = NT(start) + self.start = NT(start[0]) def _to_rule(self, lark_rule): """Converts a lark rule, (lhs, rhs, callback, options), to a Rule.""" diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 0518174..3cd0193 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -274,7 +274,7 @@ class Parser: assert i == len(columns)-1 def parse(self, stream, start_symbol=None): - start_symbol = NonTerminal(start_symbol or self.parser_conf.start) + start_symbol = NonTerminal(start_symbol or self.parser_conf.start[0]) columns = [set()] to_scan = set() # The scan buffer. 'Q' in E.Scott's paper. diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 732496c..bdfd92f 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -109,7 +109,7 @@ class GrammarAnalyzer(object): def __init__(self, parser_conf, debug=False): self.debug = debug - rules = parser_conf.rules + [Rule(NonTerminal('$root'), [NonTerminal(parser_conf.start), Terminal('$END')])] + rules = parser_conf.rules + [Rule(NonTerminal('$root'), [NonTerminal(s), Terminal('$END')]) for s in parser_conf.start] self.rules_by_origin = classify(rules, lambda r: r.origin) if len(rules) != len(set(rules)): diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 54a4041..76e44c7 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -29,10 +29,10 @@ Shift = Action('Shift') Reduce = Action('Reduce') class ParseTable: - def __init__(self, states, start_state, end_state): + def __init__(self, states, start_state, end_states): self.states = states self.start_state = start_state - self.end_state = end_state + self.end_states = end_states def serialize(self, memo): tokens = Enumerator() @@ -48,7 +48,7 @@ class ParseTable: 'tokens': tokens.reversed(), 'states': states, 'start_state': self.start_state, - 'end_state': self.end_state, + 'end_states': self.end_states, } @classmethod @@ -59,7 +59,7 @@ class ParseTable: for token, (action, arg) in actions.items()} for state, actions in data['states'].items() } - return cls(states, data['start_state'], data['end_state']) + return cls(states, data['start_state'], data['end_states']) class IntParseTable(ParseTable): @@ -77,8 +77,8 @@ class IntParseTable(ParseTable): start_state = state_to_idx[parse_table.start_state] - end_state = state_to_idx[parse_table.end_state] - return cls(int_states, start_state, end_state) + end_states = [state_to_idx[s] for s in parse_table.end_states] + return cls(int_states, 
start_state, end_states) ###} @@ -130,9 +130,7 @@ class LALR_Analyzer(GrammarAnalyzer): for _ in bfs([self.start_state], step): pass - self.end_state ,= self.end_states - - self._parse_table = ParseTable(self.states, self.start_state, self.end_state) + self._parse_table = ParseTable(self.states, self.start_state, self.end_states) if self.debug: self.parse_table = self._parse_table diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index aea75ca..1d56f5e 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -40,7 +40,7 @@ class _Parser: def __init__(self, parse_table, callbacks): self.states = parse_table.states self.start_state = parse_table.start_state - self.end_state = parse_table.end_state + self.end_states = parse_table.end_states self.callbacks = callbacks def parse(self, seq, set_state=None): @@ -81,7 +81,7 @@ class _Parser: for token in stream: while True: action, arg = get_action(token) - assert arg != self.end_state + assert arg not in self.end_states if action is Shift: state_stack.append(arg) @@ -95,7 +95,7 @@ class _Parser: while True: _action, arg = get_action(token) if _action is Shift: - assert arg == self.end_state + assert arg in self.end_states val ,= value_stack return val else: diff --git a/tests/test_parser.py b/tests/test_parser.py index d582878..bc8388c 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1523,6 +1523,15 @@ def _make_parser_test(LEXER, PARSER): parser3 = Lark.deserialize(d, namespace, m) self.assertEqual(parser3.parse('ABC'), Tree('start', [Tree('b', [])]) ) + def test_multi_start(self): + parser = _Lark(''' + a: "x" + b: "x" "b"? + ''', start=['a', 'b']) + + # parser.parse('acab') + # parser.parse('bcab') + _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() From 71fe87964a547ed56cb7af8befdcb9d863d6bb74 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 1 Jul 2019 17:39:10 +0300 Subject: [PATCH 017/132] Small refactor in grammar analysis --- lark/parsers/grammar_analysis.py | 8 ++++++-- lark/parsers/lalr_analysis.py | 5 ----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 732496c..ab84efb 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -109,7 +109,8 @@ class GrammarAnalyzer(object): def __init__(self, parser_conf, debug=False): self.debug = debug - rules = parser_conf.rules + [Rule(NonTerminal('$root'), [NonTerminal(parser_conf.start), Terminal('$END')])] + root_rule = Rule(NonTerminal('$root'), [NonTerminal(parser_conf.start), Terminal('$END')]) + rules = parser_conf.rules + [root_rule] self.rules_by_origin = classify(rules, lambda r: r.origin) if len(rules) != len(set(rules)): @@ -121,7 +122,10 @@ class GrammarAnalyzer(object): if not (sym.is_term or sym in self.rules_by_origin): raise GrammarError("Using an undefined rule: %s" % sym) # TODO test validation - self.start_state = self.expand_rule(NonTerminal('$root')) + self.start_state = self.expand_rule(root_rule.origin) + + end_rule = RulePtr(root_rule, len(root_rule.expansion)) + self.end_state = fzset({end_rule}) self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules) diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 54a4041..ee2f75c 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -85,7 +85,6 @@ class IntParseTable(ParseTable): class LALR_Analyzer(GrammarAnalyzer): def compute_lookahead(self): - self.end_states = [] self.states = {} 
def step(state): @@ -105,8 +104,6 @@ class LALR_Analyzer(GrammarAnalyzer): new_state = fzset(rps) lookahead[sym].append((Shift, new_state)) - if sym == Terminal('$END'): - self.end_states.append( new_state ) yield new_state for k, v in lookahead.items(): @@ -130,8 +127,6 @@ class LALR_Analyzer(GrammarAnalyzer): for _ in bfs([self.start_state], step): pass - self.end_state ,= self.end_states - self._parse_table = ParseTable(self.states, self.start_state, self.end_state) if self.debug: From bcc4e67bceea22635b286c852f846ad4d324f01a Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Mon, 1 Jul 2019 18:30:25 +0300 Subject: [PATCH 018/132] CYK also working --- lark/parser_frontends.py | 4 ++-- lark/parsers/cyk.py | 12 +++++++----- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 1b55fe1..c1bb3c9 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -194,13 +194,13 @@ class CYK(WithLexer): self.init_traditional_lexer() self._analysis = GrammarAnalyzer(parser_conf) - self._parser = cyk.Parser(parser_conf.rules, parser_conf.start) + self.parser = cyk.Parser(parser_conf.rules) self.callbacks = parser_conf.callbacks def parse(self, text, start): tokens = list(self.lex(text)) - parse = self._parser.parse(tokens) + parse = self._parse(tokens, start) parse = self._transform(parse) return parse diff --git a/lark/parsers/cyk.py b/lark/parsers/cyk.py index 52584a7..7b25609 100644 --- a/lark/parsers/cyk.py +++ b/lark/parsers/cyk.py @@ -84,12 +84,11 @@ class RuleNode(object): class Parser(object): """Parser wrapper.""" - def __init__(self, rules, start): + def __init__(self, rules): super(Parser, self).__init__() self.orig_rules = {rule: rule for rule in rules} rules = [self._to_rule(rule) for rule in rules] self.grammar = to_cnf(Grammar(rules)) - self.start = NT(start[0]) def _to_rule(self, lark_rule): """Converts a lark rule, (lhs, rhs, callback, options), to a Rule.""" @@ -100,13 +99,16 @@ class Parser(object): weight=lark_rule.options.priority if lark_rule.options and lark_rule.options.priority else 0, alias=lark_rule) - def parse(self, tokenized): # pylint: disable=invalid-name + def parse(self, tokenized, start): # pylint: disable=invalid-name """Parses input, which is a list of tokens.""" + assert start + start = NT(start) + table, trees = _parse(tokenized, self.grammar) # Check if the parse succeeded. 
- if all(r.lhs != self.start for r in table[(0, len(tokenized) - 1)]): + if all(r.lhs != start for r in table[(0, len(tokenized) - 1)]): raise ParseError('Parsing failed.') - parse = trees[(0, len(tokenized) - 1)][self.start] + parse = trees[(0, len(tokenized) - 1)][start] return self._to_tree(revert_cnf(parse)) def _to_tree(self, rule_node): From be2e860c83eb6c0ee30f6f8cb8063373e2989067 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 24 Dec 2018 16:11:46 +0200 Subject: [PATCH 019/132] Added to tests: Make sure the standalone parser is reusable --- tests/test_tools.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tests/test_tools.py b/tests/test_tools.py index ff823ec..27927eb 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -49,6 +49,8 @@ class TestStandalone(TestCase): l = _Lark() x = l.parse('12 elephants') self.assertEqual(x.children, ['12', 'elephants']) + x = l.parse('16 candles') + self.assertEqual(x.children, ['16', 'candles']) def test_contextual(self): grammar = """ From 505c46e9ba30f125f457654422f6521121a4c5f1 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 2 Jul 2019 18:31:22 +0300 Subject: [PATCH 020/132] Cleaned up a test --- tests/test_tools.py | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/tests/test_tools.py b/tests/test_tools.py index 27927eb..5316396 100644 --- a/tests/test_tools.py +++ b/tests/test_tools.py @@ -1,11 +1,9 @@ from __future__ import absolute_import import sys -import unittest -from unittest import TestCase +from unittest import TestCase, main from lark.tree import Tree - from lark.tools import standalone try: @@ -94,26 +92,19 @@ class TestStandalone(TestCase): _NEWLINE: /\n/ """ - # from lark import Lark - # l = Lark(grammar, parser='lalr', lexer='contextual', postlex=MyIndenter()) - # x = l.parse('(\n)\n') - # print('@@', x) - - context = self._create_standalone(grammar) _Lark = context['Lark_StandAlone'] - # l = _Lark(postlex=MyIndenter()) - # x = l.parse('()\n') - # print(x) + l = _Lark(postlex=MyIndenter()) + x = l.parse('()\n') + self.assertEqual(x, Tree('start', [])) l = _Lark(postlex=MyIndenter()) x = l.parse('(\n)\n') - print(x) - + self.assertEqual(x, Tree('start', [])) if __name__ == '__main__': - unittest.main() + main() From 94e7e82a199d846650090d7f46562ed9c1f10692 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 9 Jul 2019 16:57:30 +0300 Subject: [PATCH 021/132] Updated docstrings for multiple start symbols --- lark/lark.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index 82cf76a..5c43fa8 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -43,7 +43,7 @@ class LarkOptions(Serialize): keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False) cache_grammar - Cache the Lark grammar (Default: False) postlex - Lexer post-processing (Default: None) Only works with the standard and contextual lexers. - start - The start symbol (Default: start) + start - The start symbol, either a string, or a list of strings for multiple possible starts (Default: "start") profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False) priority - How priorities should be evaluated - auto, none, normal, invert (Default: auto) propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches. @@ -291,7 +291,12 @@ class Lark(Serialize): return stream def parse(self, text, start=None): - "Parse the given text, according to the options provided. 
Returns a tree, unless specified otherwise." + """Parse the given text, according to the options provided. + + The 'start' parameter is required if Lark was given multiple possible start symbols (using the start option). + + Returns a tree, unless specified otherwise. + """ return self.parser.parse(text, start=start) ###} From e3cbd7aadc26faa69b1cda1253871dca3f1c665f Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 21 Jul 2019 00:50:28 +0200 Subject: [PATCH 022/132] Negative priority now allowed in rules and tokens. Updated docs about priority --- docs/grammar.md | 13 ++++++++++++- docs/parsers.md | 2 +- lark/load_grammar.py | 4 ++-- tests/test_parser.py | 26 ++++++++++++++++++++++++++ 4 files changed, 41 insertions(+), 4 deletions(-) diff --git a/docs/grammar.md b/docs/grammar.md index ad70f6e..9343ee4 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -45,6 +45,12 @@ Literals can be one of: * `/re with flags/imulx` * Literal range: `"a".."z"`, `"1".."9"`, etc. +### Priority + +Terminals can be assigned priority only when using a lexer (future versions may support Earley's dynamic lexing). + +Priority can be either positive or negative. In not specified for a terminal, it's assumed to be 1 (i.e. the default). + #### Notes for when using a lexer: When using a lexer (standard or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched in an order according to the following criteria: @@ -90,7 +96,7 @@ Each item is one of: * `item*` - Zero or more instances of item * `item+` - One or more instances of item * `item ~ n` - Exactly *n* instances of item -* `item ~ n..m` - Between *n* to *m* instances of item +* `item ~ n..m` - Between *n* to *m* instances of item (not recommended for wide ranges, due to performance issues) **Examples:** ```perl @@ -102,6 +108,11 @@ expr: expr operator expr four_words: word ~ 4 ``` +### Priority + +Rules can be assigned priority only when using Earley (future versions may support LALR as well). + +Priority can be either positive or negative. In not specified for a terminal, it's assumed to be 1 (i.e. the default). ## Directives diff --git a/docs/parsers.md b/docs/parsers.md index 35de223..fb7c997 100644 --- a/docs/parsers.md +++ b/docs/parsers.md @@ -7,7 +7,7 @@ An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitely using `lexer='dynamic'`. -It's possible to bypass the dynamic lexer, and use the regular Earley parser with a traditional lexer, that tokenizes as an independant first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'` +It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, that tokenizes as an independant first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! 
Activate with `lexer='standard'` **SPPF & Ambiguity resolution** diff --git a/lark/load_grammar.py b/lark/load_grammar.py index f7b1011..f6c1d22 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -90,7 +90,7 @@ TERMINALS = { '_IGNORE': r'%ignore', '_DECLARE': r'%declare', '_IMPORT': r'%import', - 'NUMBER': r'\d+', + 'NUMBER': r'[+-]?\d+', } RULES = { @@ -196,7 +196,7 @@ class EBNF_to_BNF(Transformer_InPlace): mn = mx = int(args[0]) else: mn, mx = map(int, args) - if mx < mn: + if mx < mn or mn < 0: raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx)) return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)]) assert False, op diff --git a/tests/test_parser.py b/tests/test_parser.py index 3238ead..599406f 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1029,6 +1029,32 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(res.children, ['ab']) + grammar = """ + start: A B | AB + A: "a" + B.-20: "b" + AB.-10: "ab" + """ + l = _Lark(grammar) + res = l.parse("ab") + self.assertEqual(res.children, ['a', 'b']) + + + grammar = """ + start: A B | AB + A.-99999999999999999999999: "a" + B: "b" + AB: "ab" + """ + l = _Lark(grammar) + res = l.parse("ab") + + self.assertEqual(res.children, ['ab']) + + + + + def test_import(self): grammar = """ From c87cbc63225fb6a0426d7b5b8f2bfbecd978eda1 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 21 Jul 2019 00:59:50 +0200 Subject: [PATCH 023/132] Removed some dead code --- lark/parsers/grammar_analysis.py | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 086349c..a31f308 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -132,7 +132,7 @@ class GrammarAnalyzer(object): self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules) - def expand_rule(self, rule): + def expand_rule(self, source_rule): "Returns all init_ptrs accessible by rule (recursive)" init_ptrs = set() def _expand_rule(rule): @@ -147,14 +147,7 @@ class GrammarAnalyzer(object): if not new_r.is_term: yield new_r - for _ in bfs([rule], _expand_rule): + for _ in bfs([source_rule], _expand_rule): pass - return fzset(init_ptrs) - - def _first(self, r): - if r.is_term: - return {r} - else: - return {rp.next for rp in self.expand_rule(r) if rp.next.is_term} - + return fzset(init_ptrs) \ No newline at end of file From 39b0d769141d3f8e579b5a4711bd22b579657801 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 21 Jul 2019 09:46:08 +0200 Subject: [PATCH 024/132] Minor optimization in LALR (thanks to @Raekye) --- lark/parsers/grammar_analysis.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index a31f308..8fc0806 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -82,9 +82,10 @@ def calculate_sets(rules): changed = True for i, sym in enumerate(rule.expansion): - if set(rule.expansion[:i]) <= NULLABLE: - if update_set(FIRST[rule.origin], FIRST[sym]): - changed = True + if set(rule.expansion[:i]) > NULLABLE: + break + if update_set(FIRST[rule.origin], FIRST[sym]): + changed = True # Calculate FOLLOW changed = True From 8e9da6a6d6e2d6395211003ed880b712a2304779 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 21 Jul 2019 09:51:27 +0200 Subject: [PATCH 025/132] Minor optimization in LALR (and fix for last commit) --- lark/parsers/grammar_analysis.py | 9 
+++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 8fc0806..306059d 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -38,7 +38,7 @@ class RulePtr(object): def update_set(set1, set2): - if not set2: + if not set2 or set1 > set2: return False copy = set(set1) @@ -82,10 +82,11 @@ def calculate_sets(rules): changed = True for i, sym in enumerate(rule.expansion): - if set(rule.expansion[:i]) > NULLABLE: + if set(rule.expansion[:i]) <= NULLABLE: + if update_set(FIRST[rule.origin], FIRST[sym]): + changed = True + else: break - if update_set(FIRST[rule.origin], FIRST[sym]): - changed = True # Calculate FOLLOW changed = True From d952f2a0694787ab8e67256574710cb56a2f4c26 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 24 Jul 2019 12:27:34 +0200 Subject: [PATCH 026/132] Token values are now always unicode (resolves issue #411) --- lark/lexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/lexer.py b/lark/lexer.py index 3e881f8..898ee04 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -88,7 +88,7 @@ class Token(Str): self.type = type_ self.pos_in_stream = pos_in_stream - self.value = value + self.value = Str(value) self.line = line self.column = column self.end_line = end_line From 0d164bd344cf164954b52c6fc50f4ddcd23a2cfd Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 24 Jul 2019 23:00:55 +0200 Subject: [PATCH 027/132] Added get_terminal() method (Issue #412) --- lark/lark.py | 6 ++++++ lark/lexer.py | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/lark/lark.py b/lark/lark.py index 5c43fa8..ae71d56 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -205,6 +205,8 @@ class Lark(Serialize): # Compile the EBNF grammar into BNF self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start) + self._terminals_dict = {t.name:t for t in self.terminals} + # If the user asked to invert the priorities, negate them all here. # This replaces the old 'resolve__antiscore_sum' option. if self.options.priority == 'invert': @@ -290,6 +292,10 @@ class Lark(Serialize): return self.options.postlex.process(stream) return stream + def get_terminal(self, name): + "Get information about a terminal" + return self._terminals_dict[name] + def parse(self, text, start=None): """Parse the given text, according to the options provided. 
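A quick usage sketch for the `get_terminal` helper added in the hunk above (the grammar here is illustrative):

```python
from lark import Lark

parser = Lark(r'''
    start: NUMBER "+" NUMBER
    NUMBER: /\d+/
    %ignore " "
''')

num = parser.get_terminal('NUMBER')        # the TerminalDef for NUMBER
print(num.name, num.pattern.to_regexp())   # prints something like: NUMBER \d+
```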
diff --git a/lark/lexer.py b/lark/lexer.py index 898ee04..4a8b422 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -41,6 +41,8 @@ class Pattern(Serialize): class PatternStr(Pattern): + type = "str" + def to_regexp(self): return self._get_flags(re.escape(self.value)) @@ -50,6 +52,8 @@ class PatternStr(Pattern): max_width = min_width class PatternRE(Pattern): + type = "re" + def to_regexp(self): return self._get_flags(self.value) From 7add0e1f3f561f8701a745f0e1d05d71da82752d Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Thu, 25 Jul 2019 09:56:20 +0200 Subject: [PATCH 028/132] Memoize get_regexp_width (Issue #413) --- lark/lexer.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 4a8b422..d3e4af6 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -57,12 +57,18 @@ class PatternRE(Pattern): def to_regexp(self): return self._get_flags(self.value) + _width = None + def _get_width(self): + if self._width is None: + self._width = get_regexp_width(self.to_regexp()) + return self._width + @property def min_width(self): - return get_regexp_width(self.to_regexp())[0] + return self._get_width()[0] @property def max_width(self): - return get_regexp_width(self.to_regexp())[1] + return self._get_width()[1] class TerminalDef(Serialize): From dd3a812fead3fc1f2a45d45d337fb7abaabf63b1 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 30 Jul 2019 11:31:28 +0200 Subject: [PATCH 029/132] Version bump (0.7.2) --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index 7fd92ee..db2ce44 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une from .lexer import Token from .lark import Lark -__version__ = "0.7.1" +__version__ = "0.7.2" From 21c41e54a9728587bc2043d187b9230acad9fcec Mon Sep 17 00:00:00 2001 From: Raekye Date: Tue, 30 Jul 2019 19:49:23 -0400 Subject: [PATCH 030/132] lalr parser --- lark/parsers/grammar_analysis.py | 36 ++++++- lark/parsers/lalr_analysis.py | 171 +++++++++++++++++++++++++------ lark/parsers/lalr_parser.py | 21 ++-- 3 files changed, 184 insertions(+), 44 deletions(-) diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 086349c..5a4d0e8 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -36,6 +36,23 @@ class RulePtr(object): def __hash__(self): return hash((self.rule, self.index)) +class LR0ItemSet(object): + __slots__ = ('kernel', 'closure', 'transitions') + + def __init__(self, kernel, closure): + self.kernel = fzset(kernel) + self.closure = fzset(closure) + self.transitions = {} + + def __eq__(self, other): + return self.kernel == other.kernel + + def __hash__(self): + return hash(self.kernel) + + def __repr__(self): + return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure])) + def update_set(set1, set2): if not set2: @@ -130,15 +147,29 @@ class GrammarAnalyzer(object): self.end_states = {start: fzset({RulePtr(root_rule, len(root_rule.expansion))}) for start, root_rule in root_rules.items()} + lr0_root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start)]) + for start in parser_conf.start} + + lr0_rules = parser_conf.rules + list(lr0_root_rules.values()) + + self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin) + + self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], 
self.expand_rule(root_rule.origin, self.lr0_rules_by_origin)) + for start, root_rule in lr0_root_rules.items()} + self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules) - def expand_rule(self, rule): + def expand_rule(self, rule, rules_by_origin=None): "Returns all init_ptrs accessible by rule (recursive)" + + if rules_by_origin is None: + rules_by_origin = self.rules_by_origin + init_ptrs = set() def _expand_rule(rule): assert not rule.is_term, rule - for r in self.rules_by_origin[rule]: + for r in rules_by_origin[rule]: init_ptr = RulePtr(r, 0) init_ptrs.add(init_ptr) @@ -157,4 +188,3 @@ class GrammarAnalyzer(object): return {r} else: return {rp.next for rp in self.expand_rule(r) if rp.next.is_term} - diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index eef1f9b..61fe692 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -12,7 +12,7 @@ from collections import defaultdict from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator from ..exceptions import GrammarError -from .grammar_analysis import GrammarAnalyzer, Terminal +from .grammar_analysis import GrammarAnalyzer, Terminal, RulePtr, LR0ItemSet from ..grammar import Rule ###{standalone @@ -84,53 +84,158 @@ class IntParseTable(ParseTable): class LALR_Analyzer(GrammarAnalyzer): - def compute_lookahead(self): + def generate_lr0_states(self): + self.states = set() - self.states = {} def step(state): - lookahead = defaultdict(list) - sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied) - for rp in sat: - for term in self.FOLLOW.get(rp.rule.origin, ()): - lookahead[term].append((Reduce, rp.rule)) + _, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied) d = classify(unsat, lambda rp: rp.next) for sym, rps in d.items(): - rps = {rp.advance(sym) for rp in rps} + kernel = {rp.advance(sym) for rp in rps} + closure = set(kernel) - for rp in set(rps): + for rp in kernel: if not rp.is_satisfied and not rp.next.is_term: - rps |= self.expand_rule(rp.next) + closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin) - new_state = fzset(rps) - lookahead[sym].append((Shift, new_state)) + new_state = LR0ItemSet(kernel, closure) + state.transitions[sym] = new_state yield new_state - for k, v in lookahead.items(): - if len(v) > 1: - if self.debug: - logging.warning("Shift/reduce conflict for terminal %s: (resolving as shift)", k.name) - for act, arg in v: - logging.warning(' * %s: %s', act, arg) - for x in v: - # XXX resolving shift/reduce into shift, like PLY - # Give a proper warning - if x[0] is Shift: - lookahead[k] = [x] - - for k, v in lookahead.items(): - if not len(v) == 1: - raise GrammarError("Collision in %s: %s" %(k, ', '.join(['\n * %s: %s' % x for x in v]))) - - self.states[state] = {k.name:v[0] for k, v in lookahead.items()} - - for _ in bfs(self.start_states.values(), step): + self.states.add(state) + + for _ in bfs(self.lr0_start_states.values(), step): pass - self._parse_table = ParseTable(self.states, self.start_states, self.end_states) + def discover_lookaheads(self): + # state -> rule -> set of lookaheads + self.lookaheads = defaultdict(lambda: defaultdict(set)) + # state -> rule -> list of (set of lookaheads) to propagate to + self.propagates = defaultdict(lambda: defaultdict(list)) + + for s in self.lr0_start_states.values(): + for rp in s.kernel: + self.lookaheads[s][rp].add(Terminal('$END')) + + # There is a 1 to 1 correspondance between LR0 and LALR1 states. 
+ # We calculate the lookaheads for LALR1 kernel items from the LR0 kernel items. + # use a terminal that does not exist in the grammar + t = Terminal('$#') + for s in self.states: + for rp in s.kernel: + for rp2, la in self.generate_lr1_closure([(rp, t)]): + if rp2.is_satisfied: + continue + next_symbol = rp2.next + next_state = s.transitions[next_symbol] + rp3 = rp2.advance(next_symbol) + assert(rp3 in next_state.kernel) + x = self.lookaheads[next_state][rp3] + if la == t: + # we must propagate rp's lookaheads to rp3's lookahead set + self.propagates[s][rp].append(x) + else: + # this lookahead is "generated spontaneously" for rp3 + x.add(la) + + def propagate_lookaheads(self): + changed = True + while changed: + changed = False + for s in self.states: + for rp in s.kernel: + # from (from is a keyword) + f = self.lookaheads[s][rp] + # to + t = self.propagates[s][rp] + for x in t: + old = len(x) + x |= f + changed = changed or (len(x) != old) + + def generate_lalr1_states(self): + # 1 to 1 correspondance between LR0 and LALR1 states + # We must fetch the lookaheads we calculated, + # to create the LALR1 kernels from the LR0 kernels. + # Then, we generate the LALR1 states by taking the LR1 closure of the new kernel items. + # map of LR0 states to LALR1 states + m = {} + for s in self.states: + kernel = [] + for rp in s.kernel: + las = self.lookaheads[s][rp] + assert(len(las) > 0) + for la in las: + kernel.append((rp, la)) + m[s] = self.generate_lr1_closure(kernel) + + self.states = {} + for s, v in m.items(): + actions = {} + for la, next_state in s.transitions.items(): + actions[la] = (Shift, next_state.closure) + + sat, _ = classify_bool(v, lambda x: x[0].is_satisfied) + reductions = classify(sat, lambda x: x[1], lambda x: x[0]) + for la, rps in reductions.items(): + if len(rps) > 1: + raise GrammarError("Collision in %s: %s" % (la, ', '.join([ str(r.rule) for r in rps ]))) + if la in actions: + if self.debug: + logging.warning("Shift/reduce conflict for terminal %s: (resolving as shift)", la.name) + logging.warning(' * %s', str(rps[0])) + else: + actions[la] = (Reduce, rps[0].rule) + + self.states[s.closure] = {k.name: v for k, v in actions.items()} + + end_states = {} + for s in self.states: + for rp in s: + for start in self.lr0_start_states: + if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied: + assert(not start in end_states) + end_states[start] = s + + self._parse_table = ParseTable(self.states, {start: state.closure for start, state in self.lr0_start_states.items()}, end_states) if self.debug: self.parse_table = self._parse_table else: self.parse_table = IntParseTable.from_ParseTable(self._parse_table) + def generate_lr1_closure(self, kernel): + closure = set() + + q = list(kernel) + while len(q) > 0: + rp, la = q.pop() + if (rp, la) in closure: + continue + closure.add((rp, la)) + + if rp.is_satisfied: + continue + if rp.next.is_term: + continue + + l = [] + i = rp.index + 1 + n = len(rp.rule.expansion) + while i < n: + s = rp.rule.expansion[i] + l.extend(self.FIRST.get(s, [])) + if not s in self.NULLABLE: + break + i += 1 + + # if all of rp.rule.expansion[rp.index + 1:] were nullable: + if i == n: + l.append(la) + + for r in self.lr0_rules_by_origin[rp.next]: + for s in l: + q.append((RulePtr(r, 0), s)) + + return closure diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 39dd5f3..6eb3839 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -6,7 +6,7 @@ from ..exceptions import UnexpectedToken from ..lexer import 
Token from ..utils import Enumerator, Serialize -from .lalr_analysis import LALR_Analyzer, Shift, IntParseTable +from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable ###{standalone @@ -15,7 +15,10 @@ class LALR_Parser(object): assert all(r.options is None or r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" analysis = LALR_Analyzer(parser_conf, debug=debug) - analysis.compute_lookahead() + analysis.generate_lr0_states() + analysis.discover_lookaheads() + analysis.propagate_lookaheads() + analysis.generate_lalr1_states() callbacks = parser_conf.callbacks self._parse_table = analysis.parse_table @@ -65,6 +68,9 @@ class _Parser: raise UnexpectedToken(token, expected, state=state) def reduce(rule): + if state_stack[-1] == end_state: + return True + size = len(rule.expansion) if size: s = value_stack[-size:] @@ -80,6 +86,8 @@ class _Parser: state_stack.append(new_state) value_stack.append(value) + return False + # Main LALR-parser loop for token in stream: while True: @@ -97,11 +105,8 @@ class _Parser: token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) while True: _action, arg = get_action(token) - if _action is Shift: - assert arg == end_state - val ,= value_stack - return val - else: - reduce(arg) + assert(_action is Reduce) + if reduce(arg): + return value_stack[-1] ###} From 6f412c25b705852958243f3538b329c668ccc6d4 Mon Sep 17 00:00:00 2001 From: Raekye Date: Fri, 2 Aug 2019 09:12:22 -0400 Subject: [PATCH 031/132] LALR optimizations and profiling --- lark/grammar.py | 11 +- lark/parsers/grammar_analysis.py | 121 ++++++++++++++- lark/parsers/lalr_analysis.py | 250 ++++++++++++++++++++++++++----- lark/parsers/lalr_parser.py | 12 ++ 4 files changed, 348 insertions(+), 46 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index 14893fb..f90cce4 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -3,10 +3,13 @@ from .utils import Serialize ###{standalone class Symbol(Serialize): + __slots__ = ('name', '_hash') + is_term = NotImplemented def __init__(self, name): self.name = name + self._hash = hash(self.name) def __eq__(self, other): assert isinstance(other, Symbol), other @@ -16,7 +19,7 @@ class Symbol(Serialize): return not (self == other) def __hash__(self): - return hash(self.name) + return self._hash def __repr__(self): return '%s(%r)' % (type(self).__name__, self.name) @@ -31,6 +34,7 @@ class Terminal(Symbol): def __init__(self, name, filter_out=False): self.name = name + self._hash = hash(self.name) self.filter_out = filter_out @property @@ -69,7 +73,7 @@ class Rule(Serialize): expansion : a list of symbols order : index of this expansion amongst all rules of the same name """ - __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash') + __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash', '_rp') __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options' __serialize_namespace__ = Terminal, NonTerminal, RuleOptions @@ -81,6 +85,7 @@ class Rule(Serialize): self.order = order self.options = options self._hash = hash((self.origin, tuple(self.expansion))) + self._rp = None def _deserialize(self): self._hash = hash((self.origin, tuple(self.expansion))) @@ -101,4 +106,4 @@ class Rule(Serialize): -###} \ No newline at end of file +###} diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 4085ea5..71a7bc5 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -1,18 
+1,41 @@ -from collections import Counter +from collections import Counter, defaultdict from ..utils import bfs, fzset, classify from ..exceptions import GrammarError from ..grammar import Rule, Terminal, NonTerminal +import time +t_firsts = 0 +t_xy = 0 +t_call = 0 +cache_hits = 0 +cache_misses = 0 + +# used to be just a tuple (rp, la) +# but by making it an object, +# the hash and equality become trivial +# (slightly faster for sets which are hashtables?) +class RulePtrLookahead(object): + __slots__ = 'rp', 'la' + + def __init__(self, rp, la): + self.rp = rp + self.la = la class RulePtr(object): - __slots__ = ('rule', 'index') + __slots__ = ('rule', 'index', '_advance', '_lookaheads', '_next_rules_by_origin', '_first') def __init__(self, rule, index): assert isinstance(rule, Rule) assert index <= len(rule.expansion) self.rule = rule self.index = index + #self._hash = hash((self.rule, self.index)) + #self._hash = None + self._advance = None + self._lookaheads = {} + self._next_rules_by_origin = None + self._first = None def __repr__(self): before = [x.name for x in self.rule.expansion[:self.index]] @@ -23,32 +46,102 @@ class RulePtr(object): def next(self): return self.rule.expansion[self.index] + # don't create duplicate RulePtrs def advance(self, sym): assert self.next == sym - return RulePtr(self.rule, self.index+1) + a = self._advance + if a is None: + a = RulePtr(self.rule, self.index + 1) + self._advance = a + return a @property def is_satisfied(self): return self.index == len(self.rule.expansion) + def lookahead(self, la): + rp_la = self._lookaheads.get(la, None) + if rp_la is None: + rp_la = RulePtrLookahead(self, la) + self._lookaheads[la] = rp_la + return rp_la + + def next_rules_by_origin(self, rules_by_origin): + n = self._next_rules_by_origin + if n is None: + n = rules_by_origin[self.next] + self._next_rules_by_origin = n + return n + + # recursive form of lalr_analyis.py:343 (which is easier to understand IMO) + # normally avoid recursion but this allows us to cache + # each intermediate step in a corresponding RulePtr + def first(self, i, firsts, nullable, t): + global cache_hits + global cache_misses + global t_firsts + global t_xy + global t_call + t_call += time.time() - t + n = len(self.rule.expansion) + if i == n: + return ([], True) + x = self._first + t_x = time.time() + if x is None: + t0 = time.time() + t_y = time.time() + cache_misses += 1 + s = self.rule.expansion[i] + l = list(firsts.get(s, [])) + b = (s in nullable) + if b: + t1 = time.time() + t_firsts += t1 - t0 + l_b_2 = self.advance(s).first(i + 1, firsts, nullable, time.time()) + #l_b_2 = first(self.advance(self.next), i + 1, firsts, nullable, time.time()) + t0 = time.time() + l.extend(l_b_2[0]) + b = l_b_2[1] + x = (l, b) + self._first = x + t1 = time.time() + t_firsts += t1 - t0 + else: + t_y = time.time() + cache_hits += 1 + t_xy += t_y - t_x + return x + + # optimizations were made so that there should never be + # two distinct equal RulePtrs + # should help set/hashtable lookups? 
+ ''' def __eq__(self, other): return self.rule == other.rule and self.index == other.index def __hash__(self): - return hash((self.rule, self.index)) + return self._hash + ''' + class LR0ItemSet(object): - __slots__ = ('kernel', 'closure', 'transitions') + __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads', '_hash') def __init__(self, kernel, closure): self.kernel = fzset(kernel) self.closure = fzset(closure) self.transitions = {} + self.lookaheads = defaultdict(set) + #self._hash = hash(self.kernel) + # state generation ensures no duplicate LR0ItemSets + ''' def __eq__(self, other): return self.kernel == other.kernel def __hash__(self): - return hash(self.kernel) + return self._hash + ''' def __repr__(self): return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure])) @@ -153,14 +246,22 @@ class GrammarAnalyzer(object): for start in parser_conf.start} lr0_rules = parser_conf.rules + list(lr0_root_rules.values()) + assert(len(lr0_rules) == len(set(lr0_rules))) self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin) - self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin)) + # cache RulePtr(r, 0) in r (no duplicate RulePtr objects) + for root_rule in lr0_root_rules.values(): + root_rule._rp = RulePtr(root_rule, 0) + self.lr0_start_states = {start: LR0ItemSet([root_rule._rp], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin)) for start, root_rule in lr0_root_rules.items()} self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules) + # unused, did not help + self.lr1_cache = {} + self.lr1_cache2 = {} + def expand_rule(self, source_rule, rules_by_origin=None): "Returns all init_ptrs accessible by rule (recursive)" @@ -172,7 +273,11 @@ class GrammarAnalyzer(object): assert not rule.is_term, rule for r in rules_by_origin[rule]: - init_ptr = RulePtr(r, 0) + # don't create duplicate RulePtr objects + init_ptr = r._rp + if init_ptr is None: + init_ptr = RulePtr(r, 0) + r._rp = init_ptr init_ptrs.add(init_ptr) if r.expansion: # if not empty rule diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 61fe692..eb87e7a 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -7,13 +7,16 @@ For now, shift/reduce conflicts are automatically resolved as shifts. # Email : erezshin@gmail.com import logging -from collections import defaultdict +from collections import defaultdict, deque from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator from ..exceptions import GrammarError from .grammar_analysis import GrammarAnalyzer, Terminal, RulePtr, LR0ItemSet from ..grammar import Rule +from . 
import grammar_analysis + +import time ###{standalone @@ -28,6 +31,16 @@ class Action: Shift = Action('Shift') Reduce = Action('Reduce') +t_set_0 = 0 +t_set_1 = 0 +t_expand = 0 +t_rules = 0 +t_append = 0 +t_z = 0 +t_begin = 0 +t_count = 0 +t_call = 0 + class ParseTable: def __init__(self, states, start_states, end_states): self.states = states @@ -86,20 +99,24 @@ class LALR_Analyzer(GrammarAnalyzer): def generate_lr0_states(self): self.states = set() + # map of kernels to LR0ItemSets + cache = {} def step(state): _, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied) d = classify(unsat, lambda rp: rp.next) for sym, rps in d.items(): - kernel = {rp.advance(sym) for rp in rps} - closure = set(kernel) + kernel = fzset({rp.advance(sym) for rp in rps}) + new_state = cache.get(kernel, None) + if new_state is None: + closure = set(kernel) + for rp in kernel: + if not rp.is_satisfied and not rp.next.is_term: + closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin) + new_state = LR0ItemSet(kernel, closure) + cache[kernel] = new_state - for rp in kernel: - if not rp.is_satisfied and not rp.next.is_term: - closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin) - - new_state = LR0ItemSet(kernel, closure) state.transitions[sym] = new_state yield new_state @@ -109,36 +126,59 @@ class LALR_Analyzer(GrammarAnalyzer): pass def discover_lookaheads(self): + # lookaheads is now a member of LR0ItemSet, so don't need to look up a dictionary here # state -> rule -> set of lookaheads - self.lookaheads = defaultdict(lambda: defaultdict(set)) + #self.lookaheads = defaultdict(lambda: defaultdict(set)) # state -> rule -> list of (set of lookaheads) to propagate to - self.propagates = defaultdict(lambda: defaultdict(list)) + #self.propagates = defaultdict(lambda: defaultdict(list)) + self.propagates = {} + + t0 = time.time() + t = Terminal('$END') for s in self.lr0_start_states.values(): for rp in s.kernel: - self.lookaheads[s][rp].add(Terminal('$END')) + #self.lookaheads[s][rp].add(Terminal('$END')) + s.lookaheads[rp].add(t) + + t_closure = 0 # There is a 1 to 1 correspondance between LR0 and LALR1 states. # We calculate the lookaheads for LALR1 kernel items from the LR0 kernel items. 
# use a terminal that does not exist in the grammar t = Terminal('$#') for s in self.states: + p = {} + self.propagates[s] = p for rp in s.kernel: - for rp2, la in self.generate_lr1_closure([(rp, t)]): + q = [] + p[rp] = q + t2 = time.time() + z = self.generate_lr1_closure([rp.lookahead(t)], time.time()) + t3 = time.time() + t_closure += t3 - t2 + #for rp2, la in self.generate_lr1_closure([(rp, t)], time.time()): + for rp2_la in z: + rp2 = rp2_la.rp + la = rp2_la.la if rp2.is_satisfied: continue next_symbol = rp2.next next_state = s.transitions[next_symbol] rp3 = rp2.advance(next_symbol) assert(rp3 in next_state.kernel) - x = self.lookaheads[next_state][rp3] + #x = self.lookaheads[next_state][rp3] + x = next_state.lookaheads[rp3] if la == t: # we must propagate rp's lookaheads to rp3's lookahead set - self.propagates[s][rp].append(x) + q.append(x) else: # this lookahead is "generated spontaneously" for rp3 x.add(la) + t1 = time.time() + print('Discovering took {:.3f} (generating closure), {:.3f} (total)'.format(t_closure, t1 - t0)) + def propagate_lookaheads(self): changed = True while changed: @@ -146,7 +186,8 @@ class LALR_Analyzer(GrammarAnalyzer): for s in self.states: for rp in s.kernel: # from (from is a keyword) - f = self.lookaheads[s][rp] + #f = self.lookaheads[s][rp] + f = s.lookaheads[rp] # to t = self.propagates[s][rp] for x in t: @@ -155,20 +196,33 @@ class LALR_Analyzer(GrammarAnalyzer): changed = changed or (len(x) != old) def generate_lalr1_states(self): + t0 = time.time() # 1 to 1 correspondance between LR0 and LALR1 states # We must fetch the lookaheads we calculated, # to create the LALR1 kernels from the LR0 kernels. # Then, we generate the LALR1 states by taking the LR1 closure of the new kernel items. # map of LR0 states to LALR1 states m = {} + t_closure = 0 + z = 0 for s in self.states: + z = max(z, len(s.closure)) kernel = [] for rp in s.kernel: - las = self.lookaheads[s][rp] + #las = self.lookaheads[s][rp] + las = s.lookaheads[rp] assert(len(las) > 0) for la in las: - kernel.append((rp, la)) - m[s] = self.generate_lr1_closure(kernel) + kernel.append(rp.lookahead(la)) + t0_0 = time.time() + m[s] = self.generate_lr1_closure(kernel, time.time()) + t0_1 = time.time() + t_closure += t0_1 - t0_0 + + print('Generating lalr1 closure for lalr kernels took {:.3f}'.format(t_closure)) + print('Max lr0 state size was {}'.format(z)) + + t1 = time.time() self.states = {} for s, v in m.items(): @@ -176,8 +230,8 @@ class LALR_Analyzer(GrammarAnalyzer): for la, next_state in s.transitions.items(): actions[la] = (Shift, next_state.closure) - sat, _ = classify_bool(v, lambda x: x[0].is_satisfied) - reductions = classify(sat, lambda x: x[1], lambda x: x[0]) + sat, _ = classify_bool(v, lambda x: x.rp.is_satisfied) + reductions = classify(sat, lambda x: x.la, lambda x: x.rp) for la, rps in reductions.items(): if len(rps) > 1: raise GrammarError("Collision in %s: %s" % (la, ', '.join([ str(r.rule) for r in rps ]))) @@ -190,6 +244,8 @@ class LALR_Analyzer(GrammarAnalyzer): self.states[s.closure] = {k.name: v for k, v in actions.items()} + t2 = time.time() + end_states = {} for s in self.states: for rp in s: @@ -198,44 +254,168 @@ class LALR_Analyzer(GrammarAnalyzer): assert(not start in end_states) end_states[start] = s + t3 = time.time() + self._parse_table = ParseTable(self.states, {start: state.closure for start, state in self.lr0_start_states.items()}, end_states) + t4 = time.time() + if self.debug: self.parse_table = self._parse_table else: self.parse_table = 
IntParseTable.from_ParseTable(self._parse_table) - def generate_lr1_closure(self, kernel): + t5 = time.time() + + print(('Generating lalr1 states took ' + ', '.join([ '{:.3f}' ] * 5)).format(t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4)) + print('Generating firsts took {:.3f} (time actually calculating), {:.3f} (end to end), {:.3f} (just function call)'.format(grammar_analysis.t_firsts, grammar_analysis.t_xy, grammar_analysis.t_call)) + + def generate_lr1_closure(self, kernel, t_caller): + global t_call + global t_set_0 + global t_set_1 + global t_expand + global t_rules + global t_append + global t_z + global t_begin + global t_count + + t_start = time.time() + t_call += t_start - t_caller + + # cache the results of this function + # not many hits, no noticeable performance improvement + ''' + k = fzset(kernel) + cached = self.lr1_cache.get(k, None) + if not cached is None: + return cached + ''' + closure = set() + closure_hash = {} + + y = 0 q = list(kernel) while len(q) > 0: - rp, la = q.pop() - if (rp, la) in closure: + t_a = time.time() + rp_la = q.pop() + #rp_la_hash = hash(rp_la) + t0 = time.time() + t_begin += t0 - t_a + # try to manually maintain hashtable, + # as a set of just hashes (ints) was notably faster + ''' + if rp_la_hash in closure_hash: + if rp_la in closure_hash[rp_la_hash]: + t0_0 = time.time() + t_set_0 += t0_0 - t0 + continue + t0_0 = time.time() + t_set_0 += t0_0 - t0 + else: + closure_hash[rp_la_hash] = [] + ''' + if rp_la in closure: + t0_0 = time.time() + t_set_0 += t0_0 - t0 continue - closure.add((rp, la)) + t0_0 = time.time() + closure.add(rp_la) + #closure_hash[rp_la_hash].append(rp_la) + t1 = time.time() + t_set_0 += t0_0 - t0 + t_set_1 += t1 - t0_0 + rp = rp_la.rp + la = rp_la.la if rp.is_satisfied: continue if rp.next.is_term: continue + t2 = time.time() + + # cache these calculations inside each RulePtr + # see grammar_analysis.py:79 l = [] + ''' i = rp.index + 1 n = len(rp.rule.expansion) - while i < n: - s = rp.rule.expansion[i] - l.extend(self.FIRST.get(s, [])) - if not s in self.NULLABLE: - break - i += 1 - + l2_i = self.lr1_cache2.get((rp.rule, i), None) + l2 = [] + if l2_i is None: + while i < n: + s = rp.rule.expansion[i] + l2.extend(self.FIRST.get(s, [])) + if not s in self.NULLABLE: + break + i += 1 + self.lr1_cache2[(rp.rule, i)] = (l2, i) + else: + l2 = l2_i[0] + i = l2_i[1] + + l.extend(l2) + ''' + # this function call seems really slow (see grammar_analysis.t_call above) + # tried making it not a method call so don't need to look up vtable + # still equally slow + l2, nullable = rp.first(rp.index + 1, self.FIRST, self.NULLABLE, time.time()) + #l2, nullable = grammar_analysis.first(rp, rp.index + 1, self.FIRST, self.NULLABLE, time.time()) + #l.extend(l2) + l = l2 + t3 = time.time() + + t_expand += t3 - t2 + + # if we don't modify l2 and add an extra check in the loop below, + # we don't have to copy it # if all of rp.rule.expansion[rp.index + 1:] were nullable: - if i == n: - l.append(la) + #if nullable: + # l.append(la) + + t4 = time.time() + x = rp.next_rules_by_origin(self.lr0_rules_by_origin) + t5 = time.time() - for r in self.lr0_rules_by_origin[rp.next]: + # usually between 20-60? seen as high as ~175 + y = max(y, len(x) * len(l)) + #print('adding {} * {} rules to closure max {}'.format(len(x), len(l), y)) + for r in x: for s in l: - q.append((RulePtr(r, 0), s)) + # cache RulePtr(r, 0) in r (no duplicate RulePtr objects) + # cache r._rp in _rp (1 less object property lookup?) 
+ _rp = r._rp + if _rp is None: + _rp = RulePtr(r, 0) + r._rp = _rp + q.append(_rp.lookahead(s)) + #q.append((r._rp, s)) + if nullable: + _rp = r._rp + if _rp is None: + _rp = RulePtr(r, 0) + r._rp = _rp + q.append(_rp.lookahead(la)) + #q.append((r._rp, la)) + + t6 = time.time() + t_rules += t5 - t4 + t_append += t6 - t5 + + #self.lr1_cache[k] = closure + + t_end = time.time() + t_z += t_end - t_start + + t_count += 1 + + if t_count % 1000 == 0: + print('\tGenerating lr1 closure took begin {:.3f}, set contains {:.3f}, set add {:.3f}, get first {:.3f}'.format(t_begin, t_set_0, t_set_1, t_expand)) + print('\tget next rules {:.3f}, append rules {:.3f}, total {:.3f}, call time {:.3f}, count {}'.format(t_rules, t_append, t_z, t_call, t_count)) + print('\tmax number of appends {}'.format(y)) return closure diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 6eb3839..b3985ae 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -8,6 +8,8 @@ from ..utils import Enumerator, Serialize from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable +import time + ###{standalone class LALR_Parser(object): @@ -15,10 +17,20 @@ class LALR_Parser(object): assert all(r.options is None or r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" analysis = LALR_Analyzer(parser_conf, debug=debug) + t0 = time.time() analysis.generate_lr0_states() + t1 = time.time() analysis.discover_lookaheads() + t2 = time.time() analysis.propagate_lookaheads() + t3 = time.time() analysis.generate_lalr1_states() + t4 = time.time() + print('Generating lr0 states took {:.3f}'.format(t1 - t0)) + print('Discovering lookaheads took {:.3f}'.format(t2 - t1)) + print('Propagating lookaheads took took {:.3f}'.format(t3 - t2)) + print('Generating lalr states (closure) took {:.3f}'.format(t4 - t3)) + print('-' * 32) callbacks = parser_conf.callbacks self._parse_table = analysis.parse_table From 0c59cba3f5329381fc75a1a37a8426c15165b230 Mon Sep 17 00:00:00 2001 From: Raekye Date: Fri, 9 Aug 2019 03:26:27 -0400 Subject: [PATCH 032/132] implement DeRemer and Pennello's lookahead algorithm for LALR(1) --- lark/grammar.py | 4 +- lark/parsers/grammar_analysis.py | 110 +------- lark/parsers/lalr_analysis.py | 432 +++++++++++-------------------- lark/parsers/lalr_parser.py | 27 +- 4 files changed, 169 insertions(+), 404 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index f90cce4..3480651 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -28,7 +28,7 @@ class Symbol(Serialize): class Terminal(Symbol): - __serialize_fields__ = 'name', 'filter_out' + __serialize_fields__ = 'name', 'filter_out', '_hash' is_term = True @@ -44,7 +44,7 @@ class Terminal(Symbol): class NonTerminal(Symbol): - __serialize_fields__ = 'name', + __serialize_fields__ = 'name', '_hash' is_term = False diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index 71a7bc5..b32f62f 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -5,37 +5,18 @@ from ..exceptions import GrammarError from ..grammar import Rule, Terminal, NonTerminal import time -t_firsts = 0 -t_xy = 0 -t_call = 0 -cache_hits = 0 -cache_misses = 0 - -# used to be just a tuple (rp, la) -# but by making it an object, -# the hash and equality become trivial -# (slightly faster for sets which are hashtables?) 
-class RulePtrLookahead(object): - __slots__ = 'rp', 'la' - - def __init__(self, rp, la): - self.rp = rp - self.la = la +# optimizations were made so that there should never be two distinct equal RulePtrs +# to help with hashtable lookup class RulePtr(object): - __slots__ = ('rule', 'index', '_advance', '_lookaheads', '_next_rules_by_origin', '_first') + __slots__ = ('rule', 'index', '_advance') def __init__(self, rule, index): assert isinstance(rule, Rule) assert index <= len(rule.expansion) self.rule = rule self.index = index - #self._hash = hash((self.rule, self.index)) - #self._hash = None self._advance = None - self._lookaheads = {} - self._next_rules_by_origin = None - self._first = None def __repr__(self): before = [x.name for x in self.rule.expansion[:self.index]] @@ -59,89 +40,16 @@ class RulePtr(object): def is_satisfied(self): return self.index == len(self.rule.expansion) - def lookahead(self, la): - rp_la = self._lookaheads.get(la, None) - if rp_la is None: - rp_la = RulePtrLookahead(self, la) - self._lookaheads[la] = rp_la - return rp_la - - def next_rules_by_origin(self, rules_by_origin): - n = self._next_rules_by_origin - if n is None: - n = rules_by_origin[self.next] - self._next_rules_by_origin = n - return n - - # recursive form of lalr_analyis.py:343 (which is easier to understand IMO) - # normally avoid recursion but this allows us to cache - # each intermediate step in a corresponding RulePtr - def first(self, i, firsts, nullable, t): - global cache_hits - global cache_misses - global t_firsts - global t_xy - global t_call - t_call += time.time() - t - n = len(self.rule.expansion) - if i == n: - return ([], True) - x = self._first - t_x = time.time() - if x is None: - t0 = time.time() - t_y = time.time() - cache_misses += 1 - s = self.rule.expansion[i] - l = list(firsts.get(s, [])) - b = (s in nullable) - if b: - t1 = time.time() - t_firsts += t1 - t0 - l_b_2 = self.advance(s).first(i + 1, firsts, nullable, time.time()) - #l_b_2 = first(self.advance(self.next), i + 1, firsts, nullable, time.time()) - t0 = time.time() - l.extend(l_b_2[0]) - b = l_b_2[1] - x = (l, b) - self._first = x - t1 = time.time() - t_firsts += t1 - t0 - else: - t_y = time.time() - cache_hits += 1 - t_xy += t_y - t_x - return x - - # optimizations were made so that there should never be - # two distinct equal RulePtrs - # should help set/hashtable lookups? 
- ''' - def __eq__(self, other): - return self.rule == other.rule and self.index == other.index - def __hash__(self): - return self._hash - ''' - +# state generation ensures no duplicate LR0ItemSets class LR0ItemSet(object): - __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads', '_hash') + __slots__ = ('kernel', 'closure', 'transitions', 'lookaheads') def __init__(self, kernel, closure): self.kernel = fzset(kernel) self.closure = fzset(closure) self.transitions = {} self.lookaheads = defaultdict(set) - #self._hash = hash(self.kernel) - - # state generation ensures no duplicate LR0ItemSets - ''' - def __eq__(self, other): - return self.kernel == other.kernel - - def __hash__(self): - return self._hash - ''' def __repr__(self): return '{%s | %s}' % (', '.join([repr(r) for r in self.kernel]), ', '.join([repr(r) for r in self.closure])) @@ -258,9 +166,11 @@ class GrammarAnalyzer(object): self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules) - # unused, did not help - self.lr1_cache = {} - self.lr1_cache2 = {} + self.nonterminal_transitions = [] + self.directly_reads = defaultdict(set) + self.reads = defaultdict(set) + self.includes = defaultdict(set) + self.lookback = defaultdict(set) def expand_rule(self, source_rule, rules_by_origin=None): "Returns all init_ptrs accessible by rule (recursive)" diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index eb87e7a..4104713 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -12,9 +12,8 @@ from collections import defaultdict, deque from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator from ..exceptions import GrammarError -from .grammar_analysis import GrammarAnalyzer, Terminal, RulePtr, LR0ItemSet +from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet from ..grammar import Rule -from . 
import grammar_analysis import time @@ -31,15 +30,6 @@ class Action: Shift = Action('Shift') Reduce = Action('Reduce') -t_set_0 = 0 -t_set_1 = 0 -t_expand = 0 -t_rules = 0 -t_append = 0 -t_z = 0 -t_begin = 0 -t_count = 0 -t_call = 0 class ParseTable: def __init__(self, states, start_states, end_states): @@ -95,9 +85,60 @@ class IntParseTable(ParseTable): ###} + +# digraph and traverse, see The Theory and Practice of Compiler Writing + +# computes F(x) = G(x) union (union { G(y) | x R y }) +# X: nodes +# R: relation (function mapping node -> list of nodes that satisfy the relation) +# G: set valued function +def digraph(X, R, G): + F = {} + S = [] + N = {} + for x in X: + N[x] = 0 + for x in X: + # this is always true for the first iteration, but N[x] may be updated in traverse below + if N[x] == 0: + traverse(x, S, N, X, R, G, F) + return F + +# x: single node +# S: stack +# N: weights +# X: nodes +# R: relation (see above) +# G: set valued function +# F: set valued function we are computing (map of input -> output) +def traverse(x, S, N, X, R, G, F): + S.append(x) + d = len(S) + N[x] = d + F[x] = G(x) + for y in R(x): + if N[y] == 0: + traverse(y, S, N, X, R, G, F) + n_x = N[x] + assert(n_x > 0) + n_y = N[y] + assert(n_y != 0) + if (n_y > 0) and (n_y < n_x): + N[x] = n_y + F[x].update(F[y]) + if N[x] == d: + f_x = F[x] + while True: + z = S.pop() + N[z] = -1 + F[z] = f_x + if z == x: + break + + class LALR_Analyzer(GrammarAnalyzer): - def generate_lr0_states(self): + def compute_lr0_states(self): self.states = set() # map of kernels to LR0ItemSets cache = {} @@ -125,297 +166,118 @@ class LALR_Analyzer(GrammarAnalyzer): for _ in bfs(self.lr0_start_states.values(), step): pass - def discover_lookaheads(self): - # lookaheads is now a member of LR0ItemSet, so don't need to look up a dictionary here - # state -> rule -> set of lookaheads - #self.lookaheads = defaultdict(lambda: defaultdict(set)) - # state -> rule -> list of (set of lookaheads) to propagate to - #self.propagates = defaultdict(lambda: defaultdict(list)) - self.propagates = {} - - t0 = time.time() - - t = Terminal('$END') - for s in self.lr0_start_states.values(): - for rp in s.kernel: - #self.lookaheads[s][rp].add(Terminal('$END')) - s.lookaheads[rp].add(t) - - t_closure = 0 - - # There is a 1 to 1 correspondance between LR0 and LALR1 states. - # We calculate the lookaheads for LALR1 kernel items from the LR0 kernel items. 
- # use a terminal that does not exist in the grammar - t = Terminal('$#') - for s in self.states: - p = {} - self.propagates[s] = p - for rp in s.kernel: - q = [] - p[rp] = q - t2 = time.time() - z = self.generate_lr1_closure([rp.lookahead(t)], time.time()) - t3 = time.time() - t_closure += t3 - t2 - #for rp2, la in self.generate_lr1_closure([(rp, t)], time.time()): - for rp2_la in z: - rp2 = rp2_la.rp - la = rp2_la.la + def compute_reads_relations(self): + # handle start state + for root in self.lr0_start_states.values(): + assert(len(root.kernel) == 1) + for rp in root.kernel: + assert(rp.index == 0) + self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ]) + + for state in self.states: + seen = set() + for rp in state.closure: + if rp.is_satisfied: + continue + s = rp.next + # if s is a not a nonterminal + if not s in self.lr0_rules_by_origin: + continue + if s in seen: + continue + seen.add(s) + nt = (state, s) + self.nonterminal_transitions.append(nt) + dr = self.directly_reads[nt] + r = self.reads[nt] + next_state = state.transitions[s] + for rp2 in next_state.closure: if rp2.is_satisfied: continue - next_symbol = rp2.next - next_state = s.transitions[next_symbol] - rp3 = rp2.advance(next_symbol) - assert(rp3 in next_state.kernel) - #x = self.lookaheads[next_state][rp3] - x = next_state.lookaheads[rp3] - if la == t: - # we must propagate rp's lookaheads to rp3's lookahead set - q.append(x) + s2 = rp2.next + # if s2 is a terminal + if not s2 in self.lr0_rules_by_origin: + dr.add(s2) + if s2 in self.NULLABLE: + r.add((next_state, s2)) + + def compute_read_sets(self): + R = lambda nt: self.reads[nt] + G = lambda nt: self.directly_reads[nt] + self.read_sets = digraph(self.nonterminal_transitions, R, G) + + def compute_includes_lookback(self): + for nt in self.nonterminal_transitions: + state, nonterminal = nt + includes = [] + lookback = self.lookback[nt] + for rp in state.closure: + if rp.rule.origin != nonterminal: + continue + # traverse the states for rp(.rule) + state2 = state + for i in range(rp.index, len(rp.rule.expansion)): + s = rp.rule.expansion[i] + nt2 = (state2, s) + state2 = state2.transitions[s] + if not nt2 in self.reads: + continue + j = i + 1 + for j in range(i + 1, len(rp.rule.expansion)): + if not rp.rule.expansion[j] in self.NULLABLE: + break else: - # this lookahead is "generated spontaneously" for rp3 - x.add(la) - - t1 = time.time() - print('Discovering took {:.3f} (generating closure), {:.3f} (total)'.format(t_closure, t1 - t0)) - - def propagate_lookaheads(self): - changed = True - while changed: - changed = False - for s in self.states: - for rp in s.kernel: - # from (from is a keyword) - #f = self.lookaheads[s][rp] - f = s.lookaheads[rp] - # to - t = self.propagates[s][rp] - for x in t: - old = len(x) - x |= f - changed = changed or (len(x) != old) - - def generate_lalr1_states(self): - t0 = time.time() - # 1 to 1 correspondance between LR0 and LALR1 states - # We must fetch the lookaheads we calculated, - # to create the LALR1 kernels from the LR0 kernels. - # Then, we generate the LALR1 states by taking the LR1 closure of the new kernel items. 
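
A side note on the algorithm introduced in this patch: DeRemer and Pennello's method derives the LALR(1) lookaheads from a few relations over nonterminal transitions (the directly_reads, reads, includes and lookback sets built above), and the digraph()/traverse() helpers added earlier in this patch compute the least sets F with F(x) containing G(x) and F(x) containing F(y) whenever x R y. The sketch below is illustrative only; the toy names nodes, rel, g, f and the brute-force loop are not part of the patch, they just show the fixed point digraph() is expected to reach on a small acyclic relation:

    # Toy relation: a R b and b R c, with seed sets G.
    nodes = ['a', 'b', 'c']
    rel = {'a': ['b'], 'b': ['c'], 'c': []}
    g = {'a': {1}, 'b': {2}, 'c': {3}}

    # Brute-force least fixed point, for comparison with digraph()/traverse().
    f = {x: set(g[x]) for x in nodes}
    changed = True
    while changed:
        changed = False
        for x in nodes:
            for y in rel[x]:
                if not f[y] <= f[x]:   # propagate F(y) into F(x)
                    f[x] |= f[y]
                    changed = True

    # Result: f == {'a': {1, 2, 3}, 'b': {2, 3}, 'c': {3}}
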
- # map of LR0 states to LALR1 states + includes.append(nt2) + # state2 is at the final state for rp.rule + if rp.index == 0: + for rp2 in state2.closure: + if (rp2.rule == rp.rule) and rp2.is_satisfied: + lookback.add((state2, rp2.rule)) + for nt2 in includes: + self.includes[nt2].add(nt) + + def compute_follow_sets(self): + R = lambda nt: self.includes[nt] + G = lambda nt: self.read_sets[nt] + self.follow_sets = digraph(self.nonterminal_transitions, R, G) + + def compute_lookaheads(self): + for nt, lookbacks in self.lookback.items(): + for state, rule in lookbacks: + for s in self.follow_sets[nt]: + state.lookaheads[s].add(rule) + + def compute_lalr1_states(self): m = {} - t_closure = 0 - z = 0 - for s in self.states: - z = max(z, len(s.closure)) - kernel = [] - for rp in s.kernel: - #las = self.lookaheads[s][rp] - las = s.lookaheads[rp] - assert(len(las) > 0) - for la in las: - kernel.append(rp.lookahead(la)) - t0_0 = time.time() - m[s] = self.generate_lr1_closure(kernel, time.time()) - t0_1 = time.time() - t_closure += t0_1 - t0_0 - - print('Generating lalr1 closure for lalr kernels took {:.3f}'.format(t_closure)) - print('Max lr0 state size was {}'.format(z)) - - t1 = time.time() - - self.states = {} - for s, v in m.items(): + for state in self.states: actions = {} - for la, next_state in s.transitions.items(): + for la, next_state in state.transitions.items(): actions[la] = (Shift, next_state.closure) - - sat, _ = classify_bool(v, lambda x: x.rp.is_satisfied) - reductions = classify(sat, lambda x: x.la, lambda x: x.rp) - for la, rps in reductions.items(): - if len(rps) > 1: - raise GrammarError("Collision in %s: %s" % (la, ', '.join([ str(r.rule) for r in rps ]))) + for la, rules in state.lookaheads.items(): + if len(rules) > 1: + raise GrammarError('Collision in %s: %s' % (la, ', '.join([ str(r) for r in rules ]))) if la in actions: if self.debug: - logging.warning("Shift/reduce conflict for terminal %s: (resolving as shift)", la.name) - logging.warning(' * %s', str(rps[0])) + logging.warning('Shift/reduce conflict for terminal %s: (resolving as shift)', la.name) + logging.warning(' * %s', list(rules)[0]) else: - actions[la] = (Reduce, rps[0].rule) + actions[la] = (Reduce, list(rules)[0]) + m[state] = { k.name: v for k, v in actions.items() } - self.states[s.closure] = {k.name: v for k, v in actions.items()} - - t2 = time.time() + self.states = { k.closure: v for k, v in m.items() } + # compute end states end_states = {} - for s in self.states: - for rp in s: + for state in self.states: + for rp in state: for start in self.lr0_start_states: if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied: assert(not start in end_states) - end_states[start] = s - - t3 = time.time() + end_states[start] = state - self._parse_table = ParseTable(self.states, {start: state.closure for start, state in self.lr0_start_states.items()}, end_states) - - t4 = time.time() + self._parse_table = ParseTable(self.states, { start: state.closure for start, state in self.lr0_start_states.items() }, end_states) if self.debug: self.parse_table = self._parse_table else: self.parse_table = IntParseTable.from_ParseTable(self._parse_table) - - t5 = time.time() - - print(('Generating lalr1 states took ' + ', '.join([ '{:.3f}' ] * 5)).format(t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4)) - print('Generating firsts took {:.3f} (time actually calculating), {:.3f} (end to end), {:.3f} (just function call)'.format(grammar_analysis.t_firsts, grammar_analysis.t_xy, grammar_analysis.t_call)) - - def 
generate_lr1_closure(self, kernel, t_caller): - global t_call - global t_set_0 - global t_set_1 - global t_expand - global t_rules - global t_append - global t_z - global t_begin - global t_count - - t_start = time.time() - t_call += t_start - t_caller - - # cache the results of this function - # not many hits, no noticeable performance improvement - ''' - k = fzset(kernel) - cached = self.lr1_cache.get(k, None) - if not cached is None: - return cached - ''' - - closure = set() - closure_hash = {} - - y = 0 - - q = list(kernel) - while len(q) > 0: - t_a = time.time() - rp_la = q.pop() - #rp_la_hash = hash(rp_la) - t0 = time.time() - t_begin += t0 - t_a - # try to manually maintain hashtable, - # as a set of just hashes (ints) was notably faster - ''' - if rp_la_hash in closure_hash: - if rp_la in closure_hash[rp_la_hash]: - t0_0 = time.time() - t_set_0 += t0_0 - t0 - continue - t0_0 = time.time() - t_set_0 += t0_0 - t0 - else: - closure_hash[rp_la_hash] = [] - ''' - if rp_la in closure: - t0_0 = time.time() - t_set_0 += t0_0 - t0 - continue - t0_0 = time.time() - closure.add(rp_la) - #closure_hash[rp_la_hash].append(rp_la) - t1 = time.time() - t_set_0 += t0_0 - t0 - t_set_1 += t1 - t0_0 - rp = rp_la.rp - la = rp_la.la - - if rp.is_satisfied: - continue - if rp.next.is_term: - continue - - t2 = time.time() - - # cache these calculations inside each RulePtr - # see grammar_analysis.py:79 - l = [] - ''' - i = rp.index + 1 - n = len(rp.rule.expansion) - l2_i = self.lr1_cache2.get((rp.rule, i), None) - l2 = [] - if l2_i is None: - while i < n: - s = rp.rule.expansion[i] - l2.extend(self.FIRST.get(s, [])) - if not s in self.NULLABLE: - break - i += 1 - self.lr1_cache2[(rp.rule, i)] = (l2, i) - else: - l2 = l2_i[0] - i = l2_i[1] - - l.extend(l2) - ''' - # this function call seems really slow (see grammar_analysis.t_call above) - # tried making it not a method call so don't need to look up vtable - # still equally slow - l2, nullable = rp.first(rp.index + 1, self.FIRST, self.NULLABLE, time.time()) - #l2, nullable = grammar_analysis.first(rp, rp.index + 1, self.FIRST, self.NULLABLE, time.time()) - #l.extend(l2) - l = l2 - t3 = time.time() - - t_expand += t3 - t2 - - # if we don't modify l2 and add an extra check in the loop below, - # we don't have to copy it - # if all of rp.rule.expansion[rp.index + 1:] were nullable: - #if nullable: - # l.append(la) - - t4 = time.time() - x = rp.next_rules_by_origin(self.lr0_rules_by_origin) - t5 = time.time() - - # usually between 20-60? seen as high as ~175 - y = max(y, len(x) * len(l)) - #print('adding {} * {} rules to closure max {}'.format(len(x), len(l), y)) - for r in x: - for s in l: - # cache RulePtr(r, 0) in r (no duplicate RulePtr objects) - # cache r._rp in _rp (1 less object property lookup?) 
- _rp = r._rp - if _rp is None: - _rp = RulePtr(r, 0) - r._rp = _rp - q.append(_rp.lookahead(s)) - #q.append((r._rp, s)) - if nullable: - _rp = r._rp - if _rp is None: - _rp = RulePtr(r, 0) - r._rp = _rp - q.append(_rp.lookahead(la)) - #q.append((r._rp, la)) - - t6 = time.time() - t_rules += t5 - t4 - t_append += t6 - t5 - - #self.lr1_cache[k] = closure - - t_end = time.time() - t_z += t_end - t_start - - t_count += 1 - - if t_count % 1000 == 0: - print('\tGenerating lr1 closure took begin {:.3f}, set contains {:.3f}, set add {:.3f}, get first {:.3f}'.format(t_begin, t_set_0, t_set_1, t_expand)) - print('\tget next rules {:.3f}, append rules {:.3f}, total {:.3f}, call time {:.3f}, count {}'.format(t_rules, t_append, t_z, t_call, t_count)) - print('\tmax number of appends {}'.format(y)) - - return closure diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index b3985ae..657e795 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -17,20 +17,13 @@ class LALR_Parser(object): assert all(r.options is None or r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" analysis = LALR_Analyzer(parser_conf, debug=debug) - t0 = time.time() - analysis.generate_lr0_states() - t1 = time.time() - analysis.discover_lookaheads() - t2 = time.time() - analysis.propagate_lookaheads() - t3 = time.time() - analysis.generate_lalr1_states() - t4 = time.time() - print('Generating lr0 states took {:.3f}'.format(t1 - t0)) - print('Discovering lookaheads took {:.3f}'.format(t2 - t1)) - print('Propagating lookaheads took took {:.3f}'.format(t3 - t2)) - print('Generating lalr states (closure) took {:.3f}'.format(t4 - t3)) - print('-' * 32) + analysis.compute_lr0_states() + analysis.compute_reads_relations() + analysis.compute_read_sets() + analysis.compute_includes_lookback() + analysis.compute_follow_sets() + analysis.compute_lookaheads() + analysis.compute_lalr1_states() callbacks = parser_conf.callbacks self._parse_table = analysis.parse_table @@ -80,9 +73,6 @@ class _Parser: raise UnexpectedToken(token, expected, state=state) def reduce(rule): - if state_stack[-1] == end_state: - return True - size = len(rule.expansion) if size: s = value_stack[-size:] @@ -98,6 +88,9 @@ class _Parser: state_stack.append(new_state) value_stack.append(value) + if state_stack[-1] == end_state: + return True + return False # Main LALR-parser loop From de24fa055df13dc1fdb0edd5cd4e8faed6bd2a6a Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Fri, 9 Aug 2019 12:41:50 +0200 Subject: [PATCH 033/132] Saving _hash for symbols isn't necessary --- lark/grammar.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index 3480651..d975a19 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -3,13 +3,12 @@ from .utils import Serialize ###{standalone class Symbol(Serialize): - __slots__ = ('name', '_hash') + __slots__ = ('name',) is_term = NotImplemented def __init__(self, name): self.name = name - self._hash = hash(self.name) def __eq__(self, other): assert isinstance(other, Symbol), other @@ -19,7 +18,7 @@ class Symbol(Serialize): return not (self == other) def __hash__(self): - return self._hash + return hash(self.name) def __repr__(self): return '%s(%r)' % (type(self).__name__, self.name) @@ -28,13 +27,12 @@ class Symbol(Serialize): class Terminal(Symbol): - __serialize_fields__ = 'name', 'filter_out', '_hash' + __serialize_fields__ = 'name', 'filter_out' is_term = True def __init__(self, name, 
filter_out=False): self.name = name - self._hash = hash(self.name) self.filter_out = filter_out @property @@ -44,7 +42,7 @@ class Terminal(Symbol): class NonTerminal(Symbol): - __serialize_fields__ = 'name', '_hash' + __serialize_fields__ = 'name', is_term = False From 8466981c084e4fb84dae068dbefbdf77a9273c47 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Fri, 9 Aug 2019 13:25:32 +0200 Subject: [PATCH 034/132] Cleanup --- lark/grammar.py | 3 +- lark/parsers/grammar_analysis.py | 34 ++++++---------------- lark/parsers/lalr_analysis.py | 49 ++++++++++++++++++-------------- lark/parsers/lalr_parser.py | 18 ++---------- 4 files changed, 40 insertions(+), 64 deletions(-) diff --git a/lark/grammar.py b/lark/grammar.py index d975a19..91435b2 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -71,7 +71,7 @@ class Rule(Serialize): expansion : a list of symbols order : index of this expansion amongst all rules of the same name """ - __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash', '_rp') + __slots__ = ('origin', 'expansion', 'alias', 'options', 'order', '_hash') __serialize_fields__ = 'origin', 'expansion', 'order', 'alias', 'options' __serialize_namespace__ = Terminal, NonTerminal, RuleOptions @@ -83,7 +83,6 @@ class Rule(Serialize): self.order = order self.options = options self._hash = hash((self.origin, tuple(self.expansion))) - self._rp = None def _deserialize(self): self._hash = hash((self.origin, tuple(self.expansion))) diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index b32f62f..94c32cc 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -3,20 +3,16 @@ from collections import Counter, defaultdict from ..utils import bfs, fzset, classify from ..exceptions import GrammarError from ..grammar import Rule, Terminal, NonTerminal -import time -# optimizations were made so that there should never be two distinct equal RulePtrs -# to help with hashtable lookup class RulePtr(object): - __slots__ = ('rule', 'index', '_advance') + __slots__ = ('rule', 'index') def __init__(self, rule, index): assert isinstance(rule, Rule) assert index <= len(rule.expansion) self.rule = rule self.index = index - self._advance = None def __repr__(self): before = [x.name for x in self.rule.expansion[:self.index]] @@ -27,19 +23,19 @@ class RulePtr(object): def next(self): return self.rule.expansion[self.index] - # don't create duplicate RulePtrs def advance(self, sym): assert self.next == sym - a = self._advance - if a is None: - a = RulePtr(self.rule, self.index + 1) - self._advance = a - return a + return RulePtr(self.rule, self.index+1) @property def is_satisfied(self): return self.index == len(self.rule.expansion) + def __eq__(self, other): + return self.rule == other.rule and self.index == other.index + def __hash__(self): + return hash((self.rule, self.index)) + # state generation ensures no duplicate LR0ItemSets class LR0ItemSet(object): @@ -159,19 +155,11 @@ class GrammarAnalyzer(object): self.lr0_rules_by_origin = classify(lr0_rules, lambda r: r.origin) # cache RulePtr(r, 0) in r (no duplicate RulePtr objects) - for root_rule in lr0_root_rules.values(): - root_rule._rp = RulePtr(root_rule, 0) - self.lr0_start_states = {start: LR0ItemSet([root_rule._rp], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin)) + self.lr0_start_states = {start: LR0ItemSet([RulePtr(root_rule, 0)], self.expand_rule(root_rule.origin, self.lr0_rules_by_origin)) for start, root_rule in lr0_root_rules.items()} self.FIRST, self.FOLLOW, 
self.NULLABLE = calculate_sets(rules) - self.nonterminal_transitions = [] - self.directly_reads = defaultdict(set) - self.reads = defaultdict(set) - self.includes = defaultdict(set) - self.lookback = defaultdict(set) - def expand_rule(self, source_rule, rules_by_origin=None): "Returns all init_ptrs accessible by rule (recursive)" @@ -183,11 +171,7 @@ class GrammarAnalyzer(object): assert not rule.is_term, rule for r in rules_by_origin[rule]: - # don't create duplicate RulePtr objects - init_ptr = r._rp - if init_ptr is None: - init_ptr = RulePtr(r, 0) - r._rp = init_ptr + init_ptr = RulePtr(r, 0) init_ptrs.add(init_ptr) if r.expansion: # if not empty rule diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 4104713..4af2c24 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -15,8 +15,6 @@ from ..exceptions import GrammarError from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet from ..grammar import Rule -import time - ###{standalone class Action: @@ -115,8 +113,8 @@ def traverse(x, S, N, X, R, G, F): S.append(x) d = len(S) N[x] = d - F[x] = G(x) - for y in R(x): + F[x] = G[x] + for y in R[x]: if N[y] == 0: traverse(y, S, N, X, R, G, F) n_x = N[x] @@ -137,9 +135,17 @@ def traverse(x, S, N, X, R, G, F): class LALR_Analyzer(GrammarAnalyzer): + def __init__(self, parser_conf, debug=False): + GrammarAnalyzer.__init__(self, parser_conf, debug) + self.nonterminal_transitions = [] + self.directly_reads = defaultdict(set) + self.reads = defaultdict(set) + self.includes = defaultdict(set) + self.lookback = defaultdict(set) + def compute_lr0_states(self): - self.states = set() + self.lr0_states = set() # map of kernels to LR0ItemSets cache = {} @@ -161,7 +167,7 @@ class LALR_Analyzer(GrammarAnalyzer): state.transitions[sym] = new_state yield new_state - self.states.add(state) + self.lr0_states.add(state) for _ in bfs(self.lr0_start_states.values(), step): pass @@ -174,14 +180,14 @@ class LALR_Analyzer(GrammarAnalyzer): assert(rp.index == 0) self.directly_reads[(root, rp.next)] = set([ Terminal('$END') ]) - for state in self.states: + for state in self.lr0_states: seen = set() for rp in state.closure: if rp.is_satisfied: continue s = rp.next # if s is a not a nonterminal - if not s in self.lr0_rules_by_origin: + if s not in self.lr0_rules_by_origin: continue if s in seen: continue @@ -201,11 +207,6 @@ class LALR_Analyzer(GrammarAnalyzer): if s2 in self.NULLABLE: r.add((next_state, s2)) - def compute_read_sets(self): - R = lambda nt: self.reads[nt] - G = lambda nt: self.directly_reads[nt] - self.read_sets = digraph(self.nonterminal_transitions, R, G) - def compute_includes_lookback(self): for nt in self.nonterminal_transitions: state, nonterminal = nt @@ -220,9 +221,8 @@ class LALR_Analyzer(GrammarAnalyzer): s = rp.rule.expansion[i] nt2 = (state2, s) state2 = state2.transitions[s] - if not nt2 in self.reads: + if nt2 not in self.reads: continue - j = i + 1 for j in range(i + 1, len(rp.rule.expansion)): if not rp.rule.expansion[j] in self.NULLABLE: break @@ -236,20 +236,18 @@ class LALR_Analyzer(GrammarAnalyzer): for nt2 in includes: self.includes[nt2].add(nt) - def compute_follow_sets(self): - R = lambda nt: self.includes[nt] - G = lambda nt: self.read_sets[nt] - self.follow_sets = digraph(self.nonterminal_transitions, R, G) - def compute_lookaheads(self): + read_sets = digraph(self.nonterminal_transitions, self.reads, self.directly_reads) + follow_sets = digraph(self.nonterminal_transitions, self.includes, read_sets) + for nt, 
lookbacks in self.lookback.items(): for state, rule in lookbacks: - for s in self.follow_sets[nt]: + for s in follow_sets[nt]: state.lookaheads[s].add(rule) def compute_lalr1_states(self): m = {} - for state in self.states: + for state in self.lr0_states: actions = {} for la, next_state in state.transitions.items(): actions[la] = (Shift, next_state.closure) @@ -281,3 +279,10 @@ class LALR_Analyzer(GrammarAnalyzer): self.parse_table = self._parse_table else: self.parse_table = IntParseTable.from_ParseTable(self._parse_table) + + def compute_lalr(self): + self.compute_lr0_states() + self.compute_reads_relations() + self.compute_includes_lookback() + self.compute_lookaheads() + self.compute_lalr1_states() \ No newline at end of file diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 657e795..82c8bba 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -8,8 +8,6 @@ from ..utils import Enumerator, Serialize from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable -import time - ###{standalone class LALR_Parser(object): @@ -17,13 +15,7 @@ class LALR_Parser(object): assert all(r.options is None or r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" analysis = LALR_Analyzer(parser_conf, debug=debug) - analysis.compute_lr0_states() - analysis.compute_reads_relations() - analysis.compute_read_sets() - analysis.compute_includes_lookback() - analysis.compute_follow_sets() - analysis.compute_lookaheads() - analysis.compute_lalr1_states() + analysis.compute_lalr() callbacks = parser_conf.callbacks self._parse_table = analysis.parse_table @@ -88,11 +80,6 @@ class _Parser: state_stack.append(new_state) value_stack.append(value) - if state_stack[-1] == end_state: - return True - - return False - # Main LALR-parser loop for token in stream: while True: @@ -111,7 +98,8 @@ class _Parser: while True: _action, arg = get_action(token) assert(_action is Reduce) - if reduce(arg): + reduce(arg) + if state_stack[-1] == end_state: return value_stack[-1] ###} From e8c67839c22956586ae6f63e59a727565cd81ba9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mac=C3=AAdo=20Mendes?= Date: Fri, 9 Aug 2019 18:53:02 -0300 Subject: [PATCH 035/132] Test if lexer correctly detects newlines --- tests/test_parser.py | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/tests/test_parser.py b/tests/test_parser.py index 599406f..d0aeb1c 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1558,6 +1558,27 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(parser.parse('xa', 'a'), Tree('a', [])) self.assertEqual(parser.parse('xb', 'b'), Tree('b', [])) + def test_lexer_detect_newline_tokens(self): + # Detect newlines in regular tokens + g = Lark(r"""start: "go" tail* + tail : SA "a" | SB "b" | SC "c" | SD "d" + SA : /\n/ + SB : /./ + SC : /[^a-z]/ + SD : /\s/g + """, parser=PARSER, lexer=LEXER) + _, _, a, _, b, _, c, _, d = g.lex('go\na\nb\nc\nd') + self.assertEqual(a.line, 2) + self.assertEqual(b.line, 3) + self.assertEqual(c.line, 4) + self.assertEqual(d.line, 5) + + # Detect newlines in ignored tokens + for re in ['/\\n/', '/[^a-z]/', '/\\s/']: + g = Lark('start: "a" [start]\n%ignore {}'.format(re), lexer=LEXER, parser=PARSER) + a, b = g.lex('a\na') + self.assertEqual(a.line, 1) + self.assertEqual(b.line, 2) _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() From 1e4dbac58cbec032fd0271b5f1dac26ea2461068 Mon Sep 17 00:00:00 2001 From: 
=?UTF-8?q?F=C3=A1bio=20Mac=C3=AAdo=20Mendes?= Date: Fri, 9 Aug 2019 18:54:44 -0300 Subject: [PATCH 036/132] Fix undetected newlines on ignored tokens --- lark/lexer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/lark/lexer.py b/lark/lexer.py index d3e4af6..377fab6 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -270,8 +270,9 @@ def _regexp_has_newline(r): - escaped newline (\\n) - anything but ([^...]) - any-char (.) when the flag (?s) exists + - spaces (\s) """ - return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r) + return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' in r) class Lexer(object): """Lexer interface From 03ae3e1c0f47a399e729b613884bc4463d9bba23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Mac=C3=AAdo=20Mendes?= Date: Fri, 9 Aug 2019 18:55:12 -0300 Subject: [PATCH 037/132] Add .idea (Pycharm) to .gitignore --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 293aae0..710a131 100644 --- a/.gitignore +++ b/.gitignore @@ -4,6 +4,7 @@ /lark_parser.egg-info/** tags .vscode +.idea .ropeproject .cache /dist From c5c763580e79f1521d992dcba9e9ba5d9742bc06 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 10 Aug 2019 14:09:37 +0200 Subject: [PATCH 038/132] Fixed test for newline detection --- tests/test_parser.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index d0aeb1c..82da48c 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1560,14 +1560,14 @@ def _make_parser_test(LEXER, PARSER): def test_lexer_detect_newline_tokens(self): # Detect newlines in regular tokens - g = Lark(r"""start: "go" tail* - tail : SA "a" | SB "b" | SC "c" | SD "d" - SA : /\n/ - SB : /./ - SC : /[^a-z]/ - SD : /\s/g - """, parser=PARSER, lexer=LEXER) - _, _, a, _, b, _, c, _, d = g.lex('go\na\nb\nc\nd') + g = _Lark(r"""start: "go" tail* + !tail : SA "@" | SB "@" | SC "@" | SD "@" + SA : "a" /\n/ + SB : /b./s + SC : "c" /[^a-z]/ + SD : "d" /\s/ + """) + a,b,c,d = [x.children[1] for x in g.parse('goa\n@b\n@c\n@d\n@').children] self.assertEqual(a.line, 2) self.assertEqual(b.line, 3) self.assertEqual(c.line, 4) @@ -1575,8 +1575,9 @@ def _make_parser_test(LEXER, PARSER): # Detect newlines in ignored tokens for re in ['/\\n/', '/[^a-z]/', '/\\s/']: - g = Lark('start: "a" [start]\n%ignore {}'.format(re), lexer=LEXER, parser=PARSER) - a, b = g.lex('a\na') + g = _Lark('''!start: "a" "a" + %ignore {}'''.format(re)) + a, b = g.parse('a\na').children self.assertEqual(a.line, 1) self.assertEqual(b.line, 2) From 71c4abfb245c30507cf0dc3ae1d0f62895282121 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 11 Aug 2019 11:26:31 +0200 Subject: [PATCH 039/132] Fixed error message (Issue #380) --- lark/load_grammar.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index f6c1d22..8cd5742 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -538,7 +538,7 @@ class Grammar: for dups in duplicates.values(): if len(dups) > 1: if dups[0].expansion: - raise GrammarError("Rules defined twice: %s" % ', '.join(str(i) for i in duplicates)) + raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)" % ''.join('\n * %s' % i for i in dups)) # Empty rule; assert all other attributes are equal assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups) From 
9ca74d7f67a30bfcd2312a537051489f8b2612eb Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 13 Aug 2019 16:38:02 +0200 Subject: [PATCH 040/132] Added the serialize tool for exporting Lark state & analysis --- lark/lexer.py | 5 ++++- lark/tools/serialize.py | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 44 insertions(+), 1 deletion(-) create mode 100644 lark/tools/serialize.py diff --git a/lark/lexer.py b/lark/lexer.py index 377fab6..0966a81 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -8,7 +8,6 @@ from .exceptions import UnexpectedCharacters, LexError ###{standalone class Pattern(Serialize): - __serialize_fields__ = 'value', 'flags' def __init__(self, value, flags=()): self.value = value @@ -41,6 +40,8 @@ class Pattern(Serialize): class PatternStr(Pattern): + __serialize_fields__ = 'value', 'flags' + type = "str" def to_regexp(self): @@ -52,6 +53,8 @@ class PatternStr(Pattern): max_width = min_width class PatternRE(Pattern): + __serialize_fields__ = 'value', 'flags', '_width' + type = "re" def to_regexp(self): diff --git a/lark/tools/serialize.py b/lark/tools/serialize.py new file mode 100644 index 0000000..ec4fca5 --- /dev/null +++ b/lark/tools/serialize.py @@ -0,0 +1,40 @@ +import codecs +import sys +import json + +from lark import Lark +from lark.grammar import RuleOptions, Rule +from lark.lexer import TerminalDef + +import argparse + +argparser = argparse.ArgumentParser(prog='python -m lark.tools.serialize') #description='''Lark Serialization Tool -- Stores Lark's internal state & LALR analysis as a convenient JSON file''') + +argparser.add_argument('grammar_file', type=argparse.FileType('r'), help='A valid .lark file') +argparser.add_argument('-o', '--out', type=argparse.FileType('w'), default=sys.stdout, help='json file path to create (default=stdout)') +argparser.add_argument('-s', '--start', default='start', help='start symbol (default="start")') +argparser.add_argument('-l', '--lexer', default='standard', choices=['standard', 'contextual'], help='lexer type (default="standard")') + + +def serialize(infile, outfile, lexer, start): + lark_inst = Lark(infile, parser="lalr", lexer=lexer, start=start) # TODO contextual + + data, memo = lark_inst.memo_serialize([TerminalDef, Rule]) + outfile.write('{\n') + outfile.write(' "data": %s,\n' % json.dumps(data)) + outfile.write(' "memo": %s\n' % json.dumps(memo)) + outfile.write('}\n') + + +def main(): + if len(sys.argv) == 1 or '-h' in sys.argv or '--help' in sys.argv: + print("Lark Serialization Tool - Stores Lark's internal state & LALR analysis as a JSON file") + print("") + argparser.print_help() + else: + args = argparser.parse_args() + + serialize(args.grammar_file, args.out, args.lexer, args.start) + +if __name__ == '__main__': + main() \ No newline at end of file From 3cdee35af57dbd0a3f9773ade8486044ab8720fc Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 14 Aug 2019 11:31:43 +0200 Subject: [PATCH 041/132] Version bump --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index db2ce44..dce9e17 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une from .lexer import Token from .lark import Lark -__version__ = "0.7.2" +__version__ = "0.7.3" From d5036eefddbbeeff43b63d0f2e5f7d212ce96033 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 14 Aug 2019 23:37:20 +0200 Subject: [PATCH 042/132] Serialize tool: Multiple start symbols + bugfix --- 
lark/tools/serialize.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lark/tools/serialize.py b/lark/tools/serialize.py index ec4fca5..fb69d35 100644 --- a/lark/tools/serialize.py +++ b/lark/tools/serialize.py @@ -12,7 +12,7 @@ argparser = argparse.ArgumentParser(prog='python -m lark.tools.serialize') #desc argparser.add_argument('grammar_file', type=argparse.FileType('r'), help='A valid .lark file') argparser.add_argument('-o', '--out', type=argparse.FileType('w'), default=sys.stdout, help='json file path to create (default=stdout)') -argparser.add_argument('-s', '--start', default='start', help='start symbol (default="start")') +argparser.add_argument('-s', '--start', default='start', help='start symbol (default="start")', nargs='+') argparser.add_argument('-l', '--lexer', default='standard', choices=['standard', 'contextual'], help='lexer type (default="standard")') @@ -33,8 +33,7 @@ def main(): argparser.print_help() else: args = argparser.parse_args() - - serialize(args.grammar_file, args.out, args.lexer, args.start) + serialize(args.grammar_file, args.out, args.lexer, args.start) if __name__ == '__main__': main() \ No newline at end of file From 7e8488d1a01bdb1faa5175f6fa40fd3b84b22fce Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Thu, 15 Aug 2019 18:06:42 +0200 Subject: [PATCH 043/132] Fixed issue #425, keeping in mind unicode issue #411 --- lark/lexer.py | 2 +- lark/load_grammar.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 0966a81..48d8904 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -101,7 +101,7 @@ class Token(Str): self.type = type_ self.pos_in_stream = pos_in_stream - self.value = Str(value) + self.value = value self.line = line self.column = column self.end_line = end_line diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 8cd5742..12ae38f 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -12,7 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder from .parser_frontends import LALR_TraditionalLexer from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol -from .utils import classify, suppress, dedup_list +from .utils import classify, suppress, dedup_list, Str from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken from .tree import Tree, SlottedTree as ST @@ -451,9 +451,9 @@ class PrepareSymbols(Transformer_InPlace): if isinstance(v, Tree): return v elif v.type == 'RULE': - return NonTerminal(v.value) + return NonTerminal(Str(v.value)) elif v.type == 'TERMINAL': - return Terminal(v.value, filter_out=v.startswith('_')) + return Terminal(Str(v.value), filter_out=v.startswith('_')) assert False def _choice_of_rules(rules): From ad9a9cf37a3f10030d0f93e838e8f76f74b21327 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 17 Aug 2019 22:20:11 +0200 Subject: [PATCH 044/132] Added readthedocs.yml (Issue #426) --- readthedocs.yml | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 readthedocs.yml diff --git a/readthedocs.yml b/readthedocs.yml new file mode 100644 index 0000000..9eb8c0d --- /dev/null +++ b/readthedocs.yml @@ -0,0 +1,2 @@ +version: 2 +formats: all From 4bf67aa2d616f1f87630833af7cf0d939e638a6b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 17 Aug 2019 22:25:44 +0200 Subject: [PATCH 045/132] Fix for readthedocs --- readthedocs.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/readthedocs.yml b/readthedocs.yml index 9eb8c0d..f465212 100644 --- 
a/readthedocs.yml +++ b/readthedocs.yml @@ -1,2 +1,5 @@ version: 2 +mkdocs: + configuration: mkdocs.yml + fail_on_warning: false formats: all From 06bc432de3bb9fbcba8102b299ae8804cc69ed70 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 17 Aug 2019 22:32:20 +0200 Subject: [PATCH 046/132] Fix for readthedocs (another attempt) --- readthedocs.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/readthedocs.yml b/readthedocs.yml index f465212..dc59191 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -1,5 +1,10 @@ version: 2 + mkdocs: configuration: mkdocs.yml fail_on_warning: false + formats: all + +python: + version: 3.7 From 4266db9ca1c3fe59510f3190a3945bf39d35c08c Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 17 Aug 2019 22:41:02 +0200 Subject: [PATCH 047/132] Fix for readthedocs (yet another attempt) --- readthedocs.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readthedocs.yml b/readthedocs.yml index dc59191..080eeeb 100644 --- a/readthedocs.yml +++ b/readthedocs.yml @@ -7,4 +7,4 @@ mkdocs: formats: all python: - version: 3.7 + version: 3.5 From d845aa3bf1cf6408df27b843c02dd9b4b729c41a Mon Sep 17 00:00:00 2001 From: night199uk Date: Sat, 17 Aug 2019 11:04:11 -0700 Subject: [PATCH 048/132] Add debug flag to Earley and XEarley to allow dumping the SPPF --- lark/parser_frontends.py | 5 ++++- lark/parsers/earley.py | 7 ++++++- lark/parsers/xearley.py | 4 ++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index c1bb3c9..8423ae4 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -139,7 +139,8 @@ class Earley(WithLexer): self.init_traditional_lexer() resolve_ambiguity = options.ambiguity == 'resolve' - self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity) + debug = options.debug if options else False + self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug) def match(self, term, token): return term.name == token.type @@ -152,10 +153,12 @@ class XEarley(_ParserFrontend): self._prepare_match(lexer_conf) resolve_ambiguity = options.ambiguity == 'resolve' + debug = options.debug if options else False self.parser = xearley.Parser(parser_conf, self.match, ignore=lexer_conf.ignore, resolve_ambiguity=resolve_ambiguity, + debug=debug, **kw ) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 4d6201b..a98be02 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -20,10 +20,11 @@ from .earley_common import Item, TransitiveItem from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor class Parser: - def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True): + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, debug=False): analysis = GrammarAnalyzer(parser_conf) self.parser_conf = parser_conf self.resolve_ambiguity = resolve_ambiguity + self.debug = debug self.FIRST = analysis.FIRST self.NULLABLE = analysis.NULLABLE @@ -296,6 +297,10 @@ class Parser: # symbol should have been completed in the last step of the Earley cycle, and will be in # this column. Find the item for the start_symbol, which is the root of the SPPF tree.
solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] + if self.debug: + from .earley_forest import ForestToPyDotVisitor + debug_walker = ForestToPyDotVisitor() + debug_walker.visit(solutions[0], "sppf.png") if not solutions: expected_tokens = [t.expect for t in to_scan] diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 4ab3ba9..3898d6a 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -24,8 +24,8 @@ from .earley_forest import SymbolNode class Parser(BaseParser): - def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False): - BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity) + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False, debug=False): + BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity, debug) self.ignore = [Terminal(t) for t in ignore] self.complete_lex = complete_lex From dc94ebc42f984ed5b1de6c08eba87c808d790bc7 Mon Sep 17 00:00:00 2001 From: night199uk Date: Sat, 17 Aug 2019 17:53:26 -0700 Subject: [PATCH 049/132] Fix Earley non-determinism Rule.order should be set as the index of each expansion with rules of the same name (e.g: a : b # rule.order 1 | c # rule.order 2). --- lark/load_grammar.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 12ae38f..7b3bb3f 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -511,12 +511,12 @@ class Grammar: simplify_rule = SimplifyRule_Visitor() compiled_rules = [] - for i, rule_content in enumerate(rules): + for rule_content in rules: name, tree, options = rule_content simplify_rule.visit(tree) expansions = rule_tree_to_text.transform(tree) - for expansion, alias in expansions: + for i, (expansion, alias) in enumerate(expansions): if alias and name.startswith('_'): raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) From dc3c009dca02052a8a1df700d22413efd9abcf01 Mon Sep 17 00:00:00 2001 From: night199uk Date: Mon, 19 Aug 2019 20:35:27 -0700 Subject: [PATCH 050/132] Tweak the Earley ambiguity algorithm to correctly prefer earlier branches --- lark/parsers/earley_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py index 89522cd..bbceb42 100644 --- a/lark/parsers/earley_forest.py +++ b/lark/parsers/earley_forest.py @@ -122,7 +122,7 @@ class PackedNode(ForestNode): ambiguously. Hence, we use the sort order to identify the order in which ambiguous children should be considered. 
""" - return self.is_empty, -self.priority, -self.rule.order + return self.is_empty, -self.priority, self.rule.order def __iter__(self): return iter([self.left, self.right]) From 59f3a5707bb486b2127a27e145a6b0058bac89b9 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 20 Aug 2019 11:19:16 +0200 Subject: [PATCH 051/132] Fixed partials (Issue #398) --- lark/utils.py | 2 +- tests/test_trees.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/lark/utils.py b/lark/utils.py index d46beec..afcb072 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -160,7 +160,7 @@ def smart_decorator(f, create_decorator): elif isinstance(f, partial): # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445 - return create_decorator(f.__func__, True) + return wraps(f.func)(create_decorator(f.func, True)) else: return create_decorator(f.__func__.__call__, True) diff --git a/tests/test_trees.py b/tests/test_trees.py index 38f74d5..b28ace2 100644 --- a/tests/test_trees.py +++ b/tests/test_trees.py @@ -4,6 +4,7 @@ import unittest from unittest import TestCase import copy import pickle +import functools from lark.tree import Tree from lark.visitors import Transformer, Interpreter, visit_children_decor, v_args, Discard @@ -146,6 +147,22 @@ class TestTrees(TestCase): res = T().transform(t) self.assertEqual(res, 2.9) + def test_partial(self): + + tree = Tree("start", [Tree("a", ["test1"]), Tree("b", ["test2"])]) + + def test(t, s): + return s.upper() + + @v_args(inline=True) + class T(Transformer): + a = functools.partial(test) + b = functools.partial(lambda t, s: s + "!") + + res = T().transform(tree) + assert res.children == ["TEST1", "test2!"] + + def test_discard(self): class MyTransformer(Transformer): def a(self, args): From def1d2931c70c096d7941a2b7df9eb0f2814cd7d Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Tue, 20 Aug 2019 11:34:48 +0200 Subject: [PATCH 052/132] Fixed partials (Issue #398) --- lark/utils.py | 2 +- tests/test_trees.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/lark/utils.py b/lark/utils.py index afcb072..5eb2333 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -160,7 +160,7 @@ def smart_decorator(f, create_decorator): elif isinstance(f, partial): # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445 - return wraps(f.func)(create_decorator(f.func, True)) + return wraps(f.func)(create_decorator(lambda *args, **kw: f(*args[1:], **kw), True)) else: return create_decorator(f.__func__.__call__, True) diff --git a/tests/test_trees.py b/tests/test_trees.py index b28ace2..4216bd6 100644 --- a/tests/test_trees.py +++ b/tests/test_trees.py @@ -151,16 +151,16 @@ class TestTrees(TestCase): tree = Tree("start", [Tree("a", ["test1"]), Tree("b", ["test2"])]) - def test(t, s): - return s.upper() + def test(prefix, s, postfix): + return prefix + s.upper() + postfix @v_args(inline=True) class T(Transformer): - a = functools.partial(test) - b = functools.partial(lambda t, s: s + "!") + a = functools.partial(test, "@", postfix="!") + b = functools.partial(lambda s: s + "!") res = T().transform(tree) - assert res.children == ["TEST1", "test2!"] + assert res.children == ["@TEST1!", "test2!"] def test_discard(self): From 464f720385c3e67a0ddfcd61b586cfe3b3a32253 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 21 Aug 2019 00:30:03 +0200 Subject: [PATCH 053/132] Fix links in README (Issue #422) --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md 
b/README.md index 975b9a4..b9a1bda 100644 --- a/README.md +++ b/README.md @@ -72,7 +72,7 @@ Lark is great at handling ambiguity. Let's parse the phrase "fruit flies like ba ![fruitflies.png](examples/fruitflies.png) -See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples) +See more [examples here](https://github.com/lark-parser/lark/tree/master/examples) @@ -95,7 +95,7 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples) - Extensive test suite [![codecov](https://codecov.io/gh/erezsh/lark/branch/master/graph/badge.svg)](https://codecov.io/gh/erezsh/lark) - And much more! -See the full list of [features in the wiki](https://github.com/erezsh/lark/wiki/Features) +See the full list of [features here](https://lark-parser.readthedocs.io/en/latest/features/) ### Comparison to other libraries From c00f4448faae5d6f05487f021c8a7ad2055c60c6 Mon Sep 17 00:00:00 2001 From: Michael Heyvaert Date: Wed, 21 Aug 2019 12:14:28 +0200 Subject: [PATCH 054/132] fix custom lexer handling for lalr parser + test --- lark/parser_frontends.py | 2 +- tests/__main__.py | 1 + tests/test_parser.py | 19 +++++++++++++++---- 3 files changed, 17 insertions(+), 5 deletions(-) diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 8423ae4..ec82299 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -118,7 +118,7 @@ class LALR_ContextualLexer(LALR_WithLexer): class LALR_CustomLexer(LALR_WithLexer): def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None): - self.lexer = lexer_cls(self.lexer_conf) + self.lexer = lexer_cls(lexer_conf) debug = options.debug if options else False self.parser = LALR_Parser(parser_conf, debug=debug) WithLexer.__init__(self, lexer_conf, parser_conf, options) diff --git a/tests/__main__.py b/tests/__main__.py index 1c8a951..4762773 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -21,6 +21,7 @@ from .test_parser import ( TestCykStandard, TestLalrContextual, TestEarleyDynamic, + TestLalrCustom, # TestFullEarleyStandard, TestFullEarleyDynamic, diff --git a/tests/test_parser.py b/tests/test_parser.py index 82da48c..4db5ce9 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -22,7 +22,7 @@ from lark.exceptions import GrammarError, ParseError, UnexpectedToken, Unexpecte from lark.tree import Tree from lark.visitors import Transformer, Transformer_InPlace, v_args from lark.grammar import Rule -from lark.lexer import TerminalDef +from lark.lexer import TerminalDef, Lexer, TraditionalLexer __path__ = os.path.dirname(__file__) def _read(n, *args): @@ -431,12 +431,22 @@ def _make_full_earley_test(LEXER): _TestFullEarley.__name__ = _NAME globals()[_NAME] = _TestFullEarley +class CustomLexer(Lexer): + """ + Purpose of this custom lexer is to test the integration, + so it uses the traditionalparser as implementation without custom lexing behaviour. 
+ """ + def __init__(self, lexer_conf): + self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks) + def lex(self, *args, **kwargs): + return self.lexer.lex(*args, **kwargs) def _make_parser_test(LEXER, PARSER): + lexer_class_or_name = CustomLexer if LEXER == 'custom' else LEXER def _Lark(grammar, **kwargs): - return Lark(grammar, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs) + return Lark(grammar, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs) def _Lark_open(gfilename, **kwargs): - return Lark.open(gfilename, lexer=LEXER, parser=PARSER, propagate_positions=True, **kwargs) + return Lark.open(gfilename, lexer=lexer_class_or_name, parser=PARSER, propagate_positions=True, **kwargs) class _TestParser(unittest.TestCase): def test_basic1(self): g = _Lark("""start: a+ b a* "b" a* @@ -1532,7 +1542,7 @@ def _make_parser_test(LEXER, PARSER): parser = _Lark(grammar) - @unittest.skipIf(PARSER!='lalr', "Serialize currently only works for LALR parsers (though it should be easy to extend)") + @unittest.skipIf(PARSER!='lalr' or LEXER=='custom', "Serialize currently only works for LALR parsers without custom lexers (though it should be easy to extend)") def test_serialize(self): grammar = """ start: _ANY b "C" @@ -1594,6 +1604,7 @@ _TO_TEST = [ ('dynamic_complete', 'earley'), ('standard', 'lalr'), ('contextual', 'lalr'), + ('custom', 'lalr'), # (None, 'earley'), ] From b6b95c3ff01896a45b7835a7375203969a8040e3 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 26 Aug 2019 15:45:35 +0200 Subject: [PATCH 056/132] Raw docstring to avoid escape warnings (Issue #438) --- lark/lexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/lexer.py b/lark/lexer.py index 48d8904..9cd7adb 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -268,7 +268,7 @@ def build_mres(terminals, match_whole=False): return _build_mres(terminals, len(terminals), match_whole) def _regexp_has_newline(r): - """Expressions that may indicate newlines in a regexp: + r"""Expressions that may indicate newlines in a regexp: - newlines (\n) - escaped newline (\\n) - anything but ([^...]) From 11cd11394f6dcc88bf80962642932ada2d1e9efb Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 27 Aug 2019 23:53:08 +0200 Subject: [PATCH 057/132] Possibly a fix for issue #441 --- lark/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/utils.py b/lark/utils.py index 5eb2333..9513b8b 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -172,7 +172,7 @@ import sre_parse import sre_constants def get_regexp_width(regexp): try: - return sre_parse.parse(regexp).getwidth() + return [int(x) for x in sre_parse.parse(regexp).getwidth()] except sre_constants.error: raise ValueError(regexp) From a17311785711aceb1bc0211f5631d55f4256a72b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 28 Aug 2019 23:20:41 +0200 Subject: [PATCH 058/132] Included iter_subtrees and related methods in standalone parser (Issue #440) --- lark/tree.py | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/lark/tree.py b/lark/tree.py index fd0038e..ee8dfb7 100644 --- a/lark/tree.py +++ b/lark/tree.py @@ -56,30 +56,6 @@ class Tree(object): def __hash__(self): return hash((self.data, tuple(self.children))) -###} - - def expand_kids_by_index(self, *indices): - "Expand (inline) children at the given indices" - for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect 
indices - kid = self.children[i] - self.children[i:i+1] = kid.children - - def find_pred(self, pred): - "Find all nodes where pred(tree) == True" - return filter(pred, self.iter_subtrees()) - - def find_data(self, data): - "Find all nodes where tree.data == data" - return self.find_pred(lambda t: t.data == data) - - def scan_values(self, pred): - for c in self.children: - if isinstance(c, Tree): - for t in c.scan_values(pred): - yield t - else: - if pred(c): - yield c def iter_subtrees(self): # TODO: Re-write as a more efficient version @@ -102,6 +78,31 @@ class Tree(object): yield x seen.add(id(x)) + def find_pred(self, pred): + "Find all nodes where pred(tree) == True" + return filter(pred, self.iter_subtrees()) + + def find_data(self, data): + "Find all nodes where tree.data == data" + return self.find_pred(lambda t: t.data == data) + +###} + + def expand_kids_by_index(self, *indices): + "Expand (inline) children at the given indices" + for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices + kid = self.children[i] + self.children[i:i+1] = kid.children + + def scan_values(self, pred): + for c in self.children: + if isinstance(c, Tree): + for t in c.scan_values(pred): + yield t + else: + if pred(c): + yield c + def iter_subtrees_topdown(self): stack = [self] while stack: From 56978206a37476b919980c2ccff6d36141ae4161 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 28 Aug 2019 23:40:20 +0200 Subject: [PATCH 059/132] No longer confusing aliases and rules when importing (Issue #433) --- lark/load_grammar.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 7b3bb3f..4ecfd22 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -605,6 +605,7 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases): _, tree, _ = imported_rules[symbol] except KeyError: raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace)) + tree = next(tree.find_data("expansion")) # Skip "alias" or other annotations return tree.scan_values(lambda x: x.type in ('RULE', 'TERMINAL')) def get_namespace_name(name): From f06a83a8a79e0507ae58e7f8c5af8888e1d92da8 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 28 Aug 2019 23:50:36 +0200 Subject: [PATCH 060/132] Better error for literal with bad escaping (Issue #287) --- lark/load_grammar.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 4ecfd22..83ee119 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -351,7 +351,10 @@ def _fix_escaping(s): for n in i: w += n if n == '\\': - n2 = next(i) + try: + n2 = next(i) + except StopIteration: + raise ValueError("Literal ended unexpectedly (bad escaping): `%r`" % s) if n2 == '\\': w += '\\\\' elif n2 not in 'uxnftr': From 71b17d8e7ce5fe113c3ffa0e9c7d7e3ab298636a Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Thu, 5 Sep 2019 17:12:02 +0300 Subject: [PATCH 061/132] Update __init__.py --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index dce9e17..2b75d7a 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une from .lexer import Token from .lark import Lark -__version__ = "0.7.3" +__version__ = "0.7.4" From 571bb400e3ca6ba0d262bdbc42d4c969dea47345 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Thu, 5 Sep 2019 17:35:17 +0300 Subject: [PATCH 062/132] 
Bugfix for regression (Issue #445) --- lark/load_grammar.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 83ee119..90911fd 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -608,8 +608,9 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases): _, tree, _ = imported_rules[symbol] except KeyError: raise GrammarError("Missing symbol '%s' in grammar %s" % (symbol, namespace)) - tree = next(tree.find_data("expansion")) # Skip "alias" or other annotations - return tree.scan_values(lambda x: x.type in ('RULE', 'TERMINAL')) + + return _find_used_symbols(tree) + def get_namespace_name(name): try: @@ -686,6 +687,11 @@ class PrepareGrammar(Transformer_InPlace): return name +def _find_used_symbols(tree): + assert tree.data == 'expansions' + return {t for x in tree.find_data('expansion') + for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))} + class GrammarLoader: def __init__(self): terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] @@ -847,9 +853,7 @@ class GrammarLoader: rule_names.add(name) for name, expansions, _o in rules: - used_symbols = {t for x in expansions.find_data('expansion') - for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))} - for sym in used_symbols: + for sym in _find_used_symbols(expansions): if sym.type == 'TERMINAL': if sym not in terminal_names: raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name)) From 54b18e596158071e481aae6e6d74d2f85b2ee4e1 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Fri, 6 Sep 2019 08:11:45 +0300 Subject: [PATCH 063/132] Allow transformers to transform tokens (Issue #389) --- lark/visitors.py | 31 ++++++++++++++++++++++++++++++- tests/test_parser.py | 18 ++++++++++++++++++ 2 files changed, 48 insertions(+), 1 deletion(-) diff --git a/lark/visitors.py b/lark/visitors.py index 4a0f639..7d40e74 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -3,6 +3,7 @@ from functools import wraps from .utils import smart_decorator from .tree import Tree from .exceptions import VisitError, GrammarError +from .lexer import Token ###{standalone from inspect import getmembers, getmro @@ -21,6 +22,10 @@ class Transformer: Can be used to implement map or reduce. 
""" + __visit_tokens__ = False # For backwards compatibility + def __init__(self, visit_tokens=False): + self.__visit_tokens__ = visit_tokens + def _call_userfunc(self, tree, new_children=None): # Assumes tree is already transformed children = new_children if new_children is not None else tree.children @@ -45,10 +50,29 @@ class Transformer: except Exception as e: raise VisitError(tree, e) + def _call_userfunc_token(self, token): + try: + f = getattr(self, token.type) + except AttributeError: + return self.__default_token__(token) + else: + try: + return f(token) + except (GrammarError, Discard): + raise + except Exception as e: + raise VisitError(token, e) + + def _transform_children(self, children): for c in children: try: - yield self._transform_tree(c) if isinstance(c, Tree) else c + if isinstance(c, Tree): + yield self._transform_tree(c) + elif self.__visit_tokens__ and isinstance(c, Token): + yield self._call_userfunc_token(c) + else: + yield c except Discard: pass @@ -66,6 +90,11 @@ class Transformer: "Default operation on tree (for override)" return Tree(data, children, meta) + def __default_token__(self, token): + "Default operation on token (for override)" + return token + + @classmethod def _apply_decorator(cls, decorator, **kwargs): mro = getmro(cls) diff --git a/tests/test_parser.py b/tests/test_parser.py index 4db5ce9..e9d46e5 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -94,6 +94,24 @@ class TestParsers(unittest.TestCase): r = g.parse('xx') self.assertEqual( r.children[0].data, "c" ) + def test_visit_tokens(self): + class T(Transformer): + def a(self, children): + return children[0] + "!" + def A(self, tok): + return tok.upper() + + # Test regular + g = Lark("""start: a + a : A + A: "x" + """, parser='lalr') + r = T().transform(g.parse("x")) + self.assertEqual( r.children, ["x!"] ) + r = T(True).transform(g.parse("x")) + self.assertEqual( r.children, ["X!"] ) + + def test_embedded_transformer(self): class T(Transformer): def a(self, children): From deb325acb4a4203056f5b51b8457cb0614c10ce1 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Fri, 6 Sep 2019 08:16:42 +0300 Subject: [PATCH 064/132] Pydot now not necessary for earley debug, shows warning instead (Issue #443) --- lark/parsers/earley.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index a98be02..ff4e125 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -10,6 +10,7 @@ is better documented here: http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ """ +import logging from collections import deque from ..visitors import Transformer_InPlace, v_args @@ -299,8 +300,13 @@ class Parser: solutions = [n.node for n in columns[-1] if n.is_complete and n.node is not None and n.s == start_symbol and n.start == 0] if self.debug: from .earley_forest import ForestToPyDotVisitor - debug_walker = ForestToPyDotVisitor() - debug_walker.visit(solutions[0], "sppf.png") + try: + debug_walker = ForestToPyDotVisitor() + except ImportError: + logging.warning("Cannot find dependency 'pydot', will not generate sppf debug image") + else: + debug_walker.visit(solutions[0], "sppf.png") + if not solutions: expected_tokens = [t.expect for t in to_scan] From f43631949cddf6a50fa1bb5d083335b2c6eefee8 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Fri, 6 Sep 2019 08:18:42 +0300 Subject: [PATCH 065/132] Version bump --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py 
b/lark/__init__.py index 2b75d7a..69d9faf 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une from .lexer import Token from .lark import Lark -__version__ = "0.7.4" +__version__ = "0.7.5" From 535aebab3c770d5b3acbe6fa21394c901a1f2345 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 11 Sep 2019 01:05:15 +0300 Subject: [PATCH 066/132] Added to docs (Issue #400) --- docs/grammar.md | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/docs/grammar.md b/docs/grammar.md index 9343ee4..228c8b7 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -1,5 +1,13 @@ # Grammar Reference +Table of contents: + +1. [Definitions](#defs) +1. [Terminals](#terms) +1. [Rules](#rules) +1. [Directives](#dirs) + + ## Definitions **A grammar** is a list of rules and terminals, that together define a language. @@ -25,6 +33,7 @@ Lark begins the parse with the rule 'start', unless specified otherwise in the o Names of rules are always in lowercase, while names of terminals are always in uppercase. This distinction has practical effects, for the shape of the generated parse-tree, and the automatic construction of the lexer (aka tokenizer, or scanner). + ## Terminals Terminals are used to match text into symbols. They can be defined as a combination of literals and other terminals. @@ -70,6 +79,53 @@ WHITESPACE: (" " | /\t/ )+ SQL_SELECT: "select"i ``` +### Regular expressions & Ambiguity + +Each terminal is eventually compiled to a regular expression. All the operators and references inside it are mapped to their respective expressions. + +For example, in the following grammar, `A1` and `A2`, are equivalent: +```perl +A1: "a" | "b" +A2: /a|b/ +``` + +This means that inside terminals, Lark cannot detect or resolve ambiguity, even when using Earley. + +For example, for this grammar: +```perl +start : (A | B)+ +A : "a" | "ab" +B : "b" +``` +We get this behavior: + +```bash +>>> p.parse("ab") +Tree(start, [Token(A, 'a'), Token(B, 'b')]) +``` + +This is happening because Python's regex engine always returns the first matching option. + +If you find yourself in this situation, the recommended solution is to use rules instead. + +Example: + +```python +>>> p = Lark("""start: (a | b)+ +... !a: "a" | "ab" +... !b: "b" +... """, ambiguity="explicit") +>>> print(p.parse("ab").pretty()) +_ambig + start + a ab + start + a a + b b +``` + + + ## Rules **Syntax:** @@ -114,6 +170,7 @@ Rules can be assigned priority only when using Earley (future versions may suppo Priority can be either positive or negative. In not specified for a terminal, it's assumed to be 1 (i.e. the default). 
+ ## Directives ### %ignore From bb57629418c3711c5d3477c7280882fa8927b70a Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 13 Sep 2019 16:12:22 +0300 Subject: [PATCH 067/132] Added 'edit_terminals' option (Issue #406) --- lark/lark.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/lark/lark.py b/lark/lark.py index ae71d56..c27f534 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -69,6 +69,7 @@ class LarkOptions(Serialize): 'propagate_positions': False, 'lexer_callbacks': {}, 'maybe_placeholders': False, + 'edit_terminals': None, } def __init__(self, options_dict): @@ -205,6 +206,10 @@ class Lark(Serialize): # Compile the EBNF grammar into BNF self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start) + if self.options.edit_terminals: + for t in self.terminals: + self.options.edit_terminals(t) + self._terminals_dict = {t.name:t for t in self.terminals} # If the user asked to invert the priorities, negate them all here. From 7a13fb0f5b968046795fa9d221a38c2a34503605 Mon Sep 17 00:00:00 2001 From: Wataru Ashihara Date: Sun, 22 Sep 2019 13:51:14 +0900 Subject: [PATCH 068/132] Fix SyntaxError with Python 3 in JSON tutorial unpacking a tuple argument is invalid in Python 3. >>> def foo(a, (b, c)): File "", line 1 def foo(a, (b, c)): ^ SyntaxError: invalid syntax Fixes #403 --- docs/json_tutorial.md | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/docs/json_tutorial.md b/docs/json_tutorial.md index ca1db73..9cc87e7 100644 --- a/docs/json_tutorial.md +++ b/docs/json_tutorial.md @@ -230,7 +230,8 @@ from lark import Transformer class MyTransformer(Transformer): def list(self, items): return list(items) - def pair(self, (k,v)): + def pair(self, key_value): + k, v = key_value return k, v def dict(self, items): return dict(items) @@ -251,9 +252,11 @@ Also, our definitions of list and dict are a bit verbose. 
We can do better: from lark import Transformer class TreeToJson(Transformer): - def string(self, (s,)): + def string(self, s): + (s,) = s return s[1:-1] - def number(self, (n,)): + def number(self, n): + (n,) = n return float(n) list = list @@ -315,9 +318,11 @@ json_grammar = r""" """ class TreeToJson(Transformer): - def string(self, (s,)): + def string(self, s): + (s,) = s return s[1:-1] - def number(self, (n,)): + def number(self, n): + (n,) = n return float(n) list = list From d331a8a1b868f73635e78df51b128c90083a413e Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 15 Sep 2019 14:34:53 +0300 Subject: [PATCH 069/132] Version bump (alpha) --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index 69d9faf..903c10b 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une from .lexer import Token from .lark import Lark -__version__ = "0.7.5" +__version__ = "0.7.6a1" From a7e7b568ff5535a3becee9625ba469b5db444979 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 28 Sep 2019 21:42:39 +0300 Subject: [PATCH 070/132] Fixed contextual lexer error that was confusing users (Issue #194) --- lark/lexer.py | 81 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 47 insertions(+), 34 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 9cd7adb..9ea224e 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -3,7 +3,7 @@ import re from .utils import Str, classify, get_regexp_width, Py36, Serialize -from .exceptions import UnexpectedCharacters, LexError +from .exceptions import UnexpectedCharacters, LexError, UnexpectedToken ###{standalone @@ -43,7 +43,7 @@ class PatternStr(Pattern): __serialize_fields__ = 'value', 'flags' type = "str" - + def to_regexp(self): return self._get_flags(re.escape(self.value)) @@ -166,37 +166,32 @@ class _Lex: while line_ctr.char_pos < len(stream): lexer = self.lexer - for mre, type_from_index in lexer.mres: - m = mre.match(stream, line_ctr.char_pos) - if not m: - continue - - t = None - value = m.group(0) - type_ = type_from_index[m.lastindex] - if type_ not in ignore_types: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - if t.type in lexer.callback: - t = lexer.callback[t.type](t) - if not isinstance(t, Token): - raise ValueError("Callbacks must return a token (returned %r)" % t) - last_token = t - yield t - else: - if type_ in lexer.callback: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - lexer.callback[type_](t) - - line_ctr.feed(value, type_ in newline_types) - if t: - t.end_line = line_ctr.line - t.end_column = line_ctr.column - - break - else: + res = lexer.match(stream, line_ctr.char_pos) + if not res: allowed = {v for m, tfi in lexer.mres for v in tfi.values()} raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token]) + value, type_ = res + + t = None + if type_ not in ignore_types: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + if t.type in lexer.callback: + t = lexer.callback[t.type](t) + if not isinstance(t, Token): + raise ValueError("Callbacks must return a token (returned %r)" % t) + last_token = t + yield t + else: + if type_ in lexer.callback: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + lexer.callback[type_](t) + + line_ctr.feed(value, type_ in 
newline_types) + if t: + t.end_line = line_ctr.line + t.end_column = line_ctr.column + class UnlessCallback: def __init__(self, mres): @@ -330,6 +325,11 @@ class TraditionalLexer(Lexer): self.mres = build_mres(terminals) + def match(self, stream, pos): + for mre, type_from_index in self.mres: + m = mre.match(stream, pos) + if m: + return m.group(0), type_from_index[m.lastindex] def lex(self, stream): return _Lex(self).lex(stream, self.newline_types, self.ignore_types) @@ -367,9 +367,22 @@ class ContextualLexer(Lexer): def lex(self, stream): l = _Lex(self.lexers[self.parser_state], self.parser_state) - for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): - yield x - l.lexer = self.lexers[self.parser_state] - l.state = self.parser_state + try: + for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): + yield x + l.lexer = self.lexers[self.parser_state] + l.state = self.parser_state + except UnexpectedCharacters as e: + # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, + # but not in the current context. + # This tests the input against the global context, to provide a nicer error. + root_match = self.root_lexer.match(stream, e.pos_in_stream) + if not root_match: + raise + + value, type_ = root_match + t = Token(type_, value, e.pos_in_stream, e.line, e.column) + expected = {v for m, tfi in l.lexer.mres for v in tfi.values()} + raise UnexpectedToken(t, expected) ###} From a207963e46a0d71a34980ccf85841e78b3d37c95 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 28 Sep 2019 23:38:02 +0300 Subject: [PATCH 071/132] Improved error reporting (Issue #194) --- lark/lexer.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 9ea224e..26213ed 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -168,7 +168,9 @@ class _Lex: lexer = self.lexer res = lexer.match(stream, line_ctr.char_pos) if not res: - allowed = {v for m, tfi in lexer.mres for v in tfi.values()} + allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types + if not allowed: + allowed = {""} raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token]) value, type_ = res @@ -382,7 +384,6 @@ class ContextualLexer(Lexer): value, type_ = root_match t = Token(type_, value, e.pos_in_stream, e.line, e.column) - expected = {v for m, tfi in l.lexer.mres for v in tfi.values()} - raise UnexpectedToken(t, expected) + raise UnexpectedToken(t, e.allowed) ###} From 94da6c52b80444c141af50562b507155ca88526d Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 1 Oct 2019 23:17:21 +0300 Subject: [PATCH 072/132] Refactored the Earley code to make it thread-safe (Issue #454) --- lark/parsers/earley.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index ff4e125..87920c3 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -46,12 +46,8 @@ class Parser: # skip the extra tree walk. We'll also skip this if the user just didn't specify priorities # on any rules. 
if self.forest_sum_visitor is None and rule.options and rule.options.priority is not None: - self.forest_sum_visitor = ForestSumVisitor() + self.forest_sum_visitor = ForestSumVisitor - if resolve_ambiguity: - self.forest_tree_visitor = ForestToTreeVisitor(self.callbacks, self.forest_sum_visitor) - else: - self.forest_tree_visitor = ForestToAmbiguousTreeVisitor(self.callbacks, self.forest_sum_visitor) self.term_matcher = term_matcher @@ -316,7 +312,10 @@ class Parser: assert False, 'Earley should not generate multiple start symbol items!' # Perform our SPPF -> AST conversion using the right ForestVisitor. - return self.forest_tree_visitor.visit(solutions[0]) + forest_tree_visitor_cls = ForestToTreeVisitor if self.resolve_ambiguity else ForestToAmbiguousTreeVisitor + forest_tree_visitor = forest_tree_visitor_cls(self.callbacks, self.forest_sum_visitor and self.forest_sum_visitor()) + + return forest_tree_visitor.visit(solutions[0]) class ApplyCallbacks(Transformer_InPlace): From 404fef87f4f168543a333ca3ecb045f7017ac15a Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 3 Oct 2019 11:29:49 +0300 Subject: [PATCH 073/132] Version bump --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index 903c10b..6d8b325 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une from .lexer import Token from .lark import Lark -__version__ = "0.7.6a1" +__version__ = "0.7.7" From f566a3618b45340a9b5b0591f36796fea415ff46 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 7 Oct 2019 12:14:10 +0300 Subject: [PATCH 074/132] Bugfix: Lark now throws an error for recursive terminals (Issue #264) --- lark/load_grammar.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 90911fd..1b4ab65 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -479,7 +479,7 @@ class Grammar: # =================== # Convert terminal-trees to strings/regexps - transformer = PrepareLiterals() * TerminalTreeToPattern() + for name, (term_tree, priority) in term_defs: if term_tree is None: # Terminal added through %declare continue @@ -487,7 +487,8 @@ class Grammar: if len(expansions) == 1 and not expansions[0].children: raise GrammarError("Terminals cannot be empty (%s)" % name) - terminals = [TerminalDef(name, transformer.transform(term_tree), priority) + transformer = PrepareLiterals() * TerminalTreeToPattern() + terminals = [TerminalDef(name, transformer.transform( term_tree ), priority) for name, (term_tree, priority) in term_defs if term_tree] # ================= @@ -638,11 +639,10 @@ def import_from_grammar_into_namespace(grammar, namespace, aliases): def resolve_term_references(term_defs): - # TODO Cycles detection # TODO Solve with transitive closure (maybe) - token_dict = {k:t for k, (t,_p) in term_defs} - assert len(token_dict) == len(term_defs), "Same name defined twice?" + term_dict = {k:t for k, (t,_p) in term_defs} + assert len(term_dict) == len(term_defs), "Same name defined twice?" 
while True: changed = False @@ -655,11 +655,19 @@ def resolve_term_references(term_defs): if item.type == 'RULE': raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name)) if item.type == 'TERMINAL': - exp.children[0] = token_dict[item] + term_value = term_dict[item] + exp.children[0] = term_value changed = True if not changed: break + for name, term in term_dict.items(): + for child in term.children: + ids = [id(x) for x in child.iter_subtrees()] + if id(term) in ids: + raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name) + + def options_from_rule(name, *x): if len(x) > 1: priority, expansions = x From 0a4530b9427c35a262c8248424cde2e06be54f09 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 8 Oct 2019 09:16:33 +0300 Subject: [PATCH 075/132] Improved Earley error on EOF (Issue #457) --- lark/exceptions.py | 8 ++++++++ lark/parsers/earley.py | 8 ++++---- lark/parsers/xearley.py | 3 ++- 3 files changed, 14 insertions(+), 5 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 4207589..28f1b4b 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -13,6 +13,14 @@ class ParseError(LarkError): class LexError(LarkError): pass +class UnexpectedEOF(ParseError): + def __init__(self, expected): + self.expected = expected + + message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected)) + super(UnexpectedEOF, self).__init__(message) + + class UnexpectedInput(LarkError): pos_in_stream = None diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 87920c3..e18d26c 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -14,7 +14,7 @@ import logging from collections import deque from ..visitors import Transformer_InPlace, v_args -from ..exceptions import ParseError, UnexpectedToken +from ..exceptions import UnexpectedEOF, UnexpectedToken from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal from .earley_common import Item, TransitiveItem @@ -270,6 +270,7 @@ class Parser: ## Column is now the final column in the parse. assert i == len(columns)-1 + return to_scan def parse(self, stream, start): assert start, start @@ -288,7 +289,7 @@ class Parser: else: columns[0].add(item) - self._parse(stream, columns, to_scan, start_symbol) + to_scan = self._parse(stream, columns, to_scan, start_symbol) # If the parse was successful, the start # symbol should have been completed in the last step of the Earley cycle, and will be in @@ -306,8 +307,7 @@ class Parser: if not solutions: expected_tokens = [t.expect for t in to_scan] - # raise ParseError('Incomplete parse: Could not find a solution to input') - raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens) + raise UnexpectedEOF(expected_tokens) elif len(solutions) > 1: assert False, 'Earley should not generate multiple start symbol items!' diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index 3898d6a..f32d0d1 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -146,4 +146,5 @@ class Parser(BaseParser): self.predict_and_complete(i, to_scan, columns, transitives) ## Column is now the final column in the parse. 
- assert i == len(columns)-1 \ No newline at end of file + assert i == len(columns)-1 + return to_scan \ No newline at end of file From 8685a5afc33781bcc72a14ee8be480eb3d4d73bf Mon Sep 17 00:00:00 2001 From: Francesco Franchina Date: Wed, 16 Oct 2019 13:34:25 +0200 Subject: [PATCH 076/132] Fixed some typos in the docs --- docs/classes.md | 2 +- docs/grammar.md | 2 +- docs/how_to_develop.md | 4 ++-- docs/parsers.md | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/docs/classes.md b/docs/classes.md index 9943fd4..f77d7b8 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -96,7 +96,7 @@ Trees can be hashed and compared. Transformers & Visitors provide a convenient interface to process the parse-trees that Lark returns. -They are used by inheriting from the correct class (visitor or transformer), and implementing methods corresponding to the rule you wish to process. Each methods accepts the children as an argument. That can be modified using the `v-args` decorator, which allows to inline the arguments (akin to `*args`), or add the tree `meta` property as an argument. +They are used by inheriting from the correct class (visitor or transformer), and implementing methods corresponding to the rule you wish to process. Each methods accepts the children as an argument. That can be modified using the `v_args` decorator, which allows to inline the arguments (akin to `*args`), or add the tree `meta` property as an argument. See: https://github.com/lark-parser/lark/blob/master/lark/visitors.py diff --git a/docs/grammar.md b/docs/grammar.md index 228c8b7..8a8913b 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -179,7 +179,7 @@ All occurrences of the terminal will be ignored, and won't be part of the parse. Using the `%ignore` directive results in a cleaner grammar. -It's especially important for the LALR(1) algorithm, because adding whitespace (or comments, or other extranous elements) explicitly in the grammar, harms its predictive abilities, which are based on a lookahead of 1. +It's especially important for the LALR(1) algorithm, because adding whitespace (or comments, or other extraneous elements) explicitly in the grammar, harms its predictive abilities, which are based on a lookahead of 1. **Syntax:** ```html diff --git a/docs/how_to_develop.md b/docs/how_to_develop.md index d69a1e3..b161e0c 100644 --- a/docs/how_to_develop.md +++ b/docs/how_to_develop.md @@ -7,7 +7,7 @@ There are many ways you can help the project: * Write new grammars for Lark's library * Write a blog post introducing Lark to your audience * Port Lark to another language -* Help me with code developemnt +* Help me with code development If you're interested in taking one of these on, let me know and I will provide more details and assist you in the process. @@ -60,4 +60,4 @@ Another way to run the tests is using setup.py: ```bash python setup.py test -``` \ No newline at end of file +``` diff --git a/docs/parsers.md b/docs/parsers.md index fb7c997..c487238 100644 --- a/docs/parsers.md +++ b/docs/parsers.md @@ -5,9 +5,9 @@ Lark implements the following parsing algorithms: Earley, LALR(1), and CYK An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser capable of parsing any context-free grammar at O(n^3), and O(n^2) when the grammar is unambiguous. It can parse most LR grammars at O(n). Most programming languages are LR, and can be parsed at a linear time. 
-Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitely using `lexer='dynamic'`. +Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitly using `lexer='dynamic'`. -It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, that tokenizes as an independant first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'` +It's possible to bypass the dynamic lexing, and use the regular Earley parser with a traditional lexer, that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'` **SPPF & Ambiguity resolution** @@ -21,7 +21,7 @@ Lark provides the following options to combat ambiguity: 1) Lark will choose the best derivation for you (default). Users can choose between different disambiguation strategies, and can prioritize (or demote) individual rules over others, using the rule-priority syntax. -2) Users may choose to recieve the set of all possible parse-trees (using ambiguity='explicit'), and choose the best derivation themselves. While simple and flexible, it comes at the cost of space and performance, and so it isn't recommended for highly ambiguous grammars, or very long inputs. +2) Users may choose to receive the set of all possible parse-trees (using ambiguity='explicit'), and choose the best derivation themselves. While simple and flexible, it comes at the cost of space and performance, and so it isn't recommended for highly ambiguous grammars, or very long inputs. 3) As an advanced feature, users may use specialized visitors to iterate the SPPF themselves. Future versions of Lark intend to improve and simplify this interface. 
From 35e102903402e48d8c0090b915b864615a4a70ce Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 22 Oct 2019 20:38:57 +0300 Subject: [PATCH 077/132] Start parameter now accepts unicode in Python 2 (Issue #459) --- lark/lark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/lark.py b/lark/lark.py index c27f534..47c6fba 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -86,7 +86,7 @@ class LarkOptions(Serialize): options[name] = value - if isinstance(options['start'], str): + if isinstance(options['start'], STRING_TYPE): options['start'] = [options['start']] self.__dict__['options'] = options From f3714a572f047c5857a2b3ab8d8a161e142f20bf Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 22 Oct 2019 20:43:30 +0300 Subject: [PATCH 078/132] Now prints a nice warning instead of failing tests when js2py isn't installed --- tests/__main__.py | 2 +- tests/test_nearley/test_nearley.py | 5 ++++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/__main__.py b/tests/__main__.py index 4762773..901f101 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -10,7 +10,7 @@ from .test_reconstructor import TestReconstructor try: from .test_nearley.test_nearley import TestNearley except ImportError: - pass + logging.warn("Warning: Skipping tests for Nearley (js2py required)") # from .test_selectors import TestSelectors # from .test_grammars import TestPythonG, TestConfigG diff --git a/tests/test_nearley/test_nearley.py b/tests/test_nearley/test_nearley.py index 721db1d..647f489 100644 --- a/tests/test_nearley/test_nearley.py +++ b/tests/test_nearley/test_nearley.py @@ -15,9 +15,12 @@ NEARLEY_PATH = os.path.join(TEST_PATH, 'nearley') BUILTIN_PATH = os.path.join(NEARLEY_PATH, 'builtin') if not os.path.exists(NEARLEY_PATH): - print("Skipping Nearley tests!") + logging.warn("Nearley not installed. Skipping Nearley tests!") raise ImportError("Skipping Nearley tests!") +import js2py # Ensures that js2py exists, to avoid failing tests + + class TestNearley(unittest.TestCase): def test_css(self): fn = os.path.join(NEARLEY_PATH, 'examples/csscolor.ne') From 9f218f85b6786bc28ac08ffbcc3359e1545b394e Mon Sep 17 00:00:00 2001 From: Timo Furrer Date: Fri, 25 Oct 2019 20:20:27 +0200 Subject: [PATCH 079/132] Copy exc state when converting UnexpectedCharacters to UnexpectedToken exc. 
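A rough illustration (the grammar below is made up; only the exception attributes come from the code): with a contextual lexer, a character that is a known terminal but is not legal in the current parser context is re-raised as UnexpectedToken, and the parser state is now carried along:

    from lark import Lark
    from lark.exceptions import UnexpectedToken

    parser = Lark('start: "a" "b"', parser='lalr')  # contextual lexer is the default
    try:
        parser.parse('aa')   # the second 'a' is a valid terminal, but not expected here
    except UnexpectedToken as e:
        print(e.state)       # previously None, since the state was dropped on re-raise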
Fixes #462 --- lark/lexer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/lexer.py b/lark/lexer.py index 26213ed..f57ae51 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -384,6 +384,6 @@ class ContextualLexer(Lexer): value, type_ = root_match t = Token(type_, value, e.pos_in_stream, e.line, e.column) - raise UnexpectedToken(t, e.allowed) + raise UnexpectedToken(t, e.allowed, state=e.state) ###} From 17b6d6d3b3a14550722b084391caf9cd12d580e6 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 1 Nov 2019 14:37:16 +0200 Subject: [PATCH 080/132] BUGFIX for declared terminals --- lark/load_grammar.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 1b4ab65..a65ca1e 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -656,16 +656,18 @@ def resolve_term_references(term_defs): raise GrammarError("Rules aren't allowed inside terminals (%s in %s)" % (item, name)) if item.type == 'TERMINAL': term_value = term_dict[item] + assert term_value is not None exp.children[0] = term_value changed = True if not changed: break for name, term in term_dict.items(): - for child in term.children: - ids = [id(x) for x in child.iter_subtrees()] - if id(term) in ids: - raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name) + if term: # Not just declared + for child in term.children: + ids = [id(x) for x in child.iter_subtrees()] + if id(term) in ids: + raise GrammarError("Recursion in terminal '%s' (recursion is only allowed in rules, not terminals)" % name) def options_from_rule(name, *x): From f07359c31683805f4004fe2d6f37dec84b7c094f Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 1 Nov 2019 14:39:25 +0200 Subject: [PATCH 081/132] Version bump --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index 6d8b325..ff24424 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une from .lexer import Token from .lark import Lark -__version__ = "0.7.7" +__version__ = "0.7.8" From 1f0b1e7520b7ce00d71d9569f49d8c86b49ccc70 Mon Sep 17 00:00:00 2001 From: Giuliano Oliveira Date: Fri, 1 Nov 2019 18:53:31 -0400 Subject: [PATCH 082/132] added visit_topdown methods to Visitor classes --- lark/visitors.py | 17 +++++++++++++++-- tests/test_trees.py | 39 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/lark/visitors.py b/lark/visitors.py index 7d40e74..c6e4f6b 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -186,6 +186,11 @@ class Visitor(VisitorBase): self._call_userfunc(subtree) return tree + def visit_topdown(self,tree): + for subtree in tree.iter_subtrees_topdown(): + self._call_userfunc(subtree) + return tree + class Visitor_Recursive(VisitorBase): """Bottom-up visitor, recursive @@ -198,8 +203,16 @@ class Visitor_Recursive(VisitorBase): if isinstance(child, Tree): self.visit(child) - f = getattr(self, tree.data, self.__default__) - f(tree) + self._call_userfunc(tree) + return tree + + def visit_topdown(self,tree): + self._call_userfunc(tree) + + for child in tree.children: + if isinstance(child, Tree): + self.visit_topdown(child) + return tree diff --git a/tests/test_trees.py b/tests/test_trees.py index 4216bd6..edd2a8b 100644 --- a/tests/test_trees.py +++ b/tests/test_trees.py @@ -7,7 +7,7 @@ import pickle import functools from lark.tree 
import Tree -from lark.visitors import Transformer, Interpreter, visit_children_decor, v_args, Discard +from lark.visitors import Visitor, Visitor_Recursive, Transformer, Interpreter, visit_children_decor, v_args, Discard class TestTrees(TestCase): @@ -34,6 +34,43 @@ class TestTrees(TestCase): nodes = list(self.tree1.iter_subtrees_topdown()) self.assertEqual(nodes, expected) + def test_visitor(self): + class Visitor1(Visitor): + def __init__(self): + self.nodes=[] + + def __default__(self,tree): + self.nodes.append(tree) + class Visitor1_Recursive(Visitor_Recursive): + def __init__(self): + self.nodes=[] + + def __default__(self,tree): + self.nodes.append(tree) + + visitor1=Visitor1() + visitor1_recursive=Visitor1_Recursive() + + expected_top_down = [Tree('a', [Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z')]), + Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z')] + expected_botton_up= [Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z'), + Tree('a', [Tree('b', 'x'), Tree('c', 'y'), Tree('d', 'z')])] + + visitor1.visit(self.tree1) + self.assertEqual(visitor1.nodes,expected_botton_up) + + visitor1_recursive.visit(self.tree1) + self.assertEqual(visitor1_recursive.nodes,expected_botton_up) + + visitor1.nodes=[] + visitor1_recursive.nodes=[] + + visitor1.visit_topdown(self.tree1) + self.assertEqual(visitor1.nodes,expected_top_down) + + visitor1_recursive.visit_topdown(self.tree1) + self.assertEqual(visitor1_recursive.nodes,expected_top_down) + def test_interp(self): t = Tree('a', [Tree('b', []), Tree('c', []), 'd']) From 5b930b5973d5e5226209cdf09b3094b69df4e1b9 Mon Sep 17 00:00:00 2001 From: Hao Wu Date: Sun, 3 Nov 2019 08:03:04 +0800 Subject: [PATCH 083/132] page for transformer_and_visitor --- docs/classes.md | 113 -------------------------------- docs/transfromer_and_vistor.md | 115 +++++++++++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 113 deletions(-) create mode 100644 docs/transfromer_and_vistor.md diff --git a/docs/classes.md b/docs/classes.md index f77d7b8..ee6e76f 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -94,119 +94,6 @@ Trees can be hashed and compared. ## Transformers & Visitors -Transformers & Visitors provide a convenient interface to process the parse-trees that Lark returns. - -They are used by inheriting from the correct class (visitor or transformer), and implementing methods corresponding to the rule you wish to process. Each methods accepts the children as an argument. That can be modified using the `v_args` decorator, which allows to inline the arguments (akin to `*args`), or add the tree `meta` property as an argument. - -See: https://github.com/lark-parser/lark/blob/master/lark/visitors.py - -### Visitors - -Visitors visit each node of the tree, and run the appropriate method on it according to the node's data. - -They work bottom-up, starting with the leaves and ending at the root of the tree. - -**Example** -```python -class IncreaseAllNumbers(Visitor): - def number(self, tree): - assert tree.data == "number" - tree.children[0] += 1 - -IncreaseAllNumbers().visit(parse_tree) -``` - -There are two classes that implement the visitor interface: - -* Visitor - Visit every node (without recursion) - -* Visitor_Recursive - Visit every node using recursion. Slightly faster. - -### Transformers - -Transformers visit each node of the tree, and run the appropriate method on it according to the node's data. - -They work bottom-up (or: depth-first), starting with the leaves and ending at the root of the tree. 
- -Transformers can be used to implement map & reduce patterns. - -Because nodes are reduced from leaf to root, at any point the callbacks may assume the children have already been transformed (if applicable). - -Transformers can be chained into a new transformer by using multiplication. - -**Example:** -```python -from lark import Tree, Transformer - -class EvalExpressions(Transformer): - def expr(self, args): - return eval(args[0]) - -t = Tree('a', [Tree('expr', ['1+2'])]) -print(EvalExpressions().transform( t )) - -# Prints: Tree(a, [3]) -``` - - -Here are the classes that implement the transformer interface: - -- Transformer - Recursively transforms the tree. This is the one you probably want. -- Transformer_InPlace - Non-recursive. Changes the tree in-place instead of returning new instances -- Transformer_InPlaceRecursive - Recursive. Changes the tree in-place instead of returning new instances - -### v_args - -`v_args` is a decorator. - -By default, callback methods of transformers/visitors accept one argument: a list of the node's children. `v_args` can modify this behavior. - -When used on a transformer/visitor class definition, it applies to all the callback methods inside it. - -`v_args` accepts one of three flags: - -- `inline` - Children are provided as `*args` instead of a list argument (not recommended for very long lists). -- `meta` - Provides two arguments: `children` and `meta` (instead of just the first) -- `tree` - Provides the entire tree as the argument, instead of the children. - -Examples: - -```python -@v_args(inline=True) -class SolveArith(Transformer): - def add(self, left, right): - return left + right - - -class ReverseNotation(Transformer_InPlace): - @v_args(tree=True): - def tree_node(self, tree): - tree.children = tree.children[::-1] -``` - -### Discard - -When raising the `Discard` exception in a transformer callback, that node is discarded and won't appear in the parent. - -## Token - -When using a lexer, the resulting tokens in the trees will be of the Token class, which inherits from Python's string. So, normal string comparisons and operations will work as expected. Tokens also have other useful attributes: - -* `type` - Name of the token (as specified in grammar). -* `pos_in_stream` - the index of the token in the text -* `line` - The line of the token in the text (starting with 1) -* `column` - The column of the token in the text (starting with 1) -* `end_line` - The line where the token ends -* `end_column` - The next column after the end of the token. For example, if the token is a single character with a `column` value of 4, `end_column` will be 5. - - -## UnexpectedInput - -- `UnexpectedInput` - - `UnexpectedToken` - The parser recieved an unexpected token - - `UnexpectedCharacters` - The lexer encountered an unexpected string - -After catching one of these exceptions, you may call the following helper methods to create a nicer error message: ### Methods diff --git a/docs/transfromer_and_vistor.md b/docs/transfromer_and_vistor.md new file mode 100644 index 0000000..8385c93 --- /dev/null +++ b/docs/transfromer_and_vistor.md @@ -0,0 +1,115 @@ +## Transformers & Visitors + +Transformers & Visitors provide a convenient interface to process the parse-trees that Lark returns. + +They are used by inheriting from the correct class (visitor or transformer), and implementing methods corresponding to the rule you wish to process. Each methods accepts the children as an argument. 
That can be modified using the `v_args` decorator, which allows to inline the arguments (akin to `*args`), or add the tree `meta` property as an argument. + +See: https://github.com/lark-parser/lark/blob/master/lark/visitors.py + +### Visitors + +Visitors visit each node of the tree, and run the appropriate method on it according to the node's data. + +They work bottom-up, starting with the leaves and ending at the root of the tree. + +**Example** +```python +class IncreaseAllNumbers(Visitor): + def number(self, tree): + assert tree.data == "number" + tree.children[0] += 1 + +IncreaseAllNumbers().visit(parse_tree) +``` + +There are two classes that implement the visitor interface: + +* Visitor - Visit every node (without recursion) + +* Visitor_Recursive - Visit every node using recursion. Slightly faster. + +### Transformers + +Transformers visit each node of the tree, and run the appropriate method on it according to the node's data. + +They work bottom-up (or: depth-first), starting with the leaves and ending at the root of the tree. + +Transformers can be used to implement map & reduce patterns. + +Because nodes are reduced from leaf to root, at any point the callbacks may assume the children have already been transformed (if applicable). + +Transformers can be chained into a new transformer by using multiplication. + +**Example:** +```python +from lark import Tree, Transformer + +class EvalExpressions(Transformer): + def expr(self, args): + return eval(args[0]) + +t = Tree('a', [Tree('expr', ['1+2'])]) +print(EvalExpressions().transform( t )) + +# Prints: Tree(a, [3]) +``` + + +Here are the classes that implement the transformer interface: + +- Transformer - Recursively transforms the tree. This is the one you probably want. +- Transformer_InPlace - Non-recursive. Changes the tree in-place instead of returning new instances +- Transformer_InPlaceRecursive - Recursive. Changes the tree in-place instead of returning new instances + +### v_args + +`v_args` is a decorator. + +By default, callback methods of transformers/visitors accept one argument: a list of the node's children. `v_args` can modify this behavior. + +When used on a transformer/visitor class definition, it applies to all the callback methods inside it. + +`v_args` accepts one of three flags: + +- `inline` - Children are provided as `*args` instead of a list argument (not recommended for very long lists). +- `meta` - Provides two arguments: `children` and `meta` (instead of just the first) +- `tree` - Provides the entire tree as the argument, instead of the children. + +Examples: + +```python +@v_args(inline=True) +class SolveArith(Transformer): + def add(self, left, right): + return left + right + + +class ReverseNotation(Transformer_InPlace): + @v_args(tree=True): + def tree_node(self, tree): + tree.children = tree.children[::-1] +``` + +### Discard + +When raising the `Discard` exception in a transformer callback, that node is discarded and won't appear in the parent. + +## Token + +When using a lexer, the resulting tokens in the trees will be of the Token class, which inherits from Python's string. So, normal string comparisons and operations will work as expected. Tokens also have other useful attributes: + +* `type` - Name of the token (as specified in grammar). 
+* `pos_in_stream` - the index of the token in the text +* `line` - The line of the token in the text (starting with 1) +* `column` - The column of the token in the text (starting with 1) +* `end_line` - The line where the token ends +* `end_column` - The next column after the end of the token. For example, if the token is a single character with a `column` value of 4, `end_column` will be 5. + + +## UnexpectedInput + +- `UnexpectedInput` + - `UnexpectedToken` - The parser recieved an unexpected token + - `UnexpectedCharacters` - The lexer encountered an unexpected string + +After catching one of these exceptions, you may call the following helper methods to create a nicer error message: From 6546ea352a199d16a72abc8aadfb0b78396e0b1f Mon Sep 17 00:00:00 2001 From: Hao Wu Date: Sun, 3 Nov 2019 08:15:31 +0800 Subject: [PATCH 084/132] too much --- docs/classes.md | 21 ++++++++++++++++++++- docs/transfromer_and_vistor.md | 18 ------------------ 2 files changed, 20 insertions(+), 19 deletions(-) diff --git a/docs/classes.md b/docs/classes.md index ee6e76f..b63f8f1 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -92,8 +92,27 @@ Trees can be hashed and compared. ---- -## Transformers & Visitors +[Guide](transfromer_and_vistor.md) +## Token + +When using a lexer, the resulting tokens in the trees will be of the Token class, which inherits from Python's string. So, normal string comparisons and operations will work as expected. Tokens also have other useful attributes: + +* `type` - Name of the token (as specified in grammar). +* `pos_in_stream` - the index of the token in the text +* `line` - The line of the token in the text (starting with 1) +* `column` - The column of the token in the text (starting with 1) +* `end_line` - The line where the token ends +* `end_column` - The next column after the end of the token. For example, if the token is a single character with a `column` value of 4, `end_column` will be 5. + + +## UnexpectedInput + +- `UnexpectedInput` + - `UnexpectedToken` - The parser recieved an unexpected token + - `UnexpectedCharacters` - The lexer encountered an unexpected string + +After catching one of these exceptions, you may call the following helper methods to create a nicer error message: ### Methods diff --git a/docs/transfromer_and_vistor.md b/docs/transfromer_and_vistor.md index 8385c93..c4a24b1 100644 --- a/docs/transfromer_and_vistor.md +++ b/docs/transfromer_and_vistor.md @@ -94,22 +94,4 @@ class ReverseNotation(Transformer_InPlace): When raising the `Discard` exception in a transformer callback, that node is discarded and won't appear in the parent. -## Token -When using a lexer, the resulting tokens in the trees will be of the Token class, which inherits from Python's string. So, normal string comparisons and operations will work as expected. Tokens also have other useful attributes: - -* `type` - Name of the token (as specified in grammar). -* `pos_in_stream` - the index of the token in the text -* `line` - The line of the token in the text (starting with 1) -* `column` - The column of the token in the text (starting with 1) -* `end_line` - The line where the token ends -* `end_column` - The next column after the end of the token. For example, if the token is a single character with a `column` value of 4, `end_column` will be 5. 
- - -## UnexpectedInput - -- `UnexpectedInput` - - `UnexpectedToken` - The parser recieved an unexpected token - - `UnexpectedCharacters` - The lexer encountered an unexpected string - -After catching one of these exceptions, you may call the following helper methods to create a nicer error message: From 10a09ebba86fb17c2c13dc3080d6aa1d9987211c Mon Sep 17 00:00:00 2001 From: Hao Wu Date: Mon, 4 Nov 2019 10:14:36 +0800 Subject: [PATCH 085/132] transform token doc --- docs/transfromer_and_vistor.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/transfromer_and_vistor.md b/docs/transfromer_and_vistor.md index c4a24b1..463c8d1 100644 --- a/docs/transfromer_and_vistor.md +++ b/docs/transfromer_and_vistor.md @@ -40,6 +40,7 @@ Because nodes are reduced from leaf to root, at any point the callbacks may assu Transformers can be chained into a new transformer by using multiplication. + **Example:** ```python from lark import Tree, Transformer @@ -54,6 +55,18 @@ print(EvalExpressions().transform( t )) # Prints: Tree(a, [3]) ``` +By default, transformer works only on rules, `visit_tokens=True` will make transfomer process tokens. This is handy in parsing simple token, such as turn `INT` to `int`, `NUMBER` to `float`. etc. + +```python +class T(Transformer): + INT = int # same with def INT(self, tok): int(tok) + NUMBER = float # same with def INT(self, tok): int(tok) + def NAME(self, name): + return lookup_dict.get(name, name) + + +T(visit_tokens=True).transform(tree) +``` Here are the classes that implement the transformer interface: From ca36404257691d45d41b4585eb50cbf7d25a756d Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 4 Nov 2019 11:37:12 +0200 Subject: [PATCH 086/132] Some improvements to visitor documentation --- docs/classes.md | 24 +++++++-------- ...{transfromer_and_vistor.md => visitors.md} | 29 ++++++++++++------- mkdocs.yml | 1 + 3 files changed, 31 insertions(+), 23 deletions(-) rename docs/{transfromer_and_vistor.md => visitors.md} (78%) diff --git a/docs/classes.md b/docs/classes.md index b63f8f1..1555a1f 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -1,15 +1,13 @@ -# Classes - Reference +# Classes Reference This page details the important classes in Lark. ---- -## Lark +## lark.Lark The Lark class is the main interface for the library. It's mostly a thin wrapper for the many different parsers, and for the tree constructor. -### Methods - #### \_\_init\_\_(self, grammar, **options) The Lark class accepts a grammar string or file object, and keyword options: @@ -50,14 +48,10 @@ If a transformer is supplied to `__init__`, returns whatever is the result of th The main tree class -### Properties - * `data` - The name of the rule or alias * `children` - List of matched sub-rules and terminals * `meta` - Line & Column numbers, if using `propagate_positions` -### Methods - #### \_\_init\_\_(self, data, children) Creates a new tree, and stores "data" and "children" in attributes of the same name. @@ -92,8 +86,6 @@ Trees can be hashed and compared. ---- -[Guide](transfromer_and_vistor.md) - ## Token When using a lexer, the resulting tokens in the trees will be of the Token class, which inherits from Python's string. So, normal string comparisons and operations will work as expected. Tokens also have other useful attributes: @@ -105,17 +97,25 @@ When using a lexer, the resulting tokens in the trees will be of the Token class * `end_line` - The line where the token ends * `end_column` - The next column after the end of the token. 
For example, if the token is a single character with a `column` value of 4, `end_column` will be 5. +## Transformer +## Visitor +## Interpreter + +See the [visitors page](visitors.md) + ## UnexpectedInput +## UnexpectedToken + +## UnexpectedException + - `UnexpectedInput` - `UnexpectedToken` - The parser recieved an unexpected token - `UnexpectedCharacters` - The lexer encountered an unexpected string After catching one of these exceptions, you may call the following helper methods to create a nicer error message: -### Methods - #### get_context(text, span) Returns a pretty string pinpointing the error in the text, with `span` amount of context characters around it. diff --git a/docs/transfromer_and_vistor.md b/docs/visitors.md similarity index 78% rename from docs/transfromer_and_vistor.md rename to docs/visitors.md index 463c8d1..c60c1dc 100644 --- a/docs/transfromer_and_vistor.md +++ b/docs/visitors.md @@ -2,9 +2,9 @@ Transformers & Visitors provide a convenient interface to process the parse-trees that Lark returns. -They are used by inheriting from the correct class (visitor or transformer), and implementing methods corresponding to the rule you wish to process. Each methods accepts the children as an argument. That can be modified using the `v_args` decorator, which allows to inline the arguments (akin to `*args`), or add the tree `meta` property as an argument. +They are used by inheriting from the correct class (visitor or transformer), and implementing methods corresponding to the rule you wish to process. Each method accepts the children as an argument. That can be modified using the `v_args` decorator, which allows to inline the arguments (akin to `*args`), or add the tree `meta` property as an argument. -See: https://github.com/lark-parser/lark/blob/master/lark/visitors.py +See: visitors.py ### Visitors @@ -40,6 +40,8 @@ Because nodes are reduced from leaf to root, at any point the callbacks may assu Transformers can be chained into a new transformer by using multiplication. +`Transformer` can do anything `Visitor` can do, but because it reconstructs the tree, it is slightly less efficient. + **Example:** ```python @@ -55,24 +57,29 @@ print(EvalExpressions().transform( t )) # Prints: Tree(a, [3]) ``` -By default, transformer works only on rules, `visit_tokens=True` will make transfomer process tokens. This is handy in parsing simple token, such as turn `INT` to `int`, `NUMBER` to `float`. etc. +All these classes implement the transformer interface: + +- Transformer - Recursively transforms the tree. This is the one you probably want. +- Transformer_InPlace - Non-recursive. Changes the tree in-place instead of returning new instances +- Transformer_InPlaceRecursive - Recursive. Changes the tree in-place instead of returning new instances + +### visit_tokens + +By default, transformers only visit rules. `visit_tokens=True` will tell Transformer to visit tokens as well. This is a slightly slower alternative to `lexer_callbacks`, but it's easier to maintain and works for all algorithms (even when there isn't a lexer). + +Example: ```python class T(Transformer): - INT = int # same with def INT(self, tok): int(tok) - NUMBER = float # same with def INT(self, tok): int(tok) + INT = int + NUMBER = float def NAME(self, name): return lookup_dict.get(name, name) - + T(visit_tokens=True).transform(tree) ``` -Here are the classes that implement the transformer interface: - -- Transformer - Recursively transforms the tree. This is the one you probably want. 
-- Transformer_InPlace - Non-recursive. Changes the tree in-place instead of returning new instances -- Transformer_InPlaceRecursive - Recursive. Changes the tree in-place instead of returning new instances ### v_args diff --git a/mkdocs.yml b/mkdocs.yml index 63bdd61..f5b0d1d 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -9,5 +9,6 @@ pages: - How To Develop (Guide): how_to_develop.md - Grammar Reference: grammar.md - Tree Construction Reference: tree_construction.md + - Visitors and Transformers: visitors.md - Classes Reference: classes.md - Recipes: recipes.md From b21e89b7f3b86526177e3891a803d5393adf70fa Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 4 Nov 2019 11:44:13 +0200 Subject: [PATCH 087/132] An addition to the docs --- docs/index.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/index.md b/docs/index.md index 8517208..d693cce 100644 --- a/docs/index.md +++ b/docs/index.md @@ -35,8 +35,8 @@ $ pip install lark-parser * [Examples](https://github.com/lark-parser/lark/tree/master/examples) * Tutorials * [How to write a DSL](http://blog.erezsh.com/how-to-write-a-dsl-in-python-with-lark/) - Implements a toy LOGO-like language with an interpreter - * [How to write a JSON parser](json_tutorial.md) - * External + * [How to write a JSON parser](json_tutorial.md) - Teaches you how to use Lark + * Unofficial * [Program Synthesis is Possible](https://www.cs.cornell.edu/~asampson/blog/minisynth.html) - Creates a DSL for Z3 * Guides * [How to use Lark](how_to_use.md) @@ -44,6 +44,7 @@ $ pip install lark-parser * Reference * [Grammar](grammar.md) * [Tree Construction](tree_construction.md) + * [Visitors & Transformers](visitors.md) * [Classes](classes.md) * [Cheatsheet (PDF)](lark_cheatsheet.pdf) * Discussion From 5e37fe458d8518ea9a3d1e9621389419d77459dc Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 4 Nov 2019 11:48:33 +0200 Subject: [PATCH 088/132] Version bump (0.8.0rc1) --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index ff24424..0906eb7 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une from .lexer import Token from .lark import Lark -__version__ = "0.7.8" +__version__ = "0.8.0rc1" From e39bfa1b18f328adb40d785049e07e9a3264eae8 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 13 Nov 2019 10:00:11 +0200 Subject: [PATCH 089/132] Bugfix: Some tokens did not recieve and end_line (Issue #472) --- lark/lexer.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index f57ae51..806d575 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -175,24 +175,24 @@ class _Lex: value, type_ = res - t = None if type_ not in ignore_types: t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + line_ctr.feed(value, type_ in newline_types) + t.end_line = line_ctr.line + t.end_column = line_ctr.column if t.type in lexer.callback: t = lexer.callback[t.type](t) if not isinstance(t, Token): raise ValueError("Callbacks must return a token (returned %r)" % t) - last_token = t yield t + last_token = t else: if type_ in lexer.callback: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - lexer.callback[type_](t) + t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + lexer.callback[type_](t2) + line_ctr.feed(value, type_ in newline_types) + - line_ctr.feed(value, type_ in newline_types) 
- if t: - t.end_line = line_ctr.line - t.end_column = line_ctr.column class UnlessCallback: From 84f08a452f6aded0530948757841e61e2a4a423d Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sat, 16 Nov 2019 08:52:06 +0200 Subject: [PATCH 090/132] propagate_positions & maybe_placeholders are now true by default, updated docs, tests & examples accordingly (Issue #449, #451) --- docs/classes.md | 24 +++++++++++++----------- docs/grammar.md | 4 ++-- examples/custom_lexer.py | 2 +- examples/reconstruct_json.py | 10 ++-------- lark/lark.py | 4 ++-- lark/reconstruct.py | 1 + tests/test_parser.py | 2 +- tests/test_reconstructor.py | 4 ++-- 8 files changed, 24 insertions(+), 27 deletions(-) diff --git a/docs/classes.md b/docs/classes.md index 1555a1f..1d59551 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -12,29 +12,31 @@ The Lark class is the main interface for the library. It's mostly a thin wrapper The Lark class accepts a grammar string or file object, and keyword options: -* start - The symbol in the grammar that begins the parse (Default: `"start"`) +* **start** - A list of the rules in the grammar that begin the parse (Default: `["start"]`) -* parser - Decides which parser engine to use, "earley", "lalr" or "cyk". (Default: `"earley"`) +* **parser** - Decides which parser engine to use, "earley", "lalr" or "cyk". (Default: `"earley"`) -* lexer - Overrides default lexer. +* **lexer** - Overrides default lexer, depending on parser. -* transformer - Applies the transformer instead of building a parse tree (only allowed with parser="lalr") +* **transformer** - Applies the provided transformer instead of building a parse tree (only allowed with parser="lalr") -* postlex - Lexer post-processing (Default: None. only works when lexer is "standard" or "contextual") +* **postlex** - Lexer post-processing (Default: `None`. only works when lexer is "standard" or "contextual") -* ambiguity (only relevant for earley and cyk) +* **ambiguity** (only relevant for earley and cyk) * "explicit" - Return all derivations inside an "_ambig" data node. * "resolve" - Let the parser choose the best derivation (greedy for tokens, non-greedy for rules. Default) -* debug - Display warnings (such as Shift-Reduce warnings for LALR) +* **debug** - Display warnings (such as Shift-Reduce warnings for LALR) -* keep_all_tokens - Don't throw away any terminals from the tree (Default=False) +* **keep_all_tokens** - Don't throw away any terminals from the tree (Default=`False`) -* propagate_positions - Propagate line/column count to tree nodes (default=False) +* **propagate_positions** - Propagate line/column count to tree nodes, at the cost of performance (default=`True`) -* lexer_callbacks - A dictionary of callbacks of type f(Token) -> Token, used to interface with the lexer Token generation. Only works with the standard and contextual lexers. See [Recipes](recipes.md) for more information. +* **maybe_placeholders** - The `[]` operator returns `None` when not matched. Setting this to `False` makes it behave like the `?` operator, and return no value at all, which may be a little faster (default=`True`) + +* **lexer_callbacks** - A dictionary of callbacks of type f(Token) -> Token, used to interface with the lexer Token generation. Only works with the standard and contextual lexers. See [Recipes](recipes.md) for more information. 
#### parse(self, text) @@ -50,7 +52,7 @@ The main tree class * `data` - The name of the rule or alias * `children` - List of matched sub-rules and terminals -* `meta` - Line & Column numbers, if using `propagate_positions` +* `meta` - Line & Column numbers (unless `propagate_positions` is disabled) #### \_\_init\_\_(self, data, children) diff --git a/docs/grammar.md b/docs/grammar.md index 8a8913b..cc518e9 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -147,7 +147,7 @@ Each item is one of: * `TERMINAL` * `"string literal"` or `/regexp literal/` * `(item item ..)` - Group items -* `[item item ..]` - Maybe. Same as `(item item ..)?` +* `[item item ..]` - Maybe. Same as `(item item ..)?`, but generates `None` if there is no match * `item?` - Zero or one instances of item ("maybe") * `item*` - Zero or more instances of item * `item+` - One or more instances of item @@ -157,7 +157,7 @@ Each item is one of: **Examples:** ```perl hello_world: "hello" "world" -mul: [mul "*"] number //# Left-recursion is allowed! +mul: (mul "*")? number //# Left-recursion is allowed and encouraged! expr: expr operator expr | value //# Multi-line, belongs to expr diff --git a/examples/custom_lexer.py b/examples/custom_lexer.py index 786bf4f..732e614 100644 --- a/examples/custom_lexer.py +++ b/examples/custom_lexer.py @@ -29,7 +29,7 @@ parser = Lark(""" data_item: STR INT* %declare STR INT - """, parser='lalr', lexer=TypeLexer) + """, parser='lalr', lexer=TypeLexer, propagate_positions=False) class ParseToDict(Transformer): diff --git a/examples/reconstruct_json.py b/examples/reconstruct_json.py index 07df86c..59c58b0 100644 --- a/examples/reconstruct_json.py +++ b/examples/reconstruct_json.py @@ -25,15 +25,9 @@ test_json = ''' def test_earley(): - json_parser = Lark(json_grammar) + json_parser = Lark(json_grammar, maybe_placeholders=False) tree = json_parser.parse(test_json) - # print ('@@', tree.pretty()) - # for x in tree.find_data('true'): - # x.data = 'false' - # # x.children[0].value = '"HAHA"' - - new_json = Reconstructor(json_parser).reconstruct(tree) print (new_json) print (json.loads(new_json) == json.loads(test_json)) @@ -41,7 +35,7 @@ def test_earley(): def test_lalr(): - json_parser = Lark(json_grammar, parser='lalr') + json_parser = Lark(json_grammar, parser='lalr', maybe_placeholders=False) tree = json_parser.parse(test_json) new_json = Reconstructor(json_parser).reconstruct(tree) diff --git a/lark/lark.py b/lark/lark.py index 47c6fba..db1dfd2 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -66,9 +66,9 @@ class LarkOptions(Serialize): 'profile': False, 'priority': 'auto', 'ambiguity': 'auto', - 'propagate_positions': False, + 'propagate_positions': True, 'lexer_callbacks': {}, - 'maybe_placeholders': False, + 'maybe_placeholders': True, 'edit_terminals': None, } diff --git a/lark/reconstruct.py b/lark/reconstruct.py index c446913..fb47b93 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -69,6 +69,7 @@ class MakeMatchTree: class Reconstructor: def __init__(self, parser): # XXX TODO calling compile twice returns different results! 
+ assert parser.options.maybe_placeholders == False tokens, rules, _grammar_extra = parser.grammar.compile(parser.options.start) self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}) diff --git a/tests/test_parser.py b/tests/test_parser.py index e9d46e5..35b3015 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -963,7 +963,7 @@ def _make_parser_test(LEXER, PARSER): @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_twice_empty(self): - g = """!start: [["A"]] + g = """!start: ("A"?)? """ l = _Lark(g) tree = l.parse('A') diff --git a/tests/test_reconstructor.py b/tests/test_reconstructor.py index 526d2e2..ecab499 100644 --- a/tests/test_reconstructor.py +++ b/tests/test_reconstructor.py @@ -16,7 +16,7 @@ def _remove_ws(s): class TestReconstructor(TestCase): def assert_reconstruct(self, grammar, code): - parser = Lark(grammar, parser='lalr') + parser = Lark(grammar, parser='lalr', maybe_placeholders=False) tree = parser.parse(code) new = Reconstructor(parser).reconstruct(tree) self.assertEqual(_remove_ws(code), _remove_ws(new)) @@ -105,7 +105,7 @@ class TestReconstructor(TestCase): %ignore WS """ - json_parser = Lark(json_grammar, parser='lalr') + json_parser = Lark(json_grammar, parser='lalr', maybe_placeholders=False) tree = json_parser.parse(test_json) new_json = Reconstructor(json_parser).reconstruct(tree) From 175c4038305048493618ea64b1d065144624459b Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 17 Nov 2019 15:24:27 +0200 Subject: [PATCH 091/132] Transformers now visit tokens by default --- lark/visitors.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lark/visitors.py b/lark/visitors.py index c6e4f6b..4f32091 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -22,8 +22,8 @@ class Transformer: Can be used to implement map or reduce. 
""" - __visit_tokens__ = False # For backwards compatibility - def __init__(self, visit_tokens=False): + __visit_tokens__ = True # For backwards compatibility + def __init__(self, visit_tokens=True): self.__visit_tokens__ = visit_tokens def _call_userfunc(self, tree, new_children=None): @@ -189,7 +189,7 @@ class Visitor(VisitorBase): def visit_topdown(self,tree): for subtree in tree.iter_subtrees_topdown(): self._call_userfunc(subtree) - return tree + return tree class Visitor_Recursive(VisitorBase): """Bottom-up visitor, recursive @@ -212,7 +212,7 @@ class Visitor_Recursive(VisitorBase): for child in tree.children: if isinstance(child, Tree): self.visit_topdown(child) - + return tree From 1815bd7fbda932933fed604579e955a8c4d7c021 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 17 Nov 2019 16:01:34 +0200 Subject: [PATCH 092/132] Support for token visitation in internal transformers, as an alternative mechanism for lexer_callbacks --- lark/exceptions.py | 6 +++--- lark/lark.py | 11 ++++++++++- lark/lexer.py | 7 +++++++ lark/visitors.py | 4 ++-- tests/test_parser.py | 20 +++++++++++++------- 5 files changed, 35 insertions(+), 13 deletions(-) diff --git a/lark/exceptions.py b/lark/exceptions.py index 28f1b4b..f46fa82 100644 --- a/lark/exceptions.py +++ b/lark/exceptions.py @@ -97,10 +97,10 @@ class UnexpectedToken(ParseError, UnexpectedInput): super(UnexpectedToken, self).__init__(message) class VisitError(LarkError): - def __init__(self, tree, orig_exc): - self.tree = tree + def __init__(self, rule, obj, orig_exc): + self.obj = obj self.orig_exc = orig_exc - message = 'Error trying to process rule "%s":\n\n%s' % (tree.data, orig_exc) + message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) ###} diff --git a/lark/lark.py b/lark/lark.py index db1dfd2..d334cc7 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -225,7 +225,16 @@ class Lark(Serialize): for rule in self.rules: if rule.options and rule.options.priority is not None: rule.options.priority = None - self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks) + + # TODO Deprecate lexer_callbacks? 
+ lexer_callbacks = dict(self.options.lexer_callbacks) + if self.options.transformer: + t = self.options.transformer + for term in self.terminals: + if hasattr(t, term.name): + lexer_callbacks[term.name] = getattr(t, term.name) + + self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks) if self.options.parser: self.parser = self._build_parser() diff --git a/lark/lexer.py b/lark/lexer.py index 806d575..21951e4 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -108,6 +108,13 @@ class Token(Str): self.end_column = end_column return self + def update(self, type_=None, value=None): + return Token.new_borrow_pos( + type_ if type_ is not None else self.type, + value if value is not None else self.value, + self + ) + @classmethod def new_borrow_pos(cls, type_, value, borrow_t): return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column) diff --git a/lark/visitors.py b/lark/visitors.py index 4f32091..a2d5e86 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -48,7 +48,7 @@ class Transformer: except (GrammarError, Discard): raise except Exception as e: - raise VisitError(tree, e) + raise VisitError(tree.data, tree, e) def _call_userfunc_token(self, token): try: @@ -61,7 +61,7 @@ class Transformer: except (GrammarError, Discard): raise except Exception as e: - raise VisitError(token, e) + raise VisitError(token.type, token, e) def _transform_children(self, children): diff --git a/tests/test_parser.py b/tests/test_parser.py index 35b3015..caee80e 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -99,16 +99,22 @@ class TestParsers(unittest.TestCase): def a(self, children): return children[0] + "!" def A(self, tok): - return tok.upper() + return tok.update(value=tok.upper()) # Test regular - g = Lark("""start: a - a : A - A: "x" - """, parser='lalr') - r = T().transform(g.parse("x")) + g = """start: a + a : A + A: "x" + """ + p = Lark(g, parser='lalr') + r = T(False).transform(p.parse("x")) self.assertEqual( r.children, ["x!"] ) - r = T(True).transform(g.parse("x")) + r = T().transform(p.parse("x")) + self.assertEqual( r.children, ["X!"] ) + + # Test internal transformer + p = Lark(g, parser='lalr', transformer=T()) + r = p.parse("x") self.assertEqual( r.children, ["X!"] ) From 58d6d9fac1883476ea890634124fbfbabc952650 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 17 Nov 2019 16:10:54 +0200 Subject: [PATCH 093/132] Added Token.end_pos, and updated docs regarding recent commits --- docs/recipes.md | 12 ++++++------ examples/custom_lexer.py | 2 +- lark/lexer.py | 8 +++++--- lark/parse_tree_builder.py | 2 +- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/docs/recipes.md b/docs/recipes.md index 2202ab7..4e734e7 100644 --- a/docs/recipes.md +++ b/docs/recipes.md @@ -19,18 +19,18 @@ It only works with the standard and contextual lexers. ### Example 1: Replace string values with ints for INT tokens ```python -from lark import Lark, Token +from lark import Lark, Transformer -def tok_to_int(tok): - "Convert the value of `tok` from string to int, while maintaining line number & column." - # tok.type == 'INT' - return Token.new_borrow_pos(tok.type, int(tok), tok) +class T(Transformer): + def INT(self, tok): + "Convert the value of `tok` from string to int, while maintaining line number & column." 
+ return tok.update(value=int(tok)) parser = Lark(""" start: INT* %import common.INT %ignore " " -""", parser="lalr", lexer_callbacks = {'INT': tok_to_int}) +""", parser="lalr", transformer=T()) print(parser.parse('3 14 159')) ``` diff --git a/examples/custom_lexer.py b/examples/custom_lexer.py index 732e614..786bf4f 100644 --- a/examples/custom_lexer.py +++ b/examples/custom_lexer.py @@ -29,7 +29,7 @@ parser = Lark(""" data_item: STR INT* %declare STR INT - """, parser='lalr', lexer=TypeLexer, propagate_positions=False) + """, parser='lalr', lexer=TypeLexer) class ParseToDict(Transformer): diff --git a/lark/lexer.py b/lark/lexer.py index 21951e4..871b25e 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -90,9 +90,9 @@ class TerminalDef(Serialize): class Token(Str): - __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column') + __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') - def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None): + def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None, end_pos=None): try: self = super(Token, cls).__new__(cls, value) except UnicodeDecodeError: @@ -106,6 +106,7 @@ class Token(Str): self.column = column self.end_line = end_line self.end_column = end_column + self.end_pos = end_pos return self def update(self, type_=None, value=None): @@ -117,7 +118,7 @@ class Token(Str): @classmethod def new_borrow_pos(cls, type_, value, borrow_t): - return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column) + return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos) def __reduce__(self): return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, )) @@ -187,6 +188,7 @@ class _Lex: line_ctr.feed(value, type_ in newline_types) t.end_line = line_ctr.line t.end_column = line_ctr.column + t.end_pos = line_ctr.char_pos if t.type in lexer.callback: t = lexer.callback[t.type](t) if not isinstance(t, Token): diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index b54b6e8..3c47ef0 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -51,7 +51,7 @@ class PropagatePositions: elif isinstance(c, Token): res.meta.end_line = c.end_line res.meta.end_column = c.end_column - res.meta.end_pos = c.pos_in_stream + len(c.value) + res.meta.end_pos = c.end_pos res.meta.empty = False break From 54027942515054682a2958d7a7570a162311c177 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 17 Nov 2019 16:35:53 +0200 Subject: [PATCH 094/132] Fix for nearley --- lark/tools/nearley.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lark/tools/nearley.py b/lark/tools/nearley.py index 8412259..0b04fb5 100644 --- a/lark/tools/nearley.py +++ b/lark/tools/nearley.py @@ -18,7 +18,7 @@ nearley_grammar = r""" expansion: expr+ js - ?expr: item [":" /[+*?]/] + ?expr: item (":" /[+*?]/)? 
?item: rule|string|regexp|null | "(" expansions ")" @@ -167,7 +167,7 @@ def create_code_for_nearley_grammar(g, start, builtin_path, folder_path): emit(" __default__ = lambda self, n, c, m: c if c else None") emit() - emit('parser = Lark(grammar, start="n_%s")' % start) + emit('parser = Lark(grammar, start="n_%s", maybe_placeholders=False)' % start) emit('def parse(text):') emit(' return TransformNearley().transform(parser.parse(text))') From 9727eb02264331b3771084ab58778dfbd7fff756 Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Sun, 17 Nov 2019 20:12:44 +0200 Subject: [PATCH 095/132] Added info to the docs about maybe_placeholders (Issue #451) --- docs/tree_construction.md | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/docs/tree_construction.md b/docs/tree_construction.md index 6b581e0..9e61d4d 100644 --- a/docs/tree_construction.md +++ b/docs/tree_construction.md @@ -7,6 +7,10 @@ For example, the rule `node: child1 child2` will create a tree node with two chi Using `item+` or `item*` will result in a list of items, equivalent to writing `item item item ..`. +Using `item?` will return the item if it matched, or nothing. + +Using `[item]` will return the item if it matched, or the value `None`, if it didn't. It's possible to force `[]` to behave like `()?`, by using the `maybe_placeholders=False` option when initializing Lark. + ### Terminals Terminals are always values in the tree, never branches. From ed3c131ca8b58aa976ec9e2401493acdd3c30b85 Mon Sep 17 00:00:00 2001 From: Mike Roberts Date: Mon, 18 Nov 2019 09:43:50 +0000 Subject: [PATCH 096/132] Allow comments in rule definitions --- examples/lark.lark | 2 +- lark/load_grammar.py | 2 +- tests/test_parser.py | 10 ++++++++++ 3 files changed, 12 insertions(+), 2 deletions(-) diff --git a/examples/lark.lark b/examples/lark.lark index 915cf2e..7373c54 100644 --- a/examples/lark.lark +++ b/examples/lark.lark @@ -44,7 +44,7 @@ _NL: /(\r?\n)+\s*/ %import common.INT -> NUMBER %import common.WS_INLINE -COMMENT: "//" /[^\n]/* +COMMENT: /\s*/ "//" /[^\n]/* %ignore WS_INLINE %ignore COMMENT diff --git a/lark/load_grammar.py b/lark/load_grammar.py index a65ca1e..1070f86 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -85,7 +85,7 @@ TERMINALS = { 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS, '_NL': r'(\r?\n)+\s*', 'WS': r'[ \t]+', - 'COMMENT': r'//[^\n]*', + 'COMMENT': r'\s*//[^\n]*', '_TO': '->', '_IGNORE': r'%ignore', '_DECLARE': r'%declare', diff --git a/tests/test_parser.py b/tests/test_parser.py index caee80e..8cefcb8 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -94,6 +94,16 @@ class TestParsers(unittest.TestCase): r = g.parse('xx') self.assertEqual( r.children[0].data, "c" ) + def test_comment_in_rule_definition(self): + g = Lark("""start: a + a: "a" + // A comment + // Another + | "b" + """) + r = g.parse('b') + self.assertEqual( r.children[0].data, "a" ) + def test_visit_tokens(self): class T(Transformer): def a(self, children): From 99a27663f6fb64c50f61d8b79eb53d2882d0c401 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 20 Nov 2019 21:16:22 +0200 Subject: [PATCH 097/132] Better error message --- lark/parsers/earley_forest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py index bbceb42..e6179e6 100644 --- a/lark/parsers/earley_forest.py +++ b/lark/parsers/earley_forest.py @@ -195,7 +195,7 @@ class ForestVisitor(object): continue if id(next_node) in visiting: - raise ParseError("Infinite recursion in 
grammar!") + raise ParseError("Infinite recursion in grammar, in rule '%s'!" % next_node.s.name) input_stack.append(next_node) continue From 86f1bb1db69196f13a288bab54c1ae5966b49c80 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 20 Nov 2019 22:31:20 +0200 Subject: [PATCH 098/132] Improved the reconstructor, but it still feels like a lost cause --- lark/reconstruct.py | 53 ++++++++++++++++++++++++++++++++++++++------- 1 file changed, 45 insertions(+), 8 deletions(-) diff --git a/lark/reconstruct.py b/lark/reconstruct.py index fb47b93..b7a6659 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -19,9 +19,13 @@ def is_iter_empty(i): except StopIteration: return True + class WriteTokensTransformer(Transformer_InPlace): - def __init__(self, tokens): + "Inserts discarded tokens into their correct place, according to the rules of grammar" + + def __init__(self, tokens, term_subs): self.tokens = tokens + self.term_subs = term_subs def __default__(self, data, children, meta): # if not isinstance(t, MatchTree): @@ -33,10 +37,15 @@ class WriteTokensTransformer(Transformer_InPlace): to_write = [] for sym in meta.orig_expansion: if is_discarded_terminal(sym): - t = self.tokens[sym.name] - if not isinstance(t.pattern, PatternStr): - raise NotImplementedError("Reconstructing regexps not supported yet: %s" % t) - to_write.append(t.pattern.value) + try: + v = self.term_subs[sym.name](sym) + except KeyError: + t = self.tokens[sym.name] + if not isinstance(t.pattern, PatternStr): + raise NotImplementedError("Reconstructing regexps not supported yet: %s" % t) + + v = t.pattern.value + to_write.append(v) else: x = next(iter_args) if isinstance(x, list): @@ -66,14 +75,34 @@ class MakeMatchTree: t.meta.orig_expansion = self.expansion return t +def best_from_group(seq, group_key, cmp_key): + d = {} + for item in seq: + key = group_key(item) + if key in d: + v1 = cmp_key(item) + v2 = cmp_key(d[key]) + if v2 > v1: + d[key] = item + else: + d[key] = item + return list(d.values()) + class Reconstructor: - def __init__(self, parser): + def __init__(self, parser, term_subs={}): # XXX TODO calling compile twice returns different results! assert parser.options.maybe_placeholders == False tokens, rules, _grammar_extra = parser.grammar.compile(parser.options.start) - self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}) + self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}, term_subs) self.rules = list(self._build_recons_rules(rules)) + self.rules.reverse() + # print(len(self.rules)) + self.rules = best_from_group(self.rules, lambda r: r, lambda r: -len(r.expansion)) + # print(len(self.rules)) + + # self.rules = list(set(list(self._build_recons_rules(rules)))) + self.rules.sort(key=lambda r: len(r.expansion)) callbacks = {rule: rule.alias for rule in self.rules} # TODO pass callbacks through dict, instead of alias? 
self.parser = earley.Parser(ParserConf(self.rules, callbacks, parser.options.start), self._match, resolve_ambiguity=True) @@ -127,4 +156,12 @@ class Reconstructor: yield item def reconstruct(self, tree): - return ''.join(self._reconstruct(tree)) + x = self._reconstruct(tree) + y = [] + prev_item = '' + for item in x: + if prev_item and item and prev_item[-1].isalnum() and item[0].isalnum(): + y.append(' ') + y.append(item) + prev_item = item + return ''.join(y) From 0e141ec8962cd875e90127bcb94ed1b7b25db5ad Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 21 Nov 2019 10:43:46 +0200 Subject: [PATCH 099/132] Small addition to docs --- docs/classes.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/docs/classes.md b/docs/classes.md index 1d59551..021b2f4 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -53,6 +53,7 @@ The main tree class * `data` - The name of the rule or alias * `children` - List of matched sub-rules and terminals * `meta` - Line & Column numbers (unless `propagate_positions` is disabled) + * meta attributes: `line`, `column`, `start_pos`, `end_line`, `end_column`, `end_pos` #### \_\_init\_\_(self, data, children) @@ -98,6 +99,7 @@ When using a lexer, the resulting tokens in the trees will be of the Token class * `column` - The column of the token in the text (starting with 1) * `end_line` - The line where the token ends * `end_column` - The next column after the end of the token. For example, if the token is a single character with a `column` value of 4, `end_column` will be 5. +* `end_pos` - the index where the token ends (basically pos_in_stream + len(token)) ## Transformer ## Visitor From 2de7e347668270a255d4df77a00b4080738e51dd Mon Sep 17 00:00:00 2001 From: Jussi Laasonen Date: Thu, 5 Dec 2019 14:58:27 +0100 Subject: [PATCH 100/132] Open imported grammars with UTF-8 encoding --- lark/load_grammar.py | 5 +++-- tests/grammars/test_unicode.lark | 1 + tests/test_parser.py | 6 ++++++ tests/test_relative_import_unicode.lark | 3 +++ 4 files changed, 13 insertions(+), 2 deletions(-) create mode 100644 tests/grammars/test_unicode.lark create mode 100644 tests/test_relative_import_unicode.lark diff --git a/lark/load_grammar.py b/lark/load_grammar.py index a65ca1e..bb8fc2f 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -4,6 +4,7 @@ import os.path import sys from ast import literal_eval from copy import copy, deepcopy +from io import open from .utils import bfs from .lexer import Token, TerminalDef, PatternStr, PatternRE @@ -580,13 +581,13 @@ def import_grammar(grammar_path, base_paths=[]): for import_path in import_paths: with suppress(IOError): joined_path = os.path.join(import_path, grammar_path) - with open(joined_path) as f: + with open(joined_path, encoding='utf8') as f: text = f.read() grammar = load_grammar(text, joined_path) _imported_grammars[grammar_path] = grammar break else: - open(grammar_path) + open(grammar_path, encoding='utf8') assert False return _imported_grammars[grammar_path] diff --git a/tests/grammars/test_unicode.lark b/tests/grammars/test_unicode.lark new file mode 100644 index 0000000..9731d0a --- /dev/null +++ b/tests/grammars/test_unicode.lark @@ -0,0 +1 @@ +UNICODE : /[a-zØ-öø-ÿ]/ \ No newline at end of file diff --git a/tests/test_parser.py b/tests/test_parser.py index caee80e..3004041 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1126,6 +1126,12 @@ def _make_parser_test(LEXER, PARSER): self.assertEqual(x.children, ['12', 'lions']) + def test_relative_import_unicode(self): + l = 
_Lark_open('test_relative_import_unicode.lark', rel_to=__file__) + x = l.parse(u'Ø') + self.assertEqual(x.children, [u'Ø']) + + def test_relative_import_rename(self): l = _Lark_open('test_relative_import_rename.lark', rel_to=__file__) x = l.parse('12 lions') diff --git a/tests/test_relative_import_unicode.lark b/tests/test_relative_import_unicode.lark new file mode 100644 index 0000000..8010537 --- /dev/null +++ b/tests/test_relative_import_unicode.lark @@ -0,0 +1,3 @@ +start: UNICODE + +%import .grammars.test_unicode.UNICODE \ No newline at end of file From 616d2339b062e5fa93a995f6c58cc0ad283c141a Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Wed, 11 Dec 2019 12:47:56 +0200 Subject: [PATCH 101/132] Update README.md --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index b9a1bda..78107de 100644 --- a/README.md +++ b/README.md @@ -34,13 +34,16 @@ Lark has no dependencies. [![Build Status](https://travis-ci.org/lark-parser/lark.svg?branch=master)](https://travis-ci.org/lark-parser/lark) -### Syntax Highlighting (new) +### Syntax Highlighting -Lark now provides syntax highlighting for its grammar files (\*.lark): +Lark provides syntax highlighting for its grammar files (\*.lark): - [Sublime Text & TextMate](https://github.com/lark-parser/lark_syntax) - [vscode](https://github.com/lark-parser/vscode-lark) +### Clones + +- [Lerchen (Julia)](https://github.com/jamesrhester/Lerchen.jl) - an unofficial clone, written entirely in Julia. ### Hello World From d693a172323488c9ba11796c0a48ef39ace79a3a Mon Sep 17 00:00:00 2001 From: Erez Shinan Date: Thu, 12 Dec 2019 09:24:51 +0200 Subject: [PATCH 102/132] Fixed link to Lerche --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 78107de..84e4921 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ Lark provides syntax highlighting for its grammar files (\*.lark): ### Clones -- [Lerchen (Julia)](https://github.com/jamesrhester/Lerchen.jl) - an unofficial clone, written entirely in Julia. +- [Lerche (Julia)](https://github.com/jamesrhester/Lerche.jl) - an unofficial clone, written entirely in Julia. ### Hello World From e1a39c58d0a91b99777e954a6ba7573afae140f8 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 8 Dec 2019 19:13:14 +0200 Subject: [PATCH 103/132] Refactored v_args & visitors to a better, more agile implementation --- lark/parse_tree_builder.py | 23 +++++++++++---- lark/visitors.py | 53 +++++++++++++++++++++++----------- tests/test_parser.py | 58 +++++++++++++++++++++++++++++++++++++- 3 files changed, 111 insertions(+), 23 deletions(-) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 3c47ef0..b50da43 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -3,6 +3,7 @@ from .lexer import Token from .tree import Tree from .visitors import InlineTransformer # XXX Deprecated from .visitors import Transformer_InPlace +from . 
import visitors ###{standalone from functools import partial, wraps @@ -202,6 +203,15 @@ def inplace_transformer(func): return func(tree) return f +def apply_visit_wrapper(func, name, wrapper): + if wrapper is visitors._vargs_meta or wrapper is visitors._vargs_meta_inline: + raise NotImplementedError("Meta args not supported for internal transformer") + @wraps(func) + def f(children): + return wrapper(func, name, children, None) + return f + + class ParseTreeBuilder: def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False): self.tree_class = tree_class @@ -236,12 +246,15 @@ class ParseTreeBuilder: user_callback_name = rule.alias or rule.origin.name try: f = getattr(transformer, user_callback_name) - assert not getattr(f, 'meta', False), "Meta args not supported for internal transformer" # XXX InlineTransformer is deprecated! - if getattr(f, 'inline', False) or isinstance(transformer, InlineTransformer): - f = ptb_inline_args(f) - elif hasattr(f, 'whole_tree') or isinstance(transformer, Transformer_InPlace): - f = inplace_transformer(f) + wrapper = getattr(f, 'visit_wrapper', None) + if wrapper is not None: + f = apply_visit_wrapper(f, user_callback_name, wrapper) + else: + if isinstance(transformer, InlineTransformer): + f = ptb_inline_args(f) + elif isinstance(transformer, Transformer_InPlace): + f = inplace_transformer(f) except AttributeError: f = partial(self.tree_class, user_callback_name) diff --git a/lark/visitors.py b/lark/visitors.py index a2d5e86..da6b1d5 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -35,14 +35,9 @@ class Transformer: return self.__default__(tree.data, children, tree.meta) else: try: - if getattr(f, 'meta', False): - return f(children, tree.meta) - elif getattr(f, 'inline', False): - return f(*children) - elif getattr(f, 'whole_tree', False): - if new_children is not None: - tree.children = new_children - return f(tree) + wrapper = getattr(f, 'visit_wrapper', None) + if wrapper is not None: + return f.visit_wrapper(f, tree.data, children, tree.meta) else: return f(children) except (GrammarError, Discard): @@ -282,8 +277,7 @@ def inline_args(obj): # XXX Deprecated -def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False, static=False): - assert [whole_tree, meta, inline].count(True) <= 1 +def _visitor_args_func_dec(func, visit_wrapper=None, static=False): def create_decorator(_f, with_self): if with_self: def f(self, *args, **kwargs): @@ -298,17 +292,42 @@ def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False, sta else: f = smart_decorator(func, create_decorator) f.vargs_applied = True - f.inline = inline - f.meta = meta - f.whole_tree = whole_tree + f.visit_wrapper = visit_wrapper return f -def v_args(inline=False, meta=False, tree=False): + +def _vargs_inline(f, data, children, meta): + return f(*children) +def _vargs_meta_inline(f, data, children, meta): + return f(meta, *children) +def _vargs_meta(f, data, children, meta): + return f(children, meta) # TODO swap these for consistency? Backwards incompatible! +def _vargs_tree(f, data, children, meta): + return f(Tree(data, children, meta)) + +def v_args(inline=False, meta=False, tree=False, wrapper=None): "A convenience decorator factory, for modifying the behavior of user-supplied visitor methods" - if [tree, meta, inline].count(True) > 1: - raise ValueError("Visitor functions can either accept tree, or meta, or be inlined. 
These cannot be combined.") + if tree and (meta or inline): + raise ValueError("Visitor functions cannot combine 'tree' with 'meta' or 'inline'.") + + func = None + if meta: + if inline: + func = _vargs_meta_inline + else: + func = _vargs_meta + elif inline: + func = _vargs_inline + elif tree: + func = _vargs_tree + + if wrapper is not None: + if func is not None: + raise ValueError("Cannot use 'wrapper' along with 'tree', 'meta' or 'inline'.") + func = wrapper + def _visitor_args_dec(obj): - return _apply_decorator(obj, _visitor_args_func_dec, inline=inline, meta=meta, whole_tree=tree) + return _apply_decorator(obj, _visitor_args_func_dec, visit_wrapper=func) return _visitor_args_dec diff --git a/tests/test_parser.py b/tests/test_parser.py index 3004041..9a96305 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -5,6 +5,7 @@ import unittest import logging import os import sys +from copy import deepcopy try: from cStringIO import StringIO as cStringIO except ImportError: @@ -117,6 +118,61 @@ class TestParsers(unittest.TestCase): r = p.parse("x") self.assertEqual( r.children, ["X!"] ) + def test_vargs_meta(self): + + @v_args(meta=True) + class T1(Transformer): + def a(self, children, meta): + assert not children + return meta.line + + def start(self, children, meta): + return children + + @v_args(meta=True, inline=True) + class T2(Transformer): + def a(self, meta): + return meta.line + + def start(self, meta, *res): + return list(res) + + for T in (T1, T2): + for internal in [False, True]: + try: + g = Lark(r"""start: a+ + a : "x" _NL? + _NL: /\n/+ + """, parser='lalr', transformer=T() if internal else None) + except NotImplementedError: + assert internal + continue + + res = g.parse("xx\nx\nxxx\n\n\nxx") + assert not internal + res = T().transform(res) + + self.assertEqual(res, [1, 1, 2, 3, 3, 3, 6, 6]) + + def test_vargs_tree(self): + tree = Lark(''' + start: a a a + !a: "A" + ''').parse('AAA') + tree_copy = deepcopy(tree) + + @v_args(tree=True) + class T(Transformer): + def a(self, tree): + return 1 + def start(self, tree): + return tree.children + + res = T().transform(tree) + self.assertEqual(res, [1, 1, 1]) + self.assertEqual(tree, tree_copy) + + def test_embedded_transformer(self): class T(Transformer): @@ -188,7 +244,7 @@ class TestParsers(unittest.TestCase): @v_args(tree=True) class T2(Transformer): def a(self, tree): - assert isinstance(tree, Tree) + assert isinstance(tree, Tree), tree tree.children.append("tested") return tree From 8842928963d265e35b9da2c1e2a2acadbee4151a Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Fri, 13 Dec 2019 09:30:41 +0200 Subject: [PATCH 104/132] Fixed multithreading bug in ContextualLexer (Issue #493) --- lark/lexer.py | 19 +++++++------------ lark/parser_frontends.py | 16 ++++++++++++---- 2 files changed, 19 insertions(+), 16 deletions(-) diff --git a/lark/lexer.py b/lark/lexer.py index 871b25e..9d26318 100644 --- a/lark/lexer.py +++ b/lark/lexer.py @@ -288,10 +288,7 @@ class Lexer(object): Method Signatures: lex(self, stream) -> Iterator[Token] - - set_parser_state(self, state) # Optional """ - set_parser_state = NotImplemented lex = NotImplemented @@ -349,6 +346,7 @@ class TraditionalLexer(Lexer): class ContextualLexer(Lexer): + def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}): tokens_by_name = {} for t in terminals: @@ -371,18 +369,15 @@ class ContextualLexer(Lexer): self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks) - self.set_parser_state(None) # Needs to be 
set on the outside - - def set_parser_state(self, state): - self.parser_state = state - - def lex(self, stream): - l = _Lex(self.lexers[self.parser_state], self.parser_state) + def lex(self, stream, get_parser_state): + parser_state = get_parser_state() + l = _Lex(self.lexers[parser_state], parser_state) try: for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): yield x - l.lexer = self.lexers[self.parser_state] - l.state = self.parser_state + parser_state = get_parser_state() + l.lexer = self.lexers[parser_state] + l.state = parser_state # For debug only, no need to worry about multithreading except UnexpectedCharacters as e: # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, # but not in the current context. diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index ec82299..8b42772 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -79,14 +79,13 @@ class WithLexer(_ParserFrontend): def _serialize(self, data, memo): data['parser'] = data['parser'].serialize(memo) - def lex(self, text): - stream = self.lexer.lex(text) + def lex(self, *args): + stream = self.lexer.lex(*args) return self.postlex.process(stream) if self.postlex else stream def parse(self, text, start=None): token_stream = self.lex(text) - sps = self.lexer.set_parser_state - return self._parse(token_stream, start, *[sps] if sps is not NotImplemented else []) + return self._parse(token_stream, start) def init_traditional_lexer(self): self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks) @@ -114,6 +113,15 @@ class LALR_ContextualLexer(LALR_WithLexer): ignore=self.lexer_conf.ignore, always_accept=always_accept, user_callbacks=self.lexer_conf.callbacks) + + + def parse(self, text, start=None): + parser_state = [None] + def set_parser_state(s): + parser_state[0] = s + + token_stream = self.lex(text, lambda: parser_state[0]) + return self._parse(token_stream, start, set_parser_state) ###} class LALR_CustomLexer(LALR_WithLexer): From 0f699b1ebbe1487ecf221ec9e7eb4e37a2283b10 Mon Sep 17 00:00:00 2001 From: Ted Summer Date: Tue, 17 Dec 2019 09:37:28 -0700 Subject: [PATCH 105/132] chore(docs): default for propagate_positions=False --- docs/classes.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/classes.md b/docs/classes.md index 021b2f4..6ec9dcc 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -32,7 +32,7 @@ The Lark class accepts a grammar string or file object, and keyword options: * **keep_all_tokens** - Don't throw away any terminals from the tree (Default=`False`) -* **propagate_positions** - Propagate line/column count to tree nodes, at the cost of performance (default=`True`) +* **propagate_positions** - Propagate line/column count to tree nodes, at the cost of performance (default=`False`) * **maybe_placeholders** - The `[]` operator returns `None` when not matched. 
Setting this to `False` makes it behave like the `?` operator, and return no value at all, which may be a little faster (default=`True`) From 9a0e7af4e23d51965afdda68d78c116d13a88de7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=A9stor=20N=C3=A1poles?= Date: Wed, 18 Dec 2019 16:34:05 -0500 Subject: [PATCH 106/132] Adding one more project using Lark --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 84e4921..d38546f 100644 --- a/README.md +++ b/README.md @@ -146,6 +146,7 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail - [required](https://github.com/shezadkhan137/required) - multi-field validation using docstrings - [miniwdl](https://github.com/chanzuckerberg/miniwdl) - A static analysis toolkit for the Workflow Description Language - [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer + - [harmalysis](https://github.com/napulen/harmalysis) - A language for harmonic analysis and music theory Using Lark? Send me a message and I'll add your project! From dcc9d46eef56dd0fb9633a2c3dc7c223f347baeb Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 25 Dec 2019 10:51:04 +0200 Subject: [PATCH 107/132] Fixes to propagate_positions --- lark/parse_tree_builder.py | 4 ++-- tests/__main__.py | 2 +- tests/test_parser.py | 8 ++++++++ 3 files changed, 11 insertions(+), 3 deletions(-) diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index b50da43..6d298f4 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -29,7 +29,7 @@ class PropagatePositions: if isinstance(res, Tree): for c in children: - if isinstance(c, Tree) and c.children and not c.meta.empty: + if isinstance(c, Tree) and not c.meta.empty: res.meta.line = c.meta.line res.meta.column = c.meta.column res.meta.start_pos = c.meta.start_pos @@ -43,7 +43,7 @@ class PropagatePositions: break for c in reversed(children): - if isinstance(c, Tree) and c.children and not c.meta.empty: + if isinstance(c, Tree) and not c.meta.empty: res.meta.end_line = c.meta.end_line res.meta.end_column = c.meta.end_column res.meta.end_pos = c.meta.end_pos diff --git a/tests/__main__.py b/tests/__main__.py index 901f101..477789f 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -10,7 +10,7 @@ from .test_reconstructor import TestReconstructor try: from .test_nearley.test_nearley import TestNearley except ImportError: - logging.warn("Warning: Skipping tests for Nearley (js2py required)") + logging.warning("Warning: Skipping tests for Nearley grammar imports (js2py required)") # from .test_selectors import TestSelectors # from .test_grammars import TestPythonG, TestConfigG diff --git a/tests/test_parser.py b/tests/test_parser.py index 9a96305..3f73990 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -63,6 +63,14 @@ class TestParsers(unittest.TestCase): r = g.parse('a') self.assertEqual( r.children[0].meta.line, 1 ) + g = Lark("""start: x + x: a + a: "a" + """, propagate_positions=True) + + r = g.parse('a') + self.assertEqual( r.children[0].meta.line, 1 ) + def test_expand1(self): g = Lark("""start: a From f7a6366b6c2e3a6963bceb11a4e0cdbd5a41b7c2 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 25 Dec 2019 11:19:10 +0200 Subject: [PATCH 108/132] Make the JSON parser fast again --- examples/json_parser.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/examples/json_parser.py b/examples/json_parser.py index ba1ff1e..7aa7d0f 100644 --- a/examples/json_parser.py +++ 
b/examples/json_parser.py @@ -49,11 +49,21 @@ class TreeToJson(Transformer): false = lambda self, _: False +### Create the JSON parser with Lark, using the Earley algorithm # json_parser = Lark(json_grammar, parser='earley', lexer='standard') # def parse(x): # return TreeToJson().transform(json_parser.parse(x)) -json_parser = Lark(json_grammar, parser='lalr', lexer='standard', transformer=TreeToJson()) +### Create the JSON parser with Lark, using the LALR algorithm +json_parser = Lark(json_grammar, parser='lalr', + # Using the standard lexer isn't required, and isn't usually recommended. + # But, it's good enough for JSON, and it's slightly faster. + lexer='standard', + # Disabling propagate_positions and placeholders slightly improves speed + propagate_positions=False, + maybe_placeholders=False, + # Using an internal transformer is faster and more memory efficient + transformer=TreeToJson()) parse = json_parser.parse From fbbea5f73093dea5de4867b92831d44c38f60497 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 25 Dec 2019 11:27:29 +0200 Subject: [PATCH 109/132] Removed deprecated feature - profile --- lark/lark.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/lark/lark.py b/lark/lark.py index d334cc7..6e51914 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -44,7 +44,6 @@ class LarkOptions(Serialize): cache_grammar - Cache the Lark grammar (Default: False) postlex - Lexer post-processing (Default: None) Only works with the standard and contextual lexers. start - The start symbol, either a string, or a list of strings for multiple possible starts (Default: "start") - profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False) priority - How priorities should be evaluated - auto, none, normal, invert (Default: auto) propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches. lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. 
@@ -63,7 +62,6 @@ class LarkOptions(Serialize): 'lexer': 'auto', 'transformer': None, 'start': 'start', - 'profile': False, 'priority': 'auto', 'ambiguity': 'auto', 'propagate_positions': True, @@ -114,30 +112,6 @@ class LarkOptions(Serialize): return cls(data) -class Profiler: - def __init__(self): - self.total_time = defaultdict(float) - self.cur_section = '__init__' - self.last_enter_time = time.time() - - def enter_section(self, name): - cur_time = time.time() - self.total_time[self.cur_section] += cur_time - self.last_enter_time - self.last_enter_time = cur_time - self.cur_section = name - - def make_wrapper(self, name, f): - def wrapper(*args, **kwargs): - last_section = self.cur_section - self.enter_section(name) - try: - return f(*args, **kwargs) - finally: - self.enter_section(last_section) - - return wrapper - - class Lark(Serialize): def __init__(self, grammar, **options): """ @@ -165,9 +139,6 @@ class Lark(Serialize): if self.options.cache_grammar: raise NotImplementedError("Not available yet") - assert not self.options.profile, "Feature temporarily disabled" - # self.profiler = Profiler() if self.options.profile else None - if self.options.lexer == 'auto': if self.options.parser == 'lalr': self.options.lexer = 'contextual' From b9c81a54508baf7fe8f96aac7364dfa00608ff42 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 25 Dec 2019 11:39:09 +0200 Subject: [PATCH 110/132] Refactor: Simplify code by assuming rule.options is never None --- examples/standalone/json_parser.py | 9 ++++----- lark/grammar.py | 2 +- lark/lark.py | 4 ++-- lark/load_grammar.py | 4 ++-- lark/parse_tree_builder.py | 6 +++--- lark/parsers/cyk.py | 2 +- lark/parsers/earley.py | 2 +- lark/parsers/earley_forest.py | 2 +- lark/parsers/lalr_parser.py | 3 +-- lark/reconstruct.py | 2 +- 10 files changed, 17 insertions(+), 19 deletions(-) diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py index d424f1b..73acf9c 100644 --- a/examples/standalone/json_parser.py +++ b/examples/standalone/json_parser.py @@ -1305,8 +1305,7 @@ class ParseTreeBuilder: class LALR_Parser(object): def __init__(self, parser_conf, debug=False): - assert all(r.options is None or r.options.priority is None - for r in parser_conf.rules), "LALR doesn't yet support prioritization" + assert all(r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" analysis = LALR_Analyzer(parser_conf, debug=debug) analysis.compute_lookahead() callbacks = parser_conf.callbacks @@ -1508,7 +1507,7 @@ class WithLexer(Serialize): inst.postlex = postlex inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) return inst - + def _serialize(self, data, memo): data['parser'] = data['parser'].serialize(memo) @@ -1740,14 +1739,14 @@ class Lark(Serialize): # This replaces the old 'resolve__antiscore_sum' option. if self.options.priority == 'invert': for rule in self.rules: - if rule.options and rule.options.priority is not None: + if rule.options.priority is not None: rule.options.priority = -rule.options.priority # Else, if the user asked to disable priorities, strip them from the # rules. This allows the Earley parsers to skip an extra forest walk # for improved performance, if you don't need them (or didn't specify any). 
elif self.options.priority == None: for rule in self.rules: - if rule.options and rule.options.priority is not None: + if rule.options.priority is not None: rule.options.priority = None self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks) diff --git a/lark/grammar.py b/lark/grammar.py index 91435b2..cf8cf64 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -81,7 +81,7 @@ class Rule(Serialize): self.expansion = expansion self.alias = alias self.order = order - self.options = options + self.options = options or RuleOptions() self._hash = hash((self.origin, tuple(self.expansion))) def _deserialize(self): diff --git a/lark/lark.py b/lark/lark.py index 6e51914..36cb4b6 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -187,14 +187,14 @@ class Lark(Serialize): # This replaces the old 'resolve__antiscore_sum' option. if self.options.priority == 'invert': for rule in self.rules: - if rule.options and rule.options.priority is not None: + if rule.options.priority is not None: rule.options.priority = -rule.options.priority # Else, if the user asked to disable priorities, strip them from the # rules. This allows the Earley parsers to skip an extra forest walk # for improved performance, if you don't need them (or didn't specify any). elif self.options.priority == None: for rule in self.rules: - if rule.options and rule.options.priority is not None: + if rule.options.priority is not None: rule.options.priority = None # TODO Deprecate lexer_callbacks? diff --git a/lark/load_grammar.py b/lark/load_grammar.py index bb8fc2f..2cd834c 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -503,7 +503,7 @@ class Grammar: ebnf_to_bnf = EBNF_to_BNF() rules = [] for name, rule_tree, options in rule_defs: - ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None + ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options.keep_all_tokens else None tree = transformer.transform(rule_tree) res = ebnf_to_bnf.transform(tree) rules.append((name, res, options)) @@ -527,7 +527,7 @@ class Grammar: empty_indices = [x==_EMPTY for x in expansion] if any(empty_indices): - exp_options = copy(options) if options else RuleOptions() + exp_options = copy(options) exp_options.empty_indices = empty_indices expansion = [x for x in expansion if x!=_EMPTY] else: diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 6d298f4..4ee0071 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -225,12 +225,12 @@ class ParseTreeBuilder: def _init_builders(self, rules): for rule in rules: options = rule.options - keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) - expand_single_child = options.expand1 if options else False + keep_all_tokens = self.always_keep_all_tokens or options.keep_all_tokens + expand_single_child = options.expand1 wrapper_chain = list(filter(None, [ (expand_single_child and not rule.alias) and ExpandSingleChild, - maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders and options else None), + maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders else None), self.propagate_positions and PropagatePositions, self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens), ])) diff --git a/lark/parsers/cyk.py 
b/lark/parsers/cyk.py index 7b25609..ff0924f 100644 --- a/lark/parsers/cyk.py +++ b/lark/parsers/cyk.py @@ -96,7 +96,7 @@ class Parser(object): assert all(isinstance(x, Symbol) for x in lark_rule.expansion) return Rule( lark_rule.origin, lark_rule.expansion, - weight=lark_rule.options.priority if lark_rule.options and lark_rule.options.priority else 0, + weight=lark_rule.options.priority if lark_rule.options.priority else 0, alias=lark_rule) def parse(self, tokenized, start): # pylint: disable=invalid-name diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index e18d26c..a4ffead 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -45,7 +45,7 @@ class Parser: # the priorities will be stripped from all rules before they reach us, allowing us to # skip the extra tree walk. We'll also skip this if the user just didn't specify priorities # on any rules. - if self.forest_sum_visitor is None and rule.options and rule.options.priority is not None: + if self.forest_sum_visitor is None and rule.options.priority is not None: self.forest_sum_visitor = ForestSumVisitor self.term_matcher = term_matcher diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py index e6179e6..c8b4f25 100644 --- a/lark/parsers/earley_forest.py +++ b/lark/parsers/earley_forest.py @@ -250,7 +250,7 @@ class ForestSumVisitor(ForestVisitor): return iter(node.children) def visit_packed_node_out(self, node): - priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options and node.rule.options.priority else 0 + priority = node.rule.options.priority if not node.parent.is_intermediate and node.rule.options.priority else 0 priority += getattr(node.right, 'priority', 0) priority += getattr(node.left, 'priority', 0) node.priority = priority diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py index 82c8bba..4265ca5 100644 --- a/lark/parsers/lalr_parser.py +++ b/lark/parsers/lalr_parser.py @@ -12,8 +12,7 @@ from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable ###{standalone class LALR_Parser(object): def __init__(self, parser_conf, debug=False): - assert all(r.options is None or r.options.priority is None - for r in parser_conf.rules), "LALR doesn't yet support prioritization" + assert all(r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" analysis = LALR_Analyzer(parser_conf, debug=debug) analysis.compute_lalr() callbacks = parser_conf.callbacks diff --git a/lark/reconstruct.py b/lark/reconstruct.py index b7a6659..bd7b6a0 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -108,7 +108,7 @@ class Reconstructor: self._match, resolve_ambiguity=True) def _build_recons_rules(self, rules): - expand1s = {r.origin for r in rules if r.options and r.options.expand1} + expand1s = {r.origin for r in rules if r.options.expand1} aliases = defaultdict(list) for r in rules: From f0da22e9a849bd700b67bd550a11b1eefb3235be Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 26 Dec 2019 19:42:01 +0200 Subject: [PATCH 111/132] LarkOptions now raises AttributeError instead of KeyError (Issue #503) --- lark/lark.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lark/lark.py b/lark/lark.py index 36cb4b6..ea4f46a 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -99,7 +99,11 @@ class LarkOptions(Serialize): raise ValueError("Unknown options: %s" % o.keys()) def __getattr__(self, name): - return self.options[name] + try: + return self.options[name] + except KeyError as e: + 
raise AttributeError(e) + def __setattr__(self, name, value): assert name in self.options self.options[name] = value From b2f1b3bf7c63d980f69025d467064ba504f6a279 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 11 Jan 2020 16:05:29 +0200 Subject: [PATCH 112/132] Small fixes --- examples/standalone/json_parser.py | 608 ++++++++++++++++++++--------- lark/lark.py | 2 - lark/lexer.py | 2 +- lark/load_grammar.py | 35 +- lark/parsers/lalr_analysis.py | 2 +- lark/tools/standalone.py | 3 + lark/utils.py | 26 ++ 7 files changed, 451 insertions(+), 227 deletions(-) diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py index 73acf9c..f270ade 100644 --- a/examples/standalone/json_parser.py +++ b/examples/standalone/json_parser.py @@ -1,4 +1,4 @@ -# The file was automatically generated by Lark v0.7.0 +# The file was automatically generated by Lark v0.8.0rc1 # # # Lark Stand-alone Generator Tool @@ -35,6 +35,9 @@ # # +import os +from io import open + class LarkError(Exception): pass @@ -47,6 +50,14 @@ class ParseError(LarkError): class LexError(LarkError): pass +class UnexpectedEOF(ParseError): + def __init__(self, expected): + self.expected = expected + + message = ("Unexpected end-of-input. Expected one of: \n\t* %s\n" % '\n\t* '.join(x.name for x in self.expected)) + super(UnexpectedEOF, self).__init__(message) + + class UnexpectedInput(LarkError): pos_in_stream = None @@ -86,7 +97,7 @@ class UnexpectedInput(LarkError): class UnexpectedCharacters(LexError, UnexpectedInput): - def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None): + def __init__(self, seq, lex_pos, line, column, allowed=None, considered_tokens=None, state=None, token_history=None): message = "No terminal defined for '%s' at line %d col %d" % (seq[lex_pos], line, column) self.line = line @@ -99,6 +110,8 @@ class UnexpectedCharacters(LexError, UnexpectedInput): message += '\n\n' + self.get_context(seq) if allowed: message += '\nExpecting: %s\n' % allowed + if token_history: + message += '\nPrevious tokens: %s\n' % ', '.join(repr(t) for t in token_history) super(UnexpectedCharacters, self).__init__(message) @@ -121,13 +134,25 @@ class UnexpectedToken(ParseError, UnexpectedInput): super(UnexpectedToken, self).__init__(message) class VisitError(LarkError): - def __init__(self, tree, orig_exc): - self.tree = tree + def __init__(self, rule, obj, orig_exc): + self.obj = obj self.orig_exc = orig_exc - message = 'Error trying to process rule "%s":\n\n%s' % (tree.data, orig_exc) + message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc) super(VisitError, self).__init__(message) +def classify(seq, key=None, value=None): + d = {} + for item in seq: + k = key(item) if (key is not None) else item + v = value(item) if (value is not None) else item + if k in d: + d[k].append(v) + else: + d[k] = [v] + return d + + def _deserialize(data, namespace, memo): if isinstance(data, dict): if '__type__' in data: # Object @@ -170,7 +195,10 @@ class Serialize(object): inst = cls.__new__(cls) for f in fields: - setattr(inst, f, _deserialize(data[f], namespace, memo)) + try: + setattr(inst, f, _deserialize(data[f], namespace, memo)) + except KeyError as e: + raise KeyError("Cannot find key for class", cls, e) postprocess = getattr(inst, '_deserialize', None) if postprocess: postprocess() @@ -224,7 +252,7 @@ def smart_decorator(f, create_decorator): elif isinstance(f, partial): # wraps does not work for partials in 2.7: https://bugs.python.org/issue3445 - return 
create_decorator(f.__func__, True) + return wraps(f.func)(create_decorator(lambda *args, **kw: f(*args[1:], **kw), True)) else: return create_decorator(f.__func__.__call__, True) @@ -232,6 +260,15 @@ def smart_decorator(f, create_decorator): import sys, re Py36 = (sys.version_info[:2] >= (3, 6)) +import sre_parse +import sre_constants +def get_regexp_width(regexp): + try: + return [int(x) for x in sre_parse.parse(regexp).getwidth()] + except sre_constants.error: + raise ValueError(regexp) + + class Meta: def __init__(self): self.empty = True @@ -282,6 +319,36 @@ class Tree(object): def __hash__(self): return hash((self.data, tuple(self.children))) + def iter_subtrees(self): + # TODO: Re-write as a more efficient version + + visited = set() + q = [self] + + l = [] + while q: + subtree = q.pop() + l.append( subtree ) + if id(subtree) in visited: + continue # already been here from another branch + visited.add(id(subtree)) + q += [c for c in subtree.children if isinstance(c, Tree)] + + seen = set() + for x in reversed(l): + if id(x) not in seen: + yield x + seen.add(id(x)) + + def find_pred(self, pred): + "Find all nodes where pred(tree) == True" + return filter(pred, self.iter_subtrees()) + + def find_data(self, data): + "Find all nodes where tree.data == data" + return self.find_pred(lambda t: t.data == data) + + from inspect import getmembers, getmro class Discard(Exception): @@ -298,6 +365,10 @@ class Transformer: Can be used to implement map or reduce. """ + __visit_tokens__ = True # For backwards compatibility + def __init__(self, visit_tokens=True): + self.__visit_tokens__ = visit_tokens + def _call_userfunc(self, tree, new_children=None): # Assumes tree is already transformed children = new_children if new_children is not None else tree.children @@ -307,25 +378,39 @@ class Transformer: return self.__default__(tree.data, children, tree.meta) else: try: - if getattr(f, 'meta', False): - return f(children, tree.meta) - elif getattr(f, 'inline', False): - return f(*children) - elif getattr(f, 'whole_tree', False): - if new_children is not None: - raise NotImplementedError("Doesn't work with the base Transformer class") - return f(tree) + wrapper = getattr(f, 'visit_wrapper', None) + if wrapper is not None: + return f.visit_wrapper(f, tree.data, children, tree.meta) else: return f(children) except (GrammarError, Discard): raise except Exception as e: - raise VisitError(tree, e) + raise VisitError(tree.data, tree, e) + + def _call_userfunc_token(self, token): + try: + f = getattr(self, token.type) + except AttributeError: + return self.__default_token__(token) + else: + try: + return f(token) + except (GrammarError, Discard): + raise + except Exception as e: + raise VisitError(token.type, token, e) + def _transform_children(self, children): for c in children: try: - yield self._transform_tree(c) if isinstance(c, Tree) else c + if isinstance(c, Tree): + yield self._transform_tree(c) + elif self.__visit_tokens__ and isinstance(c, Token): + yield self._call_userfunc_token(c) + else: + yield c except Discard: pass @@ -343,13 +428,20 @@ class Transformer: "Default operation on tree (for override)" return Tree(data, children, meta) + def __default_token__(self, token): + "Default operation on token (for override)" + return token + + @classmethod def _apply_decorator(cls, decorator, **kwargs): mro = getmro(cls) assert mro[0] is cls libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)} for name, value in getmembers(cls): - if name.startswith('_') or name in libmembers: + + # Make 
sure the function isn't inherited (unless it's overwritten) + if name.startswith('_') or (name in libmembers and name not in cls.__dict__): continue if not callable(cls.__dict__[name]): continue @@ -432,6 +524,11 @@ class Visitor(VisitorBase): self._call_userfunc(subtree) return tree + def visit_topdown(self,tree): + for subtree in tree.iter_subtrees_topdown(): + self._call_userfunc(subtree) + return tree + class Visitor_Recursive(VisitorBase): """Bottom-up visitor, recursive @@ -444,8 +541,16 @@ class Visitor_Recursive(VisitorBase): if isinstance(child, Tree): self.visit(child) - f = getattr(self, tree.data, self.__default__) - f(tree) + self._call_userfunc(tree) + return tree + + def visit_topdown(self,tree): + self._call_userfunc(tree) + + for child in tree.children: + if isinstance(child, Tree): + self.visit_topdown(child) + return tree @@ -515,8 +620,7 @@ def inline_args(obj): # XXX Deprecated -def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False, static=False): - assert [whole_tree, meta, inline].count(True) <= 1 +def _visitor_args_func_dec(func, visit_wrapper=None, static=False): def create_decorator(_f, with_self): if with_self: def f(self, *args, **kwargs): @@ -531,17 +635,42 @@ def _visitor_args_func_dec(func, inline=False, meta=False, whole_tree=False, sta else: f = smart_decorator(func, create_decorator) f.vargs_applied = True - f.inline = inline - f.meta = meta - f.whole_tree = whole_tree + f.visit_wrapper = visit_wrapper return f -def v_args(inline=False, meta=False, tree=False): + +def _vargs_inline(f, data, children, meta): + return f(*children) +def _vargs_meta_inline(f, data, children, meta): + return f(meta, *children) +def _vargs_meta(f, data, children, meta): + return f(children, meta) # TODO swap these for consistency? Backwards incompatible! +def _vargs_tree(f, data, children, meta): + return f(Tree(data, children, meta)) + +def v_args(inline=False, meta=False, tree=False, wrapper=None): "A convenience decorator factory, for modifying the behavior of user-supplied visitor methods" - if [tree, meta, inline].count(True) > 1: - raise ValueError("Visitor functions can either accept tree, or meta, or be inlined. 
These cannot be combined.") + if tree and (meta or inline): + raise ValueError("Visitor functions cannot combine 'tree' with 'meta' or 'inline'.") + + func = None + if meta: + if inline: + func = _vargs_meta_inline + else: + func = _vargs_meta + elif inline: + func = _vargs_inline + elif tree: + func = _vargs_tree + + if wrapper is not None: + if func is not None: + raise ValueError("Cannot use 'wrapper' along with 'tree', 'meta' or 'inline'.") + func = wrapper + def _visitor_args_dec(obj): - return _apply_decorator(obj, _visitor_args_func_dec, inline=inline, meta=meta, whole_tree=tree) + return _apply_decorator(obj, _visitor_args_func_dec, visit_wrapper=func) return _visitor_args_dec @@ -604,6 +733,8 @@ class Indenter: class Symbol(Serialize): + __slots__ = ('name',) + is_term = NotImplemented def __init__(self, name): @@ -680,7 +811,7 @@ class Rule(Serialize): self.expansion = expansion self.alias = alias self.order = order - self.options = options + self.options = options or RuleOptions() self._hash = hash((self.origin, tuple(self.expansion))) def _deserialize(self): @@ -705,7 +836,6 @@ class Rule(Serialize): class Pattern(Serialize): - __serialize_fields__ = 'value', 'flags' def __init__(self, value, flags=()): self.value = value @@ -738,6 +868,10 @@ class Pattern(Serialize): class PatternStr(Pattern): + __serialize_fields__ = 'value', 'flags' + + type = "str" + def to_regexp(self): return self._get_flags(re.escape(self.value)) @@ -747,15 +881,25 @@ class PatternStr(Pattern): max_width = min_width class PatternRE(Pattern): + __serialize_fields__ = 'value', 'flags', '_width' + + type = "re" + def to_regexp(self): return self._get_flags(self.value) + _width = None + def _get_width(self): + if self._width is None: + self._width = get_regexp_width(self.to_regexp()) + return self._width + @property def min_width(self): - return get_regexp_width(self.to_regexp())[0] + return self._get_width()[0] @property def max_width(self): - return get_regexp_width(self.to_regexp())[1] + return self._get_width()[1] class TerminalDef(Serialize): @@ -774,9 +918,9 @@ class TerminalDef(Serialize): class Token(Str): - __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column') + __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column', 'end_pos') - def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None): + def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None, end_line=None, end_column=None, end_pos=None): try: self = super(Token, cls).__new__(cls, value) except UnicodeDecodeError: @@ -790,11 +934,19 @@ class Token(Str): self.column = column self.end_line = end_line self.end_column = end_column + self.end_pos = end_pos return self + def update(self, type_=None, value=None): + return Token.new_borrow_pos( + type_ if type_ is not None else self.type, + value if value is not None else self.value, + self + ) + @classmethod def new_borrow_pos(cls, type_, value, borrow_t): - return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column) + return cls(type_, value, borrow_t.pos_in_stream, borrow_t.line, borrow_t.column, borrow_t.end_line, borrow_t.end_column, borrow_t.end_pos) def __reduce__(self): return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, )) @@ -846,38 +998,38 @@ class _Lex: newline_types = frozenset(newline_types) ignore_types = frozenset(ignore_types) line_ctr = LineCounter() 
+ last_token = None while line_ctr.char_pos < len(stream): lexer = self.lexer - for mre, type_from_index in lexer.mres: - m = mre.match(stream, line_ctr.char_pos) - if not m: - continue - - t = None - value = m.group(0) - type_ = type_from_index[m.lastindex] - if type_ not in ignore_types: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - if t.type in lexer.callback: - t = lexer.callback[t.type](t) - if not isinstance(t, Token): - raise ValueError("Callbacks must return a token (returned %r)" % t) - yield t - else: - if type_ in lexer.callback: - t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) - lexer.callback[type_](t) + res = lexer.match(stream, line_ctr.char_pos) + if not res: + allowed = {v for m, tfi in lexer.mres for v in tfi.values()} - ignore_types + if not allowed: + allowed = {""} + raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state, token_history=last_token and [last_token]) - line_ctr.feed(value, type_ in newline_types) - if t: - t.end_line = line_ctr.line - t.end_column = line_ctr.column + value, type_ = res - break + if type_ not in ignore_types: + t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + line_ctr.feed(value, type_ in newline_types) + t.end_line = line_ctr.line + t.end_column = line_ctr.column + t.end_pos = line_ctr.char_pos + if t.type in lexer.callback: + t = lexer.callback[t.type](t) + if not isinstance(t, Token): + raise ValueError("Callbacks must return a token (returned %r)" % t) + yield t + last_token = t else: - allowed = [v for m, tfi in lexer.mres for v in tfi.values()] - raise UnexpectedCharacters(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column, allowed=allowed, state=self.state) + if type_ in lexer.callback: + t2 = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) + lexer.callback[type_](t2) + line_ctr.feed(value, type_ in newline_types) + + class UnlessCallback: @@ -950,34 +1102,25 @@ def build_mres(terminals, match_whole=False): return _build_mres(terminals, len(terminals), match_whole) def _regexp_has_newline(r): - """Expressions that may indicate newlines in a regexp: + r"""Expressions that may indicate newlines in a regexp: - newlines (\n) - escaped newline (\\n) - anything but ([^...]) - any-char (.) when the flag (?s) exists + - spaces (\s) """ - return '\n' in r or '\\n' in r or '[^' in r or ('(?s' in r and '.' in r) + return '\n' in r or '\\n' in r or '\\s' in r or '[^' in r or ('(?s' in r and '.' 
in r) -class Lexer(Serialize): +class Lexer(object): """Lexer interface Method Signatures: lex(self, stream) -> Iterator[Token] - - set_parser_state(self, state) # Optional """ - set_parser_state = NotImplemented lex = NotImplemented class TraditionalLexer(Lexer): - __serialize_fields__ = 'terminals', 'ignore_types', 'newline_types' - __serialize_namespace__ = TerminalDef, - - def _deserialize(self): - self.mres = build_mres(self.terminals) - self.callback = {} # TODO implement - def __init__(self, terminals, ignore=(), user_callbacks={}): assert all(isinstance(t, TerminalDef) for t in terminals), terminals @@ -988,7 +1131,7 @@ class TraditionalLexer(Lexer): for t in terminals: try: re.compile(t.pattern.to_regexp()) - except: + except re.error: raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern)) if t.pattern.min_width == 0: @@ -1001,21 +1144,28 @@ class TraditionalLexer(Lexer): self.ignore_types = list(ignore) terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) + self.terminals = terminals + self.user_callbacks = user_callbacks + self.build() - terminals, self.callback = _create_unless(terminals) + def build(self): + terminals, self.callback = _create_unless(self.terminals) assert all(self.callback.values()) - for type_, f in user_callbacks.items(): + for type_, f in self.user_callbacks.items(): if type_ in self.callback: # Already a callback there, probably UnlessCallback self.callback[type_] = CallChain(self.callback[type_], f, lambda t: t.type == type_) else: self.callback[type_] = f - self.terminals = terminals - self.mres = build_mres(terminals) + def match(self, stream, pos): + for mre, type_from_index in self.mres: + m = mre.match(stream, pos) + if m: + return m.group(0), type_from_index[m.lastindex] def lex(self, stream): return _Lex(self).lex(stream, self.newline_types, self.ignore_types) @@ -1024,8 +1174,6 @@ class TraditionalLexer(Lexer): class ContextualLexer(Lexer): - __serialize_fields__ = 'root_lexer', 'lexers' - __serialize_namespace__ = TraditionalLexer, def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}): tokens_by_name = {} @@ -1049,17 +1197,41 @@ class ContextualLexer(Lexer): self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks) - self.set_parser_state(None) # Needs to be set on the outside + def lex(self, stream, get_parser_state): + parser_state = get_parser_state() + l = _Lex(self.lexers[parser_state], parser_state) + try: + for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): + yield x + parser_state = get_parser_state() + l.lexer = self.lexers[parser_state] + l.state = parser_state # For debug only, no need to worry about multithreading + except UnexpectedCharacters as e: + # In the contextual lexer, UnexpectedCharacters can mean that the terminal is defined, + # but not in the current context. + # This tests the input against the global context, to provide a nicer error. 
+ root_match = self.root_lexer.match(stream, e.pos_in_stream) + if not root_match: + raise - def set_parser_state(self, state): - self.parser_state = state + value, type_ = root_match + t = Token(type_, value, e.pos_in_stream, e.line, e.column) + raise UnexpectedToken(t, e.allowed, state=e.state) - def lex(self, stream): - l = _Lex(self.lexers[self.parser_state], self.parser_state) - for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): - yield x - l.lexer = self.lexers[self.parser_state] - l.state = self.parser_state + + +class LexerConf(Serialize): + __serialize_fields__ = 'tokens', 'ignore' + __serialize_namespace__ = TerminalDef, + + def __init__(self, tokens, ignore=(), postlex=None, callbacks=None): + self.tokens = tokens + self.ignore = ignore + self.postlex = postlex + self.callbacks = callbacks or {} + + def _deserialize(self): + self.callbacks = {} # TODO from functools import partial, wraps @@ -1085,7 +1257,7 @@ class PropagatePositions: if isinstance(res, Tree): for c in children: - if isinstance(c, Tree) and c.children and not c.meta.empty: + if isinstance(c, Tree) and not c.meta.empty: res.meta.line = c.meta.line res.meta.column = c.meta.column res.meta.start_pos = c.meta.start_pos @@ -1099,7 +1271,7 @@ class PropagatePositions: break for c in reversed(children): - if isinstance(c, Tree) and c.children and not c.meta.empty: + if isinstance(c, Tree) and not c.meta.empty: res.meta.end_line = c.meta.end_line res.meta.end_column = c.meta.end_column res.meta.end_pos = c.meta.end_pos @@ -1108,7 +1280,7 @@ class PropagatePositions: elif isinstance(c, Token): res.meta.end_line = c.end_line res.meta.end_column = c.end_column - res.meta.end_pos = c.pos_in_stream + len(c.value) + res.meta.end_pos = c.end_pos res.meta.empty = False break @@ -1251,6 +1423,23 @@ def ptb_inline_args(func): return func(*children) return f +def inplace_transformer(func): + @wraps(func) + def f(children): + # function name in a Transformer is a rule name. 
+ tree = Tree(func.__name__, children) + return func(tree) + return f + +def apply_visit_wrapper(func, name, wrapper): + if wrapper is visitors._vargs_meta or wrapper is visitors._vargs_meta_inline: + raise NotImplementedError("Meta args not supported for internal transformer") + @wraps(func) + def f(children): + return wrapper(func, name, children, None) + return f + + class ParseTreeBuilder: def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False): self.tree_class = tree_class @@ -1264,12 +1453,12 @@ class ParseTreeBuilder: def _init_builders(self, rules): for rule in rules: options = rule.options - keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) - expand_single_child = options.expand1 if options else False + keep_all_tokens = self.always_keep_all_tokens or options.keep_all_tokens + expand_single_child = options.expand1 wrapper_chain = list(filter(None, [ (expand_single_child and not rule.alias) and ExpandSingleChild, - maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders and options else None), + maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous, options.empty_indices if self.maybe_placeholders else None), self.propagate_positions and PropagatePositions, self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens), ])) @@ -1285,10 +1474,15 @@ class ParseTreeBuilder: user_callback_name = rule.alias or rule.origin.name try: f = getattr(transformer, user_callback_name) - assert not getattr(f, 'meta', False), "Meta args not supported for internal transformer" # XXX InlineTransformer is deprecated! 
- if getattr(f, 'inline', False) or isinstance(transformer, InlineTransformer): - f = ptb_inline_args(f) + wrapper = getattr(f, 'visit_wrapper', None) + if wrapper is not None: + f = apply_visit_wrapper(f, user_callback_name, wrapper) + else: + if isinstance(transformer, InlineTransformer): + f = ptb_inline_args(f) + elif isinstance(transformer, Transformer_InPlace): + f = inplace_transformer(f) except AttributeError: f = partial(self.tree_class, user_callback_name) @@ -1307,7 +1501,7 @@ class LALR_Parser(object): def __init__(self, parser_conf, debug=False): assert all(r.options.priority is None for r in parser_conf.rules), "LALR doesn't yet support prioritization" analysis = LALR_Analyzer(parser_conf, debug=debug) - analysis.compute_lookahead() + analysis.compute_lalr() callbacks = parser_conf.callbacks self._parse_table = analysis.parse_table @@ -1317,7 +1511,8 @@ class LALR_Parser(object): @classmethod def deserialize(cls, data, memo, callbacks): inst = cls.__new__(cls) - inst.parser = _Parser(IntParseTable.deserialize(data, memo), callbacks) + inst._parse_table = IntParseTable.deserialize(data, memo) + inst.parser = _Parser(inst._parse_table, callbacks) return inst def serialize(self, memo): @@ -1330,19 +1525,22 @@ class LALR_Parser(object): class _Parser: def __init__(self, parse_table, callbacks): self.states = parse_table.states - self.start_state = parse_table.start_state - self.end_state = parse_table.end_state + self.start_states = parse_table.start_states + self.end_states = parse_table.end_states self.callbacks = callbacks - def parse(self, seq, set_state=None): + def parse(self, seq, start, set_state=None): token = None stream = iter(seq) states = self.states - state_stack = [self.start_state] + start_state = self.start_states[start] + end_state = self.end_states[start] + + state_stack = [start_state] value_stack = [] - if set_state: set_state(self.start_state) + if set_state: set_state(start_state) def get_action(token): state = state_stack[-1] @@ -1372,7 +1570,7 @@ class _Parser: for token in stream: while True: action, arg = get_action(token) - assert arg != self.end_state + assert arg != end_state if action is Shift: state_stack.append(arg) @@ -1385,12 +1583,10 @@ class _Parser: token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1) while True: _action, arg = get_action(token) - if _action is Shift: - assert arg == self.end_state - val ,= value_stack - return val - else: - reduce(arg) + assert(_action is Reduce) + reduce(arg) + if state_stack[-1] == end_state: + return value_stack[-1] @@ -1405,11 +1601,12 @@ class Action: Shift = Action('Shift') Reduce = Action('Reduce') + class ParseTable: - def __init__(self, states, start_state, end_state): + def __init__(self, states, start_states, end_states): self.states = states - self.start_state = start_state - self.end_state = end_state + self.start_states = start_states + self.end_states = end_states def serialize(self, memo): tokens = Enumerator() @@ -1424,8 +1621,8 @@ class ParseTable: return { 'tokens': tokens.reversed(), 'states': states, - 'start_state': self.start_state, - 'end_state': self.end_state, + 'start_states': self.start_states, + 'end_states': self.end_states, } @classmethod @@ -1436,7 +1633,7 @@ class ParseTable: for token, (action, arg) in actions.items()} for state, actions in data['states'].items() } - return cls(states, data['start_state'], data['end_state']) + return cls(states, data['start_states'], data['end_states']) class IntParseTable(ParseTable): @@ -1453,9 +1650,9 
@@ class IntParseTable(ParseTable): int_states[ state_to_idx[s] ] = la - start_state = state_to_idx[parse_table.start_state] - end_state = state_to_idx[parse_table.end_state] - return cls(int_states, start_state, end_state) + start_states = {start:state_to_idx[s] for start, s in parse_table.start_states.items()} + end_states = {start:state_to_idx[s] for start, s in parse_table.end_states.items()} + return cls(int_states, start_states, end_states) @@ -1491,63 +1688,84 @@ def get_frontend(parser, lexer): raise ValueError('Unknown parser: %s' % parser) +class _ParserFrontend(Serialize): + def _parse(self, input, start, *args): + if start is None: + start = self.start + if len(start) > 1: + raise ValueError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start) + start ,= start + return self.parser.parse(input, start, *args) -class WithLexer(Serialize): +class WithLexer(_ParserFrontend): lexer = None parser = None lexer_conf = None + start = None - __serialize_fields__ = 'parser', 'lexer' - __serialize_namespace__ = Rule, ContextualLexer, TraditionalLexer + __serialize_fields__ = 'parser', 'lexer_conf', 'start' + __serialize_namespace__ = LexerConf, + + def __init__(self, lexer_conf, parser_conf, options=None): + self.lexer_conf = lexer_conf + self.start = parser_conf.start + self.postlex = lexer_conf.postlex @classmethod def deserialize(cls, data, memo, callbacks, postlex): inst = super(WithLexer, cls).deserialize(data, memo) inst.postlex = postlex inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks) + inst.init_lexer() return inst def _serialize(self, data, memo): data['parser'] = data['parser'].serialize(memo) - def init_traditional_lexer(self, lexer_conf): - self.lexer_conf = lexer_conf - self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks) - self.postlex = lexer_conf.postlex - - def init_contextual_lexer(self, lexer_conf): - self.lexer_conf = lexer_conf - self.postlex = lexer_conf.postlex - states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} - always_accept = self.postlex.always_accept if self.postlex else () - self.lexer = ContextualLexer(lexer_conf.tokens, states, - ignore=lexer_conf.ignore, - always_accept=always_accept, - user_callbacks=lexer_conf.callbacks) - - def lex(self, text): - stream = self.lexer.lex(text) + def lex(self, *args): + stream = self.lexer.lex(*args) return self.postlex.process(stream) if self.postlex else stream - def parse(self, text): + def parse(self, text, start=None): token_stream = self.lex(text) - sps = self.lexer.set_parser_state - return self.parser.parse(token_stream, *[sps] if sps is not NotImplemented else []) + return self._parse(token_stream, start) + def init_traditional_lexer(self): + self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks) -class LALR_TraditionalLexer(WithLexer): +class LALR_WithLexer(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): debug = options.debug if options else False self.parser = LALR_Parser(parser_conf, debug=debug) - self.init_traditional_lexer(lexer_conf) + WithLexer.__init__(self, lexer_conf, parser_conf, options) -class LALR_ContextualLexer(WithLexer): - def __init__(self, lexer_conf, parser_conf, options=None): - debug = options.debug if options else False - self.parser = LALR_Parser(parser_conf, debug=debug) - self.init_contextual_lexer(lexer_conf) + self.init_lexer() + + 
def init_lexer(self): + raise NotImplementedError() + +class LALR_TraditionalLexer(LALR_WithLexer): + def init_lexer(self): + self.init_traditional_lexer() + +class LALR_ContextualLexer(LALR_WithLexer): + def init_lexer(self): + states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()} + always_accept = self.postlex.always_accept if self.postlex else () + self.lexer = ContextualLexer(self.lexer_conf.tokens, states, + ignore=self.lexer_conf.ignore, + always_accept=always_accept, + user_callbacks=self.lexer_conf.callbacks) + + + def parse(self, text, start=None): + parser_state = [None] + def set_parser_state(s): + parser_state[0] = s + token_stream = self.lex(text, lambda: parser_state[0]) + return self._parse(token_stream, start, set_parser_state) class LarkOptions(Serialize): @@ -1576,8 +1794,7 @@ class LarkOptions(Serialize): keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False) cache_grammar - Cache the Lark grammar (Default: False) postlex - Lexer post-processing (Default: None) Only works with the standard and contextual lexers. - start - The start symbol (Default: start) - profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False) + start - The start symbol, either a string, or a list of strings for multiple possible starts (Default: "start") priority - How priorities should be evaluated - auto, none, normal, invert (Default: auto) propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches. lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. @@ -1596,12 +1813,12 @@ class LarkOptions(Serialize): 'lexer': 'auto', 'transformer': None, 'start': 'start', - 'profile': False, 'priority': 'auto', 'ambiguity': 'auto', - 'propagate_positions': False, + 'propagate_positions': True, 'lexer_callbacks': {}, - 'maybe_placeholders': False, + 'maybe_placeholders': True, + 'edit_terminals': None, } def __init__(self, options_dict): @@ -1618,6 +1835,9 @@ class LarkOptions(Serialize): options[name] = value + if isinstance(options['start'], STRING_TYPE): + options['start'] = [options['start']] + self.__dict__['options'] = options assert self.parser in ('earley', 'lalr', 'cyk', None) @@ -1630,7 +1850,11 @@ class LarkOptions(Serialize): raise ValueError("Unknown options: %s" % o.keys()) def __getattr__(self, name): - return self.options[name] + try: + return self.options[name] + except KeyError as e: + raise AttributeError(e) + def __setattr__(self, name, value): assert name in self.options self.options[name] = value @@ -1643,30 +1867,6 @@ class LarkOptions(Serialize): return cls(data) -class Profiler: - def __init__(self): - self.total_time = defaultdict(float) - self.cur_section = '__init__' - self.last_enter_time = time.time() - - def enter_section(self, name): - cur_time = time.time() - self.total_time[self.cur_section] += cur_time - self.last_enter_time - self.last_enter_time = cur_time - self.cur_section = name - - def make_wrapper(self, name, f): - def wrapper(*args, **kwargs): - last_section = self.cur_section - self.enter_section(name) - try: - return f(*args, **kwargs) - finally: - self.enter_section(last_section) - - return wrapper - - class Lark(Serialize): def __init__(self, grammar, **options): """ @@ -1694,9 +1894,6 @@ class Lark(Serialize): if self.options.cache_grammar: raise NotImplementedError("Not available yet") - assert not self.options.profile, "Feature temporarily disabled" - # self.profiler 
= Profiler() if self.options.profile else None - if self.options.lexer == 'auto': if self.options.parser == 'lalr': self.options.lexer = 'contextual' @@ -1733,7 +1930,13 @@ class Lark(Serialize): self.grammar = load_grammar(grammar, self.source) # Compile the EBNF grammar into BNF - self.terminals, self.rules, self.ignore_tokens = self.grammar.compile() + self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start) + + if self.options.edit_terminals: + for t in self.terminals: + self.options.edit_terminals(t) + + self._terminals_dict = {t.name:t for t in self.terminals} # If the user asked to invert the priorities, negate them all here. # This replaces the old 'resolve__antiscore_sum' option. @@ -1748,7 +1951,16 @@ class Lark(Serialize): for rule in self.rules: if rule.options.priority is not None: rule.options.priority = None - self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks) + + # TODO Deprecate lexer_callbacks? + lexer_callbacks = dict(self.options.lexer_callbacks) + if self.options.transformer: + t = self.options.transformer + for term in self.terminals: + if hasattr(t, term.name): + lexer_callbacks[term.name] = getattr(t, term.name) + + self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks) if self.options.parser: self.parser = self._build_parser() @@ -1783,6 +1995,7 @@ class Lark(Serialize): options['postlex'] = postlex inst.options = LarkOptions.deserialize(options, memo) inst.rules = [Rule.deserialize(r, memo) for r in data['rules']] + inst.source = '' inst._prepare_callbacks() inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks, inst.options.postlex) return inst @@ -1819,16 +2032,25 @@ class Lark(Serialize): return self.options.postlex.process(stream) return stream - def parse(self, text): - "Parse the given text, according to the options provided. Returns a tree, unless specified otherwise." - return self.parser.parse(text) + def get_terminal(self, name): + "Get information about a terminal" + return self._terminals_dict[name] + + def parse(self, text, start=None): + """Parse the given text, according to the options provided. + + The 'start' parameter is required if Lark was given multiple possible start symbols (using the start option). + + Returns a tree, unless specified otherwise. 
+ """ + return self.parser.parse(text, start=start) DATA = ( -{'rules': [{'@': 27}, {'@': 31}, {'@': 26}, {'@': 13}, {'@': 24}, {'@': 18}, {'@': 16}, {'@': 23}, {'@': 21}, {'@': 17}, {'@': 28}, {'@': 30}, {'@': 25}, {'@': 29}, {'@': 20}, {'@': 22}, {'@': 15}, {'@': 19}, {'@': 12}, {'@': 14}], 'parser': {'parser': {'tokens': {0: 'COMMA', 1: 'RBRACE', 2: u'pair', 3: u'ESCAPED_STRING', 4: u'string', 5: 'COLON', 6: 'RSQB', 7: '$END', 8: 'LBRACE', 9: u'FALSE', 10: u'object', 11: u'SIGNED_NUMBER', 12: u'value', 13: 'LSQB', 14: u'NULL', 15: u'TRUE', 16: u'array', 17: '__anon_star_1', 18: '__anon_star_0', 19: 'start'}, 'states': {0: {0: (0, 1), 1: (0, 32)}, 1: {2: (0, 5), 3: (0, 21), 4: (0, 3)}, 2: {0: (1, {'@': 12}), 1: (1, {'@': 12})}, 3: {5: (0, 13)}, 4: {0: (1, {'@': 13}), 1: (1, {'@': 13}), 6: (1, {'@': 13}), 7: (1, {'@': 13})}, 5: {0: (1, {'@': 14}), 1: (1, {'@': 14})}, 6: {0: (1, {'@': 15}), 6: (1, {'@': 15})}, 7: {0: (1, {'@': 16}), 1: (1, {'@': 16}), 6: (1, {'@': 16}), 7: (1, {'@': 16})}, 8: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 12), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 9: {0: (1, {'@': 17}), 1: (1, {'@': 17}), 6: (1, {'@': 17}), 7: (1, {'@': 17})}, 10: {0: (0, 22), 17: (0, 0), 1: (0, 26)}, 11: {0: (1, {'@': 18}), 1: (1, {'@': 18}), 6: (1, {'@': 18}), 7: (1, {'@': 18})}, 12: {0: (1, {'@': 19}), 6: (1, {'@': 19})}, 13: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 15), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 14: {3: (0, 21), 4: (0, 4), 6: (0, 30), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 23), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 15: {0: (1, {'@': 20}), 1: (1, {'@': 20})}, 16: {0: (1, {'@': 21}), 1: (1, {'@': 21}), 6: (1, {'@': 21}), 7: (1, {'@': 21})}, 17: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 6), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27)}, 18: {}, 19: {7: (0, 18)}, 20: {0: (0, 8), 6: (0, 16)}, 21: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 5: (1, {'@': 22}), 6: (1, {'@': 22}), 7: (1, {'@': 22})}, 22: {2: (0, 2), 3: (0, 21), 4: (0, 3)}, 23: {0: (0, 17), 18: (0, 20), 6: (0, 9)}, 24: {0: (1, {'@': 23}), 1: (1, {'@': 23}), 6: (1, {'@': 23}), 7: (1, {'@': 23})}, 25: {0: (1, {'@': 24}), 1: (1, {'@': 24}), 6: (1, {'@': 24}), 7: (1, {'@': 24})}, 26: {0: (1, {'@': 25}), 1: (1, {'@': 25}), 6: (1, {'@': 25}), 7: (1, {'@': 25})}, 27: {0: (1, {'@': 26}), 1: (1, {'@': 26}), 6: (1, {'@': 26}), 7: (1, {'@': 26})}, 28: {3: (0, 21), 4: (0, 4), 8: (0, 34), 9: (0, 7), 10: (0, 33), 11: (0, 25), 12: (0, 29), 13: (0, 14), 14: (0, 24), 15: (0, 11), 16: (0, 27), 19: (0, 19)}, 29: {7: (1, {'@': 27})}, 30: {0: (1, {'@': 28}), 1: (1, {'@': 28}), 6: (1, {'@': 28}), 7: (1, {'@': 28})}, 31: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 6: (1, {'@': 29}), 7: (1, {'@': 29})}, 32: {0: (1, {'@': 30}), 1: (1, {'@': 30}), 6: (1, {'@': 30}), 7: (1, {'@': 30})}, 33: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 6: (1, {'@': 31}), 7: (1, {'@': 31})}, 34: {1: (0, 31), 2: (0, 10), 3: (0, 21), 4: (0, 3)}}, 'end_state': 18, 'start_state': 28}, '__type__': 'LALR_TraditionalLexer', 'lexer': {'ignore_types': [u'WS'], 'terminals': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], '__type__': 'TraditionalLexer', 'newline_types': [u'WS']}}, '__type__': 'Lark', 'options': {'profile': False, 'transformer': None, 'lexer': 'standard', 'lexer_callbacks': {}, 'postlex': None, 'parser': 'lalr', 
'cache_grammar': False, 'tree_class': None, 'priority': None, 'start': 'start', 'keep_all_tokens': False, 'ambiguity': 'auto', 'debug': False, 'propagate_positions': False, 'maybe_placeholders': False}} +{'rules': [{'@': 27}, {'@': 31}, {'@': 26}, {'@': 13}, {'@': 25}, {'@': 18}, {'@': 16}, {'@': 24}, {'@': 22}, {'@': 17}, {'@': 28}, {'@': 30}, {'@': 20}, {'@': 29}, {'@': 21}, {'@': 23}, {'@': 15}, {'@': 19}, {'@': 12}, {'@': 14}], 'parser': {'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], 'ignore': [u'WS'], '__type__': 'LexerConf'}, 'parser': {'tokens': {0: 'LBRACE', 1: u'FALSE', 2: u'string', 3: u'object', 4: u'NULL', 5: u'SIGNED_NUMBER', 6: u'value', 7: 'start', 8: 'LSQB', 9: u'ESCAPED_STRING', 10: u'TRUE', 11: u'array', 12: 'COMMA', 13: 'RBRACE', 14: u'pair', 15: 'COLON', 16: 'RSQB', 17: '$END', 18: '__anon_star_1', 19: '__anon_star_0'}, 'states': {0: {0: (0, 33), 1: (0, 8), 2: (0, 5), 3: (0, 32), 4: (0, 23), 5: (0, 24), 6: (0, 28), 7: (0, 11), 8: (0, 25), 9: (0, 20), 10: (0, 13), 11: (0, 26)}, 1: {12: (0, 2), 13: (0, 31)}, 2: {9: (0, 20), 2: (0, 4), 14: (0, 6)}, 3: {12: (1, {'@': 12}), 13: (1, {'@': 12})}, 4: {15: (0, 15)}, 5: {16: (1, {'@': 13}), 17: (1, {'@': 13}), 12: (1, {'@': 13}), 13: (1, {'@': 13})}, 6: {12: (1, {'@': 14}), 13: (1, {'@': 14})}, 7: {16: (1, {'@': 15}), 12: (1, {'@': 15})}, 8: {16: (1, {'@': 16}), 17: (1, {'@': 16}), 12: (1, {'@': 16}), 13: (1, {'@': 16})}, 9: {0: (0, 33), 1: (0, 8), 2: (0, 5), 3: (0, 32), 4: (0, 23), 5: (0, 24), 6: (0, 14), 8: (0, 25), 9: (0, 20), 10: (0, 13), 11: (0, 26)}, 10: {16: (1, {'@': 17}), 17: (1, {'@': 17}), 12: (1, {'@': 17}), 13: (1, {'@': 17})}, 11: {}, 12: {18: (0, 1), 12: (0, 21), 13: (0, 16)}, 13: {16: (1, {'@': 18}), 17: (1, {'@': 18}), 12: (1, {'@': 18}), 13: (1, {'@': 18})}, 14: {16: (1, {'@': 19}), 12: (1, {'@': 19})}, 15: {0: (0, 33), 1: (0, 8), 2: (0, 5), 3: (0, 32), 4: (0, 23), 5: (0, 24), 6: (0, 17), 8: (0, 25), 9: (0, 20), 10: (0, 13), 11: (0, 26)}, 16: {16: (1, {'@': 20}), 17: (1, {'@': 20}), 12: (1, {'@': 20}), 13: (1, {'@': 20})}, 17: {12: (1, {'@': 21}), 13: (1, {'@': 21})}, 18: {16: (1, {'@': 22}), 17: (1, {'@': 22}), 12: (1, {'@': 22}), 13: (1, {'@': 22})}, 19: {16: (0, 18), 12: (0, 9)}, 20: {16: (1, {'@': 23}), 17: (1, {'@': 23}), 12: (1, {'@': 23}), 13: (1, {'@': 23}), 15: (1, {'@': 23})}, 21: {9: (0, 20), 2: (0, 4), 14: (0, 3)}, 22: {16: (0, 10), 19: (0, 19), 12: (0, 27)}, 23: {16: (1, {'@': 24}), 17: (1, {'@': 24}), 12: (1, {'@': 24}), 13: (1, {'@': 24})}, 24: {16: (1, {'@': 25}), 17: (1, {'@': 25}), 12: (1, {'@': 25}), 13: (1, {'@': 25})}, 25: {0: (0, 33), 1: (0, 8), 2: (0, 5), 3: (0, 32), 4: (0, 23), 5: (0, 24), 6: (0, 22), 8: (0, 25), 9: (0, 20), 10: (0, 13), 11: (0, 26), 16: (0, 29)}, 26: {16: (1, {'@': 26}), 17: (1, {'@': 26}), 12: (1, {'@': 26}), 13: (1, {'@': 26})}, 27: {0: (0, 33), 1: (0, 8), 2: (0, 5), 3: (0, 32), 4: (0, 23), 5: (0, 24), 6: (0, 7), 8: (0, 25), 9: (0, 20), 10: (0, 13), 11: (0, 26)}, 28: {17: (1, {'@': 27})}, 29: {16: (1, {'@': 28}), 17: (1, {'@': 28}), 12: (1, {'@': 28}), 13: (1, {'@': 28})}, 30: {16: (1, {'@': 29}), 17: (1, {'@': 29}), 12: (1, {'@': 29}), 13: (1, {'@': 29})}, 31: {16: (1, {'@': 30}), 17: (1, {'@': 30}), 12: (1, {'@': 30}), 13: (1, {'@': 30})}, 32: {16: (1, {'@': 31}), 17: (1, {'@': 31}), 12: (1, {'@': 31}), 13: (1, {'@': 31})}, 33: {9: (0, 20), 2: (0, 4), 13: (0, 30), 14: (0, 12)}}, 'end_states': {'start': 11}, 'start_states': {'start': 0}}, '__type__': 
'LALR_ContextualLexer', 'start': ['start']}, '__type__': 'Lark', 'options': {'transformer': None, 'lexer': 'contextual', 'lexer_callbacks': {}, 'debug': False, 'postlex': None, 'parser': 'lalr', 'cache_grammar': False, 'tree_class': None, 'priority': None, 'start': ['start'], 'keep_all_tokens': False, 'ambiguity': 'auto', 'edit_terminals': None, 'propagate_positions': True, 'maybe_placeholders': True}} ) MEMO = ( -{0: {'priority': 1, 'pattern': {'__type__': 'PatternRE', 'flags': [], 'value': u'(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+)'}, '__type__': 'TerminalDef', 'name': u'SIGNED_NUMBER'}, 1: {'priority': 1, 'pattern': {'__type__': 'PatternRE', 'flags': [], 'value': u'\\".*?(? 1: if dups[0].expansion: - raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)" % ''.join('\n * %s' % i for i in dups)) + raise GrammarError("Rules defined twice: %s\n\n(Might happen due to colliding expansion of optionals: [] or ?)" + % ''.join('\n * %s' % i for i in dups)) # Empty rule; assert all other attributes are equal assert len({(r.alias, r.order, r.options) for r in dups}) == len(dups) diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 4af2c24..7822485 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -202,7 +202,7 @@ class LALR_Analyzer(GrammarAnalyzer): continue s2 = rp2.next # if s2 is a terminal - if not s2 in self.lr0_rules_by_origin: + if s2 not in self.lr0_rules_by_origin: dr.add(s2) if s2 in self.NULLABLE: r.add((next_state, s2)) diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index 07016ff..9934567 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -34,6 +34,9 @@ # See . 
# # + +import os +from io import open ###} import pprint diff --git a/lark/utils.py b/lark/utils.py index 9513b8b..b1354cf 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -1,4 +1,5 @@ import sys +from ast import literal_eval from collections import deque class fzset(frozenset): @@ -239,3 +240,28 @@ class Enumerator(Serialize): assert len(r) == len(self.enums) return r + +def eval_escaping(s): + w = '' + i = iter(s) + for n in i: + w += n + if n == '\\': + try: + n2 = next(i) + except StopIteration: + raise ValueError("Literal ended unexpectedly (bad escaping): `%r`" % s) + if n2 == '\\': + w += '\\\\' + elif n2 not in 'uxnftr': + w += '\\' + w += n2 + w = w.replace('\\"', '"').replace("'", "\\'") + + to_eval = "u'''%s'''" % w + try: + s = literal_eval(to_eval) + except SyntaxError as e: + raise ValueError(s, e) + + return s From 5682dcc57abef5996dc9053bffc108552640055a Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 12 Jan 2020 16:03:16 +0200 Subject: [PATCH 113/132] Added python_bytecode example + Tiny bugfix --- examples/README.md | 1 + examples/python3.lark | 8 ++-- examples/python_bytecode.py | 77 +++++++++++++++++++++++++++++++++++++ lark/load_grammar.py | 2 +- 4 files changed, 83 insertions(+), 5 deletions(-) create mode 100644 examples/python_bytecode.py diff --git a/examples/README.md b/examples/README.md index f40157d..8053ebd 100644 --- a/examples/README.md +++ b/examples/README.md @@ -27,6 +27,7 @@ For example, the following will parse all the Python files in the standard libra - [error\_reporting\_lalr.py](error_reporting_lalr.py) - A demonstration of example-driven error reporting with the LALR parser - [python\_parser.py](python_parser.py) - A fully-working Python 2 & 3 parser (but not production ready yet!) +- [python\_bytecode.py](python_bytecode.py) - A toy example showing how to compile Python directly to bytecode - [conf\_lalr.py](conf_lalr.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language - [conf\_earley.py](conf_earley.py) - Demonstrates the power of Earley's dynamic lexer on a toy configuration language - [custom\_lexer.py](custom_lexer.py) - Demonstrates using a custom lexer to parse a non-textual stream of data diff --git a/examples/python3.lark b/examples/python3.lark index 3f39f9f..78c9875 100644 --- a/examples/python3.lark +++ b/examples/python3.lark @@ -81,7 +81,7 @@ with_item: test ["as" expr] except_clause: "except" [test ["as" NAME]] suite: simple_stmt | _NEWLINE _INDENT stmt+ _DEDENT -?test: or_test ["if" or_test "else" test] | lambdef +?test: or_test ("if" or_test "else" test)? | lambdef ?test_nocond: or_test | lambdef_nocond lambdef: "lambda" [varargslist] ":" test lambdef_nocond: "lambda" [varargslist] ":" test_nocond @@ -107,7 +107,7 @@ star_expr: "*" expr // sake of a __future__ import described in PEP 401 (which really works :-) !_comp_op: "<"|">"|"=="|">="|"<="|"<>"|"!="|"in"|"not" "in"|"is"|"is" "not" -?power: await_expr ["**" factor] +?power: await_expr ("**" factor)? ?await_expr: AWAIT? atom_expr AWAIT: "await" @@ -137,7 +137,7 @@ dictorsetmaker: ( ((test ":" test | "**" expr) (comp_for | ("," (test ":" test | classdef: "class" NAME ["(" [arguments] ")"] ":" suite -arguments: argvalue ("," argvalue)* ["," [ starargs | kwargs]] +arguments: argvalue ("," argvalue)* ("," [ starargs | kwargs])? 
| starargs | kwargs | test comp_for @@ -145,7 +145,7 @@ arguments: argvalue ("," argvalue)* ["," [ starargs | kwargs]] starargs: "*" test ("," "*" test)* ("," argvalue)* ["," kwargs] kwargs: "**" test -?argvalue: test ["=" test] +?argvalue: test ("=" test)? diff --git a/examples/python_bytecode.py b/examples/python_bytecode.py new file mode 100644 index 0000000..cbb8ccd --- /dev/null +++ b/examples/python_bytecode.py @@ -0,0 +1,77 @@ +# +# This is a toy example that compiles Python directly to bytecode, without generating an AST. +# It currently only works for very very simple Python code. +# +# It requires the 'bytecode' library. You can get it using +# +# $ pip install bytecode +# + +from lark import Lark, Transformer, v_args +from lark.indenter import Indenter + +from bytecode import Instr, Bytecode + +class PythonIndenter(Indenter): + NL_type = '_NEWLINE' + OPEN_PAREN_types = ['LPAR', 'LSQB', 'LBRACE'] + CLOSE_PAREN_types = ['RPAR', 'RSQB', 'RBRACE'] + INDENT_type = '_INDENT' + DEDENT_type = '_DEDENT' + tab_len = 8 + + +@v_args(inline=True) +class Compile(Transformer): + def number(self, n): + return [Instr('LOAD_CONST', int(n))] + def string(self, s): + return [Instr('LOAD_CONST', s[1:-1])] + def var(self, n): + return [Instr('LOAD_NAME', n)] + + def arith_expr(self, a, op, b): + # TODO support chain arithmetic + assert op == '+' + return a + b + [Instr('BINARY_ADD')] + + def arguments(self, args): + return args + + def funccall(self, name, args): + return name + args + [Instr('CALL_FUNCTION', 1)] + + @v_args(inline=False) + def file_input(self, stmts): + return sum(stmts, []) + [Instr("RETURN_VALUE")] + + def expr_stmt(self, lval, rval): + # TODO more complicated than that + name ,= lval + assert name.name == 'LOAD_NAME' # XXX avoid with another layer of abstraction + return rval + [Instr("STORE_NAME", name.arg)] + + def __default__(self, *args): + assert False, args + + +python_parser3 = Lark.open('python3.lark', rel_to=__file__, start='file_input', + parser='lalr', postlex=PythonIndenter(), + transformer=Compile(), propagate_positions=False) + +def compile_python(s): + insts = python_parser3.parse(s+"\n") + return Bytecode(insts).to_code() + +code = compile_python(""" +a = 3 +b = 5 +print("Hello World!") +print(a+(b+2)) +print((a+b)+2) +""") +exec(code) +# -- Output -- +# Hello World! +# 10 +# 10 diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 83ec341..77095a8 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -501,7 +501,7 @@ class Grammar: empty_indices = [x==_EMPTY for x in expansion] if any(empty_indices): - exp_options = copy(options) + exp_options = copy(options) or RuleOptions() exp_options.empty_indices = empty_indices expansion = [x for x in expansion if x!=_EMPTY] else: From ae691bf35e9e13cdb4f718ca58c41b7e72b51953 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 16 Jan 2020 12:15:37 +0200 Subject: [PATCH 114/132] Revert propagate_positions to be False by default, still not ready for prime-time --- README.md | 6 +++--- docs/classes.md | 4 ++-- lark/lark.py | 2 +- tests/test_parser.py | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 84e4921..6f5ed74 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,7 @@ Lark provides syntax highlighting for its grammar files (\*.lark): ### Clones -- [Lerche (Julia)](https://github.com/jamesrhester/Lerche.jl) - an unofficial clone, written entirely in Julia. 
+- [Lerche (Julia)](https://github.com/jamesrhester/Lerche.jl) - an unofficial clone, written entirely in Julia. ### Hello World @@ -141,10 +141,10 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail - [mappyfile](https://github.com/geographika/mappyfile) - a MapFile parser for working with MapServer configuration - [synapse](https://github.com/vertexproject/synapse) - an intelligence analysis platform - [Command-Block-Assembly](https://github.com/simon816/Command-Block-Assembly) - An assembly language, and C compiler, for Minecraft commands - - [SPFlow](https://github.com/SPFlow/SPFlow) - Library for Sum-Product Networks + - [SPFlow](https://github.com/SPFlow/SPFlow) - Library for Sum-Product Networks - [Torchani](https://github.com/aiqm/torchani) - Accurate Neural Network Potential on PyTorch - [required](https://github.com/shezadkhan137/required) - multi-field validation using docstrings - - [miniwdl](https://github.com/chanzuckerberg/miniwdl) - A static analysis toolkit for the Workflow Description Language + - [miniwdl](https://github.com/chanzuckerberg/miniwdl) - A static analysis toolkit for the Workflow Description Language - [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer diff --git a/docs/classes.md b/docs/classes.md index 021b2f4..284ce73 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -32,7 +32,7 @@ The Lark class accepts a grammar string or file object, and keyword options: * **keep_all_tokens** - Don't throw away any terminals from the tree (Default=`False`) -* **propagate_positions** - Propagate line/column count to tree nodes, at the cost of performance (default=`True`) +* **propagate_positions** - Propagate line/column count to tree nodes, at the cost of performance (default=`False`) * **maybe_placeholders** - The `[]` operator returns `None` when not matched. Setting this to `False` makes it behave like the `?` operator, and return no value at all, which may be a little faster (default=`True`) @@ -52,7 +52,7 @@ The main tree class * `data` - The name of the rule or alias * `children` - List of matched sub-rules and terminals -* `meta` - Line & Column numbers (unless `propagate_positions` is disabled) +* `meta` - Line & Column numbers (if `propagate_positions` is enabled) * meta attributes: `line`, `column`, `start_pos`, `end_line`, `end_column`, `end_pos` #### \_\_init\_\_(self, data, children) diff --git a/lark/lark.py b/lark/lark.py index 33b57e3..3e69b7f 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -62,7 +62,7 @@ class LarkOptions(Serialize): 'start': 'start', 'priority': 'auto', 'ambiguity': 'auto', - 'propagate_positions': True, + 'propagate_positions': False, 'lexer_callbacks': {}, 'maybe_placeholders': True, 'edit_terminals': None, diff --git a/tests/test_parser.py b/tests/test_parser.py index 3f73990..72be997 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -151,7 +151,7 @@ class TestParsers(unittest.TestCase): g = Lark(r"""start: a+ a : "x" _NL? 
_NL: /\n/+ - """, parser='lalr', transformer=T() if internal else None) + """, parser='lalr', transformer=T() if internal else None, propagate_positions=True) except NotImplementedError: assert internal continue From 4db56dc8b0ec408c3719f1e45138136730d6537c Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 16 Jan 2020 12:21:40 +0200 Subject: [PATCH 115/132] Added shebang to example script (Issue #504) --- examples/standalone/create_standalone.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/examples/standalone/create_standalone.sh b/examples/standalone/create_standalone.sh index 141ab89..d8da6b0 100755 --- a/examples/standalone/create_standalone.sh +++ b/examples/standalone/create_standalone.sh @@ -1 +1,2 @@ +#!/bin/sh PYTHONPATH=../.. python -m lark.tools.standalone json.lark > json_parser.py From fcdba441b47b0fc86f8ce262b97645720f15f1ed Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 16 Jan 2020 12:39:08 +0200 Subject: [PATCH 116/132] Better error message for reduce/reduce conflict (Issue #135) --- lark/load_grammar.py | 1 + lark/parsers/lalr_analysis.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 77095a8..356d03d 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -478,6 +478,7 @@ class Grammar: rules = [] for name, rule_tree, options in rule_defs: ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options.keep_all_tokens else None + ebnf_to_bnf.prefix = name tree = transformer.transform(rule_tree) res = ebnf_to_bnf.transform(tree) rules.append((name, res, options)) diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 7822485..05c1ce8 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -253,10 +253,10 @@ class LALR_Analyzer(GrammarAnalyzer): actions[la] = (Shift, next_state.closure) for la, rules in state.lookaheads.items(): if len(rules) > 1: - raise GrammarError('Collision in %s: %s' % (la, ', '.join([ str(r) for r in rules ]))) + raise GrammarError('Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t\t- ' + str(r) for r in rules ]))) if la in actions: if self.debug: - logging.warning('Shift/reduce conflict for terminal %s: (resolving as shift)', la.name) + logging.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name) logging.warning(' * %s', list(rules)[0]) else: actions[la] = (Reduce, list(rules)[0]) From 2f92c7b4a79298eb0e11fd87b2550ac1c7f3e73e Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 16 Jan 2020 14:50:10 +0200 Subject: [PATCH 117/132] Small addition to docs about terminal operators --- docs/grammar.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/docs/grammar.md b/docs/grammar.md index cc518e9..c36756b 100644 --- a/docs/grammar.md +++ b/docs/grammar.md @@ -54,6 +54,10 @@ Literals can be one of: * `/re with flags/imulx` * Literal range: `"a".."z"`, `"1".."9"`, etc. +Terminals also support grammar operators, such as `|`, `+`, `*` and `?`. + +Terminals are a linear construct, and therefor may not contain themselves (recursion isn't allowed). + ### Priority Terminals can be assigned priority only when using a lexer (future versions may support Earley's dynamic lexing). @@ -74,7 +78,7 @@ When using a lexer (standard or contextual), it is the grammar-author's responsi IF: "if" INTEGER : /[0-9]+/ INTEGER2 : ("0".."9")+ //# Same as INTEGER -DECIMAL.2: INTEGER "." INTEGER //# Will be matched before INTEGER +DECIMAL.2: INTEGER? "." 
INTEGER //# Will be matched before INTEGER WHITESPACE: (" " | /\t/ )+ SQL_SELECT: "select"i ``` From 182385d7b7bedfb4b530c225cda5841b67ec2b3d Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sun, 19 Jan 2020 17:14:24 +0200 Subject: [PATCH 118/132] Removed bad syntax: *? and +? no longer accepted by the grammar parser (Issue #511) --- examples/lark.lark | 2 +- lark/load_grammar.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/lark.lark b/examples/lark.lark index 915cf2e..f1f42f6 100644 --- a/examples/lark.lark +++ b/examples/lark.lark @@ -33,7 +33,7 @@ name: RULE | TOKEN _VBAR: _NL? "|" -OP: /[+*][?]?|[?](?![a-z])/ +OP: /[+*]|[?](?![a-z])/ RULE: /!?[_?]?[a-z][_a-z0-9]*/ TOKEN: /_?[A-Z][_A-Z0-9]*/ STRING: _STRING "i"? diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 356d03d..d57301b 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -73,7 +73,7 @@ TERMINALS = { '_RPAR': r'\)', '_LBRA': r'\[', '_RBRA': r'\]', - 'OP': '[+*][?]?|[?](?![a-z])', + 'OP': '[+*]|[?](?![a-z])', '_COLON': ':', '_COMMA': ',', '_OR': r'\|', From ec67938933d087b048da196981e1f0c19080bae9 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Sun, 19 Jan 2020 21:01:53 +0100 Subject: [PATCH 119/132] Small correction in lark.lark Added multi-name-imports to the lark.lark grammar. --- examples/lark.lark | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/lark.lark b/examples/lark.lark index f1f42f6..a561361 100644 --- a/examples/lark.lark +++ b/examples/lark.lark @@ -13,7 +13,8 @@ statement: "%ignore" expansions _NL -> ignore | "%import" import_args ["->" name] _NL -> import | "%declare" name+ -> declare -import_args: "."? name ("." name)* +import_args: import_path ["(" name ("," name)* ")"] +import_path: "."? name ("." name)* ?expansions: alias (_VBAR alias)* From 9552f001b2749e833db9e00c225323937245d632 Mon Sep 17 00:00:00 2001 From: MegaIng Date: Sun, 19 Jan 2020 21:11:09 +0100 Subject: [PATCH 120/132] Update lark.lark --- examples/lark.lark | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/examples/lark.lark b/examples/lark.lark index a561361..c99d528 100644 --- a/examples/lark.lark +++ b/examples/lark.lark @@ -10,11 +10,12 @@ token: TOKEN priority? ":" expansions _NL priority: "." NUMBER statement: "%ignore" expansions _NL -> ignore - | "%import" import_args ["->" name] _NL -> import + | "%import" import_path ["->" name] _NL -> import + | "%import" import_path name_list _NL -> multi_import | "%declare" name+ -> declare -import_args: import_path ["(" name ("," name)* ")"] -import_path: "."? name ("." name)* +!import_path: "."? name ("." 
name)* +name_list: "(" name ("," name)* ")" ?expansions: alias (_VBAR alias)* From a7c9025858e57bde560e64a1a01c7166cf99259b Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 20 Jan 2020 12:20:38 +0200 Subject: [PATCH 121/132] Added 2 projects to 'Projects using Lark' --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 6f5ed74..920d047 100644 --- a/README.md +++ b/README.md @@ -141,6 +141,8 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail - [mappyfile](https://github.com/geographika/mappyfile) - a MapFile parser for working with MapServer configuration - [synapse](https://github.com/vertexproject/synapse) - an intelligence analysis platform - [Command-Block-Assembly](https://github.com/simon816/Command-Block-Assembly) - An assembly language, and C compiler, for Minecraft commands + - [Hyperledger Fabric Python SDK](https://github.com/hyperledger/fabric-sdk-py) - Fabric-SDK-Py is an implementation of the Hyperledger fabric SDK with Python 3.x + - [Datacube-core](https://github.com/opendatacube/datacube-core) - Open Data Cube analyses continental scale Earth Observation data through time - [SPFlow](https://github.com/SPFlow/SPFlow) - Library for Sum-Product Networks - [Torchani](https://github.com/aiqm/torchani) - Accurate Neural Network Potential on PyTorch - [required](https://github.com/shezadkhan137/required) - multi-field validation using docstrings From f1b07e0571a6fdf38b9703f2ce7cc9c49f717644 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 20 Jan 2020 12:22:56 +0200 Subject: [PATCH 122/132] Small correction to 'Projects using Lark' --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 920d047..e6a52b9 100644 --- a/README.md +++ b/README.md @@ -140,11 +140,11 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail - [Hypothesis](https://github.com/HypothesisWorks/hypothesis) - Library for property-based testing - [mappyfile](https://github.com/geographika/mappyfile) - a MapFile parser for working with MapServer configuration - [synapse](https://github.com/vertexproject/synapse) - an intelligence analysis platform - - [Command-Block-Assembly](https://github.com/simon816/Command-Block-Assembly) - An assembly language, and C compiler, for Minecraft commands - - [Hyperledger Fabric Python SDK](https://github.com/hyperledger/fabric-sdk-py) - Fabric-SDK-Py is an implementation of the Hyperledger fabric SDK with Python 3.x - [Datacube-core](https://github.com/opendatacube/datacube-core) - Open Data Cube analyses continental scale Earth Observation data through time - [SPFlow](https://github.com/SPFlow/SPFlow) - Library for Sum-Product Networks - [Torchani](https://github.com/aiqm/torchani) - Accurate Neural Network Potential on PyTorch + - [Command-Block-Assembly](https://github.com/simon816/Command-Block-Assembly) - An assembly language, and C compiler, for Minecraft commands + - [Fabric-SDK-Py](https://github.com/hyperledger/fabric-sdk-py) - Hyperledger fabric SDK with Python 3.x - [required](https://github.com/shezadkhan137/required) - multi-field validation using docstrings - [miniwdl](https://github.com/chanzuckerberg/miniwdl) - A static analysis toolkit for the Workflow Description Language - [pytreeview](https://gitlab.com/parmenti/pytreeview) - a lightweight tree-based grammar explorer From 52e510780a46b3240524cafdb2a5f0057580cebd Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 22 Jan 2020 12:58:03 +0200 Subject: [PATCH 
123/132] Small stuff --- lark/reconstruct.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/lark/reconstruct.py b/lark/reconstruct.py index bd7b6a0..1e3adc7 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -28,8 +28,6 @@ class WriteTokensTransformer(Transformer_InPlace): self.term_subs = term_subs def __default__(self, data, children, meta): - # if not isinstance(t, MatchTree): - # return t if not getattr(meta, 'match_tree', False): return Tree(data, children) @@ -97,11 +95,10 @@ class Reconstructor: self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}, term_subs) self.rules = list(self._build_recons_rules(rules)) self.rules.reverse() - # print(len(self.rules)) + + # Choose the best rule from each group of {rule => [rule.alias]}, since we only really need one derivation. self.rules = best_from_group(self.rules, lambda r: r, lambda r: -len(r.expansion)) - # print(len(self.rules)) - # self.rules = list(set(list(self._build_recons_rules(rules)))) self.rules.sort(key=lambda r: len(r.expansion)) callbacks = {rule: rule.alias for rule in self.rules} # TODO pass callbacks through dict, instead of alias? self.parser = earley.Parser(ParserConf(self.rules, callbacks, parser.options.start), From 94dd3646d4147d6a91f20f517565dabe3378e930 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 22 Jan 2020 15:07:01 +0200 Subject: [PATCH 124/132] Extend comments in rules tests --- tests/test_parser.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tests/test_parser.py b/tests/test_parser.py index 4b3263d..7edfd3a 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -106,9 +106,12 @@ class TestParsers(unittest.TestCase): def test_comment_in_rule_definition(self): g = Lark("""start: a a: "a" - // A comment - // Another + // A comment + // Another comment | "b" + // Still more + + c: "unrelated" """) r = g.parse('b') self.assertEqual( r.children[0].data, "a" ) From 3688b0053b143c9e0717ecd3739b476e8c6ad0fc Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 22 Jan 2020 15:13:12 +0200 Subject: [PATCH 125/132] Disallow '. .' for '..' 
syntax (Issue #513) --- lark/load_grammar.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index ba26fe2..051f8cd 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -77,7 +77,8 @@ TERMINALS = { '_COLON': ':', '_COMMA': ',', '_OR': r'\|', - '_DOT': r'\.', + '_DOT': r'\.(?!\.)', + '_DOTDOT': r'\.\.', 'TILDE': '~', 'RULE': '!?[_?]?[a-z][_a-z0-9]*', 'TERMINAL': '_?[A-Z][_A-Z0-9]*', @@ -112,7 +113,7 @@ RULES = { '?expr': ['atom', 'atom OP', 'atom TILDE NUMBER', - 'atom TILDE NUMBER _DOT _DOT NUMBER', + 'atom TILDE NUMBER _DOTDOT NUMBER', ], '?atom': ['_LPAR expansions _RPAR', @@ -130,7 +131,7 @@ RULES = { '?name': ['RULE', 'TERMINAL'], 'maybe': ['_LBRA expansions _RBRA'], - 'range': ['STRING _DOT _DOT STRING'], + 'range': ['STRING _DOTDOT STRING'], 'term': ['TERMINAL _COLON expansions _NL', 'TERMINAL _DOT NUMBER _COLON expansions _NL'], From 73427f785c767dbb06fb538f512602324a7d1cea Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 22 Jan 2020 15:27:05 +0200 Subject: [PATCH 126/132] Version bump (release 0.8) --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index 0906eb7..c2cf65e 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une from .lexer import Token from .lark import Lark -__version__ = "0.8.0rc1" +__version__ = "0.8.0" From de1f619fcd6dd27c5284b718ad981c7bfcd39608 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Wed, 22 Jan 2020 22:43:01 +0200 Subject: [PATCH 127/132] Bugfix in stand-alone parser (Issue #514) --- examples/standalone/json_parser.py | 10 +++++----- lark/parse_tree_builder.py | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py index f270ade..04fd6ac 100644 --- a/examples/standalone/json_parser.py +++ b/examples/standalone/json_parser.py @@ -1,4 +1,4 @@ -# The file was automatically generated by Lark v0.8.0rc1 +# The file was automatically generated by Lark v0.8.0 # # # Lark Stand-alone Generator Tool @@ -1432,7 +1432,7 @@ def inplace_transformer(func): return f def apply_visit_wrapper(func, name, wrapper): - if wrapper is visitors._vargs_meta or wrapper is visitors._vargs_meta_inline: + if wrapper is _vargs_meta or wrapper is _vargs_meta_inline: raise NotImplementedError("Meta args not supported for internal transformer") @wraps(func) def f(children): @@ -1815,7 +1815,7 @@ class LarkOptions(Serialize): 'start': 'start', 'priority': 'auto', 'ambiguity': 'auto', - 'propagate_positions': True, + 'propagate_positions': False, 'lexer_callbacks': {}, 'maybe_placeholders': True, 'edit_terminals': None, @@ -2047,10 +2047,10 @@ class Lark(Serialize): DATA = ( -{'rules': [{'@': 27}, {'@': 31}, {'@': 26}, {'@': 13}, {'@': 25}, {'@': 18}, {'@': 16}, {'@': 24}, {'@': 22}, {'@': 17}, {'@': 28}, {'@': 30}, {'@': 20}, {'@': 29}, {'@': 21}, {'@': 23}, {'@': 15}, {'@': 19}, {'@': 12}, {'@': 14}], 'parser': {'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], 'ignore': [u'WS'], '__type__': 'LexerConf'}, 'parser': {'tokens': {0: 'LBRACE', 1: u'FALSE', 2: u'string', 3: u'object', 4: u'NULL', 5: u'SIGNED_NUMBER', 6: u'value', 7: 'start', 8: 'LSQB', 9: u'ESCAPED_STRING', 10: u'TRUE', 11: u'array', 12: 'COMMA', 13: 'RBRACE', 14: u'pair', 15: 'COLON', 16: 'RSQB', 17: 
'$END', 18: '__anon_star_1', 19: '__anon_star_0'}, 'states': {0: {0: (0, 33), 1: (0, 8), 2: (0, 5), 3: (0, 32), 4: (0, 23), 5: (0, 24), 6: (0, 28), 7: (0, 11), 8: (0, 25), 9: (0, 20), 10: (0, 13), 11: (0, 26)}, 1: {12: (0, 2), 13: (0, 31)}, 2: {9: (0, 20), 2: (0, 4), 14: (0, 6)}, 3: {12: (1, {'@': 12}), 13: (1, {'@': 12})}, 4: {15: (0, 15)}, 5: {16: (1, {'@': 13}), 17: (1, {'@': 13}), 12: (1, {'@': 13}), 13: (1, {'@': 13})}, 6: {12: (1, {'@': 14}), 13: (1, {'@': 14})}, 7: {16: (1, {'@': 15}), 12: (1, {'@': 15})}, 8: {16: (1, {'@': 16}), 17: (1, {'@': 16}), 12: (1, {'@': 16}), 13: (1, {'@': 16})}, 9: {0: (0, 33), 1: (0, 8), 2: (0, 5), 3: (0, 32), 4: (0, 23), 5: (0, 24), 6: (0, 14), 8: (0, 25), 9: (0, 20), 10: (0, 13), 11: (0, 26)}, 10: {16: (1, {'@': 17}), 17: (1, {'@': 17}), 12: (1, {'@': 17}), 13: (1, {'@': 17})}, 11: {}, 12: {18: (0, 1), 12: (0, 21), 13: (0, 16)}, 13: {16: (1, {'@': 18}), 17: (1, {'@': 18}), 12: (1, {'@': 18}), 13: (1, {'@': 18})}, 14: {16: (1, {'@': 19}), 12: (1, {'@': 19})}, 15: {0: (0, 33), 1: (0, 8), 2: (0, 5), 3: (0, 32), 4: (0, 23), 5: (0, 24), 6: (0, 17), 8: (0, 25), 9: (0, 20), 10: (0, 13), 11: (0, 26)}, 16: {16: (1, {'@': 20}), 17: (1, {'@': 20}), 12: (1, {'@': 20}), 13: (1, {'@': 20})}, 17: {12: (1, {'@': 21}), 13: (1, {'@': 21})}, 18: {16: (1, {'@': 22}), 17: (1, {'@': 22}), 12: (1, {'@': 22}), 13: (1, {'@': 22})}, 19: {16: (0, 18), 12: (0, 9)}, 20: {16: (1, {'@': 23}), 17: (1, {'@': 23}), 12: (1, {'@': 23}), 13: (1, {'@': 23}), 15: (1, {'@': 23})}, 21: {9: (0, 20), 2: (0, 4), 14: (0, 3)}, 22: {16: (0, 10), 19: (0, 19), 12: (0, 27)}, 23: {16: (1, {'@': 24}), 17: (1, {'@': 24}), 12: (1, {'@': 24}), 13: (1, {'@': 24})}, 24: {16: (1, {'@': 25}), 17: (1, {'@': 25}), 12: (1, {'@': 25}), 13: (1, {'@': 25})}, 25: {0: (0, 33), 1: (0, 8), 2: (0, 5), 3: (0, 32), 4: (0, 23), 5: (0, 24), 6: (0, 22), 8: (0, 25), 9: (0, 20), 10: (0, 13), 11: (0, 26), 16: (0, 29)}, 26: {16: (1, {'@': 26}), 17: (1, {'@': 26}), 12: (1, {'@': 26}), 13: (1, {'@': 26})}, 27: {0: (0, 33), 1: (0, 8), 2: (0, 5), 3: (0, 32), 4: (0, 23), 5: (0, 24), 6: (0, 7), 8: (0, 25), 9: (0, 20), 10: (0, 13), 11: (0, 26)}, 28: {17: (1, {'@': 27})}, 29: {16: (1, {'@': 28}), 17: (1, {'@': 28}), 12: (1, {'@': 28}), 13: (1, {'@': 28})}, 30: {16: (1, {'@': 29}), 17: (1, {'@': 29}), 12: (1, {'@': 29}), 13: (1, {'@': 29})}, 31: {16: (1, {'@': 30}), 17: (1, {'@': 30}), 12: (1, {'@': 30}), 13: (1, {'@': 30})}, 32: {16: (1, {'@': 31}), 17: (1, {'@': 31}), 12: (1, {'@': 31}), 13: (1, {'@': 31})}, 33: {9: (0, 20), 2: (0, 4), 13: (0, 30), 14: (0, 12)}}, 'end_states': {'start': 11}, 'start_states': {'start': 0}}, '__type__': 'LALR_ContextualLexer', 'start': ['start']}, '__type__': 'Lark', 'options': {'transformer': None, 'lexer': 'contextual', 'lexer_callbacks': {}, 'debug': False, 'postlex': None, 'parser': 'lalr', 'cache_grammar': False, 'tree_class': None, 'priority': None, 'start': ['start'], 'keep_all_tokens': False, 'ambiguity': 'auto', 'edit_terminals': None, 'propagate_positions': True, 'maybe_placeholders': True}} +{'rules': [{'@': 26}, {'@': 30}, {'@': 25}, {'@': 31}, {'@': 23}, {'@': 19}, {'@': 14}, {'@': 22}, {'@': 27}, {'@': 16}, {'@': 28}, {'@': 12}, {'@': 24}, {'@': 29}, {'@': 20}, {'@': 21}, {'@': 15}, {'@': 13}, {'@': 17}, {'@': 18}], 'parser': {'lexer_conf': {'tokens': [{'@': 0}, {'@': 1}, {'@': 2}, {'@': 3}, {'@': 4}, {'@': 5}, {'@': 6}, {'@': 7}, {'@': 8}, {'@': 9}, {'@': 10}, {'@': 11}], 'ignore': [u'WS'], '__type__': 'LexerConf'}, 'parser': {'tokens': {0: 'RSQB', 1: 'COMMA', 2: 'RBRACE', 3: '$END', 4: 
u'__array_star_0', 5: 'COLON', 6: u'pair', 7: u'ESCAPED_STRING', 8: u'string', 9: 'LBRACE', 10: u'FALSE', 11: u'object', 12: u'NULL', 13: u'SIGNED_NUMBER', 14: u'value', 15: u'array', 16: u'TRUE', 17: 'LSQB', 18: u'__object_star_1', 19: 'start'}, 'states': {0: {0: (1, {'@': 12}), 1: (1, {'@': 12}), 2: (1, {'@': 12}), 3: (1, {'@': 12})}, 1: {0: (0, 11), 1: (0, 20), 4: (0, 17)}, 2: {1: (0, 23), 2: (0, 0)}, 3: {5: (0, 12)}, 4: {8: (0, 3), 6: (0, 13), 7: (0, 21)}, 5: {8: (0, 3), 2: (0, 30), 6: (0, 19), 7: (0, 21)}, 6: {0: (0, 29), 7: (0, 21), 8: (0, 33), 9: (0, 5), 10: (0, 8), 11: (0, 31), 12: (0, 22), 13: (0, 24), 14: (0, 1), 15: (0, 26), 16: (0, 16), 17: (0, 6)}, 7: {0: (1, {'@': 13}), 1: (1, {'@': 13})}, 8: {0: (1, {'@': 14}), 1: (1, {'@': 14}), 2: (1, {'@': 14}), 3: (1, {'@': 14})}, 9: {0: (1, {'@': 15}), 1: (1, {'@': 15})}, 10: {7: (0, 21), 8: (0, 33), 9: (0, 5), 10: (0, 8), 11: (0, 31), 12: (0, 22), 13: (0, 24), 14: (0, 7), 15: (0, 26), 16: (0, 16), 17: (0, 6)}, 11: {0: (1, {'@': 16}), 1: (1, {'@': 16}), 2: (1, {'@': 16}), 3: (1, {'@': 16})}, 12: {7: (0, 21), 8: (0, 33), 9: (0, 5), 10: (0, 8), 11: (0, 31), 12: (0, 22), 13: (0, 24), 14: (0, 18), 15: (0, 26), 16: (0, 16), 17: (0, 6)}, 13: {1: (1, {'@': 17}), 2: (1, {'@': 17})}, 14: {}, 15: {1: (1, {'@': 18}), 2: (1, {'@': 18})}, 16: {0: (1, {'@': 19}), 1: (1, {'@': 19}), 2: (1, {'@': 19}), 3: (1, {'@': 19})}, 17: {0: (0, 28), 1: (0, 10)}, 18: {1: (1, {'@': 20}), 2: (1, {'@': 20})}, 19: {1: (0, 4), 18: (0, 2), 2: (0, 25)}, 20: {7: (0, 21), 8: (0, 33), 9: (0, 5), 10: (0, 8), 11: (0, 31), 12: (0, 22), 13: (0, 24), 14: (0, 9), 15: (0, 26), 16: (0, 16), 17: (0, 6)}, 21: {0: (1, {'@': 21}), 1: (1, {'@': 21}), 2: (1, {'@': 21}), 3: (1, {'@': 21}), 5: (1, {'@': 21})}, 22: {0: (1, {'@': 22}), 1: (1, {'@': 22}), 2: (1, {'@': 22}), 3: (1, {'@': 22})}, 23: {8: (0, 3), 6: (0, 15), 7: (0, 21)}, 24: {0: (1, {'@': 23}), 1: (1, {'@': 23}), 2: (1, {'@': 23}), 3: (1, {'@': 23})}, 25: {0: (1, {'@': 24}), 1: (1, {'@': 24}), 2: (1, {'@': 24}), 3: (1, {'@': 24})}, 26: {0: (1, {'@': 25}), 1: (1, {'@': 25}), 2: (1, {'@': 25}), 3: (1, {'@': 25})}, 27: {3: (1, {'@': 26})}, 28: {0: (1, {'@': 27}), 1: (1, {'@': 27}), 2: (1, {'@': 27}), 3: (1, {'@': 27})}, 29: {0: (1, {'@': 28}), 1: (1, {'@': 28}), 2: (1, {'@': 28}), 3: (1, {'@': 28})}, 30: {0: (1, {'@': 29}), 1: (1, {'@': 29}), 2: (1, {'@': 29}), 3: (1, {'@': 29})}, 31: {0: (1, {'@': 30}), 1: (1, {'@': 30}), 2: (1, {'@': 30}), 3: (1, {'@': 30})}, 32: {7: (0, 21), 8: (0, 33), 9: (0, 5), 10: (0, 8), 11: (0, 31), 12: (0, 22), 13: (0, 24), 14: (0, 27), 15: (0, 26), 16: (0, 16), 17: (0, 6), 19: (0, 14)}, 33: {0: (1, {'@': 31}), 1: (1, {'@': 31}), 2: (1, {'@': 31}), 3: (1, {'@': 31})}}, 'end_states': {'start': 14}, 'start_states': {'start': 32}}, '__type__': 'LALR_ContextualLexer', 'start': ['start']}, '__type__': 'Lark', 'options': {'transformer': None, 'lexer': 'contextual', 'lexer_callbacks': {}, 'debug': False, 'postlex': None, 'parser': 'lalr', 'cache_grammar': False, 'tree_class': None, 'priority': None, 'start': ['start'], 'keep_all_tokens': False, 'ambiguity': 'auto', 'edit_terminals': None, 'propagate_positions': False, 'maybe_placeholders': True}} ) MEMO = ( -{0: {'priority': 1, 'pattern': {'__type__': 'PatternRE', '_width': [2, 4294967295], 'flags': [], 'value': u'\\".*?(? Date: Thu, 23 Jan 2020 01:11:28 +0200 Subject: [PATCH 128/132] Revert maybe_placeholders to be False by default.. 
It should be changed in a major release, not 0.8 (Issue #515) --- lark/lark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/lark.py b/lark/lark.py index 3e69b7f..01eca80 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -64,7 +64,7 @@ class LarkOptions(Serialize): 'ambiguity': 'auto', 'propagate_positions': False, 'lexer_callbacks': {}, - 'maybe_placeholders': True, + 'maybe_placeholders': False, 'edit_terminals': None, } From 5346231e14d31ab5bbc3cbc014a31b405d40ef39 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Thu, 23 Jan 2020 01:12:12 +0200 Subject: [PATCH 129/132] Version bump --- lark/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lark/__init__.py b/lark/__init__.py index c2cf65e..1a00c9d 100644 --- a/lark/__init__.py +++ b/lark/__init__.py @@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une from .lexer import Token from .lark import Lark -__version__ = "0.8.0" +__version__ = "0.8.1" From 3995ad913afab34b8ebc6110a45c0d90d23ca777 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 28 Jan 2020 16:16:48 +0200 Subject: [PATCH 130/132] Tiny tiny cleanup --- lark/parsers/lalr_analysis.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 05c1ce8..8890c3c 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -262,23 +262,23 @@ class LALR_Analyzer(GrammarAnalyzer): actions[la] = (Reduce, list(rules)[0]) m[state] = { k.name: v for k, v in actions.items() } - self.states = { k.closure: v for k, v in m.items() } + states = { k.closure: v for k, v in m.items() } # compute end states end_states = {} - for state in self.states: + for state in states: for rp in state: for start in self.lr0_start_states: if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied: assert(not start in end_states) end_states[start] = state - self._parse_table = ParseTable(self.states, { start: state.closure for start, state in self.lr0_start_states.items() }, end_states) + _parse_table = ParseTable(states, { start: state.closure for start, state in self.lr0_start_states.items() }, end_states) if self.debug: - self.parse_table = self._parse_table + self.parse_table = _parse_table else: - self.parse_table = IntParseTable.from_ParseTable(self._parse_table) + self.parse_table = IntParseTable.from_ParseTable(_parse_table) def compute_lalr(self): self.compute_lr0_states() From 93976e360ecc1470ae0653cff92e02ec36966f92 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 1 Feb 2020 09:14:07 +0200 Subject: [PATCH 131/132] Fixed docs for maybe_placeholders --- docs/classes.md | 2 +- docs/tree_construction.md | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/docs/classes.md b/docs/classes.md index 284ce73..fd9ee3d 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -34,7 +34,7 @@ The Lark class accepts a grammar string or file object, and keyword options: * **propagate_positions** - Propagate line/column count to tree nodes, at the cost of performance (default=`False`) -* **maybe_placeholders** - The `[]` operator returns `None` when not matched. Setting this to `False` makes it behave like the `?` operator, and return no value at all, which may be a little faster (default=`True`) +* **maybe_placeholders** - When True, the `[]` operator returns `None` when not matched. 
When `False`, `[]` behaves like the `?` operator, and return no value at all, which may be a little faster (default=`False`) * **lexer_callbacks** - A dictionary of callbacks of type f(Token) -> Token, used to interface with the lexer Token generation. Only works with the standard and contextual lexers. See [Recipes](recipes.md) for more information. diff --git a/docs/tree_construction.md b/docs/tree_construction.md index 9e61d4d..a4d6088 100644 --- a/docs/tree_construction.md +++ b/docs/tree_construction.md @@ -9,7 +9,9 @@ Using `item+` or `item*` will result in a list of items, equivalent to writing ` Using `item?` will return the item if it matched, or nothing. -Using `[item]` will return the item if it matched, or the value `None`, if it didn't. It's possible to force `[]` to behave like `()?`, by using the `maybe_placeholders=False` option when initializing Lark. +If `maybe_placeholders=False` (the default), then `[]` behaves like `()?`. + +If `maybe_placeholders=True`, then using `[item]` will return the item if it matched, or the value `None`, if it didn't. ### Terminals From a55b7155b51418444f856f55c31abee7a688380f Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Sat, 8 Feb 2020 05:37:45 +0200 Subject: [PATCH 132/132] Added support for v_args in Interpreter (Issue #520) --- lark/visitors.py | 57 ++++++++++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/lark/visitors.py b/lark/visitors.py index da6b1d5..30a2a65 100644 --- a/lark/visitors.py +++ b/lark/visitors.py @@ -13,7 +13,31 @@ class Discard(Exception): # Transformers -class Transformer: +class _Decoratable: + @classmethod + def _apply_decorator(cls, decorator, **kwargs): + mro = getmro(cls) + assert mro[0] is cls + libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)} + for name, value in getmembers(cls): + + # Make sure the function isn't inherited (unless it's overwritten) + if name.startswith('_') or (name in libmembers and name not in cls.__dict__): + continue + if not callable(cls.__dict__[name]): + continue + + # Skip if v_args already applied (at the function level) + if hasattr(cls.__dict__[name], 'vargs_applied'): + continue + + static = isinstance(cls.__dict__[name], (staticmethod, classmethod)) + setattr(cls, name, decorator(value, static=static, **kwargs)) + return cls + + + +class Transformer(_Decoratable): """Visits the tree recursively, starting with the leaves and finally the root (bottom-up) Calls its methods (provided by user via inheritance) according to tree.data @@ -90,27 +114,6 @@ class Transformer: return token - @classmethod - def _apply_decorator(cls, decorator, **kwargs): - mro = getmro(cls) - assert mro[0] is cls - libmembers = {name for _cls in mro[1:] for name, _ in getmembers(_cls)} - for name, value in getmembers(cls): - - # Make sure the function isn't inherited (unless it's overwritten) - if name.startswith('_') or (name in libmembers and name not in cls.__dict__): - continue - if not callable(cls.__dict__[name]): - continue - - # Skip if v_args already applied (at the function level) - if hasattr(cls.__dict__[name], 'vargs_applied'): - continue - - static = isinstance(cls.__dict__[name], (staticmethod, classmethod)) - setattr(cls, name, decorator(value, static=static, **kwargs)) - return cls - class InlineTransformer(Transformer): # XXX Deprecated def _call_userfunc(self, tree, new_children=None): @@ -221,7 +224,7 @@ def visit_children_decor(func): return inner -class Interpreter: +class Interpreter(_Decoratable): """Top-down 
visitor, recursive Visits the tree, starting with the root and finally the leaves (top-down) @@ -230,8 +233,14 @@ class Interpreter: Unlike Transformer and Visitor, the Interpreter doesn't automatically visit its sub-branches. The user has to explicitly call visit_children, or use the @visit_children_decor """ + def visit(self, tree): - return getattr(self, tree.data)(tree) + f = getattr(self, tree.data) + wrapper = getattr(f, 'visit_wrapper', None) + if wrapper is not None: + return f.visit_wrapper(f, tree.data, tree.children, tree.meta) + else: + return f(tree) def visit_children(self, tree): return [self.visit(child) if isinstance(child, Tree) else child
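# A minimal usage sketch of the combination enabled by PATCH 132/132 (v_args applied to an
# Interpreter subclass, dispatched through `visit_wrapper`). This is not code from the patches
# above; the grammar, class and rule names are hypothetical, assuming lark >= 0.8.1.

from lark import Lark, v_args
from lark.visitors import Interpreter

_GRAMMAR = r"""
    start: add
    add: NUMBER "+" NUMBER
    %import common.NUMBER
    %import common.WS
    %ignore WS
"""

@v_args(inline=True)
class _Calc(Interpreter):
    def add(self, a, b):
        # With inline=True, the rule's children arrive as positional arguments
        return int(a) + int(b)

    def start(self, add_tree):
        # Interpreter is top-down: sub-branches must be visited explicitly
        return self.visit(add_tree)

if __name__ == '__main__':
    tree = Lark(_GRAMMAR, parser='lalr').parse("1 + 2")
    print(_Calc().visit(tree))  # expected output: 3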