Merge branch 'fix_recons'

7 years ago · 633bd21e70
--- a/lark/exceptions.py
+++ b/lark/exceptions.py
@@ -75,7 +75,7 @@ class UnexpectedToken(ParseError, UnexpectedInput):
        self.column = getattr(token, 'column', '?')
        self.considered_rules = considered_rules
        self.state = state
        self.pos_in_stream = token.pos_in_stream
        self.pos_in_stream = getattr(token, 'pos_in_stream', None)

        message = ("Unexpected token %r at line %s, column %s.\n"
                   "Expected: %s\n"
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -157,9 +157,9 @@ class Lark:
        self.grammar = load_grammar(grammar, self.source)

        # Compile the EBNF grammar into BNF
        tokens, self.rules, self.ignore_tokens = self.grammar.compile()
        self.terminals, self.rules, self.ignore_tokens = self.grammar.compile()

        self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks)
        self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks)

        if self.options.parser:
            self.parser = self._build_parser()
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -448,8 +448,10 @@ class Grammar:
        self.ignore = ignore

    def compile(self):
        token_defs = list(self.token_defs)
        rule_defs = self.rule_defs
        # We change the trees in-place (to support huge grammars)
        # So deepcopy allows calling compile more than once.
        token_defs = deepcopy(list(self.token_defs))
        rule_defs = deepcopy(self.rule_defs)

        # =================
        #  Compile Tokens
--- a/lark/reconstruct.py
+++ b/lark/reconstruct.py
@@ -67,38 +67,42 @@ class MakeMatchTree:

 class Reconstructor:
    def __init__(self, parser):
        # Recreate the rules to assume a standard lexer
        _tokens, rules, _grammar_extra = parser.grammar.compile()
        # XXX TODO calling compile twice returns different results!
        tokens, rules, _grammar_extra = parser.grammar.compile()

        expand1s = {r.origin for r in parser.rules if r.options and r.options.expand1}
        self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens})
        self.rules = list(self._build_recons_rules(rules))

        d = defaultdict(list)
    def _build_recons_rules(self, rules):
        expand1s = {r.origin for r in rules if r.options and r.options.expand1}

        aliases = defaultdict(list)
        for r in rules:
            # Rules can match their alias
            if r.alias:
                alias = NonTerminal(r.alias)
                d[alias].append(r.expansion)
                d[r.origin].append([alias])
            else:
                d[r.origin].append(r.expansion)
                aliases[r.origin].append( r.alias )

            # Expanded rules can match their own terminal
            for sym in r.expansion:
                if sym in expand1s:
                    d[sym].append([Terminal(sym.name)])
        rule_names = {r.origin for r in rules}
        nonterminals = {sym for sym in rule_names
                       if sym.name.startswith('_') or sym in expand1s or sym in aliases }

        for r in rules:
            recons_exp = [sym if sym in nonterminals else Terminal(sym.name)
                          for sym in r.expansion if not is_discarded_terminal(sym)]

        reduced_rules = defaultdict(list)
        for name, expansions in d.items():
            for expansion in expansions:
                reduced = [sym if sym.name.startswith('_') or sym in expand1s else Terminal(sym.name)
                           for sym in expansion if not is_discarded_terminal(sym)]
            # Skip self-recursive constructs
            if recons_exp == [r.origin]:
                continue

                reduced_rules[name, tuple(reduced)].append(expansion)
            sym = NonTerminal(r.alias) if r.alias else r.origin

        self.rules = [Rule(name, list(reduced), MakeMatchTree(name.name, expansions[0]), None)
                      for (name, reduced), expansions in reduced_rules.items()]
            yield Rule(sym, recons_exp, MakeMatchTree(sym.name, r.expansion))

        self.write_tokens = WriteTokensTransformer({t.name:t for t in _tokens})
        for origin, rule_aliases in aliases.items():
            for alias in rule_aliases:
                yield Rule(origin, [Terminal(alias)], MakeMatchTree(origin.name, [NonTerminal(alias)]))
            
            yield Rule(origin, [Terminal(origin.name)], MakeMatchTree(origin.name, [origin]))
        


    def _match(self, term, token):
--- a/tests/main.py
+++ b/tests/main.py
@@ -5,6 +5,7 @@ import logging

 from .test_trees import TestTrees
 from .test_tools import TestStandalone
 from .test_reconstructor import TestReconstructor

 try:
    from .test_nearley.test_nearley import TestNearley
--- a/tests/test_reconstructor.py
+++ b/tests/test_reconstructor.py
@@ -0,0 +1,116 @@
 import json
 import unittest
 from unittest import TestCase
 from lark import Lark
 from lark.reconstruct import Reconstructor


 # Grammar snippet appended to several of the test grammars in this file:
 # it imports the WS_INLINE, NUMBER and WORD terminals and ignores WS_INLINE.
 common = """
 %import common (WS_INLINE, NUMBER, WORD)
 %ignore WS_INLINE
 """

 def _remove_ws(s):
    return s.replace(' ', '').replace('\n','')

 class TestReconstructor(TestCase):
    """Round-trip tests: parse text, reconstruct it from the tree, compare."""

    def assert_reconstruct(self, grammar, code):
        """Assert that `code` survives a parse/reconstruct round trip.

        Builds an LALR parser from `grammar`, parses `code`, reconstructs
        text from the resulting tree, and compares the two texts after
        stripping spaces and newlines from both.
        """
        parser = Lark(grammar, parser='lalr')
        tree = parser.parse(code)
        new = Reconstructor(parser).reconstruct(tree)
        self.assertEqual(_remove_ws(code), _remove_ws(new))

    # NOTE: the test methods below are described with comments rather than
    # docstrings so that unittest's verbose output keeps showing method names.

    def test_starred_rule(self):
        # The start rule is a starred reference to another rule (`item*`).

        # `\\s` (not `\s`): this is a non-raw string, and `\s` is an invalid
        # escape sequence there. The resulting grammar text is unchanged.
        g = """
        start: item*
        item: NL
            | rule
        rule: WORD ":" NUMBER
        NL: /(\\r?\\n)+\\s*/
        """ + common

        code = """
        Elephants: 12
        """

        self.assert_reconstruct(g, code)

    def test_starred_group(self):
        # The start rule is a starred inline group containing the
        # underscore-prefixed terminal `_NL`.

        # `\\s` (not `\s`): see test_starred_rule.
        g = """
        start: (rule | _NL)*
        rule: WORD ":" NUMBER
        _NL: /(\\r?\\n)+\\s*/
        """ + common

        code = """
        Elephants: 12
        """

        self.assert_reconstruct(g, code)

    def test_alias(self):
        # One alternative of `line` carries an alias (`"hello" -> hi`).

        # `\\s` (not `\s`): see test_starred_rule.
        g = """
        start: line*
        line: NL
            | rule
            | "hello" -> hi
        rule: WORD ":" NUMBER
        NL: /(\\r?\\n)+\\s*/
        """ + common

        code = """
        Elephants: 12
        hello
        """

        self.assert_reconstruct(g, code)

    def test_json_example(self):
        # Round-trips a JSON document; the result is compared as parsed JSON
        # (json.loads on both sides) rather than as whitespace-stripped text.
        test_json = '''
            {
                "empty_object" : {},
                "empty_array"  : [],
                "booleans"     : { "YES" : true, "NO" : false },
                "numbers"      : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ],
                "strings"      : [ "This", [ "And" , "That", "And a \\"b" ] ],
                "nothing"      : null
            }
        '''

        json_grammar = r"""
            ?start: value

            ?value: object
                  | array
                  | string
                  | SIGNED_NUMBER      -> number
                  | "true"             -> true
                  | "false"            -> false
                  | "null"             -> null

            array  : "[" [value ("," value)*] "]"
            object : "{" [pair ("," pair)*] "}"
            pair   : string ":" value

            string : ESCAPED_STRING

            %import common.ESCAPED_STRING
            %import common.SIGNED_NUMBER
            %import common.WS

            %ignore WS
        """

        json_parser = Lark(json_grammar, parser='lalr')
        tree = json_parser.parse(test_json)

        new_json = Reconstructor(json_parser).reconstruct(tree)
        self.assertEqual(json.loads(new_json), json.loads(test_json))


 # Run this module's tests via unittest when the file is executed as a script.
 if __name__ == '__main__':
    unittest.main()