| @@ -75,7 +75,7 @@ class UnexpectedToken(ParseError, UnexpectedInput): | |||
| self.column = getattr(token, 'column', '?') | |||
| self.considered_rules = considered_rules | |||
| self.state = state | |||
| self.pos_in_stream = token.pos_in_stream | |||
| self.pos_in_stream = getattr(token, 'pos_in_stream', None) | |||
| message = ("Unexpected token %r at line %s, column %s.\n" | |||
| "Expected: %s\n" | |||
| @@ -157,9 +157,9 @@ class Lark: | |||
| self.grammar = load_grammar(grammar, self.source) | |||
| # Compile the EBNF grammar into BNF | |||
| tokens, self.rules, self.ignore_tokens = self.grammar.compile() | |||
| self.terminals, self.rules, self.ignore_tokens = self.grammar.compile() | |||
| self.lexer_conf = LexerConf(tokens, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks) | |||
| self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, self.options.lexer_callbacks) | |||
| if self.options.parser: | |||
| self.parser = self._build_parser() | |||
| @@ -448,8 +448,10 @@ class Grammar: | |||
| self.ignore = ignore | |||
| def compile(self): | |||
| token_defs = list(self.token_defs) | |||
| rule_defs = self.rule_defs | |||
| # We change the trees in-place (to support huge grammars) | |||
| # So deepcopy allows calling compile more than once. | |||
| token_defs = deepcopy(list(self.token_defs)) | |||
| rule_defs = deepcopy(self.rule_defs) | |||
| # ================= | |||
| # Compile Tokens | |||
| @@ -67,38 +67,42 @@ class MakeMatchTree: | |||
| class Reconstructor: | |||
| def __init__(self, parser): | |||
| # Recreate the rules to assume a standard lexer | |||
| _tokens, rules, _grammar_extra = parser.grammar.compile() | |||
| # XXX TODO calling compile twice returns different results! | |||
| tokens, rules, _grammar_extra = parser.grammar.compile() | |||
| expand1s = {r.origin for r in parser.rules if r.options and r.options.expand1} | |||
| self.write_tokens = WriteTokensTransformer({t.name:t for t in tokens}) | |||
| self.rules = list(self._build_recons_rules(rules)) | |||
| d = defaultdict(list) | |||
| def _build_recons_rules(self, rules): | |||
| expand1s = {r.origin for r in rules if r.options and r.options.expand1} | |||
| aliases = defaultdict(list) | |||
| for r in rules: | |||
| # Rules can match their alias | |||
| if r.alias: | |||
| alias = NonTerminal(r.alias) | |||
| d[alias].append(r.expansion) | |||
| d[r.origin].append([alias]) | |||
| else: | |||
| d[r.origin].append(r.expansion) | |||
| aliases[r.origin].append( r.alias ) | |||
| # Expanded rules can match their own terminal | |||
| for sym in r.expansion: | |||
| if sym in expand1s: | |||
| d[sym].append([Terminal(sym.name)]) | |||
| rule_names = {r.origin for r in rules} | |||
| nonterminals = {sym for sym in rule_names | |||
| if sym.name.startswith('_') or sym in expand1s or sym in aliases } | |||
| for r in rules: | |||
| recons_exp = [sym if sym in nonterminals else Terminal(sym.name) | |||
| for sym in r.expansion if not is_discarded_terminal(sym)] | |||
| reduced_rules = defaultdict(list) | |||
| for name, expansions in d.items(): | |||
| for expansion in expansions: | |||
| reduced = [sym if sym.name.startswith('_') or sym in expand1s else Terminal(sym.name) | |||
| for sym in expansion if not is_discarded_terminal(sym)] | |||
| # Skip self-recursive constructs | |||
| if recons_exp == [r.origin]: | |||
| continue | |||
| reduced_rules[name, tuple(reduced)].append(expansion) | |||
| sym = NonTerminal(r.alias) if r.alias else r.origin | |||
| self.rules = [Rule(name, list(reduced), MakeMatchTree(name.name, expansions[0]), None) | |||
| for (name, reduced), expansions in reduced_rules.items()] | |||
| yield Rule(sym, recons_exp, MakeMatchTree(sym.name, r.expansion)) | |||
| self.write_tokens = WriteTokensTransformer({t.name:t for t in _tokens}) | |||
| for origin, rule_aliases in aliases.items(): | |||
| for alias in rule_aliases: | |||
| yield Rule(origin, [Terminal(alias)], MakeMatchTree(origin.name, [NonTerminal(alias)])) | |||
| yield Rule(origin, [Terminal(origin.name)], MakeMatchTree(origin.name, [origin])) | |||
| def _match(self, term, token): | |||
| @@ -5,6 +5,7 @@ import logging | |||
| from .test_trees import TestTrees | |||
| from .test_tools import TestStandalone | |||
| from .test_reconstructor import TestReconstructor | |||
| try: | |||
| from .test_nearley.test_nearley import TestNearley | |||
| @@ -0,0 +1,116 @@ | |||
| import json | |||
| import unittest | |||
| from unittest import TestCase | |||
| from lark import Lark | |||
| from lark.reconstruct import Reconstructor | |||
# Grammar fragment appended to the hand-written test grammars below: it imports
# the WS_INLINE, NUMBER and WORD terminals and tells the lexer to ignore
# WS_INLINE.
common = """
%import common (WS_INLINE, NUMBER, WORD)
%ignore WS_INLINE
"""
| def _remove_ws(s): | |||
| return s.replace(' ', '').replace('\n','') | |||
class TestReconstructor(TestCase):
    """Round-trip tests: parse text, rebuild it with ``Reconstructor``, compare."""

    def assert_reconstruct(self, grammar, code):
        """Assert that *code* survives a parse/reconstruct round trip.

        Builds an LALR parser from *grammar*, parses *code*, reconstructs
        text from the resulting tree and compares both texts after
        ``_remove_ws`` has stripped spaces and newlines from each.
        """
        parser = Lark(grammar, parser='lalr')
        tree = parser.parse(code)
        new = Reconstructor(parser).reconstruct(tree)
        self.assertEqual(_remove_ws(code), _remove_ws(new))

    def test_starred_rule(self):
        """A starred rule whose alternatives are a terminal or a sub-rule."""
        # Raw string: the regex backslashes (\r, \n, \s) must reach Lark
        # verbatim, and "\s" is an invalid escape in a non-raw literal.
        g = r"""
        start: item*
        item: NL
            | rule
        rule: WORD ":" NUMBER
        NL: /(\r?\n)+\s*/
        """ + common

        code = """
        Elephants: 12
        """

        self.assert_reconstruct(g, code)

    def test_starred_group(self):
        """A starred inline group mixing a rule and an underscore-prefixed terminal."""
        # Raw string for the same reason as in test_starred_rule.
        g = r"""
        start: (rule | _NL)*
        rule: WORD ":" NUMBER
        _NL: /(\r?\n)+\s*/
        """ + common

        code = """
        Elephants: 12
        """

        self.assert_reconstruct(g, code)

    def test_alias(self):
        """A rule with an aliased alternative (``"hello" -> hi``)."""
        # Raw string for the same reason as in test_starred_rule.
        g = r"""
        start: line*
        line: NL
            | rule
            | "hello" -> hi
        rule: WORD ":" NUMBER
        NL: /(\r?\n)+\s*/
        """ + common

        code = """
        Elephants: 12
        hello
        """

        self.assert_reconstruct(g, code)

    def test_json_example(self):
        """Reconstructed JSON must decode to the same value as the input JSON."""
        test_json = '''
            {
                "empty_object" : {},
                "empty_array"  : [],
                "booleans"     : { "YES" : true, "NO" : false },
                "numbers"      : [ 0, 1, -2, 3.3, 4.4e5, 6.6e-7 ],
                "strings"      : [ "This", [ "And" , "That", "And a \\"b" ] ],
                "nothing"      : null
            }
        '''

        json_grammar = r"""
            ?start: value

            ?value: object
                  | array
                  | string
                  | SIGNED_NUMBER      -> number
                  | "true"             -> true
                  | "false"            -> false
                  | "null"             -> null

            array  : "[" [value ("," value)*] "]"
            object : "{" [pair ("," pair)*] "}"
            pair   : string ":" value

            string : ESCAPED_STRING

            %import common.ESCAPED_STRING
            %import common.SIGNED_NUMBER
            %import common.WS

            %ignore WS
        """

        json_parser = Lark(json_grammar, parser='lalr')
        tree = json_parser.parse(test_json)

        new_json = Reconstructor(json_parser).reconstruct(tree)
        # Compare decoded values rather than text, so whitespace differences
        # between the input and the reconstruction do not matter.
        self.assertEqual(json.loads(new_json), json.loads(test_json))
# Run this module's tests through unittest's runner when executed as a script.
if __name__ == '__main__':
    unittest.main()