diff --git a/lark/lark.py b/lark/lark.py
index ecaa8da..770b821 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -274,7 +274,7 @@ class Lark(Serialize):
         assert self.options.ambiguity in ('resolve', 'explicit', 'forest', 'auto', )
 
         # Parse the grammar file and compose the grammars (TODO)
-        self.grammar = load_grammar(grammar, self.source, re_module)
+        self.grammar = load_grammar(grammar, self.source, re_module, self.options.keep_all_tokens)
 
         if self.options.postlex is not None:
             terminals_to_keep = set(self.options.postlex.always_accept)
@@ -335,7 +335,13 @@ class Lark(Serialize):
         self._callbacks = None
         # we don't need these callbacks if we aren't building a tree
         if self.options.ambiguity != 'forest':
-            self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class or Tree, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit', self.options.maybe_placeholders)
+            self._parse_tree_builder = ParseTreeBuilder(
+                self.rules,
+                self.options.tree_class or Tree,
+                self.options.propagate_positions,
+                self.options.parser!='lalr' and self.options.ambiguity=='explicit',
+                self.options.maybe_placeholders
+            )
             self._callbacks = self._parse_tree_builder.create_callback(self.options.transformer)
 
     def _build_parser(self):
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 1590665..d039638 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -650,22 +650,6 @@ class Grammar:
 
 
 _imported_grammars = {}
-def import_grammar(grammar_path, re_, base_paths=[]):
-    if grammar_path not in _imported_grammars:
-        import_paths = base_paths + IMPORT_PATHS
-        for import_path in import_paths:
-            with suppress(IOError):
-                joined_path = os.path.join(import_path, grammar_path)
-                with open(joined_path, encoding='utf8') as f:
-                    text = f.read()
-                grammar = load_grammar(text, joined_path, re_)
-                _imported_grammars[grammar_path] = grammar
-                break
-        else:
-            open(grammar_path, encoding='utf8')
-            assert False
-
-    return _imported_grammars[grammar_path]
 
 def import_from_grammar_into_namespace(grammar, namespace, aliases):
     """Returns all rules and terminals of grammar, prepended
@@ -803,7 +787,7 @@ class GrammarLoader:
         ('%ignore expects a value', ['%ignore %import\n']),
     ]
 
-    def __init__(self, re_module):
+    def __init__(self, re_module, global_keep_all_tokens):
        terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
 
         rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
@@ -816,6 +800,24 @@ class GrammarLoader:
 
         self.canonize_tree = CanonizeTree()
         self.re_module = re_module
+        self.global_keep_all_tokens = global_keep_all_tokens
+
+    def import_grammar(self, grammar_path, base_paths=[]):
+        if grammar_path not in _imported_grammars:
+            import_paths = base_paths + IMPORT_PATHS
+            for import_path in import_paths:
+                with suppress(IOError):
+                    joined_path = os.path.join(import_path, grammar_path)
+                    with open(joined_path, encoding='utf8') as f:
+                        text = f.read()
+                    grammar = self.load_grammar(text, joined_path)
+                    _imported_grammars[grammar_path] = grammar
+                    break
+            else:
+                open(grammar_path, encoding='utf8')  # Force a file not found error
+                assert False
+
+        return _imported_grammars[grammar_path]
 
     def load_grammar(self, grammar_text, grammar_name='<?>'):
         "Parse grammar_text, verify, and create Grammar object. Display nice messages on error."
@@ -901,7 +903,7 @@ class GrammarLoader:
         # import grammars
         for dotted_path, (base_paths, aliases) in imports.items():
             grammar_path = os.path.join(*dotted_path) + EXT
-            g = import_grammar(grammar_path, self.re_module, base_paths=base_paths)
+            g = self.import_grammar(grammar_path, base_paths=base_paths)
             new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)
 
             term_defs += new_td
@@ -946,7 +948,11 @@ class GrammarLoader:
         rules = rule_defs
 
         rule_names = {}
-        for name, params, _x, _o in rules:
+        for name, params, _x, option in rules:
+            # It isn't enough to simply not discard the tokens later; option.keep_all_tokens must be set here so that maybe_placeholders is generated correctly
+            if self.global_keep_all_tokens:
+                option.keep_all_tokens = True
+
             if name.startswith('__'):
                 raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
             if name in rule_names:
@@ -981,5 +987,5 @@
 
 
 
-def load_grammar(grammar, source, re_):
-    return GrammarLoader(re_).load_grammar(grammar, source)
+def load_grammar(grammar, source, re_, global_keep_all_tokens):
+    return GrammarLoader(re_, global_keep_all_tokens).load_grammar(grammar, source)
diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py
index 8b81d29..a4c4330 100644
--- a/lark/parse_tree_builder.py
+++ b/lark/parse_tree_builder.py
@@ -299,10 +299,9 @@ def apply_visit_wrapper(func, name, wrapper):
 
 
 class ParseTreeBuilder:
-    def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False):
+    def __init__(self, rules, tree_class, propagate_positions=False, ambiguous=False, maybe_placeholders=False):
         self.tree_class = tree_class
         self.propagate_positions = propagate_positions
-        self.always_keep_all_tokens = keep_all_tokens
         self.ambiguous = ambiguous
         self.maybe_placeholders = maybe_placeholders
 
@@ -311,7 +310,7 @@ class ParseTreeBuilder:
     def _init_builders(self, rules):
         for rule in rules:
             options = rule.options
-            keep_all_tokens = self.always_keep_all_tokens or options.keep_all_tokens
+            keep_all_tokens = options.keep_all_tokens
             expand_single_child = options.expand1
 
             wrapper_chain = list(filter(None, [
diff --git a/tests/test_parser.py b/tests/test_parser.py
index cea40b9..c38b81e 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -2064,6 +2064,10 @@ def _make_parser_test(LEXER, PARSER):
         # Anonymous tokens shouldn't count
         p = _Lark("""start: ["a"] ["b"] ["c"] """, maybe_placeholders=True)
         self.assertEqual(p.parse("").children, [])
+
+        # Unless keep_all_tokens=True
+        p = _Lark("""start: ["a"] ["b"] ["c"] """, maybe_placeholders=True, keep_all_tokens=True)
+        self.assertEqual(p.parse("").children, [None, None, None])
 
         # All invisible constructs shouldn't count
         p = _Lark("""start: [A] ["b"] [_c] ["e" "f" _c]
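
Usage note (not part of the patch): a minimal sketch of the behavior this change enables, mirroring the new test case above. It assumes lark's public Lark constructor with its default (Earley) parser:

    from lark import Lark

    # With keep_all_tokens now threaded through load_grammar, every rule gets
    # option.keep_all_tokens set, so maybe_placeholders emits an explicit None
    # for each absent anonymous optional token.
    p = Lark('start: ["a"] ["b"] ["c"]', maybe_placeholders=True, keep_all_tokens=True)
    assert p.parse("").children == [None, None, None]

    # Without keep_all_tokens, anonymous optionals still produce no placeholders.
    p = Lark('start: ["a"] ["b"] ["c"]', maybe_placeholders=True)
    assert p.parse("").children == []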