diff --git a/lark/grammar.py b/lark/grammar.py index da1e282..21a62f1 100644 --- a/lark/grammar.py +++ b/lark/grammar.py @@ -64,6 +64,7 @@ class RuleOptions: self.keep_all_tokens = keep_all_tokens self.expand1 = expand1 self.priority = priority + self.empty_indices = () def __repr__(self): return 'RuleOptions(%r, %r, %r)' % ( diff --git a/lark/lark.py b/lark/lark.py index 15a0bd4..b1e8266 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -45,6 +45,7 @@ class LarkOptions(object): profile - Measure run-time usage in Lark. Read results from the profiler proprety (Default: False) propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches. lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution. + maybe_placeholders - Experimental feature. Instead of omitting optional rules (i.e. rule?), replace them with None """ if __doc__: __doc__ += OPTIONS_DOC @@ -66,6 +67,7 @@ class LarkOptions(object): self.propagate_positions = o.pop('propagate_positions', False) self.earley__predict_all = o.pop('earley__predict_all', False) self.lexer_callbacks = o.pop('lexer_callbacks', {}) + self.maybe_placeholders = o.pop('maybe_placeholders', False) assert self.parser in ('earley', 'lalr', 'cyk', None) @@ -179,7 +181,7 @@ class Lark: def _build_parser(self): self.parser_class = get_frontend(self.options.parser, self.options.lexer) - self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr') + self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr', self.options.maybe_placeholders) callback = self._parse_tree_builder.create_callback(self.options.transformer) if self.profiler: for f in dir(callback): diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 
1966117..ab42355 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -3,7 +3,7 @@ import os.path import sys from ast import literal_eval -from copy import deepcopy +from copy import copy, deepcopy from .utils import bfs from .lexer import Token, TerminalDef, PatternStr, PatternRE @@ -26,6 +26,8 @@ EXT = '.lark' _RE_FLAGS = 'imslux' +_EMPTY = Symbol('__empty__') + _TERMINAL_NAMES = { '.' : 'DOT', ',' : 'COMMA', @@ -151,7 +153,6 @@ RULES = { 'literal': ['REGEXP', 'STRING'], } - @inline_args class EBNF_to_BNF(Transformer_InPlace): def __init__(self): @@ -175,7 +176,7 @@ class EBNF_to_BNF(Transformer_InPlace): def expr(self, rule, op, *args): if op.value == '?': - return ST('expansions', [rule, ST('expansion', [])]) + return ST('expansions', [rule, _EMPTY]) elif op.value == '+': # a : b c+ d # --> @@ -481,7 +482,8 @@ class Grammar: for name, rule_tree, options in rule_defs: ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None tree = transformer.transform(rule_tree) - rules.append((name, ebnf_to_bnf.transform(tree), options)) + res = ebnf_to_bnf.transform(tree) + rules.append((name, res, options)) rules += ebnf_to_bnf.new_rules assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision" @@ -499,9 +501,17 @@ class Grammar: if alias and name.startswith('_'): raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) - assert all(isinstance(x, Symbol) for x in expansion), expansion + empty_indices = [i for i, x in enumerate(expansion) if x==_EMPTY] + if empty_indices: + assert options + exp_options = copy(options) + exp_options.empty_indices = len(expansion), empty_indices + expansion = [x for x in expansion if x!=_EMPTY] + else: + exp_options = options - rule = Rule(NonTerminal(name), expansion, alias, options) + assert all(isinstance(x, Symbol) for x in expansion), expansion + rule = 
Rule(NonTerminal(name), expansion, alias, exp_options) compiled_rules.append(rule) return terminals, compiled_rules, self.ignore diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 595ef8c..11b059a 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -1,7 +1,5 @@ from .exceptions import GrammarError -from .utils import suppress from .lexer import Token -from .grammar import Rule from .tree import Tree from .visitors import InlineTransformer # XXX Deprecated @@ -19,6 +17,23 @@ class ExpandSingleChild: else: return self.node_builder(children) +class AddMaybePlaceholder: + def __init__(self, empty_indices, node_builder): + self.node_builder = node_builder + self.empty_indices = empty_indices + + def __call__(self, children): + t = self.node_builder(children) + if self.empty_indices: + exp_len, empty_indices = self.empty_indices + # Calculate offset to handle repetition correctly + # e.g. ("a" "b"?)+ + # For non-repetitive rules, offset should be 0 + offset = len(t.children) - (exp_len - len(empty_indices)) + for i in empty_indices: + t.children.insert(i + offset, None) + return t + class PropagatePositions: def __init__(self, node_builder): @@ -116,11 +131,12 @@ def ptb_inline_args(func): class ParseTreeBuilder: - def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False): + def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False, maybe_placeholders=False): self.tree_class = tree_class self.propagate_positions = propagate_positions self.always_keep_all_tokens = keep_all_tokens self.ambiguous = ambiguous + self.maybe_placeholders = maybe_placeholders self.rule_builders = list(self._init_builders(rules)) @@ -135,6 +151,7 @@ class ParseTreeBuilder: wrapper_chain = filter(None, [ (expand_single_child and not rule.alias) and ExpandSingleChild, maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous), + 
self.maybe_placeholders and partial(AddMaybePlaceholder, options.empty_indices if options else ()), self.propagate_positions and PropagatePositions, ]) diff --git a/tests/test_parser.py b/tests/test_parser.py index 3a4a60a..28a2324 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1248,6 +1248,28 @@ def _make_parser_test(LEXER, PARSER): res = p.parse('B') self.assertEqual(len(res.children), 3) + def test_maybe_placeholders(self): + p = Lark("""!start: "a"? "b"? "c"? """, maybe_placeholders=True) + self.assertEqual(p.parse("").children, [None, None, None]) + self.assertEqual(p.parse("a").children, ['a', None, None]) + self.assertEqual(p.parse("b").children, [None, 'b', None]) + self.assertEqual(p.parse("c").children, [None, None, 'c']) + self.assertEqual(p.parse("ab").children, ['a', 'b', None]) + self.assertEqual(p.parse("ac").children, ['a', None, 'c']) + self.assertEqual(p.parse("bc").children, [None, 'b', 'c']) + self.assertEqual(p.parse("abc").children, ['a', 'b', 'c']) + + p = Lark("""!start: ("a"? "b" "c"?)+ """, maybe_placeholders=True) + self.assertEqual(p.parse("b").children, [None, 'b', None]) + self.assertEqual(p.parse("bb").children, [None, 'b', None, None, 'b', None]) + self.assertEqual(p.parse("abbc").children, ['a', 'b', None, None, 'b', 'c']) + self.assertEqual(p.parse("babbcabcb").children, + [None, 'b', None, + 'a', 'b', None, + None, 'b', 'c', + 'a', 'b', 'c', + None, 'b', None]) + _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()