diff --git a/lark/lark.py b/lark/lark.py index 6b065f8..ba2eac0 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -62,14 +62,13 @@ class LarkOptions(object): self.profile = o.pop('profile', False) self.ambiguity = o.pop('ambiguity', 'auto') self.propagate_positions = o.pop('propagate_positions', False) - self.earley__predict_all = o.pop('earley__predict_all', False) self.lexer_callbacks = o.pop('lexer_callbacks', {}) assert self.parser in ('earley', 'lalr', 'cyk', None) - if self.parser == 'earley' and self.transformer: - raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm.' - 'Please use your transformer on the resulting parse tree, or use a different algorithm (i.e. lalr)') + if self.ambiguity == 'explicit' and self.transformer: + raise ValueError('Cannot specify an embedded transformer when using the Earley algorithm for explicit ambiguity. ' + 'Please use your transformer on the resulting Forest, or use a different algorithm (e.g. LALR)') if o: raise ValueError("Unknown options: %s" % o.keys()) @@ -176,7 +175,7 @@ class Lark: def _build_parser(self): self.parser_class = get_frontend(self.options.parser, self.options.lexer) - self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr') + self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr' and self.options.ambiguity=='explicit') callback = self._parse_tree_builder.create_callback(self.options.transformer) if self.profiler: for f in dir(callback): diff --git a/lark/parse_tree_builder.py b/lark/parse_tree_builder.py index 0ebe461..31b7214 100644 --- a/lark/parse_tree_builder.py +++ b/lark/parse_tree_builder.py @@ -7,6 +7,7 @@ from .visitors import InlineTransformer # XXX Deprecated ###{standalone from functools import partial, wraps +from itertools import repeat, product class ExpandSingleChild: @@ -62,23 +63,11 @@ class PropagatePositions: class ChildFilter: + "Optimized childfilter (assumes no duplication in parse tree, so it's safe to change it)" def __init__(self, to_include, node_builder): self.node_builder = node_builder self.to_include = to_include - def __call__(self, children): - filtered = [] - for i, to_expand in self.to_include: - if to_expand: - filtered += children[i].children - else: - filtered.append(children[i]) - - return self.node_builder(filtered) - -class ChildFilterLALR(ChildFilter): - "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)" - def __call__(self, children): filtered = [] for i, to_expand in self.to_include: @@ -89,19 +78,43 @@ class ChildFilterLALR(ChildFilter): filtered = children[i].children else: filtered.append(children[i]) - return self.node_builder(filtered) def _should_expand(sym): return not sym.is_term and sym.name.startswith('_') -def maybe_create_child_filter(expansion, keep_all_tokens, ambiguous): +def maybe_create_child_filter(expansion, keep_all_tokens): to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) if keep_all_tokens or not (sym.is_term and sym.filter_out)] if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include): - return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include) + return partial(ChildFilter, to_include) + +class AmbiguousExpander: + """Deal with the case where we're expanding children ('_rule') into a 
parent but the children + are ambiguous. i.e. (parent->_ambig->_expand_this_rule). In this case, make the parent itself + ambiguous with as many copies as there are ambiguous children, and then copy the ambiguous children + into the right parents in the right places, essentially shifting the ambiguity up the tree.""" + def __init__(self, to_expand, tree_class, node_builder): + self.node_builder = node_builder + self.tree_class = tree_class + self.to_expand = to_expand + def __call__(self, children): + def _is_ambig_tree(child): + return hasattr(child, 'data') and child.data == '_ambig' + + ambiguous = [i for i in self.to_expand if _is_ambig_tree(children[i])] + if ambiguous: + expand = [iter(child.children) if i in ambiguous else repeat(child) for i, child in enumerate(children)] + return self.tree_class('_ambig', [self.node_builder(list(f[0])) for f in product(zip(*expand))]) + return self.node_builder(children) + +def maybe_create_ambiguous_expander(tree_class, expansion, keep_all_tokens): + to_expand = [i for i, sym in enumerate(expansion) + if keep_all_tokens or ((not (sym.is_term and sym.filter_out)) and _should_expand(sym))] + if to_expand: + return partial(AmbiguousExpander, to_expand, tree_class) class Callback(object): pass @@ -113,8 +126,6 @@ def ptb_inline_args(func): return func(*children) return f - - class ParseTreeBuilder: def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False): self.tree_class = tree_class @@ -135,7 +146,8 @@ class ParseTreeBuilder: wrapper_chain = filter(None, [ self.propagate_positions and PropagatePositions, (expand_single_child and not rule.alias) and ExpandSingleChild, - maybe_create_child_filter(rule.expansion, keep_all_tokens, self.ambiguous), + maybe_create_child_filter(rule.expansion, keep_all_tokens), + self.ambiguous and maybe_create_ambiguous_expander(self.tree_class, rule.expansion, keep_all_tokens), ]) yield rule, wrapper_chain diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py index 7dd972d..927cf4f 100644 --- a/lark/parser_frontends.py +++ b/lark/parser_frontends.py @@ -4,8 +4,7 @@ from functools import partial from .utils import get_regexp_width from .parsers.grammar_analysis import GrammarAnalyzer from .lexer import TraditionalLexer, ContextualLexer, Lexer, Token - -from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk +from .parsers import lalr_parser, earley, earley_forest, xearley, cyk from .tree import Tree class WithLexer: @@ -54,13 +53,13 @@ class LALR_CustomLexer(WithLexer): self.lexer = lexer_cls(lexer_conf) -def get_ambiguity_resolver(options): +def get_ambiguity_options(options): if not options or options.ambiguity == 'resolve': - return resolve_ambig.standard_resolve_ambig + return {} elif options.ambiguity == 'resolve__antiscore_sum': - return resolve_ambig.antiscore_sum_resolve_ambig + return {'forest_sum_visitor': earley_forest.ForestAntiscoreSumVisitor} elif options.ambiguity == 'explicit': - return None + return {'resolve_ambiguity': False} raise ValueError(options) def tokenize_text(text): @@ -76,8 +75,7 @@ class Earley(WithLexer): def __init__(self, lexer_conf, parser_conf, options=None): self.init_traditional_lexer(lexer_conf) - self.parser = earley.Parser(parser_conf, self.match, - resolve_ambiguity=get_ambiguity_resolver(options)) + self.parser = earley.Parser(parser_conf, self.match, **get_ambiguity_options(options)) def match(self, term, token): return term.name == token.type @@ -89,11 +87,10 @@ class XEarley: 
self._prepare_match(lexer_conf) + kw.update(get_ambiguity_options(options)) self.parser = xearley.Parser(parser_conf, self.match, - resolve_ambiguity=get_ambiguity_resolver(options), ignore=lexer_conf.ignore, - predict_all=options.earley__predict_all, **kw ) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index 6f823de..47bf0ff 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -1,160 +1,44 @@ -"This module implements an Earley Parser" - -# The parser uses a parse-forest to keep track of derivations and ambiguations. -# When the parse ends successfully, a disambiguation stage resolves all ambiguity -# (right now ambiguity resolution is not developed beyond the needs of lark) -# Afterwards the parse tree is reduced (transformed) according to user callbacks. -# I use the no-recursion version of Transformer, because the tree might be -# deeper than Python's recursion limit (a bit absurd, but that's life) -# -# The algorithm keeps track of each state set, using a corresponding Column instance. -# Column keeps track of new items using NewsList instances. -# +"""This module implements a scannerless Earley parser. + +The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: + https://www.sciencedirect.com/science/article/pii/S1571066108001497 + +That is probably the best reference for understanding the algorithm here. + +The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format +is better documented here: + http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ +""" # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -from ..tree import Tree from ..visitors import Transformer_InPlace, v_args from ..exceptions import ParseError, UnexpectedToken from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal +from .earley_common import Column, Item +from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode - -class Derivation(Tree): - def __init__(self, rule, items=None): - Tree.__init__(self, 'drv', items or []) - self.meta.rule = rule - self._hash = None - - def _pretty_label(self): # Nicer pretty for debugging the parser - return self.meta.rule.origin.name if self.meta.rule else self.data - - def __hash__(self): - if self._hash is None: - self._hash = Tree.__hash__(self) - return self._hash - -class Item(object): - "An Earley Item, the atom of the algorithm." 
- - def __init__(self, rule, ptr, start, tree): - self.rule = rule - self.ptr = ptr - self.start = start - self.tree = tree if tree is not None else Derivation(self.rule) - - @property - def expect(self): - return self.rule.expansion[self.ptr] - - @property - def is_complete(self): - return self.ptr == len(self.rule.expansion) - - def advance(self, tree): - assert self.tree.data == 'drv' - new_tree = Derivation(self.rule, self.tree.children + [tree]) - return self.__class__(self.rule, self.ptr+1, self.start, new_tree) - - def __eq__(self, other): - return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule - - def __hash__(self): - return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__ - - def __repr__(self): - before = list(map(str, self.rule.expansion[:self.ptr])) - after = list(map(str, self.rule.expansion[self.ptr:])) - return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after)) - -class NewsList(list): - "Keeps track of newly added items (append-only)" - - def __init__(self, initial=None): - list.__init__(self, initial or []) - self.last_iter = 0 - - def get_news(self): - i = self.last_iter - self.last_iter = len(self) - return self[i:] - - - -class Column: - "An entry in the table, aka Earley Chart. Contains lists of items." - def __init__(self, i, FIRST, predict_all=False): - self.i = i - self.to_reduce = NewsList() - self.to_predict = NewsList() - self.to_scan = [] - self.item_count = 0 - self.FIRST = FIRST - - self.predicted = set() - self.completed = {} - self.predict_all = predict_all - - def add(self, items): - """Sort items into scan/predict/reduce newslists - - Makes sure only unique items are added. - """ - for item in items: - - item_key = item, item.tree # Elsewhere, tree is not part of the comparison - if item.is_complete: - # XXX Potential bug: What happens if there's ambiguity in an empty rule? - if item.rule.expansion and item_key in self.completed: - old_tree = self.completed[item_key].tree - if old_tree == item.tree: - is_empty = not self.FIRST[item.rule.origin] - if not is_empty: - continue - - if old_tree.data != '_ambig': - new_tree = old_tree.copy() - new_tree.meta.rule = old_tree.meta.rule - old_tree.set('_ambig', [new_tree]) - old_tree.meta.rule = None # No longer a 'drv' node - - if item.tree.children[0] is old_tree: # XXX a little hacky! - raise ParseError("Infinite recursion in grammar! 
(Rule %s)" % item.rule) - - if item.tree not in old_tree.children: - old_tree.children.append(item.tree) - # old_tree.children.append(item.tree) - else: - self.completed[item_key] = item - self.to_reduce.append(item) - else: - if item.expect.is_term: - self.to_scan.append(item) - else: - k = item_key if self.predict_all else item - if k in self.predicted: - continue - self.predicted.add(k) - self.to_predict.append(item) - - self.item_count += 1 # Only count if actually added - - - def __bool__(self): - return bool(self.item_count) - __nonzero__ = __bool__ # Py2 backwards-compatibility +from collections import deque, defaultdict class Parser: - def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None): + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, forest_sum_visitor = ForestSumVisitor): analysis = GrammarAnalyzer(parser_conf) self.parser_conf = parser_conf self.resolve_ambiguity = resolve_ambiguity + self.forest_sum_visitor = forest_sum_visitor self.FIRST = analysis.FIRST - self.postprocess = {} + self.callbacks = {} self.predictions = {} + + ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than + # the slow 'isupper' in is_terminal. + self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term } + self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term } + for rule in parser_conf.rules: - self.postprocess[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias) + self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None) self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] self.term_matcher = term_matcher @@ -163,72 +47,163 @@ class Parser: def parse(self, stream, start_symbol=None): # Define parser functions start_symbol = NonTerminal(start_symbol or self.parser_conf.start) - - _Item = Item match = self.term_matcher - - def predict(nonterm, column): - assert not nonterm.is_term, nonterm - return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]] - - def complete(item): - name = item.rule.origin - return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name] - - def predict_and_complete(column): - while True: - to_predict = {x.expect for x in column.to_predict.get_news() - if x.ptr} # if not part of an already predicted batch - to_reduce = set(column.to_reduce.get_news()) - if not (to_predict or to_reduce): - break - - for nonterm in to_predict: - column.add( predict(nonterm, column) ) - - for item in to_reduce: - new_items = list(complete(item)) - if item in new_items: - raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) - column.add(new_items) - - def scan(i, token, column): - next_set = Column(i, self.FIRST) - next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token)) - - if not next_set: - expect = {i.expect.name for i in column.to_scan} - raise UnexpectedToken(token, expect, considered_rules=set(column.to_scan)) - - return next_set + held_completions = defaultdict(list) + node_cache = {} + token_cache = {} + + def make_symbol_node(s, start, end): + label = (s, start.i, end.i) + if label in node_cache: + node = node_cache[label] + else: + node = node_cache[label] = SymbolNode(s, start, end) + return node + + def predict_and_complete(column, to_scan): + """The core Earley Predictor and Completer. 
+ + At each stage of the input, we handle any completed items (things + that matched on the last cycle) and use those to predict what should + come next in the input stream. The completions and any predicted + non-terminals are recursively processed until we reach a set of items + expecting terminals, which can be added to the scan list for the next + scanner cycle.""" + held_completions.clear() + + # R (items) = Ei (column.items) + items = deque(column.items) + while items: + item = items.pop() # remove an element, A say, from R + + ### The Earley completer + if item.is_complete: ### (item.s == string) + if item.node is None: + item.node = make_symbol_node(item.s, item.start, column) + item.node.add_family(item.s, item.rule, item.start, None, None) + + # Empty has 0 length. If we complete an empty symbol in a particular + # parse step, we need to be able to use that same empty symbol to complete + # any predictions that result, that themselves require empty. Avoids + # infinite recursion on empty symbols. + # held_completions is 'H' in E. Scott's paper. + is_empty_item = item.start.i == column.i + if is_empty_item: + held_completions[item.rule.origin] = item.node + + originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s] + for originator in originators: + new_item = originator.advance() + new_item.node = make_symbol_node(new_item.s, originator.start, column) + new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node) + if new_item.expect in self.TERMINALS: + # Add (B :: aC.B, h, y) to Q + to_scan.add(new_item) + elif new_item not in column.items: + # Add (B :: aC.B, h, y) to Ei and R + column.add(new_item) + items.append(new_item) + + ### The Earley predictor + elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) + new_items = [] + for rule in self.predictions[item.expect]: + new_item = Item(rule, 0, column) + new_items.append(new_item) + + # Process any held completions (H). + if item.expect in held_completions: + new_item = item.advance() + new_item.node = make_symbol_node(new_item.s, item.start, column) + new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) + new_items.append(new_item) + + for new_item in new_items: + if new_item.expect in self.TERMINALS: + to_scan.add(new_item) + elif new_item not in column.items: + column.add(new_item) + items.append(new_item) + + def scan(i, token, column, to_scan): + """The core Earley Scanner. + + This is a custom implementation of the scanner that uses the + Lark lexer to match tokens. The scan list is built by the + Earley predictor, based on the previously completed tokens. 
+ This ensures that at each phase of the parse we have a custom + lexer context, allowing for more complex ambiguities.""" + next_set = Column(i+1, self.FIRST) + next_to_scan = set() + for item in set(to_scan): + if match(item.expect, token): + new_item = item.advance() + new_item.node = make_symbol_node(new_item.s, new_item.start, column) + new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) + + if new_item.expect in self.TERMINALS: + # add (B ::= Aai+1.B, h, y) to Q' + next_to_scan.add(new_item) + else: + # add (B ::= Aa+1.B, h, y) to Ei+1 + next_set.add(new_item) + + if not next_set and not next_to_scan: + expect = {i.expect.name for i in to_scan} + raise UnexpectedToken(token, expect, considered_rules = set(to_scan)) + + return next_set, next_to_scan # Main loop starts column0 = Column(0, self.FIRST) - column0.add(predict(start_symbol, column0)) column = column0 + + ## The scan buffer. 'Q' in E. Scott's paper. + to_scan = set() + + ## Predict for the start_symbol. + # Add predicted items to the first Earley set (for the predictor) if they + # result in a non-terminal, or the scanner if they result in a terminal. + for rule in self.predictions[start_symbol]: + item = Item(rule, 0, column0) + if item.expect in self.TERMINALS: + to_scan.add(item) + else: + column.add(item) + + ## The main Earley loop. + # Run the Prediction/Completion cycle for any Items in the current Earley set. + # Completions will be added to the SPPF tree, and predictions will be recursively + # processed down to terminals/empty nodes to be added to the scanner for the next + # step. for i, token in enumerate(stream): - predict_and_complete(column) - column = scan(i, token, column) + predict_and_complete(column, to_scan) + + # Clear the node_cache and token_cache, which are only relevant for each + # step in the Earley pass. + node_cache.clear() + token_cache.clear() + column, to_scan = scan(i, token, column, to_scan) - predict_and_complete(column) + predict_and_complete(column, to_scan) - # Parse ended. Now build a parse tree - solutions = [n.tree for n in column.to_reduce - if n.rule.origin==start_symbol and n.start is column0] + ## Column is now the final column in the parse. If the parse was successful, the start + # symbol should have been completed in the last step of the Earley cycle, and will be in + # this column. Find the item for the start_symbol, which is the root of the SPPF tree. + solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0] if not solutions: raise ParseError('Incomplete parse: Could not find a solution to input') - elif len(solutions) == 1: - tree = solutions[0] - else: - tree = Tree('_ambig', solutions) - - if self.resolve_ambiguity: - tree = self.resolve_ambiguity(tree) + elif len(solutions) > 1: + raise ParseError('Earley should not generate multiple start symbol items!') - return ApplyCallbacks(self.postprocess).transform(tree) + ## If we're not resolving ambiguity, we just return the root of the SPPF tree to the caller. + # This means the caller can work directly with the SPPF tree. + if not self.resolve_ambiguity: + return solutions[0] + # ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities + # according to the rules. 
+ return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go() class ApplyCallbacks(Transformer_InPlace): def __init__(self, postprocess): diff --git a/lark/parsers/earley_common.py b/lark/parsers/earley_common.py new file mode 100644 index 0000000..d17abe4 --- /dev/null +++ b/lark/parsers/earley_common.py @@ -0,0 +1,80 @@ +"This module implements the common building blocks of the Earley parsers" + +# This module holds the pieces shared by the standard and dynamic-lexer +# Earley parsers: the Item class (an Earley item), the Column class (an +# Earley set), and the Derivation tree. +# +# Author: Erez Shinan (2017) +# Email : erezshin@gmail.com + +## for recursive repr +from ..tree import Tree + +class Derivation(Tree): + def __init__(self, rule, children = None): + Tree.__init__(self, 'drv', children if children is not None else []) + self.meta.rule = rule + self._hash = None + + def __repr__(self, indent = 0): + return 'Derivation(%s, %s, %s)' % (self.data, self.meta.rule.origin, '...') + + def __hash__(self): + if self._hash is None: + self._hash = Tree.__hash__(self) + return self._hash + +class Item(object): + "An Earley Item, the atom of the algorithm." + + __slots__ = ('s', 'rule', 'ptr', 'start', 'is_complete', 'expect', 'node', '_hash') + def __init__(self, rule, ptr, start): + self.is_complete = len(rule.expansion) == ptr + self.rule = rule # rule + self.ptr = ptr # ptr + self.start = start # j + self.node = None # w + if self.is_complete: + self.s = rule.origin + self.expect = None + else: + self.s = (rule, ptr) + self.expect = rule.expansion[ptr] + self._hash = hash((self.s, self.start.i)) + + def advance(self): + return self.__class__(self.rule, self.ptr + 1, self.start) + + def __eq__(self, other): + return self is other or (self.s == other.s and self.start.i == other.start.i) + + def __hash__(self): + return self._hash + + def __repr__(self): + return '%s (%d)' % (self.s if self.is_complete else self.rule.origin, self.start.i) + +class Column: + "An entry in the table, aka Earley Chart. Contains a set of items." + def __init__(self, i, FIRST): + self.i = i + self.items = set() + self.FIRST = FIRST + + def add(self, item): + """Add an Item to the column's item set. The set makes sure only unique items are added.""" + self.items.add(item) + + def __bool__(self): + return bool(self.items) + + __nonzero__ = __bool__ # Py2 backwards-compatibility diff --git a/lark/parsers/earley_forest.py b/lark/parsers/earley_forest.py new file mode 100644 index 0000000..d6b8a3f --- /dev/null +++ b/lark/parsers/earley_forest.py @@ -0,0 +1,347 @@ +"""This module implements the SPPF (Shared Packed Parse Forest) + +This is used as the primary output mechanism for the Earley parser +in order to store complex ambiguities. 
+ + Full reference and more details are here: +http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ +""" + +from ..tree import Tree +from ..exceptions import ParseError +from ..lexer import Token +from ..utils import Str +from ..grammar import NonTerminal, Terminal +from .earley_common import Column, Derivation + +from collections import deque + +class SymbolNode(object): + """ + A Symbol Node represents a symbol (or Intermediate LR0). + + Symbol nodes are keyed by the symbol (s). For intermediate nodes + s will be an LR0, stored as a tuple of (rule, ptr). For completed symbol + nodes, s will be the non-terminal origin (i.e. + the left hand side of the rule). + + The children of a Symbol or Intermediate Node will always be Packed Nodes, + with each Packed Node child representing a single derivation of a production. + + Hence a Symbol Node with a single child is unambiguous. + """ + __slots__ = ('s', 'start', 'end', 'children', 'priority', 'is_intermediate') + def __init__(self, s, start, end): + self.s = s + self.start = start + self.end = end + self.children = set() + self.priority = None + self.is_intermediate = isinstance(s, tuple) + + def add_family(self, lr0, rule, start, left, right): + self.children.add(PackedNode(self, lr0, rule, start, left, right)) + + @property + def is_ambiguous(self): + return len(self.children) > 1 + + def __iter__(self): + return iter(self.children) + + def __eq__(self, other): + if not isinstance(other, SymbolNode): + return False + return self is other or (self.s == other.s and self.start == other.start and self.end is other.end) + + def __hash__(self): + return hash((self.s, self.start.i, self.end.i)) + + def __repr__(self): + symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name + return "(%s, %d, %d, %d)" % (symbol, self.start.i, self.end.i, self.priority if self.priority is not None else 0) + +class PackedNode(object): + """ + A Packed Node represents a single derivation in a symbol node. + """ + __slots__ = ('parent', 's', 'rule', 'start', 'left', 'right', 'priority', '_hash') + def __init__(self, parent, s, rule, start, left, right): + self.parent = parent + self.s = s + self.start = start + self.rule = rule + self.left = left + self.right = right + self.priority = None + self._hash = hash((self.s, self.start.i, self.left, self.right)) + + @property + def is_empty(self): + return self.left is None and self.right is None + + def __iter__(self): + return iter([self.left, self.right]) + + def __lt__(self, other): + if self.is_empty and not other.is_empty: return True + if self.priority < other.priority: return True + return False + + def __gt__(self, other): + if self.is_empty and not other.is_empty: return True + if self.priority > other.priority: return True + return False + + def __eq__(self, other): + if not isinstance(other, PackedNode): + return False + return self is other or (self.s == other.s and self.start == other.start and self.left == other.left and self.right == other.right) + + def __hash__(self): + return self._hash + + def __repr__(self): + symbol = self.s.name if isinstance(self.s, (NonTerminal, Terminal)) else self.s[0].origin.name + return "{%s, %d, %s, %s, %s}" % (symbol, self.start.i, self.left, self.right, self.priority if self.priority is not None else 0) + +class ForestVisitor(object): + """ + An abstract base class for building forest visitors. + + Use this as a base when you need to walk the forest. 
+ """ + def __init__(self, root): + self.root = root + self.result = None + + def visit_token_node(self, node): pass + def visit_symbol_node_in(self, node): pass + def visit_symbol_node_out(self, node): pass + def visit_packed_node_in(self, node): pass + def visit_packed_node_out(self, node): pass + + def go(self): + # Visiting is a list of IDs of all symbol/intermediate nodes currently in + # the stack. It serves two purposes: to detect when we 'recurse' in and out + # of a symbol/intermediate so that we can process both up and down. Also, + # since the SPPF can have cycles it allows us to detect if we're trying + # to recurse into a node that's already on the stack (infinite recursion). + visiting = set() + + # We do not use recursion here to walk the Forest due to the limited + # stack size in python. Therefore input_stack is essentially our stack. + input_stack = deque([self.root]) + + # It is much faster to cache these as locals since they are called + # many times in large parses. + vpno = getattr(self, 'visit_packed_node_out') + vpni = getattr(self, 'visit_packed_node_in') + vsno = getattr(self, 'visit_symbol_node_out') + vsni = getattr(self, 'visit_symbol_node_in') + vtn = getattr(self, 'visit_token_node') + while input_stack: + current = next(reversed(input_stack)) + try: + next_node = next(current) + except StopIteration: + input_stack.pop() + continue + except TypeError: + ### If the current object is not an iterator, pass through to Token/SymbolNode + pass + else: + if next_node is None: + continue + + if id(next_node) in visiting: + raise ParseError("Infinite recursion in grammar!") + + input_stack.append(next_node) + continue + + if isinstance(current, Str): + vtn(current) + input_stack.pop() + continue + + current_id = id(current) + if current_id in visiting: + if isinstance(current, PackedNode): vpno(current) + else: vsno(current) + input_stack.pop() + visiting.remove(current_id) + continue + else: + visiting.add(current_id) + if isinstance(current, PackedNode): next_node = vpni(current) + else: next_node = vsni(current) + if next_node is None: + continue + + if id(next_node) in visiting: + raise ParseError("Infinite recursion in grammar!") + + input_stack.append(next_node) + continue + + return self.result + +class ForestSumVisitor(ForestVisitor): + """ + A visitor for prioritizing ambiguous parts of the Forest. + + This visitor is the default when resolving ambiguity. It pushes the priorities + from the rules into the SPPF nodes; and then sorts the packed node children + of ambiguous symbol or intermediate node according to the priorities. + This relies on the custom sort function provided in PackedNode.__lt__; which + uses these properties (and other factors) to sort the ambiguous packed nodes. 
+ """ + def visit_packed_node_in(self, node): + return iter([node.left, node.right]) + + def visit_symbol_node_in(self, node): + return iter(node.children) + + def visit_packed_node_out(self, node): + node.priority = 0 + if node.rule.options and node.rule.options.priority: node.priority += node.rule.options.priority + if node.right is not None and hasattr(node.right, 'priority'): node.priority += node.right.priority + if node.left is not None and hasattr(node.left, 'priority'): node.priority += node.left.priority + + def visit_symbol_node_out(self, node): + node.priority = max(child.priority for child in node.children) + node.children = sorted(node.children, reverse = True) + +class ForestAntiscoreSumVisitor(ForestSumVisitor): + """ + A visitor for prioritizing ambiguous parts of the Forest. + + This visitor is used when resolve_ambiguity == 'resolve__antiscore_sum'. + It pushes the priorities from the rules into the SPPF nodes, and implements + a 'least cost' mechanism for resolving ambiguity (reverse of the default + priority mechanism). It uses a custom __lt__ comparator key for sorting + the packed node children. + """ + def visit_symbol_node_out(self, node): + node.priority = min(child.priority for child in node.children) + node.children = sorted(node.children, key=AntiscoreSumComparator, reverse = True) + +class AntiscoreSumComparator(object): + """ + An antiscore-sum comparator for PackedNode objects. + + This allows 'sorting' an iterable of PackedNode objects so that they + are arranged lowest priority first. + """ + __slots__ = ['obj'] + def __init__(self, obj, *args): + self.obj = obj + + def __lt__(self, other): + if self.obj.is_empty and not other.obj.is_empty: return True + if self.obj.priority > other.obj.priority: return True + return False + + def __gt__(self, other): + if self.obj.is_empty and not other.obj.is_empty: return True + if self.obj.priority < other.obj.priority: return True + return False + +class ForestToTreeVisitor(ForestVisitor): + """ + A Forest visitor which converts an SPPF forest to an unambiguous AST. + + The implementation in this visitor walks only the first ambiguous child + of each symbol node. When it finds an ambiguous symbol node it first + calls the forest_sum_visitor implementation to sort the children + into preference order using the algorithms defined there; so the first + child should always be the highest preference. The forest_sum_visitor + implementation should be another ForestVisitor which sorts the children + according to some priority mechanism. 
+ """ + def __init__(self, root, forest_sum_visitor = ForestSumVisitor, callbacks = None): + super(ForestToTreeVisitor, self).__init__(root) + self.forest_sum_visitor = forest_sum_visitor + self.output_stack = deque() + self.callbacks = callbacks + self.result = None + + def visit_token_node(self, node): + self.output_stack[-1].append(node) + + def visit_symbol_node_in(self, node): + if node.is_ambiguous and node.priority is None: + self.forest_sum_visitor(node).go() + return next(iter(node.children)) + + def visit_packed_node_in(self, node): + if not node.parent.is_intermediate: + self.output_stack.append([]) + return iter([node.left, node.right]) + + def visit_packed_node_out(self, node): + if not node.parent.is_intermediate: + result = self.callbacks[node.rule](self.output_stack.pop()) + if self.output_stack: + self.output_stack[-1].append(result) + else: + self.result = result + +class ForestToAmbiguousTreeVisitor(ForestVisitor): + """ + A Forest visitor which converts an SPPF forest to an ambiguous AST. + + Because of the fundamental disparity between what can be stored in + an SPPF and what can be stored in a Tree; this implementation is not + complete. It correctly deals with ambiguities that occur on symbol nodes only, + and cannot deal with ambiguities that occur on intermediate nodes. + + Usually, most parsers can be rewritten to avoid intermediate node + ambiguities. Also, this implementation could be fixed, however + the code to handle intermediate node ambiguities is messy and + would not be performant. It is much better not to use this and + instead to correctly disambiguate the forest and only store unambiguous + parses in Trees. It is here just to provide some parity with the + old ambiguity='explicit'. + + This is mainly used by the test framework, to make it simpler to write + tests ensuring the SPPF contains the right results. + """ + def __init__(self, root, callbacks): + super(ForestToAmbiguousTreeVisitor, self).__init__(root) + self.output_stack = deque() + self.callbacks = callbacks + self.result = None + + def visit_token_node(self, node): + self.output_stack[-1].children.append(node) + + def visit_symbol_node_in(self, node): + if not node.is_intermediate and node.is_ambiguous: + self.output_stack.append(Tree('_ambig', [])) + return iter(node.children) + + def visit_symbol_node_out(self, node): + if node.is_ambiguous: + result = self.output_stack.pop() + if self.output_stack: + self.output_stack[-1].children.append(result) + else: + self.result = result + + def visit_packed_node_in(self, node): + #### NOTE: + ## When an intermediate node (node.parent.s == tuple) has ambiguous children this + ## forest visitor will break. 
+ if not node.parent.is_intermediate: + self.output_stack.append(Tree('drv', [])) + return iter([node.left, node.right]) + + def visit_packed_node_out(self, node): + if not node.parent.is_intermediate: + result = self.callbacks[node.rule](self.output_stack.pop().children) + if self.output_stack: + self.output_stack[-1].children.append(result) + else: + self.result = result \ No newline at end of file diff --git a/lark/parsers/resolve_ambig.py b/lark/parsers/resolve_ambig.py deleted file mode 100644 index 2470eb9..0000000 --- a/lark/parsers/resolve_ambig.py +++ /dev/null @@ -1,109 +0,0 @@ -from ..utils import compare -from functools import cmp_to_key - -from ..tree import Tree - - -# Standard ambiguity resolver (uses comparison) -# -# Author: Erez Sh - -def _compare_rules(rule1, rule2): - return -compare( len(rule1.expansion), len(rule2.expansion)) - -def _sum_priority(tree): - p = 0 - - for n in tree.iter_subtrees(): - try: - p += n.meta.rule.options.priority or 0 - except AttributeError: - pass - - return p - -def _compare_priority(tree1, tree2): - tree1.iter_subtrees() - -def _compare_drv(tree1, tree2): - try: - rule1 = tree1.meta.rule - except AttributeError: - rule1 = None - - try: - rule2 = tree2.meta.rule - except AttributeError: - rule2 = None - - if None == rule1 == rule2: - return compare(tree1, tree2) - elif rule1 is None: - return -1 - elif rule2 is None: - return 1 - - assert tree1.data != '_ambig' - assert tree2.data != '_ambig' - - p1 = _sum_priority(tree1) - p2 = _sum_priority(tree2) - c = (p1 or p2) and compare(p1, p2) - if c: - return c - - c = _compare_rules(tree1.meta.rule, tree2.meta.rule) - if c: - return c - - # rules are "equal", so compare trees - if len(tree1.children) == len(tree2.children): - for t1, t2 in zip(tree1.children, tree2.children): - c = _compare_drv(t1, t2) - if c: - return c - - return compare(len(tree1.children), len(tree2.children)) - - -def _standard_resolve_ambig(tree): - assert tree.data == '_ambig' - key_f = cmp_to_key(_compare_drv) - best = max(tree.children, key=key_f) - assert best.data == 'drv' - tree.set('drv', best.children) - tree.meta.rule = best.meta.rule # needed for applying callbacks - -def standard_resolve_ambig(tree): - for ambig in tree.find_data('_ambig'): - _standard_resolve_ambig(ambig) - - return tree - - - - -# Anti-score Sum -# -# Author: Uriva (https://github.com/uriva) - -def _antiscore_sum_drv(tree): - if not isinstance(tree, Tree): - return 0 - - assert tree.data != '_ambig' - - return _sum_priority(tree) - -def _antiscore_sum_resolve_ambig(tree): - assert tree.data == '_ambig' - best = min(tree.children, key=_antiscore_sum_drv) - assert best.data == 'drv' - tree.set('drv', best.children) - tree.meta.rule = best.meta.rule # needed for applying callbacks - -def antiscore_sum_resolve_ambig(tree): - for ambig in tree.find_data('_ambig'): - _antiscore_sum_resolve_ambig(ambig) - - return tree diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index f3a5dd6..89eda17 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -1,107 +1,163 @@ -"This module implements an experimental Earley Parser with a dynamic lexer" - -# The parser uses a parse-forest to keep track of derivations and ambiguations. -# When the parse ends successfully, a disambiguation stage resolves all ambiguity -# (right now ambiguity resolution is not developed beyond the needs of lark) -# Afterwards the parse tree is reduced (transformed) according to user callbacks. 
-# I use the no-recursion version of Transformer and Visitor, because the tree might be -# deeper than Python's recursion limit (a bit absurd, but that's life) -# -# The algorithm keeps track of each state set, using a corresponding Column instance. -# Column keeps track of new items using NewsList instances. -# -# Instead of running a lexer beforehand, or using a costy char-by-char method, this parser -# uses regular expressions by necessity, achieving high-performance while maintaining all of -# Earley's power in parsing any CFG. -# -# +"""This module implements an experimental Earley parser with a dynamic lexer + +The core Earley algorithm used here is based on Elizabeth Scott's implementation, here: + https://www.sciencedirect.com/science/article/pii/S1571066108001497 + +That is probably the best reference for understanding the algorithm here. + +The Earley parser outputs an SPPF-tree as per that document. The SPPF tree format +is better documented here: + http://www.bramvandersanden.com/post/2014/06/shared-packed-parse-forest/ + +Instead of running a lexer beforehand, or using a costly char-by-char method, this parser +uses regular expressions by necessity, achieving high performance while maintaining all of +Earley's power in parsing any CFG. +""" # Author: Erez Shinan (2017) # Email : erezshin@gmail.com -from collections import defaultdict +from collections import defaultdict, deque from ..exceptions import ParseError, UnexpectedCharacters from ..lexer import Token from ..tree import Tree from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal, Terminal - -from .earley import ApplyCallbacks, Item, Column +from .earley import ApplyCallbacks +from .earley_common import Column, Item +from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode class Parser: - def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False, complete_lex=False): - self.analysis = GrammarAnalyzer(parser_conf) + def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, forest_sum_visitor = ForestSumVisitor, ignore = (), complete_lex = False): + analysis = GrammarAnalyzer(parser_conf) self.parser_conf = parser_conf self.resolve_ambiguity = resolve_ambiguity + self.forest_sum_visitor = forest_sum_visitor self.ignore = [Terminal(t) for t in ignore] - self.predict_all = predict_all self.complete_lex = complete_lex - self.FIRST = self.analysis.FIRST - self.postprocess = {} + self.FIRST = analysis.FIRST + self.callbacks = {} self.predictions = {} + + ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than + # the slow 'isupper' in is_terminal. 
+ self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term } + self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term } for rule in parser_conf.rules: - self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) - self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] + self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None) + self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] self.term_matcher = term_matcher - def parse(self, stream, start_symbol=None): - # Define parser functions start_symbol = NonTerminal(start_symbol or self.parser_conf.start) delayed_matches = defaultdict(list) match = self.term_matcher - text_line = 1 - text_column = 1 - - def predict(nonterm, column): - assert not nonterm.is_term, nonterm - return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] - - def complete(item): - name = item.rule.origin - return [i.advance(item.tree) for i in item.start.to_predict if i.expect == name] - - def predict_and_complete(column): - while True: - to_predict = {x.expect for x in column.to_predict.get_news() - if x.ptr} # if not part of an already predicted batch - to_reduce = column.to_reduce.get_news() - if not (to_predict or to_reduce): - break - - for nonterm in to_predict: - column.add( predict(nonterm, column) ) - for item in to_reduce: - new_items = list(complete(item)) - if item in new_items: - raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) - column.add(new_items) - - def scan(i, column): - to_scan = column.to_scan + # Held Completions (H in E. Scott's paper). + held_completions = {} - for x in self.ignore: - m = match(x, stream, i) - if m: - delayed_matches[m.end()] += set(to_scan) - delayed_matches[m.end()] += set(column.to_reduce) + # Cache for nodes & tokens created in a particular parse step. + node_cache = {} + token_cache = {} - # TODO add partial matches for ignore too? - # s = m.group(0) - # for j in range(1, len(s)): - # m = x.match(s[:-j]) - # if m: - # delayed_matches[m.end()] += to_scan + text_line = 1 + text_column = 0 - for item in to_scan: + def make_symbol_node(s, start, end): + label = (s, start.i, end.i) + if label in node_cache: + node = node_cache[label] + else: + node = node_cache[label] = SymbolNode(s, start, end) + return node + + def predict_and_complete(column, to_scan): + """The core Earley Predictor and Completer. + + At each stage of the input, we handle any completed items (things + that matched on the last cycle) and use those to predict what should + come next in the input stream. The completions and any predicted + non-terminals are recursively processed until we reach a set of items + expecting terminals, which can be added to the scan list for the next + scanner cycle.""" + held_completions.clear() + + # R (items) = Ei (column.items) + items = deque(column.items) + while items: + item = items.pop() # remove an element, A say, from R + + ### The Earley completer + if item.is_complete: ### (item.s == string) + if item.node is None: + item.node = make_symbol_node(item.s, item.start, column) + item.node.add_family(item.s, item.rule, item.start, None, None) + + # Empty has 0 length. If we complete an empty symbol in a particular + # parse step, we need to be able to use that same empty symbol to complete + # any predictions that result, that themselves require empty. Avoids + # infinite recursion on empty symbols. 
+ # held_completions is 'H' in E. Scott's paper. + is_empty_item = item.start.i == column.i + if is_empty_item: + held_completions[item.rule.origin] = item.node + + originators = [originator for originator in item.start.items if originator.expect is not None and originator.expect == item.s] + for originator in originators: + new_item = originator.advance() + new_item.node = make_symbol_node(new_item.s, originator.start, column) + new_item.node.add_family(new_item.s, new_item.rule, new_item.start, originator.node, item.node) + if new_item.expect in self.TERMINALS: + # Add (B :: aC.B, h, y) to Q + to_scan.add(new_item) + elif new_item not in column.items: + # Add (B :: aC.B, h, y) to Ei and R + column.add(new_item) + items.append(new_item) + + ### The Earley predictor + elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) + new_items = [] + for rule in self.predictions[item.expect]: + new_item = Item(rule, 0, column) + new_items.append(new_item) + + # Process any held completions (H). + if item.expect in held_completions: + new_item = item.advance() + new_item.node = make_symbol_node(new_item.s, item.start, column) + new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) + new_items.append(new_item) + + for new_item in new_items: + if new_item.expect in self.TERMINALS: + to_scan.add(new_item) + elif new_item not in column.items: + column.add(new_item) + items.append(new_item) + + def scan(i, column, to_scan): + """The core Earley Scanner. + + This is a custom implementation of the scanner that uses the + Lark lexer to match tokens. The scan list is built by the + Earley predictor, based on the previously completed tokens. + This ensures that at each phase of the parse we have a custom + lexer context, allowing for more complex ambiguities.""" + + # 1) Loop the expectations and ask the lexer to match. + # Since regexp is forward looking on the input stream, and we only + # want to process tokens when we hit the point in the stream at which + # they complete, we push all tokens into a buffer (delayed_matches), to + # be held possibly for a later parse step when we reach the point in the + # input stream at which they complete. + for item in set(to_scan): m = match(item.expect, stream, i) if m: t = Token(item.expect.name, m.group(0), i, text_line, text_column) - delayed_matches[m.end()].append(item.advance(t)) + delayed_matches[m.end()].append( (item, column, t) ) if self.complete_lex: s = m.group(0) @@ -109,25 +165,85 @@ class Parser: m = match(item.expect, s[:-j]) if m: t = Token(item.expect.name, m.group(0), i, text_line, text_column) - delayed_matches[i+m.end()].append(item.advance(t)) + delayed_matches[i+m.end()].append( (item, column, t) ) + + # Remove any items that successfully matched in this pass from the to_scan buffer. + # This ensures we don't carry over tokens that already matched, if we're ignoring below. + to_scan.remove(item) + + # 2) Process any ignores. This is typically used for e.g. whitespace. + # We carry over any unmatched items from the to_scan buffer to be matched again after + # the ignore. This should allow us to use ignored symbols in non-terminals to implement + # e.g. mandatory spacing. + for x in self.ignore: + m = match(x, stream, i) + if m: + # Carry over any items still in the scan buffer, to past the end of the ignored items. 
+ delayed_matches[m.end()].extend([(item, column, None) for item in to_scan ]) + + # If we're ignoring up to the end of the file, carry over the start symbol if it already completed. + delayed_matches[m.end()].extend([(item, column, None) for item in column.items if item.is_complete and item.s == start_symbol]) + + next_set = Column(i + 1, self.FIRST) # Ei+1 + next_to_scan = set() + + ## 3) Process Tokens from delayed_matches. + # This is the core of the Earley scanner. Create an SPPF node for each Token, + # and create the symbol node in the SPPF tree. Advance the item that completed, + # and add the resulting new item to either the Earley set (for processing by the + # completer/predictor) or the to_scan buffer for the next parse step. + for item, start, token in delayed_matches[i+1]: + if token is not None: + new_item = item.advance() + new_item.node = make_symbol_node(new_item.s, new_item.start, column) + new_item.node.add_family(new_item.s, item.rule, new_item.start, item.node, token) + else: + new_item = item + + if new_item.expect in self.TERMINALS: + # add (B ::= Aai+1.B, h, y) to Q' + next_to_scan.add(new_item) + else: + # add (B ::= Aa+1.B, h, y) to Ei+1 + next_set.add(new_item) - next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) - next_set.add(delayed_matches[i+1]) del delayed_matches[i+1] # No longer needed, so unburden memory - if not next_set and not delayed_matches: + if not next_set and not delayed_matches and not next_to_scan: raise UnexpectedCharacters(stream, i, text_line, text_column, {item.expect for item in to_scan}, set(to_scan)) - return next_set + return next_set, next_to_scan # Main loop starts - column0 = Column(0, self.FIRST, predict_all=self.predict_all) - column0.add(predict(start_symbol, column0)) - + column0 = Column(0, self.FIRST) column = column0 + + ## The scan buffer. 'Q' in E. Scott's paper. + to_scan = set() + + ## Predict for the start_symbol. + # Add predicted items to the first Earley set (for the predictor) if they + # result in a non-terminal, or the scanner if they result in a terminal. + for rule in self.predictions[start_symbol]: + item = Item(rule, 0, column0) + if item.expect in self.TERMINALS: + to_scan.add(item) + else: + column.add(item) + + ## The main Earley loop. + # Run the Prediction/Completion cycle for any Items in the current Earley set. + # Completions will be added to the SPPF tree, and predictions will be recursively + # processed down to terminals/empty nodes to be added to the scanner for the next + # step. for i, token in enumerate(stream): - predict_and_complete(column) - column = scan(i, column) + predict_and_complete(column, to_scan) + + # Clear the node_cache and token_cache, which are only relevant for each + # step in the Earley pass. + node_cache.clear() + token_cache.clear() + column, to_scan = scan(i, column, to_scan) if token == '\n': text_line += 1 @@ -135,24 +251,24 @@ else: text_column += 1 - predict_and_complete(column) + predict_and_complete(column, to_scan) - # Parse ended. Now build a parse tree - solutions = [n.tree for n in column.to_reduce - if n.rule.origin==start_symbol and n.start is column0] + ## Column is now the final column in the parse. If the parse was successful, the start + # symbol should have been completed in the last step of the Earley cycle, and will be in + # this column. Find the item for the start_symbol, which is the root of the SPPF tree. 
+ solutions = [n.node for n in column.items if n.is_complete and n.node is not None and n.s == start_symbol and n.start is column0] if not solutions: - expected_tokens = [t.expect for t in column.to_scan] + expected_tokens = [t.expect for t in to_scan] raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens) + elif len(solutions) > 1: + raise Exception('Earley should not generate more than one start symbol - bug') - elif len(solutions) == 1: - tree = solutions[0] - else: - tree = Tree('_ambig', solutions) - - if self.resolve_ambiguity: - tree = self.resolve_ambiguity(tree) - - return ApplyCallbacks(self.postprocess).transform(tree) - + ## If we're not resolving ambiguity, we just return the root of the SPPF tree to the caller. + # This means the caller can work directly with the SPPF tree. + if not self.resolve_ambiguity: + return solutions[0] + # ... otherwise, disambiguate and convert the SPPF to an AST, removing any ambiguities + # according to the rules. + return ForestToTreeVisitor(solutions[0], self.forest_sum_visitor, self.callbacks).go() diff --git a/lark/reconstruct.py b/lark/reconstruct.py index a21f155..201e671 100644 --- a/lark/reconstruct.py +++ b/lark/reconstruct.py @@ -114,7 +114,7 @@ class Reconstructor: def _reconstruct(self, tree): # TODO: ambiguity? - parser = earley.Parser(ParserConf(self.rules, None, tree.data), self._match, resolve_ambiguity=resolve_ambig.standard_resolve_ambig) + parser = earley.Parser(ParserConf(self.rules, None, tree.data), self._match, resolve_ambiguity=True) unreduced_tree = parser.parse(tree.children) # find a full derivation assert unreduced_tree.data == tree.data res = self.write_tokens.transform(unreduced_tree) diff --git a/tests/test_parser.py b/tests/test_parser.py index 948f56f..77eabd1 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -21,6 +21,8 @@ from lark.lark import Lark from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput from lark.tree import Tree from lark.visitors import Transformer +from lark.parsers.earley_forest import ForestToAmbiguousTreeVisitor +from lark.parsers.earley import ApplyCallbacks __path__ = os.path.dirname(__file__) def _read(n, *args): @@ -236,10 +238,11 @@ def _make_full_earley_test(LEXER): """ parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit') - res = parser.parse('ab') - - self.assertEqual( res.data, '_ambig') - self.assertEqual( len(res.children), 2) + root_symbol = parser.parse('ab') + ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go() + print(ambig_tree.pretty()) + self.assertEqual( ambig_tree.data, '_ambig') + self.assertEqual( len(ambig_tree.children), 2) def test_ambiguity1(self): grammar = """ @@ -251,9 +254,35 @@ def _make_full_earley_test(LEXER): """ l = Lark(grammar, parser='earley', ambiguity='explicit', lexer=LEXER) - x = l.parse('cde') - assert x.data == '_ambig', x - assert len(x.children) == 2 + root_symbol = l.parse('cde') + ambig_tree = ForestToAmbiguousTreeVisitor(root_symbol, l.parser.parser.callbacks).go() + print(ambig_tree.pretty()) +# tree = ApplyCallbacks(l.parser.parser.postprocess).transform(ambig_tree) + + assert ambig_tree.data == '_ambig', ambig_tree + assert len(ambig_tree.children) == 2 + + @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") + def test_ambiguity2(self): + grammar = """ + ANY: /[a-zA-Z0-9 ]+/ + a.2: "A" b+ + b.2: "B" + c: ANY + + start: (a|c)* + """ + l = Lark(grammar, parser='earley', 
lexer=LEXER) + res = l.parse('ABX') + expected = Tree('start', [ + Tree('a', [ + Tree('b', []) + ]), + Tree('c', [ + 'X' + ]) + ]) + self.assertEqual(res, expected) def test_fruitflies_ambig(self): grammar = """ @@ -272,7 +301,9 @@ def _make_full_earley_test(LEXER): %ignore WS """ parser = Lark(grammar, ambiguity='explicit', lexer=LEXER) - res = parser.parse('fruit flies like bananas') + root_symbol = parser.parse('fruit flies like bananas') + tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go() expected = Tree('_ambig', [ Tree('comparative', [ @@ -290,7 +321,7 @@ def _make_full_earley_test(LEXER): # print res.pretty() # print expected.pretty() - self.assertEqual(res, expected) + self.assertEqual(tree, expected) @unittest.skipIf(LEXER=='dynamic', "Only relevant for the dynamic_complete parser") @@ -303,7 +334,9 @@ def _make_full_earley_test(LEXER): text = """cat""" parser = Lark(grammar, start='start', ambiguity='explicit') - tree = parser.parse(text) + root_symbol = parser.parse(text) + tree = ForestToAmbiguousTreeVisitor(root_symbol, parser.parser.parser.callbacks).go() self.assertEqual(tree.data, '_ambig') combinations = {tuple(str(s) for s in t.children) for t in tree.children}
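
For reviewers wanting to try the new explicit-ambiguity flow end-to-end, the test changes above boil down to the following pattern. This is a minimal sketch against this branch; the arithmetic grammar and variable names are illustrative, not taken from the PR:

```python
from lark import Lark
from lark.parsers.earley_forest import ForestToAmbiguousTreeVisitor

# A classic structurally ambiguous grammar: "1+2+3" can associate
# left or right, so the SPPF holds two derivations of the outer expr.
grammar = """
start: expr
expr: expr "+" expr
    | NUM
NUM: /[0-9]+/
"""

parser = Lark(grammar, parser='earley', ambiguity='explicit')

# On this branch, parse() returns the root SymbolNode of the SPPF
# instead of a Tree containing '_ambig' nodes.
root = parser.parse('1+2+3')

# To recover the old '_ambig' Tree, walk the forest explicitly.
# parser.parser.parser reaches the underlying earley.Parser; its
# `callbacks` dict (renamed from `postprocess`) builds the tree nodes.
ambig_tree = ForestToAmbiguousTreeVisitor(root, parser.parser.parser.callbacks).go()
print(ambig_tree.pretty())   # the '_ambig' node appears under the expr rule
```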
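The ForestVisitor base class in earley_forest.py is also usable on its own: a subclass returns iterators for the children it wants walked, exactly as ForestSumVisitor does. The counter below is not part of this diff — it is a hypothetical illustration of the visitor API:

```python
from lark.parsers.earley_forest import ForestVisitor

class AmbiguityCounter(ForestVisitor):
    """Count ambiguous symbol/intermediate nodes in an SPPF.

    Note: a node reachable through several parents is counted once per
    path, since ForestVisitor only tracks nodes currently on the stack.
    """
    def go(self):
        self.result = 0
        return super(AmbiguityCounter, self).go()

    def visit_symbol_node_in(self, node):
        if node.is_ambiguous:
            self.result += 1
        # Returning an iterator tells ForestVisitor.go() to descend
        # into these children (the same pattern ForestSumVisitor uses).
        return iter(node.children)

    def visit_packed_node_in(self, node):
        return iter([node.left, node.right])

# Usage, with `root` being the SymbolNode returned by an explicit parse:
# print(AmbiguityCounter(root).go())
```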
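Finally, the ambiguity-resolution options now map onto forest visitors via get_ambiguity_options() rather than resolve_ambig callbacks. The grammar below is borrowed from the new test_ambiguity2; the antiscore variant at the end is an assumption about how the option should behave, not a case covered by the tests in this diff:

```python
from lark import Lark

# With the default ambiguity='resolve', ForestSumVisitor sums the rule
# priorities (the `.2` suffixes) through the SPPF and keeps the
# highest-priority derivation, so 'AB' parses via the a/b rules rather
# than the catch-all c rule.
grammar = r"""
ANY: /[a-zA-Z0-9 ]+/
a.2: "A" b+
b.2: "B"
c: ANY

start: (a|c)*
"""

parser = Lark(grammar, parser='earley', lexer='dynamic')
print(parser.parse('ABX').pretty())   # start -> a(b), c('X')

# ambiguity='resolve__antiscore_sum' swaps in ForestAntiscoreSumVisitor,
# which prefers the *lowest* total priority instead; here the low-priority
# catch-all c rule should then win the ambiguity (assumed behavior).
parser_low = Lark(grammar, parser='earley', lexer='dynamic',
                  ambiguity='resolve__antiscore_sum')
print(parser_low.parse('ABX').pretty())
```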