From 57a602758d8821259f5ce1bc5762baa580ec3a98 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Mon, 7 Jan 2019 16:24:25 +0200 Subject: [PATCH] Merged part of Earley & XEarley implementation. Major reduction --- lark/parsers/earley.py | 202 ++++++++++++++++++++-------------------- lark/parsers/xearley.py | 190 ++----------------------------------- 2 files changed, 107 insertions(+), 285 deletions(-) diff --git a/lark/parsers/earley.py b/lark/parsers/earley.py index f05e808..41dc6e3 100644 --- a/lark/parsers/earley.py +++ b/lark/parsers/earley.py @@ -17,7 +17,7 @@ from ..exceptions import ParseError, UnexpectedToken from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal from .earley_common import Item, TransitiveItem -from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode +from .earley_forest import ForestToTreeVisitor, ForestSumVisitor, SymbolNode, ForestToAmbiguousTreeVisitor class Parser: def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True): @@ -54,16 +54,105 @@ class Parser: self.term_matcher = term_matcher + def predict_and_complete(self, i, to_scan, columns, transitives): + """The core Earley Predictor and Completer. + + At each stage of the input, we handling any completed items (things + that matched on the last cycle) and use those to predict what should + come next in the input stream. The completions and any predicted + non-terminals are recursively processed until we reach a set of, + which can be added to the scan list for the next scanner cycle.""" + # Held Completions (H in E.Scotts paper). + node_cache = {} + held_completions = {} + + column = columns[i] + # R (items) = Ei (column.items) + items = deque(column) + while items: + item = items.pop() # remove an element, A say, from R + + ### The Earley completer + if item.is_complete: ### (item.s == string) + if item.node is None: + label = (item.s, item.start, i) + item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + item.node.add_family(item.s, item.rule, item.start, None, None) + + # create_leo_transitives(item.rule.origin, item.start) + + ###R Joop Leo right recursion Completer + if item.rule.origin in transitives[item.start]: + transitive = transitives[item.start][item.s] + if transitive.previous in transitives[transitive.column]: + root_transitive = transitives[transitive.column][transitive.previous] + else: + root_transitive = transitive + + new_item = Item(transitive.rule, transitive.ptr, transitive.start) + label = (root_transitive.s, root_transitive.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_path(root_transitive, item.node) + if new_item.expect in self.TERMINALS: + # Add (B :: aC.B, h, y) to Q + to_scan.add(new_item) + elif new_item not in column: + # Add (B :: aC.B, h, y) to Ei and R + column.add(new_item) + items.append(new_item) + ###R Regular Earley completer + else: + # Empty has 0 length. If we complete an empty symbol in a particular + # parse step, we need to be able to use that same empty symbol to complete + # any predictions that result, that themselves require empty. Avoids + # infinite recursion on empty symbols. + # held_completions is 'H' in E.Scott's paper. + is_empty_item = item.start == i + if is_empty_item: + held_completions[item.rule.origin] = item.node + + originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] + for originator in originators: + new_item = originator.advance() + label = (new_item.s, originator.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node) + if new_item.expect in self.TERMINALS: + # Add (B :: aC.B, h, y) to Q + to_scan.add(new_item) + elif new_item not in column: + # Add (B :: aC.B, h, y) to Ei and R + column.add(new_item) + items.append(new_item) + + ### The Earley predictor + elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) + new_items = [] + for rule in self.predictions[item.expect]: + new_item = Item(rule, 0, i) + new_items.append(new_item) + + # Process any held completions (H). + if item.expect in held_completions: + new_item = item.advance() + label = (new_item.s, item.start, i) + new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) + new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) + new_items.append(new_item) + + for new_item in new_items: + if new_item.expect in self.TERMINALS: + to_scan.add(new_item) + elif new_item not in column: + column.add(new_item) + items.append(new_item) + def parse(self, stream, start_symbol=None): # Define parser functions start_symbol = NonTerminal(start_symbol or self.parser_conf.start) match = self.term_matcher - # Held Completions (H in E.Scotts paper). - held_completions = {} - # Cache for nodes & tokens created in a particular parse step. - node_cache = {} token_cache = {} columns = [] transitives = [] @@ -74,10 +163,9 @@ class Parser: quasi = item.advance() while not quasi.is_complete: - symbol = quasi.expect - if symbol not in self.NULLABLE: + if quasi.expect not in self.NULLABLE: return False - if quasi.rule.origin == start_symbol and symbol == start_symbol: + if quasi.rule.origin == start_symbol and quasi.expect == start_symbol: return False quasi = quasi.advance() return True @@ -135,96 +223,7 @@ class Parser: titem = TransitiveItem(origin, trule, originator, start) previous = transitives[start][origin] = titem - def predict_and_complete(i, to_scan): - """The core Earley Predictor and Completer. - - At each stage of the input, we handling any completed items (things - that matched on the last cycle) and use those to predict what should - come next in the input stream. The completions and any predicted - non-terminals are recursively processed until we reach a set of, - which can be added to the scan list for the next scanner cycle.""" - held_completions.clear() - - column = columns[i] - # R (items) = Ei (column.items) - items = deque(column) - while items: - item = items.pop() # remove an element, A say, from R - - ### The Earley completer - if item.is_complete: ### (item.s == string) - if item.node is None: - label = (item.s, item.start, i) - item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) - item.node.add_family(item.s, item.rule, item.start, None, None) - - # create_leo_transitives(item.rule.origin, item.start) - - ###R Joop Leo right recursion Completer - if item.rule.origin in transitives[item.start]: - transitive = transitives[item.start][item.s] - if transitive.previous in transitives[transitive.column]: - root_transitive = transitives[transitive.column][transitive.previous] - else: - root_transitive = transitive - - new_item = Item(transitive.rule, transitive.ptr, transitive.start) - label = (root_transitive.s, root_transitive.start, i) - new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) - new_item.node.add_path(root_transitive, item.node) - if new_item.expect in self.TERMINALS: - # Add (B :: aC.B, h, y) to Q - to_scan.add(new_item) - elif new_item not in column: - # Add (B :: aC.B, h, y) to Ei and R - column.add(new_item) - items.append(new_item) - ###R Regular Earley completer - else: - # Empty has 0 length. If we complete an empty symbol in a particular - # parse step, we need to be able to use that same empty symbol to complete - # any predictions that result, that themselves require empty. Avoids - # infinite recursion on empty symbols. - # held_completions is 'H' in E.Scott's paper. - is_empty_item = item.start == i - if is_empty_item: - held_completions[item.rule.origin] = item.node - - originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] - for originator in originators: - new_item = originator.advance() - label = (new_item.s, originator.start, i) - new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) - new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node) - if new_item.expect in self.TERMINALS: - # Add (B :: aC.B, h, y) to Q - to_scan.add(new_item) - elif new_item not in column: - # Add (B :: aC.B, h, y) to Ei and R - column.add(new_item) - items.append(new_item) - - ### The Earley predictor - elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) - new_items = [] - for rule in self.predictions[item.expect]: - new_item = Item(rule, 0, i) - new_items.append(new_item) - - # Process any held completions (H). - if item.expect in held_completions: - new_item = item.advance() - label = (new_item.s, item.start, i) - new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) - new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) - new_items.append(new_item) - for new_item in new_items: - if new_item.expect in self.TERMINALS: - to_scan.add(new_item) - elif new_item not in column: - column.add(new_item) - items.append(new_item) def scan(i, token, to_scan): """The core Earley Scanner. @@ -237,8 +236,8 @@ class Parser: next_to_scan = set() next_set = set() columns.append(next_set) - next_transitives = dict() - transitives.append(next_transitives) + transitives.append({}) + node_cache = {} for item in set(to_scan): if match(item.expect, token): @@ -284,15 +283,14 @@ class Parser: # step. i = 0 for token in stream: - predict_and_complete(i, to_scan) + self.predict_and_complete(i, to_scan, columns, transitives) # Clear the node_cache and token_cache, which are only relevant for each # step in the Earley pass. - node_cache.clear() to_scan = scan(i, token, to_scan) i += 1 - predict_and_complete(i, to_scan) + self.predict_and_complete(i, to_scan, columns, transitives) ## Column is now the final column in the parse. If the parse was successful, the start # symbol should have been completed in the last step of the Earley cycle, and will be in diff --git a/lark/parsers/xearley.py b/lark/parsers/xearley.py index c985048..aa600f0 100644 --- a/lark/parsers/xearley.py +++ b/lark/parsers/xearley.py @@ -22,56 +22,23 @@ from ..exceptions import ParseError, UnexpectedCharacters from ..lexer import Token from .grammar_analysis import GrammarAnalyzer from ..grammar import NonTerminal, Terminal -from .earley import ApplyCallbacks +from .earley import ApplyCallbacks, Parser as BaseParser from .earley_common import Item, TransitiveItem from .earley_forest import ForestToTreeVisitor, ForestToAmbiguousTreeVisitor, ForestSumVisitor, ForestToPyDotVisitor, SymbolNode -class Parser: +class Parser(BaseParser): def __init__(self, parser_conf, term_matcher, resolve_ambiguity=True, ignore = (), complete_lex = False): - analysis = GrammarAnalyzer(parser_conf) - self.parser_conf = parser_conf - self.resolve_ambiguity = resolve_ambiguity + BaseParser.__init__(self, parser_conf, term_matcher, resolve_ambiguity) self.ignore = [Terminal(t) for t in ignore] self.complete_lex = complete_lex - self.FIRST = analysis.FIRST - self.NULLABLE = analysis.NULLABLE - self.callbacks = {} - self.predictions = {} - - ## These could be moved to the grammar analyzer. Pre-computing these is *much* faster than - # the slow 'isupper' in is_terminal; or even called sym.is_term directly. - self.TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if sym.is_term } - self.NON_TERMINALS = { sym for r in parser_conf.rules for sym in r.expansion if not sym.is_term } - - self.forest_sum_visitor = None - for rule in parser_conf.rules: - self.callbacks[rule] = getattr(parser_conf.callback, rule.alias or rule.origin, None) - self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)] - - ## Detect if any rules have priorities set. If the user specified priority = "none" then - # the priorities will be stripped from all rules before they reach us, allowing us to - # skip the extra tree walk. - if self.forest_sum_visitor is None and rule.options and rule.options.priority is not None: - self.forest_sum_visitor = ForestSumVisitor() - - if resolve_ambiguity: - self.forest_tree_visitor = ForestToTreeVisitor(self.callbacks, self.forest_sum_visitor) - else: - self.forest_tree_visitor = ForestToAmbiguousTreeVisitor(self.callbacks, self.forest_sum_visitor) - self.term_matcher = term_matcher - def parse(self, stream, start_symbol=None): start_symbol = NonTerminal(start_symbol or self.parser_conf.start) delayed_matches = defaultdict(list) match = self.term_matcher - # Held Completions (H in E.Scotts paper). - held_completions = {} - # Cache for nodes & tokens created in a particular parse step. - node_cache = {} token_cache = {} columns = [] transitives = [] @@ -92,149 +59,6 @@ class Parser: quasi = quasi.advance() return True - def create_leo_transitives(origin, start): - visited = set() - to_create = [] - trule = None - previous = None - - ### Recursively walk backwards through the Earley sets until we find the - # first transitive candidate. If this is done continuously, we shouldn't - # have to walk more than 1 hop. - while True: - if origin in transitives[start]: - previous = trule = transitives[start][origin] - break - - is_empty_rule = not self.FIRST[origin] - if is_empty_rule: - break - - candidates = [ candidate for candidate in columns[start] if candidate.expect is not None and origin == candidate.expect ] - if len(candidates) != 1: - break - originator = next(iter(candidates)) - - if originator is None or originator in visited: - break - - visited.add(originator) - if not is_quasi_complete(originator): - break - - trule = originator.advance() - if originator.start != start: - visited.clear() - - to_create.append((origin, start, originator)) - origin = originator.rule.origin - start = originator.start - - # If a suitable Transitive candidate is not found, bail. - if trule is None: - return - - #### Now walk forwards and create Transitive Items in each set we walked through; and link - # each transitive item to the next set forwards. - while to_create: - origin, start, originator = to_create.pop() - titem = None - if previous is not None: - titem = previous.next_titem = TransitiveItem(origin, trule, originator, previous.column) - else: - titem = TransitiveItem(origin, trule, originator, start) - previous = transitives[start][origin] = titem - - def predict_and_complete(i, to_scan): - """The core Earley Predictor and Completer. - - At each stage of the input, we handling any completed items (things - that matched on the last cycle) and use those to predict what should - come next in the input stream. The completions and any predicted - non-terminals are recursively processed until we reach a set of, - which can be added to the scan list for the next scanner cycle.""" - held_completions.clear() - - column = columns[i] - # R (items) = Ei (column.items) - items = deque(column) - while items: - item = items.pop() # remove an element, A say, from R - - ### The Earley completer - if item.is_complete: ### (item.s == string) - if item.node is None: - label = (item.s, item.start, i) - item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) - item.node.add_family(item.s, item.rule, item.start, None, None) - - # create_leo_transitives(item.rule.origin, item.start) - - ###R Joop Leo right recursion Completer - if item.rule.origin in transitives[item.start]: - transitive = transitives[item.start][item.s] - if transitive.previous in transitives[transitive.column]: - root_transitive = transitives[transitive.column][transitive.previous] - else: - root_transitive = transitive - - new_item = Item(transitive.rule, transitive.ptr, transitive.start) - label = (root_transitive.s, root_transitive.start, i) - new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) - new_item.node.add_path(root_transitive, item.node) - if new_item.expect in self.TERMINALS: - # Add (B :: aC.B, h, y) to Q - to_scan.add(new_item) - elif new_item not in column: - # Add (B :: aC.B, h, y) to Ei and R - column.add(new_item) - items.append(new_item) - ###R Regular Earley completer - else: - # Empty has 0 length. If we complete an empty symbol in a particular - # parse step, we need to be able to use that same empty symbol to complete - # any predictions that result, that themselves require empty. Avoids - # infinite recursion on empty symbols. - # held_completions is 'H' in E.Scott's paper. - is_empty_item = item.start == i - if is_empty_item: - held_completions[item.rule.origin] = item.node - - originators = [originator for originator in columns[item.start] if originator.expect is not None and originator.expect == item.s] - for originator in originators: - new_item = originator.advance() - label = (new_item.s, originator.start, i) - new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) - new_item.node.add_family(new_item.s, new_item.rule, i, originator.node, item.node) - if new_item.expect in self.TERMINALS: - # Add (B :: aC.B, h, y) to Q - to_scan.add(new_item) - elif new_item not in column: - # Add (B :: aC.B, h, y) to Ei and R - column.add(new_item) - items.append(new_item) - - ### The Earley predictor - elif item.expect in self.NON_TERMINALS: ### (item.s == lr0) - new_items = [] - for rule in self.predictions[item.expect]: - new_item = Item(rule, 0, i) - new_items.append(new_item) - - # Process any held completions (H). - if item.expect in held_completions: - new_item = item.advance() - label = (new_item.s, item.start, i) - new_item.node = node_cache[label] if label in node_cache else node_cache.setdefault(label, SymbolNode(*label)) - new_item.node.add_family(new_item.s, new_item.rule, new_item.start, item.node, held_completions[item.expect]) - new_items.append(new_item) - - for new_item in new_items: - if new_item.expect in self.TERMINALS: - to_scan.add(new_item) - elif new_item not in column: - column.add(new_item) - items.append(new_item) def scan(i, to_scan): """The core Earley Scanner. @@ -245,6 +69,8 @@ class Parser: This ensures that at each phase of the parse we have a custom lexer context, allowing for more complex ambiguities.""" + node_cache = {} + # 1) Loop the expectations and ask the lexer to match. # Since regexp is forward looking on the input stream, and we only # want to process tokens when we hit the point in the stream at which @@ -343,13 +169,11 @@ class Parser: # step. i = 0 for token in stream: - predict_and_complete(i, to_scan) + self.predict_and_complete(i, to_scan, columns, transitives) # Clear the node_cache and token_cache, which are only relevant for each # step in the Earley pass. - node_cache.clear() token_cache.clear() - node_cache.clear() to_scan = scan(i, to_scan) if token == '\n': @@ -359,7 +183,7 @@ class Parser: text_column += 1 i += 1 - predict_and_complete(i, to_scan) + self.predict_and_complete(i, to_scan, columns, transitives) ## Column is now the final column in the parse. If the parse was successful, the start # symbol should have been completed in the last step of the Earley cycle, and will be in