From 538f944602172061ba4396dc726e575c37f7aa72 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Mon, 20 Feb 2017 20:00:24 +0200
Subject: [PATCH] My Earley parser is working

---
 lark/common.py                   |   2 +-
 lark/parser_frontends.py         |  60 ++++++++++++++-
 lark/parsers/earley2.py          | 123 +++++++++++++++++--------------
 lark/parsers/grammar_analysis.py |   2 +-
 lark/parsers/lalr_analysis.py    |   2 +-
 5 files changed, 127 insertions(+), 62 deletions(-)

diff --git a/lark/common.py b/lark/common.py
index 06220f0..122c7e5 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -28,7 +28,7 @@ class UnexpectedToken(ParseError):
 
 def is_terminal(sym):
-    return sym.isupper() or sym[0] == '$'
+    return isinstance(sym, tuple) or sym.isupper() or sym[0] == '$'
 
 class LexerConf:
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 1c46d35..e9f117c 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -3,8 +3,9 @@ import sre_parse
 
 from .lexer import Lexer, ContextualLexer
-from .common import is_terminal, GrammarError
-from .parsers import lalr_parser, earley
+from .common import is_terminal, GrammarError, ParserConf
+from .parsers import lalr_parser, earley, earley2
+from .parsers.grammar_analysis import Rule
 
 class WithLexer:
     def __init__(self, lexer_conf):
@@ -50,7 +51,7 @@ class LALR_ContextualLexer:
 
 
 
-class Earley(WithLexer):
+class Nearley(WithLexer):
     def __init__(self, lexer_conf, parser_conf):
         WithLexer.__init__(self, lexer_conf)
 
@@ -74,6 +75,26 @@ class Earley(WithLexer):
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
         return res[0]
 
+
+class MyEarley(WithLexer):
+    def __init__(self, lexer_conf, parser_conf):
+        WithLexer.__init__(self, lexer_conf)
+
+        rules = [(n, self._prepare_expansion(x), a)
+                 for n, x, a in parser_conf.rules]
+
+        self.parser = earley2.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
+
+    def _prepare_expansion(self, expansion):
+        return [(sym,) if is_terminal(sym) else sym for sym in expansion]
+
+    def parse(self, text):
+        tokens = list(self.lex(text))
+        res = self.parser.parse(tokens)
+        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
+        return res[0]
+
+
 class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
@@ -101,4 +122,35 @@ class Earley_NoLex:
         assert len(res) ==1 , 'Ambiguious Parse! Not handled yet'
         return res[0]
 
-ENGINE_DICT = { 'lalr': LALR, 'earley': Earley, 'earley_nolex': Earley_NoLex, 'lalr_contextual_lexer': LALR_ContextualLexer }
+
+class MyEarley_NoLex:
+    def __init__(self, lexer_conf, parser_conf):
+        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
+
+        rules = [(n, list(self._prepare_expansion(x)), a)
+                 for n, x, a in parser_conf.rules]
+
+        self.parser = earley2.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
+
+    def _prepare_expansion(self, expansion):
+        for sym in expansion:
+            if is_terminal(sym):
+                regexp = self.token_by_name[sym].to_regexp()
+                width = sre_parse.parse(regexp).getwidth()
+                if width != (1, 1):
+                    raise GrammarError('Dynamic lexing requires all tokens to have a width of 1 (%s is %s)' % (regexp, width))
+                yield re.compile(regexp).match
+            else:
+                yield sym
+
+    def parse(self, text):
+        res = self.parser.parse(text)
+        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
+        return res[0]
+
+ENGINE_DICT = {
+    'lalr': LALR,
+    'earley': MyEarley,
+    'earley_nolex': Earley_NoLex,
+    'lalr_contextual_lexer': LALR_ContextualLexer
+}
diff --git a/lark/parsers/earley2.py b/lark/parsers/earley2.py
index 7527248..6348747 100644
--- a/lark/parsers/earley2.py
+++ b/lark/parsers/earley2.py
@@ -1,67 +1,63 @@
+import sys
+
 from ..common import ParseError, UnexpectedToken, is_terminal
 from grammar_analysis import GrammarAnalyzer
-from ..tree import Tree
+# is_terminal = callable
 
 class Item:
-    def __init__(self, rule_ptr, start, data):
-        self.rule_ptr = rule_ptr
+    def __init__(self, rule, ptr, start, data):
+        self.rule = rule
+        self.ptr = ptr
         self.start = start
         self.data = data
 
     @property
     def expect(self):
-        return self.rule_ptr.next
+        return self.rule.expansion[self.ptr]
 
     @property
    def is_complete(self):
-        return self.rule_ptr.is_satisfied
-
-    @property
-    def name(self):
-        return self.rule_ptr.rule.origin
+        return self.ptr == len(self.rule.expansion)
 
     def advance(self, data):
-        return Item(self.rule_ptr.advance(self.expect), self.start, self.data + [data])
+        return Item(self.rule, self.ptr+1, self.start, self.data + [data])
 
     def __eq__(self, other):
-        return self.rule_ptr == other.rule_ptr and self.start == other.start
+        return self.start == other.start and self.ptr == other.ptr and self.rule == other.rule
 
     def __hash__(self):
-        return hash((self.rule_ptr, self.start))
-
-    def __repr__(self):
-        return '%s (%s)' % (self.rule_ptr, self.start)
+        return hash((self.rule, self.ptr, self.start))
 
 class Parser:
-    def __init__(self, rules, start):
-        self.analyzer = GrammarAnalyzer(rules, start)
-        self.start = start
+    def __init__(self, parser_conf):
+        self.analysis = GrammarAnalyzer(parser_conf.rules, parser_conf.start)
+        self.start = parser_conf.start
+        self.postprocess = {}
+        self.predictions = {}
+        for rule in self.analysis.rules:
+            if rule.origin != '$root': # XXX kinda ugly
+                self.postprocess[rule] = getattr(parser_conf.callback, rule.alias)
+                self.predictions[rule.origin] = [(x.rule, x.index) for x in self.analysis.expand_rule(rule.origin)]
 
     def parse(self, stream):
         # Define parser functions
         def predict(symbol, i):
             assert not is_terminal(symbol), symbol
-            return {Item(rp, i, []) for rp in self.analyzer.expand_rule(symbol)}
-
-        def scan(item, inp):
-            if item.expect == inp: # TODO Do a smarter match, i.e. regexp
-                return {item.advance(inp)}
-            else:
-                return set()
+            return {Item(rule, index, i, []) for rule, index in self.predictions[symbol]}
 
         def complete(item, table):
-            name = item.name
-            item.data = Tree(name, item.data)
+            #item.data = (item.rule_ptr.rule, item.data)
+            item.data = self.postprocess[item.rule](item.data)
             return {old_item.advance(item.data) for old_item in table[item.start]
-                    if not old_item.is_complete and old_item.expect == name}
+                    if not old_item.is_complete and old_item.expect == item.rule.origin}
 
-        def process_column(i, char):
-            cur_set = table[-1]
+        def process_column(i, term):
+            assert i == len(table)-1
+            cur_set = table[i]
             next_set = set()
-            table.append(next_set)
 
             to_process = cur_set
             while to_process:
@@ -71,61 +67,78 @@ class Parser:
                     new_items |= complete(item, table)
                 else:
                     if is_terminal(item.expect):
-                        next_set |= scan(item, char)
+                        # scan
+                        if item.expect[0] == term:
+                            next_set.add(item.advance(stream[i]))
                     else:
-                        new_items |= predict(item.expect, i)
+                        if item.ptr: # part of an already predicted batch
+                            new_items |= predict(item.expect, i)
 
-                to_process = new_items - cur_set
+                to_process = new_items - cur_set # TODO: is this precaution necessary?
                 cur_set |= to_process
 
-            if not next_set and char != '$end':
-                expect = filter(is_terminal, [i.expect for i in cur_set if not i.is_complete])
-                raise UnexpectedToken(char, expect, stream, i)
+
+            if not next_set and term != '$end':
+                expect = filter(is_terminal, [x.expect for x in cur_set if not x.is_complete])
+                raise UnexpectedToken(term, expect, stream, i)
+
+            table.append(next_set)
 
         # Main loop starts
         table = [predict(self.start, 0)]
 
         for i, char in enumerate(stream):
-            process_column(i, char)
+            process_column(i, char.type)
 
         process_column(len(stream), '$end')
 
         # Parse ended. Now build a parse tree
         solutions = [n.data for n in table[len(stream)]
-                     if n.is_complete and n.name==self.start and n.start==0]
+                     if n.is_complete and n.rule.origin==self.start and n.start==0]
 
         if not solutions:
             raise ParseError('Incomplete parse: Could not find a solution to input')
 
         return solutions
+        #return map(self.reduce_solution, solutions)
 
+    def reduce_solution(self, solution):
+        rule, children = solution
+        children = [self.reduce_solution(c) if isinstance(c, tuple) else c for c in children]
+        return self.postprocess[rule](children)
 
-
+from ..common import ParserConf
 
+# A = 'A'.__eq__
 # rules = [
-#     ('a', ['a', 'A']),
-#     ('a', ['a', 'A', 'a']),
-#     ('a', ['a', 'A', 'A', 'a']),
-#     ('a', ['A']),
+#     ('a', ['a', A], None),
+#     ('a', ['a', A, 'a'], None),
+#     ('a', ['a', A, A, 'a'], None),
+#     ('a', [A], None),
 # ]
 
-# p = Parser(rules, 'a')
+# p = Parser(ParserConf(rules, None, 'a'))
 # for x in p.parse('AAAA'):
 #     print '->'
 #     print x.pretty()
 
+# import re
+# NUM = re.compile('[0-9]').match
+# ADD = re.compile('[+-]').match
+# MUL = re.compile('[*/]').match
 # rules = [
-#     ('sum', ['sum', "A", 'product']),
-#     ('sum', ['product']),
-#     ('product', ['product', "M", 'factor']),
-#     ('product', ['factor']),
-#     ('factor', ['L', 'sum', 'R']),
-#     ('factor', ['number']),
-#     ('number', ['N', 'number']),
-#     ('number', ['N']),
+#     ('sum', ['sum', ADD, 'product'], None),
+#     ('sum', ['product'], None),
+#     ('product', ['product', MUL, 'factor'], None),
+#     ('product', ['factor'], None),
+#     ('factor', ['('.__eq__, 'sum', ')'.__eq__], None),
+#     ('factor', ['number'], None),
+#     ('number', [NUM, 'number'], None),
+#     ('number', [NUM], None),
 # ]
 
-# p = Parser(rules, 'sum')
-# print p.parse('NALNMNANR')
+# p = Parser(ParserConf(rules, None, 'sum'))
+# # print p.parse('NALNMNANR')
+# print p.parse('1+(2*3-4)')[0].pretty() diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py index d51700a..c03d9ae 100644 --- a/lark/parsers/grammar_analysis.py +++ b/lark/parsers/grammar_analysis.py @@ -133,7 +133,7 @@ class GrammarAnalyzer(object): "Returns all init_ptrs accessible by rule (recursive)" init_ptrs = set() def _expand_rule(rule): - assert not is_terminal(rule) + assert not is_terminal(rule), rule for r in self.rules_by_origin[rule]: init_ptr = RulePtr(r, 0) diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py index 8a8365d..83f96fc 100644 --- a/lark/parsers/lalr_analysis.py +++ b/lark/parsers/lalr_analysis.py @@ -4,7 +4,7 @@ from collections import defaultdict from ..utils import classify, classify_bool, bfs, fzset from ..common import GrammarError, is_terminal -from grammar_analysis import GrammarAnalyzer +from .grammar_analysis import GrammarAnalyzer ACTION_SHIFT = 0
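
Reviewer's note, appended after the diff (not part of the patch): earley2.py
interleaves the classic Earley predict/scan/complete loop with lark-specific
machinery (GrammarAnalyzer, ParserConf, postprocess callbacks, token types).
For readers unfamiliar with the algorithm, here is a self-contained sketch of
the same loop. It is a simplification invented for this note, not the committed
code: the Item shape (rule, ptr, start) mirrors the patch but drops the data
field, the recognize() helper and its (origin, expansion) rule tuples are
hypothetical, terminals are single characters, no trees are built, and
nullable (empty) rules are deliberately not handled.

    # Minimal standalone sketch of an Earley recognizer (NOT the patch's code).
    class Item(object):
        def __init__(self, rule, ptr, start):
            self.rule, self.ptr, self.start = rule, ptr, start

        @property
        def expect(self):
            return self.rule[1][self.ptr]         # symbol right after the dot

        @property
        def is_complete(self):
            return self.ptr == len(self.rule[1])  # dot reached the end

        def advance(self):
            return Item(self.rule, self.ptr + 1, self.start)

        def __eq__(self, other):
            return (self.rule, self.ptr, self.start) == (other.rule, other.ptr, other.start)

        def __hash__(self):
            return hash((self.rule, self.ptr, self.start))

    def recognize(rules, start, stream):
        nonterminals = set(origin for origin, _ in rules)
        table = [set(Item(r, 0, 0) for r in rules if r[0] == start)]
        for i in range(len(stream) + 1):
            column, next_column = table[i], set()
            to_process = list(column)
            while to_process:
                item = to_process.pop()
                if item.is_complete:
                    # complete: advance every item that was waiting on this rule
                    for old in list(table[item.start]):
                        if not old.is_complete and old.expect == item.rule[0]:
                            adv = old.advance()
                            if adv not in column:
                                column.add(adv)
                                to_process.append(adv)
                elif item.expect not in nonterminals:
                    # scan: shift over a matching terminal into the next column
                    if i < len(stream) and item.expect == stream[i]:
                        next_column.add(item.advance())
                else:
                    # predict: expand the expected nonterminal at this column
                    for r in rules:
                        if r[0] == item.expect:
                            new = Item(r, 0, i)
                            if new not in column:
                                column.add(new)
                                to_process.append(new)
            table.append(next_column)
        return any(it.is_complete and it.rule[0] == start and it.start == 0
                   for it in table[len(stream)])

    rules = [('sum', ('sum', '+', 'num')), ('sum', ('num',)),
             ('num', ('1',)), ('num', ('2',))]
    print(recognize(rules, 'sum', '1+2+1'))  # True
    print(recognize(rules, 'sum', '1++2'))   # False

In the patch itself, scan compares token types (item.expect[0] == term)
because _prepare_expansion wraps terminal names in 1-tuples, which is what the
new isinstance(sym, tuple) branch added to is_terminal() in common.py supports.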