@@ -47,12 +47,12 @@ class TreeToJson(Transformer):
     true = lambda self, _: True
     false = lambda self, _: False
-json_parser = Lark(json_grammar, parser='earley', lexer='dynamic')
-def parse(x):
-    return TreeToJson().transform(json_parser.parse(x))
 # json_parser = Lark(json_grammar, parser='earley', lexer='standard')
 # def parse(x):
 #     return TreeToJson().transform(json_parser.parse(x))
-# json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
-# parse = json_parser.parse
+json_parser = Lark(json_grammar, parser='lalr', transformer=TreeToJson())
+parse = json_parser.parse
 def test():
     test_json = '''
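
The hunk above switches the example to LALR with an embedded transformer, the fastest configuration: when `transformer=` is passed to `Lark`, rule callbacks run as each reduction happens and no intermediate parse tree is built. A minimal sketch of the same pattern on a toy grammar (`ToList`, `WORD`, and `list_parser` are illustrative names, not part of this change):

```python
# Minimal sketch (toy grammar, illustrative names): transformer= applies
# callbacks during the LALR parse itself, skipping tree construction.
from lark import Lark, Transformer

class ToList(Transformer):
    def item(self, children):
        return str(children[0])        # collapse each `item` node to its text

list_parser = Lark('''start: item*
                      item: WORD
                      WORD: /[a-z]+/
                      %ignore " "
                   ''', parser='lalr', transformer=ToList())

print(list_parser.parse('ab cd').children)   # -> ['ab', 'cd']
```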
@@ -4,7 +4,7 @@ import sre_parse
 from .lexer import Lexer, ContextualLexer, Token
 from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token
-from .parsers import lalr_parser, old_earley, nearley, earley
+from .parsers import lalr_parser, earley
 from .tree import Transformer
 from .parsers import xearley
@@ -49,47 +49,6 @@ class LALR_ContextualLexer:
         tokens = self.lexer_conf.postlex.process(tokens)
         return self.parser.parse(tokens, self.lexer.set_parser_state)
-class Nearley(WithLexer):
-    def __init__(self, lexer_conf, parser_conf):
-        WithLexer.__init__(self, lexer_conf)
-        rules = [{'name':n,
-                  'symbols': self._prepare_expansion(x),
-                  'postprocess': getattr(parser_conf.callback, a)}
-                 for n,x,a in parser_conf.rules]
-        self.parser = nearley.Parser(rules, parser_conf.start)
-    def _prepare_expansion(self, expansion):
-        return [(sym, None) if is_terminal(sym) else sym for sym in expansion]
-    def parse(self, text):
-        tokens = list(self.lex(text))
-        res = self.parser.parse(tokens)
-        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
-        return res[0]
-class OldEarley(WithLexer):
-    def __init__(self, lexer_conf, parser_conf):
-        WithLexer.__init__(self, lexer_conf)
-        rules = [(n, self._prepare_expansion(x), a) for n,x,a in parser_conf.rules]
-        self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
-    def _prepare_expansion(self, expansion):
-        return [(sym,) if is_terminal(sym) else sym for sym in expansion]
-    def parse(self, text):
-        tokens = list(self.lex(text))
-        res = self.parser.parse(tokens)
-        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
-        return res[0]
 def tokenize_text(text):
     new_text = []
     line = 1
@@ -101,32 +60,6 @@ def tokenize_text(text):
         new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos))
     return new_text
-class OldEarley_NoLex:
-    def __init__(self, lexer_conf, parser_conf):
-        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
-        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules]
-        self.parser = old_earley.Parser(ParserConf(rules, parser_conf.callback, parser_conf.start))
-    def _prepare_expansion(self, expansion):
-        for sym in expansion:
-            if is_terminal(sym):
-                regexp = self.token_by_name[sym].pattern.to_regexp()
-                width = sre_parse.parse(regexp).getwidth()
-                if width != (1,1):
-                    raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
-                yield (re.compile(regexp).match, regexp)
-            else:
-                yield sym
-    def parse(self, text):
-        new_text = tokenize_text(text)
-        res = self.parser.parse(new_text)
-        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
-        return res[0]
 class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf, options=None):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
@@ -178,7 +111,7 @@ class XEarley:
     def __init__(self, lexer_conf, parser_conf, options=None):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
-        rules = [(n, list(self._prepare_expansion(x)), a) for n,x,a in parser_conf.rules]
+        rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]
         resolve_ambiguity = (options.ambiguity=='resolve') if options else True
         ignore = [Terminal_Regexp(self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]
@@ -195,6 +128,7 @@ class XEarley:
             if is_terminal(sym):
                 regexp = self.token_by_name[sym].pattern.to_regexp()
                 width = sre_parse.parse(regexp).getwidth()
+                assert width
                 yield Terminal_Regexp(regexp)
             else:
                 yield sym
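
For context on the `assert width` added here: `sre_parse.parse(regexp).getwidth()` returns a `(min, max)` tuple of possible match lengths. The old scanless mode demanded exactly `(1, 1)` per token (see the removed `OldEarley_NoLex` above), while the dynamic lexer only needs the regexp to be well-formed, so a bare truthiness assert suffices. A quick illustration (`sre_parse` is a private stdlib module of that era and was later relocated):

```python
import sre_parse

print(sre_parse.parse('a').getwidth())      # (1, 1), acceptable to scanless mode
print(sre_parse.parse('(ab)?').getwidth())  # (0, 2), variable width
print(sre_parse.parse('a+').getwidth())     # (1, MAXREPEAT), unbounded upper width
```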
@@ -238,13 +238,13 @@ def _compare_rules(rule1, rule2):
 def _compare_drv(tree1, tree2):
     if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):
-        return compare(tree1, tree2)
+        return -compare(tree1, tree2)
     try:
         rule1, rule2 = tree1.rule, tree2.rule
     except AttributeError:
         # Probably trees that don't take part in this parse (better way to distinguish?)
-        return compare(tree1, tree2)
+        return -compare(tree1, tree2)
     # XXX These artifacts can appear due to imperfections in the ordering of Visitor_NoRecurse,
     # when confronted with duplicate (same-id) nodes. Fixing this ordering is possible, but would be
@@ -264,7 +264,7 @@ def _compare_drv(tree1, tree2):
         if c:
             return c
-    return compare(len(tree1.children), len(tree2.children))
+    return -compare(len(tree1.children), len(tree2.children))
 def _resolve_ambig(tree):
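
The sign flips above align `_compare_drv` with the documented policy of `ResolveAmbig` ("minimizes rule length, maximizes match length"): the resolver picks `min(children, key=cmp_to_key(_compare_drv))`, so any quantity to be maximized, such as match length or child count, has to compare negated. A standalone illustration of the trick (the inline `compare` mirrors the contract of `lark.utils.compare`):

```python
from functools import cmp_to_key

def compare(a, b):                 # same contract as lark.utils.compare
    return -1 if a < b else (1 if a > b else 0)

matches = ['aa', 'a']
# negate the natural order so that min() selects the LONGER match
best = min(matches, key=cmp_to_key(lambda x, y: -compare(len(x), len(y))))
print(best)   # -> 'aa'
```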
@@ -1,4 +1,4 @@
-"This module implements an Earley Parser"
+"This module implements an experimental Earley Parser with a dynamic lexer"
 # The parser uses a parse-forest to keep track of derivations and ambiguations.
 # When the parse ends successfully, a disambiguation stage resolves all ambiguity
@@ -10,121 +10,21 @@
 # The algorithm keeps track of each state set, using a corresponding Column instance.
 # Column keeps track of new items using NewsList instances.
 #
+# Instead of running a lexer beforehand, or using a costly char-by-char method, this parser
+# uses regular expressions by necessity, achieving high performance while maintaining all of
+# Earley's power in parsing any CFG.
+#
 #
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com
-from functools import cmp_to_key
 from collections import defaultdict
-from ..utils import compare
 from ..common import ParseError, UnexpectedToken, Terminal
-from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse
+from ..tree import Tree
 from .grammar_analysis import GrammarAnalyzer
+from .earley import ResolveAmbig, ApplyCallbacks, Item, NewsList, Derivation, END_TOKEN, Column
-class EndToken:
-    type = '$end'
-class Derivation(Tree):
-    def __init__(self, rule, items=None):
-        Tree.__init__(self, 'drv', items or [])
-        self.rule = rule
-END_TOKEN = EndToken()
-class Item(object):
-    "An Earley Item, the atom of the algorithm."
-    def __init__(self, rule, ptr, start, tree):
-        self.rule = rule
-        self.ptr = ptr
-        self.start = start
-        self.tree = tree if tree is not None else Derivation(self.rule)
-    @property
-    def expect(self):
-        return self.rule.expansion[self.ptr]
-    @property
-    def is_complete(self):
-        return self.ptr == len(self.rule.expansion)
-    def advance(self, tree):
-        assert self.tree.data == 'drv'
-        new_tree = Derivation(self.rule, self.tree.children + [tree])
-        return Item(self.rule, self.ptr+1, self.start, new_tree)
-    def __eq__(self, other):
-        return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule
-    def __hash__(self):
-        return hash((self.rule, self.ptr, id(self.start)))
-    def __repr__(self):
-        before = list(map(str, self.rule.expansion[:self.ptr]))
-        after = list(map(str, self.rule.expansion[self.ptr:]))
-        return '<(%d) %s : %s * %s>' % (id(self.start), self.rule.origin, ' '.join(before), ' '.join(after))
-class NewsList(list):
-    "Keeps track of newly added items (append-only)"
-    def __init__(self, initial=None):
-        list.__init__(self, initial or [])
-        self.last_iter = 0
-    def get_news(self):
-        i = self.last_iter
-        self.last_iter = len(self)
-        return self[i:]
-class Column:
-    "An entry in the table, aka Earley Chart. Contains lists of items."
-    def __init__(self, i):
-        self.i = i
-        self.to_reduce = NewsList()
-        self.to_predict = NewsList()
-        self.to_scan = NewsList()
-        self.item_count = 0
-        self.added = set()
-        self.completed = {}
-    def add(self, items):
-        """Sort items into scan/predict/reduce newslists
-        Makes sure only unique items are added.
-        """
-        for item in items:
-            if item.is_complete:
-                # XXX Potential bug: What happens if there's ambiguity in an empty rule?
-                if item.rule.expansion and item in self.completed:
-                    old_tree = self.completed[item].tree
-                    if old_tree.data != '_ambig':
-                        new_tree = old_tree.copy()
-                        new_tree.rule = old_tree.rule
-                        old_tree.set('_ambig', [new_tree])
-                    if item.tree.children[0] is old_tree:   # XXX a little hacky!
-                        raise ParseError("Infinite recursion in grammar!")
-                    old_tree.children.append(item.tree)
-                else:
-                    self.completed[item] = item
-                    self.to_reduce.append(item)
-            else:
-                if item not in self.added:
-                    self.added.add(item)
-                    if isinstance(item.expect, Terminal):
-                        self.to_scan.append(item)
-                    else:
-                        self.to_predict.append(item)
-            self.item_count += 1    # Only count if actually added
-    def __nonzero__(self):
-        return bool(self.item_count)
 class Parser:
     def __init__(self, rules, start_symbol, callback, resolve_ambiguity=True, ignore=()):
@@ -144,7 +44,7 @@ class Parser:
     def parse(self, stream, start_symbol=None):
         # Define parser functions
        start_symbol = start_symbol or self.start_symbol
-        matched_terminals = defaultdict(list)
+        delayed_matches = defaultdict(list)
         def predict(nonterm, column):
             assert not isinstance(nonterm, Terminal), nonterm
@@ -178,16 +78,17 @@ class Parser:
             for item in to_scan:
                 m = item.expect.match(stream, i)
                 if m:
-                    matched_terminals[m.end()].append(item.advance(m.group(0)))
+                    delayed_matches[m.end()].append(item.advance(m.group(0)))
                     s = m.group(0)
                     for j in range(1, len(s)):
                         m = item.expect.match(s[:-j])
                         if m:
-                            matched_terminals[m.end()].append(item.advance(m.group(0)))
+                            delayed_matches[m.end()].append(item.advance(m.group(0)))
             next_set = Column(i+1)
-            next_set.add(matched_terminals[i+1])
+            next_set.add(delayed_matches[i+1])
+            del delayed_matches[i+1]    # No longer needed, so unburden memory
             return next_set
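
The rename from `matched_terminals` to `delayed_matches` describes the mechanism more honestly: a regexp terminal matched at position `i` may end at any `i+k`, so the advanced item is parked under its end position and only enters the chart when that column is built; the added `del` then frees the entry. A simplified sketch of the bookkeeping (toy data, not the real `Item` objects):

```python
from collections import defaultdict

delayed_matches = defaultdict(list)
stream = 'aaab'

# Suppose an item at column 0 expects /a+/: it yields candidate matches of
# lengths 3, 2 and 1, each parked under the column where the match ends.
for end in (3, 2, 1):
    delayed_matches[end].append(('advanced-item', stream[:end]))

# Building column 1 drains exactly the matches ending there, then forgets them.
print(delayed_matches.pop(1))   # -> [('advanced-item', 'a')]
```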
@@ -220,73 +121,3 @@ class Parser:
         return ApplyCallbacks(self.postprocess).transform(tree)
-class ApplyCallbacks(Transformer_NoRecurse):
-    def __init__(self, postprocess):
-        self.postprocess = postprocess
-    def drv(self, tree):
-        children = tree.children
-        callback = self.postprocess[tree.rule]
-        if callback:
-            return callback(children)
-        else:
-            return Tree(rule.origin, children)
-def _compare_rules(rule1, rule2):
-    assert rule1.origin == rule2.origin
-    c = compare( len(rule1.expansion), len(rule2.expansion))
-    if rule1.origin.startswith('__'):   # XXX hack! We need to set priority in parser, not here
-        c = -c
-    return c
-def _compare_drv(tree1, tree2):
-    if not (isinstance(tree1, Tree) and isinstance(tree2, Tree)):
-        return -compare(tree1, tree2)
-    c = _compare_rules(tree1.rule, tree2.rule)
-    if c:
-        return c
-    # rules are "equal", so compare trees
-    for t1, t2 in zip(tree1.children, tree2.children):
-        c = _compare_drv(t1, t2)
-        if c:
-            return c
-    return -compare(len(tree1.children), len(tree2.children))
-class ResolveAmbig(Visitor_NoRecurse):
-    """Resolves ambiguity in resulting parse tree.
-    Minimizes rule length, maximizes match length.
-    """
-    def _ambig(self, tree):
-        best = min(tree.children, key=cmp_to_key(_compare_drv))
-        assert best.data == 'drv'
-        tree.set('drv', best.children)
-        tree.rule = best.rule   # needed for applying callbacks
-# RULES = [
-#     ('a', ['d']),
-#     ('d', ['b']),
-#     ('b', ['C']),
-#     ('b', ['b', 'C']),
-#     ('b', ['C', 'b']),
-# ]
-# p = Parser(RULES, 'a')
-# for x in p.parse('CC'):
-#     print x.pretty()
-#---------------
-# RULES = [
-#     ('s', ['a', 'a']),
-#     ('a', ['b', 'b']),
-#     ('b', ['C'], lambda (x,): x),
-#     ('b', ['b', 'C']),
-# ]
-# p = Parser(RULES, 's', {})
-# print p.parse('CCCCC').pretty()
@@ -4,10 +4,23 @@ import unittest
 import logging
 from .test_trees import TestTrees
 # from .test_selectors import TestSelectors
-from .test_parser import TestLalrStandard, TestEarleyStandard, TestLalrContextual, TestParsers, TestEarleyScanless, TestEarley, TestEarleyDynamic
 # from .test_grammars import TestPythonG, TestConfigG
+from .test_parser import (
+        TestLalrStandard,
+        TestEarleyStandard,
+        TestLalrContextual,
+        TestEarleyScanless,
+        TestEarleyDynamic,
+        TestFullEarleyScanless,
+        TestFullEarleyDynamic,
+        TestParsers,
+        )
 logging.basicConfig(level=logging.INFO)
 if __name__ == '__main__':
@@ -51,90 +51,95 @@ class TestParsers(unittest.TestCase):
         self.assertRaises(ParseError, l.parse, 'a')
-class TestEarley(unittest.TestCase):
-    def test_anon_in_scanless(self):
-        # Fails an Earley implementation without special handling for empty rules,
-        # or re-processing of already completed rules.
-        g = Lark(r"""start: B
-                     B: ("ab"|/[^b]/)*
-                  """, lexer='dynamic')
-        self.assertEqual( g.parse('abc').children[0], 'abc')
-    def test_earley_scanless(self):
-        g = Lark("""start: A "b" c
-                    A: "a"+
-                    c: "abc"
-                    """, parser="earley", lexer='dynamic')
-        x = g.parse('aaaababc')
-    def test_earley_scanless2(self):
-        grammar = """
-        start: statement+
-        statement: "r"
-                 | "c" /[a-z]/+
-        %ignore " "
-        """
-        program = """c b r"""
-        l = Lark(grammar, parser='earley', lexer='dynamic')
-        l.parse(program)
-    def test_earley_scanless3(self):
-        "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
-        grammar = """
-        start: A A
-        A: "a"+
-        """
-        l = Lark(grammar, parser='earley', lexer='dynamic')
-        res = l.parse("aaa")
-        self.assertEqual(res.children, ['aa', 'a'])
-    def test_earley_scanless4(self):
-        grammar = """
-        start: A A?
-        A: "a"+
-        """
-        l = Lark(grammar, parser='earley', lexer='dynamic')
-        res = l.parse("aaa")
-        self.assertEqual(res.children, ['aaa'])
-    def test_earley_repeating_empty(self):
-        # This was a sneaky bug!
-        grammar = """
-        !start: "a" empty empty "b"
-        empty: empty2
-        empty2:
-        """
-        parser = Lark(grammar, parser='earley', lexer='dynamic')
-        res = parser.parse('ab')
-        empty_tree = Tree('empty', [Tree('empty2', [])])
-        self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])
-    def test_earley_explicit_ambiguity(self):
-        # This was a sneaky bug!
-        grammar = """
-        start: a b | ab
-        a: "a"
-        b: "b"
-        ab: "ab"
-        """
-        parser = Lark(grammar, parser='earley', lexer='dynamic', ambiguity='explicit')
-        res = parser.parse('ab')
-        self.assertEqual( res.data, '_ambig')
-        self.assertEqual( len(res.children), 2)
+def _make_full_earley_test(LEXER):
+    class _TestFullEarley(unittest.TestCase):
+        def test_anon_in_scanless(self):
+            # Fails an Earley implementation without special handling for empty rules,
+            # or re-processing of already completed rules.
+            g = Lark(r"""start: B
+                         B: ("ab"|/[^b]/)*
+                      """, lexer=LEXER)
+            self.assertEqual( g.parse('abc').children[0], 'abc')
+        def test_earley_scanless(self):
+            g = Lark("""start: A "b" c
+                        A: "a"+
+                        c: "abc"
+                        """, parser="earley", lexer=LEXER)
+            x = g.parse('aaaababc')
+        def test_earley_scanless2(self):
+            grammar = """
+            start: statement+
+            statement: "r"
+                     | "c" /[a-z]/+
+            %ignore " "
+            """
+            program = """c b r"""
+            l = Lark(grammar, parser='earley', lexer=LEXER)
+            l.parse(program)
+        def test_earley_scanless3(self):
+            "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
+            grammar = """
+            start: A A
+            A: "a"+
+            """
+            l = Lark(grammar, parser='earley', lexer=LEXER)
+            res = l.parse("aaa")
+            self.assertEqual(res.children, ['aa', 'a'])
+        def test_earley_scanless4(self):
+            grammar = """
+            start: A A?
+            A: "a"+
+            """
+            l = Lark(grammar, parser='earley', lexer=LEXER)
+            res = l.parse("aaa")
+            self.assertEqual(res.children, ['aaa'])
+        def test_earley_repeating_empty(self):
+            # This was a sneaky bug!
+            grammar = """
+            !start: "a" empty empty "b"
+            empty: empty2
+            empty2:
+            """
+            parser = Lark(grammar, parser='earley', lexer=LEXER)
+            res = parser.parse('ab')
+            empty_tree = Tree('empty', [Tree('empty2', [])])
+            self.assertSequenceEqual(res.children, ['a', empty_tree, empty_tree, 'b'])
+        def test_earley_explicit_ambiguity(self):
+            # This was a sneaky bug!
+            grammar = """
+            start: a b | ab
+            a: "a"
+            b: "b"
+            ab: "ab"
+            """
+            parser = Lark(grammar, parser='earley', lexer=LEXER, ambiguity='explicit')
+            res = parser.parse('ab')
+            self.assertEqual( res.data, '_ambig')
+            self.assertEqual( len(res.children), 2)
+    _NAME = "TestFullEarley" + (LEXER or 'Scanless').capitalize()
+    _TestFullEarley.__name__ = _NAME
+    globals()[_NAME] = _TestFullEarley
 def _make_parser_test(LEXER, PARSER):
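
The factory pattern introduced above is worth a note: each call builds a fresh `TestCase` subclass closed over `LEXER`, renames it, and publishes it via `globals()` so that unittest discovery (and the explicit imports in `tests/__init__.py`) can find `TestFullEarleyScanless` and `TestFullEarleyDynamic`. The same idiom in isolation, with a hypothetical `FEATURE` parameter:

```python
import unittest

def _make_feature_test(FEATURE):
    class _TestFeature(unittest.TestCase):
        def test_value(self):
            self.assertIn(FEATURE, (None, 'dynamic'))

    _NAME = 'TestFeature' + (FEATURE or 'Scanless').capitalize()
    _TestFeature.__name__ = _NAME
    globals()[_NAME] = _TestFeature    # make it visible to test discovery

for _feature in (None, 'dynamic'):
    _make_feature_test(_feature)
# -> defines TestFeatureScanless and TestFeatureDynamic in this module
```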
@@ -444,7 +449,7 @@ def _make_parser_test(LEXER, PARSER):
             """)
         x = g.parse('aababc')
-    @unittest.skipIf(LEXER is None, "Known bug with scanless parsing")  # TODO
+    @unittest.skipIf(LEXER in (None, 'dynamic'), "Known bug with scanless parsing")  # TODO
     def test_token_not_anon(self):
         """Tests that "a" is matched as A, rather than an anonymous token.
@@ -664,6 +669,8 @@ _TO_TEST = [
 for _LEXER, _PARSER in _TO_TEST:
     _make_parser_test(_LEXER, _PARSER)
+for _LEXER in (None, 'dynamic'):
+    _make_full_earley_test(_LEXER)
 if __name__ == '__main__':
     unittest.main()