From 73178d6ae0846d624e61fefb68355e25a4190b61 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Sat, 4 Feb 2017 22:49:52 +0200
Subject: [PATCH] Lark big first commit. Examples working.

---
 lark/__init__.py              |   0
 lark/earley.py                | 148 ++++++++++++++
 lark/examples/__init__.py     |   0
 lark/examples/calc.py         |  59 ++++++
 lark/examples/json_example.py |  62 ++++++
 lark/grammar_analysis.py      | 207 ++++++++++++++++++++
 lark/lark.py                  | 217 +++++++++++++++++++++
 lark/lexer.py                 |  84 ++++++++
 lark/load_grammar.py          | 358 ++++++++++++++++++++++++++++++++++
 lark/parser.py                |  61 ++++++
 lark/tree.py                  |  83 ++++++++
 lark/utils.py                 |  51 +++++
 12 files changed, 1330 insertions(+)
 create mode 100644 lark/__init__.py
 create mode 100644 lark/earley.py
 create mode 100644 lark/examples/__init__.py
 create mode 100644 lark/examples/calc.py
 create mode 100644 lark/examples/json_example.py
 create mode 100644 lark/grammar_analysis.py
 create mode 100644 lark/lark.py
 create mode 100644 lark/lexer.py
 create mode 100644 lark/load_grammar.py
 create mode 100644 lark/parser.py
 create mode 100644 lark/tree.py
 create mode 100644 lark/utils.py

diff --git a/lark/__init__.py b/lark/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lark/earley.py b/lark/earley.py
new file mode 100644
index 0000000..e2d4814
--- /dev/null
+++ b/lark/earley.py
@@ -0,0 +1,148 @@
+"My name is Earley"
+
+from .utils import classify
+
+class MatchFailed(object):
+    pass
+
+class AbortParseMatch(Exception):
+    pass
+
+
+class Rule(object):
+    def __init__(self, name, symbols, postprocess):
+        self.name = name
+        self.symbols = symbols
+        self.postprocess = postprocess
+
+
+class State(object):
+    def __init__(self, rule, expect, reference, data=None):
+        self.rule = rule
+        self.expect = expect
+        self.reference = reference
+        self.data = data or []
+
+        self.is_complete = (self.expect == len(self.rule.symbols))
+        if not self.is_complete:
+            self.expect_symbol = self.rule.symbols[self.expect]
+            self.is_literal = isinstance(self.expect_symbol, dict)
+            if self.is_literal:
+                self.expect_symbol = self.expect_symbol['literal']
+                assert isinstance(self.expect_symbol, (str, unicode)), self.expect_symbol
+
+    def next_state(self, data):
+        return State(self.rule, self.expect+1, self.reference, self.data + [data])
+
+    def consume_terminal(self, inp):
+        if not self.is_complete and self.is_literal:
+            # PORT: originally tests regexp
+            if self.expect_symbol == inp.type:
+                return self.next_state(inp)
+
+    def consume_nonterminal(self, inp):
+        if not self.is_complete and not self.is_literal:
+            if self.expect_symbol == inp:
+                return self.next_state(inp)
+
+    def process(self, location, ind, table, rules, added_rules):
+        if self.is_complete:
+            # Completed a rule
+            if self.rule.postprocess:
+                try:
+                    # self.data = self.rule.postprocess(self.data, self.reference)
+                    self.data = self.rule.postprocess(self.data)
+                except AbortParseMatch:
+                    self.data = MatchFailed
+
+            if self.data is not MatchFailed:
+                for s in table[self.reference]:
+                    x = s.consume_nonterminal(self.rule.name)
+                    if x:
+                        x.data[-1] = self.data
+                        x.epsilon_closure(location, ind, table)
+
+        else:
+            exp = self.rule.symbols[self.expect]
+            if isinstance(exp, dict):
+                return
+
+            for r in rules[exp]:
+                assert r.name == exp
+                if r not in added_rules:
+                    if r.symbols:
+                        added_rules.add(r)
+                        State(r, 0, location).epsilon_closure(location, ind, table)
+                    else:
+                        # Empty rule
+                        new_copy = self.consume_nonterminal(r.name)
+                        if r.postprocess:
+                            new_copy.data[-1] = r.postprocess([])
+                            # new_copy.data[-1] = r.postprocess([], self.reference)
+                        else:
+                            new_copy.data[-1] = []
+
+                        new_copy.epsilon_closure(location, ind, table)
+
+    def epsilon_closure(self, location, ind, table, result=None):
+        col = table[location]
+        if not result:
+            result = col
+
+        result.append(self)
+
+        if not self.is_complete:
+            for i in xrange(ind):
+                state = col[i]
+                if state.is_complete and state.reference == location:
+                    x = self.consume_nonterminal(state.rule.name)
+                    if x:
+                        x.data[-1] = state.data
+                        x.epsilon_closure(location, ind, table)
+
+
+class Parser(object):
+    def __init__(self, rules, start=None):
+        self.table = [[]]
+        self.rules = [Rule(r['name'], r['symbols'], r.get('postprocess', None)) for r in rules]
+        self.rules_by_name = classify(self.rules, lambda r: r.name)
+        self.start = start or self.rules[0].name
+        initial_rules = set(self.rules_by_name[self.start])
+        self.table[0] += [State(r, 0, 0) for r in initial_rules]
+        self.advance_to(0, initial_rules)
+        self.current = 0
+
+    def advance_to(self, n, added_rules):
+        for w, s in enumerate(self.table[n]):
+            s.process(n, w, self.table, self.rules_by_name, added_rules)
+
+    def parse(self, chunk):
+        chunk_pos = 0
+        for chunk_pos, chunk_item in enumerate(chunk):
+            self.table.append([])
+
+            for s in self.table[self.current + chunk_pos]:
+                x = s.consume_terminal(chunk_item)
+                if x:
+                    self.table[self.current + chunk_pos + 1].append(x)
+
+            added_rules = set()
+            self.advance_to(self.current + chunk_pos + 1, added_rules)
+
+            if not self.table[-1]:
+                raise Exception('Error at position {t.pos_in_stream}: {t!r}'.format(t=chunk[chunk_pos]))
+
+        self.current += chunk_pos
+        return list(self.finish())
+
+    def finish(self):
+        for t in self.table[-1]:
+            if (t.rule.name == self.start
+                    and t.expect == len(t.rule.symbols)
+                    and t.reference == 0
+                    and t.data != MatchFailed):
+                yield t.data
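A minimal sketch (not part of the commit) of driving lark/earley.py directly: terminals are spelled as {'literal': TYPE} and matched against each token's .type, so the PLUS/NUMBER names below are purely illustrative.

    from lark.earley import Parser
    from lark.lexer import Token

    rules = [
        {'name': 'sum', 'symbols': ['sum', {'literal': 'PLUS'}, 'atom'],
         'postprocess': lambda d: d[0] + d[2]},
        {'name': 'sum', 'symbols': ['atom'],
         'postprocess': lambda d: d[0]},
        {'name': 'atom', 'symbols': [{'literal': 'NUMBER'}],
         'postprocess': lambda d: float(d[0])},
    ]

    tokens = [Token('NUMBER', '1', 0), Token('PLUS', '+', 1), Token('NUMBER', '2', 2)]
    print Parser(rules, 'sum').parse(tokens)    # prints [3.0]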
diff --git a/lark/examples/__init__.py b/lark/examples/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/lark/examples/calc.py b/lark/examples/calc.py
new file mode 100644
index 0000000..2c2e1b3
--- /dev/null
+++ b/lark/examples/calc.py
@@ -0,0 +1,59 @@
+from lark.tree import Transformer
+from lark.lark import Lark
+
+calc_grammar = """
+    ?start: sum
+          | NAME "=" sum    -> *assign_var
+
+    ?sum: product
+        | sum "+" product   -> *add
+        | sum "-" product   -> *sub
+
+    ?product: atom
+        | product "*" atom  -> *mul
+        | product "/" atom  -> *div
+
+    ?atom: /[\d.]+/         -> *number
+         | "-" atom         -> *neg
+         | NAME             -> *var
+         | "(" sum ")"
+
+    NAME: /\w+/
+    WS.ignore: /\s+/
+"""
+
+class CalculateTree(Transformer):
+    from operator import add, sub, mul, div, neg
+    number = float
+
+    def __init__(self):
+        self.vars = {}
+
+    def assign_var(self, name, value):
+        self.vars[name] = value
+        return value
+
+    def var(self, name):
+        return self.vars[name]
+
+
+calc_parser = Lark(calc_grammar, parser='lalr', transformer=CalculateTree())
+calc = calc_parser.parse
+
+def main():
+    while True:
+        try:
+            s = raw_input('> ')
+        except EOFError:
+            break
+        print(calc(s))
+
+def test():
+    print calc("a = 1+2")
+    print calc("1+a*-3")
+
+if __name__ == '__main__':
+    test()
+    # main()
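A small sketch (not part of the commit) of the grammar conventions the calc example leans on: '->' names an alternative, a '*' prefix passes the matched children as positional arguments, '?' inlines single-child trees, and anonymous tokens such as "+" are filtered out of the children.

    from lark.lark import Lark
    from lark.tree import Transformer

    grammar = """
        ?start: NUMBER "+" NUMBER -> *add

        NUMBER: /[0-9]+/
        WS.ignore: /\s+/
    """

    class AddUp(Transformer):
        def add(self, a, b):
            return int(a) + int(b)

    print Lark(grammar, parser='lalr', transformer=AddUp()).parse("1 + 2")    # prints 3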
object : "{" [pair ("," pair)*] "}" + pair : string ":" value + + *number : /-?\d+(\.\d+)?([eE][+-]?\d+)?/ + *string : /".*?(?' % (self.origin, ' '.join(self.expansion)) + +class RulePtr(object): + def __init__(self, rule, index): + assert isinstance(rule, Rule) + assert index <= len(rule.expansion) + self.rule = rule + self.index = index + + def __repr__(self): + before = self.rule.expansion[:self.index] + after = self.rule.expansion[self.index:] + return '<%s : %s * %s>' % (self.rule.origin, ' '.join(before), ' '.join(after)) + + @property + def next(self): + return self.rule.expansion[self.index] + + def advance(self, sym): + assert self.next == sym + return RulePtr(self.rule, self.index+1) + + @property + def is_satisfied(self): + return self.index == len(self.rule.expansion) + + def __eq__(self, other): + return self.rule == other.rule and self.index == other.index + def __hash__(self): + return hash((self.rule, self.index)) + + +def pairs(lst): + return zip(lst[:-1], lst[1:]) + +def update_set(set1, set2): + copy = set(set1) + set1 |= set2 + return set1 != copy + +class GrammarAnalyzer(object): + def __init__(self, rule_tuples): + rule_tuples = list(rule_tuples) + rule_tuples.append(('$root', ['start', '$end'])) + rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples] + + self.rules = set() + self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples} + for origin, exp, alias in rule_tuples: + r = Rule( origin, exp, alias ) + self.rules.add(r) + self.rules_by_origin[origin].append(r) + + for r in self.rules: + for sym in r.expansion: + if not (is_terminal(sym) or sym in self.rules_by_origin): + raise GrammarError("Using an undefined rule: %s" % sym) + + self.init_state = self.expand_rule('start') + + def expand_rule(self, rule): + "Returns all init_ptrs accessible by rule (recursive)" + init_ptrs = set() + def _expand_rule(rule): + assert not is_terminal(rule) + + for r in self.rules_by_origin[rule]: + init_ptr = RulePtr(r, 0) + init_ptrs.add(init_ptr) + + new_r = init_ptr.next + if not is_terminal(new_r): + yield new_r + + _ = list(bfs([rule], _expand_rule)) + + return fzset(init_ptrs) + + def _first(self, r): + if is_terminal(r): + return {r} + else: + return {rp.next for rp in self.expand_rule(r) if is_terminal(rp.next)} + + def _calc(self): + """Calculate FOLLOW sets. + + Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets""" + symbols = {sym for rule in self.rules for sym in rule.expansion} + symbols.add('$root') # what about other unused rules? + + # foreach grammar rule X ::= Y(1) ... 
diff --git a/lark/grammar_analysis.py b/lark/grammar_analysis.py
new file mode 100644
--- /dev/null
+++ b/lark/grammar_analysis.py
@@ -0,0 +1,207 @@
+from collections import defaultdict
+
+from utils import classify, classify_bool, bfs, fzset
+
+ACTION_SHIFT = 0
+
+class GrammarError(Exception):
+    pass
+
+def is_terminal(sym):
+    return sym.isupper() or sym[0] == '$'
+
+
+class Rule(object):
+    def __init__(self, origin, expansion, alias=None):
+        self.origin = origin
+        self.expansion = expansion
+        self.alias = alias
+
+    def __repr__(self):
+        return '<%s : %s>' % (self.origin, ' '.join(self.expansion))
+
+
+class RulePtr(object):
+    def __init__(self, rule, index):
+        assert isinstance(rule, Rule)
+        assert index <= len(rule.expansion)
+        self.rule = rule
+        self.index = index
+
+    def __repr__(self):
+        before = self.rule.expansion[:self.index]
+        after = self.rule.expansion[self.index:]
+        return '<%s : %s * %s>' % (self.rule.origin, ' '.join(before), ' '.join(after))
+
+    @property
+    def next(self):
+        return self.rule.expansion[self.index]
+
+    def advance(self, sym):
+        assert self.next == sym
+        return RulePtr(self.rule, self.index+1)
+
+    @property
+    def is_satisfied(self):
+        return self.index == len(self.rule.expansion)
+
+    def __eq__(self, other):
+        return self.rule == other.rule and self.index == other.index
+    def __hash__(self):
+        return hash((self.rule, self.index))
+
+
+def pairs(lst):
+    return zip(lst[:-1], lst[1:])
+
+def update_set(set1, set2):
+    copy = set(set1)
+    set1 |= set2
+    return set1 != copy
+
+
+class GrammarAnalyzer(object):
+    def __init__(self, rule_tuples):
+        rule_tuples = list(rule_tuples)
+        rule_tuples.append(('$root', ['start', '$end']))
+        rule_tuples = [(t[0], t[1], None) if len(t)==2 else t for t in rule_tuples]
+
+        self.rules = set()
+        self.rules_by_origin = {o: [] for o, _x, _a in rule_tuples}
+        for origin, exp, alias in rule_tuples:
+            r = Rule( origin, exp, alias )
+            self.rules.add(r)
+            self.rules_by_origin[origin].append(r)
+
+        for r in self.rules:
+            for sym in r.expansion:
+                if not (is_terminal(sym) or sym in self.rules_by_origin):
+                    raise GrammarError("Using an undefined rule: %s" % sym)
+
+        self.init_state = self.expand_rule('start')
+
+    def expand_rule(self, rule):
+        "Returns all init_ptrs accessible by rule (recursive)"
+        init_ptrs = set()
+        def _expand_rule(rule):
+            assert not is_terminal(rule)
+
+            for r in self.rules_by_origin[rule]:
+                init_ptr = RulePtr(r, 0)
+                init_ptrs.add(init_ptr)
+
+                new_r = init_ptr.next
+                if not is_terminal(new_r):
+                    yield new_r
+
+        _ = list(bfs([rule], _expand_rule))
+
+        return fzset(init_ptrs)
+
+    def _first(self, r):
+        if is_terminal(r):
+            return {r}
+        else:
+            return {rp.next for rp in self.expand_rule(r) if is_terminal(rp.next)}
+
+    def _calc(self):
+        """Calculate FOLLOW sets.
+
+        Adapted from: http://lara.epfl.ch/w/cc09:algorithm_for_first_and_follow_sets"""
+        symbols = {sym for rule in self.rules for sym in rule.expansion}
+        symbols.add('$root')    # what about other unused rules?
+
+        # foreach grammar rule X ::= Y(1) ... Y(k)
+        #     if k=0 or {Y(1),...,Y(k)} subset of NULLABLE then
+        #         NULLABLE = NULLABLE union {X}
+        #     for i = 1 to k
+        #         if i=1 or {Y(1),...,Y(i-1)} subset of NULLABLE then
+        #             FIRST(X) = FIRST(X) union FIRST(Y(i))
+        #         if i=k or {Y(i+1),...,Y(k)} subset of NULLABLE then
+        #             FOLLOW(Y(i)) = FOLLOW(Y(i)) union FOLLOW(X)
+        #         for j = i+1 to k
+        #             if i+1=j or {Y(i+1),...,Y(j-1)} subset of NULLABLE then
+        #                 FOLLOW(Y(i)) = FOLLOW(Y(i)) union FIRST(Y(j))
+        # until none of NULLABLE,FIRST,FOLLOW changed in last iteration
+
+        NULLABLE = set()
+        FIRST = {}
+        FOLLOW = {}
+        for sym in symbols:
+            FIRST[sym]={sym} if is_terminal(sym) else set()
+            FOLLOW[sym]=set()
+
+        changed = True
+        while changed:
+            changed = False
+
+            for rule in self.rules:
+                if set(rule.expansion) <= NULLABLE:
+                    if update_set(NULLABLE, {rule.origin}):
+                        changed = True
+
+                for i, sym in enumerate(rule.expansion):
+                    if set(rule.expansion[:i]) <= NULLABLE:
+                        if update_set(FIRST[rule.origin], FIRST[sym]):
+                            changed = True
+                    if i==len(rule.expansion)-1 or set(rule.expansion[i+1:]) <= NULLABLE:
+                        if update_set(FOLLOW[sym], FOLLOW[rule.origin]):
+                            changed = True
+
+                    for j in range(i+1, len(rule.expansion)):
+                        if set(rule.expansion[i+1:j]) <= NULLABLE:
+                            if update_set(FOLLOW[sym], FIRST[rule.expansion[j]]):
+                                changed = True
+
+        self.FOLLOW = FOLLOW
+
+    def analyze(self):
+        self._calc()
+
+        self.states = {}
+        def step(state):
+            lookahead = defaultdict(list)
+            sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied)
+            for rp in sat:
+                for term in self.FOLLOW.get(rp.rule.origin, ()):
+                    lookahead[term].append(('reduce', rp.rule))
+
+            d = classify(unsat, lambda rp: rp.next)
+            for sym, rps in d.items():
+                rps = {rp.advance(sym) for rp in rps}
+
+                for rp in set(rps):
+                    if not rp.is_satisfied and not is_terminal(rp.next):
+                        rps |= self.expand_rule(rp.next)
+
+                lookahead[sym].append(('shift', fzset(rps)))
+                yield fzset(rps)
+
+            for k, v in lookahead.items():
+                if len(v) > 1:
+                    for x in v:
+                        # XXX resolving shift/reduce into shift, like PLY
+                        # Give a proper warning
+                        if x[0] == 'shift':
+                            lookahead[k] = [x]
+
+            for k, v in lookahead.items():
+                assert len(v) == 1, ("Collision", k, v)
+
+            self.states[state] = {k:v[0] for k, v in lookahead.items()}
+
+        x = list(bfs([self.init_state], step))
+
+        # --
+        self.enum = list(self.states)
+        self.enum_rev = {s:i for i,s in enumerate(self.enum)}
+        self.states_idx = {}
+
+        for s, la in self.states.items():
+            la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift' else v for k,v in la.items()}
+            self.states_idx[ self.enum_rev[s] ] = la
+
+        self.init_state_idx = self.enum_rev[self.init_state]
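A sketch (not part of the commit) of using the analyzer on bare rule tuples; uppercase names are terminals, and '$root'/'$end' are added internally:

    from lark.grammar_analysis import GrammarAnalyzer

    ga = GrammarAnalyzer([
        ('start', ['list']),
        ('list', ['list', 'ITEM']),
        ('list', ['ITEM']),
    ])
    ga.analyze()
    print ga.FOLLOW['list']    # set(['ITEM', '$end']) -- order may vary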
diff --git a/lark/lark.py b/lark/lark.py
new file mode 100644
index 0000000..cbca35d
--- /dev/null
+++ b/lark/lark.py
@@ -0,0 +1,217 @@
+from __future__ import absolute_import
+
+import os
+
+from .utils import STRING_TYPE
+from .load_grammar import load_grammar
+from .tree import Tree, Transformer
+
+from .lexer import Lexer
+from .grammar_analysis import GrammarAnalyzer, is_terminal
+from . import parser, earley
+
+class LarkOptions(object):
+    """Specifies the options for Lark
+
+    """
+    OPTIONS_DOC = """
+        parser - Which parser engine to use ("earley" or "lalr"; default: "earley")
+                 Note: Both will use Lark's lexer.
+        transformer - Applies the transformer to every parse tree
+        debug - Affects verbosity (default: False)
+        only_lex - Don't build a parser. Useful for debugging (default: False)
+        keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False)
+        cache_grammar - Cache the Lark grammar (default: False)
+        ignore_postproc - Don't call the post-processing function (default: False)
+    """
+    __doc__ += OPTIONS_DOC
+
+    def __init__(self, options_dict):
+        o = dict(options_dict)
+
+        self.debug = bool(o.pop('debug', False))
+        self.only_lex = bool(o.pop('only_lex', False))
+        self.keep_all_tokens = bool(o.pop('keep_all_tokens', False))
+        self.keep_empty_trees = bool(o.pop('keep_empty_trees', True))
+        self.tree_class = o.pop('tree_class', Tree)
+        self.cache_grammar = o.pop('cache_grammar', False)
+        self.ignore_postproc = bool(o.pop('ignore_postproc', False))
+        self.parser = o.pop('parser', 'earley')
+        self.transformer = o.pop('transformer', None)
+
+        if o:
+            raise ValueError("Unknown options: %s" % o.keys())
+
+
+class Callback(object):
+    pass
+
+
+class RuleTreeToText(Transformer):
+    def expansions(self, *x):
+        return x
+    def expansion(self, *symbols):
+        return [sym.value for sym in symbols], None
+    def alias(self, (expansion, _alias), alias):
+        assert _alias is None, (alias, expansion, '-', _alias)
+        return expansion, alias.value
+
+
+def create_rule_handler(expansion, usermethod):
+    to_include = [(i, sym.startswith('_')) for i, sym in enumerate(expansion)
+                  if not (is_terminal(sym) and sym.startswith('_'))]
+
+    def _build_ast(match):
+        children = []
+        for i, to_expand in to_include:
+            if to_expand:
+                children += match[i].children
+            else:
+                children.append(match[i])
+
+        return usermethod(children)
+    return _build_ast
+
+def create_expand1_tree_builder_function(tree_builder):
+    def f(children):
+        if len(children) == 1:
+            return children[0]
+        else:
+            return tree_builder(children)
+    return f
+
+def create_rule_inline(f):
+    def _f(children):
+        return f(*children)
+    return _f
+
+
+class LALR:
+    def build_parser(self, rules, callback):
+        ga = GrammarAnalyzer(rules)
+        ga.analyze()
+        return parser.Parser(ga, callback)
+
+class Earley:
+    @staticmethod
+    def _process_expansion(x):
+        return [{'literal': s} if is_terminal(s) else s for s in x]
+
+    def build_parser(self, rules, callback):
+        rules = [{'name':n, 'symbols': self._process_expansion(x), 'postprocess':getattr(callback, a)} for n,x,a in rules]
+        return EarleyParser(earley.Parser(rules, 'start'))
+
+class EarleyParser:
+    def __init__(self, parser):
+        self.parser = parser
+
+    def parse(self, text):
+        res = self.parser.parse(text)
+        assert len(res) == 1, 'Ambiguous Parse! Not handled yet'
+        return res[0]
+
+
+class Lark:
+    def __init__(self, grammar, **options):
+        """
+            grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax)
+            options : a dictionary controlling various aspects of Lark.
+        """
+        self.options = LarkOptions(options)
+
+        # Some, but not all file-like objects have a 'name' attribute
+        try:
+            source = grammar.name
+        except AttributeError:
+            source = '<string>'
+            cache_file = "larkcache_%s" % str(hash(grammar)%(2**32))
+        else:
+            cache_file = "larkcache_%s" % os.path.basename(source)
+
+        # Drain file-like objects to get their contents
+        try:
+            read = grammar.read
+        except AttributeError:
+            pass
+        else:
+            grammar = read()
+
+        assert isinstance(grammar, STRING_TYPE)
+
+        if self.options.cache_grammar:
+            raise NotImplementedError("Not available yet")
+
+        self.tokens, self.rules = load_grammar(grammar)
+
+        self.lexer = self._build_lexer()
+        if not self.options.only_lex:
+            self.parser_engine = {
+                'lalr': LALR,
+                'earley': Earley,
+            }[self.options.parser]()
+            self.parser = self._build_parser()
+
+    def _build_lexer(self):
+        ignore_tokens = []
+        tokens = {}
+        for name, (value, flags) in self.tokens.items():
+            if 'ignore' in flags:
+                ignore_tokens.append(name)
+            tokens[name] = value
+        return Lexer(tokens.items(), {}, ignore=ignore_tokens)
+
+    def _build_parser(self):
+        transformer = self.options.transformer
+        callback = Callback()
+        rules = []
+        rule_tree_to_text = RuleTreeToText()
+        for origin, tree in self.rules.items():
+            for expansion, alias in rule_tree_to_text.transform(tree):
+                if alias and origin.startswith('_'):
+                    raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin)
+
+                expand1 = origin.startswith('?')
+                inline_args = origin.startswith('*') or (alias and alias.startswith('*'))
+                _origin = origin.lstrip('?*')
+                if alias:
+                    alias = alias.lstrip('*')
+                _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion))
+
+                assert not hasattr(callback, _alias)
+                f = getattr(transformer, alias or _origin, None)
+                if f is None:
+                    if alias:
+                        f = self._create_tree_builder_function(alias)
+                    else:
+                        f = self._create_tree_builder_function(_origin)
+                        if expand1:
+                            f = create_expand1_tree_builder_function(f)
+                else:
+                    if inline_args:
+                        f = create_rule_inline(f)
+
+                alias_handler = create_rule_handler(expansion, f)
+
+                setattr(callback, _alias, alias_handler)
+
+                rules.append((_origin, expansion, _alias))
+
+        return self.parser_engine.build_parser(rules, callback)
+
+    __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC
+
+    def _create_tree_builder_function(self, name):
+        tree_class = self.options.tree_class
+        def f(children):
+            return tree_class(name, children)
+        return f
+
+    def lex(self, text):
+        return self.lexer.lex(text)
+
+    def parse(self, text):
+        assert not self.options.only_lex
+        l = list(self.lex(text))
+        return self.parser.parse(l)
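A sketch (not part of the commit) of the only_lex option: no parser is built, and lex() is used on its own.

    from lark.lark import Lark

    lexer_only = Lark("""
        start: NUMBER

        NUMBER: /[0-9]+/
        WS.ignore: /\s+/
    """, only_lex=True)

    for tok in lexer_only.lex("12 34"):
        print repr(tok)    # Token(NUMBER, 12, 0), then Token(NUMBER, 34, 3)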
+ """ + self.options = LarkOptions(options) + + # Some, but not all file-like objects have a 'name' attribute + try: + source = grammar.name + except AttributeError: + source = '' + cache_file = "larkcache_%s" % str(hash(grammar)%(2**32)) + else: + cache_file = "larkcache_%s" % os.path.basename(source) + + # Drain file-like objects to get their contents + try: + read = grammar.read + except AttributeError: + pass + else: + grammar = read() + + assert isinstance(grammar, STRING_TYPE) + + if self.options.cache_grammar: + raise NotImplementedError("Not available yet") + + self.tokens, self.rules = load_grammar(grammar) + + self.lexer = self._build_lexer() + if not self.options.only_lex: + self.parser_engine = { + 'lalr': LALR, + 'earley': Earley, + }[self.options.parser]() + self.parser = self._build_parser() + + def _build_lexer(self): + ignore_tokens = [] + tokens = {} + for name, (value, flags) in self.tokens.items(): + if 'ignore' in flags: + ignore_tokens.append(name) + tokens[name] = value + return Lexer(tokens.items(), {}, ignore=ignore_tokens) + + + def _build_parser(self): + transformer = self.options.transformer + callback = Callback() + rules = [] + rule_tree_to_text = RuleTreeToText() + for origin, tree in self.rules.items(): + for expansion, alias in rule_tree_to_text.transform(tree): + if alias and origin.startswith('_'): + raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases" % origin) + + expand1 = origin.startswith('?') + inline_args = origin.startswith('*') or (alias and alias.startswith('*')) + _origin = origin.lstrip('?*') + if alias: + alias = alias.lstrip('*') + _alias = 'autoalias_%s_%s' % (_origin, '_'.join(expansion)) + + assert not hasattr(callback, _alias) + f = getattr(transformer, alias or _origin, None) + if f is None: + if alias: + f = self._create_tree_builder_function(alias) + else: + f = self._create_tree_builder_function(_origin) + if expand1: + f = create_expand1_tree_builder_function(f) + else: + if inline_args: + f = create_rule_inline(f) + + alias_handler = create_rule_handler(expansion, f) + + setattr(callback, _alias, alias_handler) + + rules.append((_origin, expansion, _alias)) + + return self.parser_engine.build_parser(rules, callback) + + + __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC + + def _create_tree_builder_function(self, name): + tree_class = self.options.tree_class + def f(children): + return tree_class(name, children) + return f + + def lex(self, text): + return self.lexer.lex(text) + + def parse(self, text): + assert not self.options.only_lex + l = list(self.lex(text)) + return self.parser.parse(l) + diff --git a/lark/lexer.py b/lark/lexer.py new file mode 100644 index 0000000..e118312 --- /dev/null +++ b/lark/lexer.py @@ -0,0 +1,84 @@ +## Lexer Implementation +from utils import Str + +class LexError(Exception): + pass + +class Token(Str): + def __new__(cls, type, value, pos_in_stream=None): + inst = Str.__new__(cls, value) + inst.type = type + inst.pos_in_stream = pos_in_stream + inst.value = value + return inst + +# class Token(object): +# def __init__(self, type, value, lexpos): +# self.type = type +# self.value = value +# self.lexpos = lexpos + + + def __repr__(self): + return 'Token(%s, %s, %s)' % (self.type, self.value, self.pos_in_stream) + +class Regex: + def __init__(self, pattern, flags=()): + self.pattern = pattern + self.flags = flags + + +import re +LIMIT = 50 # Stupid named groups limit in python re +class Lexer(object): + def __init__(self, tokens, 
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
new file mode 100644
index 0000000..0b41b91
--- /dev/null
+++ b/lark/load_grammar.py
@@ -0,0 +1,358 @@
+import re
+
+from lexer import Lexer, Token
+from grammar_analysis import GrammarAnalyzer
+from parser import Parser
+
+from tree import Tree as T, Transformer, Visitor
+
+_TOKEN_NAMES = {
+    ':' : 'COLON',
+    ',' : 'COMMA',
+    ';' : 'SEMICOLON',
+    '+' : 'PLUS',
+    '-' : 'MINUS',
+    '*' : 'STAR',
+    '/' : 'SLASH',
+    '|' : 'VBAR',
+    '!' : 'BANG',
+    '?' : 'QMARK',
+    '#' : 'HASH',
+    '$' : 'DOLLAR',
+    '&' : 'AMPERSAND',
+    '<' : 'LESSTHAN',
+    '>' : 'MORETHAN',
+    '=' : 'EQUAL',
+    '.' : 'DOT',
+    '%' : 'PERCENT',
+    '`' : 'BACKQUOTE',
+    '^' : 'CIRCUMFLEX',
+    '"' : 'DBLQUOTE',
+    '\'' : 'QUOTE',
+    '~' : 'TILDE',
+    '@' : 'AT',
+    '(' : 'LPAR',
+    ')' : 'RPAR',
+    '{' : 'LBRACE',
+    '}' : 'RBRACE',
+    '[' : 'LSQB',
+    ']' : 'RSQB',
+}
+
+# Grammar Parser
+TOKENS = {
+    'LPAR': '\(',
+    'RPAR': '\)',
+    'LBRA': '\[',
+    'RBRA': '\]',
+    'OP': '[+*?]',
+    'COLON': ':',
+    'OR': '\|',
+    'DOT': '\.',
+    'RULE': '[_?*]?[a-z][_a-z0-9]*',
+    'TOKEN': '_?[A-Z][_A-Z0-9]*',
+    'STRING': r'".*?[^\\]"',
+    'REGEXP': r"/(.|\n)*?[^\\]/",
+    'NL': r'(\r?\n)+\s*',
+    'WS': r'[ \t]+',
+    'COMMENT': r'#[^\n]*\n',
+    'TO': '->'
+}
+
+RULES = [
+    ('start', ['list']),
+    ('list', ['item']),
+    ('list', ['list', 'item']),
+    ('item', ['rule']),
+    ('item', ['token']),
+    ('item', ['NL']),
+
+    ('rule', ['RULE', 'COLON', 'expansions', 'NL']),
+    ('expansions', ['expansion']),
+    ('expansions', ['expansions', 'OR', 'expansion']),
+    ('expansions', ['expansions', 'NL', 'OR', 'expansion']),
+
+    ('expansion', ['_expansion']),
+    ('expansion', ['_expansion', 'TO', 'RULE']),
+
+    ('_expansion', ['expr']),
+    ('_expansion', ['_expansion', 'expr']),
+
+    ('expr', ['atom']),
+    ('expr', ['atom', 'OP']),
+
+    ('atom', ['LPAR', 'expansions', 'RPAR']),
+    ('atom', ['maybe']),
+
+    ('atom', ['RULE']),
+    ('atom', ['TOKEN']),
+    ('atom', ['anontoken']),
+
+    ('anontoken', ['tokenvalue']),
+
+    ('maybe', ['LBRA', 'expansions', 'RBRA']),
+
+    ('token', ['TOKEN', 'COLON', 'tokenvalue', 'NL']),
+    ('token', ['TOKEN', 'tokenmods', 'COLON', 'tokenvalue', 'NL']),
+    ('tokenvalue', ['REGEXP']),
+    ('tokenvalue', ['STRING']),
+    ('tokenmods', ['DOT', 'RULE']),
+    ('tokenmods', ['tokenmods', 'DOT', 'RULE']),
+]
+
+class SaveDefinitions(object):
+    def __init__(self):
+        self.rules = {}
+        self.tokens = {}
+        self.i = 0
+
+    def atom__3(self, _1, value, _2):
+        return value
+    def atom__1(self, value):
+        return value
+
+    def expr__1(self, expr):
+        return expr
+    def expr(self, *x):
+        return T('expr', x)
+
+    def expansion__1(self, expansion):
+        return expansion
+    def expansion__3(self, expansion, _, alias):
+        return T('alias', [expansion, alias])
+    def _expansion(self, *x):
+        return T('expansion', x)
+
+    def expansions(self, *x):
+        items = [i for i in x if isinstance(i, T)]
+        return T('expansions', items)
+
+    def maybe(self, _1, expr, _2):
+        return T('expr', [expr, Token('OP', '?', -1)])
+
+    def rule(self, name, _1, expansion, _2):
+        name = name.value
+        if name in self.rules:
+            raise ValueError("Rule '%s' defined more than once" % name)
+
+        self.rules[name] = expansion
+
+    def token(self, *x):
+        name = x[0].value
+        if name in self.tokens:
+            raise ValueError("Token '%s' defined more than once" % name)
+
+        if len(x) == 4:
+            self.tokens[name] = x[2][1], []
+        else:
+            self.tokens[name] = x[3][1], x[1].children
+
+    def tokenvalue(self, tokenvalue):
+        value = tokenvalue.value[1:-1]
+        if tokenvalue.type == 'STRING':
+            value = re.escape(value)
+        return tokenvalue, value
+
+    def anontoken(self, (token, value)):
+        if token.type == 'STRING':
+            try:
+                token_name = _TOKEN_NAMES[token.value[1:-1]]
+            except KeyError:
+                if value.isalnum() and value[0].isalpha():
+                    token_name = value.upper()
+                else:
+                    token_name = 'ANONSTR_%d' % self.i
+                    self.i += 1
+            token_name = '__' + token_name
+
+        elif token.type == 'REGEXP':
+            token_name = 'ANONRE_%d' % self.i
+            self.i += 1
+        else:
+            assert False, token
+
+        if token_name not in self.tokens:
+            self.tokens[token_name] = value, []
+
+        return Token('TOKEN', token_name, -1)
+
+    def tokenmods__2(self, _, rule):
+        return T('tokenmods', [rule.value])
+    def tokenmods__3(self, tokenmods, _, rule):
+        return T('tokenmods', tokenmods.children + [rule.value])
+
+    def start(self, *x): pass
+    def list(self, *x): pass
+    def item(self, *x): pass
+
+
+class EBNF_to_BNF(Transformer):
+    def __init__(self):
+        self.new_rules = {}
+        self.prefix = 'anon'
+        self.i = 0
+
+    def _add_recurse_rule(self, type_, expr):
+        new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
+        self.i += 1
+        t = Token('RULE', new_name, -1)
+        self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
+        return t
+
+    def expr(self, rule, op):
+        if op.value == '?':
+            return T('expansions', [rule, T('expansion', [])])
+        elif op.value == '+':
+            # a : b c+ d
+            #   -->
+            # a : b _c d
+            # _c : _c c | c;
+            return self._add_recurse_rule('plus', rule)
+        elif op.value == '*':
+            # a : b c* d
+            #   -->
+            # a : b _c? d
+            # _c : _c c | c;
+            new_name = self._add_recurse_rule('star', rule)
+            return T('expansions', [new_name, T('expansion', [])])
+        assert False, op
+
+
+class SimplifyRule_Visitor(Visitor):
+
+    @staticmethod
+    def _flatten(tree):
+        while True:
+            to_expand = [i for i, child in enumerate(tree.children)
+                         if isinstance(child, T) and child.data == tree.data]
+            if not to_expand:
+                break
+            tree.expand_kids_by_index(*to_expand)
+
+    def expansion(self, tree):
+        # rules_list unpacking
+        # a : b (c|d) e
+        #  -->
+        # a : b c e | b d e
+        #
+        # In AST terms:
+        # expansion(b, expansions(c, d), e)
+        #   -->
+        # expansions( expansion(b, c, e), expansion(b, d, e) )
+
+        while True:
+            self._flatten(tree)
+
+            for i, child in enumerate(tree.children):
+                if isinstance(child, T) and child.data == 'expansions':
+                    tree.data = 'expansions'
+                    tree.children = [self.visit(T('expansion', [option if i==j else other
+                                                                for j, other in enumerate(tree.children)]))
+                                     for option in child.children]
+                    break
+            else:
+                break
+
+    def alias(self, tree):
+        rule, alias_name = tree.children
+        if rule.data == 'expansions':
+            aliases = []
+            for child in tree.children[0].children:
+                aliases.append(T('alias', [child, alias_name]))
+            tree.data = 'expansions'
+            tree.children = aliases
+
+    expansions = _flatten
+
+
+def dict_update_safe(d1, d2):
+    for k, v in d2.iteritems():
+        assert k not in d1
+        d1[k] = v
+
+
+def generate_aliases():
+    sd = SaveDefinitions()
+    for name, expansion in RULES:
+        try:
+            f = getattr(sd, "%s__%s" % (name, len(expansion)))
+        except AttributeError:
+            f = getattr(sd, name)
+        yield name, expansion, f.__name__
+
+
+def inline_args(f):
+    def _f(self, args):
+        return f(*args)
+    return _f
+
+
+class GrammarLoader:
+    def __init__(self):
+        self.rules = list(generate_aliases())
+        self.ga = GrammarAnalyzer(self.rules)
+        self.ga.analyze()
+        self.lexer = Lexer(TOKENS.items(), {}, ignore=['WS', 'COMMENT'])
+        self.simplify_rule = SimplifyRule_Visitor()
+
+    def _generate_parser_callbacks(self, callbacks):
+        d = {alias: inline_args(getattr(callbacks, alias))
+             for _n, _x, alias in self.rules}
+        return type('Callback', (), d)()
+
+    def load_grammar(self, grammar_text):
+        sd = SaveDefinitions()
+        c = self._generate_parser_callbacks(sd)
+
+        p = Parser(self.ga, c)
+        p.parse( list(self.lexer.lex(grammar_text+"\n")) )
+
+        ebnf_to_bnf = EBNF_to_BNF()
+
+        rules = {name: ebnf_to_bnf.transform(r) for name, r in sd.rules.items()}
+        dict_update_safe(rules, ebnf_to_bnf.new_rules)
+
+        for r in rules.values():
+            self.simplify_rule.visit(r)
+
+        return sd.tokens, rules
+
+load_grammar = GrammarLoader().load_grammar
+
+
+def test():
+    g = """
+    start: add
+
+    # Rules
+    add: mul
+       | add _add_sym mul
+
+    mul: _atom
+       | mul _add_mul _atom
+
+    neg: "-" _atom
+
+    _atom: neg
+         | number
+         | "(" add ")"
+
+    # Tokens
+    number: /[\d.]+/
+    _add_sym: "+" | "-"
+    _add_mul: "*" | "/"
+
+    WS.ignore: /\s+/
+    """
+
+    g2 = """
+    start: a
+    a: "a" (b*|(c d)+) "b"?
+    b: "b"
+    c: "c"
+    d: "+" | "-"
+    """
+    load_grammar(g)
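A sketch (not part of the commit) of what load_grammar() returns: the token definitions, plus the rules as EBNF-free trees -- here "a"+ becomes an auto-generated left-recursive helper rule:

    from lark.load_grammar import load_grammar

    tokens, rules = load_grammar("""
        start: "a"+
    """)
    print tokens                  # {'__A': ('a', [])}
    print sorted(rules.keys())    # ['__anon_plus_0', 'start']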
diff --git a/lark/parser.py b/lark/parser.py
new file mode 100644
index 0000000..2695009
--- /dev/null
+++ b/lark/parser.py
@@ -0,0 +1,61 @@
+from grammar_analysis import ACTION_SHIFT
+
+class ParseError(Exception):
+    pass
+
+class Parser(object):
+    def __init__(self, ga, callback, temp=False):
+        self.ga = ga
+        self.callbacks = {rule: getattr(callback, rule.alias or rule.origin, None)
+                          for rule in ga.rules}
+
+    def parse(self, seq):
+        states_idx = self.ga.states_idx
+
+        stack = [(None, self.ga.init_state_idx)]
+        i = 0
+        res = None
+
+        def get_action(key):
+            state = stack[-1][1]
+            try:
+                return states_idx[state][key]
+            except KeyError:
+                expected = states_idx[state].keys()
+                context = ' '.join(['%s(%r)' % (t.type, t.value) for t in seq[i:i+5]])
+                raise ParseError("Unexpected input %r.\nExpected: %s\nContext: %s" % (key, expected, context))
+
+        def reduce(rule):
+            s = stack[-len(rule.expansion):]
+            del stack[-len(rule.expansion):]
+
+            res = self.callbacks[rule]([x[0] for x in s])
+
+            if rule.origin == 'start':
+                return res
+
+            _action, new_state = get_action(rule.origin)
+            assert _action == ACTION_SHIFT
+            stack.append((res, new_state))
+
+        # Main LALR-parser loop
+        while i < len(seq):
+            action, arg = get_action(seq[i].type)
+
+            if action == ACTION_SHIFT:
+                stack.append((seq[i], arg))
+                i += 1
+            else:
+                reduce(arg)
+
+        while len(stack) > 1:
+            _action, rule = get_action('$end')
+            assert _action == 'reduce'
+            res = reduce(rule)
+            if res:
+                break
+
+        assert stack == [(None, self.ga.init_state_idx)], len(stack)
+        return res
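A sketch (not part of the commit) of driving the LALR parser directly: rules are (origin, expansion) tuples, and each reduction calls the callback method named after the rule's alias or origin:

    from lark.grammar_analysis import GrammarAnalyzer
    from lark.parser import Parser
    from lark.lexer import Token

    class Callback(object):
        def start(self, children):
            return children
        def item(self, children):
            return int(children[0])

    ga = GrammarAnalyzer([('start', ['item']), ('item', ['NUMBER'])])
    ga.analyze()
    print Parser(ga, Callback()).parse([Token('NUMBER', '42', 0)])    # prints [42]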
diff --git a/lark/tree.py b/lark/tree.py
new file mode 100644
index 0000000..e840b46
--- /dev/null
+++ b/lark/tree.py
@@ -0,0 +1,83 @@
+class Tree(object):
+    def __init__(self, data, children):
+        self.data = data
+        self.children = list(children)
+
+    def __repr__(self):
+        return 'Tree(%s, %s)' % (self.data, self.children)
+
+    def _pretty(self, level, indent_str):
+        if len(self.children) == 1 and not isinstance(self.children[0], Tree):
+            return [ indent_str*level, self.data, '\t', '%s' % self.children[0], '\n']
+
+        l = [ indent_str*level, self.data, '\n' ]
+        for n in self.children:
+            if isinstance(n, Tree):
+                l += n._pretty(level+1, indent_str)
+            else:
+                l += [ indent_str*(level+1), '%s' % n, '\n' ]
+
+        return l
+
+    def pretty(self, indent_str=' '):
+        return ''.join(self._pretty(0, indent_str))
+
+    def expand_kids_by_index(self, *indices):
+        for i in sorted(indices, reverse=True):    # reverse so that changing tail won't affect indices
+            kid = self.children[i]
+            self.children[i:i+1] = kid.children
+
+    # def find_path(self, pred):
+    #     if pred(self):
+    #         yield []
+    #     else:
+    #         for i, c in enumerate(self.children):
+    #             if isinstance(c, Tree):
+    #                 for path in c.find_path(pred):
+    #                     yield [i] + path
+
+    # def follow_path(self, path):
+    #     x = self
+    #     for step in path:
+    #         x = x.children[step]
+    #     return x
+
+    # def set_at_path(self, path, value):
+    #     x = self.follow_path(path[:-1])
+    #     x.children[path[-1]] = value
+
+    def clone(self):
+        return Tree(self.data, [c.clone() if isinstance(c, Tree) else c for c in self.children])
+
+
+class Transformer(object):
+    def transform(self, tree):
+        items = [self.transform(c) if isinstance(c, Tree) else c for c in tree.children]
+        try:
+            f = getattr(self, tree.data)
+        except AttributeError:
+            return self.__default__(tree.data, items)
+        else:
+            return f(*items)
+
+    def __default__(self, data, children):
+        return Tree(data, children)
+
+
+class Visitor(object):
+    def visit(self, tree):
+        for child in tree.children:
+            if isinstance(child, Tree):
+                self.visit(child)
+
+        f = getattr(self, tree.data, self.__default__)
+        f(tree)
+        return tree
+
+    def __default__(self, tree):
+        pass
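A sketch (not part of the commit): Transformer rebuilds bottom-up, replacing each node with whatever its handler returns; nodes without a handler go through __default__, which just rebuilds the Tree:

    from lark.tree import Tree, Transformer

    class DoubleNumbers(Transformer):
        def num(self, n):
            return n * 2

    t = Tree('add', [Tree('num', [3]), Tree('num', [4])])
    print DoubleNumbers().transform(t)    # Tree(add, [6, 8])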
diff --git a/lark/utils.py b/lark/utils.py
new file mode 100644
index 0000000..2ec1490
--- /dev/null
+++ b/lark/utils.py
@@ -0,0 +1,51 @@
+from collections import deque
+
+class fzset(frozenset):
+    def __repr__(self):
+        return '{%s}' % ', '.join(map(repr, self))
+
+
+def classify_bool(seq, pred):
+    true_elems = []
+    false_elems = []
+
+    for elem in seq:
+        if pred(elem):
+            true_elems.append(elem)
+        else:
+            false_elems.append(elem)
+
+    return true_elems, false_elems
+
+def classify(seq, key=None):
+    d = {}
+    for item in seq:
+        k = key(item) if (key is not None) else item
+        if k in d:
+            d[k].append(item)
+        else:
+            d[k] = [item]
+    return d
+
+def bfs(initial, expand):
+    open_q = deque(list(initial))
+    visited = set(open_q)
+    while open_q:
+        node = open_q.popleft()
+        yield node
+        for next_node in expand(node):
+            if next_node not in visited:
+                visited.add(next_node)
+                open_q.append(next_node)
+
+
+try:
+    STRING_TYPE = basestring
+except NameError:   # Python 3
+    STRING_TYPE = str
+
+Str = type(u'')
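Finally, a sketch (not part of the commit) of the two utils the analyzer leans on -- bfs() yields nodes in breadth-first order while tolerating cycles, and classify() buckets a sequence by key:

    from lark.utils import bfs, classify

    graph = {1: [2, 3], 2: [4], 3: [4], 4: []}
    print list(bfs([1], lambda n: graph[n]))      # [1, 2, 3, 4]
    print classify(['foo', 'bar', 'baz'], len)    # {3: ['foo', 'bar', 'baz']}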