| @@ -12,6 +12,7 @@ Lark can: | |||
| - Build a parse-tree automagically, no construction code required | |||
| - Outperform all other Python libraries when using LALR(1) (Yes, including PLY) | |||
| - Run on every Python interpreter (it's pure-python) | |||
| - Generate a stand-alone parser (for LALR(1) grammars) | |||
| And many more features. Read ahead and find out. | |||
| @@ -66,10 +67,11 @@ See more [examples in the wiki](https://github.com/erezsh/lark/wiki/Examples) | |||
| - Builds a parse-tree (AST) automagically, based on the structure of the grammar | |||
| - **Earley** parser | |||
| - Can parse *ALL* context-free grammars | |||
| - Full support for ambiguity in grammar | |||
| - Can parse all context-free grammars | |||
| - Full support for ambiguous grammars | |||
| - **LALR(1)** parser | |||
| - Competitive with PLY | |||
| - Fast and light, competitive with PLY | |||
| - Can generate a stand-alone parser | |||
| - **EBNF** grammar | |||
| - **Unicode** fully supported | |||
| - **Python 2 & 3** compatible | |||
| @@ -86,7 +88,7 @@ See the full list of [features in the wiki](https://github.com/erezsh/lark/wiki/ | |||
| #### Performance comparison | |||
| Lower is better! | |||
| Lark is the fastest and lightest (lower is better) | |||
|  | |||
| @@ -99,17 +101,17 @@ Check out the [JSON tutorial](/docs/json_tutorial.md#conclusion) for more detail | |||
| #### Feature comparison | |||
| | Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | |||
| |:--------|:----------|:----|:--------|:------------|:------------ | |||
| | **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | | |||
| | [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | | |||
| | [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | | |||
| | [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | | |||
| | [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | | |||
| | [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | | |||
| | Library | Algorithm | Grammar | Builds tree? | Supports ambiguity? | Can handle every CFG? | Line/Column tracking | Generates Stand-alone | |||
| |:--------|:----------|:----|:--------|:------------|:------------|:----------|:---------- | |||
| | **Lark** | Earley/LALR(1) | EBNF | Yes! | Yes! | Yes! | Yes! | Yes! (LALR only) | | |||
| | [PLY](http://www.dabeaz.com/ply/) | LALR(1) | BNF | No | No | No | No | No | | |||
| | [PyParsing](http://pyparsing.wikispaces.com/) | PEG | Combinators | No | No | No\* | No | No | | |||
| | [Parsley](https://pypi.python.org/pypi/Parsley) | PEG | EBNF | No | No | No\* | No | No | | |||
| | [funcparserlib](https://github.com/vlasovskikh/funcparserlib) | Recursive-Descent | Combinators | No | No | No | No | No | | |||
| | [Parsimonious](https://github.com/erikrose/parsimonious) | PEG | EBNF | Yes | No | No\* | No | No | | |||
| (\* *According to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*) | |||
| (\* *PEGs cannot handle non-deterministic grammars. Also, according to Wikipedia, it remains unanswered whether PEGs can really parse all deterministic CFGs*) | |||
| ### Projects using Lark | |||
| @@ -0,0 +1 @@ | |||
| python -m lark.tools.standalone json.g > json_parser.py | |||
| @@ -0,0 +1,21 @@ | |||
| ?start: value | |||
| ?value: object | |||
| | array | |||
| | string | |||
| | SIGNED_NUMBER -> number | |||
| | "true" -> true | |||
| | "false" -> false | |||
| | "null" -> null | |||
| array : "[" [value ("," value)*] "]" | |||
| object : "{" [pair ("," pair)*] "}" | |||
| pair : string ":" value | |||
| string : ESCAPED_STRING | |||
| %import common.ESCAPED_STRING | |||
| %import common.SIGNED_NUMBER | |||
| %import common.WS | |||
| %ignore WS | |||
| @@ -0,0 +1,794 @@ | |||
| # The file was automatically generated by Lark v0.5.2 | |||
| # | |||
| # | |||
| # Lark Stand-alone Generator Tool | |||
| # ---------------------------------- | |||
| # Generates a stand-alone LALR(1) parser with a standard lexer | |||
| # | |||
| # Git: https://github.com/erezsh/lark | |||
| # Author: Erez Shinan (erezshin@gmail.com) | |||
| # | |||
| # | |||
| # >>> LICENSE | |||
| # | |||
| # This tool and its generated code use a separate license from Lark. | |||
| # | |||
| # It is licensed under GPLv2 or above. | |||
| # | |||
| # If you wish to purchase a commercial license for this tool and its | |||
| # generated code, contact me via email. | |||
| # | |||
| # This program is free software: you can redistribute it and/or modify | |||
| # it under the terms of the GNU General Public License as published by | |||
| # the Free Software Foundation, either version 2 of the License, or | |||
| # (at your option) any later version. | |||
| # | |||
| # This program is distributed in the hope that it will be useful, | |||
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
| # GNU General Public License for more details. | |||
| # | |||
| # See <http://www.gnu.org/licenses/>. | |||
| # | |||
| # | |||
| import types | |||
| import functools | |||
| from contextlib import contextmanager | |||
| Str = type(u'') | |||
def inline_args(f):
    """Wrap callable *f* so it can be invoked as ``g(self, args)`` with the
    sequence *args* spread into positional arguments.

    Dispatches on the kind of callable to keep the binding correct:
    plain functions receive ``self`` explicitly, classes/builtins drop it,
    bound methods and generic callables go through ``__func__``.
    """
    if isinstance(f, types.FunctionType):
        @functools.wraps(f)
        def _spread_func(self, args):
            return f(self, *args)
        return _spread_func
    if isinstance(f, (type, types.BuiltinFunctionType)):
        @functools.wraps(f)
        def _spread_builtin(_self, args):
            return f(*args)
        return _spread_builtin
    if isinstance(f, types.MethodType):
        @functools.wraps(f.__func__)
        def _spread_method(self, args):
            return f.__func__(self, *args)
        return _spread_method
    # Fall back to a generic callable object: unwrap its __call__.
    @functools.wraps(f.__call__.__func__)
    def _spread_callable(self, args):
        return f.__call__.__func__(self, *args)
    return _spread_callable
# Python 2 has no contextlib.suppress; provide a minimal backport.
try:
    from contextlib import suppress # Python 3
except ImportError:
    @contextmanager
    def suppress(*excs):
        '''Catch and dismiss the provided exception
        >>> x = 'hello'
        >>> with suppress(IndexError):
        ...     x = x[10]
        >>> x
        'hello'
        '''
        try:
            yield
        except excs:
            pass
def is_terminal(sym):
    """True for terminal symbols, which are written all-uppercase by convention."""
    return sym.isupper()
class GrammarError(Exception):
    """Raised for errors in the grammar definition itself."""
    pass
class ParseError(Exception):
    """Base class for errors raised while parsing input."""
    pass
class UnexpectedToken(ParseError):
    """Parse error: the next token matches no action in the current state.

    Captures the offending token, the expected token types, and up to five
    tokens of context for the error message.
    """
    def __init__(self, token, expected, seq, index):
        self.token = token
        self.expected = expected
        # Tokens carry line/column when produced by the lexer; '?' otherwise.
        self.line = getattr(token, 'line', '?')
        self.column = getattr(token, 'column', '?')
        try:
            context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]])
        except AttributeError:
            # seq items are not Token-like; show the raw slice instead.
            context = seq[index:index+5]
        except TypeError:
            # seq is not sliceable (e.g. a plain iterator).
            context = "<no context>"
        message = ("Unexpected token %r at line %s, column %s.\n"
                   "Expected: %s\n"
                   "Context: %s" % (token, self.line, self.column, expected, context))
        super(UnexpectedToken, self).__init__(message)
class Tree(object):
    """A parse-tree node: a rule name (``data``) plus a list of children."""
    def __init__(self, data, children):
        self.data = data
        self.children = list(children)

    def __repr__(self):
        return 'Tree(%s, %s)' % (self.data, self.children)

    def _pretty_label(self):
        return self.data

    def _pretty(self, level, indent_str):
        # A lone non-tree child renders inline on the same line as the label.
        if len(self.children) == 1 and not isinstance(self.children[0], Tree):
            return [indent_str * level, self._pretty_label(), '\t', '%s' % self.children[0], '\n']
        parts = [indent_str * level, self._pretty_label(), '\n']
        for child in self.children:
            if isinstance(child, Tree):
                parts.extend(child._pretty(level + 1, indent_str))
            else:
                parts.extend([indent_str * (level + 1), '%s' % child, '\n'])
        return parts

    def pretty(self, indent_str='  '):
        """Return an indented, human-readable rendering of the tree."""
        return ''.join(self._pretty(0, indent_str))
class Transformer(object):
    """Rebuilds a tree bottom-up, dispatching each node to a same-named method."""
    def _get_func(self, name):
        return getattr(self, name)

    def transform(self, tree):
        """Transform *tree* depth-first; children whose handler raises
        Discard are dropped from their parent's child list."""
        new_children = []
        for child in tree.children:
            try:
                value = self.transform(child) if isinstance(child, Tree) else child
            except Discard:
                continue
            new_children.append(value)
        try:
            handler = self._get_func(tree.data)
        except AttributeError:
            return self.__default__(tree.data, new_children)
        return handler(new_children)

    def __default__(self, data, children):
        # No handler for this rule: rebuild the node unchanged.
        return Tree(data, children)

    def __mul__(self, other):
        return TransformerChain(self, other)
class Discard(Exception):
    """Raise inside a transformer callback to drop the item from its parent."""
    pass
class TransformerChain(object):
    """Applies several transformers in sequence; produced by ``t1 * t2``."""
    def __init__(self, *transformers):
        self.transformers = transformers

    def transform(self, tree):
        for transformer in self.transformers:
            tree = transformer.transform(tree)
        return tree

    def __mul__(self, other):
        # Chaining flattens: (a*b)*c keeps a single tuple of transformers.
        return TransformerChain(*self.transformers + (other,))
class InlineTransformer(Transformer):
    """Transformer whose callbacks receive children as positional arguments."""
    def _get_func(self, name): # use super()._get_func
        # Wrap the handler so it is invoked as f(*children) instead of f(children).
        return inline_args(getattr(self, name)).__get__(self)
class Visitor(object):
    """Visits every subtree bottom-up, calling a same-named method per rule.

    Unlike Transformer, a visitor only inspects nodes; the tree itself is
    returned unchanged.
    """
    def visit(self, tree):
        for child in tree.children:
            if isinstance(child, Tree):
                self.visit(child)
        handler = getattr(self, tree.data, self.__default__)
        handler(tree)
        return tree

    def __default__(self, tree):
        pass
class Visitor_NoRecurse(Visitor):
    """Like Visitor, but iterates subtrees instead of recursing (no stack limit)."""
    def visit(self, tree):
        # NOTE(review): relies on tree.iter_subtrees(), which the Tree class
        # in this generated file does not define -- presumably supplied by
        # the full library's Tree; confirm before use.
        subtrees = list(tree.iter_subtrees())
        for subtree in (subtrees):
            getattr(self, subtree.data, self.__default__)(subtree)
        return tree
class Transformer_NoRecurse(Transformer):
    """Like Transformer, but iterates subtrees instead of recursing."""
    def transform(self, tree):
        # NOTE(review): relies on tree.iter_subtrees(), which the Tree class
        # in this generated file does not define; confirm it yields children
        # before parents so handlers run bottom-up.
        subtrees = list(tree.iter_subtrees())
        def _t(t):
            # Assumes t is already transformed
            try:
                f = self._get_func(t.data)
            except AttributeError:
                return self.__default__(t)
            else:
                return f(t)
        # Replace each subtree's children in place, dropping Discarded ones.
        for subtree in subtrees:
            children = []
            for c in subtree.children:
                try:
                    children.append(_t(c) if isinstance(c, Tree) else c)
                except Discard:
                    pass
            subtree.children = children
        return _t(tree)

    def __default__(self, t):
        # Receives the whole node (unlike Transformer.__default__) and
        # leaves it as-is.
        return t
class Indenter:
    """Postlexer that converts newline tokens into INDENT/DEDENT tokens.

    Subclasses are expected to define NL_type, INDENT_type, DEDENT_type,
    OPEN_PAREN_types, CLOSE_PAREN_types and tab_len -- none are defined
    here, so the class is abstract in practice.
    """
    def __init__(self):
        self.paren_level = 0
        self.indent_level = [0]

    def handle_NL(self, token):
        # Newlines inside brackets are not significant.
        if self.paren_level > 0:
            return
        yield token
        indent_str = token.rsplit('\n', 1)[1] # Tabs and spaces
        indent = indent_str.count(' ') + indent_str.count('\t') * self.tab_len
        if indent > self.indent_level[-1]:
            self.indent_level.append(indent)
            yield Token.new_borrow_pos(self.INDENT_type, indent_str, token)
        else:
            # Emit one DEDENT per closed indentation level.
            while indent < self.indent_level[-1]:
                self.indent_level.pop()
                yield Token.new_borrow_pos(self.DEDENT_type, indent_str, token)
            assert indent == self.indent_level[-1], '%s != %s' % (indent, self.indent_level[-1])

    def process(self, stream):
        """Yield *stream* with INDENT/DEDENT tokens inserted after newlines."""
        for token in stream:
            if token.type == self.NL_type:
                for t in self.handle_NL(token):
                    yield t
            else:
                yield token
            if token.type in self.OPEN_PAREN_types:
                self.paren_level += 1
            elif token.type in self.CLOSE_PAREN_types:
                self.paren_level -= 1
                assert self.paren_level >= 0
        # Close any indentation still open at end of input.
        while len(self.indent_level) > 1:
            self.indent_level.pop()
            yield Token(self.DEDENT_type, '')
        assert self.indent_level == [0], self.indent_level

    # XXX Hack for ContextualLexer. Maybe there's a more elegant solution?
    @property
    def always_accept(self):
        return (self.NL_type,)
class LexError(Exception):
    """Base class for errors raised while tokenizing input."""
    pass
class UnexpectedInput(LexError):
    """Lex error: no token pattern matches the input at *lex_pos*."""
    def __init__(self, seq, lex_pos, line, column, allowed=None):
        # Include up to five characters of context in the message.
        context = seq[lex_pos:lex_pos+5]
        message = "No token defined for: '%s' in %r at line %d col %d" % (seq[lex_pos], context, line, column)
        super(UnexpectedInput, self).__init__(message)
        self.line = line
        self.column = column
        self.context = context
        self.allowed = allowed
class Token(Str):
    """A lexed token: a string subclass carrying type and position metadata."""
    def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
        inst = Str.__new__(cls, value)
        inst.type = type_
        inst.pos_in_stream = pos_in_stream
        inst.value = value
        inst.line = line
        inst.column = column
        return inst

    @classmethod
    def new_borrow_pos(cls, type_, value, borrow_t):
        # Build a token that reuses the position info of *borrow_t*.
        return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)

    def __repr__(self):
        return 'Token(%s, %r)' % (self.type, self.value)

    def __deepcopy__(self, memo):
        return Token(self.type, self.value, self.pos_in_stream, self.line, self.column)

    def __eq__(self, other):
        # Tokens of different types never compare equal, even with equal text;
        # comparison against plain strings falls back to string equality.
        if isinstance(other, Token) and self.type != other.type:
            return False
        return Str.__eq__(self, other)

    # Hashing stays by string value, consistent with falling back to
    # string equality above.
    __hash__ = Str.__hash__
class LineCounter:
    """Tracks absolute position, line and column while consuming text."""
    def __init__(self):
        self.newline_char = '\n'
        self.char_pos = 0
        self.line = 1
        self.column = 0
        self.line_start_pos = 0

    def feed(self, token, test_newline=True):
        """Consume *token* and update char_pos, line and column.

        Pass test_newline=False as an optimization if the token is known
        not to contain a newline.
        """
        if test_newline:
            newline_count = token.count(self.newline_char)
            if newline_count:
                self.line += newline_count
                # Column is measured from the character after the last newline.
                self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1
        self.char_pos += len(token)
        self.column = self.char_pos - self.line_start_pos
class _Lex:
    "Built to serve both Lexer and ContextualLexer"
    def __init__(self, lexer):
        self.lexer = lexer

    def lex(self, stream, newline_types, ignore_types):
        """Generate Tokens from the string *stream*.

        The generator accepts a replacement lexer via ``send()`` after each
        token, which is how a contextual lexer switches token sets.
        Raises UnexpectedInput when no pattern matches.
        """
        # Normalize both type collections to lists. (The previous code
        # normalized newline_types twice and left ignore_types untouched --
        # an apparent copy/paste slip; harmless for list inputs, wrong for
        # one-shot iterables.)
        newline_types = list(newline_types)
        ignore_types = list(ignore_types)
        line_ctr = LineCounter()
        while True:
            lexer = self.lexer
            for mre, type_from_index in lexer.mres:
                # Anchored match at the current position only.
                m = mre.match(stream, line_ctr.char_pos)
                if m:
                    value = m.group(0)
                    type_ = type_from_index[m.lastindex]
                    if type_ not in ignore_types:
                        t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                        if t.type in lexer.callback:
                            t = lexer.callback[t.type](t)
                        # The consumer may send() a new lexer (contextual lexing).
                        lexer = yield t
                    line_ctr.feed(value, type_ in newline_types)
                    break
            else:
                # No pattern matched: either end of input, or an unlexable
                # character remains.
                if line_ctr.char_pos < len(stream):
                    raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column)
                break
class UnlessCallback:
    """Token callback that retypes a token when one of its patterns matches
    the token's value (keyword/%unless disambiguation)."""
    def __init__(self, mres):
        self.mres = mres

    def __call__(self, t):
        for regexp, type_by_index in self.mres:
            m = regexp.match(t.value)
            if m:
                t.type = type_by_index[m.lastindex]
                break
        return t
class NodeBuilder:
    """Callable that builds a tree node of a fixed rule name from children."""
    def __init__(self, tree_class, name):
        self.tree_class = tree_class
        self.name = name

    def __call__(self, children):
        make_node = self.tree_class
        return make_node(self.name, children)
class Expand1:
    """Wrapper that inlines single-child nodes (the '?rule' modifier)."""
    def __init__(self, node_builder):
        self.node_builder = node_builder

    def __call__(self, children):
        # A lone child replaces the node entirely.
        return children[0] if len(children) == 1 else self.node_builder(children)
class Factory:
    """Partially-applied constructor: ``Factory(cls, a)(nb) == cls(nb, a)``."""
    def __init__(self, cls, *args):
        self.cls = cls
        self.args = args

    def __call__(self, node_builder):
        return self.cls(node_builder, *self.args)
class TokenWrapper:
    "Used for fixing the results of scanless parsing"
    def __init__(self, node_builder, token_name):
        self.node_builder = node_builder
        self.token_name = token_name

    def __call__(self, children):
        # Join the matched characters back into a single Token of the
        # configured type before building the node.
        return self.node_builder( [Token(self.token_name, ''.join(children))] )
def identity(node_builder):
    # No-op wrapper used when a rule needs no child filtering.
    return node_builder
class ChildFilter:
    """Keeps only selected children, optionally splicing in a child's own
    children (drops filtered-out symbols, expands inlined rules)."""
    def __init__(self, node_builder, to_include):
        self.node_builder = node_builder
        self.to_include = to_include

    def __call__(self, children):
        kept = []
        for index, expand in self.to_include:
            child = children[index]
            if expand:
                kept.extend(child.children)
            else:
                kept.append(child)
        return self.node_builder(kept)
def create_rule_handler(expansion, keep_all_tokens, filter_out):
    """Choose the child-filtering wrapper for a rule expansion.

    Symbols are dropped when they are underscore-prefixed terminals or are
    listed in *filter_out*; underscore-prefixed non-terminals are expanded
    in place. Returns ``identity`` when no filtering is needed.
    """
    # if not keep_all_tokens:
    to_include = [(i, not is_terminal(sym) and sym.startswith('_'))
                  for i, sym in enumerate(expansion)
                  if keep_all_tokens
                  or not ((is_terminal(sym) and sym.startswith('_')) or sym in filter_out)
                  ]
    if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
        return Factory(ChildFilter, to_include)
    # else, if no filtering required..
    return identity
class PropagatePositions:
    """Wrapper that copies line/column info from the first and last children
    onto the built node."""
    def __init__(self, node_builder):
        self.node_builder = node_builder

    def __call__(self, children):
        res = self.node_builder(children)
        if children:
            # Only the first child is consulted for the start position (the
            # loop breaks unconditionally); if it lacks position attributes,
            # the node gets none.
            for a in children:
                with suppress(AttributeError):
                    res.line = a.line
                    res.column = a.column
                break
            # Likewise only the last child for the end position.
            for a in reversed(children):
                with suppress(AttributeError):
                    res.end_line = a.end_line
                    res.end_col = a.end_col
                break
        return res
class Callback(object):
    """Empty namespace; per-rule callbacks are attached as attributes."""
    pass
class ParseTreeBuilder:
    """Builds, per grammar rule, the callback chain that turns a reduction's
    children into a tree node (filtering, expansion, position propagation)."""
    def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False):
        self.tree_class = tree_class
        self.propagate_positions = propagate_positions
        self.always_keep_all_tokens = keep_all_tokens
        self.rule_builders = list(self._init_builders(rules))
        self.user_aliases = {}

    def _init_builders(self, rules):
        # Collect rules flagged filter_out: their nodes are removed from
        # their parents' child lists.
        filter_out = set()
        for rule in rules:
            if rule.options and rule.options.filter_out:
                assert rule.origin.startswith('_') # Just to make sure
                filter_out.add(rule.origin)
        for rule in rules:
            options = rule.options
            keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False)
            expand1 = options.expand1 if options else False
            create_token = options.create_token if options else False
            # Wrappers are applied innermost-first; falsy entries are dropped.
            # NOTE(review): in Python 3 ``filter`` returns a one-shot
            # iterator, so create_callback can only be run once per builder
            # -- confirm that is intended.
            wrapper_chain = filter(None, [
                (expand1 and not rule.alias) and Expand1,
                create_token and Factory(TokenWrapper, create_token),
                create_rule_handler(rule.expansion, keep_all_tokens, filter_out),
                self.propagate_positions and PropagatePositions,
            ])
            yield rule, wrapper_chain

    def create_callback(self, transformer=None):
        """Return a Callback object with one internal callback per rule.

        Side effect: each rule's ``alias`` is overwritten with the internal
        callback name; the original alias is saved in ``self.user_aliases``.
        """
        callback = Callback()
        for rule, wrapper_chain in self.rule_builders:
            internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion))
            user_callback_name = rule.alias or rule.origin
            try:
                # Prefer the user transformer's handler when it defines one.
                f = transformer._get_func(user_callback_name)
            except AttributeError:
                f = NodeBuilder(self.tree_class, user_callback_name)
            self.user_aliases[rule] = rule.alias
            rule.alias = internal_callback_name
            for w in wrapper_chain:
                f = w(f)
            if hasattr(callback, internal_callback_name):
                raise GrammarError("Rule '%s' already exists" % (rule,))
            setattr(callback, internal_callback_name, f)
        return callback
class _Parser:
    """LALR(1) parser engine driving the generated parse table."""
    def __init__(self, parse_table, callbacks):
        self.states = parse_table.states
        self.start_state = parse_table.start_state
        self.end_state = parse_table.end_state
        self.callbacks = callbacks

    def parse(self, seq, set_state=None):
        """Parse the token iterable *seq* and return the root parse value.

        *set_state*, when given, is called with each new parser state (used
        by the contextual lexer). Raises UnexpectedToken on a syntax error.
        """
        i = 0
        token = None
        stream = iter(seq)
        states = self.states
        state_stack = [self.start_state]
        value_stack = []
        if set_state: set_state(self.start_state)
        def get_action(key):
            # Look up the action for *key* in the current state; a miss is a
            # syntax error.
            state = state_stack[-1]
            try:
                return states[state][key]
            except KeyError:
                expected = states[state].keys()
                raise UnexpectedToken(token, expected, seq, i)
        def reduce(rule):
            # Pop the rule body off both stacks, run its callback, then
            # "goto" on the rule's origin symbol.
            size = len(rule.expansion)
            if size:
                s = value_stack[-size:]
                del state_stack[-size:]
                del value_stack[-size:]
            else:
                s = []
            value = self.callbacks[rule](s)
            _action, new_state = get_action(rule.origin)
            assert _action is Shift
            state_stack.append(new_state)
            value_stack.append(value)
        # Main LALR-parser loop
        try:
            token = next(stream)
            i += 1
            while True:
                action, arg = get_action(token.type)
                assert arg != self.end_state
                if action is Shift:
                    state_stack.append(arg)
                    value_stack.append(token)
                    if set_state: set_state(arg)
                    token = next(stream)
                    i += 1
                else:
                    reduce(arg)
        except StopIteration:
            pass
        # Input exhausted: keep reducing until we can accept on $END.
        while True:
            _action, arg = get_action('$END')
            if _action is Shift:
                assert arg == self.end_state
                # Exactly one value must remain: the parse result.
                val ,= value_stack
                return val
            else:
                reduce(arg)
class Rule(object):
    """A grammar rule.

    origin    -- the left-hand-side symbol
    expansion -- list of symbols on the right-hand side
    alias     -- optional user-visible name for the rule's tree node
    options   -- optional RuleOptions instance
    """
    def __init__(self, origin, expansion, alias=None, options=None):
        self.origin = origin
        self.expansion = expansion
        self.alias = alias
        self.options = options

    def __str__(self):
        body = ' '.join(str(sym) for sym in self.expansion)
        return '<%s : %s>' % (self.origin, body)

    def __repr__(self):
        return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options)
class RuleOptions:
    """Per-rule flags collected from the grammar.

    keep_all_tokens -- keep otherwise-filtered tokens in the tree
    expand1         -- inline nodes that have a single child ('?rule')
    create_token    -- token name to build (scanless postprocessing)
    filter_out      -- remove this rule's node from its parent
                       (used for "token"-rules in scanless)
    priority        -- rule priority, if any
    """
    def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None):
        self.keep_all_tokens = keep_all_tokens
        self.expand1 = expand1
        self.create_token = create_token
        self.priority = priority
        self.filter_out = filter_out

    def __repr__(self):
        return 'RuleOptions(%r, %r, %r, %r, %r)' % (
            self.keep_all_tokens,
            self.expand1,
            self.create_token,
            self.priority,
            self.filter_out
        )
# Action codes used in the parse table.
Shift = 0
Reduce = 1
import re
# --- Generated lexer data ---
# One master regex alternating all named token patterns, paired with a map
# from regex group index to token-type name.
MRES = (
[('(?P<SIGNED_NUMBER>(?:(?:\\+|\\-))?(?:(?:(?:[0-9])+(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+|(?:(?:[0-9])+\\.(?:(?:[0-9])+)?|\\.(?:[0-9])+)(?:(?:e|E)(?:(?:\\+|\\-))?(?:[0-9])+)?)|(?:[0-9])+))|(?P<ESCAPED_STRING>\\"(?:(?:\\\\\\"|[^"]))*\\")|(?P<WS>(?:[ \t\x0c'
'\r\n'
'])+)|(?P<__FALSE1>false)|(?P<__NULL2>null)|(?P<__TRUE0>true)|(?P<__COLON>\\:)|(?P<__COMMA>\\,)|(?P<__LBRACE>\\{)|(?P<__LSQB>\\[)|(?P<__RBRACE>\\})|(?P<__RSQB>\\])',
{1: 'SIGNED_NUMBER',
2: 'ESCAPED_STRING',
3: 'WS',
4: '__FALSE1',
5: '__NULL2',
6: '__TRUE0',
7: '__COLON',
8: '__COMMA',
9: '__LBRACE',
10: '__LSQB',
11: '__RBRACE',
12: '__RSQB'})]
)
# Per-token-type callbacks (keyword disambiguation); empty for this grammar.
LEXER_CALLBACK = (
{}
)
NEWLINE_TYPES = ['WS']
IGNORE_TYPES = ['WS']
class LexerRegexps: pass
lexer_regexps = LexerRegexps()
lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]
lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])
for n, mres in LEXER_CALLBACK.items()}
lexer = _Lex(lexer_regexps)
def lex(stream):
    # Tokenize *stream* (a string) using the generated lexer tables.
    return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES)
# --- Generated parser tables ---
# Rule index -> Rule object (origin, expansion, alias, options).
RULES = {
  0: Rule('start', ['value'], None, RuleOptions(False, True, None, None, False)),
  1: Rule('value', ['object'], None, RuleOptions(False, True, None, None, False)),
  2: Rule('value', ['array'], None, RuleOptions(False, True, None, None, False)),
  3: Rule('value', ['string'], None, RuleOptions(False, True, None, None, False)),
  4: Rule('value', ['SIGNED_NUMBER'], 'number', RuleOptions(False, True, None, None, False)),
  5: Rule('value', ['__TRUE0'], 'true', RuleOptions(False, True, None, None, False)),
  6: Rule('value', ['__FALSE1'], 'false', RuleOptions(False, True, None, None, False)),
  7: Rule('value', ['__NULL2'], 'null', RuleOptions(False, True, None, None, False)),
  8: Rule('array', ['__LSQB', 'value', '__anon_star_0', '__RSQB'], None, RuleOptions(False, False, None, None, False)),
  9: Rule('array', ['__LSQB', 'value', '__RSQB'], None, RuleOptions(False, False, None, None, False)),
  10: Rule('array', ['__LSQB', '__RSQB'], None, RuleOptions(False, False, None, None, False)),
  11: Rule('object', ['__LBRACE', 'pair', '__anon_star_1', '__RBRACE'], None, RuleOptions(False, False, None, None, False)),
  12: Rule('object', ['__LBRACE', 'pair', '__RBRACE'], None, RuleOptions(False, False, None, None, False)),
  13: Rule('object', ['__LBRACE', '__RBRACE'], None, RuleOptions(False, False, None, None, False)),
  14: Rule('pair', ['string', '__COLON', 'value'], None, RuleOptions(False, False, None, None, False)),
  15: Rule('string', ['ESCAPED_STRING'], None, RuleOptions(False, False, None, None, False)),
  16: Rule('__anon_star_0', ['__COMMA', 'value'], None, None),
  17: Rule('__anon_star_0', ['__anon_star_0', '__COMMA', 'value'], None, None),
  18: Rule('__anon_star_1', ['__COMMA', 'pair'], None, None),
  19: Rule('__anon_star_1', ['__anon_star_1', '__COMMA', 'pair'], None, None),
}
parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)
class ParseTable: pass
parse_table = ParseTable()
# state -> {symbol-id: (action, argument)}; action 0=Shift (arg: next state),
# 1=Reduce (arg: rule index). Symbol ids are decoded via TOKEN_TYPES below.
STATES = {
  0: {0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 4: (0, 5), 5: (0, 6), 6: (0, 7), 7: (0, 8), 8: (0, 9), 9: (0, 10), 10: (0, 11), 11: (0, 12)},
  1: {12: (1, 5), 13: (1, 5), 14: (1, 5), 15: (1, 5)},
  2: {9: (0, 10), 14: (0, 13), 16: (0, 14), 11: (0, 15)},
  3: {12: (1, 2), 13: (1, 2), 14: (1, 2), 15: (1, 2)},
  4: {12: (1, 1), 13: (1, 1), 14: (1, 1), 15: (1, 1)},
  5: {12: (0, 16)},
  6: {7: (0, 17), 0: (0, 1), 1: (0, 2), 2: (0, 3), 3: (0, 4), 5: (0, 6), 6: (0, 7), 8: (0, 9), 9: (0, 10), 15: (0, 18), 10: (0, 11), 11: (0, 12)},
  7: {12: (1, 4), 13: (1, 4), 14: (1, 4), 15: (1, 4)},
  8: {12: (1, 0)},
  9: {12: (1, 7), 13: (1, 7), 14: (1, 7), 15: (1, 7)},
  10: {12: (1, 15), 17: (1, 15), 13: (1, 15), 14: (1, 15), 15: (1, 15)},
  11: {12: (1, 6), 13: (1, 6), 14: (1, 6), 15: (1, 6)},
  12: {12: (1, 3), 13: (1, 3), 14: (1, 3), 15: (1, 3)},
  13: {13: (1, 13), 12: (1, 13), 14: (1, 13), 15: (1, 13)},
  14: {14: (0, 19), 13: (0, 20), 18: (0, 21)},
  15: {17: (0, 22)},
  16: {},
  17: {19: (0, 23), 15: (0, 24), 13: (0, 25)},
  18: {13: (1, 10), 12: (1, 10), 14: (1, 10), 15: (1, 10)},
  19: {13: (1, 12), 12: (1, 12), 14: (1, 12), 15: (1, 12)},
  20: {9: (0, 10), 11: (0, 15), 16: (0, 26)},
  21: {14: (0, 27), 13: (0, 28)},
  22: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12), 7: (0, 29)},
  23: {15: (0, 30), 13: (0, 31)},
  24: {13: (1, 9), 12: (1, 9), 14: (1, 9), 15: (1, 9)},
  25: {5: (0, 6), 1: (0, 2), 0: (0, 1), 8: (0, 9), 2: (0, 3), 3: (0, 4), 7: (0, 32), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)},
  26: {13: (1, 18), 14: (1, 18)},
  27: {13: (1, 11), 12: (1, 11), 14: (1, 11), 15: (1, 11)},
  28: {16: (0, 33), 9: (0, 10), 11: (0, 15)},
  29: {13: (1, 14), 14: (1, 14)},
  30: {13: (1, 8), 12: (1, 8), 14: (1, 8), 15: (1, 8)},
  31: {5: (0, 6), 1: (0, 2), 0: (0, 1), 7: (0, 34), 8: (0, 9), 2: (0, 3), 3: (0, 4), 9: (0, 10), 6: (0, 7), 10: (0, 11), 11: (0, 12)},
  32: {15: (1, 16), 13: (1, 16)},
  33: {13: (1, 19), 14: (1, 19)},
  34: {15: (1, 17), 13: (1, 17)},
}
# Symbol id -> symbol name, used to decode STATES into parse_table.states.
TOKEN_TYPES = (
{0: '__TRUE0',
1: '__LBRACE',
2: 'array',
3: 'object',
4: 'start',
5: '__LSQB',
6: 'SIGNED_NUMBER',
7: 'value',
8: '__NULL2',
9: 'ESCAPED_STRING',
10: '__FALSE1',
11: 'string',
12: '$END',
13: '__COMMA',
14: '__RBRACE',
15: '__RSQB',
16: 'pair',
17: '__COLON',
18: '__anon_star_1',
19: '__anon_star_0'}
)
# Decode: replace symbol ids with names and rule indices with Rule objects.
parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}
for s, acts in STATES.items()}
parse_table.start_state = 0
parse_table.end_state = 16
class Lark_StandAlone:
    """Entry point of the generated module: lexes and LALR-parses input."""
    def __init__(self, transformer=None, postlex=None):
        # Wire the generated rules to callbacks (optionally via a user
        # transformer), then build the parser engine.
        callback = parse_tree_builder.create_callback(transformer=transformer)
        callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES.values()}
        self.parser = _Parser(parse_table, callbacks)
        self.postlex = postlex

    def parse(self, stream):
        # Parse the string *stream*; postlex (e.g. an Indenter) may rewrite
        # the token stream first.
        tokens = lex(stream)
        if self.postlex: tokens = self.postlex.process(tokens)
        return self.parser.parse(tokens)
| @@ -0,0 +1,25 @@ | |||
| import sys | |||
| from json_parser import Lark_StandAlone, Transformer, inline_args | |||
class TreeToJson(Transformer):
    """Transforms the JSON parse tree into plain Python values."""
    @inline_args
    def string(self, s):
        # Strip the surrounding quotes and unescape embedded quotes.
        return s[1:-1].replace('\\"', '"')

    # Containers map directly onto builtin constructors.
    array = list
    pair = tuple
    object = dict
    number = inline_args(float)
    null = lambda self, _: None
    true = lambda self, _: True
    false = lambda self, _: False
# Build the parser once at import time; the transformer converts the tree
# directly into Python objects during parsing.
parser = Lark_StandAlone(transformer=TreeToJson())

if __name__ == '__main__':
    # Usage: python <this script> <file.json>
    with open(sys.argv[1]) as f:
        print(parser.parse(f.read()))
| @@ -4,4 +4,4 @@ from .lexer import UnexpectedInput, LexError | |||
| from .lark import Lark | |||
| from .utils import inline_args | |||
| __version__ = "0.5.1" | |||
| __version__ = "0.5.2" | |||
| @@ -1,16 +1,21 @@ | |||
| import re | |||
| import sre_parse | |||
| import sys | |||
| from .utils import get_regexp_width | |||
| Py36 = (sys.version_info[:2] >= (3, 6)) | |||
| ###{standalone | |||
| def is_terminal(sym): | |||
| return sym.isupper() | |||
| class GrammarError(Exception): | |||
| pass | |||
| class ParseError(Exception): | |||
| pass | |||
| class UnexpectedToken(ParseError): | |||
| def __init__(self, token, expected, seq, index): | |||
| self.token = token | |||
| @@ -31,9 +36,8 @@ class UnexpectedToken(ParseError): | |||
| super(UnexpectedToken, self).__init__(message) | |||
| ###} | |||
| def is_terminal(sym): | |||
| return isinstance(sym, Terminal) or sym.isupper() or sym == '$end' | |||
| class LexerConf: | |||
| @@ -44,7 +48,6 @@ class LexerConf: | |||
class ParserConf:
    """Bundles rules, callback table, and start symbol for a parser."""
    def __init__(self, rules, callback, start):
        # Each rule is expected as a 4-tuple at this point.
        assert all(len(r) == 4 for r in rules)
        self.rules = rules
        self.callback = callback
        self.start = start
| @@ -93,10 +96,10 @@ class PatternRE(Pattern): | |||
| @property | |||
| def min_width(self): | |||
| return sre_parse.parse(self.to_regexp()).getwidth()[0] | |||
| return get_regexp_width(self.to_regexp())[0] | |||
| @property | |||
| def max_width(self): | |||
| return sre_parse.parse(self.to_regexp()).getwidth()[1] | |||
| return get_regexp_width(self.to_regexp())[1] | |||
| class TokenDef(object): | |||
| def __init__(self, name, pattern, priority=1): | |||
| @@ -108,27 +111,3 @@ class TokenDef(object): | |||
| def __repr__(self): | |||
| return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern) | |||
| class Terminal: | |||
| def __init__(self, data): | |||
| self.data = data | |||
| def __repr__(self): | |||
| return '%r' % self.data | |||
| def __eq__(self, other): | |||
| return isinstance(other, type(self)) and self.data == other.data | |||
| def __hash__(self): | |||
| return hash(self.data) | |||
| class Terminal_Regexp(Terminal): | |||
| def __init__(self, name, regexp): | |||
| Terminal.__init__(self, regexp) | |||
| self.name = name | |||
| self.match = re.compile(regexp).match | |||
| class Terminal_Token(Terminal): | |||
| def match(self, other): | |||
| return self.data == other.type | |||
| @@ -0,0 +1,37 @@ | |||
| class Rule(object): | |||
| """ | |||
| origin : a symbol | |||
| expansion : a list of symbols | |||
| """ | |||
| def __init__(self, origin, expansion, alias=None, options=None): | |||
| self.origin = origin | |||
| self.expansion = expansion | |||
| self.alias = alias | |||
| self.options = options | |||
| def __str__(self): | |||
| return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) | |||
| def __repr__(self): | |||
| return 'Rule(%r, %r, %r, %r)' % (self.origin, self.expansion, self.alias, self.options) | |||
| class RuleOptions: | |||
| def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): | |||
| self.keep_all_tokens = keep_all_tokens | |||
| self.expand1 = expand1 | |||
| self.create_token = create_token # used for scanless postprocessing | |||
| self.priority = priority | |||
| self.filter_out = filter_out # remove this rule from the tree | |||
| # used for "token"-rules in scanless | |||
| def __repr__(self): | |||
| return 'RuleOptions(%r, %r, %r, %r, %r)' % ( | |||
| self.keep_all_tokens, | |||
| self.expand1, | |||
| self.create_token, | |||
| self.priority, | |||
| self.filter_out | |||
| ) | |||
| @@ -12,6 +12,7 @@ DECIMAL: INT "." INT? | "." INT | |||
| // float = /-?\d+(\.\d+)?([eE][+-]?\d+)?/ | |||
| _EXP: ("e"|"E") SIGNED_INT | |||
| FLOAT: INT _EXP | DECIMAL _EXP? | |||
| SIGNED_FLOAT: ["+"|"-"] INT | |||
| NUMBER: FLOAT | INT | |||
| SIGNED_NUMBER: ["+"|"-"] NUMBER | |||
| @@ -2,6 +2,7 @@ | |||
| from .lexer import Token | |||
| ###{standalone | |||
| class Indenter: | |||
| def __init__(self): | |||
| self.paren_level = 0 | |||
| @@ -50,3 +51,5 @@ class Indenter: | |||
| @property | |||
| def always_accept(self): | |||
| return (self.NL_type,) | |||
| ###} | |||
| @@ -169,13 +169,15 @@ class Lark: | |||
| def _build_parser(self): | |||
| self.parser_class = get_frontend(self.options.parser, self.options.lexer) | |||
| self.parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens) | |||
| rules, callback = self.parse_tree_builder.apply(self.options.transformer) | |||
| self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens) | |||
| callback = self._parse_tree_builder.create_callback(self.options.transformer) | |||
| if self.profiler: | |||
| for f in dir(callback): | |||
| if not (f.startswith('__') and f.endswith('__')): | |||
| setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f))) | |||
| parser_conf = ParserConf(rules, callback, self.options.start) | |||
| parser_conf = ParserConf(self.rules, callback, self.options.start) | |||
| return self.parser_class(self.lexer_conf, parser_conf, options=self.options) | |||
| @@ -5,6 +5,7 @@ import re | |||
| from .utils import Str, classify | |||
| from .common import is_terminal, PatternStr, PatternRE, TokenDef | |||
| ###{standalone | |||
| class LexError(Exception): | |||
| pass | |||
| @@ -48,27 +49,75 @@ class Token(Str): | |||
| __hash__ = Str.__hash__ | |||
| class Regex: | |||
| def __init__(self, pattern, flags=()): | |||
| self.pattern = pattern | |||
| self.flags = flags | |||
| def _regexp_has_newline(r): | |||
| return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r) | |||
| class LineCounter: | |||
| def __init__(self): | |||
| self.newline_char = '\n' | |||
| self.char_pos = 0 | |||
| self.line = 1 | |||
| self.column = 0 | |||
| self.line_start_pos = 0 | |||
| def feed(self, token, test_newline=True): | |||
| """Consume a token and calculate the new line & column. | |||
| As an optional optimization, set test_newline=False is token doesn't contain a newline. | |||
| """ | |||
| if test_newline: | |||
| newlines = token.count(self.newline_char) | |||
| if newlines: | |||
| self.line += newlines | |||
| self.line_start_pos = self.char_pos + token.rindex(self.newline_char) + 1 | |||
| self.char_pos += len(token) | |||
| self.column = self.char_pos - self.line_start_pos | |||
| class _Lex: | |||
| "Built to serve both Lexer and ContextualLexer" | |||
| def __init__(self, lexer): | |||
| self.lexer = lexer | |||
| def lex(self, stream, newline_types, ignore_types): | |||
| newline_types = list(newline_types) | |||
| ignore_types = list(ignore_types) | |||
| line_ctr = LineCounter() | |||
| def _create_unless_callback(strs): | |||
| mres = build_mres(strs, match_whole=True) | |||
| def unless_callback(t): | |||
| # if t in strs: | |||
| # t.type = strs[t] | |||
| for mre, type_from_index in mres: | |||
| while True: | |||
| lexer = self.lexer | |||
| for mre, type_from_index in lexer.mres: | |||
| m = mre.match(stream, line_ctr.char_pos) | |||
| if m: | |||
| value = m.group(0) | |||
| type_ = type_from_index[m.lastindex] | |||
| if type_ not in ignore_types: | |||
| t = Token(type_, value, line_ctr.char_pos, line_ctr.line, line_ctr.column) | |||
| if t.type in lexer.callback: | |||
| t = lexer.callback[t.type](t) | |||
| yield t | |||
| line_ctr.feed(value, type_ in newline_types) | |||
| break | |||
| else: | |||
| if line_ctr.char_pos < len(stream): | |||
| raise UnexpectedInput(stream, line_ctr.char_pos, line_ctr.line, line_ctr.column) | |||
| break | |||
| class UnlessCallback: | |||
| def __init__(self, mres): | |||
| self.mres = mres | |||
| def __call__(self, t): | |||
| for mre, type_from_index in self.mres: | |||
| m = mre.match(t.value) | |||
| if m: | |||
| value = m.group(0) | |||
| t.type = type_from_index[m.lastindex] | |||
| break | |||
| return t | |||
| return unless_callback | |||
| ###} | |||
| def _create_unless(tokens): | |||
| tokens_by_type = classify(tokens, lambda t: type(t.pattern)) | |||
| @@ -85,7 +134,7 @@ def _create_unless(tokens): | |||
| if strtok.pattern.flags <= retok.pattern.flags: | |||
| embedded_strs.add(strtok) | |||
| if unless: | |||
| callback[retok.name] = _create_unless_callback(unless) | |||
| callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True)) | |||
| tokens = [t for t in tokens if t not in embedded_strs] | |||
| return tokens, callback | |||
| @@ -110,13 +159,13 @@ def _build_mres(tokens, max_size, match_whole): | |||
| def build_mres(tokens, match_whole=False): | |||
| return _build_mres(tokens, len(tokens), match_whole) | |||
| def _regexp_has_newline(r): | |||
| return '\n' in r or '\\n' in r or ('(?s)' in r and '.' in r) | |||
| class Lexer(object): | |||
| class Lexer: | |||
| def __init__(self, tokens, ignore=()): | |||
| assert all(isinstance(t, TokenDef) for t in tokens), tokens | |||
| self.ignore = ignore | |||
| self.newline_char = '\n' | |||
| tokens = list(tokens) | |||
| # Sanitization | |||
| @@ -129,14 +178,11 @@ class Lexer(object): | |||
| if t.pattern.min_width == 0: | |||
| raise LexError("Lexer does not allow zero-width tokens. (%s: %s)" % (t.name, t.pattern)) | |||
| token_names = {t.name for t in tokens} | |||
| for t in ignore: | |||
| if t not in token_names: | |||
| raise LexError("Token '%s' was marked to ignore but it is not defined!" % t) | |||
| assert set(ignore) <= {t.name for t in tokens} | |||
| # Init | |||
| self.newline_types = [t.name for t in tokens if _regexp_has_newline(t.pattern.to_regexp())] | |||
| self.ignore_types = [t for t in ignore] | |||
| self.ignore_types = list(ignore) | |||
| tokens.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) | |||
| @@ -147,46 +193,8 @@ class Lexer(object): | |||
| self.mres = build_mres(tokens) | |||
| def lex(self, stream): | |||
| lex_pos = 0 | |||
| line = 1 | |||
| col_start_pos = 0 | |||
| newline_types = list(self.newline_types) | |||
| ignore_types = list(self.ignore_types) | |||
| while True: | |||
| for mre, type_from_index in self.mres: | |||
| m = mre.match(stream, lex_pos) | |||
| if m: | |||
| value = m.group(0) | |||
| type_ = type_from_index[m.lastindex] | |||
| to_yield = type_ not in ignore_types | |||
| if to_yield: | |||
| t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) | |||
| end_col = t.column + len(value) | |||
| if t.type in self.callback: | |||
| t = self.callback[t.type](t) | |||
| if type_ in newline_types: | |||
| newlines = value.count(self.newline_char) | |||
| if newlines: | |||
| line += newlines | |||
| last_newline_index = value.rindex(self.newline_char) + 1 | |||
| col_start_pos = lex_pos + last_newline_index | |||
| end_col = len(value) - last_newline_index | |||
| if to_yield: | |||
| t.end_line = line | |||
| t.end_col = end_col | |||
| yield t | |||
| lex_pos += len(value) | |||
| break | |||
| else: | |||
| if lex_pos < len(stream): | |||
| raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos) | |||
| break | |||
| return _Lex(self).lex(stream, self.newline_types, self.ignore_types) | |||
| class ContextualLexer: | |||
| @@ -204,7 +212,7 @@ class ContextualLexer: | |||
| lexer = lexer_by_tokens[key] | |||
| except KeyError: | |||
| accepts = set(accepts) | set(ignore) | set(always_accept) | |||
| state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end'] | |||
| state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END'] | |||
| lexer = Lexer(state_tokens, ignore=ignore) | |||
| lexer_by_tokens[key] = lexer | |||
| @@ -218,33 +226,9 @@ class ContextualLexer: | |||
| self.parser_state = state | |||
| def lex(self, stream): | |||
| lex_pos = 0 | |||
| line = 1 | |||
| col_start_pos = 0 | |||
| newline_types = list(self.root_lexer.newline_types) | |||
| ignore_types = list(self.root_lexer.ignore_types) | |||
| while True: | |||
| lexer = self.lexers[self.parser_state] | |||
| for mre, type_from_index in lexer.mres: | |||
| m = mre.match(stream, lex_pos) | |||
| if m: | |||
| value = m.group(0) | |||
| type_ = type_from_index[m.lastindex] | |||
| if type_ not in ignore_types: | |||
| t = Token(type_, value, lex_pos, line, lex_pos - col_start_pos) | |||
| if t.type in lexer.callback: | |||
| t = lexer.callback[t.type](t) | |||
| yield t | |||
| l = _Lex(self.lexers[self.parser_state]) | |||
| for x in l.lex(stream, self.root_lexer.newline_types, self.root_lexer.ignore_types): | |||
| yield x | |||
| l.lexer = self.lexers[self.parser_state] | |||
| if type_ in newline_types: | |||
| newlines = value.count(lexer.newline_char) | |||
| if newlines: | |||
| line += newlines | |||
| col_start_pos = lex_pos + value.rindex(lexer.newline_char) | |||
| lex_pos += len(value) | |||
| break | |||
| else: | |||
| if lex_pos < len(stream): | |||
| raise UnexpectedInput(stream, lex_pos, line, lex_pos - col_start_pos, lexer.tokens) | |||
| break | |||
| @@ -12,6 +12,7 @@ from .parse_tree_builder import ParseTreeBuilder | |||
| from .parser_frontends import LALR | |||
| from .parsers.lalr_parser import UnexpectedToken | |||
| from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef | |||
| from .grammar import RuleOptions, Rule | |||
| from .tree import Tree as T, Transformer, InlineTransformer, Visitor | |||
| @@ -127,7 +128,7 @@ RULES = { | |||
| class EBNF_to_BNF(InlineTransformer): | |||
| def __init__(self): | |||
| self.new_rules = {} | |||
| self.new_rules = [] | |||
| self.rules_by_expr = {} | |||
| self.prefix = 'anon' | |||
| self.i = 0 | |||
| @@ -140,7 +141,8 @@ class EBNF_to_BNF(InlineTransformer): | |||
| new_name = '__%s_%s_%d' % (self.prefix, type_, self.i) | |||
| self.i += 1 | |||
| t = Token('RULE', new_name, -1) | |||
| self.new_rules[new_name] = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]), self.rule_options | |||
| tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])]) | |||
| self.new_rules.append((new_name, tree, self.rule_options)) | |||
| self.rules_by_expr[expr] = t | |||
| return t | |||
| @@ -174,7 +176,6 @@ class SimplifyRule_Visitor(Visitor): | |||
| break | |||
| tree.expand_kids_by_index(*to_expand) | |||
| def expansion(self, tree): | |||
| # rules_list unpacking | |||
| # a : b (c|d) e | |||
| @@ -194,7 +195,7 @@ class SimplifyRule_Visitor(Visitor): | |||
| tree.data = 'expansions' | |||
| tree.children = [self.visit(T('expansion', [option if i==j else other | |||
| for j, other in enumerate(tree.children)])) | |||
| for option in child.children] | |||
| for option in set(child.children)] | |||
| break | |||
| else: | |||
| break | |||
| @@ -208,7 +209,10 @@ class SimplifyRule_Visitor(Visitor): | |||
| tree.data = 'expansions' | |||
| tree.children = aliases | |||
| expansions = _flatten | |||
| def expansions(self, tree): | |||
| self._flatten(tree) | |||
| tree.children = list(set(tree.children)) | |||
| class RuleTreeToText(Transformer): | |||
| def expansions(self, x): | |||
| @@ -389,12 +393,6 @@ def _interleave(l, item): | |||
| def _choice_of_rules(rules): | |||
| return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules]) | |||
| def dict_update_safe(d1, d2): | |||
| for k, v in d2.items(): | |||
| assert k not in d1 | |||
| d1[k] = v | |||
| class Grammar: | |||
| def __init__(self, rule_defs, token_defs, ignore): | |||
| self.token_defs = token_defs | |||
| @@ -411,6 +409,7 @@ class Grammar: | |||
| terms_to_ignore = {name:'__'+name for name in self.ignore} | |||
| if terms_to_ignore: | |||
| assert set(terms_to_ignore) <= {name for name, _t in term_defs} | |||
| term_defs = [(terms_to_ignore.get(name,name),t) for name,t in term_defs] | |||
| expr = Token('RULE', '__ignore') | |||
| for r, tree, _o in rule_defs: | |||
| @@ -466,57 +465,41 @@ class Grammar: | |||
| # ================= | |||
| # Compile Rules | |||
| # ================= | |||
| ebnf_to_bnf = EBNF_to_BNF() | |||
| simplify_rule = SimplifyRule_Visitor() | |||
| # 1. Pre-process terminals | |||
| transformer = PrepareLiterals() | |||
| if not lexer: | |||
| transformer *= SplitLiterals() | |||
| transformer *= ExtractAnonTokens(tokens) # Adds to tokens | |||
| rules = {} | |||
| # 2. Convert EBNF to BNF (and apply step 1) | |||
| ebnf_to_bnf = EBNF_to_BNF() | |||
| rules = [] | |||
| for name, rule_tree, options in rule_defs: | |||
| assert name not in rules, name | |||
| ebnf_to_bnf.rule_options = RuleOptions(keep_all_tokens=True) if options and options.keep_all_tokens else None | |||
| tree = transformer.transform(rule_tree) | |||
| rules[name] = ebnf_to_bnf.transform(tree), options | |||
| rules.append((name, ebnf_to_bnf.transform(tree), options)) | |||
| rules += ebnf_to_bnf.new_rules | |||
| dict_update_safe(rules, ebnf_to_bnf.new_rules) | |||
| for tree, _o in rules.values(): | |||
| simplify_rule.visit(tree) | |||
| assert len(rules) == len({name for name, _t, _o in rules}), "Whoops, name collision" | |||
| # 3. Compile tree to Rule objects | |||
| rule_tree_to_text = RuleTreeToText() | |||
| rules = {origin: (rule_tree_to_text.transform(tree), options) for origin, (tree, options) in rules.items()} | |||
| return tokens, rules, self.ignore | |||
| simplify_rule = SimplifyRule_Visitor() | |||
| compiled_rules = [] | |||
| for name, tree, options in rules: | |||
| simplify_rule.visit(tree) | |||
| expansions = rule_tree_to_text.transform(tree) | |||
| for expansion, alias in expansions: | |||
| if alias and name.startswith('_'): | |||
| raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias)) | |||
| class RuleOptions: | |||
| def __init__(self, keep_all_tokens=False, expand1=False, create_token=None, filter_out=False, priority=None): | |||
| self.keep_all_tokens = keep_all_tokens | |||
| self.expand1 = expand1 | |||
| self.create_token = create_token # used for scanless postprocessing | |||
| self.priority = priority | |||
| self.filter_out = filter_out # remove this rule from the tree | |||
| # used for "token"-rules in scanless | |||
| @classmethod | |||
| def from_rule(cls, name, *x): | |||
| if len(x) > 1: | |||
| priority, expansions = x | |||
| priority = int(priority) | |||
| else: | |||
| expansions ,= x | |||
| priority = None | |||
| keep_all_tokens = name.startswith('!') | |||
| name = name.lstrip('!') | |||
| expand1 = name.startswith('?') | |||
| name = name.lstrip('?') | |||
| rule = Rule(name, expansion, alias, options) | |||
| compiled_rules.append(rule) | |||
| return name, expansions, cls(keep_all_tokens, expand1, priority=priority) | |||
| return tokens, compiled_rules, self.ignore | |||
| @@ -553,15 +536,30 @@ def resolve_token_references(token_defs): | |||
| if not changed: | |||
| break | |||
| def options_from_rule(name, *x): | |||
| if len(x) > 1: | |||
| priority, expansions = x | |||
| priority = int(priority) | |||
| else: | |||
| expansions ,= x | |||
| priority = None | |||
| keep_all_tokens = name.startswith('!') | |||
| name = name.lstrip('!') | |||
| expand1 = name.startswith('?') | |||
| name = name.lstrip('?') | |||
| return name, expansions, RuleOptions(keep_all_tokens, expand1, priority=priority) | |||
| class GrammarLoader: | |||
| def __init__(self): | |||
| tokens = [TokenDef(name, PatternRE(value)) for name, value in TOKENS.items()] | |||
| rules = [RuleOptions.from_rule(name, x) for name, x in RULES.items()] | |||
| d = {r: ([(x.split(), None) for x in xs], o) for r, xs, o in rules} | |||
| rules, callback = ParseTreeBuilder(d, T).apply() | |||
| rules = [options_from_rule(name, x) for name, x in RULES.items()] | |||
| rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs] | |||
| callback = ParseTreeBuilder(rules, T).create_callback() | |||
| lexer_conf = LexerConf(tokens, ['WS', 'COMMENT']) | |||
| parser_conf = ParserConf(rules, callback, 'start') | |||
| self.parser = LALR(lexer_conf, parser_conf) | |||
| @@ -636,7 +634,6 @@ class GrammarLoader: | |||
| ignore_names.append(name) | |||
| token_defs.append((name, (t, 0))) | |||
| # Verify correctness 2 | |||
| token_names = set() | |||
| for name, _ in token_defs: | |||
| @@ -644,10 +641,13 @@ class GrammarLoader: | |||
| raise GrammarError("Token '%s' defined more than once" % name) | |||
| token_names.add(name) | |||
| if set(ignore_names) > token_names: | |||
| raise GrammarError("Tokens %s were marked to ignore but were not defined!" % (set(ignore_names) - token_names)) | |||
| # Resolve token references | |||
| resolve_token_references(token_defs) | |||
| rules = [RuleOptions.from_rule(*x) for x in rule_defs] | |||
| rules = [options_from_rule(*x) for x in rule_defs] | |||
| rule_names = set() | |||
| for name, _x, _o in rules: | |||
| @@ -1,6 +1,9 @@ | |||
| from .common import is_terminal, GrammarError | |||
| from .utils import suppress | |||
| from .lexer import Token | |||
| from .grammar import Rule | |||
| ###{standalone | |||
| class NodeBuilder: | |||
| def __init__(self, tree_class, name): | |||
| @@ -27,7 +30,7 @@ class Factory: | |||
| def __call__(self, node_builder): | |||
| return self.cls(node_builder, *self.args) | |||
| class TokenWrapper: | |||
| "Used for fixing the results of scanless parsing" | |||
| @@ -106,51 +109,53 @@ class ParseTreeBuilder: | |||
| self.rule_builders = list(self._init_builders(rules)) | |||
| self.user_aliases = {} | |||
| def _init_builders(self, rules): | |||
| filter_out = set() | |||
| for origin, (expansions, options) in rules.items(): | |||
| if options and options.filter_out: | |||
| assert origin.startswith('_') # Just to make sure | |||
| filter_out.add(origin) | |||
| for rule in rules: | |||
| if rule.options and rule.options.filter_out: | |||
| assert rule.origin.startswith('_') # Just to make sure | |||
| filter_out.add(rule.origin) | |||
| for origin, (expansions, options) in rules.items(): | |||
| for rule in rules: | |||
| options = rule.options | |||
| keep_all_tokens = self.always_keep_all_tokens or (options.keep_all_tokens if options else False) | |||
| expand1 = options.expand1 if options else False | |||
| create_token = options.create_token if options else False | |||
| for expansion, alias in expansions: | |||
| if alias and origin.startswith('_'): | |||
| raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (origin, alias)) | |||
| wrapper_chain = filter(None, [ | |||
| (expand1 and not rule.alias) and Expand1, | |||
| create_token and Factory(TokenWrapper, create_token), | |||
| create_rule_handler(rule.expansion, keep_all_tokens, filter_out), | |||
| self.propagate_positions and PropagatePositions, | |||
| ]) | |||
| wrapper_chain = filter(None, [ | |||
| (expand1 and not alias) and Expand1, | |||
| create_token and Factory(TokenWrapper, create_token), | |||
| create_rule_handler(expansion, keep_all_tokens, filter_out), | |||
| self.propagate_positions and PropagatePositions, | |||
| ]) | |||
| yield rule, wrapper_chain | |||
| yield origin, expansion, options, alias or origin, wrapper_chain | |||
| def apply(self, transformer=None): | |||
| def create_callback(self, transformer=None): | |||
| callback = Callback() | |||
| new_rules = [] | |||
| for origin, expansion, options, alias, wrapper_chain in self.rule_builders: | |||
| callback_name = '_callback_%s_%s' % (origin, '_'.join(expansion)) | |||
| for rule, wrapper_chain in self.rule_builders: | |||
| internal_callback_name = '_callback_%s_%s' % (rule.origin, '_'.join(rule.expansion)) | |||
| user_callback_name = rule.alias or rule.origin | |||
| try: | |||
| f = transformer._get_func(alias) | |||
| f = transformer._get_func(user_callback_name) | |||
| except AttributeError: | |||
| f = NodeBuilder(self.tree_class, alias) | |||
| f = NodeBuilder(self.tree_class, user_callback_name) | |||
| self.user_aliases[rule] = rule.alias | |||
| rule.alias = internal_callback_name | |||
| for w in wrapper_chain: | |||
| f = w(f) | |||
| if hasattr(callback, callback_name): | |||
| raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin)) | |||
| setattr(callback, callback_name, f) | |||
| if hasattr(callback, internal_callback_name): | |||
| raise GrammarError("Rule '%s' already exists" % (rule,)) | |||
| setattr(callback, internal_callback_name, f) | |||
| new_rules.append(( origin, expansion, callback_name, options )) | |||
| return callback | |||
| return new_rules, callback | |||
| ###} | |||
| @@ -1,5 +1,5 @@ | |||
| import re | |||
| import sre_parse | |||
| from .utils import get_regexp_width | |||
| from parsers.grammar_analysis import GrammarAnalyzer | |||
| from .lexer import Lexer, ContextualLexer, Token | |||
| @@ -9,10 +9,16 @@ from .parsers import lalr_parser, earley, xearley, resolve_ambig, cyk | |||
| from .tree import Tree | |||
| class WithLexer: | |||
| def __init__(self, lexer_conf): | |||
| def init_traditional_lexer(self, lexer_conf): | |||
| self.lexer_conf = lexer_conf | |||
| self.lexer = Lexer(lexer_conf.tokens, ignore=lexer_conf.ignore) | |||
| def init_contextual_lexer(self, lexer_conf, parser_conf): | |||
| self.lexer_conf = lexer_conf | |||
| d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()} | |||
| always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () | |||
| self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept) | |||
| def lex(self, text): | |||
| stream = self.lexer.lex(text) | |||
| if self.lexer_conf.postlex: | |||
| @@ -23,32 +29,22 @@ class WithLexer: | |||
| class LALR(WithLexer): | |||
| def __init__(self, lexer_conf, parser_conf, options=None): | |||
| WithLexer.__init__(self, lexer_conf) | |||
| self.parser_conf = parser_conf | |||
| self.parser = lalr_parser.Parser(parser_conf) | |||
| self.init_traditional_lexer(lexer_conf) | |||
| def parse(self, text): | |||
| tokens = self.lex(text) | |||
| return self.parser.parse(tokens) | |||
| token_stream = self.lex(text) | |||
| return self.parser.parse(token_stream) | |||
| class LALR_ContextualLexer: | |||
| class LALR_ContextualLexer(WithLexer): | |||
| def __init__(self, lexer_conf, parser_conf, options=None): | |||
| self.lexer_conf = lexer_conf | |||
| self.parser_conf = parser_conf | |||
| self.parser = lalr_parser.Parser(parser_conf) | |||
| d = {idx:t.keys() for idx, t in self.parser.analysis.states_idx.items()} | |||
| always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else () | |||
| self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept) | |||
| self.init_contextual_lexer(lexer_conf, parser_conf) | |||
| def parse(self, text): | |||
| tokens = self.lexer.lex(text) | |||
| if self.lexer_conf.postlex: | |||
| tokens = self.lexer_conf.postlex.process(tokens) | |||
| return self.parser.parse(tokens, self.lexer.set_parser_state) | |||
| token_stream = self.lex(text) | |||
| return self.parser.parse(token_stream, self.lexer.set_parser_state) | |||
| def get_ambiguity_resolver(options): | |||
| if not options or options.ambiguity == 'resolve': | |||
| @@ -60,55 +56,47 @@ def get_ambiguity_resolver(options): | |||
| raise ValueError(options) | |||
| def tokenize_text(text): | |||
| new_text = [] | |||
| line = 1 | |||
| col_start_pos = 0 | |||
| for i, ch in enumerate(text): | |||
| if '\n' in ch: | |||
| line += ch.count('\n') | |||
| col_start_pos = i + ch.rindex('\n') | |||
| new_text.append(Token('CHAR', ch, line=line, column=i - col_start_pos)) | |||
| return new_text | |||
| yield Token('CHAR', ch, line=line, column=i - col_start_pos) | |||
| class Earley_NoLex: | |||
| def __init__(self, lexer_conf, parser_conf, options=None): | |||
| self.token_by_name = {t.name:t for t in lexer_conf.tokens} | |||
| self._prepare_match(lexer_conf) | |||
| rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] | |||
| self.parser = earley.Parser(rules, | |||
| parser_conf.start, | |||
| parser_conf.callback, | |||
| self.parser = earley.Parser(parser_conf, self.match, | |||
| resolve_ambiguity=get_ambiguity_resolver(options)) | |||
| def _prepare_expansion(self, expansion): | |||
| for sym in expansion: | |||
| if is_terminal(sym): | |||
| regexp = self.token_by_name[sym].pattern.to_regexp() | |||
| width = sre_parse.parse(regexp).getwidth() | |||
| if width != (1,1): | |||
| raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) | |||
| yield Terminal_Regexp(sym, regexp) | |||
| else: | |||
| yield sym | |||
| def match(self, term, text, index=0): | |||
| return self.regexps[term].match(text, index) | |||
| def _prepare_match(self, lexer_conf): | |||
| self.regexps = {} | |||
| for t in lexer_conf.tokens: | |||
| regexp = t.pattern.to_regexp() | |||
| width = get_regexp_width(regexp) | |||
| if width != (1,1): | |||
| raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width)) | |||
| self.regexps[t.name] = re.compile(regexp) | |||
| def parse(self, text): | |||
| new_text = tokenize_text(text) | |||
| return self.parser.parse(new_text) | |||
| token_stream = tokenize_text(text) | |||
| return self.parser.parse(token_stream) | |||
| class Earley(WithLexer): | |||
| def __init__(self, lexer_conf, parser_conf, options=None): | |||
| WithLexer.__init__(self, lexer_conf) | |||
| rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules] | |||
| self.init_traditional_lexer(lexer_conf) | |||
| self.parser = earley.Parser(rules, | |||
| parser_conf.start, | |||
| parser_conf.callback, | |||
| self.parser = earley.Parser(parser_conf, self.match, | |||
| resolve_ambiguity=get_ambiguity_resolver(options)) | |||
| def _prepare_expansion(self, expansion): | |||
| return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion] | |||
| def match(self, term, token): | |||
| return term == token.type | |||
| def parse(self, text): | |||
| tokens = self.lex(text) | |||
| @@ -119,27 +107,31 @@ class XEarley: | |||
| def __init__(self, lexer_conf, parser_conf, options=None): | |||
| self.token_by_name = {t.name:t for t in lexer_conf.tokens} | |||
| rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules] | |||
| self._prepare_match(lexer_conf) | |||
| ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore] | |||
| self.parser = xearley.Parser(rules, | |||
| parser_conf.start, | |||
| parser_conf.callback, | |||
| self.parser = xearley.Parser(parser_conf, | |||
| self.match, | |||
| resolve_ambiguity=get_ambiguity_resolver(options), | |||
| ignore=ignore, | |||
| ignore=lexer_conf.ignore, | |||
| predict_all=options.earley__predict_all | |||
| ) | |||
| def _prepare_expansion(self, expansion): | |||
| for sym in expansion: | |||
| if is_terminal(sym): | |||
| regexp = self.token_by_name[sym].pattern.to_regexp() | |||
| width = sre_parse.parse(regexp).getwidth() | |||
| assert width | |||
| yield Terminal_Regexp(sym, regexp) | |||
| def match(self, term, text, index=0): | |||
| return self.regexps[term].match(text, index) | |||
| def _prepare_match(self, lexer_conf): | |||
| self.regexps = {} | |||
| for t in lexer_conf.tokens: | |||
| regexp = t.pattern.to_regexp() | |||
| try: | |||
| width = get_regexp_width(regexp)[0] | |||
| except ValueError: | |||
| raise ValueError("Bad regexp in token %s: %s" % (t.name, regexp)) | |||
| else: | |||
| yield sym | |||
| if width == 0: | |||
| raise ValueError("Dynamic Earley doesn't allow zero-width regexps") | |||
| self.regexps[t.name] = re.compile(regexp) | |||
| def parse(self, text): | |||
| return self.parser.parse(text) | |||
| @@ -13,14 +13,11 @@ | |||
| # Author: Erez Shinan (2017) | |||
| # Email : erezshin@gmail.com | |||
| from ..common import ParseError, UnexpectedToken, Terminal | |||
| from ..common import ParseError, UnexpectedToken, is_terminal | |||
| from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse | |||
| from .grammar_analysis import GrammarAnalyzer | |||
| class EndToken: | |||
| type = '$end' | |||
| class Derivation(Tree): | |||
| _hash = None | |||
| @@ -35,8 +32,6 @@ class Derivation(Tree): | |||
| self._hash = Tree.__hash__(self) | |||
| return self._hash | |||
| END_TOKEN = EndToken() | |||
| class Item(object): | |||
| "An Earley Item, the atom of the algorithm." | |||
| @@ -59,11 +54,8 @@ class Item(object): | |||
| new_tree = Derivation(self.rule, self.tree.children + [tree]) | |||
| return self.__class__(self.rule, self.ptr+1, self.start, new_tree) | |||
| def similar(self, other): | |||
| return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule | |||
| def __eq__(self, other): | |||
| return self.similar(other) #and (self.tree == other.tree) | |||
| return self.start is other.start and self.ptr == other.ptr and self.rule == other.rule | |||
| def __hash__(self): | |||
| return hash((self.rule, self.ptr, id(self.start))) # Always runs Derivation.__hash__ | |||
| @@ -134,7 +126,7 @@ class Column: | |||
| self.completed[item_key] = item | |||
| self.to_reduce.append(item) | |||
| else: | |||
| if isinstance(item.expect, Terminal): | |||
| if is_terminal(item.expect): | |||
| self.to_scan.append(item) | |||
| else: | |||
| k = item_key if self.predict_all else item | |||
| @@ -151,31 +143,30 @@ class Column: | |||
| __nonzero__ = __bool__ # Py2 backwards-compatibility | |||
| class Parser: | |||
| def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None): | |||
| self.analysis = GrammarAnalyzer(rules, start_symbol) | |||
| self.start_symbol = start_symbol | |||
| def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None): | |||
| self.analysis = GrammarAnalyzer(parser_conf) | |||
| self.parser_conf = parser_conf | |||
| self.resolve_ambiguity = resolve_ambiguity | |||
| self.FIRST = self.analysis.FIRST | |||
| self.postprocess = {} | |||
| self.predictions = {} | |||
| self.FIRST = {} | |||
| for rule in self.analysis.rules: | |||
| if rule.origin != '$root': # XXX kinda ugly | |||
| a = rule.alias | |||
| self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) | |||
| self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||
| for rule in parser_conf.rules: | |||
| self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) | |||
| self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||
| self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] | |||
| self.term_matcher = term_matcher | |||
| def parse(self, stream, start_symbol=None): | |||
| # Define parser functions | |||
| start_symbol = start_symbol or self.start_symbol | |||
| start_symbol = start_symbol or self.parser_conf.start | |||
| _Item = Item | |||
| match = self.term_matcher | |||
| def predict(nonterm, column): | |||
| assert not isinstance(nonterm, Terminal), nonterm | |||
| assert not is_terminal(nonterm), nonterm | |||
| return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]] | |||
| def complete(item): | |||
| @@ -195,14 +186,13 @@ class Parser: | |||
| for item in to_reduce: | |||
| new_items = list(complete(item)) | |||
| for new_item in new_items: | |||
| if new_item.similar(item): | |||
| raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule) | |||
| if item in new_items: | |||
| raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) | |||
| column.add(new_items) | |||
| def scan(i, token, column): | |||
| next_set = Column(i, self.FIRST) | |||
| next_set.add(item.advance(token) for item in column.to_scan if item.expect.match(token)) | |||
| next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token)) | |||
| if not next_set: | |||
| expect = {i.expect for i in column.to_scan} | |||
| @@ -249,24 +239,3 @@ class ApplyCallbacks(Transformer_NoRecurse): | |||
| return callback(children) | |||
| else: | |||
| return Tree(rule.origin, children) | |||
| # RULES = [ | |||
| # ('a', ['d']), | |||
| # ('d', ['b']), | |||
| # ('b', ['C']), | |||
| # ('b', ['b', 'C']), | |||
| # ('b', ['C', 'b']), | |||
| # ] | |||
| # p = Parser(RULES, 'a') | |||
| # for x in p.parse('CC'): | |||
| # print x.pretty() | |||
| #--------------- | |||
| # RULES = [ | |||
| # ('s', ['a', 'a']), | |||
| # ('a', ['b', 'b']), | |||
| # ('b', ['C'], lambda (x,): x), | |||
| # ('b', ['b', 'C']), | |||
| # ] | |||
| # p = Parser(RULES, 's', {}) | |||
| # print p.parse('CCCCC').pretty() | |||
| @@ -1,20 +1,8 @@ | |||
| from ..utils import bfs, fzset | |||
| from ..common import GrammarError, is_terminal | |||
| from ..grammar import Rule | |||
| class Rule(object): | |||
| """ | |||
| origin : a symbol | |||
| expansion : a list of symbols | |||
| """ | |||
| def __init__(self, origin, expansion, alias=None, options=None): | |||
| self.origin = origin | |||
| self.expansion = expansion | |||
| self.alias = alias | |||
| self.options = options | |||
| def __repr__(self): | |||
| return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion))) | |||
| class RulePtr(object): | |||
| def __init__(self, rule, index): | |||
| @@ -106,28 +94,30 @@ def calculate_sets(rules): | |||
| class GrammarAnalyzer(object): | |||
| def __init__(self, rule_tuples, start_symbol, debug=False): | |||
| self.start_symbol = start_symbol | |||
| def __init__(self, parser_conf, debug=False): | |||
| rules = parser_conf.rules | |||
| assert len(rules) == len(set(rules)) | |||
| self.start_symbol = parser_conf.start | |||
| self.debug = debug | |||
| rule_tuples = list(rule_tuples) | |||
| rule_tuples.append(('$root', [start_symbol, '$end'])) | |||
| rule_tuples = [(t[0], t[1], None, None) if len(t)==2 else t for t in rule_tuples] | |||
| self.rules = set() | |||
| self.rules_by_origin = {o: [] for o, _x, _a, _opt in rule_tuples} | |||
| for origin, exp, alias, options in rule_tuples: | |||
| r = Rule( origin, exp, alias, options ) | |||
| self.rules.add(r) | |||
| self.rules_by_origin[origin].append(r) | |||
| for r in self.rules: | |||
| root_rule = Rule('$root', [self.start_symbol, '$END']) | |||
| self.rules_by_origin = {r.origin: [] for r in rules} | |||
| for r in rules: | |||
| self.rules_by_origin[r.origin].append(r) | |||
| self.rules_by_origin[root_rule.origin] = [root_rule] | |||
| for r in rules: | |||
| for sym in r.expansion: | |||
| if not (is_terminal(sym) or sym in self.rules_by_origin): | |||
| raise GrammarError("Using an undefined rule: %s" % sym) | |||
| self.init_state = self.expand_rule('$root') | |||
| self.start_state = self.expand_rule('$root') | |||
| self.rules = rules | |||
| self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules) | |||
| self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules + [root_rule]) | |||
| def expand_rule(self, rule): | |||
| "Returns all init_ptrs accessible by rule (recursive)" | |||
| @@ -14,7 +14,43 @@ from ..common import GrammarError, is_terminal | |||
| from .grammar_analysis import GrammarAnalyzer | |||
| ACTION_SHIFT = 0 | |||
| class Action: | |||
| def __init__(self, name): | |||
| self.name = name | |||
| def __str__(self): | |||
| return self.name | |||
| def __repr__(self): | |||
| return str(self) | |||
| Shift = Action('Shift') | |||
| Reduce = Action('Reduce') | |||
| class ParseTable: | |||
| def __init__(self, states, start_state, end_state): | |||
| self.states = states | |||
| self.start_state = start_state | |||
| self.end_state = end_state | |||
| class IntParseTable(ParseTable): | |||
| @classmethod | |||
| def from_ParseTable(cls, parse_table): | |||
| enum = list(parse_table.states) | |||
| state_to_idx = {s:i for i,s in enumerate(enum)} | |||
| int_states = {} | |||
| for s, la in parse_table.states.items(): | |||
| la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v | |||
| for k,v in la.items()} | |||
| int_states[ state_to_idx[s] ] = la | |||
| start_state = state_to_idx[parse_table.start_state] | |||
| end_state = state_to_idx[parse_table.end_state] | |||
| return cls(int_states, start_state, end_state) | |||
| class LALR_Analyzer(GrammarAnalyzer): | |||
| @@ -27,7 +63,7 @@ class LALR_Analyzer(GrammarAnalyzer): | |||
| sat, unsat = classify_bool(state, lambda rp: rp.is_satisfied) | |||
| for rp in sat: | |||
| for term in self.FOLLOW.get(rp.rule.origin, ()): | |||
| lookahead[term].append(('reduce', rp.rule)) | |||
| lookahead[term].append((Reduce, rp.rule)) | |||
| d = classify(unsat, lambda rp: rp.next) | |||
| for sym, rps in d.items(): | |||
| @@ -38,8 +74,8 @@ class LALR_Analyzer(GrammarAnalyzer): | |||
| rps |= self.expand_rule(rp.next) | |||
| new_state = fzset(rps) | |||
| lookahead[sym].append(('shift', new_state)) | |||
| if sym == '$end': | |||
| lookahead[sym].append((Shift, new_state)) | |||
| if sym == '$END': | |||
| self.end_states.append( new_state ) | |||
| yield fzset(rps) | |||
| @@ -50,7 +86,7 @@ class LALR_Analyzer(GrammarAnalyzer): | |||
| for x in v: | |||
| # XXX resolving shift/reduce into shift, like PLY | |||
| # Give a proper warning | |||
| if x[0] == 'shift': | |||
| if x[0] is Shift: | |||
| lookahead[k] = [x] | |||
| for k, v in lookahead.items(): | |||
| @@ -59,22 +95,15 @@ class LALR_Analyzer(GrammarAnalyzer): | |||
| self.states[state] = {k:v[0] for k, v in lookahead.items()} | |||
| for _ in bfs([self.init_state], step): | |||
| for _ in bfs([self.start_state], step): | |||
| pass | |||
| self.end_state ,= self.end_states | |||
| # -- | |||
| self.enum = list(self.states) | |||
| self.enum_rev = {s:i for i,s in enumerate(self.enum)} | |||
| self.states_idx = {} | |||
| for s, la in self.states.items(): | |||
| la = {k:(ACTION_SHIFT, self.enum_rev[v[1]]) if v[0]=='shift' | |||
| else (v[0], (v[1], len(v[1].expansion))) # Reduce | |||
| for k,v in la.items()} | |||
| self.states_idx[ self.enum_rev[s] ] = la | |||
| self._parse_table = ParseTable(self.states, self.start_state, self.end_state) | |||
| if self.debug: | |||
| self.parse_table = self._parse_table | |||
| else: | |||
| self.parse_table = IntParseTable.from_ParseTable(self._parse_table) | |||
| self.init_state_idx = self.enum_rev[self.init_state] | |||
| self.end_state_idx = self.enum_rev[self.end_state] | |||
| @@ -3,30 +3,30 @@ | |||
| # Author: Erez Shinan (2017) | |||
| # Email : erezshin@gmail.com | |||
| from ..common import ParseError, UnexpectedToken | |||
| from ..common import UnexpectedToken | |||
| from .lalr_analysis import LALR_Analyzer, ACTION_SHIFT | |||
| class FinalReduce: | |||
| def __init__(self, value): | |||
| self.value = value | |||
| from .lalr_analysis import LALR_Analyzer, Shift | |||
| class Parser: | |||
| def __init__(self, parser_conf): | |||
| assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization" | |||
| self.analysis = analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start) | |||
| assert all(r.options is None or r.options.priority is None | |||
| for r in parser_conf.rules), "LALR doesn't yet support prioritization" | |||
| self.analysis = analysis = LALR_Analyzer(parser_conf) | |||
| analysis.compute_lookahead() | |||
| callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None) | |||
| for rule in analysis.rules} | |||
| self.parser = _Parser(analysis.states_idx, analysis.init_state_idx, analysis.end_state_idx, callbacks) | |||
| self.parser_conf = parser_conf | |||
| self.parser = _Parser(analysis.parse_table, callbacks) | |||
| self.parse = self.parser.parse | |||
| ###{standalone | |||
| class _Parser: | |||
| def __init__(self, states, init_state, end_state, callbacks): | |||
| self.states = states | |||
| self.init_state = init_state | |||
| self.end_state = end_state | |||
| def __init__(self, parse_table, callbacks): | |||
| self.states = parse_table.states | |||
| self.start_state = parse_table.start_state | |||
| self.end_state = parse_table.end_state | |||
| self.callbacks = callbacks | |||
| def parse(self, seq, set_state=None): | |||
| @@ -35,10 +35,10 @@ class _Parser: | |||
| stream = iter(seq) | |||
| states = self.states | |||
| state_stack = [self.init_state] | |||
| state_stack = [self.start_state] | |||
| value_stack = [] | |||
| if set_state: set_state(self.init_state) | |||
| if set_state: set_state(self.start_state) | |||
| def get_action(key): | |||
| state = state_stack[-1] | |||
| @@ -49,7 +49,8 @@ class _Parser: | |||
| raise UnexpectedToken(token, expected, seq, i) | |||
| def reduce(rule, size): | |||
| def reduce(rule): | |||
| size = len(rule.expansion) | |||
| if size: | |||
| s = value_stack[-size:] | |||
| del state_stack[-size:] | |||
| @@ -60,7 +61,7 @@ class _Parser: | |||
| value = self.callbacks[rule](s) | |||
| _action, new_state = get_action(rule.origin) | |||
| assert _action == ACTION_SHIFT | |||
| assert _action is Shift | |||
| state_stack.append(new_state) | |||
| value_stack.append(value) | |||
| @@ -72,22 +73,24 @@ class _Parser: | |||
| action, arg = get_action(token.type) | |||
| assert arg != self.end_state | |||
| if action == ACTION_SHIFT: | |||
| if action is Shift: | |||
| state_stack.append(arg) | |||
| value_stack.append(token) | |||
| if set_state: set_state(arg) | |||
| token = next(stream) | |||
| i += 1 | |||
| else: | |||
| reduce(*arg) | |||
| reduce(arg) | |||
| except StopIteration: | |||
| pass | |||
| while True: | |||
| _action, arg = get_action('$end') | |||
| if _action == ACTION_SHIFT: | |||
| _action, arg = get_action('$END') | |||
| if _action is Shift: | |||
| assert arg == self.end_state | |||
| val ,= value_stack | |||
| return val | |||
| else: | |||
| reduce(*arg) | |||
| reduce(arg) | |||
| ###} | |||
| @@ -20,7 +20,7 @@ | |||
| from collections import defaultdict | |||
| from ..common import ParseError, UnexpectedToken, Terminal | |||
| from ..common import ParseError, UnexpectedToken, is_terminal | |||
| from ..lexer import Token, UnexpectedInput | |||
| from ..tree import Tree | |||
| from .grammar_analysis import GrammarAnalyzer | |||
| @@ -28,37 +28,34 @@ from .grammar_analysis import GrammarAnalyzer | |||
| from .earley import ApplyCallbacks, Item, Column | |||
| class Parser: | |||
| def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False): | |||
| self.analysis = GrammarAnalyzer(rules, start_symbol) | |||
| self.start_symbol = start_symbol | |||
| def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False): | |||
| self.analysis = GrammarAnalyzer(parser_conf) | |||
| self.parser_conf = parser_conf | |||
| self.resolve_ambiguity = resolve_ambiguity | |||
| self.ignore = list(ignore) | |||
| self.predict_all = predict_all | |||
| self.FIRST = self.analysis.FIRST | |||
| self.postprocess = {} | |||
| self.predictions = {} | |||
| self.FIRST = {} | |||
| for rule in self.analysis.rules: | |||
| if rule.origin != '$root': # XXX kinda ugly | |||
| a = rule.alias | |||
| self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a)) | |||
| self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||
| for rule in parser_conf.rules: | |||
| self.postprocess[rule] = getattr(parser_conf.callback, rule.alias) | |||
| self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)] | |||
| self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin] | |||
| self.term_matcher = term_matcher | |||
| def parse(self, stream, start_symbol=None): | |||
| # Define parser functions | |||
| start_symbol = start_symbol or self.start_symbol | |||
| start_symbol = start_symbol or self.parser_conf.start | |||
| delayed_matches = defaultdict(list) | |||
| match = self.term_matcher | |||
| text_line = 1 | |||
| text_column = 0 | |||
| def predict(nonterm, column): | |||
| assert not isinstance(nonterm, Terminal), nonterm | |||
| assert not is_terminal(nonterm), nonterm | |||
| return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]] | |||
| def complete(item): | |||
| @@ -77,16 +74,15 @@ class Parser: | |||
| column.add( predict(nonterm, column) ) | |||
| for item in to_reduce: | |||
| new_items = list(complete(item)) | |||
| for new_item in new_items: | |||
| if new_item.similar(item): | |||
| raise ParseError('Infinite recursion detected! (rule %s)' % new_item.rule) | |||
| if item in new_items: | |||
| raise ParseError('Infinite recursion detected! (rule %s)' % item.rule) | |||
| column.add(new_items) | |||
| def scan(i, token, column): | |||
| to_scan = column.to_scan | |||
| for x in self.ignore: | |||
| m = x.match(stream, i) | |||
| m = match(x, stream, i) | |||
| if m: | |||
| delayed_matches[m.end()] += set(to_scan) | |||
| delayed_matches[m.end()] += set(column.to_reduce) | |||
| @@ -99,16 +95,16 @@ class Parser: | |||
| # delayed_matches[m.end()] += to_scan | |||
| for item in to_scan: | |||
| m = item.expect.match(stream, i) | |||
| m = match(item.expect, stream, i) | |||
| if m: | |||
| t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||
| t = Token(item.expect, m.group(0), i, text_line, text_column) | |||
| delayed_matches[m.end()].append(item.advance(t)) | |||
| s = m.group(0) | |||
| for j in range(1, len(s)): | |||
| m = item.expect.match(s[:-j]) | |||
| m = match(item.expect, s[:-j]) | |||
| if m: | |||
| t = Token(item.expect.name, m.group(0), i, text_line, text_column) | |||
| t = Token(item.expect, m.group(0), i, text_line, text_column) | |||
| delayed_matches[i+m.end()].append(item.advance(t)) | |||
| next_set = Column(i+1, self.FIRST, predict_all=self.predict_all) | |||
| @@ -131,7 +127,7 @@ class Parser: | |||
| if token == '\n': | |||
| text_line += 1 | |||
| text_column = 1 | |||
| text_column = 0 | |||
| else: | |||
| text_column += 1 | |||
| @@ -143,7 +139,7 @@ class Parser: | |||
| if n.rule.origin==start_symbol and n.start is column0] | |||
| if not solutions: | |||
| expected_tokens = [t.expect.name for t in column.to_scan] | |||
| expected_tokens = [t.expect for t in column.to_scan] | |||
| raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens) | |||
| elif len(solutions) == 1: | |||
| @@ -0,0 +1,203 @@ | |||
| ###{standalone | |||
| # | |||
| # | |||
| # Lark Stand-alone Generator Tool | |||
| # ---------------------------------- | |||
| # Generates a stand-alone LALR(1) parser with a standard lexer | |||
| # | |||
| # Git: https://github.com/erezsh/lark | |||
| # Author: Erez Shinan (erezshin@gmail.com) | |||
| # | |||
| # | |||
| # >>> LICENSE | |||
| # | |||
| # This tool and its generated code use a separate license from Lark. | |||
| # | |||
| # It is licensed under GPLv2 or above. | |||
| # | |||
| # If you wish to purchase a commercial license for this tool and its | |||
| # generated code, contact me via email. | |||
| # | |||
| # This program is free software: you can redistribute it and/or modify | |||
| # it under the terms of the GNU General Public License as published by | |||
| # the Free Software Foundation, either version 2 of the License, or | |||
| # (at your option) any later version. | |||
| # | |||
| # This program is distributed in the hope that it will be useful, | |||
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | |||
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |||
| # GNU General Public License for more details. | |||
| # | |||
| # See <http://www.gnu.org/licenses/>. | |||
| # | |||
| # | |||
| ###} | |||
| import codecs | |||
| import sys | |||
| import os | |||
| from pprint import pprint | |||
| from os import path | |||
| from collections import defaultdict | |||
| import lark | |||
| from lark import Lark | |||
| from lark.parsers.lalr_analysis import Shift, Reduce | |||
| from ..grammar import Rule | |||
| __dir__ = path.dirname(__file__) | |||
| __larkdir__ = path.join(__dir__, path.pardir) | |||
| EXTRACT_STANDALONE_FILES = [ | |||
| 'tools/standalone.py', | |||
| 'utils.py', | |||
| 'common.py', | |||
| 'tree.py', | |||
| 'indenter.py', | |||
| 'lexer.py', | |||
| 'parse_tree_builder.py', | |||
| 'parsers/lalr_parser.py', | |||
| ] | |||
| def extract_sections(lines): | |||
| section = None | |||
| text = [] | |||
| sections = defaultdict(list) | |||
| for l in lines: | |||
| if l.startswith('###'): | |||
| if l[3] == '{': | |||
| section = l[4:].strip() | |||
| elif l[3] == '}': | |||
| sections[section] += text | |||
| section = None | |||
| text = [] | |||
| else: | |||
| raise ValueError(l) | |||
| elif section: | |||
| text.append(l) | |||
| return {name:''.join(text) for name, text in sections.items()} | |||
| class LexerAtoms: | |||
| def __init__(self, lexer): | |||
| self.mres = [(p.pattern,d) for p,d in lexer.mres] | |||
| self.newline_types = lexer.newline_types | |||
| self.ignore_types = lexer.ignore_types | |||
| self.callback = {name:[(p.pattern,d) for p,d in c.mres] | |||
| for name, c in lexer.callback.items()} | |||
| def print_python(self): | |||
| print('import re') | |||
| print('MRES = (') | |||
| pprint(self.mres) | |||
| print(')') | |||
| print('LEXER_CALLBACK = (') | |||
| pprint(self.callback) | |||
| print(')') | |||
| print('NEWLINE_TYPES = %s' % self.newline_types) | |||
| print('IGNORE_TYPES = %s' % self.ignore_types) | |||
| print('class LexerRegexps: pass') | |||
| print('lexer_regexps = LexerRegexps()') | |||
| print('lexer_regexps.mres = [(re.compile(p), d) for p, d in MRES]') | |||
| print('lexer_regexps.callback = {n: UnlessCallback([(re.compile(p), d) for p, d in mres])') | |||
| print(' for n, mres in LEXER_CALLBACK.items()}') | |||
| print('lexer = _Lex(lexer_regexps)') | |||
| print('def lex(stream):') | |||
| print(' return lexer.lex(stream, NEWLINE_TYPES, IGNORE_TYPES)') | |||
| class GetRule: | |||
| def __init__(self, rule_id): | |||
| self.rule_id = rule_id | |||
| def __repr__(self): | |||
| return 'RULES[%d]' % self.rule_id | |||
| rule_ids = {} | |||
| token_types = {} | |||
| def _get_token_type(token_type): | |||
| if token_type not in token_types: | |||
| token_types[token_type] = len(token_types) | |||
| return token_types[token_type] | |||
| class ParserAtoms: | |||
| def __init__(self, parser): | |||
| self.parse_table = parser.analysis.parse_table | |||
| def print_python(self): | |||
| print('class ParseTable: pass') | |||
| print('parse_table = ParseTable()') | |||
| print('STATES = {') | |||
| for state, actions in self.parse_table.states.items(): | |||
| print(' %r: %r,' % (state, {_get_token_type(token): ((1, rule_ids[arg]) if action is Reduce else (0, arg)) | |||
| for token, (action, arg) in actions.items()})) | |||
| print('}') | |||
| print('TOKEN_TYPES = (') | |||
| pprint({v:k for k, v in token_types.items()}) | |||
| print(')') | |||
| print('parse_table.states = {s: {TOKEN_TYPES[t]: (a, RULES[x] if a is Reduce else x) for t, (a, x) in acts.items()}') | |||
| print(' for s, acts in STATES.items()}') | |||
| print('parse_table.start_state = %s' % self.parse_table.start_state) | |||
| print('parse_table.end_state = %s' % self.parse_table.end_state) | |||
| print('class Lark_StandAlone:') | |||
| print(' def __init__(self, transformer=None, postlex=None):') | |||
| print(' callback = parse_tree_builder.create_callback(transformer=transformer)') | |||
| print(' callbacks = {rule: getattr(callback, rule.alias or rule.origin, None) for rule in RULES.values()}') | |||
| print(' self.parser = _Parser(parse_table, callbacks)') | |||
| print(' self.postlex = postlex') | |||
| print(' def parse(self, stream):') | |||
| print(' tokens = lex(stream)') | |||
| print(' if self.postlex: tokens = self.postlex.process(tokens)') | |||
| print(' return self.parser.parse(tokens)') | |||
| class TreeBuilderAtoms: | |||
| def __init__(self, lark): | |||
| self.rules = lark.rules | |||
| self.ptb = lark._parse_tree_builder | |||
| def print_python(self): | |||
| print('RULES = {') | |||
| for i, r in enumerate(self.rules): | |||
| rule_ids[r] = i | |||
| print(' %d: Rule(%r, %r, %r, %r),' % (i, r.origin, r.expansion, self.ptb.user_aliases[r], r.options )) | |||
| print('}') | |||
| print('parse_tree_builder = ParseTreeBuilder(RULES.values(), Tree)') | |||
| def main(fn, start): | |||
| with codecs.open(fn, encoding='utf8') as f: | |||
| lark_inst = Lark(f, parser="lalr", start=start) | |||
| lexer_atoms = LexerAtoms(lark_inst.parser.lexer) | |||
| parser_atoms = ParserAtoms(lark_inst.parser.parser) | |||
| tree_builder_atoms = TreeBuilderAtoms(lark_inst) | |||
| print('# The file was automatically generated by Lark v%s' % lark.__version__) | |||
| for pyfile in EXTRACT_STANDALONE_FILES: | |||
| print (extract_sections(open(os.path.join(__larkdir__, pyfile)))['standalone']) | |||
| print(open(os.path.join(__larkdir__, 'grammar.py')).read()) | |||
| print('Shift = 0') | |||
| print('Reduce = 1') | |||
| lexer_atoms.print_python() | |||
| tree_builder_atoms.print_python() | |||
| parser_atoms.print_python() | |||
| if __name__ == '__main__': | |||
| if len(sys.argv) < 2: | |||
| print("Lark Stand-alone Generator Tool") | |||
| print("Usage: python -m lark.tools.standalone <grammar-file> [<start>]") | |||
| sys.exit(1) | |||
| if len(sys.argv) == 3: | |||
| fn, start = sys.argv[1:] | |||
| elif len(sys.argv) == 2: | |||
| fn, start = sys.argv[1], 'start' | |||
| else: | |||
| assert False, sys.argv | |||
| main(fn, start) | |||
| @@ -7,6 +7,7 @@ from copy import deepcopy | |||
| from .utils import inline_args | |||
| ###{standalone | |||
| class Tree(object): | |||
| def __init__(self, data, children, rule=None): | |||
| self.data = data | |||
| @@ -34,6 +35,7 @@ class Tree(object): | |||
| def pretty(self, indent_str=' '): | |||
| return ''.join(self._pretty(0, indent_str)) | |||
| ###} | |||
| def expand_kids_by_index(self, *indices): | |||
| for i in sorted(indices, reverse=True): # reverse so that changing tail won't affect indices | |||
| @@ -100,6 +102,7 @@ class Tree(object): | |||
| ###{standalone | |||
| class Transformer(object): | |||
| def _get_func(self, name): | |||
| return getattr(self, name) | |||
| @@ -139,7 +142,7 @@ class TransformerChain(object): | |||
| def __mul__(self, other): | |||
| return TransformerChain(*self.transformers + (other,)) | |||
| class InlineTransformer(Transformer): | |||
| @@ -196,6 +199,7 @@ class Transformer_NoRecurse(Transformer): | |||
| def __default__(self, t): | |||
| return t | |||
| ###} | |||
| def pydot__tree_to_png(tree, filename): | |||
| @@ -1,7 +1,4 @@ | |||
| import functools | |||
| import types | |||
| from collections import deque | |||
| from contextlib import contextmanager | |||
| class fzset(frozenset): | |||
| def __repr__(self): | |||
| @@ -49,8 +46,13 @@ try: | |||
| except NameError: # Python 3 | |||
| STRING_TYPE = str | |||
| Str = type(u'') | |||
| ###{standalone | |||
| import types | |||
| import functools | |||
| from contextlib import contextmanager | |||
| Str = type(u'') | |||
| def inline_args(f): | |||
| # print '@@', f.__name__, type(f), isinstance(f, types.FunctionType), isinstance(f, types.TypeType), isinstance(f, types.BuiltinFunctionType) | |||
| @@ -76,19 +78,6 @@ def inline_args(f): | |||
| return _f | |||
| try: | |||
| compare = cmp | |||
| except NameError: | |||
| def compare(a, b): | |||
| if a == b: | |||
| return 0 | |||
| elif a > b: | |||
| return 1 | |||
| else: | |||
| return -1 | |||
| try: | |||
| from contextlib import suppress # Python 3 | |||
| except ImportError: | |||
| @@ -107,6 +96,26 @@ except ImportError: | |||
| except excs: | |||
| pass | |||
| ###} | |||
| try: | |||
| compare = cmp | |||
| except NameError: | |||
| def compare(a, b): | |||
| if a == b: | |||
| return 0 | |||
| elif a > b: | |||
| return 1 | |||
| else: | |||
| return -1 | |||
| import sre_parse | |||
| import sre_constants | |||
| def get_regexp_width(regexp): | |||
| try: | |||
| return sre_parse.parse(regexp).getwidth() | |||
| except sre_constants.error: | |||
| raise ValueError(regexp) | |||
| @@ -126,7 +126,7 @@ class TestParsers(unittest.TestCase): | |||
| r = T().transform(g.parse("x")) | |||
| self.assertEqual( r.children, ["<b>"] ) | |||
| g = Lark("""start: a | |||
| ?a : b | |||
| b : "x" | |||
| @@ -142,14 +142,14 @@ class TestParsers(unittest.TestCase): | |||
| r = T().transform(g.parse("xx")) | |||
| self.assertEqual( r.children, ["<c>"] ) | |||
| g = Lark("""start: a | |||
| ?a : b b -> c | |||
| b : "x" | |||
| """, parser='lalr', transformer=T()) | |||
| r = g.parse("xx") | |||
| self.assertEqual( r.children, ["<c>"] ) | |||
| @@ -159,7 +159,7 @@ def _make_full_earley_test(LEXER): | |||
| # Fails an Earley implementation without special handling for empty rules, | |||
| # or re-processing of already completed rules. | |||
| g = Lark(r"""start: B | |||
| B: ("ab"|/[^b]/)* | |||
| B: ("ab"|/[^b]/)+ | |||
| """, lexer=LEXER) | |||
| self.assertEqual( g.parse('abc').children[0], 'abc') | |||
| @@ -796,6 +796,49 @@ def _make_parser_test(LEXER, PARSER): | |||
| self.assertEqual(tree.children, ['a', 'A']) | |||
| def test_twice_empty(self): | |||
| g = """!start: [["A"]] | |||
| """ | |||
| l = _Lark(g) | |||
| tree = l.parse('A') | |||
| self.assertEqual(tree.children, ['A']) | |||
| tree = l.parse('') | |||
| self.assertEqual(tree.children, []) | |||
| def test_undefined_ignore(self): | |||
| g = """!start: "A" | |||
| %ignore B | |||
| """ | |||
| self.assertRaises( GrammarError, _Lark, g) | |||
| @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO | |||
| def test_line_and_column(self): | |||
| g = r"""!start: "A" bc "D" | |||
| !bc: "B\nC" | |||
| """ | |||
| l = _Lark(g) | |||
| a, bc, d = l.parse("AB\nCD").children | |||
| self.assertEqual(a.line, 1) | |||
| self.assertEqual(a.column, 0) | |||
| bc ,= bc.children | |||
| self.assertEqual(bc.line, 1) | |||
| self.assertEqual(bc.column, 1) | |||
| self.assertEqual(d.line, 2) | |||
| self.assertEqual(d.column, 1) | |||
| # self.assertEqual(a.end_line, 1) | |||
| # self.assertEqual(a.end_col, 1) | |||
| # self.assertEqual(bc.end_line, 2) | |||
| # self.assertEqual(bc.end_col, 1) | |||
| # self.assertEqual(d.end_line, 2) | |||
| # self.assertEqual(d.end_col, 2) | |||
| def test_reduce_cycle(self): | |||
| """Tests an edge-condition in the LALR parser, in which a transition state looks exactly like the end state. | |||
| It seems that the correct solution is to explicitely distinguish finalization in the reduce() function. | |||
| @@ -969,7 +1012,7 @@ def _make_parser_test(LEXER, PARSER): | |||
| parser = _Lark(grammar) | |||
| tree = parser.parse("int 1 ! This is a comment\n") | |||
| tree = parser.parse("int 1 ! This is a comment\n") | |||
| self.assertEqual(tree.children, ['1']) | |||
| tree = parser.parse("int 1 ! This is a comment") # A trailing ignore token can be tricky! | |||
| @@ -983,6 +1026,7 @@ def _make_parser_test(LEXER, PARSER): | |||
| self.assertEqual(tree.children, []) | |||
| @unittest.skipIf(LEXER==None, "Scanless doesn't support regular expressions") | |||
| def test_regex_escaping(self): | |||
| g = _Lark("start: /[ab]/") | |||