- Merging updated upstream into branch for file extension changes.
- Will push so the Pull Request has no remaining conflicts.
- Will also change the file type of the Lark example grammar.
@@ -165,3 +165,5 @@ If you're interested in taking one of these on, let me know and I will provide m
 If you have any questions or want my assistance, you can email me at erezshin at gmail com.
 I'm also available for contract work.
+
+ -- [Erez](https://github.com/erezsh)
@@ -7,9 +7,11 @@
 - [indented\_tree.py](indented_tree.py) - A demonstration of parsing indentation ("whitespace significant" language)
 - [fruitflies.py](fruitflies.py) - A demonstration of ambiguity
 - [turtle\_dsl.py](turtle_dsl.py) - Implements a LOGO-like toy language for Python's turtle, with interpreter.
+- [lark\_grammar.py](lark_grammar.py) + [lark.g](lark.g) - A reference implementation of the Lark grammar (using LALR(1) + standard lexer)
 
 ### Advanced
 
+- [error\_reporting\_lalr.py](error_reporting_lalr.py) - A demonstration of example-driven error reporting with the LALR parser
 - [python\_parser.py](python_parser.py) - A fully-working Python 2 & 3 parser (but not production ready yet!)
 - [conf.py](conf.py) - Demonstrates the power of LALR's contextual lexer on a toy configuration language
 - [reconstruct\_json.py](reconstruct_json.py) - Demonstrates the experimental text-reconstruction feature
@@ -0,0 +1,81 @@
+#
+# This demonstrates example-driven error reporting with the LALR parser
+#
+
+from lark import Lark, UnexpectedToken
+
+from .json_parser import json_grammar   # Using the grammar from the json_parser example
+
+json_parser = Lark(json_grammar, parser='lalr')
+
+class JsonSyntaxError(SyntaxError):
+    def __str__(self):
+        context, line, column = self.args
+        return '%s at line %s, column %s.\n\n%s' % (self.label, line, column, context)
+
+class JsonMissingValue(JsonSyntaxError):
+    label = 'Missing Value'
+
+class JsonMissingOpening(JsonSyntaxError):
+    label = 'Missing Opening'
+
+class JsonMissingClosing(JsonSyntaxError):
+    label = 'Missing Closing'
+
+class JsonMissingComma(JsonSyntaxError):
+    label = 'Missing Comma'
+
+class JsonTrailingComma(JsonSyntaxError):
+    label = 'Trailing Comma'
+
+def parse(json_text):
+    try:
+        j = json_parser.parse(json_text)
+    except UnexpectedToken as ut:
+        exc_class = ut.match_examples(json_parser.parse, {
+            JsonMissingValue: ['{"foo": }'],
+            JsonMissingOpening: ['{"foo": ]}',
+                                 '{"foor": }}'],
+            JsonMissingClosing: ['{"foo": [}',
+                                 '{',
+                                 '{"a": 1',
+                                 '[1'],
+            JsonMissingComma: ['[1 2]',
+                               '[false 1]',
+                               '["b" 1]',
+                               '{"a":true 1:4}',
+                               '{"a":1 1:4}',
+                               '{"a":"b" 1:4}'],
+            JsonTrailingComma: ['[,]',
+                                '[1,]',
+                                '[1,2,]',
+                                '{"foo":1,}',
+                                '{"foo":false,"bar":true,}']
+        })
+        if not exc_class:
+            raise
+        raise exc_class(ut.get_context(json_text), ut.line, ut.column)
+
+def test():
+    try:
+        parse('{"key":')
+    except JsonMissingValue:
+        pass
+
+    try:
+        parse('{"key": "value"')
+    except JsonMissingClosing:
+        pass
+
+    try:
+        parse('{"key": ] ')
+    except JsonMissingOpening:
+        pass
+
+if __name__ == '__main__':
+    test()
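The new example above keys each custom exception class on a list of malformed inputs that fail in the same parser state. A rough usage sketch follows; the import path, input string and printed text are illustrative assumptions, not part of the diff:

    # Assumes the example is importable as part of the examples package,
    # as its relative import of json_parser requires.
    from examples.error_reporting_lalr import parse, JsonMissingValue

    try:
        parse('{"answer": }')
    except JsonMissingValue as e:
        print(e)
        # Expected to print something like:
        #   Missing Value at line 1, column 12.
        #
        #   {"answer": }
        #              ^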
@@ -0,0 +1,49 @@
+start: (_item | _NL)*
+
+_item: rule
+     | token
+     | statement
+
+rule: RULE priority? ":" expansions _NL
+token: TOKEN priority? ":" expansions _NL
+
+priority: "." NUMBER
+
+statement: "%ignore" expansions _NL -> ignore
+         | "%import" import_args ["->" TOKEN] _NL -> import
+
+import_args: name ("." name)*
+
+?expansions: alias (_VBAR alias)*
+
+?alias: expansion ["->" RULE]
+
+?expansion: expr*
+
+?expr: atom [OP | "~" NUMBER [".." NUMBER]]
+
+?atom: "(" expansions ")"
+     | "[" expansions "]" -> maybe
+     | STRING ".." STRING -> literal_range
+     | name
+     | (REGEXP | STRING) -> literal
+
+name: RULE
+    | TOKEN
+
+_VBAR: _NL? "|"
+OP: /[+*][?]?|[?](?![a-z])/
+RULE: /!?[_?]?[a-z][_a-z0-9]*/
+TOKEN: /_?[A-Z][_A-Z0-9]*/
+STRING: _STRING "i"?
+REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/\n])*?\/[imslux]*/
+_NL: /(\r?\n)+\s*/
+
+%import common.ESCAPED_STRING -> _STRING
+%import common.INT -> NUMBER
+%import common.WS_INLINE
+
+COMMENT: "//" /[^\n]/*
+
+%ignore WS_INLINE
+%ignore COMMENT
@@ -0,0 +1,18 @@
+from lark import Lark
+
+parser = Lark(open('examples/lark.g'), parser="lalr")
+
+grammar_files = [
+    'examples/python2.g',
+    'examples/python3.g',
+    'examples/lark.g',
+    'lark/grammars/common.g',
+]
+
+def test():
+    for grammar_file in grammar_files:
+        tree = parser.parse(open(grammar_file).read())
+    print("All grammars parsed successfully")
+
+if __name__ == '__main__':
+    test()
@@ -4,4 +4,4 @@ from .lexer import UnexpectedInput, LexError
 from .lark import Lark
 from .utils import inline_args
 
-__version__ = "0.5.5"
+__version__ = "0.5.6"
@@ -1,7 +1,7 @@
 import re
 import sys
 
-from .utils import get_regexp_width
+from .utils import get_regexp_width, STRING_TYPE
 
 Py36 = (sys.version_info[:2] >= (3, 6))
@@ -17,12 +17,13 @@ class ParseError(Exception):
     pass
 
 class UnexpectedToken(ParseError):
-    def __init__(self, token, expected, seq, index, considered_rules=None):
+    def __init__(self, token, expected, seq, index, considered_rules=None, state=None):
         self.token = token
         self.expected = expected
         self.line = getattr(token, 'line', '?')
         self.column = getattr(token, 'column', '?')
         self.considered_rules = considered_rules
+        self.state = state
 
         try:
             context = ' '.join(['%r(%s)' % (t.value, t.type) for t in seq[index:index+5]])
@@ -36,7 +37,36 @@ class UnexpectedToken(ParseError):
 
         super(UnexpectedToken, self).__init__(message)
 
+    def match_examples(self, parse_fn, examples):
+        """Given a parser instance and a dictionary mapping labels to lists of
+        malformed syntax examples, returns the label of the example that best
+        matches the current error.
+        """
+        assert self.state, "Not supported for this exception"
+
+        candidate = None
+        for label, example in examples.items():
+            assert not isinstance(example, STRING_TYPE)
+
+            for malformed in example:
+                try:
+                    parse_fn(malformed)
+                except UnexpectedToken as ut:
+                    if ut.state == self.state:
+                        if ut.token == self.token:  # Try exact match first
+                            return label
+                        elif not candidate:
+                            candidate = label
+
+        return candidate
+
+    def get_context(self, text, span=10):
+        pos = self.token.pos_in_stream
+        start = max(pos - span, 0)
+        end = pos + span
+        before = text[start:pos].rsplit('\n', 1)[-1]
+        after = text[pos:end].split('\n', 1)[0]
+        return before + after + '\n' + ' ' * len(before) + '^\n'
+
 ###}
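A short sketch of how these two new helpers behave (the toy grammar and input below are hypothetical, not taken from the diff): get_context() returns the offending line with a caret under the failure position, and match_examples() re-parses each malformed example, returning the label whose failure reaches the same LALR parser state, preferring an exact token match.

    from lark import Lark, UnexpectedToken

    parser = Lark('start: "a" "b"\n', parser='lalr')
    try:
        parser.parse('aa')                      # second token should be "b"
    except UnexpectedToken as u:
        print(u.get_context('aa'))              # prints the offending line plus a caret:
                                                #   aa
                                                #    ^
        # Each example is re-parsed; a failure in the same parser state wins the label.
        label = u.match_examples(parser.parse, {'expected "b"': ['aa']})
        assert label == 'expected "b"'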
@@ -20,6 +20,7 @@ SIGNED_NUMBER: ["+"|"-"] NUMBER
 //
 // Strings
 //
+//STRING: /"(\\\"|\\\\|[^"\n])*?"i?/
 STRING_INNER: ("\\\""|/[^"]/)
 ESCAPED_STRING: "\"" STRING_INNER* "\""
@@ -172,7 +172,7 @@ class Lark:
     def _build_parser(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
 
-        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens)
+        self._parse_tree_builder = ParseTreeBuilder(self.rules, self.options.tree_class, self.options.propagate_positions, self.options.keep_all_tokens, self.options.parser!='lalr')
         callback = self._parse_tree_builder.create_callback(self.options.transformer)
         if self.profiler:
             for f in dir(callback):
@@ -25,6 +25,8 @@ class UnexpectedInput(LexError):
         self.considered_rules = considered_rules
 
 class Token(Str):
+    __slots__ = ('type', 'pos_in_stream', 'value', 'line', 'column', 'end_line', 'end_column')
+
     def __new__(cls, type_, value, pos_in_stream=None, line=None, column=None):
         self = super(Token, cls).__new__(cls, value)
         self.type = type_
@@ -39,7 +41,7 @@ class Token(Str):
         return cls(type_, value, borrow_t.pos_in_stream, line=borrow_t.line, column=borrow_t.column)
 
     def __reduce__(self):
-        return (self.__class__, (self.type, self.pos_in_stream, self.value, self.line, self.column, ))
+        return (self.__class__, (self.type, self.value, self.pos_in_stream, self.line, self.column, ))
 
     def __repr__(self):
         return 'Token(%s, %r)' % (self.type, self.value)
@@ -141,6 +143,8 @@ def _create_unless(tokens):
     for retok in tokens_by_type.get(PatternRE, []):
         unless = [] # {}
         for strtok in tokens_by_type.get(PatternStr, []):
+            if strtok.priority > retok.priority:
+                continue
             s = strtok.pattern.value
             m = re.match(retok.pattern.to_regexp(), s)
             if m and m.group(0) == s:
@@ -14,7 +14,7 @@ from .parsers.lalr_parser import UnexpectedToken
 from .common import is_terminal, GrammarError, LexerConf, ParserConf, PatternStr, PatternRE, TokenDef
 from .grammar import RuleOptions, Rule
 
-from .tree import Tree as T, Transformer, InlineTransformer, Visitor
+from .tree import Tree, Transformer, InlineTransformer, Visitor, SlottedTree as ST
 
 __path__ = os.path.dirname(__file__)
 IMPORT_PATHS = [os.path.join(__path__, 'grammars')]
@@ -122,7 +122,7 @@ RULES = {
     'statement': ['ignore', 'import'],
     'ignore': ['_IGNORE expansions _NL'],
     'import': ['_IMPORT import_args _NL',
-               '_IMPORT import_args _TO TOKEN'],
+               '_IMPORT import_args _TO TOKEN _NL'],
     'import_args': ['_import_args'],
     '_import_args': ['name', '_import_args _DOT name'],
@@ -145,14 +145,14 @@ class EBNF_to_BNF(InlineTransformer):
         new_name = '__%s_%s_%d' % (self.prefix, type_, self.i)
         self.i += 1
         t = Token('RULE', new_name, -1)
-        tree = T('expansions', [T('expansion', [expr]), T('expansion', [t, expr])])
+        tree = ST('expansions', [ST('expansion', [expr]), ST('expansion', [t, expr])])
         self.new_rules.append((new_name, tree, self.rule_options))
         self.rules_by_expr[expr] = t
         return t
 
     def expr(self, rule, op, *args):
         if op.value == '?':
-            return T('expansions', [rule, T('expansion', [])])
+            return ST('expansions', [rule, ST('expansion', [])])
         elif op.value == '+':
             # a : b c+ d
             #   -->
@@ -165,7 +165,7 @@ class EBNF_to_BNF(InlineTransformer):
             # a : b _c? d
             # _c : _c c | c;
             new_name = self._add_recurse_rule('star', rule)
-            return T('expansions', [new_name, T('expansion', [])])
+            return ST('expansions', [new_name, ST('expansion', [])])
         elif op.value == '~':
             if len(args) == 1:
                 mn = mx = int(args[0])
@@ -173,7 +173,7 @@ class EBNF_to_BNF(InlineTransformer):
                 mn, mx = map(int, args)
                 if mx < mn:
                     raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
-            return T('expansions', [T('expansion', [rule] * n) for n in range(mn, mx+1)])
+            return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
         assert False, op
@@ -183,7 +183,7 @@ class SimplifyRule_Visitor(Visitor):
     def _flatten(tree):
         while True:
             to_expand = [i for i, child in enumerate(tree.children)
-                         if isinstance(child, T) and child.data == tree.data]
+                         if isinstance(child, Tree) and child.data == tree.data]
             if not to_expand:
                 break
             tree.expand_kids_by_index(*to_expand)
@@ -203,9 +203,9 @@ class SimplifyRule_Visitor(Visitor):
         self._flatten(tree)
 
         for i, child in enumerate(tree.children):
-            if isinstance(child, T) and child.data == 'expansions':
+            if isinstance(child, Tree) and child.data == 'expansions':
                 tree.data = 'expansions'
-                tree.children = [self.visit(T('expansion', [option if i==j else other
+                tree.children = [self.visit(ST('expansion', [option if i==j else other
                                                             for j, other in enumerate(tree.children)]))
                                  for option in set(child.children)]
                 break
@@ -217,7 +217,7 @@ class SimplifyRule_Visitor(Visitor):
         if rule.data == 'expansions':
             aliases = []
             for child in tree.children[0].children:
-                aliases.append(T('alias', [child, alias_name]))
+                aliases.append(ST('alias', [child, alias_name]))
             tree.data = 'expansions'
             tree.children = aliases
@@ -239,7 +239,7 @@ class RuleTreeToText(Transformer):
 
 class CanonizeTree(InlineTransformer):
     def maybe(self, expr):
-        return T('expr', [expr, Token('OP', '?', -1)])
+        return ST('expr', [expr, Token('OP', '?', -1)])
 
     def tokenmods(self, *args):
         if len(args) == 1:
@@ -353,7 +353,7 @@ def _literal_to_pattern(literal):
 
 class PrepareLiterals(InlineTransformer):
     def literal(self, literal):
-        return T('pattern', [_literal_to_pattern(literal)])
+        return ST('pattern', [_literal_to_pattern(literal)])
 
     def range(self, start, end):
         assert start.type == end.type == 'STRING'
@@ -361,13 +361,13 @@ class PrepareLiterals(InlineTransformer):
         end = end.value[1:-1]
         assert len(start) == len(end) == 1, (start, end, len(start), len(end))
         regexp = '[%s-%s]' % (start, end)
-        return T('pattern', [PatternRE(regexp)])
+        return ST('pattern', [PatternRE(regexp)])
 
 class SplitLiterals(InlineTransformer):
     def pattern(self, p):
         if isinstance(p, PatternStr) and len(p.value)>1:
-            return T('expansion', [T('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value])
-        return T('pattern', [p])
+            return ST('expansion', [ST('pattern', [PatternStr(ch, flags=p.flags)]) for ch in p.value])
+        return ST('pattern', [p])
 
 class TokenTreeToPattern(Transformer):
     def pattern(self, ps):
@@ -375,6 +375,7 @@ class TokenTreeToPattern(Transformer):
         return p
 
     def expansion(self, items):
+        assert items
         if len(items) == 1:
             return items[0]
         if len({i.flags for i in items}) > 1:
@@ -402,18 +403,20 @@ class TokenTreeToPattern(Transformer):
         assert len(args) == 2
         return PatternRE('(?:%s)%s' % (inner.to_regexp(), op), inner.flags)
 
+    def alias(self, t):
+        raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")
+
 def _interleave(l, item):
     for e in l:
         yield e
-        if isinstance(e, T):
+        if isinstance(e, Tree):
             if e.data in ('literal', 'range'):
                 yield item
         elif is_terminal(e):
             yield item
 
 def _choice_of_rules(rules):
-    return T('expansions', [T('expansion', [Token('RULE', name)]) for name in rules])
+    return ST('expansions', [ST('expansion', [Token('RULE', name)]) for name in rules])
 
 class Grammar:
     def __init__(self, rule_defs, token_defs, ignore):
@@ -440,9 +443,9 @@ class Grammar:
                 if r == start:
                     exp.children = [expr] + exp.children
             for exp in tree.find_data('expr'):
-                exp.children[0] = T('expansion', list(_interleave(exp.children[:1], expr)))
+                exp.children[0] = ST('expansion', list(_interleave(exp.children[:1], expr)))
 
-            _ignore_tree = T('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')])
+            _ignore_tree = ST('expr', [_choice_of_rules(terms_to_ignore.values()), Token('OP', '?')])
             rule_defs.append(('__ignore', _ignore_tree, None))
 
         # Convert all tokens to rules
@@ -455,6 +458,9 @@ class Grammar:
                     exp.children[i] = Token(sym.type, new_terminal_names[sym])
 
         for name, (tree, priority) in term_defs: # TODO transfer priority to rule?
+            if any(tree.find_data('alias')):
+                raise GrammarError("Aliasing not allowed in terminals (You used -> in the wrong place)")
+
             if name.startswith('_'):
                 options = RuleOptions(filter_out=True, priority=-priority)
             else:
@@ -481,6 +487,11 @@ class Grammar:
         # Convert token-trees to strings/regexps
         transformer = PrepareLiterals() * TokenTreeToPattern()
+        for name, (token_tree, priority) in token_defs:
+            for t in token_tree.find_data('expansion'):
+                if not t.children:
+                    raise GrammarError("Tokens cannot be empty (%s)" % name)
+
         tokens = [TokenDef(name, transformer.transform(token_tree), priority)
                   for name, (token_tree, priority) in token_defs]
@@ -516,7 +527,7 @@ class Grammar:
         for expansion, alias in expansions:
             if alias and name.startswith('_'):
-                raise Exception("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
+                raise GrammarError("Rule %s is marked for expansion (it starts with an underscore) and isn't allowed to have aliases (alias=%s)" % (name, alias))
 
             rule = Rule(name, expansion, alias, options)
             compiled_rules.append(rule)
@@ -579,7 +590,7 @@ class GrammarLoader:
         rules = [options_from_rule(name, x) for name, x in RULES.items()]
         rules = [Rule(r, x.split(), None, o) for r, xs, o in rules for x in xs]
 
-        callback = ParseTreeBuilder(rules, T).create_callback()
+        callback = ParseTreeBuilder(rules, ST).create_callback()
         lexer_conf = LexerConf(tokens, ['WS', 'COMMENT'])
 
         parser_conf = ParserConf(rules, callback, 'start')
@@ -595,14 +606,22 @@ class GrammarLoader:
         except UnexpectedInput as e:
             raise GrammarError("Unexpected input %r at line %d column %d in %s" % (e.context, e.line, e.column, name))
         except UnexpectedToken as e:
-            if e.expected == ['_COLON']:
-                raise GrammarError("Missing colon at line %s column %s" % (e.line, e.column))
-            elif e.expected == ['RULE']:
-                raise GrammarError("Missing alias at line %s column %s" % (e.line, e.column))
+            context = e.get_context(grammar_text)
+            error = e.match_examples(self.parser.parse, {
+                'Unclosed parenthesis': ['a: (\n'],
+                'Unmatched closing parenthesis': ['a: )\n', 'a: [)\n', 'a: (]\n'],
+                'Expecting rule or token definition (missing colon)': ['a\n', 'a->\n', 'A->\n', 'a A\n'],
+                'Alias expects lowercase name': ['a: -> "a"\n'],
+                'Unexpected colon': ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n'],
+                'Misplaced operator': ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n'],
+                'Expecting option ("|") or a new rule or token definition': ['a:a\n()\n'],
+                '%import expects a name': ['%import "a"\n'],
+                '%ignore expects a value': ['%ignore %import\n'],
+            })
+            if error:
+                raise GrammarError("%s at line %s column %s\n\n%s" % (error, e.line, e.column, context))
             elif 'STRING' in e.expected:
-                raise GrammarError("Expecting a value at line %s column %s" % (e.line, e.column))
-            elif e.expected == ['_OR']:
-                raise GrammarError("Newline without starting a new option (Expecting '|') at line %s column %s" % (e.line, e.column))
+                raise GrammarError("Expecting a value at line %s column %s\n\n%s" % (e.line, e.column, context))
             raise
 
         # Extract grammar items
@@ -57,6 +57,19 @@ class ChildFilter:
         self.node_builder = node_builder
         self.to_include = to_include
 
+    def __call__(self, children):
+        filtered = []
+        for i, to_expand in self.to_include:
+            if to_expand:
+                filtered += children[i].children
+            else:
+                filtered.append(children[i])
+        return self.node_builder(filtered)
+
+
+class ChildFilterLALR(ChildFilter):
+    "Optimized childfilter for LALR (assumes no duplication in parse tree, so it's safe to change it)"
+
     def __call__(self, children):
         filtered = []
         for i, to_expand in self.to_include:
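A brief aside on the split above (my reading of the change, not text from the diff): when the Earley parser runs with ambiguity, the same subtree can appear in several candidate expansions, so the base ChildFilter builds a fresh children list for each call, while ChildFilterLALR is free to reuse and alter child lists because, per its docstring, an LALR parse tree contains no duplicated nodes. A tiny illustration of the sharing that makes the distinction matter (hypothetical values):

    from lark.tree import Tree

    # One inlined subtree referenced from two ambiguous options:
    shared = Tree('_inlined', ['a', 'b'])
    option_1 = [shared, Tree('x', [])]
    option_2 = [shared, Tree('y', [])]
    # ChildFilter copies shared.children into a new list for each option;
    # ChildFilterLALR may reuse or modify children lists directly, which is
    # only safe when no subtree is shared between expansions (as in LALR output).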
@@ -73,21 +86,22 @@ class ChildFilter:
 
 def _should_expand(sym):
     return not is_terminal(sym) and sym.startswith('_')
 
-def maybe_create_child_filter(expansion, filter_out):
+def maybe_create_child_filter(expansion, filter_out, ambiguous):
     to_include = [(i, _should_expand(sym)) for i, sym in enumerate(expansion) if sym not in filter_out]
 
     if len(to_include) < len(expansion) or any(to_expand for i, to_expand in to_include):
-        return partial(ChildFilter, to_include)
+        return partial(ChildFilter if ambiguous else ChildFilterLALR, to_include)
 
 class Callback(object):
     pass
 
 class ParseTreeBuilder:
-    def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False):
+    def __init__(self, rules, tree_class, propagate_positions=False, keep_all_tokens=False, ambiguous=False):
         self.tree_class = tree_class
         self.propagate_positions = propagate_positions
         self.always_keep_all_tokens = keep_all_tokens
+        self.ambiguous = ambiguous
 
         self.rule_builders = list(self._init_builders(rules))
@@ -107,7 +121,7 @@ class ParseTreeBuilder:
             wrapper_chain = filter(None, [
                 create_token and partial(CreateToken, create_token),
                 (expand_single_child and not rule.alias) and ExpandSingleChild,
-                maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out),
+                maybe_create_child_filter(rule.expansion, () if keep_all_tokens else filter_out, self.ambiguous),
                 self.propagate_positions and PropagatePositions,
             ])
@@ -15,9 +15,9 @@ class WithLexer:
     def init_contextual_lexer(self, lexer_conf, parser_conf):
         self.lexer_conf = lexer_conf
-        d = {idx:t.keys() for idx, t in self.parser.analysis.parse_table.states.items()}
+        states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
         always_accept = lexer_conf.postlex.always_accept if lexer_conf.postlex else ()
-        self.lexer = ContextualLexer(lexer_conf.tokens, d, ignore=lexer_conf.ignore, always_accept=always_accept, user_callbacks=lexer_conf.callbacks)
+        self.lexer = ContextualLexer(lexer_conf.tokens, states, ignore=lexer_conf.ignore, always_accept=always_accept, user_callbacks=lexer_conf.callbacks)
 
     def lex(self, text):
         stream = self.lexer.lex(text)
@@ -145,16 +145,16 @@ class Column:
 
 class Parser:
     def __init__(self, parser_conf, term_matcher, resolve_ambiguity=None):
-        self.analysis = GrammarAnalyzer(parser_conf)
+        analysis = GrammarAnalyzer(parser_conf)
         self.parser_conf = parser_conf
         self.resolve_ambiguity = resolve_ambiguity
 
-        self.FIRST = self.analysis.FIRST
+        self.FIRST = analysis.FIRST
 
         self.postprocess = {}
         self.predictions = {}
         for rule in parser_conf.rules:
             self.postprocess[rule] = rule.alias if callable(rule.alias) else getattr(parser_conf.callback, rule.alias)
-            self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
+            self.predictions[rule.origin] = [x.rule for x in analysis.expand_rule(rule.origin)]
 
         self.term_matcher = term_matcher
@@ -5,6 +5,8 @@ from ..grammar import Rule
 
 class RulePtr(object):
+    __slots__ = ('rule', 'index')
+
     def __init__(self, rule, index):
         assert isinstance(rule, Rule)
         assert index <= len(rule.expansion)
@@ -134,7 +136,8 @@ class GrammarAnalyzer(object):
             if not is_terminal(new_r):
                 yield new_r
 
-        _ = list(bfs([rule], _expand_rule))
+        for _ in bfs([rule], _expand_rule):
+            pass
 
         return fzset(init_ptrs)
@@ -2,7 +2,6 @@
 """
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com
 
 from ..common import UnexpectedToken
 from .lalr_analysis import LALR_Analyzer, Shift
@@ -11,11 +10,12 @@ class Parser:
     def __init__(self, parser_conf):
         assert all(r.options is None or r.options.priority is None
                    for r in parser_conf.rules), "LALR doesn't yet support prioritization"
-        self.analysis = analysis = LALR_Analyzer(parser_conf)
+        analysis = LALR_Analyzer(parser_conf)
         analysis.compute_lookahead()
         callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None)
                      for rule in parser_conf.rules}
 
+        self._parse_table = analysis.parse_table
         self.parser_conf = parser_conf
         self.parser = _Parser(analysis.parse_table, callbacks)
         self.parse = self.parser.parse
@@ -46,8 +46,7 @@ class _Parser:
                 return states[state][key]
             except KeyError:
                 expected = states[state].keys()
-                raise UnexpectedToken(token, expected, seq, i)
+                raise UnexpectedToken(token, expected, seq, i, state=state)
 
     def reduce(rule):
         size = len(rule.expansion)
@@ -9,11 +9,7 @@ from ..tree import Tree, Visitor_NoRecurse
 # Author: Erez Sh
 
 def _compare_rules(rule1, rule2):
-    c = -compare( len(rule1.expansion), len(rule2.expansion))
-    if rule1.origin.startswith('__'):   # XXX hack! We should set priority in parser, not here
-        c = -c
-    return c
+    return -compare( len(rule1.expansion), len(rule2.expansion))
 
 def _sum_priority(tree):
     p = 0
@@ -126,7 +126,7 @@ def _get_token_type(token_type):
 
 class ParserAtoms:
     def __init__(self, parser):
-        self.parse_table = parser.analysis.parse_table
+        self.parse_table = parser._parse_table
 
     def print_python(self):
         print('class ParseTable: pass')
@@ -99,6 +99,8 @@ class Tree(object):
         self.data = data
         self.children = children
 
+class SlottedTree(Tree):
+    __slots__ = 'data', 'children', 'rule'
 
 ###{standalone
@@ -172,6 +174,30 @@ class Visitor_NoRecurse(Visitor):
         return tree
 
+from functools import wraps
+def visit_children_decor(func):
+    @wraps(func)
+    def inner(cls, tree):
+        values = cls.visit_children(tree)
+        return func(cls, values)
+    return inner
+
+class Interpreter(object):
+    def visit(self, tree):
+        return getattr(self, tree.data)(tree)
+
+    def visit_children(self, tree):
+        return [self.visit(child) if isinstance(child, Tree) else child
+                for child in tree.children]
+
+    def __getattr__(self, name):
+        return self.__default__
+
+    def __default__(self, tree):
+        return self.visit_children(tree)
+
+
 class Transformer_NoRecurse(Transformer):
     def transform(self, tree):
         subtrees = list(tree.iter_subtrees())
@@ -187,17 +187,22 @@ def _make_full_earley_test(LEXER):
             l.parse(program)
 
-        def test_earley_scanless3(self):
-            "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
+        # XXX Fails for scanless mode
+        # XXX Decided not to fix, because
+        #     a) It's a subtle bug
+        #     b) Scanless is intended for deprecation
+        #
+        # def test_earley_scanless3(self):
+        #     "Tests prioritization and disambiguation for pseudo-terminals (there should be only one result)"
 
-            grammar = """
-            start: A A
-            A: "a"+
-            """
+        #     grammar = """
+        #     start: A A
+        #     A: "a"+
+        #     """
 
-            l = Lark(grammar, parser='earley', lexer=LEXER)
-            res = l.parse("aaa")
-            self.assertEqual(res.children, ['aa', 'a'])
+        #     l = Lark(grammar, parser='earley', lexer=LEXER)
+        #     res = l.parse("aaa")
+        #     self.assertEqual(res.children, ['aa', 'a'])
 
         def test_earley_scanless4(self):
             grammar = """
@@ -293,6 +298,39 @@ def _make_full_earley_test(LEXER):
 
             self.assertEqual(res, expected)
 
+        def test_explicit_ambiguity2(self):
+            grammar = r"""
+            start: NAME+
+            NAME: /\w+/
+            %ignore " "
+            """
+            text = """cat"""
+
+            parser = Lark(grammar, start='start', ambiguity='explicit')
+            tree = parser.parse(text)
+            self.assertEqual(tree.data, '_ambig')
+
+            combinations = {tuple(str(s) for s in t.children) for t in tree.children}
+            self.assertEqual(combinations, {
+                ('cat',),
+                ('ca', 't'),
+                ('c', 'at'),
+                ('c', 'a', 't')
+            })
+
+        def test_term_ambig_resolve(self):
+            grammar = r"""
+            !start: NAME+
+            NAME: /\w+/
+            %ignore " "
+            """
+            text = """foo bar"""
+
+            parser = Lark(grammar)
+            tree = parser.parse(text)
+
+            self.assertEqual(tree.children, ['foo', 'bar'])
+
@@ -822,6 +860,12 @@ def _make_parser_test(LEXER, PARSER):
             """
             self.assertRaises( GrammarError, _Lark, g)
 
+        def test_alias_in_terminal(self):
+            g = """start: TERM
+                   TERM: "a" -> alias
+                   """
+            self.assertRaises( GrammarError, _Lark, g)
+
         @unittest.skipIf(LEXER==None, "TODO: Fix scanless parsing or get rid of it") # TODO
         def test_line_and_column(self):
             g = r"""!start: "A" bc "D"
@@ -1129,6 +1173,18 @@ def _make_parser_test(LEXER, PARSER):
             self.assertRaises((ParseError, UnexpectedInput), l.parse, u'ABB')
             self.assertRaises((ParseError, UnexpectedInput), l.parse, u'AAAABB')
 
+        @unittest.skipIf(PARSER=='earley', "Priority not handled correctly right now")   # TODO XXX
+        def test_priority_vs_embedded(self):
+            g = """
+            A.2: "a"
+            WORD: ("a".."z")+
+
+            start: (A | WORD)+
+            """
+            l = _Lark(g)
+            t = l.parse('abc')
+            self.assertEqual(t.children, ['a', 'bc'])
+            self.assertEqual(t.children[0].type, 'A')
@@ -5,7 +5,7 @@ from unittest import TestCase
 import copy
 import pickle
 
-from lark.tree import Tree
+from lark.tree import Tree, Interpreter, visit_children_decor
 
 class TestTrees(TestCase):
@@ -21,6 +21,45 @@ class TestTrees(TestCase):
         assert pickle.loads(data) == s
 
+    def test_interp(self):
+        t = Tree('a', [Tree('b', []), Tree('c', []), 'd'])
+
+        class Interp1(Interpreter):
+            def a(self, tree):
+                return self.visit_children(tree) + ['e']
+
+            def b(self, tree):
+                return 'B'
+
+            def c(self, tree):
+                return 'C'
+
+        self.assertEqual(Interp1().visit(t), list('BCde'))
+
+        class Interp2(Interpreter):
+            @visit_children_decor
+            def a(self, values):
+                return values + ['e']
+
+            def b(self, tree):
+                return 'B'
+
+            def c(self, tree):
+                return 'C'
+
+        self.assertEqual(Interp2().visit(t), list('BCde'))
+
+        class Interp3(Interpreter):
+            def b(self, tree):
+                return 'B'
+
+            def c(self, tree):
+                return 'C'
+
+        self.assertEqual(Interp3().visit(t), list('BCd'))
+
 if __name__ == '__main__':
     unittest.main()