From 51644a6c584eb9833af71c40198fdc5d8a99c904 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Wed, 25 Apr 2018 19:06:33 +0300
Subject: [PATCH] Added examples/lark.g - Reference implementation of the Lark
 grammar (inspired by issue #116)

---
 examples/README.md       |  1 +
 examples/lark.g          | 49 ++++++++++++++++++++++++++++++++++++++++
 examples/lark_grammar.py | 18 +++++++++++++++
 lark/grammars/common.g   |  1 +
 lark/load_grammar.py     |  8 ++++++-
 5 files changed, 76 insertions(+), 1 deletion(-)
 create mode 100644 examples/lark.g
 create mode 100644 examples/lark_grammar.py

diff --git a/examples/README.md b/examples/README.md
index 3fbe3ea..25bf504 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -7,6 +7,7 @@
 - [indented\_tree.py](indented\_tree.py) - A demonstration of parsing indentation ("whitespace significant" language)
 - [fruitflies.py](fruitflies.py) - A demonstration of ambiguity
 - [turtle\_dsl.py](turtle_dsl.py) - Implements a LOGO-like toy language for Python's turtle, with interpreter.
+- [lark\_grammar.py](lark_grammar.py) - A reference implementation of the Lark grammar (using LALR(1) + standard lexer)
 
 ### Advanced
 
diff --git a/examples/lark.g b/examples/lark.g
new file mode 100644
index 0000000..1fbf592
--- /dev/null
+++ b/examples/lark.g
@@ -0,0 +1,49 @@
+start: (_item | _NL)*
+
+_item: rule
+     | token
+     | statement
+
+rule: RULE priority? ":" expansions _NL
+token: TOKEN priority? ":" expansions _NL
+
+priority: "." NUMBER
+
+statement: "%ignore" expansions _NL -> ignore
+         | "%import" import_args ["->" TOKEN] _NL -> import
+
+import_args: name ("." name)*
+
+?expansions: alias (_VBAR alias)*
+
+?alias: expansion ["->" RULE]
+
+?expansion: expr*
+
+?expr: atom [OP | "~" NUMBER [".." NUMBER]]
+
+?atom: "(" expansions ")"
+     | "[" expansions "]" -> maybe
+     | STRING ".." STRING -> literal_range
+     | name
+     | (REGEXP | STRING) -> literal
+
+name: RULE
+    | TOKEN
+
+_VBAR: _NL? "|"
+OP: /[+*][?]?|[?](?![a-z])/
+RULE: /!?[_?]?[a-z][_a-z0-9]*/
+TOKEN: /_?[A-Z][_A-Z0-9]*/
+STRING: _STRING "i"?
+REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/\n])*?\/[imslux]*/
+_NL: /(\r?\n)+\s*/
+
+%import common.ESCAPED_STRING -> _STRING
+%import common.INT -> NUMBER
+%import common.WS_INLINE
+
+COMMENT: "//" /[^\n]/*
+
+%ignore WS_INLINE
+%ignore COMMENT
diff --git a/examples/lark_grammar.py b/examples/lark_grammar.py
new file mode 100644
index 0000000..88fc4cf
--- /dev/null
+++ b/examples/lark_grammar.py
@@ -0,0 +1,18 @@
+from lark import Lark
+
+parser = Lark(open('examples/lark.g'), parser="lalr")
+
+grammar_files = [
+    'examples/python2.g',
+    'examples/python3.g',
+    'examples/lark.g',
+    'lark/grammars/common.g',
+]
+
+def test():
+    for grammar_file in grammar_files:
+        tree = parser.parse(open(grammar_file).read())
+    print("All grammars parsed successfully")
+
+if __name__ == '__main__':
+    test()
diff --git a/lark/grammars/common.g b/lark/grammars/common.g
index 2bd02d0..8bc8079 100644
--- a/lark/grammars/common.g
+++ b/lark/grammars/common.g
@@ -20,6 +20,7 @@ SIGNED_NUMBER: ["+"|"-"] NUMBER
 //
 // Strings
 //
+//STRING: /"(\\\"|\\\\|[^"\n])*?"i?/
 STRING_INNER: ("\\\""|/[^"]/)
 ESCAPED_STRING: "\"" STRING_INNER* "\""
 
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 43d1bf5..13aeff0 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -122,7 +122,7 @@ RULES = {
     'statement': ['ignore', 'import'],
     'ignore': ['_IGNORE expansions _NL'],
     'import': ['_IMPORT import_args _NL',
-               '_IMPORT import_args _TO TOKEN'],
+               '_IMPORT import_args _TO TOKEN _NL'],
 
     'import_args': ['_import_args'],
     '_import_args': ['name', '_import_args _DOT name'],
@@ -375,6 +375,7 @@ class TokenTreeToPattern(Transformer):
         return p
 
     def expansion(self, items):
+        assert items
         if len(items) == 1:
             return items[0]
         if len({i.flags for i in items}) > 1:
@@ -486,6 +487,11 @@ class Grammar:
 
         # Convert token-trees to strings/regexps
         transformer = PrepareLiterals() * TokenTreeToPattern()
+        for name, (token_tree, priority) in token_defs:
+            for t in token_tree.find_data('expansion'):
+                if not t.children:
+                    raise GrammarError("Tokens cannot be empty (%s)" % name)
+
         tokens = [TokenDef(name, transformer.transform(token_tree), priority)
                   for name, (token_tree, priority) in token_defs]
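
A quick usage sketch (not part of the patch): the snippet below shows how the new meta-grammar can be exercised the same way examples/lark_grammar.py does. The `meta_parser` name and the inline `sample` grammar are made up for illustration; run it from the repository root so the relative path resolves.

    from lark import Lark

    # Build a parser for Lark's own grammar syntax, using the new
    # examples/lark.g (LALR(1) + standard lexer, as in lark_grammar.py).
    meta_parser = Lark(open('examples/lark.g'), parser='lalr')

    # A small, hypothetical grammar written in Lark syntax. Each definition
    # must end with a newline, since rules in lark.g are terminated by _NL.
    sample = 'start: "a" NUMBER\n%import common.INT -> NUMBER\n'

    tree = meta_parser.parse(sample)
    print(tree.pretty())  # prints the parse tree of the sample grammar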