From 51644a6c584eb9833af71c40198fdc5d8a99c904 Mon Sep 17 00:00:00 2001
From: Erez Shinan
Date: Wed, 25 Apr 2018 19:06:33 +0300
Subject: [PATCH] Added examples/lark.g - Reference implementation of the Lark
 grammar (inspired by issue #116)

---
 examples/README.md       |  1 +
 examples/lark.g          | 49 ++++++++++++++++++++++++++++++++++++++++
 examples/lark_grammar.py | 18 +++++++++++++++
 lark/grammars/common.g   |  1 +
 lark/load_grammar.py     |  8 ++++++-
 5 files changed, 76 insertions(+), 1 deletion(-)
 create mode 100644 examples/lark.g
 create mode 100644 examples/lark_grammar.py

diff --git a/examples/README.md b/examples/README.md
index 3fbe3ea..25bf504 100644
--- a/examples/README.md
+++ b/examples/README.md
@@ -7,6 +7,7 @@
 - [indented\_tree.py](indented\_tree.py) - A demonstration of parsing indentation ("whitespace significant" language)
 - [fruitflies.py](fruitflies.py) - A demonstration of ambiguity
 - [turtle\_dsl.py](turtle_dsl.py) - Implements a LOGO-like toy language for Python's turtle, with interpreter.
+- [lark\_grammar.py](lark_grammar.py) - A reference implementation of the Lark grammar (using LALR(1) + standard lexer)
 
 ### Advanced
 
diff --git a/examples/lark.g b/examples/lark.g
new file mode 100644
index 0000000..1fbf592
--- /dev/null
+++ b/examples/lark.g
@@ -0,0 +1,49 @@
+start: (_item | _NL)*
+
+_item: rule
+     | token
+     | statement
+
+rule: RULE priority? ":" expansions _NL
+token: TOKEN priority? ":" expansions _NL
+
+priority: "." NUMBER
+
+statement: "%ignore" expansions _NL -> ignore
+         | "%import" import_args ["->" TOKEN] _NL -> import
+
+import_args: name ("." name)*
+
+?expansions: alias (_VBAR alias)*
+
+?alias: expansion ["->" RULE]
+
+?expansion: expr*
+
+?expr: atom [OP | "~" NUMBER [".." NUMBER]]
+
+?atom: "(" expansions ")"
+     | "[" expansions "]" -> maybe
+     | STRING ".." STRING -> literal_range
+     | name
+     | (REGEXP | STRING) -> literal
+
+name: RULE
+    | TOKEN
+
+_VBAR: _NL? "|"
+OP: /[+*][?]?|[?](?![a-z])/
+RULE: /!?[_?]?[a-z][_a-z0-9]*/
+TOKEN: /_?[A-Z][_A-Z0-9]*/
+STRING: _STRING "i"?
+REGEXP: /\/(?!\/)(\\\/|\\\\|[^\/\n])*?\/[imslux]*/
+_NL: /(\r?\n)+\s*/
+
+%import common.ESCAPED_STRING -> _STRING
+%import common.INT -> NUMBER
+%import common.WS_INLINE
+
+COMMENT: "//" /[^\n]/*
+
+%ignore WS_INLINE
+%ignore COMMENT
diff --git a/examples/lark_grammar.py b/examples/lark_grammar.py
new file mode 100644
index 0000000..88fc4cf
--- /dev/null
+++ b/examples/lark_grammar.py
@@ -0,0 +1,18 @@
+from lark import Lark
+
+parser = Lark(open('examples/lark.g'), parser="lalr")
+
+grammar_files = [
+    'examples/python2.g',
+    'examples/python3.g',
+    'examples/lark.g',
+    'lark/grammars/common.g',
+]
+
+def test():
+    for grammar_file in grammar_files:
+        tree = parser.parse(open(grammar_file).read())
+    print("All grammars parsed successfully")
+
+if __name__ == '__main__':
+    test()
diff --git a/lark/grammars/common.g b/lark/grammars/common.g
index 2bd02d0..8bc8079 100644
--- a/lark/grammars/common.g
+++ b/lark/grammars/common.g
@@ -20,6 +20,7 @@ SIGNED_NUMBER: ["+"|"-"] NUMBER
 //
 // Strings
 //
+//STRING: /"(\\\"|\\\\|[^"\n])*?"i?/
 STRING_INNER: ("\\\""|/[^"]/)
 ESCAPED_STRING: "\"" STRING_INNER* "\""
 
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 43d1bf5..13aeff0 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -122,7 +122,7 @@ RULES = {
     'statement': ['ignore', 'import'],
     'ignore': ['_IGNORE expansions _NL'],
     'import': ['_IMPORT import_args _NL',
-               '_IMPORT import_args _TO TOKEN'],
+               '_IMPORT import_args _TO TOKEN _NL'],
 
     'import_args': ['_import_args'],
     '_import_args': ['name', '_import_args _DOT name'],
@@ -375,6 +375,7 @@ class TokenTreeToPattern(Transformer):
         return p
 
     def expansion(self, items):
+        assert items
         if len(items) == 1:
             return items[0]
         if len({i.flags for i in items}) > 1:
@@ -486,6 +487,11 @@ class Grammar:
 
         # Convert token-trees to strings/regexps
         transformer = PrepareLiterals() * TokenTreeToPattern()
+        for name, (token_tree, priority) in token_defs:
+            for t in token_tree.find_data('expansion'):
+                if not t.children:
+                    raise GrammarError("Tokens cannot be empty (%s)" % name)
+
         tokens = [TokenDef(name, transformer.transform(token_tree), priority)
                   for name, (token_tree, priority) in token_defs]
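
A quick usage sketch (not part of the patch): the snippet below shows how the new meta-grammar can be exercised the same way examples/lark_grammar.py does. The `meta_parser` name and the inline `sample` grammar are made up for illustration; run it from the repository root so the relative path resolves.

    from lark import Lark

    # Build a parser for Lark's own grammar syntax, using the new
    # examples/lark.g (LALR(1) + standard lexer, as in lark_grammar.py).
    meta_parser = Lark(open('examples/lark.g'), parser='lalr')

    # A small, hypothetical grammar written in Lark syntax. Each definition
    # must end with a newline, since rules in lark.g are terminated by _NL.
    sample = 'start: "a" NUMBER\n%import common.INT -> NUMBER\n'

    tree = meta_parser.parse(sample)
    print(tree.pretty())  # prints the parse tree of the sample grammar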