From 009cc105907987a1afab22b769685eb7c89b0e82 Mon Sep 17 00:00:00 2001
From: MegaIng1
Date: Sun, 27 Sep 2020 16:03:39 +0200
Subject: [PATCH] Added `FromPackageLoader` and `open_from_package`

---
 lark-stubs/lark.pyi  | 12 +++++++
 lark/lark.py         | 38 +++++++++++++++++++---
 lark/load_grammar.py | 75 ++++++++++++++++++++++++++++++++------------
 tests/test_parser.py | 26 +++++++++++----
 4 files changed, 120 insertions(+), 31 deletions(-)

diff --git a/lark-stubs/lark.pyi b/lark-stubs/lark.pyi
index deb8849..5cb94b2 100644
--- a/lark-stubs/lark.pyi
+++ b/lark-stubs/lark.pyi
@@ -33,6 +33,13 @@ class LarkOptions:
     g_regex_flags: int
     use_bytes: bool
     import_sources: List[Union[str, Callable[[str, str], str]]]
+    source: Optional[str]
+
+
+class FromPackageLoader:
+    def __init__(self, pkg_name: str, search_paths: Tuple[str, ...] = ...): ...
+
+    def __call__(self, base_paths: List[str], grammar_path: str) -> Tuple[str, str]: ...


class Lark:
@@ -62,6 +69,7 @@ class Lark:
         g_regex_flags: int = ...,
         use_bytes: bool = False,
         import_sources: List[Union[str, Callable[[List[str], str], Tuple[str, str]]]] = ...,
+        source: Optional[str] = None,
     ):
         ...

@@ -71,6 +79,10 @@ class Lark:
    @classmethod
    def open(cls: Type[_T], grammar_filename: str, rel_to: Optional[str] = None, **options) -> _T:
        ...
+
+    @classmethod
+    def open_from_package(cls: Type[_T], package: str, grammar_path: str, search_paths: Tuple[str, ...] = ..., **options) -> _T:
+        ...

    def lex(self, text: str) -> Iterator[Token]:
        ...
diff --git a/lark/lark.py b/lark/lark.py
index 9877b00..9f53841 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -5,7 +5,7 @@
 from io import open

 from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS, isascii, logger
-from .load_grammar import load_grammar
+from .load_grammar import load_grammar, FromPackageLoader
 from .tree import Tree
 from .common import LexerConf, ParserConf

@@ -92,6 +92,8 @@ class LarkOptions(Serialize):
            A callback for editing the terminals before parse.
        import_sources
            A List of either paths or loader functions to specify from where grammars are imported
+        source
+            Override the source from where the grammar was loaded. Useful for relative imports and unconventional grammar loading.

    **=== End Options ===**
    """
@@ -118,6 +120,7 @@
        'g_regex_flags': 0,
        'use_bytes': False,
        'import_sources': [],
+        'source': None,
    }

    def __init__(self, options_dict):
@@ -193,10 +196,13 @@ class Lark(Serialize):
            re_module = re

        # Some, but not all file-like objects have a 'name' attribute
-        try:
-            self.source = grammar.name
-        except AttributeError:
-            self.source = '<string>'
+        if self.options.source is None:
+            try:
+                self.source = grammar.name
+            except AttributeError:
+                self.source = '<string>'
+        else:
+            self.source = self.options.source

        # Drain file-like objects to get their contents
        try:
@@ -404,6 +410,28 @@ class Lark(Serialize):
            grammar_filename = os.path.join(basepath, grammar_filename)
        with open(grammar_filename, encoding='utf8') as f:
            return cls(f, **options)
+
+    @classmethod
+    def open_from_package(cls, package, grammar_path, search_paths=("",), **options):
+        """Create an instance of Lark with the grammar loaded from within the package `package`.
+        This allows grammar loading from zipapps.
+
+        Will also create a `FromPackageLoader` instance and add it to the `import_sources`, to simplify importing.
+
+        ``search_paths`` is passed to `FromPackageLoader`.
+
+        Example:
+
+            Lark.open_from_package(__name__, "example.lark", ("grammars",), parser=...)
+ """ + package = FromPackageLoader(package, search_paths) + full_path, text = package([], grammar_path) + options.setdefault('source', full_path) + if 'import_sources' in options: + options['import_sources'].append(package) + else: + options['import_sources'] = [package] + return cls(text, **options) def __repr__(self): return 'Lark(open(%r), parser=%r, lexer=%r, ...)' % (self.source, self.options.parser, self.options.lexer) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 8849f76..ba1f0f2 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -4,6 +4,7 @@ import os.path import sys from copy import copy, deepcopy from io import open +import pkgutil from .utils import bfs, eval_escaping, Py36, logger, classify_bool from .lexer import Token, TerminalDef, PatternStr, PatternRE @@ -648,35 +649,69 @@ class Grammar: return terminals, compiled_rules, self.ignore -def stdlib_loader(base_paths, grammar_path): - import pkgutil - for path in IMPORT_PATHS: - text = pkgutil.get_data('lark', path + '/' + grammar_path) - if text is None: - continue - return '', text.decode() - raise FileNotFoundError() +class FromPackageLoader(object): + """ + Provides a simple way of creating custom import loaders that load from packages via ``pkgutil.get_data`` instead of using `open`. + This allows them to be compatible even from within zip files. + + Relative imports are handled, so you can just freely use them. + + pkg_name: The name of the package. You can probably provide `__name__` most of the time + search_paths: All the path that will be search on absolute imports. + """ + def __init__(self, pkg_name, search_paths=("", )): + self.pkg_name = pkg_name + self.search_paths = search_paths + + def __repr__(self): + return "%s(%r, %r)" % (type(self).__name__, self.pkg_name, self.search_paths) + + def __call__(self, base_paths, grammar_path): + if len(base_paths) == 0: + to_try = self.search_paths + else: + assert len(base_paths) == 1 + if not base_paths[0].startswith('<%s:' % (self.pkg_name,)): + # Technically false, but FileNotFound doesn't exist in python2.7, and this message should never reach the end user anyway + raise IOError() + base_path = base_paths[0].partition(':')[2] + if base_path and base_path[0] == '/': + base_path = base_path[1:] + to_try = [base_path] + for path in to_try: + full_path = os.path.join(path, grammar_path) + text = None + with suppress(IOError): + text = pkgutil.get_data(self.pkg_name, full_path) + if text is None: + continue + return '<%s:/%s>' % (self.pkg_name, full_path), text.decode() + raise IOError() + +stdlib_loader = FromPackageLoader('lark', IMPORT_PATHS) _imported_grammars = {} -def import_grammar(grammar_path, re_, base_paths=(), import_sources=()): +def import_grammar(grammar_path, re_, base_paths=[], import_sources=[]): if grammar_path not in _imported_grammars: - import_paths = import_sources + base_paths + [stdlib_loader] + # import_sources take priority over base_paths since they should handle relative imports and ignore everthing else. + # Question: should the stdlib_loader really be pushed to the end? 
+        import_paths = import_sources + base_paths + [stdlib_loader]
        for source in import_paths:
-            if callable(source):
-                with suppress(IOError):
+            text = None
+            with suppress(IOError):
+                if callable(source):
                    joined_path, text = source(base_paths, grammar_path)
-                    grammar = load_grammar(text, joined_path, re_, import_sources)
-                    _imported_grammars[grammar_path] = grammar
-                    break
-            else:
-                with suppress(IOError):
+                else:
                    joined_path = os.path.join(source, grammar_path)
                    with open(joined_path, encoding='utf8') as f:
                        text = f.read()
-                    grammar = load_grammar(text, joined_path, re_, import_sources)
-                    _imported_grammars[grammar_path] = grammar
-                    break
+            if text is not None:
+                # Don't load the grammar from within the suppress statement. Otherwise the underlying error message will be swallowed,
+                # and the wrong file will be reported as missing
+                grammar = load_grammar(text, joined_path, re_, import_sources)
+                _imported_grammars[grammar_path] = grammar
+                break
        else:
            open(grammar_path, encoding='utf8')
            assert False
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 6779f64..0406f46 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -11,6 +11,7 @@ from copy import copy, deepcopy
 from lark.utils import Py36, isascii

 from lark import Token
+from lark.load_grammar import FromPackageLoader

 try:
     from cStringIO import StringIO as cStringIO
@@ -1783,12 +1784,7 @@ def _make_parser_test(LEXER, PARSER):
        self.assertRaises(IOError, _Lark, grammar)

    def test_import_custom_sources(self):
-        def custom_loader(base_paths, grammar_path):
-            import pkgutil
-            text = pkgutil.get_data('tests', 'grammars/' + grammar_path)
-            if text is None:
-                raise FileNotFoundError()
-            return '', text.decode()
+        custom_loader = FromPackageLoader('tests', ('grammars', ))

        grammar = """
        start: startab
@@ -1800,6 +1796,24 @@ def _make_parser_test(LEXER, PARSER):

        self.assertEqual(p.parse('ab'), Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])]))

+        grammar = """
+        start: rule_to_import
+
+        %import test_relative_import_of_nested_grammar__grammar_to_import.rule_to_import
+        """
+        p = _Lark(grammar, import_sources=[custom_loader])
+        x = p.parse('N')
+        self.assertEqual(next(x.find_data('rule_to_import')).children, ['N'])
+
+        custom_loader2 = FromPackageLoader('tests')
+        grammar = """
+        %import .test_relative_import (start, WS)
+        %ignore WS
+        """
+        p = _Lark(grammar, import_sources=[custom_loader2])
+        x = p.parse('12 capybaras')
+        self.assertEqual(x.children, ['12', 'capybaras'])
+
    @unittest.skipIf(PARSER != 'earley', "Currently only Earley supports priority in rules")
    def test_earley_prioritization(self):
        "Tests effect of priority on result"
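
Usage sketch (not part of the patch): the package name `myapp` and the grammar files below are hypothetical; only `Lark`, `FromPackageLoader` and `Lark.open_from_package` come from this change.

    from lark import Lark
    from lark.load_grammar import FromPackageLoader

    # One step: the grammar is read via pkgutil.get_data, so this also works
    # when myapp ships inside a zipapp. open_from_package builds a
    # FromPackageLoader internally and appends it to import_sources, so
    # %import statements inside example.lark resolve from the same package.
    parser = Lark.open_from_package('myapp', 'example.lark', ('grammars',), parser='lalr')

    # Explicit: pass the loader yourself when the grammar text comes from
    # somewhere else but its %imports should still resolve from the package.
    loader = FromPackageLoader('myapp', ('grammars',))
    parser = Lark('''
        start: greeting
        %import shared.greeting    // found by `loader` in myapp/grammars/shared.lark
    ''', import_sources=[loader])

Note the `<myapp:/grammars/example.lark>` string the loader returns as `joined_path`: it is stored as the new `source` option, and it is how `FromPackageLoader.__call__` later recognizes its own grammars when resolving relative imports.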
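Since `import_sources` still accepts plain callables, the protocol that `FromPackageLoader` implements can also be satisfied by hand. A minimal sketch, assuming only the stub signature above (`(base_paths, grammar_path) -> (joined_path, text)`, raising `IOError` for "not found"); the in-memory dict is a made-up stand-in:

    # Any callable with this signature works as an entry in import_sources.
    # Raising IOError makes import_grammar fall through to the next source.
    GRAMMARS = {
        'ab.lark': '''
            startab: expr
            expr: A B
            A: "a"
            B: "b"
        ''',
    }

    def dict_loader(base_paths, grammar_path):
        try:
            # The first element names the source; it shows up in error messages.
            return '<dict:/%s>' % grammar_path, GRAMMARS[grammar_path]
        except KeyError:
            raise IOError(grammar_path)

A loader like this is then passed as `import_sources=[dict_loader]`, exactly like the `FromPackageLoader` instances in the tests.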