From 9cc57abd8a4e0bed0aac14644846f27c71f4e348 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 31 Mar 2020 00:03:16 +0300 Subject: [PATCH] Added 'cache' option to Lark (Issue #479) --- docs/classes.md | 5 +- examples/standalone/json_parser.py | 2 +- lark/lark.py | 87 ++++++++++++++++++++---------- lark/tools/standalone.py | 3 +- lark/utils.py | 10 ++-- lark_stubs/lark.pyi | 2 +- tests/__main__.py | 1 + tests/test_cache.py | 82 ++++++++++++++++++++++++++++ tests/test_parser.py | 16 +++--- 9 files changed, 165 insertions(+), 43 deletions(-) create mode 100644 tests/test_cache.py diff --git a/docs/classes.md b/docs/classes.md index e29443c..60d08ef 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -63,7 +63,10 @@ Useful for caching and multiprocessing. **keep_all_tokens** - Prevent the tree builder from automagically removing "punctuation" tokens (default: False) -**cache_grammar** - Cache the Lark grammar (Default: False) +**cache** - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now. 
+- When `False`, does nothing (default) +- When `True`, caches to a temporary file in the local directory +- When given a string, caches to the path pointed by the string #### Algorithm diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py index 5b3e06c..8a92a14 100644 --- a/examples/standalone/json_parser.py +++ b/examples/standalone/json_parser.py @@ -1809,7 +1809,7 @@ class LarkOptions(Serialize): 'debug': False, 'keep_all_tokens': False, 'tree_class': None, - 'cache_grammar': False, + 'cache': False, 'postlex': None, 'parser': 'earley', 'lexer': 'auto', diff --git a/lark/lark.py b/lark/lark.py index 302b526..e0038eb 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -1,11 +1,10 @@ from __future__ import absolute_import -import os +import sys, os, pickle, hashlib, logging from io import open -import pickle -from .utils import STRING_TYPE, Serialize, SerializeMemoizer +from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS from .load_grammar import load_grammar from .tree import Tree from .common import LexerConf, ParserConf @@ -35,7 +34,12 @@ class LarkOptions(Serialize): When `False`, `[]` behaves like the `?` operator, and returns no value at all. (default=`False`. Recommended to set to `True`) - cache_grammar - Cache the Lark grammar (Default: False) + cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. + LALR only for now. 
+ When `False`, does nothing (default) + When `True`, caches to a temporary file in the local directory + When given a string, caches to the path pointed by the string + g_regex_flags - Flags that are applied to all terminals (both regex and strings) keep_all_tokens - Prevent the tree builder from automagically @@ -80,7 +84,7 @@ class LarkOptions(Serialize): 'debug': False, 'keep_all_tokens': False, 'tree_class': None, - 'cache_grammar': False, + 'cache': False, 'postlex': None, 'parser': 'earley', 'lexer': 'auto', @@ -102,7 +106,7 @@ class LarkOptions(Serialize): for name, default in self._defaults.items(): if name in o: value = o.pop(name) - if isinstance(default, bool): + if isinstance(default, bool) and name != 'cache': value = bool(value) else: value = default @@ -147,6 +151,7 @@ class Lark(Serialize): grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax) options : a dictionary controlling various aspects of Lark. """ + self.options = LarkOptions(options) # Some, but not all file-like objects have a 'name' attribute @@ -165,8 +170,24 @@ class Lark(Serialize): assert isinstance(grammar, STRING_TYPE) - if self.options.cache_grammar: - raise NotImplementedError("Not available yet") + cache_fn = None + if self.options.cache: + if isinstance(self.options.cache, STRING_TYPE): + cache_fn = self.options.cache + else: + if self.options.cache is not True: + raise ValueError("cache must be bool or str") + unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals') + options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable) + s = grammar + options_str + md5 = hashlib.md5(s.encode()).hexdigest() + cache_fn = '.lark_cache_%s.tmp' % md5 + + if FS.exists(cache_fn): + logging.debug('Loading grammar from cache: %s', cache_fn) + with FS.open(cache_fn, 'rb') as f: + self._load(f, self.options.transformer, self.options.postlex) + return if self.options.lexer == 'auto': if self.options.parser == 'lalr': @@ 
-241,6 +262,11 @@ class Lark(Serialize): elif lexer: self.lexer = self._build_lexer() + if cache_fn: + logging.debug('Saving grammar to cache: %s', cache_fn) + with FS.open(cache_fn, 'wb') as f: + self.save(f) + if __init__.__doc__: __init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC @@ -259,34 +285,41 @@ class Lark(Serialize): parser_conf = ParserConf(self.rules, self._callbacks, self.options.start) return self.parser_class(self.lexer_conf, parser_conf, options=self.options) + def save(self, f): + data, m = self.memo_serialize([TerminalDef, Rule]) + pickle.dump({'data': data, 'memo': m}, f) + @classmethod - def deserialize(cls, data, namespace, memo, transformer=None, postlex=None): - if memo: - memo = SerializeMemoizer.deserialize(memo, namespace, {}) + def load(cls, f): inst = cls.__new__(cls) + return inst._load(f) + + def _load(self, f, transformer=None, postlex=None): + if isinstance(f, dict): + d = f + else: + d = pickle.load(f) + memo = d['memo'] + data = d['data'] + + assert memo + memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {}) options = dict(data['options']) if transformer is not None: options['transformer'] = transformer if postlex is not None: options['postlex'] = postlex - inst.options = LarkOptions.deserialize(options, memo) - inst.rules = [Rule.deserialize(r, memo) for r in data['rules']] - inst.source = '' - inst._prepare_callbacks() - inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks, inst.options.postlex) - return inst - - def save(self, f): - data, m = self.memo_serialize([TerminalDef, Rule]) - pickle.dump({'data': data, 'memo': m}, f) + self.options = LarkOptions.deserialize(options, memo) + self.rules = [Rule.deserialize(r, memo) for r in data['rules']] + self.source = '' + self._prepare_callbacks() + self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex) + return self @classmethod - def load(cls, f): - d = 
pickle.load(f) - namespace = {'Rule': Rule, 'TerminalDef': TerminalDef} - memo = d['memo'] - return Lark.deserialize(d['data'], namespace, memo) - + def _load_from_dict(cls, data, memo, transformer=None, postlex=None): + inst = cls.__new__(cls) + return inst._load({'data': data, 'memo': memo}, transformer, postlex) @classmethod def open(cls, grammar_filename, rel_to=None, **options): diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index f345a1d..72042cd 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -106,8 +106,7 @@ def main(fobj, start): print('Shift = 0') print('Reduce = 1') print("def Lark_StandAlone(transformer=None, postlex=None):") - print(" namespace = {'Rule': Rule, 'TerminalDef': TerminalDef}") - print(" return Lark.deserialize(DATA, namespace, MEMO, transformer=transformer, postlex=postlex)") + print(" return Lark._load_from_dict(DATA, MEMO, transformer=transformer, postlex=postlex)") diff --git a/lark/utils.py b/lark/utils.py index a7298a1..199071c 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -1,4 +1,5 @@ import sys +import os from functools import reduce from ast import literal_eval from collections import deque @@ -37,9 +38,6 @@ def bfs(initial, expand): def _serialize(value, memo): - # if memo and memo.in_types(value): - # return {'__memo__': memo.memoized.get(value)} - if isinstance(value, Serialize): return value.serialize(memo) elif isinstance(value, list): @@ -287,3 +285,9 @@ def combine_alternatives(lists): assert all(l for l in lists), lists init = [[x] for x in lists[0]] return reduce(lambda a,b: [i+[j] for i in a for j in b], lists[1:], init) + + + +class FS: + open = open + exists = os.path.exists \ No newline at end of file diff --git a/lark_stubs/lark.pyi b/lark_stubs/lark.pyi index 76a6a54..4dd36ab 100644 --- a/lark_stubs/lark.pyi +++ b/lark_stubs/lark.pyi @@ -33,7 +33,7 @@ class LarkOptions: propagate_positions: bool maybe_placeholders: bool lexer_callbacks: Dict[str, Callable[[Token], 
Token]] - cache_grammar: bool + cache: Union[bool, str] g_regex_flags: int diff --git a/tests/__main__.py b/tests/__main__.py index 477789f..cb26eb4 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -5,6 +5,7 @@ import logging from .test_trees import TestTrees from .test_tools import TestStandalone +from .test_cache import TestCache from .test_reconstructor import TestReconstructor try: diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..9436081 --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,82 @@ +from __future__ import absolute_import + +import sys +from unittest import TestCase, main + +from lark import Lark, Tree +import lark.lark as lark_module + +try: + from StringIO import StringIO +except ImportError: + from io import BytesIO as StringIO + +import tempfile, os + + +class MockFile(StringIO): + def close(self): + pass + def __enter__(self): + return self + def __exit__(self, *args): + pass + +class MockFS: + def __init__(self): + self.files = {} + + def open(self, name, mode=None): + if name not in self.files: + f = self.files[name] = MockFile() + else: + f = self.files[name] + f.seek(0) + return f + + def exists(self, name): + return name in self.files + + +class TestCache(TestCase): + def setUp(self): + pass + + def test_simple(self): + g = '''start: "a"''' + + fn = "bla" + + fs = lark_module.FS + mock_fs = MockFS() + try: + lark_module.FS = mock_fs + Lark(g, parser='lalr', cache=fn) + assert fn in mock_fs.files + parser = Lark(g, parser='lalr', cache=fn) + assert parser.parse('a') == Tree('start', []) + + mock_fs.files = {} + assert len(mock_fs.files) == 0 + Lark(g, parser='lalr', cache=True) + assert len(mock_fs.files) == 1 + parser = Lark(g, parser='lalr', cache=True) + assert parser.parse('a') == Tree('start', []) + + parser = Lark(g+' "b"', parser='lalr', cache=True) + assert len(mock_fs.files) == 2 + assert parser.parse('ab') == Tree('start', []) + + parser = Lark(g, parser='lalr', cache=True) + 
assert parser.parse('a') == Tree('start', []) + + finally: + lark_module.FS = fs + + + +if __name__ == '__main__': + main() + + + diff --git a/tests/test_parser.py b/tests/test_parser.py index 0102c62..5a41c57 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -14,6 +14,7 @@ except ImportError: cStringIO = None from io import ( StringIO as uStringIO, + BytesIO, open, ) @@ -26,6 +27,8 @@ from lark.visitors import Transformer, Transformer_InPlace, v_args from lark.grammar import Rule from lark.lexer import TerminalDef, Lexer, TraditionalLexer + + __path__ = os.path.dirname(__file__) def _read(n, *args): with open(os.path.join(__path__, n), *args) as f: @@ -873,7 +876,7 @@ def _make_parser_test(LEXER, PARSER): self.assertSequenceEqual(x.children, [Tree('expr', [])]) x = g.parse("BC") self.assertSequenceEqual(x.children, [Tree('b', [])]) - + def test_templates_modifiers(self): g = _Lark(r""" start: expr{"B"} @@ -1736,15 +1739,12 @@ def _make_parser_test(LEXER, PARSER): b: "B" """ parser = _Lark(grammar) - d = parser.serialize() - parser2 = Lark.deserialize(d, {}, {}) + s = BytesIO() + parser.save(s) + s.seek(0) + parser2 = Lark.load(s) self.assertEqual(parser2.parse('ABC'), Tree('start', [Tree('b', [])]) ) - namespace = {'Rule': Rule, 'TerminalDef': TerminalDef} - d, m = parser.memo_serialize(namespace.values()) - parser3 = Lark.deserialize(d, namespace, m) - self.assertEqual(parser3.parse('ABC'), Tree('start', [Tree('b', [])]) ) - def test_multi_start(self): parser = _Lark(''' a: "x" "a"?