From 9cc57abd8a4e0bed0aac14644846f27c71f4e348 Mon Sep 17 00:00:00 2001 From: Erez Sh Date: Tue, 31 Mar 2020 00:03:16 +0300 Subject: [PATCH] Added 'cache' option to Lark (Issue #479) --- docs/classes.md | 5 +- examples/standalone/json_parser.py | 2 +- lark/lark.py | 87 ++++++++++++++++++++---------- lark/tools/standalone.py | 3 +- lark/utils.py | 10 ++-- lark_stubs/lark.pyi | 2 +- tests/__main__.py | 1 + tests/test_cache.py | 82 ++++++++++++++++++++++++++++ tests/test_parser.py | 16 +++--- 9 files changed, 165 insertions(+), 43 deletions(-) create mode 100644 tests/test_cache.py diff --git a/docs/classes.md b/docs/classes.md index e29443c..60d08ef 100644 --- a/docs/classes.md +++ b/docs/classes.md @@ -63,7 +63,10 @@ Useful for caching and multiprocessing. **keep_all_tokens** - Prevent the tree builder from automagically removing "punctuation" tokens (default: False) -**cache_grammar** - Cache the Lark grammar (Default: False) +**cache** - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now. 
+- When `False`, does nothing (default) +- When `True`, caches to a temporary file in the local directory +- When given a string, caches to the path pointed by the string #### Algorithm diff --git a/examples/standalone/json_parser.py b/examples/standalone/json_parser.py index 5b3e06c..8a92a14 100644 --- a/examples/standalone/json_parser.py +++ b/examples/standalone/json_parser.py @@ -1809,7 +1809,7 @@ class LarkOptions(Serialize): 'debug': False, 'keep_all_tokens': False, 'tree_class': None, - 'cache_grammar': False, + 'cache': False, 'postlex': None, 'parser': 'earley', 'lexer': 'auto', diff --git a/lark/lark.py b/lark/lark.py index 302b526..e0038eb 100644 --- a/lark/lark.py +++ b/lark/lark.py @@ -1,11 +1,10 @@ from __future__ import absolute_import -import os +import sys, os, pickle, hashlib, logging from io import open -import pickle -from .utils import STRING_TYPE, Serialize, SerializeMemoizer +from .utils import STRING_TYPE, Serialize, SerializeMemoizer, FS from .load_grammar import load_grammar from .tree import Tree from .common import LexerConf, ParserConf @@ -35,7 +34,12 @@ class LarkOptions(Serialize): When `False`, `[]` behaves like the `?` operator, and returns no value at all. (default=`False`. Recommended to set to `True`) - cache_grammar - Cache the Lark grammar (Default: False) + cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. + LALR only for now. 
+ When `False`, does nothing (default) + When `True`, caches to a temporary file in the local directory + When given a string, caches to the path pointed by the string + g_regex_flags - Flags that are applied to all terminals (both regex and strings) keep_all_tokens - Prevent the tree builder from automagically @@ -80,7 +84,7 @@ class LarkOptions(Serialize): 'debug': False, 'keep_all_tokens': False, 'tree_class': None, - 'cache_grammar': False, + 'cache': False, 'postlex': None, 'parser': 'earley', 'lexer': 'auto', @@ -102,7 +106,7 @@ class LarkOptions(Serialize): for name, default in self._defaults.items(): if name in o: value = o.pop(name) - if isinstance(default, bool): + if isinstance(default, bool) and name != 'cache': value = bool(value) else: value = default @@ -147,6 +151,7 @@ class Lark(Serialize): grammar : a string or file-object containing the grammar spec (using Lark's ebnf syntax) options : a dictionary controlling various aspects of Lark. """ + self.options = LarkOptions(options) # Some, but not all file-like objects have a 'name' attribute @@ -165,8 +170,24 @@ class Lark(Serialize): assert isinstance(grammar, STRING_TYPE) - if self.options.cache_grammar: - raise NotImplementedError("Not available yet") + cache_fn = None + if self.options.cache: + if isinstance(self.options.cache, STRING_TYPE): + cache_fn = self.options.cache + else: + if self.options.cache is not True: + raise ValueError("cache must be bool or str") + unhashable = ('transformer', 'postlex', 'lexer_callbacks', 'edit_terminals') + options_str = ''.join(k+str(v) for k, v in options.items() if k not in unhashable) + s = grammar + options_str + md5 = hashlib.md5(s.encode()).hexdigest() + cache_fn = '.lark_cache_%s.tmp' % md5 + + if FS.exists(cache_fn): + logging.debug('Loading grammar from cache: %s', cache_fn) + with FS.open(cache_fn, 'rb') as f: + self._load(f, self.options.transformer, self.options.postlex) + return if self.options.lexer == 'auto': if self.options.parser == 'lalr': @@ 
-241,6 +262,11 @@ class Lark(Serialize): elif lexer: self.lexer = self._build_lexer() + if cache_fn: + logging.debug('Saving grammar to cache: %s', cache_fn) + with FS.open(cache_fn, 'wb') as f: + self.save(f) + if __init__.__doc__: __init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC @@ -259,34 +285,41 @@ class Lark(Serialize): parser_conf = ParserConf(self.rules, self._callbacks, self.options.start) return self.parser_class(self.lexer_conf, parser_conf, options=self.options) + def save(self, f): + data, m = self.memo_serialize([TerminalDef, Rule]) + pickle.dump({'data': data, 'memo': m}, f) + @classmethod - def deserialize(cls, data, namespace, memo, transformer=None, postlex=None): - if memo: - memo = SerializeMemoizer.deserialize(memo, namespace, {}) + def load(cls, f): inst = cls.__new__(cls) + return inst._load(f) + + def _load(self, f, transformer=None, postlex=None): + if isinstance(f, dict): + d = f + else: + d = pickle.load(f) + memo = d['memo'] + data = d['data'] + + assert memo + memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {}) options = dict(data['options']) if transformer is not None: options['transformer'] = transformer if postlex is not None: options['postlex'] = postlex - inst.options = LarkOptions.deserialize(options, memo) - inst.rules = [Rule.deserialize(r, memo) for r in data['rules']] - inst.source = '' - inst._prepare_callbacks() - inst.parser = inst.parser_class.deserialize(data['parser'], memo, inst._callbacks, inst.options.postlex) - return inst - - def save(self, f): - data, m = self.memo_serialize([TerminalDef, Rule]) - pickle.dump({'data': data, 'memo': m}, f) + self.options = LarkOptions.deserialize(options, memo) + self.rules = [Rule.deserialize(r, memo) for r in data['rules']] + self.source = '' + self._prepare_callbacks() + self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex) + return self @classmethod - def load(cls, f): - d = 
pickle.load(f) - namespace = {'Rule': Rule, 'TerminalDef': TerminalDef} - memo = d['memo'] - return Lark.deserialize(d['data'], namespace, memo) - + def _load_from_dict(cls, data, memo, transformer=None, postlex=None): + inst = cls.__new__(cls) + return inst._load({'data': data, 'memo': memo}, transformer, postlex) @classmethod def open(cls, grammar_filename, rel_to=None, **options): diff --git a/lark/tools/standalone.py b/lark/tools/standalone.py index f345a1d..72042cd 100644 --- a/lark/tools/standalone.py +++ b/lark/tools/standalone.py @@ -106,8 +106,7 @@ def main(fobj, start): print('Shift = 0') print('Reduce = 1') print("def Lark_StandAlone(transformer=None, postlex=None):") - print(" namespace = {'Rule': Rule, 'TerminalDef': TerminalDef}") - print(" return Lark.deserialize(DATA, namespace, MEMO, transformer=transformer, postlex=postlex)") + print(" return Lark._load_from_dict(DATA, MEMO, transformer=transformer, postlex=postlex)") diff --git a/lark/utils.py b/lark/utils.py index a7298a1..199071c 100644 --- a/lark/utils.py +++ b/lark/utils.py @@ -1,4 +1,5 @@ import sys +import os from functools import reduce from ast import literal_eval from collections import deque @@ -37,9 +38,6 @@ def bfs(initial, expand): def _serialize(value, memo): - # if memo and memo.in_types(value): - # return {'__memo__': memo.memoized.get(value)} - if isinstance(value, Serialize): return value.serialize(memo) elif isinstance(value, list): @@ -287,3 +285,9 @@ def combine_alternatives(lists): assert all(l for l in lists), lists init = [[x] for x in lists[0]] return reduce(lambda a,b: [i+[j] for i in a for j in b], lists[1:], init) + + + +class FS: + open = open + exists = os.path.exists \ No newline at end of file diff --git a/lark_stubs/lark.pyi b/lark_stubs/lark.pyi index 76a6a54..4dd36ab 100644 --- a/lark_stubs/lark.pyi +++ b/lark_stubs/lark.pyi @@ -33,7 +33,7 @@ class LarkOptions: propagate_positions: bool maybe_placeholders: bool lexer_callbacks: Dict[str, Callable[[Token], 
Token]] - cache_grammar: bool + cache: Union[bool, str] g_regex_flags: int diff --git a/tests/__main__.py b/tests/__main__.py index 477789f..cb26eb4 100644 --- a/tests/__main__.py +++ b/tests/__main__.py @@ -5,6 +5,7 @@ import logging from .test_trees import TestTrees from .test_tools import TestStandalone +from .test_cache import TestCache from .test_reconstructor import TestReconstructor try: diff --git a/tests/test_cache.py b/tests/test_cache.py new file mode 100644 index 0000000..9436081 --- /dev/null +++ b/tests/test_cache.py @@ -0,0 +1,82 @@ +from __future__ import absolute_import + +import sys +from unittest import TestCase, main + +from lark import Lark, Tree +import lark.lark as lark_module + +try: + from StringIO import StringIO +except ImportError: + from io import BytesIO as StringIO + +import tempfile, os + + +class MockFile(StringIO): + def close(self): + pass + def __enter__(self): + return self + def __exit__(self, *args): + pass + +class MockFS: + def __init__(self): + self.files = {} + + def open(self, name, mode=None): + if name not in self.files: + f = self.files[name] = MockFile() + else: + f = self.files[name] + f.seek(0) + return f + + def exists(self, name): + return name in self.files + + +class TestCache(TestCase): + def setUp(self): + pass + + def test_simple(self): + g = '''start: "a"''' + + fn = "bla" + + fs = lark_module.FS + mock_fs = MockFS() + try: + lark_module.FS = mock_fs + Lark(g, parser='lalr', cache=fn) + assert fn in mock_fs.files + parser = Lark(g, parser='lalr', cache=fn) + assert parser.parse('a') == Tree('start', []) + + mock_fs.files = {} + assert len(mock_fs.files) == 0 + Lark(g, parser='lalr', cache=True) + assert len(mock_fs.files) == 1 + parser = Lark(g, parser='lalr', cache=True) + assert parser.parse('a') == Tree('start', []) + + parser = Lark(g+' "b"', parser='lalr', cache=True) + assert len(mock_fs.files) == 2 + assert parser.parse('ab') == Tree('start', []) + + parser = Lark(g, parser='lalr', cache=True) + 
assert parser.parse('a') == Tree('start', []) + + finally: + lark_module.FS = fs + + + +if __name__ == '__main__': + main() + + + diff --git a/tests/test_parser.py b/tests/test_parser.py index 0102c62..5a41c57 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -14,6 +14,7 @@ except ImportError: cStringIO = None from io import ( StringIO as uStringIO, + BytesIO, open, ) @@ -26,6 +27,8 @@ from lark.visitors import Transformer, Transformer_InPlace, v_args from lark.grammar import Rule from lark.lexer import TerminalDef, Lexer, TraditionalLexer + + __path__ = os.path.dirname(__file__) def _read(n, *args): with open(os.path.join(__path__, n), *args) as f: @@ -873,7 +876,7 @@ def _make_parser_test(LEXER, PARSER): self.assertSequenceEqual(x.children, [Tree('expr', [])]) x = g.parse("BC") self.assertSequenceEqual(x.children, [Tree('b', [])]) - + def test_templates_modifiers(self): g = _Lark(r""" start: expr{"B"} @@ -1736,15 +1739,12 @@ def _make_parser_test(LEXER, PARSER): b: "B" """ parser = _Lark(grammar) - d = parser.serialize() - parser2 = Lark.deserialize(d, {}, {}) + s = BytesIO() + parser.save(s) + s.seek(0) + parser2 = Lark.load(s) self.assertEqual(parser2.parse('ABC'), Tree('start', [Tree('b', [])]) ) - namespace = {'Rule': Rule, 'TerminalDef': TerminalDef} - d, m = parser.memo_serialize(namespace.values()) - parser3 = Lark.deserialize(d, namespace, m) - self.assertEqual(parser3.parse('ABC'), Tree('start', [Tree('b', [])]) ) - def test_multi_start(self): parser = _Lark(''' a: "x" "a"?