@@ -5,5 +5,7 @@ from .visitors import *
 from .exceptions import *
 from .lexer import *
 from .lark import *
+from logging import Logger as _Logger

+logger: _Logger
 __version__: str = ...
@@ -2,7 +2,7 @@

 from typing import (
     TypeVar, Type, List, Dict, IO, Iterator, Callable, Union, Optional,
-    Literal, Protocol, Tuple,
+    Literal, Protocol, Tuple, Iterable,
 )
 from .visitors import Transformer
 from .lexer import Token, Lexer, TerminalDef
@@ -14,6 +14,8 @@ class PostLex(Protocol):
     def process(self, stream: Iterator[Token]) -> Iterator[Token]:
         ...

+    always_accept: Iterable[str]
+

 class LarkOptions:
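Note: with the addition above, a post-lexer must expose `always_accept` as well
as `process` to satisfy the stub. A minimal conforming implementation might look
like this (sketch; the class name and token types are illustrative):

    from typing import Iterable, Iterator
    from lark.lexer import Token

    class TypeRenamer:
        # Terminals the parser should keep even if no grammar rule uses them
        always_accept: Iterable[str] = ('A',)

        def process(self, stream: Iterator[Token]) -> Iterator[Token]:
            for t in stream:
                if t.type == 'A':
                    t.type = 'B'   # rename before the parser sees it
                yield t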
@@ -7,4 +7,4 @@ from .exceptions import (ParseError, LexError, GrammarError, UnexpectedToken,
 from .lexer import Token
 from .lark import Lark

-__version__ = "0.10.0"
+__version__ = "0.10.1"
@@ -169,6 +169,10 @@ class LarkOptions(Serialize):
         return cls(data)


+_LOAD_ALLOWED_OPTIONS = {'postlex', 'transformer', 'use_bytes', 'debug', 'g_regex_flags',
+                         'regex', 'propagate_positions', 'keep_all_tokens', 'tree_class'}
+
+
 class Lark(Serialize):
     """Main interface for the library.
@@ -239,8 +243,11 @@ class Lark(Serialize):
            if FS.exists(cache_fn):
                logger.debug('Loading grammar from cache: %s', cache_fn)
+                # Remove options that aren't relevant for loading from cache
+                for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
+                    del options[name]
                with FS.open(cache_fn, 'rb') as f:
-                    self._load(f, self.options.transformer, self.options.postlex)
+                    self._load(f, **options)
                return

        if self.options.lexer == 'auto':
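Note: only the options named in _LOAD_ALLOWED_OPTIONS survive a cache hit; every
other option is already baked into the cached parser, so it is silently dropped
before _load. A self-contained sketch of that filtering (option values are
illustrative):

    _LOAD_ALLOWED_OPTIONS = {'postlex', 'transformer', 'use_bytes', 'debug', 'g_regex_flags',
                             'regex', 'propagate_positions', 'keep_all_tokens', 'tree_class'}

    options = {'debug': True, 'start': 'start', 'maybe_placeholders': False}
    for name in (set(options) - _LOAD_ALLOWED_OPTIONS):
        del options[name]                # 'start' and 'maybe_placeholders' are dropped
    assert options == {'debug': True}    # only the load-allowed option remains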
@@ -278,8 +285,13 @@ class Lark(Serialize):
        # Parse the grammar file and compose the grammars (TODO)
        self.grammar = load_grammar(grammar, self.source_path, re_module, self.options.import_paths)

+        if self.options.postlex is not None:
+            terminals_to_keep = set(self.options.postlex.always_accept)
+        else:
+            terminals_to_keep = set()
+
        # Compile the EBNF grammar into BNF
-        self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
+        self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start, terminals_to_keep)

        if self.options.edit_terminals:
            for t in self.terminals:
@@ -319,7 +331,8 @@ class Lark(Serialize):
            with FS.open(cache_fn, 'wb') as f:
                self.save(f)

-    __doc__ += "\n\n" + LarkOptions.OPTIONS_DOC
+    if __doc__:
+        __doc__ += "\n\n" + LarkOptions.OPTIONS_DOC

    __serialize_fields__ = 'parser', 'rules', 'options'
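Note: the guard presumably targets `python -OO`, which strips docstrings and
leaves __doc__ as None, so the old unguarded `+=` would raise. A quick
demonstration of that failure mode:

    doc = None                # what __doc__ becomes under python -OO
    try:
        doc += "\n\nextra"    # mirrors the old unguarded augmented assignment
    except TypeError as e:
        print(e)              # unsupported operand type(s) for +=: 'NoneType' and 'str'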
@@ -345,7 +358,7 @@ class Lark(Serialize):
        Useful for caching and multiprocessing.
        """
        data, m = self.memo_serialize([TerminalDef, Rule])
-        pickle.dump({'data': data, 'memo': m}, f)
+        pickle.dump({'data': data, 'memo': m}, f, protocol=pickle.HIGHEST_PROTOCOL)

    @classmethod
    def load(cls, f):
@@ -356,7 +369,7 @@ class Lark(Serialize):
        inst = cls.__new__(cls)
        return inst._load(f)

-    def _load(self, f, transformer=None, postlex=None):
+    def _load(self, f, **kwargs):
        if isinstance(f, dict):
            d = f
        else:
@@ -367,12 +380,11 @@ class Lark(Serialize):
        assert memo
        memo = SerializeMemoizer.deserialize(memo, {'Rule': Rule, 'TerminalDef': TerminalDef}, {})
        options = dict(data['options'])
-        if transformer is not None:
-            options['transformer'] = transformer
-        if postlex is not None:
-            options['postlex'] = postlex
+        if (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & set(LarkOptions._defaults):
+            raise ValueError("Some options are not allowed when loading a Parser: {}"
+                             .format(set(kwargs) - _LOAD_ALLOWED_OPTIONS))
+        options.update(kwargs)
        self.options = LarkOptions.deserialize(options, memo)
-        re_module = regex if self.options.regex else re
        self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
        self.source_path = '<deserialized>'
        self._prepare_callbacks()
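Note: the new check rejects only kwargs that are real Lark options but not
load-allowed; unknown keys fall through to LarkOptions.deserialize. A
self-contained sketch of the semantics (both sets abridged):

    _LOAD_ALLOWED_OPTIONS = {'postlex', 'transformer', 'debug'}       # abridged
    _defaults = {'lexer', 'parser', 'start', 'debug', 'transformer'}  # abridged

    def check(kwargs):
        bad = (set(kwargs) - _LOAD_ALLOWED_OPTIONS) & _defaults
        if bad:
            raise ValueError("Some options are not allowed when loading a Parser: %s" % bad)

    check({'debug': True})        # fine: debug is load-allowed
    check({'lexer': 'standard'})  # ValueError: a real option that is not load-allowed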
@@ -380,18 +392,16 @@ class Lark(Serialize):
            data['parser'],
            memo,
            self._callbacks,
-            self.options.postlex,
-            self.options.transformer,
-            re_module
+            self.options,  # Not all, but multiple attributes are used
        )
        self.terminals = self.parser.lexer_conf.tokens
        self._terminals_dict = {t.name: t for t in self.terminals}
        return self

    @classmethod
-    def _load_from_dict(cls, data, memo, transformer=None, postlex=None):
+    def _load_from_dict(cls, data, memo, **kwargs):
        inst = cls.__new__(cls)
-        return inst._load({'data': data, 'memo': memo}, transformer, postlex)
+        return inst._load({'data': data, 'memo': memo}, **kwargs)

    @classmethod
    def open(cls, grammar_filename, rel_to=None, **options):
@@ -527,7 +527,7 @@ class Grammar:
        self.rule_defs = rule_defs
        self.ignore = ignore

-    def compile(self, start):
+    def compile(self, start, terminals_to_keep):
        # We change the trees in-place (to support huge grammars)
        # So deepcopy allows calling compile more than once.
        term_defs = deepcopy(list(self.term_defs))
@@ -642,7 +642,7 @@ class Grammar:
        used_terms = {t.name for r in compiled_rules
                      for t in r.expansion
                      if isinstance(t, Terminal)}
-        terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore)
+        terminals, unused = classify_bool(terminals, lambda t: t.name in used_terms or t.name in self.ignore or t.name in terminals_to_keep)
        if unused:
            logger.debug("Unused terminals: %s", [t.name for t in unused])
@@ -6,6 +6,11 @@ from .parsers.lalr_parser import LALR_Parser
 from .grammar import Rule
 from .tree import Tree
 from .common import LexerConf
+try:
+    import regex
+except ImportError:
+    regex = None
+import re

 ###{standalone
@@ -82,16 +87,18 @@ class WithLexer(_ParserFrontend):
        self.postlex = lexer_conf.postlex

    @classmethod
-    def deserialize(cls, data, memo, callbacks, postlex, transformer, re_module):
+    def deserialize(cls, data, memo, callbacks, options):
        inst = super(WithLexer, cls).deserialize(data, memo)
-        inst.postlex = postlex
-        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
+        inst.postlex = options.postlex
+        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks, options.debug)
        terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]
-        inst.lexer_conf.callbacks = _get_lexer_callbacks(transformer, terminals)
-        inst.lexer_conf.re_module = re_module
-        inst.lexer_conf.skip_validation=True
+        inst.lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
+        inst.lexer_conf.re_module = regex if options.regex else re
+        inst.lexer_conf.use_bytes = options.use_bytes
+        inst.lexer_conf.g_regex_flags = options.g_regex_flags
+        inst.lexer_conf.skip_validation = True
        inst.init_lexer()
        return inst
@@ -246,13 +246,14 @@ class LALR_Analyzer(GrammarAnalyzer):
    def compute_lalr1_states(self):
        m = {}
+        reduce_reduce = []
        for state in self.lr0_states:
            actions = {}
            for la, next_state in state.transitions.items():
                actions[la] = (Shift, next_state.closure)
            for la, rules in state.lookaheads.items():
                if len(rules) > 1:
-                    raise GrammarError('Reduce/Reduce collision in %s between the following rules: %s' % (la, ''.join([ '\n\t\t- ' + str(r) for r in rules ])))
+                    reduce_reduce.append((la, rules))
                if la in actions:
                    if self.debug:
                        logger.warning('Shift/Reduce conflict for terminal %s: (resolving as shift)', la.name)
@@ -261,6 +262,12 @@ class LALR_Analyzer(GrammarAnalyzer):
                    actions[la] = (Reduce, list(rules)[0])
            m[state] = { k.name: v for k, v in actions.items() }

+        if reduce_reduce:
+            msgs = ['Reduce/Reduce collision in %s between the following rules: %s'
+                    % (la, ''.join(['\n\t\t- ' + str(r) for r in rules]))
+                    for la, rules in reduce_reduce]
+            raise GrammarError('\n\n'.join(msgs))
+
        states = { k.closure: v for k, v in m.items() }

        # compute end states
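Note: collecting the collisions before raising means a grammar with several
reduce/reduce conflicts now reports all of them in a single GrammarError instead
of stopping at the first. A minimal grammar that triggers the error (sketch):

    from lark import Lark
    from lark.exceptions import GrammarError

    collision = """
    start: a | b
    a: "x"
    b: "x"
    """

    try:
        Lark(collision, parser='lalr')   # "x" reduces to both a and b on end-of-input
    except GrammarError as e:
        print(e)                         # every colliding lookahead is listed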
@@ -23,10 +23,10 @@ class LALR_Parser(object):
        self.parser = _Parser(analysis.parse_table, callbacks, debug)

    @classmethod
-    def deserialize(cls, data, memo, callbacks):
+    def deserialize(cls, data, memo, callbacks, debug=False):
        inst = cls.__new__(cls)
        inst._parse_table = IntParseTable.deserialize(data, memo)
-        inst.parser = _Parser(inst._parse_table, callbacks)
+        inst.parser = _Parser(inst._parse_table, callbacks, debug)
        return inst

    def serialize(self, memo):
@@ -145,8 +145,8 @@ def main(fobj, start, print=print):
    print('Shift = 0')
    print('Reduce = 1')
-    print("def Lark_StandAlone(transformer=None, postlex=None):")
-    print("  return Lark._load_from_dict(DATA, MEMO, transformer=transformer, postlex=postlex)")
+    print("def Lark_StandAlone(**kwargs):")
+    print("  return Lark._load_from_dict(DATA, MEMO, **kwargs)")
@@ -81,7 +81,8 @@ class TreeMatcher:
    def __init__(self, parser):
        # XXX TODO calling compile twice returns different results!
        assert parser.options.maybe_placeholders == False
-        self.tokens, rules, _extra = parser.grammar.compile(parser.options.start)
+        # XXX TODO: we just ignore the potential existence of a postlexer
+        self.tokens, rules, _extra = parser.grammar.compile(parser.options.start, set())

        self.rules_for_root = defaultdict(list)
@@ -86,6 +86,12 @@ class TestCache(TestCase):
            parser = Lark(g, parser='lalr', lexer=CustomLexer, cache=True)
            assert len(mock_fs.files) == 1
            assert parser.parse('a') == Tree('start', [])
+
+            # Test options persistence
+            mock_fs.files = {}
+            Lark(g, parser="lalr", debug=True, cache=True)
+            parser = Lark(g, parser="lalr", debug=True, cache=True)
+            assert parser.options.options['debug']
        finally:
            lark_module.FS = fs
@@ -93,6 +99,3 @@

 if __name__ == '__main__':
     main()
@@ -1782,6 +1782,29 @@ def _make_parser_test(LEXER, PARSER):
                %import bad_test.NUMBER
                """
            self.assertRaises(IOError, _Lark, grammar)
+
+        @unittest.skipIf(LEXER=='dynamic', "%declare/postlex doesn't work with dynamic")
+        def test_postlex_declare(self):   # Note: this test does a lot. maybe split it up?
+            class TestPostLexer:
+                def process(self, stream):
+                    for t in stream:
+                        if t.type == 'A':
+                            t.type = 'B'
+                            yield t
+                        else:
+                            yield t
+
+                always_accept = ('A',)
+
+            parser = _Lark("""
+            start: B
+            A: "A"
+            %declare B
+            """, postlex=TestPostLexer())
+
+            test_file = "A"
+            tree = parser.parse(test_file)
+            self.assertEqual(tree.children, [Token('B', 'A')])

        def test_import_custom_sources(self):
            custom_loader = FromPackageLoader('tests', ('grammars', ))
@@ -25,7 +25,7 @@ class TestStandalone(TestCase):
        standalone.main(StringIO(grammar), 'start', print=pr)
        code = code_buf.getvalue()
-        context = {}
+        context = {'__doc__': None}
        exec(code, context)
        return context
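Note: exec() with a fresh globals dict does not define __doc__, so the generated
module's new `if __doc__:` guard would hit a NameError without seeding it. A
quick demonstration:

    try:
        exec("if __doc__: pass", {})             # NameError: name '__doc__' is not defined
    except NameError as e:
        print(e)

    exec("if __doc__: pass", {'__doc__': None})  # fine once the name is present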
@@ -20,7 +20,7 @@ class TestTrees(TestCase):
    def test_pickle(self):
        s = copy.deepcopy(self.tree1)
-        data = pickle.dumps(s)
+        data = pickle.dumps(s, protocol=pickle.HIGHEST_PROTOCOL)
        assert pickle.loads(data) == s

    def test_repr_runnable(self):