diff --git a/lark/common.py b/lark/common.py
index 4bf04ec..e217063 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -5,7 +5,7 @@
 from .lexer import TerminalDef
 
 class LexerConf(Serialize):
-    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes'
+    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags', 'use_bytes', 'lexer_type'
     __serialize_namespace__ = TerminalDef,
 
     def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False, use_bytes=False):
@@ -18,12 +18,18 @@ class LexerConf(Serialize):
         self.skip_validation = skip_validation
         self.use_bytes = use_bytes
 
-###}
+        self.lexer_type = None
+
+class ParserConf(Serialize):
+    __serialize_fields__ = 'rules', 'start', 'parser_type'
 
-class ParserConf:
     def __init__(self, rules, callbacks, start):
         assert isinstance(start, list)
         self.rules = rules
         self.callbacks = callbacks
         self.start = start
+
+        self.parser_type = None
+
+###}
 
diff --git a/lark/exceptions.py b/lark/exceptions.py
index 72f6c6f..46740ed 100644
--- a/lark/exceptions.py
+++ b/lark/exceptions.py
@@ -11,6 +11,11 @@ class ConfigurationError(LarkError, ValueError):
     pass
 
+def assert_config(value, options, msg='Got %r, expected one of %s'):
+    if value not in options:
+        raise ConfigurationError(msg % (value, options))
+
+
 
 class GrammarError(LarkError):
     pass
 
@@ -198,4 +203,6 @@ class VisitError(LarkError):
         message = 'Error trying to process rule "%s":\n\n%s' % (rule, orig_exc)
         super(VisitError, self).__init__(message)
+
+
 
 ###}
diff --git a/lark/lark.py b/lark/lark.py
index b94f26b..842df5f 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -1,5 +1,5 @@
 from __future__ import absolute_import
-from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken, ConfigurationError
+from lark.exceptions import UnexpectedCharacters, UnexpectedInput, UnexpectedToken, ConfigurationError, assert_config
 
 import sys, os, pickle, hashlib
 from io import open
@@ -24,10 +24,6 @@ except ImportError:
 
 ###{standalone
 
-def assert_config(value, options, msg='Got %r, expected one of %s'):
-    if value not in options:
-        raise ConfigurationError(msg % (value, options))
-
 
 class LarkOptions(Serialize):
     """Specifies the options for Lark
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 70fd7eb..76834f4 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -11,7 +11,7 @@ from .utils import bfs, Py36, logger, classify_bool
 from .lexer import Token, TerminalDef, PatternStr, PatternRE
 
 from .parse_tree_builder import ParseTreeBuilder
-from .parser_frontends import LALR_TraditionalLexer
+from .parser_frontends import ParsingFrontend
 from .common import LexerConf, ParserConf
 from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
 from .utils import classify, suppress, dedup_list, Str
@@ -883,9 +883,10 @@ class GrammarLoader:
         callback = ParseTreeBuilder(rules, ST).create_callback()
         import re
         lexer_conf = LexerConf(terminals, re, ['WS', 'COMMENT'])
-
         parser_conf = ParserConf(rules, callback, ['start'])
-        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
+        lexer_conf.lexer_type = 'standard'
+        parser_conf.parser_type = 'lalr'
+        self.parser = ParsingFrontend(lexer_conf, parser_conf, {})
 
         self.canonize_tree = CanonizeTree()
         self.global_keep_all_tokens = global_keep_all_tokens
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 5d32589..5cffdb1 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -1,12 +1,11 @@
-from .exceptions import ConfigurationError, GrammarError
+from .exceptions import ConfigurationError, GrammarError, assert_config
 from .utils import get_regexp_width, Serialize
 from .parsers.grammar_analysis import GrammarAnalyzer
 from .lexer import LexerThread, TraditionalLexer, ContextualLexer, Lexer, Token, TerminalDef
 from .parsers import earley, xearley, cyk
 from .parsers.lalr_parser import LALR_Parser
-from .grammar import Rule
 from .tree import Tree
-from .common import LexerConf
+from .common import LexerConf, ParserConf
 try:
     import regex
 except ImportError:
@@ -27,56 +26,106 @@ def _wrap_lexer(lexer_class):
                 return self.lexer.lex(lexer_state.text)
         return CustomLexerWrapper
 
 
-def get_frontend(parser, lexer):
-    if parser=='lalr':
-        if lexer is None:
-            raise ConfigurationError('The LALR parser requires use of a lexer')
-        elif lexer == 'standard':
-            return LALR_TraditionalLexer
-        elif lexer == 'contextual':
-            return LALR_ContextualLexer
-        elif issubclass(lexer, Lexer):
-            wrapped = _wrap_lexer(lexer)
-            class LALR_CustomLexerWrapper(LALR_WithLexer):
-                def init_lexer(self):
-                    self.lexer = wrapped(self.lexer_conf)
-            return LALR_CustomLexerWrapper
-        else:
-            raise ConfigurationError('Unknown lexer: %s' % lexer)
-    elif parser=='earley':
-        if lexer=='standard':
-            return Earley_Traditional
-        elif lexer=='dynamic':
-            return XEarley
-        elif lexer=='dynamic_complete':
-            return XEarley_CompleteLex
-        elif lexer=='contextual':
-            raise ConfigurationError('The Earley parser does not support the contextual parser')
-        elif issubclass(lexer, Lexer):
-            wrapped = _wrap_lexer(lexer)
-            class Earley_CustomLexerWrapper(Earley_WithLexer):
-                def init_lexer(self, **kw):
-                    self.lexer = wrapped(self.lexer_conf)
-            return Earley_CustomLexerWrapper
+
+class MakeParsingFrontend:
+    def __init__(self, parser_type, lexer_type):
+        self.parser_type = parser_type
+        self.lexer_type = lexer_type
+
+    def __call__(self, lexer_conf, parser_conf, options):
+        assert isinstance(lexer_conf, LexerConf)
+        assert isinstance(parser_conf, ParserConf)
+        parser_conf.parser_type = self.parser_type
+        lexer_conf.lexer_type = self.lexer_type
+        return ParsingFrontend(lexer_conf, parser_conf, options)
+
+    @classmethod
+    def deserialize(cls, data, memo, callbacks, options):
+        lexer_conf = LexerConf.deserialize(data['lexer_conf'], memo)
+        parser_conf = ParserConf.deserialize(data['parser_conf'], memo)
+        parser = LALR_Parser.deserialize(data['parser'], memo, callbacks, options.debug)
+        parser_conf.callbacks = callbacks
+
+        terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]
+
+        lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
+        lexer_conf.re_module = regex if options.regex else re
+        lexer_conf.use_bytes = options.use_bytes
+        lexer_conf.g_regex_flags = options.g_regex_flags
+        lexer_conf.skip_validation = True
+        lexer_conf.postlex = options.postlex
+
+        return ParsingFrontend(lexer_conf, parser_conf, options, parser=parser)
+
+
+
+
+class ParsingFrontend(Serialize):
+    __serialize_fields__ = 'lexer_conf', 'parser_conf', 'parser', 'options'
+
+    def __init__(self, lexer_conf, parser_conf, options, parser=None):
+        self.parser_conf = parser_conf
+        self.lexer_conf = lexer_conf
+        self.options = options
+
+        # Set-up parser
+        if parser: # From cache
+            self.parser = parser
         else:
-            raise ConfigurationError('Unknown lexer: %s' % lexer)
-    elif parser == 'cyk':
-        if lexer == 'standard':
-            return CYK
+            create_parser = {
+                'lalr': create_lalr_parser,
+                'earley': create_earley_parser,
+                'cyk': CYK_FrontEnd,
+            }[parser_conf.parser_type]
+            self.parser = create_parser(lexer_conf, parser_conf, options)
+
+        # Set-up lexer
+        lexer_type = lexer_conf.lexer_type
+        self.skip_lexer = False
+        if lexer_type in ('dynamic', 'dynamic_complete'):
+            self.skip_lexer = True
+            return
+
+        try:
+            create_lexer = {
+                'standard': create_traditional_lexer,
+                'contextual': create_contextual_lexer,
+            }[lexer_type]
+        except KeyError:
+            assert issubclass(lexer_type, Lexer), lexer_type
+            self.lexer = _wrap_lexer(lexer_type)(lexer_conf)
         else:
-            raise ConfigurationError('CYK parser requires using standard parser.')
-    else:
-        raise ConfigurationError('Unknown parser: %s' % parser)
+            self.lexer = create_lexer(lexer_conf, self.parser, lexer_conf.postlex)
+        if lexer_conf.postlex:
+            self.lexer = PostLexConnector(self.lexer, lexer_conf.postlex)
 
-class _ParserFrontend(Serialize):
-    def _parse(self, start, input, *args):
+
+    def parse(self, text, start=None):
         if start is None:
-            start = self.start
+            start = self.parser_conf.start
         if len(start) > 1:
             raise ConfigurationError("Lark initialized with more than 1 possible start rule. Must specify which start rule to parse", start)
         start ,= start
-        return self.parser.parse(input, start, *args)
+
+        if self.skip_lexer:
+            return self.parser.parse(text, start)
+
+        lexer_thread = LexerThread(self.lexer, text)
+        return self.parser.parse(lexer_thread, start)
+
+
+def get_frontend(parser, lexer):
+    assert_config(parser, ('lalr', 'earley', 'cyk'))
+    if not isinstance(lexer, type): # not custom lexer?
+        expected = {
+            'lalr': ('standard', 'contextual'),
+            'earley': ('standard', 'dynamic', 'dynamic_complete'),
+            'cyk': ('standard', ),
+        }[parser]
+        assert_config(lexer, expected, 'Parser %r does not support lexer %%r, expected one of %%s' % parser)
+
+    return MakeParsingFrontend(parser, lexer)
 
 
 def _get_lexer_callbacks(transformer, terminals):
@@ -100,119 +149,26 @@ class PostLexConnector:
         return self.postlexer.process(i)
 
 
-class WithLexer(_ParserFrontend):
-    lexer = None
-    parser = None
-    lexer_conf = None
-    start = None
-
-    __serialize_fields__ = 'parser', 'lexer_conf', 'start'
-    __serialize_namespace__ = LexerConf,
-
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        self.lexer_conf = lexer_conf
-        self.start = parser_conf.start
-        self.postlex = lexer_conf.postlex
-
-    @classmethod
-    def deserialize(cls, data, memo, callbacks, options):
-        inst = super(WithLexer, cls).deserialize(data, memo)
-
-        inst.postlex = options.postlex
-        inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks, options.debug)
-
-        terminals = [item for item in memo.values() if isinstance(item, TerminalDef)]
-        inst.lexer_conf.callbacks = _get_lexer_callbacks(options.transformer, terminals)
-        inst.lexer_conf.re_module = regex if options.regex else re
-        inst.lexer_conf.use_bytes = options.use_bytes
-        inst.lexer_conf.g_regex_flags = options.g_regex_flags
-        inst.lexer_conf.skip_validation = True
-        inst.init_lexer()
-
-        return inst
-
-    def _serialize(self, data, memo):
-        data['parser'] = data['parser'].serialize(memo)
-
-    def make_lexer(self, text):
-        lexer = self.lexer
-        if self.postlex:
-            lexer = PostLexConnector(self.lexer, self.postlex)
-        return LexerThread(lexer, text)
-
-    def parse(self, text, start=None):
-        return self._parse(start, self.make_lexer(text))
-
-    def init_traditional_lexer(self):
-        self.lexer = TraditionalLexer(self.lexer_conf)
-
-class LALR_WithLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        debug = options.debug if options else False
-        self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
-        self.init_lexer()
+def create_traditional_lexer(lexer_conf, parser, postlex):
+    return TraditionalLexer(lexer_conf)
 
-    def init_lexer(self, **kw):
-        raise NotImplementedError()
+def create_contextual_lexer(lexer_conf, parser, postlex):
+    states = {idx:list(t.keys()) for idx, t in parser._parse_table.states.items()}
+    always_accept = postlex.always_accept if postlex else ()
+    return ContextualLexer(lexer_conf, states, always_accept=always_accept)
 
-class LALR_TraditionalLexer(LALR_WithLexer):
-    def init_lexer(self):
-        self.init_traditional_lexer()
+def create_lalr_parser(lexer_conf, parser_conf, options=None):
+    debug = options.debug if options else False
+    return LALR_Parser(parser_conf, debug=debug)
 
-class LALR_ContextualLexer(LALR_WithLexer):
-    def init_lexer(self):
-        states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
-        always_accept = self.postlex.always_accept if self.postlex else ()
-        self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)
+create_earley_parser = NotImplemented
+CYK_FrontEnd = NotImplemented
 
 ###}
-
-class Earley_WithLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
-        self.init_lexer()
-
-        resolve_ambiguity = options.ambiguity == 'resolve'
-        debug = options.debug if options else False
-        tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
-        self.parser = earley.Parser(parser_conf, self.match, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class)
-
-    def match(self, term, token):
-        return term.name == token.type
-
-    def init_lexer(self, **kw):
-        raise NotImplementedError()
-
-class Earley_Traditional(Earley_WithLexer):
-    def init_lexer(self, **kw):
-        self.init_traditional_lexer()
-
-
-class XEarley(_ParserFrontend):
-    def __init__(self, lexer_conf, parser_conf, options=None, **kw):
-        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
-        self.start = parser_conf.start
-
-        self._prepare_match(lexer_conf)
-        resolve_ambiguity = options.ambiguity == 'resolve'
-        debug = options.debug if options else False
-        tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
-        self.parser = xearley.Parser(parser_conf,
-                                    self.match,
-                                    ignore=lexer_conf.ignore,
-                                    resolve_ambiguity=resolve_ambiguity,
-                                    debug=debug,
-                                    tree_class=tree_class,
-                                    **kw
-                                    )
-
-    def match(self, term, text, index=0):
-        return self.regexps[term.name].match(text, index)
-
-    def _prepare_match(self, lexer_conf):
+class EarleyRegexpMatcher:
+    def __init__(self, lexer_conf):
         self.regexps = {}
         for t in lexer_conf.tokens:
             if t.priority != 1:
@@ -230,31 +186,49 @@ class XEarley(_ParserFrontend):
 
             self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)
 
-    def parse(self, text, start):
-        return self._parse(start, text)
+    def match(self, term, text, index=0):
+        return self.regexps[term.name].match(text, index)
 
-class XEarley_CompleteLex(XEarley):
-    def __init__(self, *args, **kw):
-        XEarley.__init__(self, *args, complete_lex=True, **kw)
+def create_earley_parser__dynamic(lexer_conf, parser_conf, options=None, **kw):
+    earley_matcher = EarleyRegexpMatcher(lexer_conf)
+    return xearley.Parser(parser_conf, earley_matcher.match, ignore=lexer_conf.ignore, **kw)
 
+def _match_earley_basic(term, token):
+    return term.name == token.type
 
-class CYK(WithLexer):
+def create_earley_parser__basic(lexer_conf, parser_conf, options, **kw):
+    return earley.Parser(parser_conf, _match_earley_basic, **kw)
 
-    def __init__(self, lexer_conf, parser_conf, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, options)
-        self.init_traditional_lexer()
+def create_earley_parser(lexer_conf, parser_conf, options):
+    resolve_ambiguity = options.ambiguity == 'resolve'
+    debug = options.debug if options else False
+    tree_class = options.tree_class or Tree if options.ambiguity != 'forest' else None
+
+    extra = {}
+    if lexer_conf.lexer_type == 'dynamic':
+        f = create_earley_parser__dynamic
+    elif lexer_conf.lexer_type == 'dynamic_complete':
+        extra['complete_lex'] =True
+        f = create_earley_parser__dynamic
+    else:
+        f = create_earley_parser__basic
+
+    return f(lexer_conf, parser_conf, options, resolve_ambiguity=resolve_ambiguity, debug=debug, tree_class=tree_class, **extra)
+
+
+class CYK_FrontEnd:
+    def __init__(self, lexer_conf, parser_conf, options=None):
         self._analysis = GrammarAnalyzer(parser_conf)
         self.parser = cyk.Parser(parser_conf.rules)
 
         self.callbacks = parser_conf.callbacks
 
-    def parse(self, text, start):
-        tokens = list(self.make_lexer(text).lex(None))
-        parse = self._parse(start, tokens)
-        parse = self._transform(parse)
-        return parse
+    def parse(self, lexer_thread, start):
+        tokens = list(lexer_thread.lex(None))
+        tree = self.parser.parse(tokens, start)
+        return self._transform(tree)
 
     def _transform(self, tree):
         subtrees = list(tree.iter_subtrees())
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index 3d006e7..f7ff8fe 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -5,13 +5,14 @@
 from copy import deepcopy, copy
 
 from ..exceptions import UnexpectedInput, UnexpectedToken
 from ..lexer import Token
+from ..utils import Serialize
 
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 from .lalr_puppet import ParserPuppet
 
 ###{standalone
-class LALR_Parser(object):
+class LALR_Parser(Serialize):
     def __init__(self, parser_conf, debug=False):
         analysis = LALR_Analyzer(parser_conf, debug=debug)
         analysis.compute_lalr()
diff --git a/lark/utils.py b/lark/utils.py
index 366922b..3b5b8a8 100644
--- a/lark/utils.py
+++ b/lark/utils.py
@@ -302,4 +302,5 @@ def _serialize(value, memo):
         return list(value)  # TODO reversible?
     elif isinstance(value, dict):
         return {key:_serialize(elem, memo) for key, elem in value.items()}
+    # assert value is None or isinstance(value, (int, float, str, tuple)), value
     return value
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 44125ff..9b011f7 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -2471,6 +2471,7 @@ _TO_TEST = [
         ('contextual', 'lalr'),
 
         ('custom_new', 'lalr'),
+        ('custom_new', 'cyk'),
         ('custom_old', 'earley'),
 ]