diff --git a/examples/python_parser.py b/examples/python_parser.py
index 988fd97..82bfcb9 100644
--- a/examples/python_parser.py
+++ b/examples/python_parser.py
@@ -26,6 +26,13 @@
 python_parser2 = Lark.open('python2.lark', parser='lalr', **kwargs)
 python_parser3 = Lark.open('python3.lark', parser='lalr', **kwargs)
 python_parser2_earley = Lark.open('python2.lark', parser='earley', lexer='standard', **kwargs)
+try:
+    xrange
+except NameError:
+    chosen_parser = python_parser3
+else:
+    chosen_parser = python_parser2
+
 
 def _read(fn, *args):
     kwargs = {'encoding': 'iso-8859-1'}
@@ -42,24 +49,14 @@ def _get_lib_path():
     return [x for x in sys.path if x.endswith('%s.%s' % sys.version_info[:2])][0]
 
 def test_python_lib():
     path = _get_lib_path()
 
     start = time.time()
     files = glob.glob(path+'/*.py')
     for f in files:
         print( f )
-        try:
-            # print list(python_parser.lex(_read(os.path.join(path, f)) + '\n'))
-            try:
-                xrange
-            except NameError:
-                python_parser3.parse(_read(os.path.join(path, f)) + '\n')
-            else:
-                python_parser2.parse(_read(os.path.join(path, f)) + '\n')
-        except:
-            print ('At %s' % f)
-            raise
+        chosen_parser.parse(_read(os.path.join(path, f)) + '\n')
 
     end = time.time()
     print( "test_python_lib (%d files), time: %s secs" % (len(files), end-start) )
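Note on the example change above: the Python-2/3 probe is hoisted to module level so it runs once, instead of being re-checked inside the per-file loop. (The hunk also keeps `path = _get_lib_path()`, which the loop still needs.) A sketch of an equivalent, arguably more explicit selection — the diff itself probes for the Python-2-only `xrange` builtin, while `python_parser2`/`python_parser3` are the Lark instances defined earlier in the example:

```python
import sys

# One-time parser selection, equivalent to the try/xrange/except probe above.
chosen_parser = python_parser3 if sys.version_info[0] >= 3 else python_parser2
```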
diff --git a/lark/common.py b/lark/common.py
index c44f9ce..5c55b8c 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -7,12 +7,14 @@ class LexerConf(Serialize):
     __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags'
     __serialize_namespace__ = TerminalDef,
 
-    def __init__(self, tokens, ignore=(), postlex=None, callbacks=None, g_regex_flags=0):
-        self.tokens = tokens
+    def __init__(self, tokens, re_module, ignore=(), postlex=None, callbacks=None, g_regex_flags=0, skip_validation=False):
+        self.tokens = tokens    # TODO should be terminals
         self.ignore = ignore
         self.postlex = postlex
         self.callbacks = callbacks or {}
         self.g_regex_flags = g_regex_flags
+        self.re_module = re_module
+        self.skip_validation = skip_validation
 
     def _deserialize(self):
         self.callbacks = {} # TODO
diff --git a/lark/lark.py b/lark/lark.py
index 2b783cb..e17da6b 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -166,11 +166,11 @@ class Lark(Serialize):
         use_regex = self.options.regex
         if use_regex:
             if regex:
-                self.re = regex
+                re_module = regex
            else:
                 raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
         else:
-            self.re = re
+            re_module = re
 
         # Some, but not all file-like objects have a 'name' attribute
         try:
@@ -243,7 +243,7 @@ class Lark(Serialize):
             assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )
 
         # Parse the grammar file and compose the grammars (TODO)
-        self.grammar = load_grammar(grammar, self.source, self.re)
+        self.grammar = load_grammar(grammar, self.source, re_module)
 
         # Compile the EBNF grammar into BNF
         self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
@@ -276,7 +276,7 @@ class Lark(Serialize):
                 if hasattr(t, term.name):
                     lexer_callbacks[term.name] = getattr(t, term.name)
 
-        self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)
+        self.lexer_conf = LexerConf(self.terminals, re_module, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)
 
         if self.options.parser:
             self.parser = self._build_parser()
@@ -304,7 +304,7 @@ class Lark(Serialize):
     def _build_parser(self):
         self._prepare_callbacks()
         parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
-        return self.parser_class(self.lexer_conf, parser_conf, self.re, options=self.options)
+        return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
 
     def save(self, f):
         data, m = self.memo_serialize([TerminalDef, Rule])
@@ -331,11 +331,11 @@ class Lark(Serialize):
         if postlex is not None:
             options['postlex'] = postlex
         self.options = LarkOptions.deserialize(options, memo)
-        self.re = regex if self.options.regex else re
+        re_module = regex if self.options.regex else re
         self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
         self.source = ''
         self._prepare_callbacks()
-        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, self.re)
+        self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, re_module)
         return self
 
     @classmethod
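The `lark/common.py` and `lark/lark.py` hunks above move the regex engine off the `Lark` instance (`self.re`) and into `LexerConf` as `re_module`, so anything holding the config can reach the engine without extra plumbing. A minimal sketch of the new construction contract, assuming this diff is applied:

```python
import re
from lark.common import LexerConf
from lark.lexer import TerminalDef, PatternRE

terminals = [TerminalDef('NUMBER', PatternRE(r'\d+'))]

# The engine (stdlib `re`, or `regex` when Lark(regex=True)) now travels
# inside the config object; skip_validation defaults to False, so terminals
# are still sanity-checked unless explicitly trusted (e.g. after deserialize).
conf = LexerConf(terminals, re, ignore=())
assert conf.re_module is re
assert conf.skip_validation is False
```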
(%s: %s)" % (t.name, t.pattern)) - assert set(ignore) <= {t.name for t in terminals} + assert set(conf.ignore) <= {t.name for t in terminals} # Init self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())] - self.ignore_types = list(ignore) + self.ignore_types = list(conf.ignore) terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name)) self.terminals = terminals - self.user_callbacks = user_callbacks - self.build(g_regex_flags) + self.user_callbacks = conf.callbacks + self.g_regex_flags = conf.g_regex_flags + + self._mres = None + # self.build(g_regex_flags) - def build(self, g_regex_flags=0): - terminals, self.callback = _create_unless(self.terminals, g_regex_flags, re_=self.re) + def _build(self): + terminals, self.callback = _create_unless(self.terminals, self.g_regex_flags, re_=self.re) assert all(self.callback.values()) for type_, f in self.user_callbacks.items(): @@ -332,7 +336,13 @@ class TraditionalLexer(Lexer): else: self.callback[type_] = f - self.mres = build_mres(terminals, g_regex_flags, self.re) + self._mres = build_mres(terminals, self.g_regex_flags, self.re) + + @property + def mres(self): + if self._mres is None: + self._build() + return self._mres def match(self, stream, pos): for mre, type_from_index in self.mres: @@ -348,13 +358,15 @@ class TraditionalLexer(Lexer): class ContextualLexer(Lexer): - def __init__(self, terminals, states, re_, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0): - self.re = re_ + def __init__(self, conf, states, always_accept=()): + terminals = list(conf.tokens) tokens_by_name = {} for t in terminals: assert t.name not in tokens_by_name, t tokens_by_name[t.name] = t + trad_conf = type(conf)(terminals, conf.re_module, conf.ignore, callbacks=conf.callbacks, g_regex_flags=conf.g_regex_flags, skip_validation=conf.skip_validation) + lexer_by_tokens = {} self.lexers = {} for state, accepts in states.items(): @@ -362,14 +374,17 @@ class ContextualLexer(Lexer): try: lexer = lexer_by_tokens[key] except KeyError: - accepts = set(accepts) | set(ignore) | set(always_accept) + accepts = set(accepts) | set(conf.ignore) | set(always_accept) state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name] - lexer = TraditionalLexer(state_tokens, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags) + lexer_conf = copy(trad_conf) + lexer_conf.tokens = state_tokens + lexer = TraditionalLexer(lexer_conf) lexer_by_tokens[key] = lexer self.lexers[state] = lexer - self.root_lexer = TraditionalLexer(terminals, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags) + assert trad_conf.tokens is terminals + self.root_lexer = TraditionalLexer(trad_conf) def lex(self, stream, get_parser_state): parser_state = get_parser_state() diff --git a/lark/load_grammar.py b/lark/load_grammar.py index 407d8d1..ee0f1c0 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -755,19 +755,19 @@ def _find_used_symbols(tree): for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))} class GrammarLoader: - def __init__(self, re_): - self.re = re_ + def __init__(self, re_module): terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()] rules = [options_from_rule(name, None, x) for name, x in RULES.items()] rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)] callback = ParseTreeBuilder(rules, 
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 407d8d1..ee0f1c0 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -755,19 +755,19 @@ def _find_used_symbols(tree):
               for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
 
 class GrammarLoader:
-    def __init__(self, re_):
-        self.re = re_
+    def __init__(self, re_module):
         terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
 
         rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
         rules = [Rule(NonTerminal(r), symbols_from_strcase(x.split()), i, None, o) for r, _p, xs, o in rules for i, x in enumerate(xs)]
         callback = ParseTreeBuilder(rules, ST).create_callback()
-        lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])
+        lexer_conf = LexerConf(terminals, re_module, ['WS', 'COMMENT'])
 
         parser_conf = ParserConf(rules, callback, ['start'])
-        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_)
+        self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
 
         self.canonize_tree = CanonizeTree()
+        self.re_module = re_module
 
     def load_grammar(self, grammar_text, grammar_name='<?>'):
         "Parse grammar_text, verify, and create Grammar object. Display nice messages on error."
@@ -863,7 +863,7 @@ class GrammarLoader:
         # import grammars
         for dotted_path, (base_paths, aliases) in imports.items():
             grammar_path = os.path.join(*dotted_path) + EXT
-            g = import_grammar(grammar_path, self.re, base_paths=base_paths)
+            g = import_grammar(grammar_path, self.re_module, base_paths=base_paths)
             new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)
 
             term_defs += new_td
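`GrammarLoader` now takes the engine once and stores it as `self.re_module`, so grammar imports reuse it instead of each call site threading `re_` through. Hypothetical usage under the new signature (`GrammarLoader` is internal API, shown here only to illustrate the changed constructor):

```python
import re
from lark.load_grammar import GrammarLoader

loader = GrammarLoader(re)                       # engine chosen up front
grammar = loader.load_grammar('start: "a" "b"')  # no re_ argument needed
```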
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index c453ab6..08f4756 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -62,18 +62,18 @@ class WithLexer(_ParserFrontend):
     __serialize_fields__ = 'parser', 'lexer_conf', 'start'
     __serialize_namespace__ = LexerConf,
 
-    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+    def __init__(self, lexer_conf, parser_conf, options=None):
         self.lexer_conf = lexer_conf
         self.start = parser_conf.start
         self.postlex = lexer_conf.postlex
-        self.re = re_
 
     @classmethod
-    def deserialize(cls, data, memo, callbacks, postlex, re_):
+    def deserialize(cls, data, memo, callbacks, postlex, re_module):
         inst = super(WithLexer, cls).deserialize(data, memo)
-        inst.re = re_
         inst.postlex = postlex
         inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
+        inst.lexer_conf.re_module = re_module
+        inst.lexer_conf.skip_validation = True
         inst.init_lexer()
         return inst
 
@@ -89,18 +89,17 @@ class WithLexer(_ParserFrontend):
         return self._parse(token_stream, start)
 
     def init_traditional_lexer(self):
-        self.lexer = TraditionalLexer(self.lexer_conf.tokens, re_=self.re, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
+        self.lexer = TraditionalLexer(self.lexer_conf)
 
 class LALR_WithLexer(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, re_, options=None):
+    def __init__(self, lexer_conf, parser_conf, options=None):
         debug = options.debug if options else False
-        self.re = re_
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)
 
         self.init_lexer()
 
-    def init_lexer(self):
+    def init_lexer(self, **kw):
         raise NotImplementedError()
 
 class LALR_TraditionalLexer(LALR_WithLexer):
@@ -111,12 +110,7 @@ class LALR_ContextualLexer(LALR_WithLexer):
     def init_lexer(self):
         states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
         always_accept = self.postlex.always_accept if self.postlex else ()
-        self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
-                                     re_=self.re,
-                                     ignore=self.lexer_conf.ignore,
-                                     always_accept=always_accept,
-                                     user_callbacks=self.lexer_conf.callbacks,
-                                     g_regex_flags=self.lexer_conf.g_regex_flags)
+        self.lexer = ContextualLexer(self.lexer_conf, states, always_accept=always_accept)
 
     def parse(self, text, start=None):
@@ -129,11 +123,11 @@ class LALR_ContextualLexer(LALR_WithLexer):
 ###}
 
 class LALR_CustomLexer(LALR_WithLexer):
-    def __init__(self, lexer_cls, lexer_conf, parser_conf, re_, options=None):
-        self.lexer = lexer_cls(lexer_conf, re_=re_)
+    def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
+        self.lexer = lexer_cls(lexer_conf)
         debug = options.debug if options else False
         self.parser = LALR_Parser(parser_conf, debug=debug)
-        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)
 
 
 def tokenize_text(text):
@@ -146,8 +140,8 @@ def tokenize_text(text):
             yield Token('CHAR', ch, line=line, column=i - col_start_pos)
 
 class Earley(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, re_, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
+    def __init__(self, lexer_conf, parser_conf, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)
         self.init_traditional_lexer()
 
         resolve_ambiguity = options.ambiguity == 'resolve'
@@ -159,9 +153,7 @@ class Earley(WithLexer):
 
 
 class XEarley(_ParserFrontend):
-    def __init__(self, lexer_conf, parser_conf, re_, options=None, **kw):
-        self.re = re_
-
+    def __init__(self, lexer_conf, parser_conf, options=None, **kw):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}
         self.start = parser_conf.start
@@ -193,7 +185,7 @@ class XEarley(_ParserFrontend):
                 if width == 0:
                     raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
 
-            self.regexps[t.name] = self.re.compile(regexp, lexer_conf.g_regex_flags)
+            self.regexps[t.name] = lexer_conf.re_module.compile(regexp, lexer_conf.g_regex_flags)
 
     def parse(self, text, start):
         return self._parse(text, start)
@@ -206,8 +198,8 @@ class XEarley_CompleteLex(XEarley):
 
 
 class CYK(WithLexer):
-    def __init__(self, lexer_conf, parser_conf, re_, options=None):
-        WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
+    def __init__(self, lexer_conf, parser_conf, options=None):
+        WithLexer.__init__(self, lexer_conf, parser_conf, options)
         self.init_traditional_lexer()
 
         self._analysis = GrammarAnalyzer(parser_conf)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index df09307..def4eca 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -6,7 +6,7 @@ import unittest
 import logging
 import os
 import sys
-from copy import deepcopy
+from copy import copy, deepcopy
 try:
     from cStringIO import StringIO as cStringIO
 except ImportError:
@@ -553,8 +553,8 @@ class CustomLexer(Lexer):
        Purpose of this custom lexer is to test the integration,
        so it uses the traditional lexer as implementation without custom lexing behaviour.
     """
-    def __init__(self, lexer_conf, re_):
-        self.lexer = TraditionalLexer(lexer_conf.tokens, re_, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
+    def __init__(self, lexer_conf):
+        self.lexer = TraditionalLexer(copy(lexer_conf))
     def lex(self, *args, **kwargs):
         return self.lexer.lex(*args, **kwargs)
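Finally, the frontend and test changes above mean a custom lexer now receives just the `LexerConf` (which carries `re_module`), rather than a separate `re_` argument. A sketch of what user code looks like after this diff, with a hypothetical toy grammar:

```python
from copy import copy
from lark import Lark
from lark.lexer import Lexer, TraditionalLexer

class MyLexer(Lexer):
    """Pass-through custom lexer; copies the conf so per-state mutation
    elsewhere (see ContextualLexer above) cannot leak into this instance."""
    def __init__(self, lexer_conf):
        self.lexer = TraditionalLexer(copy(lexer_conf))

    def lex(self, *args, **kwargs):
        return self.lexer.lex(*args, **kwargs)

parser = Lark('start: "a"+', parser='lalr', lexer=MyLexer)
print(parser.parse('aaa'))
```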