@@ -22,7 +22,7 @@ jobs:
- name: Install dependencies
run: |
python -m pip install --upgrade pip
pip install -r nearley-requirements.txt
pip install -r test-requirements.txt
- name: Run tests
run: |
python -m tests
@@ -70,6 +70,8 @@ Useful for caching and multiprocessing.
**g_regex_flags** - Flags that are applied to all terminals (both regex and strings)
**regex** - Use the `regex` library instead of the built-in `re` module (See below)
**keep_all_tokens** - Prevent the tree builder from automagically removing "punctuation" tokens (default: False)
**cache** - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading. LALR only for now.
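For instance, these options can be combined when constructing a parser. A rough sketch, assuming the `regex` package is installed; the grammar itself is illustrative and not part of this PR:

```python
from lark import Lark

# Toy grammar, used only to illustrate the options listed above.
grammar = r"""
    start: pair ("," pair)*
    pair: NAME ":" NUMBER
    NAME: /[a-z_]+/
    NUMBER: /[0-9]+/
    %ignore " "
"""

parser = Lark(
    grammar,
    parser='lalr',
    keep_all_tokens=True,   # keep the ":" and "," punctuation in the tree
    cache=True,             # reuse the grammar analysis on later runs (LALR only)
    regex=True,             # compile terminal patterns with the `regex` module
)
print(parser.parse("a: 1, b: 2").pretty())
```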
@@ -94,13 +96,35 @@ Useful for caching and multiprocessing.
- "resolve": The parser will automatically choose the simplest derivation (it chooses consistently: greedy for tokens, non-greedy for rules)
- "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).
#### Domain Specific
#### Misc.
- **postlex** - Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
- **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
- **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
- **edit_terminals** - A callback
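For example, `lexer_callbacks` is commonly used to collect ignored tokens such as comments while lexing. A minimal sketch; the grammar and callback name are illustrative:

```python
from lark import Lark

comments = []

def collect_comment(token):
    # Record the comment; hand the token back to the lexer unchanged.
    comments.append(token.value)
    return token

grammar = r"""
    start: INT+
    COMMENT: /#[^\n]*/
    %import common.INT
    %import common.WS
    %ignore WS
    %ignore COMMENT
"""

parser = Lark(grammar, parser='lalr',
              lexer_callbacks={'COMMENT': collect_comment})
parser.parse("1 2 # a note\n3")
print(comments)   # ['# a note']
```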
#### Using Unicode character classes with `regex`
Python's built-in `re` module has a few persistent known bugs and also doesn't support
advanced regex features such as Unicode character classes.
With `pip install lark-parser[regex]`, the `regex` module will be installed alongside `lark`
and can act as a drop-in replacement for `re`.
Any instance of `Lark` instantiated with `regex=True` will now use the `regex` module
instead of `re`. For example, we can now use Unicode character classes to match PEP-3131 compliant Python identifiers.
```python
from lark import Lark

>>> g = Lark(r"""
        ?start: NAME
        NAME: ID_START ID_CONTINUE*
        ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
        ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}·]+/
    """, regex=True)

>>> g.parse('வணக்கம்')
'வணக்கம்'
```
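If the `regex` package is not installed, `Lark(regex=True)` raises an `ImportError` (see the check added to `lark/lark.py` further down in this diff). A small sketch of that failure mode, with a placeholder grammar:

```python
from lark import Lark

try:
    Lark(r"start: /\p{Lu}+/", regex=True)
except ImportError as err:
    # Raised when regex=True is requested but the `regex` package is missing.
    print(err)
```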
----
## Tree
@@ -23,6 +23,7 @@ class LarkOptions:
transformer: Optional[Transformer]
postlex: Optional[PostLex]
ambiguity: str
regex: bool
debug: bool
keep_all_tokens: bool
propagate_positions: bool
@@ -48,6 +49,7 @@ class Lark:
transformer: Optional[Transformer] = None,
postlex: Optional[PostLex] = None,
ambiguity: Literal["explicit", "resolve"] = "resolve",
regex: bool = False,
debug: bool = False,
keep_all_tokens: bool = False,
propagate_positions: bool = False,
@@ -1,5 +1,5 @@
# -*- coding: utf-8 -*-
from types import ModuleType
from typing import (
TypeVar, Type, Tuple, List, Dict, Iterator, Collection, Callable, Optional,
Pattern as REPattern,
@@ -107,10 +107,12 @@ class TraditionalLexer(Lexer):
user_callbacks: Dict[str, _Callback]
callback: Dict[str, _Callback]
mres: List[Tuple[REPattern, Dict[int, str]]]
re: ModuleType
def __init__(
self,
terminals: Collection[TerminalDef],
re_: ModuleType,
ignore: Collection[str] = ...,
user_callbacks: Dict[str, _Callback] = ...,
g_regex_flags: int = ...
@@ -135,6 +137,7 @@ class ContextualLexer(Lexer):
self,
terminals: Collection[TerminalDef],
states: Dict[str, Collection[str]],
re_: ModuleType,
ignore: Collection[str] = ...,
always_accept: Collection[str] = ...,
user_callbacks: Dict[str, _Callback] = ...,
@@ -14,6 +14,12 @@ from .parse_tree_builder import ParseTreeBuilder
from .parser_frontends import get_frontend
from .grammar import Rule
import re
try:
import regex
except ImportError:
regex = None
###{standalone
class LarkOptions(Serialize):
@@ -34,6 +40,7 @@ class LarkOptions(Serialize):
When `False`, `[]` behaves like the `?` operator,
and returns no value at all.
(default=`False`. Recommended to set to `True`)
regex - When True, uses the `regex` module instead of the stdlib `re`.
cache - Cache the results of the Lark grammar analysis, for x2 to x3 faster loading.
LALR only for now.
When `False`, does nothing (default)
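The `maybe_placeholders` behaviour described in this docstring shows up directly in the shape of the tree. A minimal sketch under that assumption (toy grammar, not from this PR):

```python
from lark import Lark

grammar = r"""
    start: "a" [SEP] "b"
    SEP: /,/
"""

with_placeholders = Lark(grammar, parser='lalr', maybe_placeholders=True)
without_placeholders = Lark(grammar, parser='lalr', maybe_placeholders=False)

# With placeholders, the optional SEP always occupies a slot (None when absent);
# without them, a missing SEP simply disappears from the children.
print(with_placeholders.parse("ab").children)      # expected: [None]
print(without_placeholders.parse("ab").children)   # expected: []
print(with_placeholders.parse("a,b").children)     # expected: [Token('SEP', ',')]
```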
@@ -92,6 +99,7 @@ class LarkOptions(Serialize):
'start': 'start',
'priority': 'auto',
'ambiguity': 'auto',
'regex': False,
'propagate_positions': False,
'lexer_callbacks': {},
'maybe_placeholders': False,
@@ -154,6 +162,16 @@ class Lark(Serialize):
self.options = LarkOptions(options)
# Set regex or re module
use_regex = self.options.regex
if use_regex:
if regex:
self.re = regex
else:
raise ImportError('`regex` module must be installed if calling `Lark(regex=True)`.')
else:
self.re = re
# Some, but not all file-like objects have a 'name' attribute
try:
self.source = grammar.name
@@ -225,7 +243,7 @@ class Lark(Serialize):
assert self.options.ambiguity in ('resolve', 'explicit', 'auto', )
# Parse the grammar file and compose the grammars (TODO)
self.grammar = load_grammar(grammar, self.source)
self.grammar = load_grammar(grammar, self.source, self.re)
# Compile the EBNF grammar into BNF
self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
@@ -286,7 +304,7 @@ class Lark(Serialize):
def _build_parser(self):
self._prepare_callbacks()
parser_conf = ParserConf(self.rules, self._callbacks, self.options.start)
return self.parser_class(self.lexer_conf, parser_conf, options=self.options)
return self.parser_class(self.lexer_conf, parser_conf, self.re, options=self.options)
def save(self, f):
data, m = self.memo_serialize([TerminalDef, Rule])
@@ -313,10 +331,11 @@ class Lark(Serialize):
if postlex is not None:
options['postlex'] = postlex
self.options = LarkOptions.deserialize(options, memo)
self.re = regex if self.options.regex else re
self.rules = [Rule.deserialize(r, memo) for r in data['rules']]
self.source = '<deserialized>'
self._prepare_callbacks()
self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex)
self.parser = self.parser_class.deserialize(data['parser'], memo, self._callbacks, self.options.postlex, self.re)
return self
@classmethod
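Restoring `self.re` during deserialization matters for the save/load round trip: a restored parser re-selects `regex` or the stdlib `re` from its saved options. A sketch using `Lark.save` as shown above, and assuming `Lark.load` as its classmethod counterpart; the grammar and file name are illustrative:

```python
from lark import Lark

parser = Lark(r"""
    start: WORD+
    %import common.WORD
    %import common.WS
    %ignore WS
""", parser='lalr')

with open('parser.pkl', 'wb') as f:
    parser.save(f)

with open('parser.pkl', 'rb') as f:
    # After this change, loading also restores the right regexp module
    # (regex or re) based on the saved `regex` option.
    restored = Lark.load(f)

print(restored.parse("hello world").pretty())
```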
@@ -230,7 +230,7 @@ class CallChain:
def _create_unless(terminals, g_regex_flags):
def _create_unless(terminals, g_regex_flags, re_):
tokens_by_type = classify(terminals, lambda t: type(t.pattern))
assert len(tokens_by_type) <= 2, tokens_by_type.keys()
embedded_strs = set()
@@ -241,19 +241,19 @@ def _create_unless(terminals, g_regex_flags):
if strtok.priority > retok.priority:
continue
s = strtok.pattern.value
m = re.match(retok.pattern.to_regexp(), s, g_regex_flags)
m = re_.match(retok.pattern.to_regexp(), s, g_regex_flags)
if m and m.group(0) == s:
unless.append(strtok)
if strtok.pattern.flags <= retok.pattern.flags:
embedded_strs.add(strtok)
if unless:
callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, match_whole=True))
callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, re_, match_whole=True))
terminals = [t for t in terminals if t not in embedded_strs]
return terminals, callback
def _build_mres(terminals, max_size, g_regex_flags, match_whole):
def _build_mres(terminals, max_size, g_regex_flags, match_whole, re_):
# Python sets an unreasonable group limit (currently 100) in its re module
# Worse, the only way to know we reached it is by catching an AssertionError!
# This function recursively tries less and less groups until it's successful.
@@ -261,17 +261,17 @@ def _build_mres(terminals, max_size, g_regex_flags, match_whole):
mres = []
while terminals:
try:
mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
mre = re_.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
except AssertionError: # Yes, this is what Python provides us.. :/
return _build_mres(terminals, max_size//2, g_regex_flags, match_whole)
return _build_mres(terminals, max_size//2, g_regex_flags, match_whole, re_)
# terms_from_name = {t.name: t for t in terminals[:max_size]}
mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
terminals = terminals[max_size:]
return mres
def build_mres(terminals, g_regex_flags, match_whole=False):
return _build_mres(terminals, len(terminals), g_regex_flags, match_whole)
def build_mres(terminals, g_regex_flags, re_, match_whole=False):
return _build_mres(terminals, len(terminals), g_regex_flags, match_whole, re_)
def _regexp_has_newline(r):
r"""Expressions that may indicate newlines in a regexp:
@@ -294,16 +294,17 @@ class Lexer(object):
class TraditionalLexer(Lexer):
def __init__(self, terminals, ignore=(), user_callbacks={}, g_regex_flags=0):
def __init__(self, terminals, re_, ignore=(), user_callbacks={}, g_regex_flags=0):
assert all(isinstance(t, TerminalDef) for t in terminals), terminals
terminals = list(terminals)
self.re = re_
# Sanitization
for t in terminals:
try:
re.compile(t.pattern.to_regexp(), g_regex_flags)
except re.error:
self.re.compile(t.pattern.to_regexp(), g_regex_flags)
except self.re.error:
raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
if t.pattern.min_width == 0:
@@ -321,7 +322,7 @@ class TraditionalLexer(Lexer):
self.build(g_regex_flags)
def build(self, g_regex_flags=0):
terminals, self.callback = _create_unless(self.terminals, g_regex_flags)
terminals, self.callback = _create_unless(self.terminals, g_regex_flags, re_=self.re)
assert all(self.callback.values())
for type_, f in self.user_callbacks.items():
@@ -331,7 +332,7 @@ class TraditionalLexer(Lexer):
else:
self.callback[type_] = f
self.mres = build_mres(terminals, g_regex_flags)
self.mres = build_mres(terminals, g_regex_flags, self.re)
def match(self, stream, pos):
for mre, type_from_index in self.mres:
@@ -347,7 +348,8 @@ class TraditionalLexer(Lexer):
class ContextualLexer(Lexer):
def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
def __init__(self, terminals, states, re_, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
self.re = re_
tokens_by_name = {}
for t in terminals:
assert t.name not in tokens_by_name, t
@@ -362,12 +364,12 @@ class ContextualLexer(Lexer):
except KeyError:
accepts = set(accepts) | set(ignore) | set(always_accept)
state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
lexer = TraditionalLexer(state_tokens, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
lexer_by_tokens[key] = lexer
self.lexers[state] = lexer
self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
self.root_lexer = TraditionalLexer(terminals, re_=self.re, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
def lex(self, stream, get_parser_state):
parser_state = get_parser_state()
@@ -616,7 +616,7 @@ class Grammar:
_imported_grammars = {}
def import_grammar(grammar_path, base_paths=[]):
def import_grammar(grammar_path, re_, base_paths=[]):
if grammar_path not in _imported_grammars:
import_paths = base_paths + IMPORT_PATHS
for import_path in import_paths:
@@ -624,7 +624,7 @@ def import_grammar(grammar_path, base_paths=[]):
joined_path = os.path.join(import_path, grammar_path)
with open(joined_path, encoding='utf8') as f:
text = f.read()
grammar = load_grammar(text, joined_path)
grammar = load_grammar(text, joined_path, re_)
_imported_grammars[grammar_path] = grammar
break
else:
@@ -755,7 +755,8 @@ def _find_used_symbols(tree):
for t in x.scan_values(lambda t: t.type in ('RULE', 'TERMINAL'))}
class GrammarLoader:
def __init__(self):
def __init__(self, re_):
self.re = re_
terminals = [TerminalDef(name, PatternRE(value)) for name, value in TERMINALS.items()]
rules = [options_from_rule(name, None, x) for name, x in RULES.items()]
@@ -764,7 +765,7 @@ class GrammarLoader:
lexer_conf = LexerConf(terminals, ['WS', 'COMMENT'])
parser_conf = ParserConf(rules, callback, ['start'])
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf)
self.parser = LALR_TraditionalLexer(lexer_conf, parser_conf, re_)
self.canonize_tree = CanonizeTree()
@@ -862,7 +863,7 @@ class GrammarLoader:
# import grammars
for dotted_path, (base_paths, aliases) in imports.items():
grammar_path = os.path.join(*dotted_path) + EXT
g = import_grammar(grammar_path, base_paths=base_paths)
g = import_grammar(grammar_path, self.re, base_paths=base_paths)
new_td, new_rd = import_from_grammar_into_namespace(g, '__'.join(dotted_path), aliases)
term_defs += new_td
@@ -942,4 +943,5 @@ class GrammarLoader:
load_grammar = GrammarLoader().load_grammar
def load_grammar(grammar, source, re_):
return GrammarLoader(re_).load_grammar(grammar, source)
@@ -1,4 +1,3 @@
import re
from functools import partial
from .utils import get_regexp_width, Serialize
@@ -63,14 +62,16 @@ class WithLexer(_ParserFrontend):
__serialize_fields__ = 'parser', 'lexer_conf', 'start'
__serialize_namespace__ = LexerConf,
def __init__(self, lexer_conf, parser_conf, options=None):
def __init__(self, lexer_conf, parser_conf, re_, options=None):
self.lexer_conf = lexer_conf
self.start = parser_conf.start
self.postlex = lexer_conf.postlex
self.re = re_
@classmethod
def deserialize(cls, data, memo, callbacks, postlex):
def deserialize(cls, data, memo, callbacks, postlex, re_):
inst = super(WithLexer, cls).deserialize(data, memo)
inst.re = re_
inst.postlex = postlex
inst.parser = LALR_Parser.deserialize(inst.parser, memo, callbacks)
inst.init_lexer()
@@ -88,13 +89,14 @@ class WithLexer(_ParserFrontend):
return self._parse(token_stream, start)
def init_traditional_lexer(self):
self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
self.lexer = TraditionalLexer(self.lexer_conf.tokens, re_=self.re, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
class LALR_WithLexer(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
def __init__(self, lexer_conf, parser_conf, re_, options=None):
debug = options.debug if options else False
self.re = re_
self.parser = LALR_Parser(parser_conf, debug=debug)
WithLexer.__init__(self, lexer_conf, parser_conf, options)
WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
self.init_lexer()
@@ -110,6 +112,7 @@ class LALR_ContextualLexer(LALR_WithLexer):
states = {idx:list(t.keys()) for idx, t in self.parser._parse_table.states.items()}
always_accept = self.postlex.always_accept if self.postlex else ()
self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
re_=self.re,
ignore=self.lexer_conf.ignore,
always_accept=always_accept,
user_callbacks=self.lexer_conf.callbacks,
@@ -126,11 +129,11 @@ class LALR_ContextualLexer(LALR_WithLexer):
###}
class LALR_CustomLexer(LALR_WithLexer):
def __init__(self, lexer_cls, lexer_conf, parser_conf, options=None):
self.lexer = lexer_cls(lexer_conf)
def __init__(self, lexer_cls, lexer_conf, parser_conf, re_, options=None):
self.lexer = lexer_cls(lexer_conf, re_=re_)
debug = options.debug if options else False
self.parser = LALR_Parser(parser_conf, debug=debug)
WithLexer.__init__(self, lexer_conf, parser_conf, options)
WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
def tokenize_text(text):
@@ -143,8 +146,8 @@ def tokenize_text(text):
yield Token('CHAR', ch, line=line, column=i - col_start_pos)
class Earley(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, options)
def __init__(self, lexer_conf, parser_conf, re_, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
self.init_traditional_lexer()
resolve_ambiguity = options.ambiguity == 'resolve'
@@ -156,7 +159,9 @@ class Earley(WithLexer):
class XEarley(_ParserFrontend):
def __init__(self, lexer_conf, parser_conf, options=None, **kw):
def __init__(self, lexer_conf, parser_conf, re_, options=None, **kw):
self.re = re_
self.token_by_name = {t.name:t for t in lexer_conf.tokens}
self.start = parser_conf.start
@@ -188,7 +193,7 @@ class XEarley(_ParserFrontend):
if width == 0:
raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
self.regexps[t.name] = re.compile(regexp, lexer_conf.g_regex_flags)
self.regexps[t.name] = self.re.compile(regexp, lexer_conf.g_regex_flags)
def parse(self, text, start):
return self._parse(text, start)
@@ -201,8 +206,8 @@ class XEarley_CompleteLex(XEarley):
class CYK(WithLexer):
def __init__(self, lexer_conf, parser_conf, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, options)
def __init__(self, lexer_conf, parser_conf, re_, options=None):
WithLexer.__init__(self, lexer_conf, parser_conf, re_, options)
self.init_traditional_lexer()
self._analysis = GrammarAnalyzer(parser_conf)
@@ -165,16 +165,31 @@ def smart_decorator(f, create_decorator):
else:
return create_decorator(f.__func__.__call__, True)
try:
import regex
except ImportError:
regex = None
import sys, re
Py36 = (sys.version_info[:2] >= (3, 6))
import sre_parse
import sre_constants
def get_regexp_width(regexp):
categ_pattern = re.compile(r'\\p{[A-Za-z_]+}')
def get_regexp_width(expr):
if regex:
# Since `sre_parse` cannot deal with Unicode categories of the form `\p{Mn}`, we replace these with
# a simple letter, which makes no difference as we are only trying to get the possible lengths of the regex
# match here below.
regexp_final = re.sub(categ_pattern, 'A', expr)
else:
if re.search(categ_pattern, expr):
raise ImportError('`regex` module must be installed in order to use Unicode categories.', expr)
regexp_final = expr
try:
return [int(x) for x in sre_parse.parse(regexp).getwidth()]
return [int(x) for x in sre_parse.parse(regexp_final).getwidth()]
except sre_constants.error:
raise ValueError(regexp)
raise ValueError(expr)
###}
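The substitution in `get_regexp_width` can be illustrated in isolation: `sre_parse` rejects `\p{...}` escapes, so each Unicode category is swapped for a plain single character before measuring, which leaves the possible match widths unchanged. A standalone sketch, not library code:

```python
import re
import sre_parse

CATEG = re.compile(r'\\p\{[A-Za-z_]+\}')

def approx_width(expr):
    # Replace every \p{...} category with one ordinary character, then measure.
    return sre_parse.parse(CATEG.sub('A', expr)).getwidth()

print(approx_width(r'[\p{Lu}\p{Ll}_]\w*'))   # (1, <large upper bound>)
```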
@@ -182,7 +197,7 @@ def get_regexp_width(regexp):
def dedup_list(l):
"""Given a list (l) will removing duplicates from the list,
preserving the original order of the list. Assumes that
the list entrie are hashable."""
the list entries are hashable."""
dedup = set()
return [ x for x in l if not (x in dedup or dedup.add(x))]
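For reference, `dedup_list` (only its docstring typo is touched by this diff) behaves like this:

```python
from lark.utils import dedup_list

# Order-preserving de-duplication of hashable items.
assert dedup_list([3, 1, 3, 2, 1]) == [3, 1, 2]
```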
@@ -1,4 +1,7 @@
import re
try:
import regex as re
except ImportError:
import re
from setuptools import find_packages, setup
__version__ ,= re.findall('__version__ = "(.*)"', open('lark/__init__.py').read())
@@ -11,6 +14,10 @@ setup(
requires = [],
install_requires = [],
extras_require = {
"regex": ["regex"]
},
package_data = {'': ['*.md', '*.lark'], 'lark-stubs': ['*.pyi']},
test_suite = 'tests.__main__',
@@ -1 +1,2 @@
Js2Py==0.68
regex
@@ -20,6 +20,11 @@ from io import (
logging.basicConfig(level=logging.INFO)
try:
import regex
except ImportError:
regex = None
from lark.lark import Lark
from lark.exceptions import GrammarError, ParseError, UnexpectedToken, UnexpectedInput, UnexpectedCharacters
from lark.tree import Tree
@@ -548,8 +553,8 @@ class CustomLexer(Lexer):
Purpose of this custom lexer is to test the integration,
so it uses the traditionalparser as implementation without custom lexing behaviour.
"""
def __init__(self, lexer_conf):
self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
def __init__(self, lexer_conf, re_):
self.lexer = TraditionalLexer(lexer_conf.tokens, re_, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
def lex(self, *args, **kwargs):
return self.lexer.lex(*args, **kwargs)
@@ -1784,6 +1789,23 @@ def _make_parser_test(LEXER, PARSER):
self.assertEqual(a.line, 1)
self.assertEqual(b.line, 2)
@unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
def test_unicode_class(self):
"Tests that Unicode character classes from the `regex` module work correctly."
g = _Lark(r"""?start: NAME
NAME: ID_START ID_CONTINUE*
ID_START: /[\p{Lu}\p{Ll}\p{Lt}\p{Lm}\p{Lo}\p{Nl}_]+/
ID_CONTINUE: ID_START | /[\p{Mn}\p{Mc}\p{Nd}\p{Pc}]+/""", regex=True)
self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
@unittest.skipIf(not regex or sys.version_info[0] == 2, 'Unicode and Python 2 do not play nicely together.')
def test_unicode_word(self):
"Tests a pattern that triggers a persistent bug in the `re` module but works when `regex` is enabled."
g = _Lark(r"""?start: NAME
NAME: /[\w]+/
""", regex=True)
self.assertEqual(g.parse('வணக்கம்'), 'வணக்கம்')
| _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize() | |||
| _TestParser.__name__ = _NAME | |||
| @@ -14,7 +14,7 @@ pypy3 = pypy3 | |||
| [testenv] | |||
| whitelist_externals = git | |||
| deps = | |||
| -rnearley-requirements.txt | |||
| -rtest-requirements.txt | |||
| # to always force recreation and avoid unexpected side effects | |||
| recreate=True | |||