diff --git a/docs/classes.md b/docs/classes.md
index fd9ee3d..e29443c 100644
--- a/docs/classes.md
+++ b/docs/classes.md
@@ -8,41 +8,86 @@
 This page details the important classes in Lark.
 
 The Lark class is the main interface for the library. It's mostly a thin wrapper for the many different parsers, and for the tree constructor.
 
-#### \_\_init\_\_(self, grammar, **options)
+#### \_\_init\_\_(self, grammar_string, **options)
 
-The Lark class accepts a grammar string or file object, and keyword options:
+Creates an instance of Lark with the given grammar
 
-* **start** - A list of the rules in the grammar that begin the parse (Default: `["start"]`)
+#### open(cls, grammar_filename, rel_to=None, **options)
 
-* **parser** - Decides which parser engine to use, "earley", "lalr" or "cyk". (Default: `"earley"`)
+Creates an instance of Lark with the grammar given by its filename
 
-* **lexer** - Overrides default lexer, depending on parser.
+If rel_to is provided, the function will find the grammar filename in relation to it.
 
-* **transformer** - Applies the provided transformer instead of building a parse tree (only allowed with parser="lalr")
+Example:
 
-* **postlex** - Lexer post-processing (Default: `None`. only works when lexer is "standard" or "contextual")
+```python
+    >>> Lark.open("grammar_file.lark", rel_to=__file__, parser="lalr")
+    Lark(...)
+```
 
-* **ambiguity** (only relevant for earley and cyk)
+#### parse(self, text)
 
- * "explicit" - Return all derivations inside an "_ambig" data node.
+Return a complete parse tree for the text (of type Tree)
 
- * "resolve" - Let the parser choose the best derivation (greedy for tokens, non-greedy for rules. Default)
+If a transformer is supplied to `__init__`, returns whatever is the result of the transformation.
 
-* **debug** - Display warnings (such as Shift-Reduce warnings for LALR)
-* **keep_all_tokens** - Don't throw away any terminals from the tree (Default=`False`)
+#### save(self, f) / load(cls, f)
 
-* **propagate_positions** - Propagate line/column count to tree nodes, at the cost of performance (default=`False`)
+Useful for caching and multiprocessing.
 
-* **maybe_placeholders** - When True, the `[]` operator returns `None` when not matched. When `False`, `[]` behaves like the `?` operator, and return no value at all, which may be a little faster (default=`False`)
+`save` saves the instance into the given file object
 
-* **lexer_callbacks** - A dictionary of callbacks of type f(Token) -> Token, used to interface with the lexer Token generation. Only works with the standard and contextual lexers. See [Recipes](recipes.md) for more information.
+`load` loads an instance from the given file object
 
-#### parse(self, text)
+####
 
-Return a complete parse tree for the text (of type Tree)
-If a transformer is supplied to `__init__`, returns whatever is the result of the transformation.
+### Lark Options
 
+#### General options
+
+**start** - The start symbol. Either a string, or a list of strings for multiple possible starts (Default: "start")
+
+**debug** - Display debug information, such as warnings (default: False)
+
+**transformer** - Applies the transformer to every parse tree (equivalent to applying it after the parse, but faster)
+
+**propagate_positions** - Propagates (line, column, end_line, end_column) attributes into all tree branches.
+
+**maybe_placeholders** -
+- When True, the `[]` operator returns `None` when not matched.
+- When `False`, `[]` behaves like the `?` operator, and returns no value at all.
+- (default=`False`. Recommended to set to `True`)
+
+**g_regex_flags** - Flags that are applied to all terminals (both regex and strings)
+
+**keep_all_tokens** - Prevent the tree builder from automagically removing "punctuation" tokens (default: False)
+
+**cache_grammar** - Cache the Lark grammar (Default: False)
+
+#### Algorithm
+
+**parser** - Decides which parser engine to use, "earley" or "lalr". (Default: "earley")
+         (there is also a "cyk" option for legacy)
+
+**lexer** - Decides whether or not to use a lexer stage
+
+- "auto" (default): Choose for me based on the parser
+- "standard": Use a standard lexer
+- "contextual": Stronger lexer (only works with parser="lalr")
+- "dynamic": Flexible and powerful (only with parser="earley")
+- "dynamic_complete": Same as dynamic, but tries *every* variation of tokenizing possible. (only with parser="earley")
+
+**ambiguity** - Decides how to handle ambiguity in the parse. Only relevant if parser="earley"
+- "resolve": The parser will automatically choose the simplest derivation (it chooses consistently: greedy for tokens, non-greedy for rules)
+- "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).
+
+#### Domain Specific
+
+- **postlex** - Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
+- **priority** - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
+- **lexer_callbacks** - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
+- **edit_terminals** - A callback for editing the terminals before parse.
 
 ----
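Editor's aside, not part of the patch: a minimal sketch of the `maybe_placeholders` behaviour documented above, using a made-up one-rule grammar.

```python
from lark import Lark

grammar = """
    start: "a" [B] "c"
    B: "b"
"""

# With placeholders, an unmatched [B] still occupies a slot in the tree, as None.
print(Lark(grammar, maybe_placeholders=True).parse("ac").children)    # -> [None]

# Without placeholders, [B] behaves like B? and the slot simply disappears.
print(Lark(grammar, maybe_placeholders=False).parse("ac").children)   # -> []
```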
diff --git a/lark/common.py b/lark/common.py
index 7103d14..c44f9ce 100644
--- a/lark/common.py
+++ b/lark/common.py
@@ -4,14 +4,15 @@ from .lexer import TerminalDef
 
 ###{standalone
 
 class LexerConf(Serialize):
-    __serialize_fields__ = 'tokens', 'ignore'
+    __serialize_fields__ = 'tokens', 'ignore', 'g_regex_flags'
     __serialize_namespace__ = TerminalDef,
 
-    def __init__(self, tokens, ignore=(), postlex=None, callbacks=None):
+    def __init__(self, tokens, ignore=(), postlex=None, callbacks=None, g_regex_flags=0):
         self.tokens = tokens
         self.ignore = ignore
         self.postlex = postlex
         self.callbacks = callbacks or {}
+        self.g_regex_flags = g_regex_flags
 
     def _deserialize(self):
         self.callbacks = {} # TODO
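Another aside, not part of the patch: a small sketch of the `ambiguity="explicit"` option documented above (and in the docstring changes below), assuming a deliberately ambiguous toy grammar.

```python
from lark import Lark

parser = Lark("""
    start: ab
    ab: "a" "b" | a b
    a: "a"
    b: "b"
    """, ambiguity="explicit")     # parser="earley" is the default

# Both derivations of "ab" are kept, wrapped in an "_ambig" tree node.
print(parser.parse("ab").pretty())
```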
diff --git a/lark/lark.py b/lark/lark.py
index 906a412..302b526 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -22,32 +22,56 @@ class LarkOptions(Serialize):
     """
     OPTIONS_DOC = """
-        parser - Decides which parser engine to use, "earley" or "lalr". (Default: "earley")
-                 Note: "lalr" requires a lexer
-
-        lexer - Decides whether or not to use a lexer stage
-            "standard": Use a standard lexer
-            "contextual": Stronger lexer (only works with parser="lalr")
-            "dynamic": Flexible and powerful (only with parser="earley")
-            "dynamic_complete": Same as dynamic, but tries *every* variation
-                                of tokenizing possible. (only with parser="earley")
-            "auto" (default): Choose for me based on grammar and parser
-
-        ambiguity - Decides how to handle ambiguity in the parse. Only relevant if parser="earley"
-            "resolve": The parser will automatically choose the simplest derivation
-                       (it chooses consistently: greedy for tokens, non-greedy for rules)
-            "explicit": The parser will return all derivations wrapped in "_ambig" tree nodes (i.e. a forest).
-
-        transformer - Applies the transformer to every parse tree
-        debug - Affects verbosity (default: False)
-        keep_all_tokens - Don't automagically remove "punctuation" tokens (default: False)
-        cache_grammar - Cache the Lark grammar (Default: False)
-        postlex - Lexer post-processing (Default: None) Only works with the standard and contextual lexers.
-        start - The start symbol, either a string, or a list of strings for multiple possible starts (Default: "start")
-        priority - How priorities should be evaluated - auto, none, normal, invert (Default: auto)
-        propagate_positions - Propagates [line, column, end_line, end_column] attributes into all tree branches.
-        lexer_callbacks - Dictionary of callbacks for the lexer. May alter tokens during lexing. Use with caution.
-        maybe_placeholders - Experimental feature. Instead of omitting optional rules (i.e. rule?), replace them with None
+# General
+
+    start - The start symbol. Either a string, or a list of strings for
+            multiple possible starts (Default: "start")
+    debug - Display debug information, such as warnings (default: False)
+    transformer - Applies the transformer to every parse tree (equivalent to
+                  applying it after the parse, but faster)
+    propagate_positions - Propagates (line, column, end_line, end_column)
+                          attributes into all tree branches.
+    maybe_placeholders - When True, the `[]` operator returns `None` when not matched.
+                         When `False`, `[]` behaves like the `?` operator,
+                         and returns no value at all.
+                         (default=`False`. Recommended to set to `True`)
+    cache_grammar - Cache the Lark grammar (Default: False)
+    g_regex_flags - Flags that are applied to all terminals
+                    (both regex and strings)
+    keep_all_tokens - Prevent the tree builder from automagically
+                      removing "punctuation" tokens (default: False)
+
+# Algorithm
+
+    parser - Decides which parser engine to use
+             Accepts "earley" or "lalr". (Default: "earley")
+             (there is also a "cyk" option for legacy)
+
+    lexer - Decides whether or not to use a lexer stage
+        "auto" (default): Choose for me based on the parser
+        "standard": Use a standard lexer
+        "contextual": Stronger lexer (only works with parser="lalr")
+        "dynamic": Flexible and powerful (only with parser="earley")
+        "dynamic_complete": Same as dynamic, but tries *every* variation
+                            of tokenizing possible.
+
+    ambiguity - Decides how to handle ambiguity in the parse.
+                Only relevant if parser="earley"
+        "resolve": The parser will automatically choose the simplest
+                   derivation (it chooses consistently: greedy for
+                   tokens, non-greedy for rules)
+        "explicit": The parser will return all derivations wrapped
+                    in "_ambig" tree nodes (i.e. a forest).
+
+# Domain Specific
+
+    postlex - Lexer post-processing (Default: None) Only works with the
+              standard and contextual lexers.
+    priority - How priorities should be evaluated - auto, none, normal,
+               invert (Default: auto)
+    lexer_callbacks - Dictionary of callbacks for the lexer. May alter
+                      tokens during lexing. Use with caution.
+    edit_terminals - A callback for editing the terminals before parse.
     """
     if __doc__:
         __doc__ += OPTIONS_DOC
 
@@ -68,6 +92,7 @@ class LarkOptions(Serialize):
         'lexer_callbacks': {},
         'maybe_placeholders': False,
         'edit_terminals': None,
+        'g_regex_flags': 0,
     }
 
     def __init__(self, options_dict):
@@ -209,7 +234,7 @@ class Lark(Serialize):
                 if hasattr(t, term.name):
                     lexer_callbacks[term.name] = getattr(t, term.name)
 
-        self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks)
+        self.lexer_conf = LexerConf(self.terminals, self.ignore_tokens, self.options.postlex, lexer_callbacks, self.options.g_regex_flags)
 
         if self.options.parser:
             self.parser = self._build_parser()
@@ -217,12 +242,12 @@ class Lark(Serialize):
             self.lexer = self._build_lexer()
 
         if __init__.__doc__:
-            __init__.__doc__ += "\nOPTIONS:" + LarkOptions.OPTIONS_DOC
+            __init__.__doc__ += "\nOptions:\n" + LarkOptions.OPTIONS_DOC
 
     __serialize_fields__ = 'parser', 'rules', 'options'
 
     def _build_lexer(self):
-        return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
+        return TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
 
     def _prepare_callbacks(self):
         self.parser_class = get_frontend(self.options.parser, self.options.lexer)
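To make the lexer changes below easier to follow, here is a toy sketch (illustrative only, with made-up terminal names and patterns) of the mechanism they extend: `build_mres` joins all terminals into one alternation of named groups, so passing the shared flags to that single `re.compile` call applies them to every terminal at once.

```python
import re

# Hypothetical terminals, for illustration only.
terminals = {"HELLO": r"hello", "NAME": r"[a-z]+"}
g_regex_flags = re.IGNORECASE   # one value shared by the whole grammar

# One alternation of named groups, compiled with the shared flags --
# roughly the shape of what build_mres() does with its new g_regex_flags argument.
mre = re.compile("|".join("(?P<%s>%s)" % (name, pat) for name, pat in terminals.items()),
                 g_regex_flags)

m = mre.match("Hello")
print(m.lastgroup, m.group(0))   # -> HELLO Hello
```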
diff --git a/lark/lexer.py b/lark/lexer.py
index ecff75f..32bfe78 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -230,7 +230,7 @@ class CallChain:
 
 
 
-def _create_unless(terminals):
+def _create_unless(terminals, g_regex_flags):
     tokens_by_type = classify(terminals, lambda t: type(t.pattern))
     assert len(tokens_by_type) <= 2, tokens_by_type.keys()
     embedded_strs = set()
@@ -241,19 +241,19 @@ def _create_unless(terminals):
             if strtok.priority > retok.priority:
                 continue
             s = strtok.pattern.value
-            m = re.match(retok.pattern.to_regexp(), s)
+            m = re.match(retok.pattern.to_regexp(), s, g_regex_flags)
             if m and m.group(0) == s:
                 unless.append(strtok)
                 if strtok.pattern.flags <= retok.pattern.flags:
                     embedded_strs.add(strtok)
         if unless:
-            callback[retok.name] = UnlessCallback(build_mres(unless, match_whole=True))
+            callback[retok.name] = UnlessCallback(build_mres(unless, g_regex_flags, match_whole=True))
 
     terminals = [t for t in terminals if t not in embedded_strs]
     return terminals, callback
 
 
-def _build_mres(terminals, max_size, match_whole):
+def _build_mres(terminals, max_size, g_regex_flags, match_whole):
     # Python sets an unreasonable group limit (currently 100) in its re module
     # Worse, the only way to know we reached it is by catching an AssertionError!
     # This function recursively tries less and less groups until it's successful.
@@ -261,17 +261,17 @@ def _build_mres(terminals, max_size, match_whole):
     mres = []
     while terminals:
         try:
-            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]))
+            mre = re.compile(u'|'.join(u'(?P<%s>%s)'%(t.name, t.pattern.to_regexp()+postfix) for t in terminals[:max_size]), g_regex_flags)
         except AssertionError:  # Yes, this is what Python provides us.. :/
-            return _build_mres(terminals, max_size//2, match_whole)
+            return _build_mres(terminals, max_size//2, g_regex_flags, match_whole)
 
         # terms_from_name = {t.name: t for t in terminals[:max_size]}
         mres.append((mre, {i:n for n,i in mre.groupindex.items()} ))
         terminals = terminals[max_size:]
     return mres
 
-def build_mres(terminals, match_whole=False):
-    return _build_mres(terminals, len(terminals), match_whole)
+def build_mres(terminals, g_regex_flags, match_whole=False):
+    return _build_mres(terminals, len(terminals), g_regex_flags, match_whole)
 
 def _regexp_has_newline(r):
     r"""Expressions that may indicate newlines in a regexp:
@@ -294,7 +294,7 @@ class Lexer(object):
 
 class TraditionalLexer(Lexer):
 
-    def __init__(self, terminals, ignore=(), user_callbacks={}):
+    def __init__(self, terminals, ignore=(), user_callbacks={}, g_regex_flags=0):
         assert all(isinstance(t, TerminalDef) for t in terminals), terminals
 
         terminals = list(terminals)
@@ -302,7 +302,7 @@ class TraditionalLexer(Lexer):
         # Sanitization
         for t in terminals:
             try:
-                re.compile(t.pattern.to_regexp())
+                re.compile(t.pattern.to_regexp(), g_regex_flags)
             except re.error:
                 raise LexError("Cannot compile token %s: %s" % (t.name, t.pattern))
 
@@ -318,10 +318,10 @@ class TraditionalLexer(Lexer):
         terminals.sort(key=lambda x:(-x.priority, -x.pattern.max_width, -len(x.pattern.value), x.name))
         self.terminals = terminals
         self.user_callbacks = user_callbacks
-        self.build()
+        self.build(g_regex_flags)
 
-    def build(self):
-        terminals, self.callback = _create_unless(self.terminals)
+    def build(self, g_regex_flags=0):
+        terminals, self.callback = _create_unless(self.terminals, g_regex_flags)
         assert all(self.callback.values())
 
         for type_, f in self.user_callbacks.items():
@@ -331,7 +331,7 @@ class TraditionalLexer(Lexer):
             else:
                 self.callback[type_] = f
 
-        self.mres = build_mres(terminals)
+        self.mres = build_mres(terminals, g_regex_flags)
 
     def match(self, stream, pos):
         for mre, type_from_index in self.mres:
@@ -347,7 +347,7 @@ class TraditionalLexer(Lexer):
 
 class ContextualLexer(Lexer):
 
-    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}):
+    def __init__(self, terminals, states, ignore=(), always_accept=(), user_callbacks={}, g_regex_flags=0):
         tokens_by_name = {}
         for t in terminals:
             assert t.name not in tokens_by_name, t
@@ -362,12 +362,12 @@ class ContextualLexer(Lexer):
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
                 state_tokens = [tokens_by_name[n] for n in accepts if n and n in tokens_by_name]
-                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks)
+                lexer = TraditionalLexer(state_tokens, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
                 lexer_by_tokens[key] = lexer
 
             self.lexers[state] = lexer
 
-        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks)
+        self.root_lexer = TraditionalLexer(terminals, ignore=ignore, user_callbacks=user_callbacks, g_regex_flags=g_regex_flags)
 
     def lex(self, stream, get_parser_state):
         parser_state = get_parser_state()
diff --git a/lark/parser_frontends.py b/lark/parser_frontends.py
index 8b42772..d68d186 100644
--- a/lark/parser_frontends.py
+++ b/lark/parser_frontends.py
@@ -88,7 +88,7 @@ class WithLexer(_ParserFrontend):
         return self._parse(token_stream, start)
 
     def init_traditional_lexer(self):
-        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks)
+        self.lexer = TraditionalLexer(self.lexer_conf.tokens, ignore=self.lexer_conf.ignore, user_callbacks=self.lexer_conf.callbacks, g_regex_flags=self.lexer_conf.g_regex_flags)
 
 class LALR_WithLexer(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
@@ -112,7 +112,8 @@ class LALR_ContextualLexer(LALR_WithLexer):
         self.lexer = ContextualLexer(self.lexer_conf.tokens, states,
                                      ignore=self.lexer_conf.ignore,
                                      always_accept=always_accept,
-                                     user_callbacks=self.lexer_conf.callbacks)
+                                     user_callbacks=self.lexer_conf.callbacks,
+                                     g_regex_flags=self.lexer_conf.g_regex_flags)
 
 
     def parse(self, text, start=None):
@@ -187,7 +188,7 @@ class XEarley(_ParserFrontend):
                 if width == 0:
                     raise ValueError("Dynamic Earley doesn't allow zero-width regexps", t)
 
-            self.regexps[t.name] = re.compile(regexp)
+            self.regexps[t.name] = re.compile(regexp, lexer_conf.g_regex_flags)
 
     def parse(self, text, start):
         return self._parse(text, start)
diff --git a/lark_stubs/lark.pyi b/lark_stubs/lark.pyi
index b256036..76a6a54 100644
--- a/lark_stubs/lark.pyi
+++ b/lark_stubs/lark.pyi
@@ -34,6 +34,7 @@ class LarkOptions:
     maybe_placeholders: bool
     lexer_callbacks: Dict[str, Callable[[Token], Token]]
     cache_grammar: bool
+    g_regex_flags: int
 
 
 class Lark:
@@ -56,7 +57,8 @@ class Lark:
         keep_all_tokens: bool = False,
         propagate_positions: bool = False,
         maybe_placeholders: bool = False,
-        lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None
+        lexer_callbacks: Optional[Dict[str, Callable[[Token], Token]]] = None,
+        g_regex_flags: int = ...
     ):
         ...
 
diff --git a/lark_stubs/lexer.pyi b/lark_stubs/lexer.pyi
index 12f4e0a..a43b754 100644
--- a/lark_stubs/lexer.pyi
+++ b/lark_stubs/lexer.pyi
@@ -112,7 +112,8 @@ class TraditionalLexer(Lexer):
         self,
         terminals: Collection[TerminalDef],
         ignore: Collection[str] = ...,
-        user_callbacks: Dict[str, _Callback] = ...
+        user_callbacks: Dict[str, _Callback] = ...,
+        g_regex_flags: int = ...
     ):
         ...
 
@@ -136,7 +137,8 @@ class ContextualLexer(Lexer):
         states: Dict[str, Collection[str]],
         ignore: Collection[str] = ...,
         always_accept: Collection[str] = ...,
-        user_callbacks: Dict[str, _Callback] = ...
+        user_callbacks: Dict[str, _Callback] = ...,
+        g_regex_flags: int = ...
     ):
         ...
 
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 9a98f54..2a64c77 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 from __future__ import absolute_import
 
+import re
 import unittest
 import logging
 import os
@@ -538,7 +539,7 @@ class CustomLexer(Lexer):
        so it uses the traditionalparser as implementation without custom lexing behaviour.
     """
     def __init__(self, lexer_conf):
-        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks)
+        self.lexer = TraditionalLexer(lexer_conf.tokens, ignore=lexer_conf.ignore, user_callbacks=lexer_conf.callbacks, g_regex_flags=lexer_conf.g_regex_flags)
     def lex(self, *args, **kwargs):
         return self.lexer.lex(*args, **kwargs)
 
@@ -845,7 +846,16 @@ def _make_parser_test(LEXER, PARSER):
             x = g.parse("starts")
             self.assertSequenceEqual(x.children, ['starts'])
-
+
+        def test_g_regex_flags(self):
+            g = _Lark("""
+                    start: "a" /b+/ C
+                    C: "C" | D
+                    D: "D" E
+                    E: "e"
+                    """, g_regex_flags=re.I)
+            x1 = g.parse("ABBc")
+            x2 = g.parse("abdE")
 
     # def test_string_priority(self):
     #     g = _Lark("""start: (A | /a?bb/)+
@@ -1715,6 +1725,7 @@ def _make_parser_test(LEXER, PARSER):
 
     _NAME = "Test" + PARSER.capitalize() + LEXER.capitalize()
     _TestParser.__name__ = _NAME
+    _TestParser.__qualname__ = "tests.test_parser." + _NAME
     globals()[_NAME] = _TestParser
 
 # Note: You still have to import them in __main__ for the tests to run
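For reference, a rough usage sketch of what the new test exercises (the grammar is taken from the test above): with `g_regex_flags=re.I`, both the literal strings and the regexps in the grammar match case-insensitively.

```python
import re
from lark import Lark

parser = Lark("""
    start: "a" /b+/ C
    C: "C" | D
    D: "D" E
    E: "e"
    """, g_regex_flags=re.I)

print(parser.parse("ABBc").pretty())   # "a" and /b+/ match despite the upper case
print(parser.parse("abdE").pretty())   # C matches "dE" through D and E
```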