From 9923987e94547ded8a17d7a03840c4cebce39188 Mon Sep 17 00:00:00 2001 From: decorator-factory <42166884+decorator-factory@users.noreply.github.com> Date: Mon, 10 Aug 2020 23:07:55 +0300 Subject: [PATCH] allow multiline regexes with 'x' (verbose) flag --- lark/load_grammar.py | 13 ++++++++++--- tests/test_parser.py | 26 ++++++++++++++++++++++++++ 2 files changed, 36 insertions(+), 3 deletions(-) diff --git a/lark/load_grammar.py b/lark/load_grammar.py index ae7ec32..d716ec1 100644 --- a/lark/load_grammar.py +++ b/lark/load_grammar.py @@ -13,7 +13,7 @@ from .parser_frontends import LALR_TraditionalLexer from .common import LexerConf, ParserConf from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol from .utils import classify, suppress, dedup_list, Str -from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken +from .exceptions import GrammarError, LarkError, UnexpectedCharacters, UnexpectedToken from .tree import Tree, SlottedTree as ST from .visitors import Transformer, Visitor, v_args, Transformer_InPlace, Transformer_NonRecursive @@ -85,7 +85,7 @@ TERMINALS = { 'RULE': '!?[_?]?[a-z][_a-z0-9]*', 'TERMINAL': '_?[A-Z][_A-Z0-9]*', 'STRING': r'"(\\"|\\\\|[^"\n])*?"i?', - 'REGEXP': r'/(?!/)(\\/|\\\\|[^/\n])*?/[%s]*' % _RE_FLAGS, + 'REGEXP': r'/(?!/)(\\/|\\\\|[^/])*?/[%s]*' % _RE_FLAGS, '_NL': r'(\r?\n)+\s*', 'WS': r'[ \t]+', 'COMMENT': r'\s*//[^\n]*', @@ -336,7 +336,7 @@ class PrepareAnonTerminals(Transformer_InPlace): term_name = None elif isinstance(p, PatternRE): - if p in self.term_reverse: # Kind of a wierd placement.name + if p in self.term_reverse: # Kind of a weird placement.name term_name = self.term_reverse[p].name else: assert False, p @@ -409,6 +409,13 @@ def _literal_to_pattern(literal): flags = v[flag_start:] assert all(f in _RE_FLAGS for f in flags), flags + if literal.type == 'STRING' and '\n' in v: + raise GrammarError('You cannot put newlines in string literals') + + if literal.type == 'REGEXP' and '\n' in v and 'x' not in flags: + raise GrammarError('You can only use newlines in regular expressions ' + 'with the `x` (verbose) flag') + v = v[:flag_start] assert v[0] == v[-1] and v[0] in '"/' x = v[1:-1] diff --git a/tests/test_parser.py b/tests/test_parser.py index cd3ea4d..48a4674 100644 --- a/tests/test_parser.py +++ b/tests/test_parser.py @@ -1262,6 +1262,32 @@ def _make_parser_test(LEXER, PARSER): tree = l.parse('aA') self.assertEqual(tree.children, ['a', 'A']) + def test_token_flags_verbose(self): + g = _Lark(r"""start: NL | ABC + ABC: / [a-z] /x + NL: /\n/ + """) + x = g.parse('a') + self.assertEqual(x.children, ['a']) + + def test_token_flags_verbose_multiline(self): + g = _Lark(r"""start: ABC + ABC: / a b c + d + e f + /x + """) + x = g.parse('abcdef') + self.assertEqual(x.children, ['abcdef']) + + def test_token_multiline_only_works_with_x_flag(self): + g = r"""start: ABC + ABC: / a b c + d + e f + /i + """ + self.assertRaises( GrammarError, _Lark, g) @unittest.skipIf(PARSER == 'cyk', "No empty rules") def test_twice_empty(self):