diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 78f8fee..750695c 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -881,7 +881,7 @@ def _grammar_parser():
         _grammar_parser.cache = ParsingFrontend(lexer_conf, parser_conf, {})
     return _grammar_parser.cache
 
-_GRAMMAR_ERRORS = [
+GRAMMAR_ERRORS = [
     ('Unclosed parenthesis', ['a: (\n']),
     ('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']),
     ('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']),
@@ -897,14 +897,14 @@ _GRAMMAR_ERRORS = [
 
 def _parse_grammar(text, name, start='start'):
     try:
-        return _grammar_parser().parse(text + '\n', start)
+        return PrepareGrammar().transform(_grammar_parser().parse(text + '\n', start))
     except UnexpectedCharacters as e:
         context = e.get_context(text)
         raise GrammarError("Unexpected input at line %d column %d in %s: \n\n%s" %
                            (e.line, e.column, name, context))
     except UnexpectedToken as e:
         context = e.get_context(text)
-        error = e.match_examples(_grammar_parser().parse, _GRAMMAR_ERRORS, use_accepts=True)
+        error = e.match_examples(_grammar_parser().parse, GRAMMAR_ERRORS, use_accepts=True)
         if error:
             raise GrammarError("%s, at line %s column %s\n\n%s" % (error, e.line, e.column, context))
         elif 'STRING' in e.expected:
@@ -913,19 +913,6 @@
 
 
 class GrammarLoader:
-    ERRORS = [
-        ('Unclosed parenthesis', ['a: (\n']),
-        ('Unmatched closing parenthesis', ['a: )\n', 'a: [)\n', 'a: (]\n']),
-        ('Expecting rule or terminal definition (missing colon)', ['a\n', 'A\n', 'a->\n', 'A->\n', 'a A\n']),
-        ('Illegal name for rules or terminals', ['Aa:\n']),
-        ('Alias expects lowercase name', ['a: -> "a"\n']),
-        ('Unexpected colon', ['a::\n', 'a: b:\n', 'a: B:\n', 'a: "a":\n']),
-        ('Misplaced operator', ['a: b??', 'a: b(?)', 'a:+\n', 'a:?\n', 'a:*\n', 'a:|*\n']),
-        ('Expecting option ("|") or a new rule or terminal definition', ['a:a\n()\n']),
-        ('Terminal names cannot contain dots', ['A.B\n']),
-        ('%import expects a name', ['%import "a"\n']),
-        ('%ignore expects a value', ['%ignore %import\n']),
-    ]
 
     def __init__(self, global_keep_all_tokens=False):
         self.global_keep_all_tokens = global_keep_all_tokens
@@ -959,7 +946,7 @@ class GrammarLoader:
         """Parse grammar_text, verify, and create Grammar object.
         Display nice messages on error."""
         tree = _parse_grammar(grammar_text+'\n', grammar_name)
-        tree = PrepareGrammar().transform(tree)
+        # tree = PrepareGrammar().transform(tree)
 
         # Extract grammar items
         defs = classify(tree.children, lambda c: c.data, lambda c: c.children)
@@ -1167,7 +1154,7 @@ class GrammarLoader:
             for sym in _find_used_symbols(expansions):
                 if sym.type == 'TERMINAL':
                     if sym not in terminal_names:
-                        raise GrammarError("Token '%s' used but not defined (in rule %s)" % (sym, name))
+                        raise GrammarError("Terminal '%s' used but not defined (in rule %s)" % (sym, name))
                 else:
                     if sym not in rule_names and sym not in params:
                         raise GrammarError("Rule '%s' used but not defined (in rule %s)" % (sym, name))
@@ -1180,52 +1167,62 @@ class GrammarBuilder:
         self.global_keep_all_tokens = global_keep_all_tokens
         self.import_paths = import_paths or []
 
-        self._term_defs = {}
-        self._rule_defs = {}
+        self._definitions = {}
+        self._extend = {}
+        self._override = {}
         self._ignore_names = []
+        self._import_set = {}
+
+    def _is_term(self, name):
+        return name.isupper()
 
-    def define_term(self, name, exp, priority=1, override=False):
-        if (name in self._term_defs) ^ override:
-            if override:
-                raise GrammarError("Cannot override a nonexisting terminal" % name)
-            else:
-                raise GrammarError("Terminal '%s' defined more than once" % name)
-        if name.startswith('__'):
-            raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name)
-        self._term_defs[name] = (exp, priority)
+    def _grammar_error(self, msg, name):
+        low_type = ("rule", "terminal")[self._is_term(name)]
+        up_type = low_type.title()
+        raise GrammarError(msg.format(name=name, type=low_type, Type=up_type))
 
-    def define_rule(self, name, params, exp, options, override=False):
-        if (name in self._rule_defs) ^ override:
+    def _check_options(self, name, options):
+        if self._is_term(name):
+            if options is None:
+                options = 1
+            elif not isinstance(options, int):
priority)") + else: + if options is None: + options = RuleOptions() + elif not isinstance(options, RuleOptions): + raise GrammarError("Rules require a RuleOptions instance as 'options'") + if self.global_keep_all_tokens: + options.keep_all_tokens = True + return options + + + def define(self, name, exp, params=(), options=None, override=False): + if (name in self._definitions) ^ override: if override: - raise GrammarError("Cannot override a nonexisting rule: %s" % name) + self._grammer_error("Cannot override a nonexisting {type} {name}", name) else: - raise GrammarError("Rule '%s' defined more than once" % name) + self._grammer_error("{Type} '{name}' defined more than once", name) if name.startswith('__'): - raise GrammarError('Names starting with double-underscore are reserved (Error at %s)' % name) - self._rule_defs[name] = (params, exp, options) - - def extend_term(self, name, exp, priority=1): - if name not in self._term_defs: - raise GrammarError("Can't extend terminal %s as it wasn't defined before" % name) - old_expansions = self._term_defs[name][0] - extend_expansions(old_expansions, exp) - - def extend_rule(self, name, params, exp, options): - if name not in self._rule_defs: - raise GrammarError("Can't extend rule %s as it wasn't defined before" % name) - if params != self._rule_defs[name][0]: - raise GrammarError("Cannot extend templates with different parameters: %s" % name) - # TODO: think about what to do with RuleOptions - old_expansions = self._rule_defs[name][1] + self._grammer_error('Names starting with double-underscore are reserved (Error at {name})', name) + self._definitions[name] = (params, exp, self._check_options(name, options)) + + def extend(self, name, exp, params=(), options=None): + if name not in self._definitions: + self._grammer_error("Can't extend {type} {name} as it wasn't defined before", name) + if tuple(params) != tuple(self._definitions[name][0]): + print(params, self._definitions[name][0]) + self._grammer_error("Cannot extend {type} with different parameters: {name}", name) + # TODO: think about what to do with 'options' + old_expansions = self._definitions[name][1] extend_expansions(old_expansions, exp) - + def ignore(self, exp_or_name): if isinstance(exp_or_name, str): self._ignore_names.append(exp_or_name) else: assert isinstance(exp_or_name, Tree) t = exp_or_name - if t.data=='expansions' and len(t.children) == 1: + if t.data == 'expansions' and len(t.children) == 1: t2 ,= t.children if t2.data=='expansion' and len(t2.children) == 1: item ,= t2.children @@ -1237,62 +1235,179 @@ class GrammarBuilder: name = '__IGNORE_%d'% len(self._ignore_names) self._ignore_names.append(name) - self._term_defs[name] = (t, 1) + self._definitions[name] = ((), t, 1) def declare(self, *names): for name in names: - self.define_term(name, None, None) + self.define(name, None) + # TODO: options/priority gets filled by this. 
+            # TODO: options/priority gets filled by this. We have to make sure that this doesn't break anything
+
+    def _mangle_exp(self, exp, mangle):
+        if mangle is None:
+            return exp
+        exp = deepcopy(exp) # TODO: is this needed
+        for t in exp.iter_subtrees():
+            for i, c in enumerate(t.children):
+                if isinstance(c, Token) and c.type in ('RULE', 'TERMINAL'):
+                    t.children[i] = Token(c.type, mangle(c.value))
+        return exp
+
 
-    def _unpack_term_def(self, tree):
-        name = tree.children[0].value
-        exp = tree.children[-1]
-        p = int(tree.children[1]) if len(tree.children) == 3 else 1
-        return name, exp, p
+    def _unpack_definition(self, tree, mangle):
+        if tree.data == 'rule':
+            name, params, exp, opts = options_from_rule(*tree.children)
+        else:
+            name = tree.children[0].value
+            params = ()
+            opts = int(tree.children[1]) if len(tree.children) == 3 else 1  # priority
+            exp = tree.children[-1]
+        if mangle is not None:
+            params = tuple(mangle(p) for p in params)
+            name = mangle(name)
+        exp = self._mangle_exp(exp, mangle)
+        return name, exp, params, opts
 
-    def _unpack_rule_def(self, tree):
-        # FIXME: A little pointless at the moment, but I want to rework this (e.g. move the work from `options_from_rule` to here)
-        r = options_from_rule(*tree.children)
-        return r
+    def _unpack_import(self, stmt, grammar_name):
+        if len(stmt.children) > 1:
+            path_node, arg1 = stmt.children
+        else:
+            path_node, = stmt.children
+            arg1 = None
+
+        if isinstance(arg1, Tree):  # Multi import
+            dotted_path = tuple(path_node.children)
+            names = arg1.children
+            aliases = dict(zip(names, names))  # Can't have aliased multi import, so all aliases will be the same as names
+        else:  # Single import
+            dotted_path = tuple(path_node.children[:-1])
+            name = path_node.children[-1]  # Get name from dotted path
+            aliases = {name.value: (arg1 or name).value}  # Aliases if exist
+
+        if path_node.data == 'import_lib':  # Import from library
+            base_path = None
+        else:  # Relative import
+            if grammar_name == '':  # Import relative to script file path if grammar is coded in script
+                try:
+                    base_file = os.path.abspath(sys.modules['__main__'].__file__)
+                except AttributeError:
+                    base_file = None
+            else:
+                base_file = grammar_name  # Import relative to grammar file path if external grammar file
+            if base_file:
+                if isinstance(base_file, PackageResource):
+                    base_path = PackageResource(base_file.pkg_name, os.path.split(base_file.path)[0])
+                else:
+                    base_path = os.path.split(base_file)[0]
+            else:
+                base_path = os.path.abspath(os.path.curdir)
+
+        return dotted_path, base_path, aliases
 
-    def load_grammar(self, grammar_text, grammar_source=""):
-        tree = _parse_grammar(grammar_text, grammar_source)
+    def load_grammar(self, grammar_text, grammar_name="", mangle=None):
+        tree = _parse_grammar(grammar_text, grammar_name)
+
+        imports = {}  # imports are collected over the whole file to prevent duplications
+        actions = []  # Some statements need to be delayed (override and extend) till after imports are handled
         for stmt in tree.children:
-            if stmt.data == 'term':
-                self.define_term(*self._unpack_term_def(stmt))
-                continue
-            elif stmt.data == 'rule':
-                self.define_rule(*self._unpack_rule_def(stmt))
+            if stmt.data in ('term', 'rule'):
+                self.define(*self._unpack_definition(stmt, mangle))
                 continue
             assert stmt.data == 'statement', stmt.data
             stmt ,= stmt.children
-            if stmt.data == 'ignore':
-                self.ignore(*stmt.children)
+            if stmt.data == 'import':
+                dotted_path, base_path, aliases = self._unpack_import(stmt, grammar_name)
+                try:
+                    import_base_path, import_aliases = imports[dotted_path]
+                    assert base_path == import_base_path, 'Inconsistent base_path for %s.' % '.'.join(dotted_path)
+                    import_aliases.update(aliases)
+                except KeyError:
+                    imports[dotted_path] = base_path, aliases
+            elif stmt.data == 'ignore':
+                # if mangle is not None, we shouldn't apply ignore, since we aren't in a toplevel grammar
+                if mangle is None:
+                    self.ignore(*stmt.children)
             elif stmt.data == 'declare':
-                self.declare(*(t.value for t in stmt.children))
+                if mangle is None:
+                    self.declare(*(t.value for t in stmt.children))
+                else:
+                    self.declare(*(mangle(t.value) for t in stmt.children))
             elif stmt.data == 'override':
                 r ,= stmt.children
-                if r.data == 'rule':
-                    self.define_rule(*self._unpack_rule_def(r), override=True)
-                else:
-                    assert r.data == 'term'
-                    self.define_term(*self._unpack_term_def(r), override=True)
+                actions.append((self.define, self._unpack_definition(r, mangle) + (True,)))
             elif stmt.data == 'extend':
                 r ,= stmt.children
-                if r.data == 'rule':
-                    self.extend_rule(*self._unpack_rule_def(r))
-                else:
-                    assert r.data == 'term'
-                    self.extend_term(*self._unpack_term_def(r))
+                actions.append((self.extend, self._unpack_definition(r, mangle)))
             else:
                 assert False, stmt
+
+        for dotted_path, (base_path, aliases) in imports.items():
+            self.do_import(dotted_path, base_path, aliases, mangle)
+
+        for f, args in actions:
+            f(*args)
+
+    def do_import(self, dotted_path, base_path, aliases, base_mangle=None):
+        mangle = self.get_mangle('__'.join(dotted_path), aliases, base_mangle)
+        grammar_path = os.path.join(*dotted_path) + EXT
+        to_try = self.import_paths + ([base_path] if base_path is not None else []) + [stdlib_loader]
+        for source in to_try:
+            try:
+                if callable(source):
+                    joined_path, text = source(base_path, grammar_path)
+                else:
+                    joined_path = os.path.join(source, grammar_path)
+                    with open(joined_path, encoding='utf8') as f:
+                        text = f.read()
+            except IOError:
+                continue
+            else:
+                self.load_grammar(text, joined_path, mangle)
+                break
+        else:
+            # Search failed. Make Python throw a nice error.
+            open(grammar_path, encoding='utf8')
+            assert False, "Couldn't import grammar %s, but a corresponding file was found at a place where lark doesn't search for it" % (dotted_path,)
+
+    def get_mangle(self, prefix, aliases, base_mangle=None):
+        prefixes = (prefix, prefix.upper())
+        def mangle(s):
+            if s in aliases:
+                s = aliases[s]
+            else:
+                ns = prefixes[self._is_term(s)]
+                if s[0] == '_':
+                    s = '_%s__%s' % (ns, s[1:])
+                else:
+                    s = '%s__%s' % (ns, s)
+            if base_mangle is not None:
+                s = base_mangle(s)
+            return s
+        return mangle
 
     def check(self):
-        pass
-
-    def build(self) -> Grammar:
-        return Grammar([(n, *r) for n, r in self._rule_defs.items()],
-                       [(n, t) for n, t in self._term_defs],
-                       self._ignore_names)
+        for name, (params, exp, options) in self._definitions.items():
+            if self._is_term(name):
+                assert isinstance(options, int)
+            if exp is not None:
+                for sym in _find_used_symbols(exp):
+                    if sym not in self._definitions and sym not in params:
+                        self._grammar_error("{Type} '{name}' used but not defined (in rule %s)" % name, sym.value)
+        if not set(self._definitions).issuperset(self._ignore_names):
+            raise GrammarError("Terminals %s were marked to ignore but were not defined!" % (set(self._ignore_names) - set(self._definitions)))
+
+    def build(self):
+        self.check()
+        rule_defs = []
+        term_defs = []
+        for name, (params, exp, options) in self._definitions.items():
+            if self._is_term(name):
+                term_defs.append((name, (exp, options)))
+            else:
+                rule_defs.append((name, params, exp, options))
+        resolve_term_references(term_defs)
+        return Grammar(rule_defs, term_defs, self._ignore_names)
 
 
 def load_grammar(grammar, source, import_paths, global_keep_all_tokens):
-    return GrammarLoader(global_keep_all_tokens).load_grammar(grammar, source, import_paths)
+    builder = GrammarBuilder(global_keep_all_tokens, import_paths)
+    builder.load_grammar(grammar, source)
+    return builder.build()
+    # return GrammarLoader(global_keep_all_tokens).load_grammar(grammar, source, import_paths)
diff --git a/tests/test_grammar.py b/tests/test_grammar.py
index ad29c75..221fbc0 100644
--- a/tests/test_grammar.py
+++ b/tests/test_grammar.py
@@ -4,7 +4,7 @@ import sys
 from unittest import TestCase, main
 
 from lark import Lark, Token, Tree
-from lark.load_grammar import GrammarLoader, GrammarError
+from lark.load_grammar import GrammarError, GRAMMAR_ERRORS
 
 
 class TestGrammar(TestCase):
@@ -12,7 +12,7 @@ class TestGrammar(TestCase):
         pass
 
     def test_errors(self):
-        for msg, examples in GrammarLoader.ERRORS:
+        for msg, examples in GRAMMAR_ERRORS:
             for example in examples:
                 try:
                     p = Lark(example)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 9b011f7..9be348e 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1961,7 +1961,7 @@ def _make_parser_test(LEXER, PARSER):
 
             p = _Lark(grammar, import_paths=[custom_loader])
             self.assertEqual(p.parse('ab'),
-                             Tree('start', [Tree('startab', [Tree('ab__expr', [Token('ab__A', 'a'), Token('ab__B', 'b')])])]))
+                             Tree('start', [Tree('startab', [Tree('ab__expr', [Token('AB__A', 'a'), Token('AB__B', 'b')])])]))
 
             grammar = """
             start: rule_to_import
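
The patch collapses GrammarLoader's separate term/rule bookkeeping into a single GrammarBuilder that keeps every definition in `self._definitions` as (params, exp, options) and only splits them back into rule and terminal lists inside `build()`. A minimal sketch of driving the builder directly, assuming this patch is applied (GrammarBuilder is an internal API; the public entry point remains `lark.Lark`, which reaches it through the module-level `load_grammar()`):

    from lark.load_grammar import GrammarBuilder

    builder = GrammarBuilder()
    # define() routes on case: WORD is all-uppercase, so _is_term() treats it
    # as a terminal and _check_options() defaults its options to priority 1,
    # while `start` gets a default RuleOptions instance.
    builder.load_grammar(r"""
    start: WORD+
    WORD: /\w+/
    %ignore " "
    """)
    grammar = builder.build()  # runs check(), then splits _definitions into rule/term defs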
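The name mangling introduced by get_mangle() is also why the expected tree in test_parser.py changes from `ab__A` to `AB__A`: imported rules are prefixed with the lowercase namespace, while imported terminals now get the uppercase namespace, so terminal names stay all-uppercase and _is_term() keeps classifying them correctly. A standalone restatement of that logic (illustrative only; it ignores the base_mangle chaining that get_mangle() uses for nested imports):

    def make_mangle(prefix, aliases):
        prefixes = (prefix, prefix.upper())    # index 0: rules, index 1: terminals
        def mangle(s):
            if s in aliases:
                return aliases[s]              # explicitly imported names keep their alias
            ns = prefixes[s.isupper()]         # same test as GrammarBuilder._is_term()
            if s.startswith('_'):
                return '_%s__%s' % (ns, s[1:]) # keep the leading underscore out front
            return '%s__%s' % (ns, s)
        return mangle

    mangle = make_mangle('ab', {})
    assert mangle('expr') == 'ab__expr'   # rule: lowercase namespace
    assert mangle('A') == 'AB__A'         # terminal: uppercase namespace (new in this patch)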
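The new load_grammar() also changes statement ordering: %import statements are collected into `imports` (merging duplicate imports of the same dotted path), and %override/%extend are queued in `actions`, which run only after do_import() has loaded every imported grammar. That batching keeps overriding an imported definition working even though imports themselves are now resolved at the end of the file. A sketch of the pattern this enables, using a hypothetical imported grammar `.ab` that defines `startab` and a terminal A (the file name and contents are illustrative, not part of this patch):

    // main.lark -- the %override below is queued in `actions` and applied only
    // after the %import has been resolved, so A already exists when it runs.
    %import .ab (startab, A)
    %override A: "A"
    start: startab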