diff --git a/lark/grammar.py b/lark/grammar.py
index cf8cf64..d95b95c 100644
--- a/lark/grammar.py
+++ b/lark/grammar.py
@@ -1,6 +1,7 @@
 from .utils import Serialize
 
 ###{standalone
+END = '_END$'
 
 class Symbol(Serialize):
     __slots__ = ('name',)
diff --git a/lark/lexer.py b/lark/lexer.py
index ecff75f..8edd93f 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -309,7 +309,7 @@ class TraditionalLexer(Lexer):
             if t.pattern.min_width == 0:
                 raise LexError("Lexer does not allow zero-width terminals. (%s: %s)" % (t.name, t.pattern))
 
-        assert set(ignore) <= {t.name for t in terminals}
+        assert set(ignore) <= {t.name for t in terminals}, (ignore, terminals)
 
         # Init
         self.newline_types = [t.name for t in terminals if _regexp_has_newline(t.pattern.to_regexp())]
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index 051f8cd..ad61239 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -11,7 +11,7 @@
 from .lexer import Token, TerminalDef, PatternStr, PatternRE
 from .parse_tree_builder import ParseTreeBuilder
 from .parser_frontends import LALR_TraditionalLexer
 from .common import LexerConf, ParserConf
-from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol
+from .grammar import RuleOptions, Rule, Terminal, NonTerminal, Symbol, END
 from .utils import classify, suppress, dedup_list, Str
 from .exceptions import GrammarError, UnexpectedCharacters, UnexpectedToken
@@ -91,7 +91,8 @@ TERMINALS = {
     '_IGNORE': r'%ignore',
     '_DECLARE': r'%declare',
     '_IMPORT': r'%import',
     'NUMBER': r'[+-]?\d+',
+    '_END': r'\$',
 }
 
 RULES = {
@@ -123,7 +124,8 @@
     'value': ['terminal',
               'nonterminal',
               'literal',
-              'range'],
+              'range',
+              'end'],
 
     'terminal': ['TERMINAL'],
     'nonterminal': ['RULE'],
@@ -131,7 +133,8 @@
     '?name': ['RULE', 'TERMINAL'],
 
     'maybe': ['_LBRA expansions _RBRA'],
     'range': ['STRING _DOTDOT STRING'],
+    'end': ['_END'],
 
     'term': ['TERMINAL _COLON expansions _NL',
              'TERMINAL _DOT NUMBER _COLON expansions _NL'],
@@ -286,6 +289,9 @@ class CanonizeTree(Transformer_InPlace):
         tokenmods, value = args
         return tokenmods + [value]
 
+    def end(self):
+        return Token('TERMINAL', END)
+
 
 class PrepareAnonTerminals(Transformer_InPlace):
     "Create a unique list of anonymous terminals. Attempt to give meaningful names to them when we add them"
@@ -733,6 +739,7 @@ class GrammarLoader:
 
         term_defs = [td if len(td)==3 else (td[0], 1, td[1]) for td in term_defs]
         term_defs = [(name.value, (t, int(p))) for name, p, t in term_defs]
+        term_defs.append((END, (None, 0)))
         rule_defs = [options_from_rule(*x) for x in rule_defs]
 
         # Execute statements
@@ -825,7 +832,7 @@
                 raise GrammarError("Terminal '%s' defined more than once" % name)
             terminal_names.add(name)
 
-        if set(ignore_names) > terminal_names:
+        if set(ignore_names) - terminal_names:
             raise GrammarError("Terminals %s were marked to ignore but were not defined!"
                                % (set(ignore_names) - terminal_names))
         resolve_term_references(term_defs)
diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py
index 94c32cc..20c5ba1 100644
--- a/lark/parsers/grammar_analysis.py
+++ b/lark/parsers/grammar_analysis.py
@@ -2,7 +2,7 @@
 from collections import Counter, defaultdict
 
 from ..utils import bfs, fzset, classify
 from ..exceptions import GrammarError
-from ..grammar import Rule, Terminal, NonTerminal
+from ..grammar import Rule, Terminal, NonTerminal, END
 
 class RulePtr(object):
@@ -125,7 +125,7 @@ class GrammarAnalyzer(object):
     def __init__(self, parser_conf, debug=False):
         self.debug = debug
 
-        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal('$END')])
+        root_rules = {start: Rule(NonTerminal('$root_' + start), [NonTerminal(start), Terminal(END)])
                       for start in parser_conf.start}
 
         rules = parser_conf.rules + list(root_rules.values())
diff --git a/lark/parsers/lalr_analysis.py b/lark/parsers/lalr_analysis.py
index 05c1ce8..ade6163 100644
--- a/lark/parsers/lalr_analysis.py
+++ b/lark/parsers/lalr_analysis.py
@@ -13,7 +13,7 @@
 from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
 from ..exceptions import GrammarError
 
 from .grammar_analysis import GrammarAnalyzer, Terminal, LR0ItemSet
-from ..grammar import Rule
+from ..grammar import Rule, END
 
 ###{standalone
diff --git a/lark/parsers/lalr_parser.py b/lark/parsers/lalr_parser.py
index 4265ca5..11d3407 100644
--- a/lark/parsers/lalr_parser.py
+++ b/lark/parsers/lalr_parser.py
@@ -5,6 +5,7 @@
 from ..exceptions import UnexpectedToken
 from ..lexer import Token
 from ..utils import Enumerator, Serialize
+from ..grammar import END
 
 from .lalr_analysis import LALR_Analyzer, Shift, Reduce, IntParseTable
 
@@ -93,7 +94,7 @@ class _Parser:
             else:
                 reduce(arg)
 
-        token = Token.new_borrow_pos('$END', '', token) if token else Token('$END', '', 0, 1, 1)
+        token = Token.new_borrow_pos(END, None, token) if token else Token(END, None, 0, 1, 1)
         while True:
             _action, arg = get_action(token)
             assert(_action is Reduce)
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 7edfd3a..1b0a093 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1648,6 +1648,18 @@ def _make_parser_test(LEXER, PARSER):
 
         """
         parser = _Lark(grammar)
 
+    @unittest.skipIf(PARSER!='lalr', "Using the end symbol currently works for LALR only")
+    def test_end_symbol(self):
+        grammar = """
+            start: a b?
+            a: "a" $
+            b: "b"
+        """
+        parser = _Lark(grammar)
+
+        self.assertEqual(parser.parse('a'), Tree('start', [Tree('a', [])]))
+        self.assertRaises(UnexpectedInput, parser.parse, 'ab')
+
     @unittest.skipIf(PARSER!='lalr' or LEXER=='custom', "Serialize currently only works for LALR parsers without custom lexers (though it should be easy to extend)")
     def test_serialize(self):
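
A minimal usage sketch of the end symbol this patch introduces, mirroring the grammar in test_end_symbol above. It assumes a build of lark that includes this patch; per the test's skip condition, `$` is expected to work with the LALR parser only.

    from lark import Lark, UnexpectedInput

    # `$` after "a" requires rule `a` to match at the very end of the input.
    parser = Lark("""
        start: a b?
        a: "a" $
        b: "b"
    """, parser='lalr')

    print(parser.parse('a'))    # Tree('start', [Tree('a', [])]); 'a' ends the input
    try:
        parser.parse('ab')      # 'b' arrives after the declared end of input
    except UnexpectedInput:
        print('rejected, as expected')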