diff --git a/docs/grammar.md b/docs/grammar.md
index ad70f6e..9343ee4 100644
--- a/docs/grammar.md
+++ b/docs/grammar.md
@@ -45,6 +45,21 @@ Literals can be one of:
 * `/re with flags/imulx`
 * Literal range: `"a".."z"`, `"1".."9"`, etc.
 
+### Priority
+
+Terminals can be assigned priority only when using a lexer (future versions may support Earley's dynamic lexing).
+
+Priority can be either positive or negative. If not specified for a terminal, it's assumed to be 1 (i.e. the default).
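+
+For example, giving `AB` a priority lower than the default makes the lexer prefer the separate `A` and `B` matches (this mirrors the tests added below):
+
+```perl
+start: A B | AB
+A: "a"
+B.-20: "b"
+AB.-10: "ab"
+```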
+
 #### Notes for when using a lexer:
 
 When using a lexer (standard or contextual), it is the grammar-author's responsibility to make sure the literals don't collide, or that if they do, they are matched in the desired order. Literals are matched in an order according to the following criteria:
@@ -90,7 +105,7 @@ Each item is one of:
 * `item*` - Zero or more instances of item
 * `item+` - One or more instances of item
 * `item ~ n` - Exactly *n* instances of item
-* `item ~ n..m` - Between *n* to *m* instances of item
+* `item ~ n..m` - Between *n* and *m* instances of item (not recommended for wide ranges, due to performance issues)
 
 **Examples:**
 ```perl
@@ -102,6 +117,19 @@ expr: expr operator expr
 four_words: word ~ 4
 ```
 
+### Priority
+
+Rules can be assigned priority only when using Earley (future versions may support LALR as well).
+
+Priority can be either positive or negative. If not specified for a rule, it's assumed to be 1 (i.e. the default).
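+
+For example (a minimal sketch; rules take the same `name.priority` notation as terminals), when two alternatives match the same text, the derivation using the higher-priority rule wins the ambiguity resolution:
+
+```perl
+start: shorthand | longhand
+shorthand.2: "go"
+longhand: "go"
+```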
 
 ## Directives
 
diff --git a/docs/parsers.md b/docs/parsers.md
index 35de223..fb7c997 100644
--- a/docs/parsers.md
+++ b/docs/parsers.md
@@ -7,7 +7,7 @@ An [Earley Parser](https://www.wikiwand.com/en/Earley_parser) is a chart parser
 
 Lark's Earley implementation runs on top of a skipping chart parser, which allows it to use regular expressions, instead of matching characters one-by-one. This is a huge improvement to Earley that is unique to Lark. This feature is used by default, but can also be requested explicitely using `lexer='dynamic'`.
 
-It's possible to bypass the dynamic lexer, and use the regular Earley parser with a traditional lexer, that tokenizes as an independant first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`
+It's possible to bypass dynamic lexing and use the regular Earley parser with a traditional lexer that tokenizes as an independent first step. Doing so will provide a speed benefit, but will tokenize without using Earley's ambiguity-resolution ability. So choose this only if you know why! Activate with `lexer='standard'`
 
 **SPPF & Ambiguity resolution**
 
diff --git a/lark/__init__.py b/lark/__init__.py
index 7fd92ee..db2ce44 100644
--- a/lark/__init__.py
+++ b/lark/__init__.py
@@ -5,4 +5,4 @@ from .exceptions import ParseError, LexError, GrammarError, UnexpectedToken, Une
 from .lexer import Token
 from .lark import Lark
 
-__version__ = "0.7.1"
+__version__ = "0.7.2"
diff --git a/lark/lark.py b/lark/lark.py
index 5c43fa8..ae71d56 100644
--- a/lark/lark.py
+++ b/lark/lark.py
@@ -205,6 +205,8 @@ class Lark(Serialize):
         # Compile the EBNF grammar into BNF
         self.terminals, self.rules, self.ignore_tokens = self.grammar.compile(self.options.start)
 
+        self._terminals_dict = {t.name:t for t in self.terminals}
+
         # If the user asked to invert the priorities, negate them all here.
         # This replaces the old 'resolve__antiscore_sum' option.
         if self.options.priority == 'invert':
@@ -290,6 +292,10 @@ class Lark(Serialize):
             return self.options.postlex.process(stream)
         return stream
 
+    def get_terminal(self, name):
+        "Get information about a terminal"
+        return self._terminals_dict[name]
+
     def parse(self, text, start=None):
         """Parse the given text, according to the options provided.
 
diff --git a/lark/lexer.py b/lark/lexer.py
index 3e881f8..d3e4af6 100644
--- a/lark/lexer.py
+++ b/lark/lexer.py
@@ -41,6 +41,8 @@ class Pattern(Serialize):
 
 
 class PatternStr(Pattern):
+    type = "str"
+
     def to_regexp(self):
         return self._get_flags(re.escape(self.value))
 
@@ -50,15 +52,23 @@ class PatternStr(Pattern):
     max_width = min_width
 
 class PatternRE(Pattern):
+    type = "re"
+
     def to_regexp(self):
         return self._get_flags(self.value)
 
+    _width = None
+    def _get_width(self):
+        if self._width is None:
+            self._width = get_regexp_width(self.to_regexp())
+        return self._width
+
     @property
     def min_width(self):
-        return get_regexp_width(self.to_regexp())[0]
+        return self._get_width()[0]
     @property
     def max_width(self):
-        return get_regexp_width(self.to_regexp())[1]
+        return self._get_width()[1]
 
 
 class TerminalDef(Serialize):
@@ -88,7 +98,7 @@ class Token(Str):
 
         self.type = type_
         self.pos_in_stream = pos_in_stream
-        self.value = value
+        self.value = Str(value)
         self.line = line
         self.column = column
         self.end_line = end_line
diff --git a/lark/load_grammar.py b/lark/load_grammar.py
index f7b1011..f6c1d22 100644
--- a/lark/load_grammar.py
+++ b/lark/load_grammar.py
@@ -90,7 +90,7 @@ TERMINALS = {
     '_IGNORE': r'%ignore',
     '_DECLARE': r'%declare',
     '_IMPORT': r'%import',
-    'NUMBER': r'\d+',
+    'NUMBER': r'[+-]?\d+',
 }
 
 RULES = {
@@ -196,7 +196,7 @@ class EBNF_to_BNF(Transformer_InPlace):
                 mn = mx = int(args[0])
             else:
                 mn, mx = map(int, args)
-                if mx < mn:
+                if mx < mn or mn < 0:
                     raise GrammarError("Bad Range for %s (%d..%d isn't allowed)" % (rule, mn, mx))
             return ST('expansions', [ST('expansion', [rule] * n) for n in range(mn, mx+1)])
         assert False, op
diff --git a/lark/parsers/grammar_analysis.py b/lark/parsers/grammar_analysis.py
index 5a4d0e8..4085ea5 100644
--- a/lark/parsers/grammar_analysis.py
+++ b/lark/parsers/grammar_analysis.py
@@ -55,7 +55,7 @@ class LR0ItemSet(object):
 
 
 def update_set(set1, set2):
-    if not set2:
+    if not set2 or set1 > set2:
         return False
 
     copy = set(set1)
@@ -102,6 +102,8 @@ def calculate_sets(rules):
             if set(rule.expansion[:i]) <= NULLABLE:
                 if update_set(FIRST[rule.origin], FIRST[sym]):
                     changed = True
+            else:
+                break
 
     # Calculate FOLLOW
     changed = True
@@ -159,7 +161,7 @@ class GrammarAnalyzer(object):
 
         self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules)
 
-    def expand_rule(self, rule, rules_by_origin=None):
+    def expand_rule(self, source_rule, rules_by_origin=None):
         "Returns all init_ptrs accessible by rule (recursive)"
 
         if rules_by_origin is None:
@@ -178,13 +180,7 @@ class GrammarAnalyzer(object):
                 if not new_r.is_term:
                     yield new_r
 
-        for _ in bfs([rule], _expand_rule):
+        for _ in bfs([source_rule], _expand_rule):
             pass
 
         return fzset(init_ptrs)
-
-    def _first(self, r):
-        if r.is_term:
-            return {r}
-        else:
-            return {rp.next for rp in self.expand_rule(r) if rp.next.is_term}
diff --git a/tests/test_parser.py b/tests/test_parser.py
index 3238ead..599406f 100644
--- a/tests/test_parser.py
+++ b/tests/test_parser.py
@@ -1029,6 +1029,32 @@ def _make_parser_test(LEXER, PARSER):
 
         self.assertEqual(res.children, ['ab'])
 
+        grammar = """
+        start: A B | AB
+        A: "a"
+        B.-20: "b"
+        AB.-10: "ab"
+        """
+        l = _Lark(grammar)
+        res = l.parse("ab")
+
+        self.assertEqual(res.children, ['a', 'b'])
+
+
+        grammar = """
+        start: A B | AB
+        A.-99999999999999999999999: "a"
+        B: "b"
+        AB: "ab"
+        """
+        l = _Lark(grammar)
+        res = l.parse("ab")
+
+        self.assertEqual(res.children, ['ab'])
+
+
+
+
     def test_import(self):
         grammar = """