
Big Refactor: Grammars now build in half the time. Code shorter & cleaner.

tags/gm/2021-09-23T00Z/github.com--lark-parser-lark/0.5.2
Erez Shinan · 8 years ago · commit 1cc4c965e8
11 changed files with 104 additions and 118 deletions
  1. lark/common.py                    +1   -26
  2. lark/grammar.py                   +16  -0
  3. lark/lark.py                      +1   -0
  4. lark/lexer.py                     +1   -1
  5. lark/parse_tree_builder.py        +3   -2
  6. lark/parser_frontends.py          +33  -34
  7. lark/parsers/earley.py            +13  -11
  8. lark/parsers/grammar_analysis.py  +16  -27
  9. lark/parsers/lalr_analysis.py     +1   -1
 10. lark/parsers/lalr_parser.py       +3   -2
 11. lark/parsers/xearley.py           +16  -14
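
The commit's changes follow three threads that recur in every file below. First, rule definitions stop traveling as anonymous 4-tuples and become Rule objects, defined once in the new lark/grammar.py. Second, the Terminal, Terminal_Regexp and Terminal_Token wrapper classes are deleted from lark/common.py; each parser frontend now hands the Earley parsers a term_matcher callback instead, so the parsers no longer care how terminals are matched. Third, the end-of-input sentinel is renamed from '$end' to '$END'. A minimal before/after sketch of the rule representation (a toy rule; names as in the diff):

    # before: a rule is a positional 4-tuple that every consumer unpacks
    rule = ('sum', ['NUMBER', 'PLUS', 'NUMBER'], 'add', None)
    origin, expansion, alias, options = rule

    # after: a rule is an object with named fields
    from lark.grammar import Rule
    rule = Rule('sum', ['NUMBER', 'PLUS', 'NUMBER'], alias='add', options=None)
    print(rule.origin, rule.expansion)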

lark/common.py (+1, -26)

@@ -33,7 +33,7 @@ class UnexpectedToken(ParseError):


 def is_terminal(sym):
-    return isinstance(sym, Terminal) or sym.isupper() or sym == '$end'
+    return sym.isupper()


 class LexerConf:
@@ -44,7 +44,6 @@ class LexerConf:

 class ParserConf:
     def __init__(self, rules, callback, start):
-        assert all(len(r) == 4 for r in rules)
         self.rules = rules
         self.callback = callback
         self.start = start
@@ -108,27 +107,3 @@ class TokenDef(object):
     def __repr__(self):
         return '%s(%r, %r)' % (type(self).__name__, self.name, self.pattern)
-
-
-class Terminal:
-    def __init__(self, data):
-        self.data = data
-
-    def __repr__(self):
-        return '%r' % self.data
-
-    def __eq__(self, other):
-        return isinstance(other, type(self)) and self.data == other.data
-    def __hash__(self):
-        return hash(self.data)
-
-
-class Terminal_Regexp(Terminal):
-    def __init__(self, name, regexp):
-        Terminal.__init__(self, regexp)
-        self.name = name
-        self.match = re.compile(regexp).match
-
-class Terminal_Token(Terminal):
-    def match(self, other):
-        return self.data == other.type

lark/grammar.py (+16, -0) [new file]

@@ -0,0 +1,16 @@
+
+class Rule(object):
+    """
+        origin : a symbol
+        expansion : a list of symbols
+    """
+    def __init__(self, origin, expansion, alias=None, options=None):
+        self.origin = origin
+        self.expansion = expansion
+        self.alias = alias
+        self.options = options
+
+    def __repr__(self):
+        return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))
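
One property of the new Rule class worth noting: it defines no __eq__ or __hash__, so instances compare and hash by identity. Later hunks depend on this: GrammarAnalyzer asserts len(rules) == len(set(rules)), and the Earley parsers key their postprocess dicts by Rule objects. Two rules with identical fields therefore remain distinct entries. A quick illustration:

    r1 = Rule('a', ['B'])
    r2 = Rule('a', ['B'])
    assert r1 is not r2 and r1 != r2   # identity semantics, even with equal fields
    assert len({r1, r2}) == 2          # both survive in a set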



lark/lark.py (+1, -0)

@@ -171,6 +171,7 @@ class Lark:
             for f in dir(callback):
                 if not (f.startswith('__') and f.endswith('__')):
                     setattr(callback, f, self.profiler.make_wrapper('transformer', getattr(callback, f)))
+
         parser_conf = ParserConf(rules, callback, self.options.start)

         return self.parser_class(self.lexer_conf, parser_conf, options=self.options)


lark/lexer.py (+1, -1)

@@ -204,7 +204,7 @@ class ContextualLexer:
                 lexer = lexer_by_tokens[key]
             except KeyError:
                 accepts = set(accepts) | set(ignore) | set(always_accept)
-                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$end']
+                state_tokens = [tokens_by_name[n] for n in accepts if is_terminal(n) and n!='$END']
                 lexer = Lexer(state_tokens, ignore=ignore)
                 lexer_by_tokens[key] = lexer
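
The '$end' to '$END' rename here (and in earley.py, lalr_analysis.py and lalr_parser.py below) is load-bearing, not cosmetic: the new is_terminal() in lark/common.py is just sym.isupper(), with no special case for the end marker, so the marker must pass str.isupper() on its own. A quick check in a Python shell:

    >>> '$end'.isupper()   # old sentinel: no longer recognized as a terminal
    False
    >>> '$END'.isupper()   # new sentinel: qualifies by case alone
    True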




lark/parse_tree_builder.py (+3, -2)

@@ -1,6 +1,7 @@
 from .common import is_terminal, GrammarError
 from .utils import suppress
 from .lexer import Token
+from .grammar import Rule


 class NodeBuilder:
     def __init__(self, tree_class, name):
@@ -27,7 +28,7 @@ class Factory:

     def __call__(self, node_builder):
         return self.cls(node_builder, *self.args)


 class TokenWrapper:
     "Used for fixing the results of scanless parsing"
@@ -151,6 +152,6 @@ class ParseTreeBuilder:
                 raise GrammarError("Rule expansion '%s' already exists in rule %s" % (' '.join(expansion), origin))
             setattr(callback, callback_name, f)

-            new_rules.append(( origin, expansion, callback_name, options ))
+            new_rules.append( Rule( origin, expansion, callback_name, options ))

         return new_rules, callback

lark/parser_frontends.py (+33, -34)

@@ -3,7 +3,7 @@ import sre_parse

 from .lexer import Lexer, ContextualLexer, Token

-from .common import is_terminal, GrammarError, ParserConf, Terminal_Regexp, Terminal_Token
+from .common import is_terminal, GrammarError, ParserConf
 from .parsers import lalr_parser, earley, xearley, resolve_ambig


 class WithLexer:
@@ -70,25 +70,26 @@ def tokenize_text(text):

 class Earley_NoLex:
     def __init__(self, lexer_conf, parser_conf, options=None):
-        self.token_by_name = {t.name:t for t in lexer_conf.tokens}
-
-        rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]
+        self._prepare_match(lexer_conf)

-        self.parser = earley.Parser(rules,
+        self.parser = earley.Parser(parser_conf.rules,
                                     parser_conf.start,
                                     parser_conf.callback,
+                                    self.match,
                                     resolve_ambiguity=get_ambiguity_resolver(options))

-    def _prepare_expansion(self, expansion):
-        for sym in expansion:
-            if is_terminal(sym):
-                regexp = self.token_by_name[sym].pattern.to_regexp()
-                width = sre_parse.parse(regexp).getwidth()
-                if width != (1,1):
-                    raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (sym, regexp, width))
-                yield Terminal_Regexp(sym, regexp)
-            else:
-                yield sym
+    def match(self, term, text, index=0):
+        return self.regexps[term].match(text, index)
+
+    def _prepare_match(self, lexer_conf):
+        self.regexps = {}
+        for t in lexer_conf.tokens:
+            regexp = t.pattern.to_regexp()
+            width = sre_parse.parse(regexp).getwidth()
+            if width != (1,1):
+                raise GrammarError('Scanless parsing (lexer=None) requires all tokens to have a width of 1 (terminal %s: %s is %s)' % (t.name, regexp, width))
+            self.regexps[t.name] = re.compile(regexp)

     def parse(self, text):
         new_text = tokenize_text(text)
@@ -98,15 +99,14 @@ class Earley(WithLexer):
     def __init__(self, lexer_conf, parser_conf, options=None):
         WithLexer.__init__(self, lexer_conf)

-        rules = [(n, self._prepare_expansion(x), a, o) for n,x,a,o in parser_conf.rules]
-
-        self.parser = earley.Parser(rules,
+        self.parser = earley.Parser(parser_conf.rules,
                                     parser_conf.start,
                                     parser_conf.callback,
+                                    self.match,
                                     resolve_ambiguity=get_ambiguity_resolver(options))

-    def _prepare_expansion(self, expansion):
-        return [Terminal_Token(sym) if is_terminal(sym) else sym for sym in expansion]
+    def match(self, term, token):
+        return term == token.type

     def parse(self, text):
         tokens = self.lex(text)
@@ -117,27 +117,26 @@ class XEarley:
     def __init__(self, lexer_conf, parser_conf, options=None):
         self.token_by_name = {t.name:t for t in lexer_conf.tokens}

-        rules = [(n, list(self._prepare_expansion(x)), a, o) for n,x,a,o in parser_conf.rules]
-
-        ignore = [Terminal_Regexp(x, self.token_by_name[x].pattern.to_regexp()) for x in lexer_conf.ignore]
+        self._prepare_match(lexer_conf)

-        self.parser = xearley.Parser(rules,
+        self.parser = xearley.Parser(parser_conf.rules,
                                      parser_conf.start,
                                      parser_conf.callback,
+                                     self.match,
                                      resolve_ambiguity=get_ambiguity_resolver(options),
-                                     ignore=ignore,
+                                     ignore=lexer_conf.ignore,
                                      predict_all=options.earley__predict_all
                                      )

-    def _prepare_expansion(self, expansion):
-        for sym in expansion:
-            if is_terminal(sym):
-                regexp = self.token_by_name[sym].pattern.to_regexp()
-                width = sre_parse.parse(regexp).getwidth()
-                assert width
-                yield Terminal_Regexp(sym, regexp)
-            else:
-                yield sym
+    def match(self, term, text, index=0):
+        return self.regexps[term].match(text, index)
+
+    def _prepare_match(self, lexer_conf):
+        self.regexps = {}
+        for t in lexer_conf.tokens:
+            regexp = t.pattern.to_regexp()
+            assert sre_parse.parse(regexp).getwidth()
+            self.regexps[t.name] = re.compile(regexp)

     def parse(self, text):
         return self.parser.parse(text)
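
The shape of the change is the same in all three frontends: terminal matching moves out of the grammar symbols and is injected into the parser as a callback. Two signatures coexist: the token-based Earley matcher is match(term, token) -> bool, while the scanless matchers are match(term, text, index) -> re.Match or None; each parser only ever calls the callback it was given. A self-contained sketch of the scanless variant (hypothetical pattern table, not lark's public API):

    import re

    def make_scanless_matcher(patterns):
        # patterns: {terminal_name: regexp source}, e.g. {'DIGIT': '[0-9]'}
        regexps = {name: re.compile(p) for name, p in patterns.items()}
        def match(term, text, index=0):
            # same contract as the self.match methods in the diff above
            return regexps[term].match(text, index)
        return match

    match = make_scanless_matcher({'DIGIT': '[0-9]'})
    assert match('DIGIT', 'a1', 1)           # '1' at index 1 matches
    assert match('DIGIT', 'a1') is None      # 'a' at index 0 does not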


lark/parsers/earley.py (+13, -11)

@@ -13,13 +13,13 @@
 # Author: Erez Shinan (2017)
 # Email : erezshin@gmail.com

-from ..common import ParseError, UnexpectedToken, Terminal
+from ..common import ParseError, UnexpectedToken, is_terminal
 from ..tree import Tree, Visitor_NoRecurse, Transformer_NoRecurse
 from .grammar_analysis import GrammarAnalyzer


 class EndToken:
-    type = '$end'
+    type = '$END'

 class Derivation(Tree):
     _hash = None
@@ -135,7 +135,7 @@ class Column:
             self.completed[item_key] = item
             self.to_reduce.append(item)
         else:
-            if isinstance(item.expect, Terminal):
+            if is_terminal(item.expect):
                 self.to_scan.append(item)
             else:
                 k = item_key if self.predict_all else item
@@ -152,7 +152,7 @@ class Column:
     __nonzero__ = __bool__  # Py2 backwards-compatibility

 class Parser:
-    def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None):
+    def __init__(self, rules, start_symbol, callback, term_matcher, resolve_ambiguity=None):
         self.analysis = GrammarAnalyzer(rules, start_symbol)
         self.start_symbol = start_symbol
         self.resolve_ambiguity = resolve_ambiguity
@@ -161,12 +161,13 @@ class Parser:
         self.predictions = {}
         self.FIRST = {}
         for rule in self.analysis.rules:
-            if rule.origin != '$root':  # XXX kinda ugly
-                a = rule.alias
-                self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
-                self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
+            a = rule.alias
+            self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
+            self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]

-                self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin]
+            self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin]

+        self.term_matcher = term_matcher


     def parse(self, stream, start_symbol=None):
@@ -174,9 +175,10 @@ class Parser:
         start_symbol = start_symbol or self.start_symbol

         _Item = Item
+        match = self.term_matcher

         def predict(nonterm, column):
-            assert not isinstance(nonterm, Terminal), nonterm
+            assert not is_terminal(nonterm), nonterm
             return [_Item(rule, 0, column, None) for rule in self.predictions[nonterm]]

         def complete(item):
@@ -203,7 +205,7 @@ class Parser:
         def scan(i, token, column):
             next_set = Column(i, self.FIRST)
-            next_set.add(item.advance(token) for item in column.to_scan if item.expect.match(token))
+            next_set.add(item.advance(token) for item in column.to_scan if match(item.expect, token))

             if not next_set:
                 expect = {i.expect for i in column.to_scan}
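
Dropping the "if rule.origin != '$root'" guard (the old "XXX kinda ugly" workaround) is safe because of the companion change in grammar_analysis.py below: GrammarAnalyzer now builds the synthetic root rule separately and keeps it out of self.rules, so this loop never sees '$root' in the first place.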


lark/parsers/grammar_analysis.py (+16, -27)

@@ -1,20 +1,8 @@

 from ..utils import bfs, fzset
 from ..common import GrammarError, is_terminal
+from ..grammar import Rule

-
-class Rule(object):
-    """
-        origin : a symbol
-        expansion : a list of symbols
-    """
-    def __init__(self, origin, expansion, alias=None, options=None):
-        self.origin = origin
-        self.expansion = expansion
-        self.alias = alias
-        self.options = options
-
-    def __repr__(self):
-        return '<%s : %s>' % (self.origin, ' '.join(map(str,self.expansion)))

 class RulePtr(object):
     def __init__(self, rule, index):
@@ -106,28 +94,29 @@ def calculate_sets(rules):


 class GrammarAnalyzer(object):
-    def __init__(self, rule_tuples, start_symbol, debug=False):
+    def __init__(self, rules, start_symbol, debug=False):
+        assert len(rules) == len(set(rules))
+
         self.start_symbol = start_symbol
         self.debug = debug
-        rule_tuples = list(rule_tuples)
-        rule_tuples.append(('$root', [start_symbol, '$end']))
-        rule_tuples = [(t[0], t[1], None, None) if len(t)==2 else t for t in rule_tuples]
-
-        self.rules = set()
-        self.rules_by_origin = {o: [] for o, _x, _a, _opt in rule_tuples}
-        for origin, exp, alias, options in rule_tuples:
-            r = Rule( origin, exp, alias, options )
-            self.rules.add(r)
-            self.rules_by_origin[origin].append(r)
-
-        for r in self.rules:
+
+        root_rule = Rule('$root', [start_symbol, '$END'])
+
+        self.rules_by_origin = {r.origin: [] for r in rules}
+        for r in rules:
+            self.rules_by_origin[r.origin].append(r)
+
+        self.rules_by_origin[root_rule.origin] = [root_rule]
+
+        for r in rules:
             for sym in r.expansion:
                 if not (is_terminal(sym) or sym in self.rules_by_origin):
                     raise GrammarError("Using an undefined rule: %s" % sym)

         self.start_state = self.expand_rule('$root')
+        self.rules = rules

-        self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(self.rules)
+        self.FIRST, self.FOLLOW, self.NULLABLE = calculate_sets(rules + [root_rule])

     def expand_rule(self, rule):
         "Returns all init_ptrs accessible by rule (recursive)"


lark/parsers/lalr_analysis.py (+1, -1)

@@ -73,7 +73,7 @@ class LALR_Analyzer(GrammarAnalyzer):

             new_state = fzset(rps)
             lookahead[sym].append((Shift, new_state))
-            if sym == '$end':
+            if sym == '$END':
                 self.end_states.append( new_state )
             yield fzset(rps)




lark/parsers/lalr_parser.py (+3, -2)

@@ -13,7 +13,8 @@ class FinalReduce:

 class Parser:
     def __init__(self, parser_conf):
-        assert all(o is None or o.priority is None for n,x,a,o in parser_conf.rules), "LALR doesn't yet support prioritization"
+        assert all(r.options is None or r.options.priority is None
+                   for r in parser_conf.rules), "LALR doesn't yet support prioritization"
         self.analysis = analysis = LALR_Analyzer(parser_conf.rules, parser_conf.start)
         analysis.compute_lookahead()
         callbacks = {rule: getattr(parser_conf.callback, rule.alias or rule.origin, None)
@@ -85,7 +86,7 @@ class _Parser:
             pass

         while True:
-            _action, arg = get_action('$end')
+            _action, arg = get_action('$END')
             if _action is Shift:
                 assert arg == self.end_state
                 val ,= value_stack


lark/parsers/xearley.py (+16, -14)

@@ -20,7 +20,7 @@

 from collections import defaultdict

-from ..common import ParseError, UnexpectedToken, Terminal
+from ..common import ParseError, UnexpectedToken, is_terminal
 from ..lexer import Token, UnexpectedInput
 from ..tree import Tree
 from .grammar_analysis import GrammarAnalyzer
@@ -28,7 +28,7 @@ from .grammar_analysis import GrammarAnalyzer
 from .earley import ApplyCallbacks, Item, Column

 class Parser:
-    def __init__(self, rules, start_symbol, callback, resolve_ambiguity=None, ignore=(), predict_all=False):
+    def __init__(self, rules, start_symbol, callback, term_matcher, resolve_ambiguity=None, ignore=(), predict_all=False):
         self.analysis = GrammarAnalyzer(rules, start_symbol)
         self.start_symbol = start_symbol
         self.resolve_ambiguity = resolve_ambiguity
@@ -41,24 +41,26 @@ class Parser:
         self.FIRST = {}

         for rule in self.analysis.rules:
-            if rule.origin != '$root':  # XXX kinda ugly
-                a = rule.alias
-                self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
-                self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]
+            a = rule.alias
+            self.postprocess[rule] = a if callable(a) else (a and getattr(callback, a))
+            self.predictions[rule.origin] = [x.rule for x in self.analysis.expand_rule(rule.origin)]

-                self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin]
+            self.FIRST[rule.origin] = self.analysis.FIRST[rule.origin]

+        self.term_matcher = term_matcher


     def parse(self, stream, start_symbol=None):
         # Define parser functions
         start_symbol = start_symbol or self.start_symbol
         delayed_matches = defaultdict(list)
+        match = self.term_matcher

         text_line = 1
         text_column = 0

         def predict(nonterm, column):
-            assert not isinstance(nonterm, Terminal), nonterm
+            assert not is_terminal(nonterm), nonterm
             return [Item(rule, 0, column, None) for rule in self.predictions[nonterm]]

         def complete(item):
@@ -86,7 +88,7 @@ class Parser:
             to_scan = column.to_scan

             for x in self.ignore:
-                m = x.match(stream, i)
+                m = match(x, stream, i)
                 if m:
                     delayed_matches[m.end()] += set(to_scan)
                     delayed_matches[m.end()] += set(column.to_reduce)
@@ -99,16 +101,16 @@ class Parser:
             #     delayed_matches[m.end()] += to_scan

             for item in to_scan:
-                m = item.expect.match(stream, i)
+                m = match(item.expect, stream, i)
                 if m:
-                    t = Token(item.expect.name, m.group(0), i, text_line, text_column)
+                    t = Token(item.expect, m.group(0), i, text_line, text_column)
                     delayed_matches[m.end()].append(item.advance(t))

                     s = m.group(0)
                     for j in range(1, len(s)):
-                        m = item.expect.match(s[:-j])
+                        m = match(item.expect, s[:-j])
                         if m:
-                            t = Token(item.expect.name, m.group(0), i, text_line, text_column)
+                            t = Token(item.expect, m.group(0), i, text_line, text_column)
                             delayed_matches[i+m.end()].append(item.advance(t))

             next_set = Column(i+1, self.FIRST, predict_all=self.predict_all)
@@ -143,7 +145,7 @@ class Parser:
                       if n.rule.origin==start_symbol and n.start is column0]

         if not solutions:
-            expected_tokens = [t.expect.name for t in column.to_scan]
+            expected_tokens = [t.expect for t in column.to_scan]
             raise ParseError('Unexpected end of input! Expecting a terminal of: %s' % expected_tokens)

         elif len(solutions) == 1:
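
A side effect visible throughout this file: with the Terminal wrappers gone, item.expect is now the terminal's name (a plain string) rather than a wrapper object, which is why Token(item.expect, ...) replaces Token(item.expect.name, ...) and the end-of-input error can report item.expect directly.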

