  1. """This module builds a LALR(1) transition-table for lalr_parser.py
  2. For now, shift/reduce conflicts are automatically resolved as shifts.
  3. """

# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com

import logging
from collections import defaultdict, deque

from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
from ..exceptions import GrammarError

from .grammar_analysis import GrammarAnalyzer, Terminal, RulePtr, LR0ItemSet
from ..grammar import Rule
from . import grammar_analysis

import time

###{standalone

class Action:
    def __init__(self, name):
        self.name = name
    def __str__(self):
        return self.name
    def __repr__(self):
        return str(self)

Shift = Action('Shift')
Reduce = Action('Reduce')
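
# Module-level profiling accumulators. They are updated by generate_lr1_closure()
# below and printed every 1000 calls; they only instrument performance and have
# no effect on the generated tables.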
t_set_0 = 0
t_set_1 = 0
t_expand = 0
t_rules = 0
t_append = 0
t_z = 0
t_begin = 0
t_count = 0
t_call = 0

class ParseTable:
    def __init__(self, states, start_states, end_states):
        self.states = states
        self.start_states = start_states
        self.end_states = end_states

    def serialize(self, memo):
        tokens = Enumerator()
        rules = Enumerator()

        states = {
            state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg))
                    for token, (action, arg) in actions.items()}
            for state, actions in self.states.items()
        }

        return {
            'tokens': tokens.reversed(),
            'states': states,
            'start_states': self.start_states,
            'end_states': self.end_states,
        }
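
    # deserialize() is the inverse of serialize(): actions were encoded above as
    # (1, serialized rule) for Reduce and (0, state) for Shift, so the integer
    # tag alone tells us which Action to rebuild.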
    @classmethod
    def deserialize(cls, data, memo):
        tokens = data['tokens']
        states = {
            state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg))
                    for token, (action, arg) in actions.items()}
            for state, actions in data['states'].items()
        }
        return cls(states, data['start_states'], data['end_states'])


class IntParseTable(ParseTable):

    @classmethod
    def from_ParseTable(cls, parse_table):
        enum = list(parse_table.states)
        state_to_idx = {s:i for i,s in enumerate(enum)}
        int_states = {}

        for s, la in parse_table.states.items():
            la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v
                  for k,v in la.items()}
            int_states[ state_to_idx[s] ] = la

        start_states = {start:state_to_idx[s] for start, s in parse_table.start_states.items()}
        end_states = {start:state_to_idx[s] for start, s in parse_table.end_states.items()}
        return cls(int_states, start_states, end_states)

###}
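
# LALR_Analyzer builds the table with the classic "spontaneous / propagated
# lookahead" LALR(1) construction (as described in the Dragon Book), split into
# four passes: generate_lr0_states() builds the LR(0) item sets and transitions,
# discover_lookaheads() finds spontaneously generated lookaheads and records
# propagation links, propagate_lookaheads() iterates those links to a fixed
# point, and generate_lalr1_states() takes LR(1) closures of the resulting
# kernels and emits the shift/reduce tables.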
class LALR_Analyzer(GrammarAnalyzer):
    def generate_lr0_states(self):
        self.states = set()
        # map of kernels to LR0ItemSets
        cache = {}

        def step(state):
            _, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied)

            d = classify(unsat, lambda rp: rp.next)
            for sym, rps in d.items():
                kernel = fzset({rp.advance(sym) for rp in rps})
                new_state = cache.get(kernel, None)
                if new_state is None:
                    closure = set(kernel)
                    for rp in kernel:
                        if not rp.is_satisfied and not rp.next.is_term:
                            closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin)
                    new_state = LR0ItemSet(kernel, closure)
                    cache[kernel] = new_state

                state.transitions[sym] = new_state
                yield new_state

            self.states.add(state)

        for _ in bfs(self.lr0_start_states.values(), step):
            pass
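
    # discover_lookaheads() below implements the "determine lookaheads" step:
    # each LR(0) kernel item is closed under LR(1) items using a dummy lookahead
    # '$#'; closure items that still carry the dummy become propagation links,
    # while any concrete terminal found is recorded as a spontaneously generated
    # lookahead for the target kernel item.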
    def discover_lookaheads(self):
        # lookaheads is now a member of LR0ItemSet, so we don't need to look up a dictionary here
        # state -> rule -> set of lookaheads
        #self.lookaheads = defaultdict(lambda: defaultdict(set))
        # state -> rule -> list of (set of lookaheads) to propagate to
        #self.propagates = defaultdict(lambda: defaultdict(list))
        self.propagates = {}

        t0 = time.time()

        t = Terminal('$END')
        for s in self.lr0_start_states.values():
            for rp in s.kernel:
                #self.lookaheads[s][rp].add(Terminal('$END'))
                s.lookaheads[rp].add(t)

        t_closure = 0

        # There is a 1 to 1 correspondence between LR0 and LALR1 states.
        # We calculate the lookaheads for LALR1 kernel items from the LR0 kernel items.
        # use a terminal that does not exist in the grammar
        t = Terminal('$#')
        for s in self.states:
            p = {}
            self.propagates[s] = p

            for rp in s.kernel:
                q = []
                p[rp] = q

                t2 = time.time()
                z = self.generate_lr1_closure([rp.lookahead(t)], time.time())
                t3 = time.time()
                t_closure += t3 - t2

                #for rp2, la in self.generate_lr1_closure([(rp, t)], time.time()):
                for rp2_la in z:
                    rp2 = rp2_la.rp
                    la = rp2_la.la

                    if rp2.is_satisfied:
                        continue

                    next_symbol = rp2.next
                    next_state = s.transitions[next_symbol]
                    rp3 = rp2.advance(next_symbol)
                    assert(rp3 in next_state.kernel)

                    #x = self.lookaheads[next_state][rp3]
                    x = next_state.lookaheads[rp3]
                    if la == t:
                        # we must propagate rp's lookaheads to rp3's lookahead set
                        q.append(x)
                    else:
                        # this lookahead is "generated spontaneously" for rp3
                        x.add(la)

        t1 = time.time()
        print('Discovering took {:.3f} (generating closure), {:.3f} (total)'.format(t_closure, t1 - t0))
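
    # propagate_lookaheads() runs the propagation links recorded above to a
    # fixed point: each kernel item's lookahead set is unioned into the sets it
    # propagates to, repeating until no set grows any further.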
    def propagate_lookaheads(self):
        changed = True
        while changed:
            changed = False
            for s in self.states:
                for rp in s.kernel:
                    # from (from is a keyword)
                    #f = self.lookaheads[s][rp]
                    f = s.lookaheads[rp]
                    # to
                    t = self.propagates[s][rp]

                    for x in t:
                        old = len(x)
                        x |= f
                        changed = changed or (len(x) != old)
    def generate_lalr1_states(self):
        t0 = time.time()
        # 1 to 1 correspondence between LR0 and LALR1 states
        # We must fetch the lookaheads we calculated,
        # to create the LALR1 kernels from the LR0 kernels.
        # Then, we generate the LALR1 states by taking the LR1 closure of the new kernel items.
        # map of LR0 states to LALR1 states
        m = {}

        t_closure = 0
        z = 0

        for s in self.states:
            z = max(z, len(s.closure))

            kernel = []
            for rp in s.kernel:
                #las = self.lookaheads[s][rp]
                las = s.lookaheads[rp]
                assert(len(las) > 0)
                for la in las:
                    kernel.append(rp.lookahead(la))

            t0_0 = time.time()
            m[s] = self.generate_lr1_closure(kernel, time.time())
            t0_1 = time.time()
            t_closure += t0_1 - t0_0

        print('Generating lalr1 closure for lalr kernels took {:.3f}'.format(t_closure))
        print('Max lr0 state size was {}'.format(z))

        t1 = time.time()

        self.states = {}
        for s, v in m.items():
            actions = {}
            for la, next_state in s.transitions.items():
                actions[la] = (Shift, next_state.closure)

            sat, _ = classify_bool(v, lambda x: x.rp.is_satisfied)
            reductions = classify(sat, lambda x: x.la, lambda x: x.rp)
            for la, rps in reductions.items():
                if len(rps) > 1:
                    raise GrammarError("Collision in %s: %s" % (la, ', '.join([ str(r.rule) for r in rps ])))
                if la in actions:
                    if self.debug:
                        logging.warning("Shift/reduce conflict for terminal %s: (resolving as shift)", la.name)
                        logging.warning(' * %s', str(rps[0]))
                else:
                    actions[la] = (Reduce, rps[0].rule)

            self.states[s.closure] = {k.name: v for k, v in actions.items()}

        t2 = time.time()

        end_states = {}
        for s in self.states:
            for rp in s:
                for start in self.lr0_start_states:
                    if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied:
                        assert(not start in end_states)
                        end_states[start] = s

        t3 = time.time()

        self._parse_table = ParseTable(self.states, {start: state.closure for start, state in self.lr0_start_states.items()}, end_states)

        t4 = time.time()

        if self.debug:
            self.parse_table = self._parse_table
        else:
            self.parse_table = IntParseTable.from_ParseTable(self._parse_table)

        t5 = time.time()

        print(('Generating lalr1 states took ' + ', '.join([ '{:.3f}' ] * 5)).format(t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4))
        print('Generating firsts took {:.3f} (time actually calculating), {:.3f} (end to end), {:.3f} (just function call)'.format(grammar_analysis.t_firsts, grammar_analysis.t_xy, grammar_analysis.t_call))
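
    # generate_lr1_closure() computes the LR(1) closure of a list of
    # (RulePtr, lookahead) items: for an item A -> a.Bb with lookahead x it
    # queues every rule of B with each terminal of FIRST(b) as lookahead, plus
    # x itself when b is nullable. The t_caller argument and the globals below
    # exist only for profiling.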
    def generate_lr1_closure(self, kernel, t_caller):
        global t_call
        global t_set_0
        global t_set_1
        global t_expand
        global t_rules
        global t_append
        global t_z
        global t_begin
        global t_count

        t_start = time.time()
        t_call += t_start - t_caller

        # cache the results of this function
        # not many hits, no noticeable performance improvement
        '''
        k = fzset(kernel)
        cached = self.lr1_cache.get(k, None)
        if not cached is None:
            return cached
        '''

        closure = set()

        closure_hash = {}

        y = 0

        q = list(kernel)
        while len(q) > 0:
            t_a = time.time()
            rp_la = q.pop()
            #rp_la_hash = hash(rp_la)
            t0 = time.time()
            t_begin += t0 - t_a

            # try to manually maintain hashtable,
            # as a set of just hashes (ints) was notably faster
            '''
            if rp_la_hash in closure_hash:
                if rp_la in closure_hash[rp_la_hash]:
                    t0_0 = time.time()
                    t_set_0 += t0_0 - t0
                    continue
                t0_0 = time.time()
                t_set_0 += t0_0 - t0
            else:
                closure_hash[rp_la_hash] = []
            '''

            if rp_la in closure:
                t0_0 = time.time()
                t_set_0 += t0_0 - t0
                continue
            t0_0 = time.time()
            closure.add(rp_la)
            #closure_hash[rp_la_hash].append(rp_la)
            t1 = time.time()
            t_set_0 += t0_0 - t0
            t_set_1 += t1 - t0_0

            rp = rp_la.rp
            la = rp_la.la

            if rp.is_satisfied:
                continue
            if rp.next.is_term:
                continue

            t2 = time.time()

            # cache these calculations inside each RulePtr
            # see grammar_analysis.py:79
            l = []
            '''
            i = rp.index + 1
            n = len(rp.rule.expansion)
            l2_i = self.lr1_cache2.get((rp.rule, i), None)
            l2 = []
            if l2_i is None:
                while i < n:
                    s = rp.rule.expansion[i]
                    l2.extend(self.FIRST.get(s, []))
                    if not s in self.NULLABLE:
                        break
                    i += 1
                self.lr1_cache2[(rp.rule, i)] = (l2, i)
            else:
                l2 = l2_i[0]
                i = l2_i[1]

            l.extend(l2)
            '''

            # this function call seems really slow (see grammar_analysis.t_call above)
            # tried making it not a method call so don't need to look up vtable
            # still equally slow
            l2, nullable = rp.first(rp.index + 1, self.FIRST, self.NULLABLE, time.time())
            #l2, nullable = grammar_analysis.first(rp, rp.index + 1, self.FIRST, self.NULLABLE, time.time())
            #l.extend(l2)
            l = l2

            t3 = time.time()
            t_expand += t3 - t2

            # if we don't modify l2 and add an extra check in the loop below,
            # we don't have to copy it
            # if all of rp.rule.expansion[rp.index + 1:] were nullable:
            #if nullable:
            #    l.append(la)

            t4 = time.time()
            x = rp.next_rules_by_origin(self.lr0_rules_by_origin)
            t5 = time.time()

            # usually between 20-60? seen as high as ~175
            y = max(y, len(x) * len(l))
            #print('adding {} * {} rules to closure max {}'.format(len(x), len(l), y))

            for r in x:
                for s in l:
                    # cache RulePtr(r, 0) in r (no duplicate RulePtr objects)
                    # cache r._rp in _rp (1 less object property lookup?)
                    _rp = r._rp
                    if _rp is None:
                        _rp = RulePtr(r, 0)
                        r._rp = _rp
                    q.append(_rp.lookahead(s))
                    #q.append((r._rp, s))
                if nullable:
                    _rp = r._rp
                    if _rp is None:
                        _rp = RulePtr(r, 0)
                        r._rp = _rp
                    q.append(_rp.lookahead(la))
                    #q.append((r._rp, la))

            t6 = time.time()
            t_rules += t5 - t4
            t_append += t6 - t5

        #self.lr1_cache[k] = closure

        t_end = time.time()
        t_z += t_end - t_start
        t_count += 1
        if t_count % 1000 == 0:
            print('\tGenerating lr1 closure took begin {:.3f}, set contains {:.3f}, set add {:.3f}, get first {:.3f}'.format(t_begin, t_set_0, t_set_1, t_expand))
            print('\tget next rules {:.3f}, append rules {:.3f}, total {:.3f}, call time {:.3f}, count {}'.format(t_rules, t_append, t_z, t_call, t_count))
            print('\tmax number of appends {}'.format(y))

        return closure