  1. """This module builds a LALR(1) transition-table for lalr_parser.py
  2. For now, shift/reduce conflicts are automatically resolved as shifts.
  3. """

# Author: Erez Shinan (2017)
# Email : erezshin@gmail.com

import logging
from collections import defaultdict, deque

from ..utils import classify, classify_bool, bfs, fzset, Serialize, Enumerator
from ..exceptions import GrammarError

from .grammar_analysis import GrammarAnalyzer, Terminal, RulePtr, LR0ItemSet
from ..grammar import Rule
from . import grammar_analysis

import time

###{standalone

class Action:
    def __init__(self, name):
        self.name = name
    def __str__(self):
        return self.name
    def __repr__(self):
        return str(self)

Shift = Action('Shift')
Reduce = Action('Reduce')
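
# Module-level profiling accumulators. They are updated by generate_lr1_closure()
# below and printed every 1000 calls; they only instrument performance and have
# no effect on the generated tables.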
t_set_0 = 0
t_set_1 = 0
t_expand = 0
t_rules = 0
t_append = 0
t_z = 0
t_begin = 0
t_count = 0
t_call = 0

class ParseTable:
    def __init__(self, states, start_states, end_states):
        self.states = states
        self.start_states = start_states
        self.end_states = end_states

    def serialize(self, memo):
        tokens = Enumerator()
        rules = Enumerator()

        states = {
            state: {tokens.get(token): ((1, arg.serialize(memo)) if action is Reduce else (0, arg))
                    for token, (action, arg) in actions.items()}
            for state, actions in self.states.items()
        }

        return {
            'tokens': tokens.reversed(),
            'states': states,
            'start_states': self.start_states,
            'end_states': self.end_states,
        }
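
    # deserialize() is the inverse of serialize(): actions were encoded above as
    # (1, serialized rule) for Reduce and (0, state) for Shift, so the integer
    # tag alone tells us which Action to rebuild.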
    @classmethod
    def deserialize(cls, data, memo):
        tokens = data['tokens']
        states = {
            state: {tokens[token]: ((Reduce, Rule.deserialize(arg, memo)) if action==1 else (Shift, arg))
                    for token, (action, arg) in actions.items()}
            for state, actions in data['states'].items()
        }
        return cls(states, data['start_states'], data['end_states'])


class IntParseTable(ParseTable):

    @classmethod
    def from_ParseTable(cls, parse_table):
        enum = list(parse_table.states)
        state_to_idx = {s:i for i,s in enumerate(enum)}
        int_states = {}

        for s, la in parse_table.states.items():
            la = {k:(v[0], state_to_idx[v[1]]) if v[0] is Shift else v
                  for k,v in la.items()}
            int_states[ state_to_idx[s] ] = la

        start_states = {start:state_to_idx[s] for start, s in parse_table.start_states.items()}
        end_states = {start:state_to_idx[s] for start, s in parse_table.end_states.items()}
        return cls(int_states, start_states, end_states)

###}
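
# LALR_Analyzer builds the table with the classic "spontaneous / propagated
# lookahead" LALR(1) construction (as described in the Dragon Book), split into
# four passes: generate_lr0_states() builds the LR(0) item sets and transitions,
# discover_lookaheads() finds spontaneously generated lookaheads and records
# propagation links, propagate_lookaheads() iterates those links to a fixed
# point, and generate_lalr1_states() takes LR(1) closures of the resulting
# kernels and emits the shift/reduce tables.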
class LALR_Analyzer(GrammarAnalyzer):
    def generate_lr0_states(self):
        self.states = set()
        # map of kernels to LR0ItemSets
        cache = {}

        def step(state):
            _, unsat = classify_bool(state.closure, lambda rp: rp.is_satisfied)

            d = classify(unsat, lambda rp: rp.next)
            for sym, rps in d.items():
                kernel = fzset({rp.advance(sym) for rp in rps})
                new_state = cache.get(kernel, None)
                if new_state is None:
                    closure = set(kernel)
                    for rp in kernel:
                        if not rp.is_satisfied and not rp.next.is_term:
                            closure |= self.expand_rule(rp.next, self.lr0_rules_by_origin)
                    new_state = LR0ItemSet(kernel, closure)
                    cache[kernel] = new_state

                state.transitions[sym] = new_state
                yield new_state

            self.states.add(state)

        for _ in bfs(self.lr0_start_states.values(), step):
            pass
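
    # discover_lookaheads() below implements the "determine lookaheads" step:
    # each LR(0) kernel item is closed under LR(1) items using a dummy lookahead
    # '$#'; closure items that still carry the dummy become propagation links,
    # while any concrete terminal found is recorded as a spontaneously generated
    # lookahead for the target kernel item.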
    def discover_lookaheads(self):
        # lookaheads is now a member of LR0ItemSet, so we don't need to look up a dictionary here
        # state -> rule -> set of lookaheads
        #self.lookaheads = defaultdict(lambda: defaultdict(set))
        # state -> rule -> list of (set of lookaheads) to propagate to
        #self.propagates = defaultdict(lambda: defaultdict(list))
        self.propagates = {}

        t0 = time.time()

        t = Terminal('$END')
        for s in self.lr0_start_states.values():
            for rp in s.kernel:
                #self.lookaheads[s][rp].add(Terminal('$END'))
                s.lookaheads[rp].add(t)

        t_closure = 0

        # There is a 1 to 1 correspondence between LR0 and LALR1 states.
        # We calculate the lookaheads for LALR1 kernel items from the LR0 kernel items.
        # use a terminal that does not exist in the grammar
        t = Terminal('$#')
        for s in self.states:
            p = {}
            self.propagates[s] = p

            for rp in s.kernel:
                q = []
                p[rp] = q

                t2 = time.time()
                z = self.generate_lr1_closure([rp.lookahead(t)], time.time())
                t3 = time.time()
                t_closure += t3 - t2

                #for rp2, la in self.generate_lr1_closure([(rp, t)], time.time()):
                for rp2_la in z:
                    rp2 = rp2_la.rp
                    la = rp2_la.la

                    if rp2.is_satisfied:
                        continue

                    next_symbol = rp2.next
                    next_state = s.transitions[next_symbol]
                    rp3 = rp2.advance(next_symbol)
                    assert(rp3 in next_state.kernel)

                    #x = self.lookaheads[next_state][rp3]
                    x = next_state.lookaheads[rp3]
                    if la == t:
                        # we must propagate rp's lookaheads to rp3's lookahead set
                        q.append(x)
                    else:
                        # this lookahead is "generated spontaneously" for rp3
                        x.add(la)

        t1 = time.time()
        print('Discovering took {:.3f} (generating closure), {:.3f} (total)'.format(t_closure, t1 - t0))
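
    # propagate_lookaheads() runs the propagation links recorded above to a
    # fixed point: each kernel item's lookahead set is unioned into the sets it
    # propagates to, repeating until no set grows any further.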
    def propagate_lookaheads(self):
        changed = True
        while changed:
            changed = False
            for s in self.states:
                for rp in s.kernel:
                    # from (from is a keyword)
                    #f = self.lookaheads[s][rp]
                    f = s.lookaheads[rp]
                    # to
                    t = self.propagates[s][rp]

                    for x in t:
                        old = len(x)
                        x |= f
                        changed = changed or (len(x) != old)
    def generate_lalr1_states(self):
        t0 = time.time()
        # 1 to 1 correspondence between LR0 and LALR1 states
        # We must fetch the lookaheads we calculated,
        # to create the LALR1 kernels from the LR0 kernels.
        # Then, we generate the LALR1 states by taking the LR1 closure of the new kernel items.
        # map of LR0 states to LALR1 states
        m = {}

        t_closure = 0
        z = 0

        for s in self.states:
            z = max(z, len(s.closure))

            kernel = []
            for rp in s.kernel:
                #las = self.lookaheads[s][rp]
                las = s.lookaheads[rp]
                assert(len(las) > 0)
                for la in las:
                    kernel.append(rp.lookahead(la))

            t0_0 = time.time()
            m[s] = self.generate_lr1_closure(kernel, time.time())
            t0_1 = time.time()
            t_closure += t0_1 - t0_0

        print('Generating lalr1 closure for lalr kernels took {:.3f}'.format(t_closure))
        print('Max lr0 state size was {}'.format(z))

        t1 = time.time()

        self.states = {}
        for s, v in m.items():
            actions = {}
            for la, next_state in s.transitions.items():
                actions[la] = (Shift, next_state.closure)

            sat, _ = classify_bool(v, lambda x: x.rp.is_satisfied)
            reductions = classify(sat, lambda x: x.la, lambda x: x.rp)
            for la, rps in reductions.items():
                if len(rps) > 1:
                    raise GrammarError("Collision in %s: %s" % (la, ', '.join([ str(r.rule) for r in rps ])))
                if la in actions:
                    if self.debug:
                        logging.warning("Shift/reduce conflict for terminal %s: (resolving as shift)", la.name)
                        logging.warning(' * %s', str(rps[0]))
                else:
                    actions[la] = (Reduce, rps[0].rule)

            self.states[s.closure] = {k.name: v for k, v in actions.items()}

        t2 = time.time()

        end_states = {}
        for s in self.states:
            for rp in s:
                for start in self.lr0_start_states:
                    if rp.rule.origin.name == ('$root_' + start) and rp.is_satisfied:
                        assert(not start in end_states)
                        end_states[start] = s

        t3 = time.time()

        self._parse_table = ParseTable(self.states, {start: state.closure for start, state in self.lr0_start_states.items()}, end_states)

        t4 = time.time()

        if self.debug:
            self.parse_table = self._parse_table
        else:
            self.parse_table = IntParseTable.from_ParseTable(self._parse_table)

        t5 = time.time()

        print(('Generating lalr1 states took ' + ', '.join([ '{:.3f}' ] * 5)).format(t1 - t0, t2 - t1, t3 - t2, t4 - t3, t5 - t4))
        print('Generating firsts took {:.3f} (time actually calculating), {:.3f} (end to end), {:.3f} (just function call)'.format(grammar_analysis.t_firsts, grammar_analysis.t_xy, grammar_analysis.t_call))
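
    # generate_lr1_closure() computes the LR(1) closure of a list of
    # (RulePtr, lookahead) items: for an item A -> a.Bb with lookahead x it
    # queues every rule of B with each terminal of FIRST(b) as lookahead, plus
    # x itself when b is nullable. The t_caller argument and the globals below
    # exist only for profiling.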
    def generate_lr1_closure(self, kernel, t_caller):
        global t_call
        global t_set_0
        global t_set_1
        global t_expand
        global t_rules
        global t_append
        global t_z
        global t_begin
        global t_count

        t_start = time.time()
        t_call += t_start - t_caller

        # cache the results of this function
        # not many hits, no noticeable performance improvement
        '''
        k = fzset(kernel)
        cached = self.lr1_cache.get(k, None)
        if not cached is None:
            return cached
        '''

        closure = set()

        closure_hash = {}

        y = 0

        q = list(kernel)
        while len(q) > 0:
            t_a = time.time()
            rp_la = q.pop()
            #rp_la_hash = hash(rp_la)
            t0 = time.time()
            t_begin += t0 - t_a

            # try to manually maintain hashtable,
            # as a set of just hashes (ints) was notably faster
            '''
            if rp_la_hash in closure_hash:
                if rp_la in closure_hash[rp_la_hash]:
                    t0_0 = time.time()
                    t_set_0 += t0_0 - t0
                    continue
                t0_0 = time.time()
                t_set_0 += t0_0 - t0
            else:
                closure_hash[rp_la_hash] = []
            '''

            if rp_la in closure:
                t0_0 = time.time()
                t_set_0 += t0_0 - t0
                continue
            t0_0 = time.time()
            closure.add(rp_la)
            #closure_hash[rp_la_hash].append(rp_la)
            t1 = time.time()
            t_set_0 += t0_0 - t0
            t_set_1 += t1 - t0_0

            rp = rp_la.rp
            la = rp_la.la

            if rp.is_satisfied:
                continue
            if rp.next.is_term:
                continue

            t2 = time.time()

            # cache these calculations inside each RulePtr
            # see grammar_analysis.py:79
            l = []
            '''
            i = rp.index + 1
            n = len(rp.rule.expansion)
            l2_i = self.lr1_cache2.get((rp.rule, i), None)
            l2 = []
            if l2_i is None:
                while i < n:
                    s = rp.rule.expansion[i]
                    l2.extend(self.FIRST.get(s, []))
                    if not s in self.NULLABLE:
                        break
                    i += 1
                self.lr1_cache2[(rp.rule, i)] = (l2, i)
            else:
                l2 = l2_i[0]
                i = l2_i[1]

            l.extend(l2)
            '''

            # this function call seems really slow (see grammar_analysis.t_call above)
            # tried making it not a method call so don't need to look up vtable
            # still equally slow
            l2, nullable = rp.first(rp.index + 1, self.FIRST, self.NULLABLE, time.time())
            #l2, nullable = grammar_analysis.first(rp, rp.index + 1, self.FIRST, self.NULLABLE, time.time())
            #l.extend(l2)
            l = l2

            t3 = time.time()
            t_expand += t3 - t2

            # if we don't modify l2 and add an extra check in the loop below,
            # we don't have to copy it
            # if all of rp.rule.expansion[rp.index + 1:] were nullable:
            #if nullable:
            #    l.append(la)

            t4 = time.time()
            x = rp.next_rules_by_origin(self.lr0_rules_by_origin)
            t5 = time.time()

            # usually between 20-60? seen as high as ~175
            y = max(y, len(x) * len(l))
            #print('adding {} * {} rules to closure max {}'.format(len(x), len(l), y))

            for r in x:
                for s in l:
                    # cache RulePtr(r, 0) in r (no duplicate RulePtr objects)
                    # cache r._rp in _rp (1 less object property lookup?)
                    _rp = r._rp
                    if _rp is None:
                        _rp = RulePtr(r, 0)
                        r._rp = _rp
                    q.append(_rp.lookahead(s))
                    #q.append((r._rp, s))
                if nullable:
                    _rp = r._rp
                    if _rp is None:
                        _rp = RulePtr(r, 0)
                        r._rp = _rp
                    q.append(_rp.lookahead(la))
                    #q.append((r._rp, la))

            t6 = time.time()
            t_rules += t5 - t4
            t_append += t6 - t5

        #self.lr1_cache[k] = closure

        t_end = time.time()
        t_z += t_end - t_start
        t_count += 1
        if t_count % 1000 == 0:
            print('\tGenerating lr1 closure took begin {:.3f}, set contains {:.3f}, set add {:.3f}, get first {:.3f}'.format(t_begin, t_set_0, t_set_1, t_expand))
            print('\tget next rules {:.3f}, append rules {:.3f}, total {:.3f}, call time {:.3f}, count {}'.format(t_rules, t_append, t_z, t_call, t_count))
            print('\tmax number of appends {}'.format(y))

        return closure