/*
** LOOP: Loop Optimizations.
** Copyright (C) 2005-2014 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_loop_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT

#include "lj_err.h"
#include "lj_str.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_trace.h"
#include "lj_snap.h"
#include "lj_vm.h"

/* Loop optimization:
**
** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
** of a loop into invariant and variant instructions. The invariant
** instructions are hoisted out of the loop and only the variant
** instructions remain inside the loop body.
**
** Unfortunately LICM is mostly useless for compiling dynamic languages.
** The IR has many guards and most of the subsequent instructions are
** control-dependent on them. The first non-hoistable guard would
** effectively prevent hoisting of all subsequent instructions.
**
** That's why we use a special form of unrolling using copy-substitution,
** combined with redundancy elimination:
**
** The recorded instruction stream is re-emitted to the compiler pipeline
** with substituted operands. The substitution table is filled with the
** refs returned by re-emitting each instruction. This can be done
** on-the-fly, because the IR is in strict SSA form, where every ref is
** defined before its use.
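**
** A minimal sketch of that substitution step (simplified from the real
** loop in loop_unroll() below, which also handles the invariant shortcut
** and the PHI bookkeeping; constant refs are never substituted):
**
**   for (ins = REF_FIRST; ins < invar; ins++) {
**     op1 = irref_isk(IR(ins)->op1) ? IR(ins)->op1 : subst[IR(ins)->op1];
**     op2 = irref_isk(IR(ins)->op2) ? IR(ins)->op2 : subst[IR(ins)->op2];
**     subst[ins] = tref_ref(emitir(IR(ins)->ot, op1, op2));
**   }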
**
** This approach generates two code sections, separated by the LOOP
** instruction:
**
** 1. The recorded instructions form a kind of pre-roll for the loop. It
**    contains a mix of invariant and variant instructions and performs
**    exactly one loop iteration (but not necessarily the 1st iteration).
**
** 2. The loop body contains only the variant instructions and performs
**    all remaining loop iterations.
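**
** A hypothetical example: for a Lua loop like 'for i=1,n do s=s+t[i] end'
** the trace roughly takes this shape (much simplified, not actual IR):
**
**   pre-roll:  type guards on 's', 't' and 'n', load of the array part
**              of 't', plus one full iteration (bounds check, ALOAD, ADD)
**   LOOP
**   body:      only the variant part (bounds check, ALOAD, ADD), with
**              operands replaced by the pre-roll results
**   PHI(i), PHI(s):  the loop-carried values (see below)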
**
** At first sight this looks like a waste of space, because the variant
** instructions are present twice. But the key insight is that the
** pre-roll honors the control-dependencies for *both* the pre-roll itself
** *and* the loop body!
**
** It also means one doesn't have to explicitly model control-dependencies
** (which, BTW, wouldn't help LICM much). And it's much easier to
** integrate sparse snapshotting with this approach.
**
** One of the nicest aspects of this approach is that all of the
** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
** reused with only minor restrictions (e.g. one should not fold
** instructions across loop-carried dependencies).
**
** In general, any optimization that only needs to look backwards into the
** generated instruction stream can be applied. At any point during the
** copy-substitution process this stream contains both a static loop
** iteration (the pre-roll) and a dynamic one (from the to-be-copied
** instruction up to the end of the partial loop body).
**
** Since control-dependencies are implicitly kept, CSE also applies to all
** kinds of guards. The major advantage is that all invariant guards can
** be hoisted, too.
**
** Load/store forwarding works across loop iterations, too. This is
** important if loop-carried dependencies are kept in upvalues or tables.
** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
** become a forwarded loop-recurrence after inlining.
**
** Since the IR is in SSA form, loop-carried dependencies have to be
** modeled with PHI instructions. The potential candidates for PHIs are
** collected on-the-fly during copy-substitution. After eliminating the
** redundant ones, PHI instructions are emitted *below* the loop body.
**
** Note that this departure from traditional SSA form doesn't change the
** semantics of the PHI instructions themselves. But it greatly simplifies
** on-the-fly generation of the IR and the machine code.
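**
** Schematic example (not actual dump output) for a loop-carried counter:
**
**   0009  int ADD  0008  +1     <- counter increment in the pre-roll
**   ....  LOOP
**   0013  int ADD  0009  +1     <- copy-substituted increment in the body
**   ....  int PHI  0009  0013   <- left: pre-roll def, right: body def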
*/

/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref)		(&J->cur.ir[(ref)])

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

/* Emit raw IR without passing through optimizations. */
#define emitir_raw(ot, a, b)	(lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))
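
/* Note: loop_unroll() below re-emits the copied instructions with emitir()
** so FOLD/CSE can eliminate the invariant ones, but emits LOOP and PHI with
** emitir_raw(), since those must never be folded or commoned up.
*/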

/* -- PHI elimination ----------------------------------------------------- */

/* Emit or eliminate collected PHIs. */
static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi,
                          SnapNo onsnap)
{
  int passx = 0;
  IRRef i, j, nslots;
  IRRef invar = J->chain[IR_LOOP];
  /* Pass #1: mark redundant and potentially redundant PHIs. */
  for (i = 0, j = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRRef rref = subst[lref];
    if (lref == rref || rref == REF_DROP) {  /* Invariants are redundant. */
      irt_clearphi(IR(lref)->t);
    } else {
      phi[j++] = (IRRef1)lref;
      if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
        /* Quick check for simple recurrences failed, need pass #2. */
        irt_setmark(IR(lref)->t);
        passx = 1;
      }
    }
  }
  nphi = j;
  /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
  if (passx) {
    SnapNo s;
    for (i = J->cur.nins-1; i > invar; i--) {
      IRIns *ir = IR(i);
      if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
      if (!irref_isk(ir->op1)) {
        irt_clearmark(IR(ir->op1)->t);
        if (ir->op1 < invar &&
            ir->o >= IR_CALLN && ir->o <= IR_CARG) {  /* ORDER IR */
          ir = IR(ir->op1);
          while (ir->o == IR_CARG) {
            if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
            if (irref_isk(ir->op1)) break;
            ir = IR(ir->op1);
            irt_clearmark(ir->t);
          }
        }
      }
    }
    for (s = J->cur.nsnap-1; s >= onsnap; s--) {
      SnapShot *snap = &J->cur.snap[s];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      for (n = 0; n < nent; n++) {
        IRRef ref = snap_ref(map[n]);
        if (!irref_isk(ref)) irt_clearmark(IR(ref)->t);
      }
    }
  }
  /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
  nslots = J->baseslot+J->maxslot;
  for (i = 1; i < nslots; i++) {
    IRRef ref = tref_ref(J->slot[i]);
    while (!irref_isk(ref) && ref != subst[ref]) {
      IRIns *ir = IR(ref);
      irt_clearmark(ir->t);  /* Unmark potential uses, too. */
      if (irt_isphi(ir->t) || irt_ispri(ir->t))
        break;
      irt_setphi(ir->t);
      if (nphi >= LJ_MAX_PHI)
        lj_trace_err(J, LJ_TRERR_PHIOV);
      phi[nphi++] = (IRRef1)ref;
      ref = subst[ref];
      if (ref > invar)
        break;
    }
  }
  /* Pass #4: propagate non-redundant PHIs. */
  while (passx) {
    passx = 0;
    for (i = 0; i < nphi; i++) {
      IRRef lref = phi[i];
      IRIns *ir = IR(lref);
      if (!irt_ismarked(ir->t)) {  /* Propagate only from unmarked PHIs. */
        IRIns *irr = IR(subst[lref]);
        if (irt_ismarked(irr->t)) {  /* Right ref points to other PHI? */
          irt_clearmark(irr->t);  /* Mark that PHI as non-redundant. */
          passx = 1;  /* Retry. */
        }
      }
    }
  }
  /* Pass #5: emit PHI instructions or eliminate PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRIns *ir = IR(lref);
    if (!irt_ismarked(ir->t)) {  /* Emit PHI if not marked. */
      IRRef rref = subst[lref];
      if (rref > invar)
        irt_setphi(IR(rref)->t);
      emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
    } else {  /* Otherwise eliminate PHI. */
      irt_clearmark(ir->t);
      irt_clearphi(ir->t);
    }
  }
}

/* -- Loop unrolling using copy-substitution ------------------------------ */

/* Copy-substitute snapshot. */
static void loop_subst_snap(jit_State *J, SnapShot *osnap,
                            SnapEntry *loopmap, IRRef1 *subst)
{
  SnapEntry *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
  SnapEntry *nextmap = &J->cur.snapmap[snap_nextofs(&J->cur, osnap)];
  MSize nmapofs;
  MSize on, ln, nn, onent = osnap->nent;
  BCReg nslots = osnap->nslots;
  SnapShot *snap = &J->cur.snap[J->cur.nsnap];
  if (irt_isguard(J->guardemit)) {  /* Guard in between? */
    nmapofs = J->cur.nsnapmap;
    J->cur.nsnap++;  /* Add new snapshot. */
  } else {  /* Otherwise overwrite previous snapshot. */
    snap--;
    nmapofs = snap->mapofs;
  }
  J->guardemit.irt = 0;
  /* Setup new snapshot. */
  snap->mapofs = (uint16_t)nmapofs;
  snap->ref = (IRRef1)J->cur.nins;
  snap->nslots = nslots;
  snap->topslot = osnap->topslot;
  snap->count = 0;
  nmap = &J->cur.snapmap[nmapofs];
  /* Substitute snapshot slots. */
  on = ln = nn = 0;
  while (on < onent) {
    SnapEntry osn = omap[on], lsn = loopmap[ln];
    if (snap_slot(lsn) < snap_slot(osn)) {  /* Copy slot from loop map. */
      nmap[nn++] = lsn;
      ln++;
    } else {  /* Copy substituted slot from snapshot map. */
      if (snap_slot(lsn) == snap_slot(osn)) ln++;  /* Shadowed loop slot. */
      if (!irref_isk(snap_ref(osn)))
        osn = snap_setref(osn, subst[snap_ref(osn)]);
      nmap[nn++] = osn;
      on++;
    }
  }
  while (snap_slot(loopmap[ln]) < nslots)  /* Copy remaining loop slots. */
    nmap[nn++] = loopmap[ln++];
  snap->nent = (uint8_t)nn;
  omap += onent;
  nmap += nn;
  while (omap < nextmap)  /* Copy PC + frame links. */
    *nmap++ = *omap++;
  J->cur.nsnapmap = (uint16_t)(nmap - J->cur.snapmap);
}

/* Unroll loop. */
static void loop_unroll(jit_State *J)
{
  IRRef1 phi[LJ_MAX_PHI];
  uint32_t nphi = 0;
  IRRef1 *subst;
  SnapNo onsnap;
  SnapShot *osnap, *loopsnap;
  SnapEntry *loopmap, *psentinel;
  IRRef ins, invar;

  /* Use temp buffer for substitution table.
  ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
  ** Caveat: don't call into the VM or run the GC or the buffer may be gone.
  */
  invar = J->cur.nins;
  subst = (IRRef1 *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf,
                                   (invar-REF_BIAS)*sizeof(IRRef1)) - REF_BIAS;
  subst[REF_BASE] = REF_BASE;

  /* LOOP separates the pre-roll from the loop body. */
  emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);

  /* Grow snapshot buffer and map for copy-substituted snapshots.
  ** Need up to twice the number of snapshots minus #0 and loop snapshot.
  ** Need up to twice the number of entries plus fallback substitutions
  ** from the loop snapshot entries for each new snapshot.
  ** Caveat: both calls may reallocate J->cur.snap and J->cur.snapmap!
  */
  onsnap = J->cur.nsnap;
  lj_snap_grow_buf(J, 2*onsnap-2);
  lj_snap_grow_map(J, J->cur.nsnapmap*2+(onsnap-2)*J->cur.snap[onsnap-1].nent);

  /* The loop snapshot is used for fallback substitutions. */
  loopsnap = &J->cur.snap[onsnap-1];
  loopmap = &J->cur.snapmap[loopsnap->mapofs];
  /* The PC of snapshot #0 and the loop snapshot must match. */
  psentinel = &loopmap[loopsnap->nent];
  lua_assert(*psentinel == J->cur.snapmap[J->cur.snap[0].nent]);
  *psentinel = SNAP(255, 0, 0);  /* Replace PC with temporary sentinel. */

  /* Start substitution with snapshot #1 (#0 is empty for root traces). */
  osnap = &J->cur.snap[1];

  /* Copy and substitute all recorded instructions and snapshots. */
  for (ins = REF_FIRST; ins < invar; ins++) {
    IRIns *ir;
    IRRef op1, op2;

    if (ins >= osnap->ref)  /* Instruction belongs to next snapshot? */
      loop_subst_snap(J, osnap++, loopmap, subst);  /* Copy-substitute it. */

    /* Substitute instruction operands. */
    ir = IR(ins);
    op1 = ir->op1;
    if (!irref_isk(op1)) op1 = subst[op1];
    op2 = ir->op2;
    if (!irref_isk(op2)) op2 = subst[op2];
    if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
        op1 == ir->op1 && op2 == ir->op2) {  /* Regular invariant ins? */
      subst[ins] = (IRRef1)ins;  /* Shortcut. */
    } else {
      /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
      IRType1 t = ir->t;  /* Get this first, since emitir may invalidate ir. */
      IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
      subst[ins] = (IRRef1)ref;
      if (ref != ins) {
        IRIns *irr = IR(ref);
        if (ref < invar) {  /* Loop-carried dependency? */
          /* Potential PHI? */
          if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
            irt_setphi(irr->t);
            if (nphi >= LJ_MAX_PHI)
              lj_trace_err(J, LJ_TRERR_PHIOV);
            phi[nphi++] = (IRRef1)ref;
          }
          /* Check all loop-carried dependencies for type instability. */
          if (!irt_sametype(t, irr->t)) {
            if (irt_isinteger(t) && irt_isinteger(irr->t))
              continue;
            else if (irt_isnum(t) && irt_isinteger(irr->t))  /* Fix int->num. */
              ref = tref_ref(emitir(IRTN(IR_CONV), ref, IRCONV_NUM_INT));
            else if (irt_isnum(irr->t) && irt_isinteger(t))  /* Fix num->int. */
              ref = tref_ref(emitir(IRTGI(IR_CONV), ref,
                                    IRCONV_INT_NUM|IRCONV_CHECK));
            else
              lj_trace_err(J, LJ_TRERR_TYPEINS);
            subst[ins] = (IRRef1)ref;
            irr = IR(ref);
            goto phiconv;
          }
        } else if (ref != REF_DROP && irr->o == IR_CONV &&
                   ref > invar && irr->op1 < invar) {
          /* May need an extra PHI for a CONV. */
          ref = irr->op1;
          irr = IR(ref);
        phiconv:
          if (ref < invar && !irref_isk(ref) && !irt_isphi(irr->t)) {
            irt_setphi(irr->t);
            if (nphi >= LJ_MAX_PHI)
              lj_trace_err(J, LJ_TRERR_PHIOV);
            phi[nphi++] = (IRRef1)ref;
          }
        }
      }
    }
  }
  if (!irt_isguard(J->guardemit))  /* Drop redundant snapshot. */
    J->cur.nsnapmap = (uint16_t)J->cur.snap[--J->cur.nsnap].mapofs;
  lua_assert(J->cur.nsnapmap <= J->sizesnapmap);
  *psentinel = J->cur.snapmap[J->cur.snap[0].nent];  /* Restore PC. */

  loop_emit_phi(J, subst, phi, nphi, onsnap);
}

/* Undo any partial changes made by the loop optimization. */
static void loop_undo(jit_State *J, IRRef ins, SnapNo nsnap, MSize nsnapmap)
{
  ptrdiff_t i;
  SnapShot *snap = &J->cur.snap[nsnap-1];
  SnapEntry *map = J->cur.snapmap;
  map[snap->mapofs + snap->nent] = map[J->cur.snap[0].nent];  /* Restore PC. */
  J->cur.nsnapmap = (uint16_t)nsnapmap;
  J->cur.nsnap = nsnap;
  J->guardemit.irt = 0;
  lj_ir_rollback(J, ins);
  for (i = 0; i < BPROP_SLOTS; i++) {  /* Remove backprop. cache entries. */
    BPropEntry *bp = &J->bpropcache[i];
    if (bp->val >= ins)
      bp->key = 0;
  }
  for (ins--; ins >= REF_FIRST; ins--) {  /* Remove flags. */
    IRIns *ir = IR(ins);
    irt_clearphi(ir->t);
    irt_clearmark(ir->t);
  }
}

/* Protected callback for loop optimization. */
static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
{
  UNUSED(L); UNUSED(dummy);
  loop_unroll((jit_State *)ud);
  return NULL;
}

/* Loop optimization. */
int lj_opt_loop(jit_State *J)
{
  IRRef nins = J->cur.nins;
  SnapNo nsnap = J->cur.nsnap;
  MSize nsnapmap = J->cur.nsnapmap;
  int errcode = lj_vm_cpcall(J->L, NULL, J, cploop_opt);
  if (LJ_UNLIKELY(errcode)) {
    lua_State *L = J->L;
    if (errcode == LUA_ERRRUN && tvisnumber(L->top-1)) {  /* Trace error? */
      int32_t e = numberVint(L->top-1);
      switch ((TraceError)e) {
      case LJ_TRERR_TYPEINS:  /* Type instability. */
      case LJ_TRERR_GFAIL:  /* Guard would always fail. */
        /* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
        if (--J->instunroll < 0)  /* But do not unroll forever. */
          break;
        L->top--;  /* Remove error object. */
        loop_undo(J, nins, nsnap, nsnapmap);
        return 1;  /* Loop optimization failed, continue recording. */
      default:
        break;
      }
    }
    lj_err_throw(L, errcode);  /* Propagate all other errors. */
  }
  return 0;  /* Loop optimization is ok. */
}

#undef IR
#undef emitir
#undef emitir_raw

#endif