/*
** LOOP: Loop Optimizations.
** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_loop_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT

#include "lj_err.h"
#include "lj_buf.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_trace.h"
#include "lj_snap.h"
#include "lj_vm.h"

/* Loop optimization:
**
** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
** of a loop into invariant and variant instructions. The invariant
** instructions are hoisted out of the loop and only the variant
** instructions remain inside the loop body.
**
** Unfortunately LICM is mostly useless for compiling dynamic languages.
** The IR has many guards and most of the subsequent instructions are
** control-dependent on them. The first non-hoistable guard would
** effectively prevent hoisting of all subsequent instructions.
**
** That's why we use a special form of unrolling using copy-substitution,
** combined with redundancy elimination:
**
** The recorded instruction stream is re-emitted to the compiler pipeline
** with substituted operands. The substitution table is filled with the
** refs returned by re-emitting each instruction. This can be done
** on-the-fly, because the IR is in strict SSA form, where every ref is
** defined before its use.
**
** This approach generates two code sections, separated by the LOOP
** instruction:
**
** 1. The recorded instructions form a kind of pre-roll for the loop. It
**    contains a mix of invariant and variant instructions and performs
**    exactly one loop iteration (but not necessarily the 1st iteration).
**
** 2. The loop body contains only the variant instructions and performs
**    all remaining loop iterations.
**
** At first sight this looks like a waste of space, because the variant
** instructions are present twice. But the key insight is that the
** pre-roll honors the control-dependencies for *both* the pre-roll itself
** *and* the loop body!
**
** It also means one doesn't have to explicitly model control-dependencies
** (which, BTW, wouldn't help LICM much). And it's much easier to
** integrate sparse snapshotting with this approach.
**
** One of the nicest aspects of this approach is that all of the
** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
** reused with only minor restrictions (e.g. one should not fold
** instructions across loop-carried dependencies).
**
** But in general all optimizations can be applied which only need to look
** backwards into the generated instruction stream. At any point in time
** during the copy-substitution process this contains both a static loop
** iteration (the pre-roll) and a dynamic one (from the to-be-copied
** instruction up to the end of the partial loop body).
**
** Since control-dependencies are implicitly kept, CSE also applies to all
** kinds of guards. The major advantage is that all invariant guards can
** be hoisted, too.
**
** Load/store forwarding works across loop iterations, too. This is
** important if loop-carried dependencies are kept in upvalues or tables.
** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
** become a forwarded loop-recurrence after inlining.
**
** Since the IR is in SSA form, loop-carried dependencies have to be
** modeled with PHI instructions. The potential candidates for PHIs are
** collected on-the-fly during copy-substitution. After eliminating the
** redundant ones, PHI instructions are emitted *below* the loop body.
**
** Note that this departure from traditional SSA form doesn't change the
** semantics of the PHI instructions themselves. But it greatly simplifies
** on-the-fly generation of the IR and the machine code.
*/
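
/* An illustrative sketch (not actual dump output) of the resulting IR
** layout for a loop like 'local s = 0; for i = 1, n do s = s + t[i] end':
**
**   ....   invariant part: guards and loads whose re-emitted copies fold
**          back onto the originals (e.g. the type check and array part
**          lookup for t) -- these are effectively hoisted.
**   ....   variant part of the pre-roll: ALOAD t[i], ADD s+t[i], ADD i+1,
**          plus their guards.
**   ---- LOOP ----
**   ....   copies of the variant instructions only, with their operands
**          rewritten through the substitution table.
**   ---- PHI for s, PHI for i  (emitted below the loop body)
**
** Exact refs and opcodes depend on the recorded trace; the redundancy
** elimination deciding which copies fold back lives in the FOLD/CSE/FWD
** passes mentioned above.
*/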

/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref) (&J->cur.ir[(ref)])

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

/* Emit raw IR without passing through optimizations. */
#define emitir_raw(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))

/* -- PHI elimination ----------------------------------------------------- */

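/* Informal example for the passes below: for a counter updated as
** 'i = i + 1', the pre-roll ADD (lref) and its loop-body copy
** (rref = subst[lref]) end up as the two operands of a PHI emitted after
** the loop body. An invariant value, whose copy folds back onto the
** original (subst[lref] == lref), is dropped as redundant in pass #1.
*/
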
/* Emit or eliminate collected PHIs. */
static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi,
                          SnapNo onsnap)
{
  int passx = 0;
  IRRef i, j, nslots;
  IRRef invar = J->chain[IR_LOOP];
  /* Pass #1: mark redundant and potentially redundant PHIs. */
  for (i = 0, j = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRRef rref = subst[lref];
    if (lref == rref || rref == REF_DROP) {  /* Invariants are redundant. */
      irt_clearphi(IR(lref)->t);
    } else {
      phi[j++] = (IRRef1)lref;
      if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
        /* Quick check for simple recurrences failed, need pass2. */
        irt_setmark(IR(lref)->t);
        passx = 1;
      }
    }
  }
  nphi = j;
  /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
  if (passx) {
    SnapNo s;
    for (i = J->cur.nins-1; i > invar; i--) {
      IRIns *ir = IR(i);
      if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
      if (!irref_isk(ir->op1)) {
        irt_clearmark(IR(ir->op1)->t);
        if (ir->op1 < invar &&
            ir->o >= IR_CALLN && ir->o <= IR_CARG) {  /* ORDER IR */
          ir = IR(ir->op1);
          while (ir->o == IR_CARG) {
            if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
            if (irref_isk(ir->op1)) break;
            ir = IR(ir->op1);
            irt_clearmark(ir->t);
          }
        }
      }
    }
    for (s = J->cur.nsnap-1; s >= onsnap; s--) {
      SnapShot *snap = &J->cur.snap[s];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      for (n = 0; n < nent; n++) {
        IRRef ref = snap_ref(map[n]);
        if (!irref_isk(ref)) irt_clearmark(IR(ref)->t);
      }
    }
  }
  /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
  nslots = J->baseslot+J->maxslot;
  for (i = 1; i < nslots; i++) {
    IRRef ref = tref_ref(J->slot[i]);
    while (!irref_isk(ref) && ref != subst[ref]) {
      IRIns *ir = IR(ref);
      irt_clearmark(ir->t);  /* Unmark potential uses, too. */
      if (irt_isphi(ir->t) || irt_ispri(ir->t))
        break;
      irt_setphi(ir->t);
      if (nphi >= LJ_MAX_PHI)
        lj_trace_err(J, LJ_TRERR_PHIOV);
      phi[nphi++] = (IRRef1)ref;
      ref = subst[ref];
      if (ref > invar)
        break;
    }
  }
  /* Pass #4: propagate non-redundant PHIs. */
  while (passx) {
    passx = 0;
    for (i = 0; i < nphi; i++) {
      IRRef lref = phi[i];
      IRIns *ir = IR(lref);
      if (!irt_ismarked(ir->t)) {  /* Propagate only from unmarked PHIs. */
        IRIns *irr = IR(subst[lref]);
        if (irt_ismarked(irr->t)) {  /* Right ref points to other PHI? */
          irt_clearmark(irr->t);  /* Mark that PHI as non-redundant. */
          passx = 1;  /* Retry. */
        }
      }
    }
  }
  /* Pass #5: emit PHI instructions or eliminate PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRIns *ir = IR(lref);
    if (!irt_ismarked(ir->t)) {  /* Emit PHI if not marked. */
      IRRef rref = subst[lref];
      if (rref > invar)
        irt_setphi(IR(rref)->t);
      emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
    } else {  /* Otherwise eliminate PHI. */
      irt_clearmark(ir->t);
      irt_clearphi(ir->t);
    }
  }
}

/* -- Loop unrolling using copy-substitution ------------------------------ */

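/* Note (informal illustration): loop_subst_snap below merges each original
** snapshot with the loop snapshot, walking both entry lists in ascending
** slot order. Slots present in the original snapshot get their refs
** rewritten through the substitution table; slots only present in the loop
** snapshot are copied verbatim as fallbacks; a slot present in both takes
** the substituted entry. E.g. if the loop snapshot covers slots {1,3,4}
** and the original snapshot covers {3,5}, the merged snapshot holds
** {1, 3', 4, 5'}, where ' marks a substituted ref.
*/
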
/* Copy-substitute snapshot. */
static void loop_subst_snap(jit_State *J, SnapShot *osnap,
                            SnapEntry *loopmap, IRRef1 *subst)
{
  SnapEntry *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
  SnapEntry *nextmap = &J->cur.snapmap[snap_nextofs(&J->cur, osnap)];
  MSize nmapofs;
  MSize on, ln, nn, onent = osnap->nent;
  BCReg nslots = osnap->nslots;
  SnapShot *snap = &J->cur.snap[J->cur.nsnap];
  if (irt_isguard(J->guardemit)) {  /* Guard in between? */
    nmapofs = J->cur.nsnapmap;
    J->cur.nsnap++;  /* Add new snapshot. */
  } else {  /* Otherwise overwrite previous snapshot. */
    snap--;
    nmapofs = snap->mapofs;
  }
  J->guardemit.irt = 0;
  /* Setup new snapshot. */
  snap->mapofs = (uint32_t)nmapofs;
  snap->ref = (IRRef1)J->cur.nins;
  snap->mcofs = 0;
  snap->nslots = nslots;
  snap->topslot = osnap->topslot;
  snap->count = 0;
  nmap = &J->cur.snapmap[nmapofs];
  /* Substitute snapshot slots. */
  on = ln = nn = 0;
  while (on < onent) {
    SnapEntry osn = omap[on], lsn = loopmap[ln];
    if (snap_slot(lsn) < snap_slot(osn)) {  /* Copy slot from loop map. */
      nmap[nn++] = lsn;
      ln++;
    } else {  /* Copy substituted slot from snapshot map. */
      if (snap_slot(lsn) == snap_slot(osn)) ln++;  /* Shadowed loop slot. */
      if (!irref_isk(snap_ref(osn)))
        osn = snap_setref(osn, subst[snap_ref(osn)]);
      nmap[nn++] = osn;
      on++;
    }
  }
  while (snap_slot(loopmap[ln]) < nslots)  /* Copy remaining loop slots. */
    nmap[nn++] = loopmap[ln++];
  snap->nent = (uint8_t)nn;
  omap += onent;
  nmap += nn;
  while (omap < nextmap)  /* Copy PC + frame links. */
    *nmap++ = *omap++;
  J->cur.nsnapmap = (uint32_t)(nmap - J->cur.snapmap);
}

typedef struct LoopState {
  jit_State *J;
  IRRef1 *subst;
  MSize sizesubst;
} LoopState;

/* Unroll loop. */
static void loop_unroll(LoopState *lps)
{
  jit_State *J = lps->J;
  IRRef1 phi[LJ_MAX_PHI];
  uint32_t nphi = 0;
  IRRef1 *subst;
  SnapNo onsnap;
  SnapShot *osnap, *loopsnap;
  SnapEntry *loopmap, *psentinel;
  IRRef ins, invar;

  /* Allocate substitution table.
  ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
  */
  invar = J->cur.nins;
  lps->sizesubst = invar - REF_BIAS;
  lps->subst = lj_mem_newvec(J->L, lps->sizesubst, IRRef1);
  subst = lps->subst - REF_BIAS;
  subst[REF_BASE] = REF_BASE;

  /* LOOP separates the pre-roll from the loop body. */
  emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);

  /* Grow snapshot buffer and map for copy-substituted snapshots.
  ** Need up to twice the number of snapshots minus #0 and loop snapshot.
  ** Need up to twice the number of entries plus fallback substitutions
  ** from the loop snapshot entries for each new snapshot.
  ** Caveat: both calls may reallocate J->cur.snap and J->cur.snapmap!
  */
  onsnap = J->cur.nsnap;
  lj_snap_grow_buf(J, 2*onsnap-2);
  lj_snap_grow_map(J, J->cur.nsnapmap*2+(onsnap-2)*J->cur.snap[onsnap-1].nent);

  /* The loop snapshot is used for fallback substitutions. */
  loopsnap = &J->cur.snap[onsnap-1];
  loopmap = &J->cur.snapmap[loopsnap->mapofs];
  /* The PC of snapshot #0 and the loop snapshot must match. */
  psentinel = &loopmap[loopsnap->nent];
  lj_assertJ(*psentinel == J->cur.snapmap[J->cur.snap[0].nent],
             "mismatched PC for loop snapshot");
  *psentinel = SNAP(255, 0, 0);  /* Replace PC with temporary sentinel. */

  /* Start substitution with snapshot #1 (#0 is empty for root traces). */
  osnap = &J->cur.snap[1];

  /* Copy and substitute all recorded instructions and snapshots. */
  for (ins = REF_FIRST; ins < invar; ins++) {
    IRIns *ir;
    IRRef op1, op2;

    if (ins >= osnap->ref)  /* Instruction belongs to next snapshot? */
      loop_subst_snap(J, osnap++, loopmap, subst);  /* Copy-substitute it. */

    /* Substitute instruction operands. */
    ir = IR(ins);
    op1 = ir->op1;
    if (!irref_isk(op1)) op1 = subst[op1];
    op2 = ir->op2;
    if (!irref_isk(op2)) op2 = subst[op2];
    if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
        op1 == ir->op1 && op2 == ir->op2) {  /* Regular invariant ins? */
      subst[ins] = (IRRef1)ins;  /* Shortcut. */
    } else {
      /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
      IRType1 t = ir->t;  /* Get this first, since emitir may invalidate ir. */
      IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
      subst[ins] = (IRRef1)ref;
      if (ref != ins) {
        IRIns *irr = IR(ref);
        if (ref < invar) {  /* Loop-carried dependency? */
          /* Potential PHI? */
          if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
            irt_setphi(irr->t);
            if (nphi >= LJ_MAX_PHI)
              lj_trace_err(J, LJ_TRERR_PHIOV);
            phi[nphi++] = (IRRef1)ref;
          }
          /* Check all loop-carried dependencies for type instability. */
          if (!irt_sametype(t, irr->t)) {
            if (irt_isinteger(t) && irt_isinteger(irr->t))
              continue;
            else if (irt_isnum(t) && irt_isinteger(irr->t))  /* Fix int->num. */
              ref = tref_ref(emitir(IRTN(IR_CONV), ref, IRCONV_NUM_INT));
            else if (irt_isnum(irr->t) && irt_isinteger(t))  /* Fix num->int. */
              ref = tref_ref(emitir(IRTGI(IR_CONV), ref,
                                    IRCONV_INT_NUM|IRCONV_CHECK));
            else
              lj_trace_err(J, LJ_TRERR_TYPEINS);
            subst[ins] = (IRRef1)ref;
            irr = IR(ref);
            goto phiconv;
          }
        } else if (ref != REF_DROP && ref > invar &&
                   ((irr->o == IR_CONV && irr->op1 < invar) ||
                    (irr->o == IR_ALEN && irr->op2 < invar &&
                     irr->op2 != REF_NIL))) {
          /* May need an extra PHI for a CONV or ALEN hint. */
          ref = irr->o == IR_CONV ? irr->op1 : irr->op2;
          irr = IR(ref);
        phiconv:
          if (ref < invar && !irref_isk(ref) && !irt_isphi(irr->t)) {
            irt_setphi(irr->t);
            if (nphi >= LJ_MAX_PHI)
              lj_trace_err(J, LJ_TRERR_PHIOV);
            phi[nphi++] = (IRRef1)ref;
          }
        }
      }
    }
  }
  if (!irt_isguard(J->guardemit))  /* Drop redundant snapshot. */
    J->cur.nsnapmap = (uint32_t)J->cur.snap[--J->cur.nsnap].mapofs;
  lj_assertJ(J->cur.nsnapmap <= J->sizesnapmap, "bad snapshot map index");
  *psentinel = J->cur.snapmap[J->cur.snap[0].nent];  /* Restore PC. */

  loop_emit_phi(J, subst, phi, nphi, onsnap);
}
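
/* Note on the type-instability fixups above (informal): a PHI is only
** well-formed if the pre-roll value and its loop-carried counterpart agree
** on the IR type. If they only disagree between int and num, a CONV
** reconciles them: widening int->num is unconditional, narrowing num->int
** needs a checked conversion. Any other mismatch (e.g. the flipped boolean
** mentioned in lj_opt_loop below) aborts with LJ_TRERR_TYPEINS, and the
** recorder may retry by unrolling one more iteration.
*/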

/* Undo any partial changes made by the loop optimization. */
static void loop_undo(jit_State *J, IRRef ins, SnapNo nsnap, MSize nsnapmap)
{
  ptrdiff_t i;
  SnapShot *snap = &J->cur.snap[nsnap-1];
  SnapEntry *map = J->cur.snapmap;
  map[snap->mapofs + snap->nent] = map[J->cur.snap[0].nent];  /* Restore PC. */
  J->cur.nsnapmap = (uint32_t)nsnapmap;
  J->cur.nsnap = nsnap;
  J->guardemit.irt = 0;
  lj_ir_rollback(J, ins);
  for (i = 0; i < BPROP_SLOTS; i++) {  /* Remove backprop. cache entries. */
    BPropEntry *bp = &J->bpropcache[i];
    if (bp->val >= ins)
      bp->key = 0;
  }
  for (ins--; ins >= REF_FIRST; ins--) {  /* Remove flags. */
    IRIns *ir = IR(ins);
    irt_clearphi(ir->t);
    irt_clearmark(ir->t);
  }
}

/* Protected callback for loop optimization. */
static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
{
  UNUSED(L); UNUSED(dummy);
  loop_unroll((LoopState *)ud);
  return NULL;
}

/* Loop optimization. */
int lj_opt_loop(jit_State *J)
{
  IRRef nins = J->cur.nins;
  SnapNo nsnap = J->cur.nsnap;
  MSize nsnapmap = J->cur.nsnapmap;
  LoopState lps;
  int errcode;
  lps.J = J;
  lps.subst = NULL;
  lps.sizesubst = 0;
  errcode = lj_vm_cpcall(J->L, NULL, &lps, cploop_opt);
  lj_mem_freevec(J2G(J), lps.subst, lps.sizesubst, IRRef1);
  if (LJ_UNLIKELY(errcode)) {
    lua_State *L = J->L;
    if (errcode == LUA_ERRRUN && tvisnumber(L->top-1)) {  /* Trace error? */
      int32_t e = numberVint(L->top-1);
      switch ((TraceError)e) {
      case LJ_TRERR_TYPEINS:  /* Type instability. */
      case LJ_TRERR_GFAIL:  /* Guard would always fail. */
        /* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
        if (--J->instunroll < 0)  /* But do not unroll forever. */
          break;
        L->top--;  /* Remove error object. */
        loop_undo(J, nins, nsnap, nsnapmap);
        return 1;  /* Loop optimization failed, continue recording. */
      default:
        break;
      }
    }
    lj_err_throw(L, errcode);  /* Propagate all other errors. */
  }
  return 0;  /* Loop optimization is ok. */
}

#undef IR
#undef emitir
#undef emitir_raw

#endif