/*
** LOOP: Loop Optimizations.
** Copyright (C) 2005-2021 Mike Pall. See Copyright Notice in luajit.h
*/

#define lj_opt_loop_c
#define LUA_CORE

#include "lj_obj.h"

#if LJ_HASJIT

#include "lj_err.h"
#include "lj_buf.h"
#include "lj_ir.h"
#include "lj_jit.h"
#include "lj_iropt.h"
#include "lj_trace.h"
#include "lj_snap.h"
#include "lj_vm.h"

/* Loop optimization:
**
** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
** of a loop into invariant and variant instructions. The invariant
** instructions are hoisted out of the loop and only the variant
** instructions remain inside the loop body.
**
** Unfortunately LICM is mostly useless for compiling dynamic languages.
** The IR has many guards and most of the subsequent instructions are
** control-dependent on them. The first non-hoistable guard would
** effectively prevent hoisting of all subsequent instructions.
**
** That's why we use a special form of unrolling using copy-substitution,
** combined with redundancy elimination:
**
** The recorded instruction stream is re-emitted to the compiler pipeline
** with substituted operands. The substitution table is filled with the
** refs returned by re-emitting each instruction. This can be done
** on-the-fly, because the IR is in strict SSA form, where every ref is
** defined before its use.
**
** This approach generates two code sections, separated by the LOOP
** instruction:
**
** 1. The recorded instructions form a kind of pre-roll for the loop. It
**    contains a mix of invariant and variant instructions and performs
**    exactly one loop iteration (but not necessarily the 1st iteration).
**
** 2. The loop body contains only the variant instructions and performs
**    all remaining loop iterations.
**
** At first sight this looks like a waste of space, because the variant
** instructions are present twice. But the key insight is that the
** pre-roll honors the control-dependencies for *both* the pre-roll itself
** *and* the loop body!
**
** It also means one doesn't have to explicitly model control-dependencies
** (which, BTW, wouldn't help LICM much). And it's much easier to
** integrate sparse snapshotting with this approach.
**
** One of the nicest aspects of this approach is that all of the
** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
** reused with only minor restrictions (e.g. one should not fold
** instructions across loop-carried dependencies).
**
** But in general all optimizations can be applied which only need to look
** backwards into the generated instruction stream. At any point in time
** during the copy-substitution process this contains both a static loop
** iteration (the pre-roll) and a dynamic one (from the to-be-copied
** instruction up to the end of the partial loop body).
**
** Since control-dependencies are implicitly kept, CSE also applies to all
** kinds of guards. The major advantage is that all invariant guards can
** be hoisted, too.
**
** Load/store forwarding works across loop iterations, too. This is
** important if loop-carried dependencies are kept in upvalues or tables.
** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
** become a forwarded loop-recurrence after inlining.
**
** Since the IR is in SSA form, loop-carried dependencies have to be
** modeled with PHI instructions. The potential candidates for PHIs are
** collected on-the-fly during copy-substitution. After eliminating the
** redundant ones, PHI instructions are emitted *below* the loop body.
**
** Note that this departure from traditional SSA form doesn't change the
** semantics of the PHI instructions themselves. But it greatly simplifies
** on-the-fly generation of the IR and the machine code.
*/
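
/* An illustrative sketch (not actual dump output) of the resulting IR
** layout for a loop like 'local s = 0; for i = 1, n do s = s + t[i] end':
**
**   ....   invariant part: guards and loads whose re-emitted copies fold
**          back onto the originals (e.g. the type check and array part
**          lookup for t) -- these are effectively hoisted.
**   ....   variant part of the pre-roll: ALOAD t[i], ADD s+t[i], ADD i+1,
**          plus their guards.
**   ---- LOOP ----
**   ....   copies of the variant instructions only, with their operands
**          rewritten through the substitution table.
**   ---- PHI for s, PHI for i  (emitted below the loop body)
**
** Exact refs and opcodes depend on the recorded trace; the redundancy
** elimination deciding which copies fold back lives in the FOLD/CSE/FWD
** passes mentioned above.
*/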

/* Some local macros to save typing. Undef'd at the end. */
#define IR(ref) (&J->cur.ir[(ref)])

/* Pass IR on to next optimization in chain (FOLD). */
#define emitir(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))

/* Emit raw IR without passing through optimizations. */
#define emitir_raw(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))

/* -- PHI elimination ----------------------------------------------------- */

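/* Informal example for the passes below: for a counter updated as
** 'i = i + 1', the pre-roll ADD (lref) and its loop-body copy
** (rref = subst[lref]) end up as the two operands of a PHI emitted after
** the loop body. An invariant value, whose copy folds back onto the
** original (subst[lref] == lref), is dropped as redundant in pass #1.
*/
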
/* Emit or eliminate collected PHIs. */
static void loop_emit_phi(jit_State *J, IRRef1 *subst, IRRef1 *phi, IRRef nphi,
                          SnapNo onsnap)
{
  int passx = 0;
  IRRef i, j, nslots;
  IRRef invar = J->chain[IR_LOOP];
  /* Pass #1: mark redundant and potentially redundant PHIs. */
  for (i = 0, j = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRRef rref = subst[lref];
    if (lref == rref || rref == REF_DROP) {  /* Invariants are redundant. */
      irt_clearphi(IR(lref)->t);
    } else {
      phi[j++] = (IRRef1)lref;
      if (!(IR(rref)->op1 == lref || IR(rref)->op2 == lref)) {
        /* Quick check for simple recurrences failed, need pass2. */
        irt_setmark(IR(lref)->t);
        passx = 1;
      }
    }
  }
  nphi = j;
  /* Pass #2: traverse variant part and clear marks of non-redundant PHIs. */
  if (passx) {
    SnapNo s;
    for (i = J->cur.nins-1; i > invar; i--) {
      IRIns *ir = IR(i);
      if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
      if (!irref_isk(ir->op1)) {
        irt_clearmark(IR(ir->op1)->t);
        if (ir->op1 < invar &&
            ir->o >= IR_CALLN && ir->o <= IR_CARG) {  /* ORDER IR */
          ir = IR(ir->op1);
          while (ir->o == IR_CARG) {
            if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
            if (irref_isk(ir->op1)) break;
            ir = IR(ir->op1);
            irt_clearmark(ir->t);
          }
        }
      }
    }
    for (s = J->cur.nsnap-1; s >= onsnap; s--) {
      SnapShot *snap = &J->cur.snap[s];
      SnapEntry *map = &J->cur.snapmap[snap->mapofs];
      MSize n, nent = snap->nent;
      for (n = 0; n < nent; n++) {
        IRRef ref = snap_ref(map[n]);
        if (!irref_isk(ref)) irt_clearmark(IR(ref)->t);
      }
    }
  }
  /* Pass #3: add PHIs for variant slots without a corresponding SLOAD. */
  nslots = J->baseslot+J->maxslot;
  for (i = 1; i < nslots; i++) {
    IRRef ref = tref_ref(J->slot[i]);
    while (!irref_isk(ref) && ref != subst[ref]) {
      IRIns *ir = IR(ref);
      irt_clearmark(ir->t);  /* Unmark potential uses, too. */
      if (irt_isphi(ir->t) || irt_ispri(ir->t))
        break;
      irt_setphi(ir->t);
      if (nphi >= LJ_MAX_PHI)
        lj_trace_err(J, LJ_TRERR_PHIOV);
      phi[nphi++] = (IRRef1)ref;
      ref = subst[ref];
      if (ref > invar)
        break;
    }
  }
  /* Pass #4: propagate non-redundant PHIs. */
  while (passx) {
    passx = 0;
    for (i = 0; i < nphi; i++) {
      IRRef lref = phi[i];
      IRIns *ir = IR(lref);
      if (!irt_ismarked(ir->t)) {  /* Propagate only from unmarked PHIs. */
        IRIns *irr = IR(subst[lref]);
        if (irt_ismarked(irr->t)) {  /* Right ref points to other PHI? */
          irt_clearmark(irr->t);  /* Mark that PHI as non-redundant. */
          passx = 1;  /* Retry. */
        }
      }
    }
  }
  /* Pass #5: emit PHI instructions or eliminate PHIs. */
  for (i = 0; i < nphi; i++) {
    IRRef lref = phi[i];
    IRIns *ir = IR(lref);
    if (!irt_ismarked(ir->t)) {  /* Emit PHI if not marked. */
      IRRef rref = subst[lref];
      if (rref > invar)
        irt_setphi(IR(rref)->t);
      emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
    } else {  /* Otherwise eliminate PHI. */
      irt_clearmark(ir->t);
      irt_clearphi(ir->t);
    }
  }
}

/* -- Loop unrolling using copy-substitution ------------------------------ */

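/* Note (informal illustration): loop_subst_snap below merges each original
** snapshot with the loop snapshot, walking both entry lists in ascending
** slot order. Slots present in the original snapshot get their refs
** rewritten through the substitution table; slots only present in the loop
** snapshot are copied verbatim as fallbacks; a slot present in both takes
** the substituted entry. E.g. if the loop snapshot covers slots {1,3,4}
** and the original snapshot covers {3,5}, the merged snapshot holds
** {1, 3', 4, 5'}, where ' marks a substituted ref.
*/
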
/* Copy-substitute snapshot. */
static void loop_subst_snap(jit_State *J, SnapShot *osnap,
                            SnapEntry *loopmap, IRRef1 *subst)
{
  SnapEntry *nmap, *omap = &J->cur.snapmap[osnap->mapofs];
  SnapEntry *nextmap = &J->cur.snapmap[snap_nextofs(&J->cur, osnap)];
  MSize nmapofs;
  MSize on, ln, nn, onent = osnap->nent;
  BCReg nslots = osnap->nslots;
  SnapShot *snap = &J->cur.snap[J->cur.nsnap];
  if (irt_isguard(J->guardemit)) {  /* Guard in between? */
    nmapofs = J->cur.nsnapmap;
    J->cur.nsnap++;  /* Add new snapshot. */
  } else {  /* Otherwise overwrite previous snapshot. */
    snap--;
    nmapofs = snap->mapofs;
  }
  J->guardemit.irt = 0;
  /* Setup new snapshot. */
  snap->mapofs = (uint32_t)nmapofs;
  snap->ref = (IRRef1)J->cur.nins;
  snap->mcofs = 0;
  snap->nslots = nslots;
  snap->topslot = osnap->topslot;
  snap->count = 0;
  nmap = &J->cur.snapmap[nmapofs];
  /* Substitute snapshot slots. */
  on = ln = nn = 0;
  while (on < onent) {
    SnapEntry osn = omap[on], lsn = loopmap[ln];
    if (snap_slot(lsn) < snap_slot(osn)) {  /* Copy slot from loop map. */
      nmap[nn++] = lsn;
      ln++;
    } else {  /* Copy substituted slot from snapshot map. */
      if (snap_slot(lsn) == snap_slot(osn)) ln++;  /* Shadowed loop slot. */
      if (!irref_isk(snap_ref(osn)))
        osn = snap_setref(osn, subst[snap_ref(osn)]);
      nmap[nn++] = osn;
      on++;
    }
  }
  while (snap_slot(loopmap[ln]) < nslots)  /* Copy remaining loop slots. */
    nmap[nn++] = loopmap[ln++];
  snap->nent = (uint8_t)nn;
  omap += onent;
  nmap += nn;
  while (omap < nextmap)  /* Copy PC + frame links. */
    *nmap++ = *omap++;
  J->cur.nsnapmap = (uint32_t)(nmap - J->cur.snapmap);
}

typedef struct LoopState {
  jit_State *J;
  IRRef1 *subst;
  MSize sizesubst;
} LoopState;

/* Unroll loop. */
static void loop_unroll(LoopState *lps)
{
  jit_State *J = lps->J;
  IRRef1 phi[LJ_MAX_PHI];
  uint32_t nphi = 0;
  IRRef1 *subst;
  SnapNo onsnap;
  SnapShot *osnap, *loopsnap;
  SnapEntry *loopmap, *psentinel;
  IRRef ins, invar;

  /* Allocate substitution table.
  ** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
  */
  invar = J->cur.nins;
  lps->sizesubst = invar - REF_BIAS;
  lps->subst = lj_mem_newvec(J->L, lps->sizesubst, IRRef1);
  subst = lps->subst - REF_BIAS;
  subst[REF_BASE] = REF_BASE;

  /* LOOP separates the pre-roll from the loop body. */
  emitir_raw(IRTG(IR_LOOP, IRT_NIL), 0, 0);

  /* Grow snapshot buffer and map for copy-substituted snapshots.
  ** Need up to twice the number of snapshots minus #0 and loop snapshot.
  ** Need up to twice the number of entries plus fallback substitutions
  ** from the loop snapshot entries for each new snapshot.
  ** Caveat: both calls may reallocate J->cur.snap and J->cur.snapmap!
  */
  onsnap = J->cur.nsnap;
  lj_snap_grow_buf(J, 2*onsnap-2);
  lj_snap_grow_map(J, J->cur.nsnapmap*2+(onsnap-2)*J->cur.snap[onsnap-1].nent);

  /* The loop snapshot is used for fallback substitutions. */
  loopsnap = &J->cur.snap[onsnap-1];
  loopmap = &J->cur.snapmap[loopsnap->mapofs];
  /* The PC of snapshot #0 and the loop snapshot must match. */
  psentinel = &loopmap[loopsnap->nent];
  lj_assertJ(*psentinel == J->cur.snapmap[J->cur.snap[0].nent],
             "mismatched PC for loop snapshot");
  *psentinel = SNAP(255, 0, 0);  /* Replace PC with temporary sentinel. */

  /* Start substitution with snapshot #1 (#0 is empty for root traces). */
  osnap = &J->cur.snap[1];

  /* Copy and substitute all recorded instructions and snapshots. */
  for (ins = REF_FIRST; ins < invar; ins++) {
    IRIns *ir;
    IRRef op1, op2;

    if (ins >= osnap->ref)  /* Instruction belongs to next snapshot? */
      loop_subst_snap(J, osnap++, loopmap, subst);  /* Copy-substitute it. */

    /* Substitute instruction operands. */
    ir = IR(ins);
    op1 = ir->op1;
    if (!irref_isk(op1)) op1 = subst[op1];
    op2 = ir->op2;
    if (!irref_isk(op2)) op2 = subst[op2];
    if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
        op1 == ir->op1 && op2 == ir->op2) {  /* Regular invariant ins? */
      subst[ins] = (IRRef1)ins;  /* Shortcut. */
    } else {
      /* Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. */
      IRType1 t = ir->t;  /* Get this first, since emitir may invalidate ir. */
      IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
      subst[ins] = (IRRef1)ref;
      if (ref != ins) {
        IRIns *irr = IR(ref);
        if (ref < invar) {  /* Loop-carried dependency? */
          /* Potential PHI? */
          if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
            irt_setphi(irr->t);
            if (nphi >= LJ_MAX_PHI)
              lj_trace_err(J, LJ_TRERR_PHIOV);
            phi[nphi++] = (IRRef1)ref;
          }
          /* Check all loop-carried dependencies for type instability. */
          if (!irt_sametype(t, irr->t)) {
            if (irt_isinteger(t) && irt_isinteger(irr->t))
              continue;
            else if (irt_isnum(t) && irt_isinteger(irr->t))  /* Fix int->num. */
              ref = tref_ref(emitir(IRTN(IR_CONV), ref, IRCONV_NUM_INT));
            else if (irt_isnum(irr->t) && irt_isinteger(t))  /* Fix num->int. */
              ref = tref_ref(emitir(IRTGI(IR_CONV), ref,
                                    IRCONV_INT_NUM|IRCONV_CHECK));
            else
              lj_trace_err(J, LJ_TRERR_TYPEINS);
            subst[ins] = (IRRef1)ref;
            irr = IR(ref);
            goto phiconv;
          }
        } else if (ref != REF_DROP && ref > invar &&
                   ((irr->o == IR_CONV && irr->op1 < invar) ||
                    (irr->o == IR_ALEN && irr->op2 < invar &&
                     irr->op2 != REF_NIL))) {
          /* May need an extra PHI for a CONV or ALEN hint. */
          ref = irr->o == IR_CONV ? irr->op1 : irr->op2;
          irr = IR(ref);
        phiconv:
          if (ref < invar && !irref_isk(ref) && !irt_isphi(irr->t)) {
            irt_setphi(irr->t);
            if (nphi >= LJ_MAX_PHI)
              lj_trace_err(J, LJ_TRERR_PHIOV);
            phi[nphi++] = (IRRef1)ref;
          }
        }
      }
    }
  }
  if (!irt_isguard(J->guardemit))  /* Drop redundant snapshot. */
    J->cur.nsnapmap = (uint32_t)J->cur.snap[--J->cur.nsnap].mapofs;
  lj_assertJ(J->cur.nsnapmap <= J->sizesnapmap, "bad snapshot map index");
  *psentinel = J->cur.snapmap[J->cur.snap[0].nent];  /* Restore PC. */

  loop_emit_phi(J, subst, phi, nphi, onsnap);
}
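
/* Note on the type-instability fixups above (informal): a PHI is only
** well-formed if the pre-roll value and its loop-carried counterpart agree
** on the IR type. If they only disagree between int and num, a CONV
** reconciles them: widening int->num is unconditional, narrowing num->int
** needs a checked conversion. Any other mismatch (e.g. the flipped boolean
** mentioned in lj_opt_loop below) aborts with LJ_TRERR_TYPEINS, and the
** recorder may retry by unrolling one more iteration.
*/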

/* Undo any partial changes made by the loop optimization. */
static void loop_undo(jit_State *J, IRRef ins, SnapNo nsnap, MSize nsnapmap)
{
  ptrdiff_t i;
  SnapShot *snap = &J->cur.snap[nsnap-1];
  SnapEntry *map = J->cur.snapmap;
  map[snap->mapofs + snap->nent] = map[J->cur.snap[0].nent];  /* Restore PC. */
  J->cur.nsnapmap = (uint32_t)nsnapmap;
  J->cur.nsnap = nsnap;
  J->guardemit.irt = 0;
  lj_ir_rollback(J, ins);
  for (i = 0; i < BPROP_SLOTS; i++) {  /* Remove backprop. cache entries. */
    BPropEntry *bp = &J->bpropcache[i];
    if (bp->val >= ins)
      bp->key = 0;
  }
  for (ins--; ins >= REF_FIRST; ins--) {  /* Remove flags. */
    IRIns *ir = IR(ins);
    irt_clearphi(ir->t);
    irt_clearmark(ir->t);
  }
}

/* Protected callback for loop optimization. */
static TValue *cploop_opt(lua_State *L, lua_CFunction dummy, void *ud)
{
  UNUSED(L); UNUSED(dummy);
  loop_unroll((LoopState *)ud);
  return NULL;
}

/* Loop optimization. */
int lj_opt_loop(jit_State *J)
{
  IRRef nins = J->cur.nins;
  SnapNo nsnap = J->cur.nsnap;
  MSize nsnapmap = J->cur.nsnapmap;
  LoopState lps;
  int errcode;
  lps.J = J;
  lps.subst = NULL;
  lps.sizesubst = 0;
  errcode = lj_vm_cpcall(J->L, NULL, &lps, cploop_opt);
  lj_mem_freevec(J2G(J), lps.subst, lps.sizesubst, IRRef1);
  if (LJ_UNLIKELY(errcode)) {
    lua_State *L = J->L;
    if (errcode == LUA_ERRRUN && tvisnumber(L->top-1)) {  /* Trace error? */
      int32_t e = numberVint(L->top-1);
      switch ((TraceError)e) {
      case LJ_TRERR_TYPEINS:  /* Type instability. */
      case LJ_TRERR_GFAIL:  /* Guard would always fail. */
        /* Unrolling via recording fixes many cases, e.g. a flipped boolean. */
        if (--J->instunroll < 0)  /* But do not unroll forever. */
          break;
        L->top--;  /* Remove error object. */
        loop_undo(J, nins, nsnap, nsnapmap);
        return 1;  /* Loop optimization failed, continue recording. */
      default:
        break;
      }
    }
    lj_err_throw(L, errcode);  /* Propagate all other errors. */
  }
  return 0;  /* Loop optimization is ok. */
}

#undef IR
#undef emitir
#undef emitir_raw

#endif