lj_opt_loop.c source code [Aerospike/modules/luajit/src/lj_opt_loop.c]

1	/*
2	** LOOP: Loop Optimizations.
3	** Copyright (C) 2005-2014 Mike Pall. See Copyright Notice in luajit.h
4	*/
5
6	#define lj_opt_loop_c
7	#define LUA_CORE
8
9	#include "lj_obj.h"
10
11	#if LJ_HASJIT
12
13	#include "lj_err.h"
14	#include "lj_str.h"
15	#include "lj_ir.h"
16	#include "lj_jit.h"
17	#include "lj_iropt.h"
18	#include "lj_trace.h"
19	#include "lj_snap.h"
20	#include "lj_vm.h"
21
22	/ Loop optimization:*
23	**
24	** Traditional Loop-Invariant Code Motion (LICM) splits the instructions
25	** of a loop into invariant and variant instructions. The invariant
26	** instructions are hoisted out of the loop and only the variant
27	** instructions remain inside the loop body.
28	**
29	** Unfortunately LICM is mostly useless for compiling dynamic languages.
30	** The IR has many guards and most of the subsequent instructions are
31	** control-dependent on them. The first non-hoistable guard would
32	** effectively prevent hoisting of all subsequent instructions.
33	**
34	** That's why we use a special form of unrolling using copy-substitution,
35	** combined with redundancy elimination:
36	**
37	** The recorded instruction stream is re-emitted to the compiler pipeline
38	** with substituted operands. The substitution table is filled with the
39	** refs returned by re-emitting each instruction. This can be done
40	** on-the-fly, because the IR is in strict SSA form, where every ref is
41	** defined before its use.
42	**
43	** This aproach generates two code sections, separated by the LOOP
44	** instruction:
45	**
46	** 1. The recorded instructions form a kind of pre-roll for the loop. It
47	** contains a mix of invariant and variant instructions and performs
48	** exactly one loop iteration (but not necessarily the 1st iteration).
49	**
50	** 2. The loop body contains only the variant instructions and performs
51	** all remaining loop iterations.
52	**
53	** On first sight that looks like a waste of space, because the variant
54	** instructions are present twice. But the key insight is that the
55	** pre-roll honors the control-dependencies for both the pre-roll itself
56	** and the loop body!
57	**
58	** It also means one doesn't have to explicitly model control-dependencies
59	** (which, BTW, wouldn't help LICM much). And it's much easier to
60	** integrate sparse snapshotting with this approach.
61	**
62	** One of the nicest aspects of this approach is that all of the
63	** optimizations of the compiler pipeline (FOLD, CSE, FWD, etc.) can be
64	** reused with only minor restrictions (e.g. one should not fold
65	** instructions across loop-carried dependencies).
66	**
67	** But in general all optimizations can be applied which only need to look
68	** backwards into the generated instruction stream. At any point in time
69	** during the copy-substitution process this contains both a static loop
70	** iteration (the pre-roll) and a dynamic one (from the to-be-copied
71	** instruction up to the end of the partial loop body).
72	**
73	** Since control-dependencies are implicitly kept, CSE also applies to all
74	** kinds of guards. The major advantage is that all invariant guards can
75	** be hoisted, too.
76	**
77	** Load/store forwarding works across loop iterations, too. This is
78	** important if loop-carried dependencies are kept in upvalues or tables.
79	** E.g. 'self.idx = self.idx + 1' deep down in some OO-style method may
80	** become a forwarded loop-recurrence after inlining.
81	**
82	** Since the IR is in SSA form, loop-carried dependencies have to be
83	** modeled with PHI instructions. The potential candidates for PHIs are
84	** collected on-the-fly during copy-substitution. After eliminating the
85	** redundant ones, PHI instructions are emitted below the loop body.
86	**
87	** Note that this departure from traditional SSA form doesn't change the
88	** semantics of the PHI instructions themselves. But it greatly simplifies
89	** on-the-fly generation of the IR and the machine code.
90	*/
91
92	/ Some local macros to save typing. Undef'd at the end. /
93	#define IR(ref) (&J->cur.ir[(ref)])
94
95	/ Pass IR on to next optimization in chain (FOLD). /
96	#define emitir(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_opt_fold(J))
97
98	/ Emit raw IR without passing through optimizations. /
99	#define emitir_raw(ot, a, b) (lj_ir_set(J, (ot), (a), (b)), lj_ir_emit(J))
100
101	/ -- PHI elimination ----------------------------------------------------- /
102
103	/ Emit or eliminate collected PHIs. /
104	static void loop_emit_phi(jit_State J, IRRef1 subst, IRRef1 *phi, IRRef nphi,
105	SnapNo onsnap)
106	{
107	int passx = `0`;
108	IRRef i, j, nslots;
109	IRRef invar = J->chain[IR_LOOP];
110	/ Pass #1: mark redundant and potentially redundant PHIs. /
111	for (i = `0`, j = `0`; i < nphi; i++) {
112	IRRef lref = phi[i];
113	IRRef rref = subst[lref];
114	if (lref == rref \|\| rref == REF_DROP) { / Invariants are redundant. /
115	irt_clearphi(IR(lref)->t);
116	} else {
117	phi[j++] = (IRRef1)lref;
118	if (!(IR(rref)->op1 == lref \|\| IR(rref)->op2 == lref)) {
119	/ Quick check for simple recurrences failed, need pass2. /
120	irt_setmark(IR(lref)->t);
121	passx = `1`;
122	}
123	}
124	}
125	nphi = j;
126	/ Pass #2: traverse variant part and clear marks of non-redundant PHIs. /
127	if (passx) {
128	SnapNo s;
129	for (i = J->cur.nins-`1`; i > invar; i--) {
130	IRIns *ir = IR(i);
131	if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
132	if (!irref_isk(ir->op1)) {
133	irt_clearmark(IR(ir->op1)->t);
134	if (ir->op1 < invar &&
135	ir->o >= IR_CALLN && ir->o <= IR_CARG) { / ORDER IR /
136	ir = IR(ir->op1);
137	while (ir->o == IR_CARG) {
138	if (!irref_isk(ir->op2)) irt_clearmark(IR(ir->op2)->t);
139	if (irref_isk(ir->op1)) break;
140	ir = IR(ir->op1);
141	irt_clearmark(ir->t);
142	}
143	}
144	}
145	}
146	for (s = J->cur.nsnap-`1`; s >= onsnap; s--) {
147	SnapShot *snap = &J->cur.snap[s];
148	SnapEntry *map = &J->cur.snapmap[snap->mapofs];
149	MSize n, nent = snap->nent;
150	for (n = `0`; n < nent; n++) {
151	IRRef ref = snap_ref(map[n]);
152	if (!irref_isk(ref)) irt_clearmark(IR(ref)->t);
153	}
154	}
155	}
156	/ Pass #3: add PHIs for variant slots without a corresponding SLOAD. /
157	nslots = J->baseslot+J->maxslot;
158	for (i = `1`; i < nslots; i++) {
159	IRRef ref = tref_ref(J->slot[i]);
160	while (!irref_isk(ref) && ref != subst[ref]) {
161	IRIns *ir = IR(ref);
162	irt_clearmark(ir->t); / Unmark potential uses, too. /
163	if (irt_isphi(ir->t) \|\| irt_ispri(ir->t))
164	break;
165	irt_setphi(ir->t);
166	if (nphi >= LJ_MAX_PHI)
167	lj_trace_err(J, LJ_TRERR_PHIOV);
168	phi[nphi++] = (IRRef1)ref;
169	ref = subst[ref];
170	if (ref > invar)
171	break;
172	}
173	}
174	/ Pass #4: propagate non-redundant PHIs. /
175	while (passx) {
176	passx = `0`;
177	for (i = `0`; i < nphi; i++) {
178	IRRef lref = phi[i];
179	IRIns *ir = IR(lref);
180	if (!irt_ismarked(ir->t)) { / Propagate only from unmarked PHIs. /
181	IRIns *irr = IR(subst[lref]);
182	if (irt_ismarked(irr->t)) { / Right ref points to other PHI? /
183	irt_clearmark(irr->t); / Mark that PHI as non-redundant. /
184	passx = `1`; / Retry. /
185	}
186	}
187	}
188	}
189	/ Pass #5: emit PHI instructions or eliminate PHIs. /
190	for (i = `0`; i < nphi; i++) {
191	IRRef lref = phi[i];
192	IRIns *ir = IR(lref);
193	if (!irt_ismarked(ir->t)) { / Emit PHI if not marked. /
194	IRRef rref = subst[lref];
195	if (rref > invar)
196	irt_setphi(IR(rref)->t);
197	emitir_raw(IRT(IR_PHI, irt_type(ir->t)), lref, rref);
198	} else { / Otherwise eliminate PHI. /
199	irt_clearmark(ir->t);
200	irt_clearphi(ir->t);
201	}
202	}
203	}
204
205	/ -- Loop unrolling using copy-substitution ------------------------------ /
206
207	/ Copy-substitute snapshot. /
208	static void loop_subst_snap(jit_State J, SnapShot osnap,
209	SnapEntry loopmap, IRRef1 subst)
210	{
211	SnapEntry nmap, omap = &J->cur.snapmap[osnap->mapofs];
212	SnapEntry *nextmap = &J->cur.snapmap[snap_nextofs(&J->cur, osnap)];
213	MSize nmapofs;
214	MSize on, ln, nn, onent = osnap->nent;
215	BCReg nslots = osnap->nslots;
216	SnapShot *snap = &J->cur.snap[J->cur.nsnap];
217	if (irt_isguard(J->guardemit)) { / Guard inbetween? /
218	nmapofs = J->cur.nsnapmap;
219	J->cur.nsnap++; / Add new snapshot. /
220	} else { / Otherwise overwrite previous snapshot. /
221	snap--;
222	nmapofs = snap->mapofs;
223	}
224	J->guardemit.irt = `0`;
225	/ Setup new snapshot. /
226	snap->mapofs = (uint16_t)nmapofs;
227	snap->ref = (IRRef1)J->cur.nins;
228	snap->nslots = nslots;
229	snap->topslot = osnap->topslot;
230	snap->count = `0`;
231	nmap = &J->cur.snapmap[nmapofs];
232	/ Substitute snapshot slots. /
233	on = ln = nn = `0`;
234	while (on < onent) {
235	SnapEntry osn = omap[on], lsn = loopmap[ln];
236	if (snap_slot(lsn) < snap_slot(osn)) { / Copy slot from loop map. /
237	nmap[nn++] = lsn;
238	ln++;
239	} else { / Copy substituted slot from snapshot map. /
240	if (snap_slot(lsn) == snap_slot(osn)) ln++; / Shadowed loop slot. /
241	if (!irref_isk(snap_ref(osn)))
242	osn = snap_setref(osn, subst[snap_ref(osn)]);
243	nmap[nn++] = osn;
244	on++;
245	}
246	}
247	while (snap_slot(loopmap[ln]) < nslots) / Copy remaining loop slots. /
248	nmap[nn++] = loopmap[ln++];
249	snap->nent = (uint8_t)nn;
250	omap += onent;
251	nmap += nn;
252	while (omap < nextmap) / Copy PC + frame links. /
253	nmap++ = omap++;
254	J->cur.nsnapmap = (uint16_t)(nmap - J->cur.snapmap);
255	}
256
257	/ Unroll loop. /
258	static void loop_unroll(jit_State *J)
259	{
260	IRRef1 phi[LJ_MAX_PHI];
261	uint32_t nphi = `0`;
262	IRRef1 *subst;
263	SnapNo onsnap;
264	SnapShot osnap, loopsnap;
265	SnapEntry loopmap, psentinel;
266	IRRef ins, invar;
267
268	/ Use temp buffer for substitution table.*
269	** Only non-constant refs in [REF_BIAS,invar) are valid indexes.
270	** Caveat: don't call into the VM or run the GC or the buffer may be gone.
271	*/
272	invar = J->cur.nins;
273	subst = (IRRef1 *)lj_str_needbuf(J->L, &G(J->L)->tmpbuf,
274	(invar-REF_BIAS)*sizeof(IRRef1)) - REF_BIAS;
275	subst[REF_BASE] = REF_BASE;
276
277	/ LOOP separates the pre-roll from the loop body. /
278	emitir_raw(IRTG(IR_LOOP, IRT_NIL), `0`, `0`);
279
280	/ Grow snapshot buffer and map for copy-substituted snapshots.*
281	** Need up to twice the number of snapshots minus #0 and loop snapshot.
282	** Need up to twice the number of entries plus fallback substitutions
283	** from the loop snapshot entries for each new snapshot.
284	** Caveat: both calls may reallocate J->cur.snap and J->cur.snapmap!
285	*/
286	onsnap = J->cur.nsnap;
287	lj_snap_grow_buf(J, `2`*onsnap-`2`);
288	lj_snap_grow_map(J, J->cur.nsnapmap`2`+(onsnap-`2`)J->cur.snap[onsnap-`1`].nent);
289
290	/ The loop snapshot is used for fallback substitutions. /
291	loopsnap = &J->cur.snap[onsnap-`1`];
292	loopmap = &J->cur.snapmap[loopsnap->mapofs];
293	/ The PC of snapshot #0 and the loop snapshot must match. /
294	psentinel = &loopmap[loopsnap->nent];
295	lua_assert(*psentinel == J->cur.snapmap[J->cur.snap[`0`].nent]);
296	psentinel = SNAP(`255`, `0`, `0`); /* Replace PC with temporary sentinel. /
297
298	/ Start substitution with snapshot #1 (#0 is empty for root traces). /
299	osnap = &J->cur.snap[`1`];
300
301	/ Copy and substitute all recorded instructions and snapshots. /
302	for (ins = REF_FIRST; ins < invar; ins++) {
303	IRIns *ir;
304	IRRef op1, op2;
305
306	if (ins >= osnap->ref) / Instruction belongs to next snapshot? /
307	loop_subst_snap(J, osnap++, loopmap, subst); / Copy-substitute it. /
308
309	/ Substitute instruction operands. /
310	ir = IR(ins);
311	op1 = ir->op1;
312	if (!irref_isk(op1)) op1 = subst[op1];
313	op2 = ir->op2;
314	if (!irref_isk(op2)) op2 = subst[op2];
315	if (irm_kind(lj_ir_mode[ir->o]) == IRM_N &&
316	op1 == ir->op1 && op2 == ir->op2) { / Regular invariant ins? /
317	subst[ins] = (IRRef1)ins; / Shortcut. /
318	} else {
319	/ Re-emit substituted instruction to the FOLD/CSE/etc. pipeline. /
320	IRType1 t = ir->t; / Get this first, since emitir may invalidate ir. /
321	IRRef ref = tref_ref(emitir(ir->ot & ~IRT_ISPHI, op1, op2));
322	subst[ins] = (IRRef1)ref;
323	if (ref != ins) {
324	IRIns *irr = IR(ref);
325	if (ref < invar) { / Loop-carried dependency? /
326	/ Potential PHI? /
327	if (!irref_isk(ref) && !irt_isphi(irr->t) && !irt_ispri(irr->t)) {
328	irt_setphi(irr->t);
329	if (nphi >= LJ_MAX_PHI)
330	lj_trace_err(J, LJ_TRERR_PHIOV);
331	phi[nphi++] = (IRRef1)ref;
332	}
333	/ Check all loop-carried dependencies for type instability. /
334	if (!irt_sametype(t, irr->t)) {
335	if (irt_isinteger(t) && irt_isinteger(irr->t))
336	continue;
337	else if (irt_isnum(t) && irt_isinteger(irr->t)) / Fix int->num. /
338	ref = tref_ref(emitir(IRTN(IR_CONV), ref, IRCONV_NUM_INT));
339	else if (irt_isnum(irr->t) && irt_isinteger(t)) / Fix num->int. /
340	ref = tref_ref(emitir(IRTGI(IR_CONV), ref,
341	IRCONV_INT_NUM\|IRCONV_CHECK));
342	else
343	lj_trace_err(J, LJ_TRERR_TYPEINS);
344	subst[ins] = (IRRef1)ref;
345	irr = IR(ref);
346	goto phiconv;
347	}
348	} else if (ref != REF_DROP && irr->o == IR_CONV &&
349	ref > invar && irr->op1 < invar) {
350	/ May need an extra PHI for a CONV. /
351	ref = irr->op1;
352	irr = IR(ref);
353	phiconv:
354	if (ref < invar && !irref_isk(ref) && !irt_isphi(irr->t)) {
355	irt_setphi(irr->t);
356	if (nphi >= LJ_MAX_PHI)
357	lj_trace_err(J, LJ_TRERR_PHIOV);
358	phi[nphi++] = (IRRef1)ref;
359	}
360	}
361	}
362	}
363	}
364	if (!irt_isguard(J->guardemit)) / Drop redundant snapshot. /
365	J->cur.nsnapmap = (uint16_t)J->cur.snap[--J->cur.nsnap].mapofs;
366	lua_assert(J->cur.nsnapmap <= J->sizesnapmap);
367	psentinel = J->cur.snapmap[J->cur.snap[`0`].nent]; /* Restore PC. /
368
369	loop_emit_phi(J, subst, phi, nphi, onsnap);
370	}
371
372	/ Undo any partial changes made by the loop optimization. /
373	static void loop_undo(jit_State *J, IRRef ins, SnapNo nsnap, MSize nsnapmap)
374	{
375	ptrdiff_t i;
376	SnapShot *snap = &J->cur.snap[nsnap-`1`];
377	SnapEntry *map = J->cur.snapmap;
378	map[snap->mapofs + snap->nent] = map[J->cur.snap[`0`].nent]; / Restore PC. /
379	J->cur.nsnapmap = (uint16_t)nsnapmap;
380	J->cur.nsnap = nsnap;
381	J->guardemit.irt = `0`;
382	lj_ir_rollback(J, ins);
383	for (i = `0`; i < BPROP_SLOTS; i++) { / Remove backprop. cache entries. /
384	BPropEntry *bp = &J->bpropcache[i];
385	if (bp->val >= ins)
386	bp->key = `0`;
387	}
388	for (ins--; ins >= REF_FIRST; ins--) { / Remove flags. /
389	IRIns *ir = IR(ins);
390	irt_clearphi(ir->t);
391	irt_clearmark(ir->t);
392	}
393	}
394
395	/ Protected callback for loop optimization. /
396	static TValue cploop_opt(lua_State L, lua_CFunction dummy, void *ud)
397	{
398	UNUSED(L); UNUSED(dummy);
399	loop_unroll((jit_State *)ud);
400	return NULL;
401	}
402
403	/ Loop optimization. /
404	int lj_opt_loop(jit_State *J)
405	{
406	IRRef nins = J->cur.nins;
407	SnapNo nsnap = J->cur.nsnap;
408	MSize nsnapmap = J->cur.nsnapmap;
409	int errcode = lj_vm_cpcall(J->L, NULL, J, cploop_opt);
410	if (LJ_UNLIKELY(errcode)) {
411	lua_State *L = J->L;
412	if (errcode == LUA_ERRRUN && tvisnumber(L->top-`1`)) { / Trace error? /
413	int32_t e = numberVint(L->top-`1`);
414	switch ((TraceError)e) {
415	case LJ_TRERR_TYPEINS: / Type instability. /
416	case LJ_TRERR_GFAIL: / Guard would always fail. /
417	/ Unrolling via recording fixes many cases, e.g. a flipped boolean. /
418	if (--J->instunroll < `0`) / But do not unroll forever. /
419	break;
420	L->top--; / Remove error object. /
421	loop_undo(J, nins, nsnap, nsnapmap);
422	return `1`; / Loop optimization failed, continue recording. /
423	default:
424	break;
425	}
426	}
427	lj_err_throw(L, errcode); / Propagate all other errors. /
428	}
429	return `0`; / Loop optimization is ok. /
430	}
431
432	#undef IR
433	#undef emitir
434	#undef emitir_raw
435
436	#endif
437

Browse the source code of Aerospike/modules/luajit/src/lj_opt_loop.c