1#include "all.h"
2
3static void
4aggreg(Blk *hd, Blk *b)
5{
6 int k;
7
8 /* aggregate looping information at
9 * loop headers */
10 bsunion(hd->gen, b->gen);
11 for (k=0; k<2; k++)
12 if (b->nlive[k] > hd->nlive[k])
13 hd->nlive[k] = b->nlive[k];
14}
15
16static void
17tmpuse(Ref r, int use, int loop, Fn *fn)
18{
19 Mem *m;
20 Tmp *t;
21
22 if (rtype(r) == RMem) {
23 m = &fn->mem[r.val];
24 tmpuse(m->base, 1, loop, fn);
25 tmpuse(m->index, 1, loop, fn);
26 }
27 else if (rtype(r) == RTmp && r.val >= Tmp0) {
28 t = &fn->tmp[r.val];
29 t->nuse += use;
30 t->ndef += !use;
31 t->cost += loop;
32 }
33}
34
35/* evaluate spill costs of temporaries,
36 * this also fills usage information
37 * requires rpo, preds
38 */
39void
40fillcost(Fn *fn)
41{
42 int n;
43 uint a;
44 Blk *b;
45 Ins *i;
46 Tmp *t;
47 Phi *p;
48
49 loopiter(fn, aggreg);
50 if (debug['S']) {
51 fprintf(stderr, "\n> Loop information:\n");
52 for (b=fn->start; b; b=b->link) {
53 for (a=0; a<b->npred; ++a)
54 if (b->id <= b->pred[a]->id)
55 break;
56 if (a != b->npred) {
57 fprintf(stderr, "\t%-10s", b->name);
58 fprintf(stderr, " (% 3d ", b->nlive[0]);
59 fprintf(stderr, "% 3d) ", b->nlive[1]);
60 dumpts(b->gen, fn->tmp, stderr);
61 }
62 }
63 }
64 for (t=fn->tmp; t-fn->tmp < fn->ntmp; t++) {
65 t->cost = t-fn->tmp < Tmp0 ? UINT_MAX : 0;
66 t->nuse = 0;
67 t->ndef = 0;
68 }
69 for (b=fn->start; b; b=b->link) {
70 for (p=b->phi; p; p=p->link) {
71 t = &fn->tmp[p->to.val];
72 tmpuse(p->to, 0, 0, fn);
73 for (a=0; a<p->narg; a++) {
74 n = p->blk[a]->loop;
75 t->cost += n;
76 tmpuse(p->arg[a], 1, n, fn);
77 }
78 }
79 n = b->loop;
80 for (i=b->ins; i<&b->ins[b->nins]; i++) {
81 tmpuse(i->to, 0, n, fn);
82 tmpuse(i->arg[0], 1, n, fn);
83 tmpuse(i->arg[1], 1, n, fn);
84 }
85 tmpuse(b->jmp.arg, 1, n, fn);
86 }
87 if (debug['S']) {
88 fprintf(stderr, "\n> Spill costs:\n");
89 for (n=Tmp0; n<fn->ntmp; n++)
90 fprintf(stderr, "\t%-10s %d\n",
91 fn->tmp[n].name,
92 fn->tmp[n].cost);
93 fprintf(stderr, "\n");
94 }
95}
96
97static BSet *fst; /* temps to prioritize in registers (for tcmp1) */
98static Tmp *tmp; /* current temporaries (for tcmpX) */
99static int ntmp; /* current # of temps (for limit) */
100static int locs; /* stack size used by locals */
101static int slot4; /* next slot of 4 bytes */
102static int slot8; /* ditto, 8 bytes */
103static BSet mask[2][1]; /* class masks */
104
105static int
106tcmp0(const void *pa, const void *pb)
107{
108 uint ca, cb;
109
110 ca = tmp[*(int *)pa].cost;
111 cb = tmp[*(int *)pb].cost;
112 return (cb < ca) ? -1 : (cb > ca);
113}
114
115static int
116tcmp1(const void *pa, const void *pb)
117{
118 int c;
119
120 c = bshas(fst, *(int *)pb) - bshas(fst, *(int *)pa);
121 return c ? c : tcmp0(pa, pb);
122}
123
124static Ref
125slot(int t)
126{
127 int s;
128
129 assert(t >= Tmp0 && "cannot spill register");
130 s = tmp[t].slot;
131 if (s == -1) {
132 /* specific to NAlign == 3 */
133 /* nice logic to pack stack slots
134 * on demand, there can be only
135 * one hole and slot4 points to it
136 *
137 * invariant: slot4 <= slot8
138 */
139 if (KWIDE(tmp[t].cls)) {
140 s = slot8;
141 if (slot4 == slot8)
142 slot4 += 2;
143 slot8 += 2;
144 } else {
145 s = slot4;
146 if (slot4 == slot8) {
147 slot8 += 2;
148 slot4 += 1;
149 } else
150 slot4 = slot8;
151 }
152 s += locs;
153 tmp[t].slot = s;
154 }
155 return SLOT(s);
156}
157
158/* restricts b to hold at most k
159 * temporaries, preferring those
160 * present in f (if given), then
161 * those with the largest spill
162 * cost
163 */
164static void
165limit(BSet *b, int k, BSet *f)
166{
167 static int *tarr, maxt;
168 int i, t, nt;
169
170 nt = bscount(b);
171 if (nt <= k)
172 return;
173 if (nt > maxt) {
174 free(tarr);
175 tarr = emalloc(nt * sizeof tarr[0]);
176 maxt = nt;
177 }
178 for (i=0, t=0; bsiter(b, &t); t++) {
179 bsclr(b, t);
180 tarr[i++] = t;
181 }
182 if (nt > 1) {
183 if (!f)
184 qsort(tarr, nt, sizeof tarr[0], tcmp0);
185 else {
186 fst = f;
187 qsort(tarr, nt, sizeof tarr[0], tcmp1);
188 }
189 }
190 for (i=0; i<k && i<nt; i++)
191 bsset(b, tarr[i]);
192 for (; i<nt; i++)
193 slot(tarr[i]);
194}
195
196/* spills temporaries to fit the
197 * target limits using the same
198 * preferences as limit(); assumes
199 * that k1 gprs and k2 fprs are
200 * currently in use
201 */
202static void
203limit2(BSet *b1, int k1, int k2, BSet *f)
204{
205 BSet b2[1];
206
207 bsinit(b2, ntmp); /* todo, free those */
208 bscopy(b2, b1);
209 bsinter(b1, mask[0]);
210 bsinter(b2, mask[1]);
211 limit(b1, T.ngpr - k1, f);
212 limit(b2, T.nfpr - k2, f);
213 bsunion(b1, b2);
214}
215
216static void
217sethint(BSet *u, bits r)
218{
219 int t;
220
221 for (t=Tmp0; bsiter(u, &t); t++)
222 tmp[phicls(t, tmp)].hint.m |= r;
223}
224
225/* reloads temporaries in u that are
226 * not in v from their slots
227 */
228static void
229reloads(BSet *u, BSet *v)
230{
231 int t;
232
233 for (t=Tmp0; bsiter(u, &t); t++)
234 if (!bshas(v, t))
235 emit(Oload, tmp[t].cls, TMP(t), slot(t), R);
236}
237
238static void
239store(Ref r, int s)
240{
241 if (s != -1)
242 emit(Ostorew + tmp[r.val].cls, 0, R, r, SLOT(s));
243}
244
245static int
246regcpy(Ins *i)
247{
248 return i->op == Ocopy && isreg(i->arg[0]);
249}
250
251static Ins *
252dopm(Blk *b, Ins *i, BSet *v)
253{
254 int n, t;
255 BSet u[1];
256 Ins *i1;
257 bits r;
258
259 bsinit(u, ntmp); /* todo, free those */
260 /* consecutive copies from
261 * registers need to be handled
262 * as one large instruction
263 *
264 * fixme: there is an assumption
265 * that calls are always followed
266 * by copy instructions here, this
267 * might not be true if previous
268 * passes change
269 */
270 i1 = ++i;
271 do {
272 i--;
273 t = i->to.val;
274 if (!req(i->to, R))
275 if (bshas(v, t)) {
276 bsclr(v, t);
277 store(i->to, tmp[t].slot);
278 }
279 bsset(v, i->arg[0].val);
280 } while (i != b->ins && regcpy(i-1));
281 bscopy(u, v);
282 if (i != b->ins && (i-1)->op == Ocall) {
283 v->t[0] &= ~T.retregs((i-1)->arg[1], 0);
284 limit2(v, T.nrsave[0], T.nrsave[1], 0);
285 for (n=0, r=0; T.rsave[n]>=0; n++)
286 r |= BIT(T.rsave[n]);
287 v->t[0] |= T.argregs((i-1)->arg[1], 0);
288 } else {
289 limit2(v, 0, 0, 0);
290 r = v->t[0];
291 }
292 sethint(v, r);
293 reloads(u, v);
294 do
295 emiti(*--i1);
296 while (i1 != i);
297 return i;
298}
299
300static void
301merge(BSet *u, Blk *bu, BSet *v, Blk *bv)
302{
303 int t;
304
305 if (bu->loop <= bv->loop)
306 bsunion(u, v);
307 else
308 for (t=0; bsiter(v, &t); t++)
309 if (tmp[t].slot == -1)
310 bsset(u, t);
311}
312
313/* spill code insertion
314 * requires spill costs, rpo, liveness
315 *
316 * Note: this will replace liveness
317 * information (in, out) with temporaries
318 * that must be in registers at block
319 * borders
320 *
321 * Be careful with:
322 * - Ocopy instructions to ensure register
323 * constraints
324 */
325void
326spill(Fn *fn)
327{
328 Blk *b, *s1, *s2, *hd, **bp;
329 int j, l, t, k, lvarg[2];
330 uint n;
331 BSet u[1], v[1], w[1];
332 Ins *i;
333 Phi *p;
334 Mem *m;
335 bits r;
336
337 tmp = fn->tmp;
338 ntmp = fn->ntmp;
339 bsinit(u, ntmp);
340 bsinit(v, ntmp);
341 bsinit(w, ntmp);
342 bsinit(mask[0], ntmp);
343 bsinit(mask[1], ntmp);
344 locs = fn->slot;
345 slot4 = 0;
346 slot8 = 0;
347 for (t=0; t<ntmp; t++) {
348 k = 0;
349 if (t >= T.fpr0 && t < T.fpr0 + T.nfpr)
350 k = 1;
351 if (t >= Tmp0)
352 k = KBASE(tmp[t].cls);
353 bsset(mask[k], t);
354 }
355
356 for (bp=&fn->rpo[fn->nblk]; bp!=fn->rpo;) {
357 b = *--bp;
358 /* invariant: all blocks with bigger rpo got
359 * their in,out updated. */
360
361 /* 1. find temporaries in registers at
362 * the end of the block (put them in v) */
363 curi = 0;
364 s1 = b->s1;
365 s2 = b->s2;
366 hd = 0;
367 if (s1 && s1->id <= b->id)
368 hd = s1;
369 if (s2 && s2->id <= b->id)
370 if (!hd || s2->id >= hd->id)
371 hd = s2;
372 if (hd) {
373 /* back-edge */
374 bszero(v);
375 hd->gen->t[0] |= T.rglob; /* don't spill registers */
376 for (k=0; k<2; k++) {
377 n = k == 0 ? T.ngpr : T.nfpr;
378 bscopy(u, b->out);
379 bsinter(u, mask[k]);
380 bscopy(w, u);
381 bsinter(u, hd->gen);
382 bsdiff(w, hd->gen);
383 if (bscount(u) < n) {
384 j = bscount(w); /* live through */
385 l = hd->nlive[k];
386 limit(w, n - (l - j), 0);
387 bsunion(u, w);
388 } else
389 limit(u, n, 0);
390 bsunion(v, u);
391 }
392 } else if (s1) {
393 /* avoid reloading temporaries
394 * in the middle of loops */
395 bszero(v);
396 liveon(w, b, s1);
397 merge(v, b, w, s1);
398 if (s2) {
399 liveon(u, b, s2);
400 merge(v, b, u, s2);
401 bsinter(w, u);
402 }
403 limit2(v, 0, 0, w);
404 } else {
405 bscopy(v, b->out);
406 if (rtype(b->jmp.arg) == RCall)
407 v->t[0] |= T.retregs(b->jmp.arg, 0);
408 }
409 for (t=Tmp0; bsiter(b->out, &t); t++)
410 if (!bshas(v, t))
411 slot(t);
412 bscopy(b->out, v);
413
414 /* 2. process the block instructions */
415 if (rtype(b->jmp.arg) == RTmp) {
416 t = b->jmp.arg.val;
417 assert(KBASE(tmp[t].cls) == 0);
418 lvarg[0] = bshas(v, t);
419 bsset(v, t);
420 bscopy(u, v);
421 limit2(v, 0, 0, NULL);
422 if (!bshas(v, t)) {
423 if (!lvarg[0])
424 bsclr(u, t);
425 b->jmp.arg = slot(t);
426 }
427 reloads(u, v);
428 }
429 curi = &insb[NIns];
430 for (i=&b->ins[b->nins]; i!=b->ins;) {
431 i--;
432 if (regcpy(i)) {
433 i = dopm(b, i, v);
434 continue;
435 }
436 bszero(w);
437 if (!req(i->to, R)) {
438 assert(rtype(i->to) == RTmp);
439 t = i->to.val;
440 if (bshas(v, t))
441 bsclr(v, t);
442 else {
443 /* make sure we have a reg
444 * for the result */
445 assert(t >= Tmp0 && "dead reg");
446 bsset(v, t);
447 bsset(w, t);
448 }
449 }
450 j = T.memargs(i->op);
451 for (n=0; n<2; n++)
452 if (rtype(i->arg[n]) == RMem)
453 j--;
454 for (n=0; n<2; n++)
455 switch (rtype(i->arg[n])) {
456 case RMem:
457 t = i->arg[n].val;
458 m = &fn->mem[t];
459 if (rtype(m->base) == RTmp) {
460 bsset(v, m->base.val);
461 bsset(w, m->base.val);
462 }
463 if (rtype(m->index) == RTmp) {
464 bsset(v, m->index.val);
465 bsset(w, m->index.val);
466 }
467 break;
468 case RTmp:
469 t = i->arg[n].val;
470 lvarg[n] = bshas(v, t);
471 bsset(v, t);
472 if (j-- <= 0)
473 bsset(w, t);
474 break;
475 }
476 bscopy(u, v);
477 limit2(v, 0, 0, w);
478 for (n=0; n<2; n++)
479 if (rtype(i->arg[n]) == RTmp) {
480 t = i->arg[n].val;
481 if (!bshas(v, t)) {
482 /* do not reload if the
483 * argument is dead
484 */
485 if (!lvarg[n])
486 bsclr(u, t);
487 i->arg[n] = slot(t);
488 }
489 }
490 reloads(u, v);
491 if (!req(i->to, R)) {
492 t = i->to.val;
493 store(i->to, tmp[t].slot);
494 if (t >= Tmp0)
495 /* in case i->to was a
496 * dead temporary */
497 bsclr(v, t);
498 }
499 emiti(*i);
500 r = v->t[0]; /* Tmp0 is NBit */
501 if (r)
502 sethint(v, r);
503 }
504 if (b == fn->start)
505 assert(v->t[0] == (T.rglob | fn->reg));
506 else
507 assert(v->t[0] == T.rglob);
508
509 for (p=b->phi; p; p=p->link) {
510 assert(rtype(p->to) == RTmp);
511 t = p->to.val;
512 if (bshas(v, t)) {
513 bsclr(v, t);
514 store(p->to, tmp[t].slot);
515 } else if (bshas(b->in, t))
516 /* only if the phi is live */
517 p->to = slot(p->to.val);
518 }
519 bscopy(b->in, v);
520 b->nins = &insb[NIns] - curi;
521 idup(&b->ins, curi, b->nins);
522 }
523
524 /* align the locals to a 16 byte boundary */
525 /* specific to NAlign == 3 */
526 slot8 += slot8 & 3;
527 fn->slot += slot8;
528
529 if (debug['S']) {
530 fprintf(stderr, "\n> Block information:\n");
531 for (b=fn->start; b; b=b->link) {
532 fprintf(stderr, "\t%-10s (% 5d) ", b->name, b->loop);
533 dumpts(b->out, fn->tmp, stderr);
534 }
535 fprintf(stderr, "\n> After spilling:\n");
536 printfn(fn, stderr);
537 }
538}
539