1 | #include "all.h" |
2 | |
3 | static void |
4 | aggreg(Blk *hd, Blk *b) |
5 | { |
6 | int k; |
7 | |
8 | /* aggregate looping information at |
9 | * loop headers */ |
10 | bsunion(hd->gen, b->gen); |
11 | for (k=0; k<2; k++) |
12 | if (b->nlive[k] > hd->nlive[k]) |
13 | hd->nlive[k] = b->nlive[k]; |
14 | } |
15 | |
16 | static void |
17 | tmpuse(Ref r, int use, int loop, Fn *fn) |
18 | { |
19 | Mem *m; |
20 | Tmp *t; |
21 | |
22 | if (rtype(r) == RMem) { |
23 | m = &fn->mem[r.val]; |
24 | tmpuse(m->base, 1, loop, fn); |
25 | tmpuse(m->index, 1, loop, fn); |
26 | } |
27 | else if (rtype(r) == RTmp && r.val >= Tmp0) { |
28 | t = &fn->tmp[r.val]; |
29 | t->nuse += use; |
30 | t->ndef += !use; |
31 | t->cost += loop; |
32 | } |
33 | } |
34 | |
35 | /* evaluate spill costs of temporaries, |
36 | * this also fills usage information |
37 | * requires rpo, preds |
38 | */ |
39 | void |
40 | fillcost(Fn *fn) |
41 | { |
42 | int n; |
43 | uint a; |
44 | Blk *b; |
45 | Ins *i; |
46 | Tmp *t; |
47 | Phi *p; |
48 | |
49 | loopiter(fn, aggreg); |
50 | if (debug['S']) { |
51 | fprintf(stderr, "\n> Loop information:\n" ); |
52 | for (b=fn->start; b; b=b->link) { |
53 | for (a=0; a<b->npred; ++a) |
54 | if (b->id <= b->pred[a]->id) |
55 | break; |
56 | if (a != b->npred) { |
57 | fprintf(stderr, "\t%-10s" , b->name); |
58 | fprintf(stderr, " (% 3d " , b->nlive[0]); |
59 | fprintf(stderr, "% 3d) " , b->nlive[1]); |
60 | dumpts(b->gen, fn->tmp, stderr); |
61 | } |
62 | } |
63 | } |
64 | for (t=fn->tmp; t-fn->tmp < fn->ntmp; t++) { |
65 | t->cost = t-fn->tmp < Tmp0 ? UINT_MAX : 0; |
66 | t->nuse = 0; |
67 | t->ndef = 0; |
68 | } |
69 | for (b=fn->start; b; b=b->link) { |
70 | for (p=b->phi; p; p=p->link) { |
71 | t = &fn->tmp[p->to.val]; |
72 | tmpuse(p->to, 0, 0, fn); |
73 | for (a=0; a<p->narg; a++) { |
74 | n = p->blk[a]->loop; |
75 | t->cost += n; |
76 | tmpuse(p->arg[a], 1, n, fn); |
77 | } |
78 | } |
79 | n = b->loop; |
80 | for (i=b->ins; i<&b->ins[b->nins]; i++) { |
81 | tmpuse(i->to, 0, n, fn); |
82 | tmpuse(i->arg[0], 1, n, fn); |
83 | tmpuse(i->arg[1], 1, n, fn); |
84 | } |
85 | tmpuse(b->jmp.arg, 1, n, fn); |
86 | } |
87 | if (debug['S']) { |
88 | fprintf(stderr, "\n> Spill costs:\n" ); |
89 | for (n=Tmp0; n<fn->ntmp; n++) |
90 | fprintf(stderr, "\t%-10s %d\n" , |
91 | fn->tmp[n].name, |
92 | fn->tmp[n].cost); |
93 | fprintf(stderr, "\n" ); |
94 | } |
95 | } |
96 | |
97 | static BSet *fst; /* temps to prioritize in registers (for tcmp1) */ |
98 | static Tmp *tmp; /* current temporaries (for tcmpX) */ |
99 | static int ntmp; /* current # of temps (for limit) */ |
100 | static int locs; /* stack size used by locals */ |
101 | static int slot4; /* next slot of 4 bytes */ |
102 | static int slot8; /* ditto, 8 bytes */ |
103 | static BSet mask[2][1]; /* class masks */ |
104 | |
105 | static int |
106 | tcmp0(const void *pa, const void *pb) |
107 | { |
108 | uint ca, cb; |
109 | |
110 | ca = tmp[*(int *)pa].cost; |
111 | cb = tmp[*(int *)pb].cost; |
112 | return (cb < ca) ? -1 : (cb > ca); |
113 | } |
114 | |
115 | static int |
116 | tcmp1(const void *pa, const void *pb) |
117 | { |
118 | int c; |
119 | |
120 | c = bshas(fst, *(int *)pb) - bshas(fst, *(int *)pa); |
121 | return c ? c : tcmp0(pa, pb); |
122 | } |
123 | |
124 | static Ref |
125 | slot(int t) |
126 | { |
127 | int s; |
128 | |
129 | assert(t >= Tmp0 && "cannot spill register" ); |
130 | s = tmp[t].slot; |
131 | if (s == -1) { |
132 | /* specific to NAlign == 3 */ |
133 | /* nice logic to pack stack slots |
134 | * on demand, there can be only |
135 | * one hole and slot4 points to it |
136 | * |
137 | * invariant: slot4 <= slot8 |
138 | */ |
139 | if (KWIDE(tmp[t].cls)) { |
140 | s = slot8; |
141 | if (slot4 == slot8) |
142 | slot4 += 2; |
143 | slot8 += 2; |
144 | } else { |
145 | s = slot4; |
146 | if (slot4 == slot8) { |
147 | slot8 += 2; |
148 | slot4 += 1; |
149 | } else |
150 | slot4 = slot8; |
151 | } |
152 | s += locs; |
153 | tmp[t].slot = s; |
154 | } |
155 | return SLOT(s); |
156 | } |
157 | |
158 | /* restricts b to hold at most k |
159 | * temporaries, preferring those |
160 | * present in f (if given), then |
161 | * those with the largest spill |
162 | * cost |
163 | */ |
164 | static void |
165 | limit(BSet *b, int k, BSet *f) |
166 | { |
167 | static int *tarr, maxt; |
168 | int i, t, nt; |
169 | |
170 | nt = bscount(b); |
171 | if (nt <= k) |
172 | return; |
173 | if (nt > maxt) { |
174 | free(tarr); |
175 | tarr = emalloc(nt * sizeof tarr[0]); |
176 | maxt = nt; |
177 | } |
178 | for (i=0, t=0; bsiter(b, &t); t++) { |
179 | bsclr(b, t); |
180 | tarr[i++] = t; |
181 | } |
182 | if (nt > 1) { |
183 | if (!f) |
184 | qsort(tarr, nt, sizeof tarr[0], tcmp0); |
185 | else { |
186 | fst = f; |
187 | qsort(tarr, nt, sizeof tarr[0], tcmp1); |
188 | } |
189 | } |
190 | for (i=0; i<k && i<nt; i++) |
191 | bsset(b, tarr[i]); |
192 | for (; i<nt; i++) |
193 | slot(tarr[i]); |
194 | } |
195 | |
196 | /* spills temporaries to fit the |
197 | * target limits using the same |
198 | * preferences as limit(); assumes |
199 | * that k1 gprs and k2 fprs are |
200 | * currently in use |
201 | */ |
202 | static void |
203 | limit2(BSet *b1, int k1, int k2, BSet *f) |
204 | { |
205 | BSet b2[1]; |
206 | |
207 | bsinit(b2, ntmp); /* todo, free those */ |
208 | bscopy(b2, b1); |
209 | bsinter(b1, mask[0]); |
210 | bsinter(b2, mask[1]); |
211 | limit(b1, T.ngpr - k1, f); |
212 | limit(b2, T.nfpr - k2, f); |
213 | bsunion(b1, b2); |
214 | } |
215 | |
216 | static void |
217 | sethint(BSet *u, bits r) |
218 | { |
219 | int t; |
220 | |
221 | for (t=Tmp0; bsiter(u, &t); t++) |
222 | tmp[phicls(t, tmp)].hint.m |= r; |
223 | } |
224 | |
225 | /* reloads temporaries in u that are |
226 | * not in v from their slots |
227 | */ |
228 | static void |
229 | reloads(BSet *u, BSet *v) |
230 | { |
231 | int t; |
232 | |
233 | for (t=Tmp0; bsiter(u, &t); t++) |
234 | if (!bshas(v, t)) |
235 | emit(Oload, tmp[t].cls, TMP(t), slot(t), R); |
236 | } |
237 | |
238 | static void |
239 | store(Ref r, int s) |
240 | { |
241 | if (s != -1) |
242 | emit(Ostorew + tmp[r.val].cls, 0, R, r, SLOT(s)); |
243 | } |
244 | |
245 | static int |
246 | regcpy(Ins *i) |
247 | { |
248 | return i->op == Ocopy && isreg(i->arg[0]); |
249 | } |
250 | |
251 | static Ins * |
252 | dopm(Blk *b, Ins *i, BSet *v) |
253 | { |
254 | int n, t; |
255 | BSet u[1]; |
256 | Ins *i1; |
257 | bits r; |
258 | |
259 | bsinit(u, ntmp); /* todo, free those */ |
260 | /* consecutive copies from |
261 | * registers need to be handled |
262 | * as one large instruction |
263 | * |
264 | * fixme: there is an assumption |
265 | * that calls are always followed |
266 | * by copy instructions here, this |
267 | * might not be true if previous |
268 | * passes change |
269 | */ |
270 | i1 = ++i; |
271 | do { |
272 | i--; |
273 | t = i->to.val; |
274 | if (!req(i->to, R)) |
275 | if (bshas(v, t)) { |
276 | bsclr(v, t); |
277 | store(i->to, tmp[t].slot); |
278 | } |
279 | bsset(v, i->arg[0].val); |
280 | } while (i != b->ins && regcpy(i-1)); |
281 | bscopy(u, v); |
282 | if (i != b->ins && (i-1)->op == Ocall) { |
283 | v->t[0] &= ~T.retregs((i-1)->arg[1], 0); |
284 | limit2(v, T.nrsave[0], T.nrsave[1], 0); |
285 | for (n=0, r=0; T.rsave[n]>=0; n++) |
286 | r |= BIT(T.rsave[n]); |
287 | v->t[0] |= T.argregs((i-1)->arg[1], 0); |
288 | } else { |
289 | limit2(v, 0, 0, 0); |
290 | r = v->t[0]; |
291 | } |
292 | sethint(v, r); |
293 | reloads(u, v); |
294 | do |
295 | emiti(*--i1); |
296 | while (i1 != i); |
297 | return i; |
298 | } |
299 | |
300 | static void |
301 | merge(BSet *u, Blk *bu, BSet *v, Blk *bv) |
302 | { |
303 | int t; |
304 | |
305 | if (bu->loop <= bv->loop) |
306 | bsunion(u, v); |
307 | else |
308 | for (t=0; bsiter(v, &t); t++) |
309 | if (tmp[t].slot == -1) |
310 | bsset(u, t); |
311 | } |
312 | |
313 | /* spill code insertion |
314 | * requires spill costs, rpo, liveness |
315 | * |
316 | * Note: this will replace liveness |
317 | * information (in, out) with temporaries |
318 | * that must be in registers at block |
319 | * borders |
320 | * |
321 | * Be careful with: |
322 | * - Ocopy instructions to ensure register |
323 | * constraints |
324 | */ |
325 | void |
326 | spill(Fn *fn) |
327 | { |
328 | Blk *b, *s1, *s2, *hd, **bp; |
329 | int j, l, t, k, lvarg[2]; |
330 | uint n; |
331 | BSet u[1], v[1], w[1]; |
332 | Ins *i; |
333 | Phi *p; |
334 | Mem *m; |
335 | bits r; |
336 | |
337 | tmp = fn->tmp; |
338 | ntmp = fn->ntmp; |
339 | bsinit(u, ntmp); |
340 | bsinit(v, ntmp); |
341 | bsinit(w, ntmp); |
342 | bsinit(mask[0], ntmp); |
343 | bsinit(mask[1], ntmp); |
344 | locs = fn->slot; |
345 | slot4 = 0; |
346 | slot8 = 0; |
347 | for (t=0; t<ntmp; t++) { |
348 | k = 0; |
349 | if (t >= T.fpr0 && t < T.fpr0 + T.nfpr) |
350 | k = 1; |
351 | if (t >= Tmp0) |
352 | k = KBASE(tmp[t].cls); |
353 | bsset(mask[k], t); |
354 | } |
355 | |
356 | for (bp=&fn->rpo[fn->nblk]; bp!=fn->rpo;) { |
357 | b = *--bp; |
358 | /* invariant: all blocks with bigger rpo got |
359 | * their in,out updated. */ |
360 | |
361 | /* 1. find temporaries in registers at |
362 | * the end of the block (put them in v) */ |
363 | curi = 0; |
364 | s1 = b->s1; |
365 | s2 = b->s2; |
366 | hd = 0; |
367 | if (s1 && s1->id <= b->id) |
368 | hd = s1; |
369 | if (s2 && s2->id <= b->id) |
370 | if (!hd || s2->id >= hd->id) |
371 | hd = s2; |
372 | if (hd) { |
373 | /* back-edge */ |
374 | bszero(v); |
375 | hd->gen->t[0] |= T.rglob; /* don't spill registers */ |
376 | for (k=0; k<2; k++) { |
377 | n = k == 0 ? T.ngpr : T.nfpr; |
378 | bscopy(u, b->out); |
379 | bsinter(u, mask[k]); |
380 | bscopy(w, u); |
381 | bsinter(u, hd->gen); |
382 | bsdiff(w, hd->gen); |
383 | if (bscount(u) < n) { |
384 | j = bscount(w); /* live through */ |
385 | l = hd->nlive[k]; |
386 | limit(w, n - (l - j), 0); |
387 | bsunion(u, w); |
388 | } else |
389 | limit(u, n, 0); |
390 | bsunion(v, u); |
391 | } |
392 | } else if (s1) { |
393 | /* avoid reloading temporaries |
394 | * in the middle of loops */ |
395 | bszero(v); |
396 | liveon(w, b, s1); |
397 | merge(v, b, w, s1); |
398 | if (s2) { |
399 | liveon(u, b, s2); |
400 | merge(v, b, u, s2); |
401 | bsinter(w, u); |
402 | } |
403 | limit2(v, 0, 0, w); |
404 | } else { |
405 | bscopy(v, b->out); |
406 | if (rtype(b->jmp.arg) == RCall) |
407 | v->t[0] |= T.retregs(b->jmp.arg, 0); |
408 | } |
409 | for (t=Tmp0; bsiter(b->out, &t); t++) |
410 | if (!bshas(v, t)) |
411 | slot(t); |
412 | bscopy(b->out, v); |
413 | |
414 | /* 2. process the block instructions */ |
415 | if (rtype(b->jmp.arg) == RTmp) { |
416 | t = b->jmp.arg.val; |
417 | assert(KBASE(tmp[t].cls) == 0); |
418 | lvarg[0] = bshas(v, t); |
419 | bsset(v, t); |
420 | bscopy(u, v); |
421 | limit2(v, 0, 0, NULL); |
422 | if (!bshas(v, t)) { |
423 | if (!lvarg[0]) |
424 | bsclr(u, t); |
425 | b->jmp.arg = slot(t); |
426 | } |
427 | reloads(u, v); |
428 | } |
429 | curi = &insb[NIns]; |
430 | for (i=&b->ins[b->nins]; i!=b->ins;) { |
431 | i--; |
432 | if (regcpy(i)) { |
433 | i = dopm(b, i, v); |
434 | continue; |
435 | } |
436 | bszero(w); |
437 | if (!req(i->to, R)) { |
438 | assert(rtype(i->to) == RTmp); |
439 | t = i->to.val; |
440 | if (bshas(v, t)) |
441 | bsclr(v, t); |
442 | else { |
443 | /* make sure we have a reg |
444 | * for the result */ |
445 | assert(t >= Tmp0 && "dead reg" ); |
446 | bsset(v, t); |
447 | bsset(w, t); |
448 | } |
449 | } |
450 | j = T.memargs(i->op); |
451 | for (n=0; n<2; n++) |
452 | if (rtype(i->arg[n]) == RMem) |
453 | j--; |
454 | for (n=0; n<2; n++) |
455 | switch (rtype(i->arg[n])) { |
456 | case RMem: |
457 | t = i->arg[n].val; |
458 | m = &fn->mem[t]; |
459 | if (rtype(m->base) == RTmp) { |
460 | bsset(v, m->base.val); |
461 | bsset(w, m->base.val); |
462 | } |
463 | if (rtype(m->index) == RTmp) { |
464 | bsset(v, m->index.val); |
465 | bsset(w, m->index.val); |
466 | } |
467 | break; |
468 | case RTmp: |
469 | t = i->arg[n].val; |
470 | lvarg[n] = bshas(v, t); |
471 | bsset(v, t); |
472 | if (j-- <= 0) |
473 | bsset(w, t); |
474 | break; |
475 | } |
476 | bscopy(u, v); |
477 | limit2(v, 0, 0, w); |
478 | for (n=0; n<2; n++) |
479 | if (rtype(i->arg[n]) == RTmp) { |
480 | t = i->arg[n].val; |
481 | if (!bshas(v, t)) { |
482 | /* do not reload if the |
483 | * argument is dead |
484 | */ |
485 | if (!lvarg[n]) |
486 | bsclr(u, t); |
487 | i->arg[n] = slot(t); |
488 | } |
489 | } |
490 | reloads(u, v); |
491 | if (!req(i->to, R)) { |
492 | t = i->to.val; |
493 | store(i->to, tmp[t].slot); |
494 | if (t >= Tmp0) |
495 | /* in case i->to was a |
496 | * dead temporary */ |
497 | bsclr(v, t); |
498 | } |
499 | emiti(*i); |
500 | r = v->t[0]; /* Tmp0 is NBit */ |
501 | if (r) |
502 | sethint(v, r); |
503 | } |
504 | if (b == fn->start) |
505 | assert(v->t[0] == (T.rglob | fn->reg)); |
506 | else |
507 | assert(v->t[0] == T.rglob); |
508 | |
509 | for (p=b->phi; p; p=p->link) { |
510 | assert(rtype(p->to) == RTmp); |
511 | t = p->to.val; |
512 | if (bshas(v, t)) { |
513 | bsclr(v, t); |
514 | store(p->to, tmp[t].slot); |
515 | } else if (bshas(b->in, t)) |
516 | /* only if the phi is live */ |
517 | p->to = slot(p->to.val); |
518 | } |
519 | bscopy(b->in, v); |
520 | b->nins = &insb[NIns] - curi; |
521 | idup(&b->ins, curi, b->nins); |
522 | } |
523 | |
524 | /* align the locals to a 16 byte boundary */ |
525 | /* specific to NAlign == 3 */ |
526 | slot8 += slot8 & 3; |
527 | fn->slot += slot8; |
528 | |
529 | if (debug['S']) { |
530 | fprintf(stderr, "\n> Block information:\n" ); |
531 | for (b=fn->start; b; b=b->link) { |
532 | fprintf(stderr, "\t%-10s (% 5d) " , b->name, b->loop); |
533 | dumpts(b->out, fn->tmp, stderr); |
534 | } |
535 | fprintf(stderr, "\n> After spilling:\n" ); |
536 | printfn(fn, stderr); |
537 | } |
538 | } |
539 | |