1#include "all.h"
2
3#ifdef TEST_PMOV
4 #undef assert
5 #define assert(x) assert_test(#x, x)
6#endif
7
8typedef struct RMap RMap;
9
10struct RMap {
11 int t[Tmp0];
12 int r[Tmp0];
13 int w[Tmp0]; /* wait list, for unmatched hints */
14 BSet b[1];
15 int n;
16};
17
18static bits regu; /* registers used */
19static Tmp *tmp; /* function temporaries */
20static Mem *mem; /* function mem references */
21static struct {
22 Ref src, dst;
23 int cls;
24} pm[Tmp0]; /* parallel move constructed */
25static int npm; /* size of pm */
26static int loop; /* current loop level */
27
28static uint stmov; /* stats: added moves */
29static uint stblk; /* stats: added blocks */
30
31static int *
32hint(int t)
33{
34 return &tmp[phicls(t, tmp)].hint.r;
35}
36
37static void
38sethint(int t, int r)
39{
40 Tmp *p;
41
42 p = &tmp[phicls(t, tmp)];
43 if (p->hint.r == -1 || p->hint.w > loop) {
44 p->hint.r = r;
45 p->hint.w = loop;
46 tmp[t].visit = -1;
47 }
48}
49
50static void
51rcopy(RMap *ma, RMap *mb)
52{
53 memcpy(ma->t, mb->t, sizeof ma->t);
54 memcpy(ma->r, mb->r, sizeof ma->r);
55 memcpy(ma->w, mb->w, sizeof ma->w);
56 bscopy(ma->b, mb->b);
57 ma->n = mb->n;
58}
59
60static int
61rfind(RMap *m, int t)
62{
63 int i;
64
65 for (i=0; i<m->n; i++)
66 if (m->t[i] == t)
67 return m->r[i];
68 return -1;
69}
70
71static Ref
72rref(RMap *m, int t)
73{
74 int r, s;
75
76 r = rfind(m, t);
77 if (r == -1) {
78 s = tmp[t].slot;
79 assert(s != -1 && "should have spilled");
80 return SLOT(s);
81 } else
82 return TMP(r);
83}
84
85static void
86radd(RMap *m, int t, int r)
87{
88 assert((t >= Tmp0 || t == r) && "invalid temporary");
89 assert(((T.gpr0 <= r && r < T.gpr0 + T.ngpr)
90 || (T.fpr0 <= r && r < T.fpr0 + T.nfpr))
91 && "invalid register");
92 assert(!bshas(m->b, t) && "temporary has mapping");
93 assert(!bshas(m->b, r) && "register already allocated");
94 assert(m->n <= T.ngpr+T.nfpr && "too many mappings");
95 bsset(m->b, t);
96 bsset(m->b, r);
97 m->t[m->n] = t;
98 m->r[m->n] = r;
99 m->n++;
100 regu |= BIT(r);
101}
102
103static Ref
104ralloctry(RMap *m, int t, int try)
105{
106 bits regs;
107 int h, r, r0, r1;
108
109 if (t < Tmp0) {
110 assert(bshas(m->b, t));
111 return TMP(t);
112 }
113 if (bshas(m->b, t)) {
114 r = rfind(m, t);
115 assert(r != -1);
116 return TMP(r);
117 }
118 r = tmp[t].visit;
119 if (r == -1 || bshas(m->b, r))
120 r = *hint(t);
121 if (r == -1 || bshas(m->b, r)) {
122 if (try)
123 return R;
124 regs = tmp[phicls(t, tmp)].hint.m;
125 regs |= m->b->t[0];
126 if (KBASE(tmp[t].cls) == 0) {
127 r0 = T.gpr0;
128 r1 = r0 + T.ngpr;
129 } else {
130 r0 = T.fpr0;
131 r1 = r0 + T.nfpr;
132 }
133 for (r=r0; r<r1; r++)
134 if (!(regs & BIT(r)))
135 goto Found;
136 for (r=r0; r<r1; r++)
137 if (!bshas(m->b, r))
138 goto Found;
139 die("no more regs");
140 }
141Found:
142 radd(m, t, r);
143 sethint(t, r);
144 tmp[t].visit = r;
145 h = *hint(t);
146 if (h != -1 && h != r)
147 m->w[h] = t;
148 return TMP(r);
149}
150
151static inline Ref
152ralloc(RMap *m, int t)
153{
154 return ralloctry(m, t, 0);
155}
156
157static int
158rfree(RMap *m, int t)
159{
160 int i, r;
161
162 assert(t >= Tmp0 || !(BIT(t) & T.rglob));
163 if (!bshas(m->b, t))
164 return -1;
165 for (i=0; m->t[i] != t; i++)
166 assert(i+1 < m->n);
167 r = m->r[i];
168 bsclr(m->b, t);
169 bsclr(m->b, r);
170 m->n--;
171 memmove(&m->t[i], &m->t[i+1], (m->n-i) * sizeof m->t[0]);
172 memmove(&m->r[i], &m->r[i+1], (m->n-i) * sizeof m->r[0]);
173 assert(t >= Tmp0 || t == r);
174 return r;
175}
176
177static void
178mdump(RMap *m)
179{
180 int i;
181
182 for (i=0; i<m->n; i++)
183 if (m->t[i] >= Tmp0)
184 fprintf(stderr, " (%s, R%d)",
185 tmp[m->t[i]].name,
186 m->r[i]);
187 fprintf(stderr, "\n");
188}
189
190static void
191pmadd(Ref src, Ref dst, int k)
192{
193 if (npm == Tmp0)
194 die("cannot have more moves than registers");
195 pm[npm].src = src;
196 pm[npm].dst = dst;
197 pm[npm].cls = k;
198 npm++;
199}
200
201enum PMStat { ToMove, Moving, Moved };
202
203static int
204pmrec(enum PMStat *status, int i, int *k)
205{
206 int j, c;
207
208 /* note, this routine might emit
209 * too many large instructions
210 */
211 if (req(pm[i].src, pm[i].dst)) {
212 status[i] = Moved;
213 return -1;
214 }
215 assert(KBASE(pm[i].cls) == KBASE(*k));
216 assert((Kw|Kl) == Kl && (Ks|Kd) == Kd);
217 *k |= pm[i].cls;
218 for (j=0; j<npm; j++)
219 if (req(pm[j].dst, pm[i].src))
220 break;
221 switch (j == npm ? Moved : status[j]) {
222 case Moving:
223 c = j; /* start of cycle */
224 emit(Oswap, *k, R, pm[i].src, pm[i].dst);
225 break;
226 case ToMove:
227 status[i] = Moving;
228 c = pmrec(status, j, k);
229 if (c == i) {
230 c = -1; /* end of cycle */
231 break;
232 }
233 if (c != -1) {
234 emit(Oswap, *k, R, pm[i].src, pm[i].dst);
235 break;
236 }
237 /* fall through */
238 case Moved:
239 c = -1;
240 emit(Ocopy, pm[i].cls, pm[i].dst, pm[i].src, R);
241 break;
242 }
243 status[i] = Moved;
244 return c;
245}
246
247static void
248pmgen()
249{
250 int i;
251 enum PMStat *status;
252
253 status = alloc(npm * sizeof status[0]);
254 assert(!npm || status[npm-1] == ToMove);
255 for (i=0; i<npm; i++)
256 if (status[i] == ToMove)
257 pmrec(status, i, (int[]){pm[i].cls});
258}
259
260static void
261move(int r, Ref to, RMap *m)
262{
263 int n, t, r1;
264
265 r1 = req(to, R) ? -1 : rfree(m, to.val);
266 if (bshas(m->b, r)) {
267 /* r is used and not by to */
268 assert(r1 != r);
269 for (n=0; m->r[n] != r; n++)
270 assert(n+1 < m->n);
271 t = m->t[n];
272 rfree(m, t);
273 bsset(m->b, r);
274 ralloc(m, t);
275 bsclr(m->b, r);
276 }
277 t = req(to, R) ? r : to.val;
278 radd(m, t, r);
279}
280
281static int
282regcpy(Ins *i)
283{
284 return i->op == Ocopy && isreg(i->arg[0]);
285}
286
287static Ins *
288dopm(Blk *b, Ins *i, RMap *m)
289{
290 RMap m0;
291 int n, r, r1, t, s;
292 Ins *i1, *ip;
293 bits def;
294
295 m0 = *m; /* okay since we don't use m0.b */
296 m0.b->t = 0;
297 i1 = ++i;
298 do {
299 i--;
300 move(i->arg[0].val, i->to, m);
301 } while (i != b->ins && regcpy(i-1));
302 assert(m0.n <= m->n);
303 if (i != b->ins && (i-1)->op == Ocall) {
304 def = T.retregs((i-1)->arg[1], 0) | T.rglob;
305 for (r=0; T.rsave[r]>=0; r++)
306 if (!(BIT(T.rsave[r]) & def))
307 move(T.rsave[r], R, m);
308 }
309 for (npm=0, n=0; n<m->n; n++) {
310 t = m->t[n];
311 s = tmp[t].slot;
312 r1 = m->r[n];
313 r = rfind(&m0, t);
314 if (r != -1)
315 pmadd(TMP(r1), TMP(r), tmp[t].cls);
316 else if (s != -1)
317 pmadd(TMP(r1), SLOT(s), tmp[t].cls);
318 }
319 for (ip=i; ip<i1; ip++) {
320 if (!req(ip->to, R))
321 rfree(m, ip->to.val);
322 r = ip->arg[0].val;
323 if (rfind(m, r) == -1)
324 radd(m, r, r);
325 }
326 pmgen();
327 return i;
328}
329
330static int
331prio1(Ref r1, Ref r2)
332{
333 /* trivial heuristic to begin with,
334 * later we can use the distance to
335 * the definition instruction
336 */
337 (void) r2;
338 return *hint(r1.val) != -1;
339}
340
341static void
342insert(Ref *r, Ref **rs, int p)
343{
344 int i;
345
346 rs[i = p] = r;
347 while (i-- > 0 && prio1(*r, *rs[i])) {
348 rs[i+1] = rs[i];
349 rs[i] = r;
350 }
351}
352
353static void
354doblk(Blk *b, RMap *cur)
355{
356 int t, x, r, rf, rt, nr;
357 bits rs;
358 Ins *i, *i1;
359 Mem *m;
360 Ref *ra[4];
361
362 for (r=0; bsiter(b->out, &r) && r<Tmp0; r++)
363 radd(cur, r, r);
364 if (rtype(b->jmp.arg) == RTmp)
365 b->jmp.arg = ralloc(cur, b->jmp.arg.val);
366 curi = &insb[NIns];
367 for (i1=&b->ins[b->nins]; i1!=b->ins;) {
368 emiti(*--i1);
369 i = curi;
370 rf = -1;
371 switch (i->op) {
372 case Ocall:
373 rs = T.argregs(i->arg[1], 0) | T.rglob;
374 for (r=0; T.rsave[r]>=0; r++)
375 if (!(BIT(T.rsave[r]) & rs))
376 rfree(cur, T.rsave[r]);
377 break;
378 case Ocopy:
379 if (regcpy(i)) {
380 curi++;
381 i1 = dopm(b, i1, cur);
382 stmov += i+1 - curi;
383 continue;
384 }
385 if (isreg(i->to))
386 if (rtype(i->arg[0]) == RTmp)
387 sethint(i->arg[0].val, i->to.val);
388 /* fall through */
389 default:
390 if (!req(i->to, R)) {
391 assert(rtype(i->to) == RTmp);
392 r = i->to.val;
393 if (r < Tmp0 && (BIT(r) & T.rglob))
394 break;
395 rf = rfree(cur, r);
396 if (rf == -1) {
397 assert(!isreg(i->to));
398 curi++;
399 continue;
400 }
401 i->to = TMP(rf);
402 }
403 break;
404 }
405 for (x=0, nr=0; x<2; x++)
406 switch (rtype(i->arg[x])) {
407 case RMem:
408 m = &mem[i->arg[x].val];
409 if (rtype(m->base) == RTmp)
410 insert(&m->base, ra, nr++);
411 if (rtype(m->index) == RTmp)
412 insert(&m->index, ra, nr++);
413 break;
414 case RTmp:
415 insert(&i->arg[x], ra, nr++);
416 break;
417 }
418 for (r=0; r<nr; r++)
419 *ra[r] = ralloc(cur, ra[r]->val);
420 if (i->op == Ocopy && req(i->to, i->arg[0]))
421 curi++;
422
423 /* try to change the register of a hinted
424 * temporary if rf is available */
425 if (rf != -1 && (t = cur->w[rf]) != 0)
426 if (!bshas(cur->b, rf) && *hint(t) == rf
427 && (rt = rfree(cur, t)) != -1) {
428 tmp[t].visit = -1;
429 ralloc(cur, t);
430 assert(bshas(cur->b, rf));
431 emit(Ocopy, tmp[t].cls, TMP(rt), TMP(rf), R);
432 stmov += 1;
433 cur->w[rf] = 0;
434 for (r=0; r<nr; r++)
435 if (req(*ra[r], TMP(rt)))
436 *ra[r] = TMP(rf);
437 /* one could iterate this logic with
438 * the newly freed rt, but in this case
439 * the above loop must be changed */
440 }
441 }
442 b->nins = &insb[NIns] - curi;
443 idup(&b->ins, curi, b->nins);
444}
445
446/* qsort() comparison function to peel
447 * loop nests from inside out */
448static int
449carve(const void *a, const void *b)
450{
451 Blk *ba, *bb;
452
453 /* todo, evaluate if this order is really
454 * better than the simple postorder */
455 ba = *(Blk**)a;
456 bb = *(Blk**)b;
457 if (ba->loop == bb->loop)
458 return ba->id > bb->id ? -1 : ba->id < bb->id;
459 return ba->loop > bb->loop ? -1 : +1;
460}
461
462/* comparison function to order temporaries
463 * for allocation at the end of blocks */
464static int
465prio2(int t1, int t2)
466{
467 if ((tmp[t1].visit ^ tmp[t2].visit) < 0) /* != signs */
468 return tmp[t1].visit != -1 ? +1 : -1;
469 if ((*hint(t1) ^ *hint(t2)) < 0)
470 return *hint(t1) != -1 ? +1 : -1;
471 return tmp[t1].cost - tmp[t2].cost;
472}
473
474/* register allocation
475 * depends on rpo, phi, cost, (and obviously spill)
476 */
477void
478rega(Fn *fn)
479{
480 int j, t, r, x, rl[Tmp0];
481 Blk *b, *b1, *s, ***ps, *blist, **blk, **bp;
482 RMap *end, *beg, cur, old, *m;
483 Ins *i;
484 Phi *p;
485 uint u, n;
486 Ref src, dst;
487
488 /* 1. setup */
489 stmov = 0;
490 stblk = 0;
491 regu = 0;
492 tmp = fn->tmp;
493 mem = fn->mem;
494 blk = alloc(fn->nblk * sizeof blk[0]);
495 end = alloc(fn->nblk * sizeof end[0]);
496 beg = alloc(fn->nblk * sizeof beg[0]);
497 for (n=0; n<fn->nblk; n++) {
498 bsinit(end[n].b, fn->ntmp);
499 bsinit(beg[n].b, fn->ntmp);
500 }
501 bsinit(cur.b, fn->ntmp);
502 bsinit(old.b, fn->ntmp);
503
504 loop = INT_MAX;
505 for (t=0; t<fn->ntmp; t++) {
506 tmp[t].hint.r = t < Tmp0 ? t : -1;
507 tmp[t].hint.w = loop;
508 tmp[t].visit = -1;
509 }
510 for (bp=blk, b=fn->start; b; b=b->link)
511 *bp++ = b;
512 qsort(blk, fn->nblk, sizeof blk[0], carve);
513 for (b=fn->start, i=b->ins; i<&b->ins[b->nins]; i++)
514 if (i->op != Ocopy || !isreg(i->arg[0]))
515 break;
516 else {
517 assert(rtype(i->to) == RTmp);
518 sethint(i->to.val, i->arg[0].val);
519 }
520
521 /* 2. assign registers */
522 for (bp=blk; bp<&blk[fn->nblk]; bp++) {
523 b = *bp;
524 n = b->id;
525 loop = b->loop;
526 cur.n = 0;
527 bszero(cur.b);
528 memset(cur.w, 0, sizeof cur.w);
529 for (x=0, t=Tmp0; bsiter(b->out, &t); t++) {
530 j = x++;
531 rl[j] = t;
532 while (j-- > 0 && prio2(t, rl[j]) > 0) {
533 rl[j+1] = rl[j];
534 rl[j] = t;
535 }
536 }
537 for (j=0; j<x; j++)
538 ralloctry(&cur, rl[j], 1);
539 for (j=0; j<x; j++)
540 ralloc(&cur, rl[j]);
541 rcopy(&end[n], &cur);
542 doblk(b, &cur);
543 bscopy(b->in, cur.b);
544 for (p=b->phi; p; p=p->link)
545 if (rtype(p->to) == RTmp)
546 bsclr(b->in, p->to.val);
547 rcopy(&beg[n], &cur);
548 }
549
550 /* 3. emit copies shared by multiple edges
551 * to the same block */
552 for (s=fn->start; s; s=s->link) {
553 if (s->npred <= 1)
554 continue;
555 m = &beg[s->id];
556
557 /* rl maps a register that is live at the
558 * beginning of s to the one used in all
559 * predecessors (if any, -1 otherwise) */
560 memset(rl, 0, sizeof rl);
561
562 /* to find the register of a phi in a
563 * predecessor, we have to find the
564 * corresponding argument */
565 for (p=s->phi; p; p=p->link) {
566 if (rtype(p->to) != RTmp
567 || (r=rfind(m, p->to.val)) == -1)
568 continue;
569 for (u=0; u<p->narg; u++) {
570 b = p->blk[u];
571 src = p->arg[u];
572 if (rtype(src) != RTmp)
573 continue;
574 x = rfind(&end[b->id], src.val);
575 if (x == -1) /* spilled */
576 continue;
577 rl[r] = (!rl[r] || rl[r] == x) ? x : -1;
578 }
579 if (rl[r] == 0)
580 rl[r] = -1;
581 }
582
583 /* process non-phis temporaries */
584 for (j=0; j<m->n; j++) {
585 t = m->t[j];
586 r = m->r[j];
587 if (rl[r] || t < Tmp0 /* todo, remove this */)
588 continue;
589 for (bp=s->pred; bp<&s->pred[s->npred]; bp++) {
590 x = rfind(&end[(*bp)->id], t);
591 if (x == -1) /* spilled */
592 continue;
593 rl[r] = (!rl[r] || rl[r] == x) ? x : -1;
594 }
595 if (rl[r] == 0)
596 rl[r] = -1;
597 }
598
599 npm = 0;
600 for (j=0; j<m->n; j++) {
601 t = m->t[j];
602 r = m->r[j];
603 x = rl[r];
604 assert(x != 0 || t < Tmp0 /* todo, ditto */);
605 if (x > 0 && !bshas(m->b, x)) {
606 pmadd(TMP(x), TMP(r), tmp[t].cls);
607 m->r[j] = x;
608 bsset(m->b, x);
609 }
610 }
611 curi = &insb[NIns];
612 pmgen();
613 j = &insb[NIns] - curi;
614 if (j == 0)
615 continue;
616 stmov += j;
617 s->nins += j;
618 i = alloc(s->nins * sizeof(Ins));
619 icpy(icpy(i, curi, j), s->ins, s->nins-j);
620 s->ins = i;
621 }
622
623 if (debug['R']) {
624 fprintf(stderr, "\n> Register mappings:\n");
625 for (n=0; n<fn->nblk; n++) {
626 b = fn->rpo[n];
627 fprintf(stderr, "\t%-10s beg", b->name);
628 mdump(&beg[n]);
629 fprintf(stderr, "\t end");
630 mdump(&end[n]);
631 }
632 fprintf(stderr, "\n");
633 }
634
635 /* 4. emit remaining copies in new blocks */
636 blist = 0;
637 for (b=fn->start;; b=b->link) {
638 ps = (Blk**[3]){&b->s1, &b->s2, (Blk*[1]){0}};
639 for (; (s=**ps); ps++) {
640 npm = 0;
641 for (p=s->phi; p; p=p->link) {
642 dst = p->to;
643 assert(rtype(dst)==RSlot || rtype(dst)==RTmp);
644 if (rtype(dst) == RTmp) {
645 r = rfind(&beg[s->id], dst.val);
646 if (r == -1)
647 continue;
648 dst = TMP(r);
649 }
650 for (u=0; p->blk[u]!=b; u++)
651 assert(u+1 < p->narg);
652 src = p->arg[u];
653 if (rtype(src) == RTmp)
654 src = rref(&end[b->id], src.val);
655 pmadd(src, dst, p->cls);
656 }
657 for (t=Tmp0; bsiter(s->in, &t); t++) {
658 src = rref(&end[b->id], t);
659 dst = rref(&beg[s->id], t);
660 pmadd(src, dst, tmp[t].cls);
661 }
662 curi = &insb[NIns];
663 pmgen();
664 if (curi == &insb[NIns])
665 continue;
666 b1 = blknew();
667 b1->loop = (b->loop+s->loop) / 2;
668 b1->link = blist;
669 blist = b1;
670 fn->nblk++;
671 (void)!snprintf(b1->name, sizeof(b1->name), "%s_%s", b->name, s->name);
672 b1->nins = &insb[NIns] - curi;
673 stmov += b1->nins;
674 stblk += 1;
675 idup(&b1->ins, curi, b1->nins);
676 b1->jmp.type = Jjmp;
677 b1->s1 = s;
678 **ps = b1;
679 }
680 if (!b->link) {
681 b->link = blist;
682 break;
683 }
684 }
685 for (b=fn->start; b; b=b->link)
686 b->phi = 0;
687 fn->reg = regu;
688
689 if (debug['R']) {
690 fprintf(stderr, "\n> Register allocation statistics:\n");
691 fprintf(stderr, "\tnew moves: %d\n", stmov);
692 fprintf(stderr, "\tnew blocks: %d\n", stblk);
693 fprintf(stderr, "\n> After register allocation:\n");
694 printfn(fn, stderr);
695 }
696}
697