1#include "all.h"
2#include <limits.h>
3
4/* For x86_64, do the following:
5 *
6 * - check that constants are used only in
7 * places allowed
8 * - ensure immediates always fit in 32b
9 * - expose machine register contraints
10 * on instructions like division.
11 * - implement fast locals (the streak of
12 * constant allocX in the first basic block)
13 * - recognize complex addressing modes
14 *
15 * Invariant: the use counts that are used
16 * in sel() must be sound. This
17 * is not so trivial, maybe the
18 * dce should be moved out...
19 */
20
21typedef struct ANum ANum;
22
23struct ANum {
24 char n, l, r;
25 Ins *i;
26};
27
28static int amatch(Addr *, Ref, int, ANum *, Fn *);
29
30static int
31noimm(Ref r, Fn *fn)
32{
33 int64_t val;
34
35 if (rtype(r) != RCon)
36 return 0;
37 switch (fn->con[r.val].type) {
38 case CAddr:
39 /* we only support the 'small'
40 * code model of the ABI, this
41 * means that we can always
42 * address data with 32bits
43 */
44 return 0;
45 case CBits:
46 val = fn->con[r.val].bits.i;
47 return (val < INT32_MIN || val > INT32_MAX);
48 default:
49 die("invalid constant");
50 }
51}
52
53static int
54rslot(Ref r, Fn *fn)
55{
56 if (rtype(r) != RTmp)
57 return -1;
58 return fn->tmp[r.val].slot;
59}
60
61static void
62fixarg(Ref *r, int k, Ins *i, Fn *fn)
63{
64 char buf[32];
65 Addr a, *m;
66 Ref r0, r1;
67 int s, n, op;
68
69 r1 = r0 = *r;
70 s = rslot(r0, fn);
71 op = i ? i->op : Ocopy;
72 if (KBASE(k) == 1 && rtype(r0) == RCon) {
73 /* load floating points from memory
74 * slots, they can't be used as
75 * immediates
76 */
77 r1 = MEM(fn->nmem);
78 vgrow(&fn->mem, ++fn->nmem);
79 memset(&a, 0, sizeof a);
80 a.offset.type = CAddr;
81 a.offset.local = 1;
82 n = gasstash(&fn->con[r0.val].bits, KWIDE(k) ? 8 : 4);
83 sprintf(buf, "fp%d", n);
84 a.offset.label = intern(buf);
85 fn->mem[fn->nmem-1] = a;
86 }
87 else if (op != Ocopy && k == Kl && noimm(r0, fn)) {
88 /* load constants that do not fit in
89 * a 32bit signed integer into a
90 * long temporary
91 */
92 r1 = newtmp("isel", Kl, fn);
93 emit(Ocopy, Kl, r1, r0, R);
94 }
95 else if (s != -1) {
96 /* load fast locals' addresses into
97 * temporaries right before the
98 * instruction
99 */
100 r1 = newtmp("isel", Kl, fn);
101 emit(Oaddr, Kl, r1, SLOT(s), R);
102 }
103 else if (!(isstore(op) && r == &i->arg[1])
104 && !isload(op) && op != Ocall && rtype(r0) == RCon
105 && fn->con[r0.val].type == CAddr) {
106 /* apple as does not support 32-bit
107 * absolute addressing, use a rip-
108 * relative leaq instead
109 */
110 r1 = newtmp("isel", Kl, fn);
111 emit(Oaddr, Kl, r1, r0, R);
112 }
113 else if (rtype(r0) == RMem) {
114 /* eliminate memory operands of
115 * the form $foo(%rip, ...)
116 */
117 m = &fn->mem[r0.val];
118 if (req(m->base, R))
119 if (m->offset.type == CAddr) {
120 r0 = newtmp("isel", Kl, fn);
121 emit(Oaddr, Kl, r0, newcon(&m->offset, fn), R);
122 m->offset.type = CUndef;
123 m->base = r0;
124 }
125 }
126 *r = r1;
127}
128
129static void
130seladdr(Ref *r, ANum *an, Fn *fn)
131{
132 Addr a;
133 Ref r0;
134
135 r0 = *r;
136 if (rtype(r0) == RTmp) {
137 memset(&a, 0, sizeof a);
138 if (!amatch(&a, r0, an[r0.val].n, an, fn))
139 return;
140 if (!req(a.base, R))
141 if (a.offset.type == CAddr) {
142 /* apple as does not support
143 * $foo(%r0, %r1, M); try to
144 * rewrite it or bail out if
145 * impossible
146 */
147 if (!req(a.index, R) || rtype(a.base) != RTmp)
148 return;
149 else {
150 a.index = a.base;
151 a.scale = 1;
152 a.base = R;
153 }
154 }
155 chuse(r0, -1, fn);
156 vgrow(&fn->mem, ++fn->nmem);
157 fn->mem[fn->nmem-1] = a;
158 chuse(a.base, +1, fn);
159 chuse(a.index, +1, fn);
160 *r = MEM(fn->nmem-1);
161 }
162}
163
164static int
165cmpswap(Ref arg[2], int op)
166{
167 switch (op) {
168 case NCmpI+Cflt:
169 case NCmpI+Cfle:
170 return 1;
171 case NCmpI+Cfgt:
172 case NCmpI+Cfge:
173 return 0;
174 }
175 return rtype(arg[0]) == RCon;
176}
177
178static void
179selcmp(Ref arg[2], int k, int swap, Fn *fn)
180{
181 Ref r;
182 Ins *icmp;
183
184 if (swap) {
185 r = arg[1];
186 arg[1] = arg[0];
187 arg[0] = r;
188 }
189 emit(Oxcmp, k, R, arg[1], arg[0]);
190 icmp = curi;
191 if (rtype(arg[0]) == RCon) {
192 assert(k != Kw);
193 icmp->arg[1] = newtmp("isel", k, fn);
194 emit(Ocopy, k, icmp->arg[1], arg[0], R);
195 fixarg(&curi->arg[0], k, curi, fn);
196 }
197 fixarg(&icmp->arg[0], k, icmp, fn);
198 fixarg(&icmp->arg[1], k, icmp, fn);
199}
200
201static void
202sel(Ins i, ANum *an, Fn *fn)
203{
204 Ref r0, r1, tmp[7];
205 int x, j, k, kc, sh, swap;
206 Ins *i0, *i1;
207
208 if (rtype(i.to) == RTmp)
209 if (!isreg(i.to) && !isreg(i.arg[0]) && !isreg(i.arg[1]))
210 if (fn->tmp[i.to.val].nuse == 0) {
211 chuse(i.arg[0], -1, fn);
212 chuse(i.arg[1], -1, fn);
213 return;
214 }
215 i0 = curi;
216 k = i.cls;
217 switch (i.op) {
218 case Odiv:
219 case Orem:
220 case Oudiv:
221 case Ourem:
222 if (KBASE(k) == 1)
223 goto Emit;
224 if (i.op == Odiv || i.op == Oudiv)
225 r0 = TMP(RAX), r1 = TMP(RDX);
226 else
227 r0 = TMP(RDX), r1 = TMP(RAX);
228 emit(Ocopy, k, i.to, r0, R);
229 emit(Ocopy, k, R, r1, R);
230 if (rtype(i.arg[1]) == RCon) {
231 /* immediates not allowed for
232 * divisions in x86
233 */
234 r0 = newtmp("isel", k, fn);
235 } else
236 r0 = i.arg[1];
237 if (fn->tmp[r0.val].slot != -1)
238 err("unlikely argument %%%s in %s",
239 fn->tmp[r0.val].name, optab[i.op].name);
240 if (i.op == Odiv || i.op == Orem) {
241 emit(Oxidiv, k, R, r0, R);
242 emit(Osign, k, TMP(RDX), TMP(RAX), R);
243 } else {
244 emit(Oxdiv, k, R, r0, R);
245 emit(Ocopy, k, TMP(RDX), CON_Z, R);
246 }
247 emit(Ocopy, k, TMP(RAX), i.arg[0], R);
248 fixarg(&curi->arg[0], k, curi, fn);
249 if (rtype(i.arg[1]) == RCon)
250 emit(Ocopy, k, r0, i.arg[1], R);
251 break;
252 case Osar:
253 case Oshr:
254 case Oshl:
255 r0 = i.arg[1];
256 if (rtype(r0) == RCon)
257 goto Emit;
258 if (fn->tmp[r0.val].slot != -1)
259 err("unlikely argument %%%s in %s",
260 fn->tmp[r0.val].name, optab[i.op].name);
261 i.arg[1] = TMP(RCX);
262 emit(Ocopy, Kw, R, TMP(RCX), R);
263 emiti(i);
264 i1 = curi;
265 emit(Ocopy, Kw, TMP(RCX), r0, R);
266 fixarg(&i1->arg[0], argcls(&i, 0), i1, fn);
267 break;
268 case Ouwtof:
269 r0 = newtmp("utof", Kl, fn);
270 emit(Osltof, k, i.to, r0, R);
271 emit(Oextuw, Kl, r0, i.arg[0], R);
272 fixarg(&curi->arg[0], k, curi, fn);
273 break;
274 case Oultof:
275 /* %mask =l and %arg.0, 1
276 %isbig =l shr %arg.0, 63
277 %divided =l shr %arg.0, %isbig
278 %or =l or %mask, %divided
279 %float =d sltof %or
280 %cast =l cast %float
281 %addend =l shl %isbig, 52
282 %sum =l add %cast, %addend
283 %result =d cast %sum
284 */
285 r0 = newtmp("utof", k, fn);
286 if (k == Ks)
287 kc = Kw, sh = 23;
288 else
289 kc = Kl, sh = 52;
290 for (j=0; j<4; j++)
291 tmp[j] = newtmp("utof", Kl, fn);
292 for (; j<7; j++)
293 tmp[j] = newtmp("utof", kc, fn);
294 emit(Ocast, k, i.to, tmp[6], R);
295 emit(Oadd, kc, tmp[6], tmp[4], tmp[5]);
296 emit(Oshl, kc, tmp[5], tmp[1], getcon(sh, fn));
297 emit(Ocast, kc, tmp[4], r0, R);
298 emit(Osltof, k, r0, tmp[3], R);
299 emit(Oor, Kl, tmp[3], tmp[0], tmp[2]);
300 emit(Oshr, Kl, tmp[2], i.arg[0], tmp[1]);
301 sel(*curi++, 0, fn);
302 emit(Oshr, Kl, tmp[1], i.arg[0], getcon(63, fn));
303 fixarg(&curi->arg[0], Kl, curi, fn);
304 emit(Oand, Kl, tmp[0], i.arg[0], getcon(1, fn));
305 fixarg(&curi->arg[0], Kl, curi, fn);
306 break;
307 case Ostoui:
308 i.op = Ostosi;
309 kc = Ks;
310 tmp[4] = getcon(0xdf000000, fn);
311 goto Oftoui;
312 case Odtoui:
313 i.op = Odtosi;
314 kc = Kd;
315 tmp[4] = getcon(0xc3e0000000000000, fn);
316 Oftoui:
317 if (k == Kw)
318 goto Emit;
319 r0 = newtmp("ftou", kc, fn);
320 for (j=0; j<4; j++)
321 tmp[j] = newtmp("ftou", Kl, fn);
322 emit(Oor, Kl, i.to, tmp[0], tmp[3]);
323 emit(Oand, Kl, tmp[3], tmp[2], tmp[1]);
324 emit(i.op, Kl, tmp[2], r0, R);
325 emit(Oadd, kc, r0, tmp[4], i.arg[0]);
326 i1 = curi; /* fixarg() can change curi */
327 fixarg(&i1->arg[0], kc, i1, fn);
328 fixarg(&i1->arg[1], kc, i1, fn);
329 emit(Osar, Kl, tmp[1], tmp[0], getcon(63, fn));
330 emit(i.op, Kl, tmp[0], i.arg[0], R);
331 fixarg(&curi->arg[0], Kl, curi, fn);
332 break;
333 case Onop:
334 break;
335 case Ostored:
336 case Ostores:
337 case Ostorel:
338 case Ostorew:
339 case Ostoreh:
340 case Ostoreb:
341 if (rtype(i.arg[0]) == RCon) {
342 if (i.op == Ostored)
343 i.op = Ostorel;
344 if (i.op == Ostores)
345 i.op = Ostorew;
346 }
347 seladdr(&i.arg[1], an, fn);
348 goto Emit;
349 case_Oload:
350 seladdr(&i.arg[0], an, fn);
351 goto Emit;
352 case Ocall:
353 case Osalloc:
354 case Ocopy:
355 case Oadd:
356 case Osub:
357 case Oneg:
358 case Omul:
359 case Oand:
360 case Oor:
361 case Oxor:
362 case Oxtest:
363 case Ostosi:
364 case Odtosi:
365 case Oswtof:
366 case Osltof:
367 case Oexts:
368 case Otruncd:
369 case Ocast:
370 case_OExt:
371Emit:
372 emiti(i);
373 i1 = curi; /* fixarg() can change curi */
374 fixarg(&i1->arg[0], argcls(&i, 0), i1, fn);
375 fixarg(&i1->arg[1], argcls(&i, 1), i1, fn);
376 break;
377 case Oalloc4:
378 case Oalloc8:
379 case Oalloc16:
380 salloc(i.to, i.arg[0], fn);
381 break;
382 default:
383 if (isext(i.op))
384 goto case_OExt;
385 if (isload(i.op))
386 goto case_Oload;
387 if (iscmp(i.op, &kc, &x)) {
388 switch (x) {
389 case NCmpI+Cfeq:
390 /* zf is set when operands are
391 * unordered, so we may have to
392 * check pf
393 */
394 r0 = newtmp("isel", Kw, fn);
395 r1 = newtmp("isel", Kw, fn);
396 emit(Oand, Kw, i.to, r0, r1);
397 emit(Oflagfo, k, r1, R, R);
398 i.to = r0;
399 break;
400 case NCmpI+Cfne:
401 r0 = newtmp("isel", Kw, fn);
402 r1 = newtmp("isel", Kw, fn);
403 emit(Oor, Kw, i.to, r0, r1);
404 emit(Oflagfuo, k, r1, R, R);
405 i.to = r0;
406 break;
407 }
408 swap = cmpswap(i.arg, x);
409 if (swap)
410 x = cmpop(x);
411 emit(Oflag+x, k, i.to, R, R);
412 selcmp(i.arg, kc, swap, fn);
413 break;
414 }
415 die("unknown instruction %s", optab[i.op].name);
416 }
417
418 while (i0>curi && --i0) {
419 assert(rslot(i0->arg[0], fn) == -1);
420 assert(rslot(i0->arg[1], fn) == -1);
421 }
422}
423
424static Ins *
425flagi(Ins *i0, Ins *i)
426{
427 while (i>i0) {
428 i--;
429 if (amd64_op[i->op].zflag)
430 return i;
431 if (amd64_op[i->op].lflag)
432 continue;
433 return 0;
434 }
435 return 0;
436}
437
438static void
439seljmp(Blk *b, Fn *fn)
440{
441 Ref r;
442 int c, k, swap;
443 Ins *fi;
444 Tmp *t;
445
446 if (b->jmp.type == Jret0 || b->jmp.type == Jjmp)
447 return;
448 assert(b->jmp.type == Jjnz);
449 r = b->jmp.arg;
450 t = &fn->tmp[r.val];
451 b->jmp.arg = R;
452 assert(rtype(r) == RTmp);
453 if (b->s1 == b->s2) {
454 chuse(r, -1, fn);
455 b->jmp.type = Jjmp;
456 b->s2 = 0;
457 return;
458 }
459 fi = flagi(b->ins, &b->ins[b->nins]);
460 if (!fi || !req(fi->to, r)) {
461 selcmp((Ref[2]){r, CON_Z}, Kw, 0, fn); /* todo, long jnz */
462 b->jmp.type = Jjf + Cine;
463 }
464 else if (iscmp(fi->op, &k, &c)
465 && c != NCmpI+Cfeq /* see sel() */
466 && c != NCmpI+Cfne) {
467 swap = cmpswap(fi->arg, c);
468 if (swap)
469 c = cmpop(c);
470 if (t->nuse == 1) {
471 selcmp(fi->arg, k, swap, fn);
472 *fi = (Ins){.op = Onop};
473 }
474 b->jmp.type = Jjf + c;
475 }
476 else if (fi->op == Oand && t->nuse == 1
477 && (rtype(fi->arg[0]) == RTmp ||
478 rtype(fi->arg[1]) == RTmp)) {
479 fi->op = Oxtest;
480 fi->to = R;
481 b->jmp.type = Jjf + Cine;
482 if (rtype(fi->arg[1]) == RCon) {
483 r = fi->arg[1];
484 fi->arg[1] = fi->arg[0];
485 fi->arg[0] = r;
486 }
487 }
488 else {
489 /* since flags are not tracked in liveness,
490 * the result of the flag-setting instruction
491 * has to be marked as live
492 */
493 if (t->nuse == 1)
494 emit(Ocopy, Kw, R, r, R);
495 b->jmp.type = Jjf + Cine;
496 }
497}
498
499static int
500aref(Ref r, ANum *ai)
501{
502 switch (rtype(r)) {
503 case RCon:
504 return 2;
505 case RTmp:
506 return ai[r.val].n;
507 default:
508 die("constant or temporary expected");
509 }
510}
511
512static int
513ascale(Ref r, Con *con)
514{
515 int64_t n;
516
517 if (rtype(r) != RCon)
518 return 0;
519 if (con[r.val].type != CBits)
520 return 0;
521 n = con[r.val].bits.i;
522 return n == 1 || n == 2 || n == 4 || n == 8;
523}
524
525static void
526anumber(ANum *ai, Blk *b, Con *con)
527{
528 /* This should be made obsolete by a proper
529 * reassoc pass.
530 *
531 * Rules:
532 *
533 * RTmp(_) -> 0 tmp
534 * ( RTmp(_) -> 1 slot )
535 * RCon(_) -> 2 con
536 * 0 * 2 -> 3 s * i (when constant is 1,2,4,8)
537 */
538 static char add[10][10] = {
539 [2] [4] = 4, [4] [2] = 4,
540 [2] [6] = 6, [6] [2] = 6,
541 [2] [7] = 7, [7] [2] = 7,
542 [0] [2] = 4, [2] [0] = 4, /* 4: o + b */
543 [0] [0] = 5, /* 5: b + s * i */
544 [0] [3] = 5, [3] [0] = 5,
545 [2] [3] = 6, [3] [2] = 6, /* 6: o + s * i */
546 [2] [5] = 7, [5] [2] = 7, /* 7: o + b + s * i */
547 [0] [6] = 7, [6] [0] = 7,
548 [4] [3] = 7, [3] [4] = 7,
549 };
550 int a, a1, a2, n1, n2, t1, t2;
551 Ins *i;
552
553 for (i=b->ins; i<&b->ins[b->nins]; i++) {
554 if (rtype(i->to) == RTmp)
555 ai[i->to.val].i = i;
556 if (i->op != Oadd && i->op != Omul)
557 continue;
558 a1 = aref(i->arg[0], ai);
559 a2 = aref(i->arg[1], ai);
560 t1 = a1 != 1 && a1 != 2;
561 t2 = a2 != 1 && a2 != 2;
562 if (i->op == Oadd) {
563 a = add[n1 = a1][n2 = a2];
564 if (t1 && a < add[0][a2])
565 a = add[n1 = 0][n2 = a2];
566 if (t2 && a < add[a1][0])
567 a = add[n1 = a1][n2 = 0];
568 if (t1 && t2 && a < add[0][0])
569 a = add[n1 = 0][n2 = 0];
570 } else {
571 n1 = n2 = a = 0;
572 if (ascale(i->arg[0], con) && t2)
573 a = 3, n1 = 2, n2 = 0;
574 if (t1 && ascale(i->arg[1], con))
575 a = 3, n1 = 0, n2 = 2;
576 }
577 ai[i->to.val].n = a;
578 ai[i->to.val].l = n1;
579 ai[i->to.val].r = n2;
580 }
581}
582
583static int
584amatch(Addr *a, Ref r, int n, ANum *ai, Fn *fn)
585{
586 Ins *i;
587 int nl, nr, t, s;
588 Ref al, ar;
589
590 if (rtype(r) == RCon) {
591 if (!addcon(&a->offset, &fn->con[r.val]))
592 err("unlikely sum of $%s and $%s",
593 str(a->offset.label), str(fn->con[r.val].label));
594 return 1;
595 }
596 assert(rtype(r) == RTmp);
597 i = ai[r.val].i;
598 nl = ai[r.val].l;
599 nr = ai[r.val].r;
600 if (i) {
601 if (nl > nr) {
602 al = i->arg[1];
603 ar = i->arg[0];
604 t = nl, nl = nr, nr = t;
605 } else {
606 al = i->arg[0];
607 ar = i->arg[1];
608 }
609 }
610 switch (n) {
611 case 3: /* s * i */
612 a->index = al;
613 a->scale = fn->con[ar.val].bits.i;
614 return 0;
615 case 5: /* b + s * i */
616 switch (nr) {
617 case 0:
618 if (fn->tmp[ar.val].slot != -1) {
619 al = i->arg[1];
620 ar = i->arg[0];
621 }
622 a->index = ar;
623 a->scale = 1;
624 break;
625 case 3:
626 amatch(a, ar, nr, ai, fn);
627 break;
628 }
629 r = al;
630 /* fall through */
631 case 0:
632 s = fn->tmp[r.val].slot;
633 if (s != -1)
634 r = SLOT(s);
635 a->base = r;
636 return n || s != -1;
637 case 2: /* constants */
638 case 4: /* o + b */
639 case 6: /* o + s * i */
640 case 7: /* o + b + s * i */
641 amatch(a, ar, nr, ai, fn);
642 amatch(a, al, nl, ai, fn);
643 return 1;
644 default:
645 die("unreachable");
646 }
647}
648
649/* instruction selection
650 * requires use counts (as given by parsing)
651 */
652void
653amd64_isel(Fn *fn)
654{
655 Blk *b, **sb;
656 Ins *i;
657 Phi *p;
658 uint a;
659 int n, al;
660 int64_t sz;
661 ANum *ainfo;
662
663 /* assign slots to fast allocs */
664 b = fn->start;
665 /* specific to NAlign == 3 */ /* or change n=4 and sz /= 4 below */
666 for (al=Oalloc, n=4; al<=Oalloc1; al++, n*=2)
667 for (i=b->ins; i<&b->ins[b->nins]; i++)
668 if (i->op == al) {
669 if (rtype(i->arg[0]) != RCon)
670 break;
671 sz = fn->con[i->arg[0].val].bits.i;
672 if (sz < 0 || sz >= INT_MAX-15)
673 err("invalid alloc size %"PRId64, sz);
674 sz = (sz + n-1) & -n;
675 sz /= 4;
676 if (sz > INT_MAX - fn->slot)
677 die("alloc too large");
678 fn->tmp[i->to.val].slot = fn->slot;
679 fn->slot += sz;
680 *i = (Ins){.op = Onop};
681 }
682
683 /* process basic blocks */
684 n = fn->ntmp;
685 ainfo = emalloc(n * sizeof ainfo[0]);
686 for (b=fn->start; b; b=b->link) {
687 curi = &insb[NIns];
688 for (sb=(Blk*[3]){b->s1, b->s2, 0}; *sb; sb++)
689 for (p=(*sb)->phi; p; p=p->link) {
690 for (a=0; p->blk[a] != b; a++)
691 assert(a+1 < p->narg);
692 fixarg(&p->arg[a], p->cls, 0, fn);
693 }
694 memset(ainfo, 0, n * sizeof ainfo[0]);
695 anumber(ainfo, b, fn->con);
696 seljmp(b, fn);
697 for (i=&b->ins[b->nins]; i!=b->ins;)
698 sel(*--i, ainfo, fn);
699 b->nins = &insb[NIns] - curi;
700 idup(&b->ins, curi, b->nins);
701 }
702 free(ainfo);
703
704 if (debug['I']) {
705 fprintf(stderr, "\n> After instruction selection:\n");
706 printfn(fn, stderr);
707 }
708}
709