1 | /* |
2 | * Loongson Multimedia Instruction emulation helpers for QEMU. |
3 | * |
4 | * Copyright (c) 2011 Richard Henderson <rth@twiddle.net> |
5 | * |
6 | * This library is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Lesser General Public |
8 | * License as published by the Free Software Foundation; either |
9 | * version 2 of the License, or (at your option) any later version. |
10 | * |
11 | * This library is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * Lesser General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Lesser General Public |
17 | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
18 | */ |
19 | |
20 | #include "qemu/osdep.h" |
21 | #include "cpu.h" |
22 | #include "exec/helper-proto.h" |
23 | |
/*
 * If the byte ordering doesn't matter, i.e. all columns are treated
 * identically, then this union can be used directly. If byte ordering
 * does matter, we generally ignore dumping to memory.
 *
 * All members alias the same 64 bits: a value stored through 'd' can be
 * read back lane by lane through any of the array members. Which array
 * index maps to which bits of 'd' depends on the host byte order.
 */
typedef union {
    uint8_t ub[8];   /* eight unsigned 8-bit lanes */
    int8_t sb[8];    /* eight signed 8-bit lanes */
    uint16_t uh[4];  /* four unsigned 16-bit lanes */
    int16_t sh[4];   /* four signed 16-bit lanes */
    uint32_t uw[2];  /* two unsigned 32-bit lanes */
    int32_t sw[2];   /* two signed 32-bit lanes */
    uint64_t d;      /* the whole 64-bit value */
} LMIValue;
38 | |
/*
 * Some byte ordering issues can be mitigated by XORing in the following.
 *
 * BYTE_ORDER_XOR(N) expands to N when HOST_WORDS_BIGENDIAN is defined and
 * to 0 otherwise.  XORing the result into a lane index of an LMIValue
 * array member (N being the highest valid index) reverses the index on
 * big-endian hosts and leaves it unchanged on little-endian hosts.
 * The argument is parenthesized so that any expression can be passed.
 */
#ifdef HOST_WORDS_BIGENDIAN
# define BYTE_ORDER_XOR(N) (N)
#else
# define BYTE_ORDER_XOR(N) 0
#endif
45 | |
/*
 * Saturation helpers: clamp x to the range of the named lane type.
 *
 *   SATSB / SATSH / SATSW  clamp to the signed 8/16/32-bit range.
 *   SATUB / SATUH / SATUW  clamp only the upper bound of the unsigned
 *                          8/16/32-bit range; values below zero are
 *                          returned unchanged.
 *
 * Every use of the argument is parenthesized so that compound
 * expressions are clamped as a whole.  The argument is evaluated more
 * than once, so it must not have side effects.
 */
#define SATSB(x) ((x) < -0x80 ? -0x80 : (x) > 0x7f ? 0x7f : (x))
#define SATUB(x) ((x) > 0xff ? 0xff : (x))

#define SATSH(x) ((x) < -0x8000 ? -0x8000 : (x) > 0x7fff ? 0x7fff : (x))
#define SATUH(x) ((x) > 0xffff ? 0xffff : (x))

#define SATSW(x) \
    ((x) < -0x80000000ll ? -0x80000000ll : (x) > 0x7fffffff ? 0x7fffffff : (x))
#define SATUW(x) ((x) > 0xffffffffull ? 0xffffffffull : (x))
55 | |
56 | uint64_t helper_paddsb(uint64_t fs, uint64_t ft) |
57 | { |
58 | LMIValue vs, vt; |
59 | unsigned int i; |
60 | |
61 | vs.d = fs; |
62 | vt.d = ft; |
63 | for (i = 0; i < 8; ++i) { |
64 | int r = vs.sb[i] + vt.sb[i]; |
65 | vs.sb[i] = SATSB(r); |
66 | } |
67 | return vs.d; |
68 | } |
69 | |
70 | uint64_t helper_paddusb(uint64_t fs, uint64_t ft) |
71 | { |
72 | LMIValue vs, vt; |
73 | unsigned int i; |
74 | |
75 | vs.d = fs; |
76 | vt.d = ft; |
77 | for (i = 0; i < 8; ++i) { |
78 | int r = vs.ub[i] + vt.ub[i]; |
79 | vs.ub[i] = SATUB(r); |
80 | } |
81 | return vs.d; |
82 | } |
83 | |
84 | uint64_t helper_paddsh(uint64_t fs, uint64_t ft) |
85 | { |
86 | LMIValue vs, vt; |
87 | unsigned int i; |
88 | |
89 | vs.d = fs; |
90 | vt.d = ft; |
91 | for (i = 0; i < 4; ++i) { |
92 | int r = vs.sh[i] + vt.sh[i]; |
93 | vs.sh[i] = SATSH(r); |
94 | } |
95 | return vs.d; |
96 | } |
97 | |
98 | uint64_t helper_paddush(uint64_t fs, uint64_t ft) |
99 | { |
100 | LMIValue vs, vt; |
101 | unsigned int i; |
102 | |
103 | vs.d = fs; |
104 | vt.d = ft; |
105 | for (i = 0; i < 4; ++i) { |
106 | int r = vs.uh[i] + vt.uh[i]; |
107 | vs.uh[i] = SATUH(r); |
108 | } |
109 | return vs.d; |
110 | } |
111 | |
112 | uint64_t helper_paddb(uint64_t fs, uint64_t ft) |
113 | { |
114 | LMIValue vs, vt; |
115 | unsigned int i; |
116 | |
117 | vs.d = fs; |
118 | vt.d = ft; |
119 | for (i = 0; i < 8; ++i) { |
120 | vs.ub[i] += vt.ub[i]; |
121 | } |
122 | return vs.d; |
123 | } |
124 | |
125 | uint64_t helper_paddh(uint64_t fs, uint64_t ft) |
126 | { |
127 | LMIValue vs, vt; |
128 | unsigned int i; |
129 | |
130 | vs.d = fs; |
131 | vt.d = ft; |
132 | for (i = 0; i < 4; ++i) { |
133 | vs.uh[i] += vt.uh[i]; |
134 | } |
135 | return vs.d; |
136 | } |
137 | |
138 | uint64_t helper_paddw(uint64_t fs, uint64_t ft) |
139 | { |
140 | LMIValue vs, vt; |
141 | unsigned int i; |
142 | |
143 | vs.d = fs; |
144 | vt.d = ft; |
145 | for (i = 0; i < 2; ++i) { |
146 | vs.uw[i] += vt.uw[i]; |
147 | } |
148 | return vs.d; |
149 | } |
150 | |
151 | uint64_t helper_psubsb(uint64_t fs, uint64_t ft) |
152 | { |
153 | LMIValue vs, vt; |
154 | unsigned int i; |
155 | |
156 | vs.d = fs; |
157 | vt.d = ft; |
158 | for (i = 0; i < 8; ++i) { |
159 | int r = vs.sb[i] - vt.sb[i]; |
160 | vs.sb[i] = SATSB(r); |
161 | } |
162 | return vs.d; |
163 | } |
164 | |
/*
 * Lane-wise unsigned saturating subtract of eight bytes (fs - ft).
 *
 * A lane where the ft byte is larger than the fs byte yields 0 rather
 * than a wrapped-around value.  Lanes are extracted with shifts, so the
 * result does not depend on host byte order.
 */
uint64_t helper_psubusb(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int i;

    for (i = 0; i < 8; ++i) {
        unsigned int shift = i * 8;
        unsigned int a = (fs >> shift) & 0xff;
        unsigned int b = (ft >> shift) & 0xff;
        /* Clamp at zero: the difference of unsigned lanes cannot go negative. */
        unsigned int r = a > b ? a - b : 0;

        fd |= (uint64_t)r << shift;
    }
    return fd;
}
178 | |
179 | uint64_t helper_psubsh(uint64_t fs, uint64_t ft) |
180 | { |
181 | LMIValue vs, vt; |
182 | unsigned int i; |
183 | |
184 | vs.d = fs; |
185 | vt.d = ft; |
186 | for (i = 0; i < 4; ++i) { |
187 | int r = vs.sh[i] - vt.sh[i]; |
188 | vs.sh[i] = SATSH(r); |
189 | } |
190 | return vs.d; |
191 | } |
192 | |
/*
 * Lane-wise unsigned saturating subtract of four halfwords (fs - ft).
 *
 * A lane where the ft halfword is larger than the fs halfword yields 0
 * rather than a wrapped-around value.  Lanes are extracted with shifts,
 * so the result does not depend on host byte order.
 */
uint64_t helper_psubush(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int i;

    for (i = 0; i < 4; ++i) {
        unsigned int shift = i * 16;
        unsigned int a = (fs >> shift) & 0xffff;
        unsigned int b = (ft >> shift) & 0xffff;
        /* Clamp at zero: the difference of unsigned lanes cannot go negative. */
        unsigned int r = a > b ? a - b : 0;

        fd |= (uint64_t)r << shift;
    }
    return fd;
}
206 | |
207 | uint64_t helper_psubb(uint64_t fs, uint64_t ft) |
208 | { |
209 | LMIValue vs, vt; |
210 | unsigned int i; |
211 | |
212 | vs.d = fs; |
213 | vt.d = ft; |
214 | for (i = 0; i < 8; ++i) { |
215 | vs.ub[i] -= vt.ub[i]; |
216 | } |
217 | return vs.d; |
218 | } |
219 | |
220 | uint64_t helper_psubh(uint64_t fs, uint64_t ft) |
221 | { |
222 | LMIValue vs, vt; |
223 | unsigned int i; |
224 | |
225 | vs.d = fs; |
226 | vt.d = ft; |
227 | for (i = 0; i < 4; ++i) { |
228 | vs.uh[i] -= vt.uh[i]; |
229 | } |
230 | return vs.d; |
231 | } |
232 | |
233 | uint64_t helper_psubw(uint64_t fs, uint64_t ft) |
234 | { |
235 | LMIValue vs, vt; |
236 | unsigned int i; |
237 | |
238 | vs.d = fs; |
239 | vt.d = ft; |
240 | for (i = 0; i < 2; ++i) { |
241 | vs.uw[i] -= vt.uw[i]; |
242 | } |
243 | return vs.d; |
244 | } |
245 | |
246 | uint64_t helper_pshufh(uint64_t fs, uint64_t ft) |
247 | { |
248 | unsigned host = BYTE_ORDER_XOR(3); |
249 | LMIValue vd, vs; |
250 | unsigned i; |
251 | |
252 | vs.d = fs; |
253 | vd.d = 0; |
254 | for (i = 0; i < 4; i++, ft >>= 2) { |
255 | vd.uh[i ^ host] = vs.uh[(ft & 3) ^ host]; |
256 | } |
257 | return vd.d; |
258 | } |
259 | |
/*
 * Pack four signed 32-bit words into four signed 16-bit halfwords with
 * signed saturation.
 *
 * Result halfwords, from least to most significant, come from:
 * fs low word, fs high word, ft low word, ft high word.  Each word is
 * clamped to [-0x8000, 0x7fff] before being narrowed.
 *
 * The narrowed value is widened to uint64_t before shifting so that no
 * signed value is ever shifted into the sign bit.
 */
uint64_t helper_packsswh(uint64_t fs, uint64_t ft)
{
    const uint64_t src[2] = { fs, ft };
    uint64_t fd = 0;
    unsigned int i;

    for (i = 0; i < 4; ++i) {
        int32_t w = (int32_t)(src[i / 2] >> ((i % 2) * 32));

        if (w < -0x8000) {
            w = -0x8000;
        } else if (w > 0x7fff) {
            w = 0x7fff;
        }
        fd |= (uint64_t)(w & 0xffff) << (i * 16);
    }
    return fd;
}
283 | |
/*
 * Pack eight signed 16-bit halfwords into eight signed bytes with
 * signed saturation.
 *
 * Result bytes 0..3 (from the least significant) come from the four
 * halfwords of fs, bytes 4..7 from the four halfwords of ft.  Each
 * halfword is clamped to [-0x80, 0x7f] before being narrowed.
 */
uint64_t helper_packsshb(uint64_t fs, uint64_t ft)
{
    const uint64_t src[2] = { fs, ft };
    uint64_t packed = 0;
    unsigned int n;

    for (n = 0; n < 8; ++n) {
        int16_t half = src[n / 4] >> ((n % 4) * 16);

        half = SATSB(half);
        packed |= (uint64_t)(half & 0xff) << (n * 8);
    }
    return packed;
}
302 | |
/*
 * Pack eight 16-bit halfwords into eight bytes, clamping values above
 * 0xff to 0xff.
 *
 * Result bytes 0..3 (from the least significant) come from the four
 * halfwords of fs, bytes 4..7 from the four halfwords of ft.
 *
 * NOTE(review): each halfword is read as int16_t and SATUB() only clamps
 * the upper bound, so a negative halfword is not saturated; its low
 * byte is stored as-is.  Confirm against the ISA manual whether such
 * inputs should saturate (to 0 or to 0xff) instead.
 */
uint64_t helper_packushb(uint64_t fs, uint64_t ft)
{
    const uint64_t src[2] = { fs, ft };
    uint64_t packed = 0;
    unsigned int n;

    for (n = 0; n < 8; ++n) {
        int16_t half = src[n / 4] >> ((n % 4) * 16);

        half = SATUB(half);
        packed |= (uint64_t)(half & 0xff) << (n * 8);
    }
    return packed;
}
321 | |
/*
 * Combine the low 32-bit words of both operands: the low word of fs
 * becomes the low word of the result and the low word of ft becomes
 * the high word.
 */
uint64_t helper_punpcklwd(uint64_t fs, uint64_t ft)
{
    const uint64_t low_word = (uint32_t)fs;
    const uint64_t high_word = ft << 32;

    return high_word | low_word;
}
326 | |
/*
 * Combine the high 32-bit words of both operands: the high word of fs
 * becomes the low word of the result and the high word of ft stays in
 * the high word.
 */
uint64_t helper_punpckhwd(uint64_t fs, uint64_t ft)
{
    const uint64_t low_word = fs >> 32;
    const uint64_t high_word = ft & 0xffffffff00000000ull;

    return high_word | low_word;
}
331 | |
332 | uint64_t helper_punpcklhw(uint64_t fs, uint64_t ft) |
333 | { |
334 | unsigned host = BYTE_ORDER_XOR(3); |
335 | LMIValue vd, vs, vt; |
336 | |
337 | vs.d = fs; |
338 | vt.d = ft; |
339 | vd.uh[0 ^ host] = vs.uh[0 ^ host]; |
340 | vd.uh[1 ^ host] = vt.uh[0 ^ host]; |
341 | vd.uh[2 ^ host] = vs.uh[1 ^ host]; |
342 | vd.uh[3 ^ host] = vt.uh[1 ^ host]; |
343 | |
344 | return vd.d; |
345 | } |
346 | |
347 | uint64_t helper_punpckhhw(uint64_t fs, uint64_t ft) |
348 | { |
349 | unsigned host = BYTE_ORDER_XOR(3); |
350 | LMIValue vd, vs, vt; |
351 | |
352 | vs.d = fs; |
353 | vt.d = ft; |
354 | vd.uh[0 ^ host] = vs.uh[2 ^ host]; |
355 | vd.uh[1 ^ host] = vt.uh[2 ^ host]; |
356 | vd.uh[2 ^ host] = vs.uh[3 ^ host]; |
357 | vd.uh[3 ^ host] = vt.uh[3 ^ host]; |
358 | |
359 | return vd.d; |
360 | } |
361 | |
362 | uint64_t helper_punpcklbh(uint64_t fs, uint64_t ft) |
363 | { |
364 | unsigned host = BYTE_ORDER_XOR(7); |
365 | LMIValue vd, vs, vt; |
366 | |
367 | vs.d = fs; |
368 | vt.d = ft; |
369 | vd.ub[0 ^ host] = vs.ub[0 ^ host]; |
370 | vd.ub[1 ^ host] = vt.ub[0 ^ host]; |
371 | vd.ub[2 ^ host] = vs.ub[1 ^ host]; |
372 | vd.ub[3 ^ host] = vt.ub[1 ^ host]; |
373 | vd.ub[4 ^ host] = vs.ub[2 ^ host]; |
374 | vd.ub[5 ^ host] = vt.ub[2 ^ host]; |
375 | vd.ub[6 ^ host] = vs.ub[3 ^ host]; |
376 | vd.ub[7 ^ host] = vt.ub[3 ^ host]; |
377 | |
378 | return vd.d; |
379 | } |
380 | |
381 | uint64_t helper_punpckhbh(uint64_t fs, uint64_t ft) |
382 | { |
383 | unsigned host = BYTE_ORDER_XOR(7); |
384 | LMIValue vd, vs, vt; |
385 | |
386 | vs.d = fs; |
387 | vt.d = ft; |
388 | vd.ub[0 ^ host] = vs.ub[4 ^ host]; |
389 | vd.ub[1 ^ host] = vt.ub[4 ^ host]; |
390 | vd.ub[2 ^ host] = vs.ub[5 ^ host]; |
391 | vd.ub[3 ^ host] = vt.ub[5 ^ host]; |
392 | vd.ub[4 ^ host] = vs.ub[6 ^ host]; |
393 | vd.ub[5 ^ host] = vt.ub[6 ^ host]; |
394 | vd.ub[6 ^ host] = vs.ub[7 ^ host]; |
395 | vd.ub[7 ^ host] = vt.ub[7 ^ host]; |
396 | |
397 | return vd.d; |
398 | } |
399 | |
400 | uint64_t helper_pavgh(uint64_t fs, uint64_t ft) |
401 | { |
402 | LMIValue vs, vt; |
403 | unsigned i; |
404 | |
405 | vs.d = fs; |
406 | vt.d = ft; |
407 | for (i = 0; i < 4; i++) { |
408 | vs.uh[i] = (vs.uh[i] + vt.uh[i] + 1) >> 1; |
409 | } |
410 | return vs.d; |
411 | } |
412 | |
413 | uint64_t helper_pavgb(uint64_t fs, uint64_t ft) |
414 | { |
415 | LMIValue vs, vt; |
416 | unsigned i; |
417 | |
418 | vs.d = fs; |
419 | vt.d = ft; |
420 | for (i = 0; i < 8; i++) { |
421 | vs.ub[i] = (vs.ub[i] + vt.ub[i] + 1) >> 1; |
422 | } |
423 | return vs.d; |
424 | } |
425 | |
426 | uint64_t helper_pmaxsh(uint64_t fs, uint64_t ft) |
427 | { |
428 | LMIValue vs, vt; |
429 | unsigned i; |
430 | |
431 | vs.d = fs; |
432 | vt.d = ft; |
433 | for (i = 0; i < 4; i++) { |
434 | vs.sh[i] = (vs.sh[i] >= vt.sh[i] ? vs.sh[i] : vt.sh[i]); |
435 | } |
436 | return vs.d; |
437 | } |
438 | |
439 | uint64_t helper_pminsh(uint64_t fs, uint64_t ft) |
440 | { |
441 | LMIValue vs, vt; |
442 | unsigned i; |
443 | |
444 | vs.d = fs; |
445 | vt.d = ft; |
446 | for (i = 0; i < 4; i++) { |
447 | vs.sh[i] = (vs.sh[i] <= vt.sh[i] ? vs.sh[i] : vt.sh[i]); |
448 | } |
449 | return vs.d; |
450 | } |
451 | |
/*
 * Lane-wise maximum of eight unsigned bytes.
 *
 * All eight byte lanes of the 64-bit operands are processed.  Lanes are
 * extracted with shifts, so the result does not depend on host byte
 * order.
 */
uint64_t helper_pmaxub(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int i;

    for (i = 0; i < 8; ++i) {
        unsigned int shift = i * 8;
        uint64_t a = (fs >> shift) & 0xff;
        uint64_t b = (ft >> shift) & 0xff;

        fd |= (a >= b ? a : b) << shift;
    }
    return fd;
}
464 | |
/*
 * Lane-wise minimum of eight unsigned bytes.
 *
 * All eight byte lanes of the 64-bit operands are processed.  Lanes are
 * extracted with shifts, so the result does not depend on host byte
 * order.
 */
uint64_t helper_pminub(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int i;

    for (i = 0; i < 8; ++i) {
        unsigned int shift = i * 8;
        uint64_t a = (fs >> shift) & 0xff;
        uint64_t b = (ft >> shift) & 0xff;

        fd |= (a <= b ? a : b) << shift;
    }
    return fd;
}
477 | |
478 | uint64_t helper_pcmpeqw(uint64_t fs, uint64_t ft) |
479 | { |
480 | LMIValue vs, vt; |
481 | unsigned i; |
482 | |
483 | vs.d = fs; |
484 | vt.d = ft; |
485 | for (i = 0; i < 2; i++) { |
486 | vs.uw[i] = -(vs.uw[i] == vt.uw[i]); |
487 | } |
488 | return vs.d; |
489 | } |
490 | |
491 | uint64_t helper_pcmpgtw(uint64_t fs, uint64_t ft) |
492 | { |
493 | LMIValue vs, vt; |
494 | unsigned i; |
495 | |
496 | vs.d = fs; |
497 | vt.d = ft; |
498 | for (i = 0; i < 2; i++) { |
499 | vs.uw[i] = -(vs.uw[i] > vt.uw[i]); |
500 | } |
501 | return vs.d; |
502 | } |
503 | |
504 | uint64_t helper_pcmpeqh(uint64_t fs, uint64_t ft) |
505 | { |
506 | LMIValue vs, vt; |
507 | unsigned i; |
508 | |
509 | vs.d = fs; |
510 | vt.d = ft; |
511 | for (i = 0; i < 4; i++) { |
512 | vs.uh[i] = -(vs.uh[i] == vt.uh[i]); |
513 | } |
514 | return vs.d; |
515 | } |
516 | |
517 | uint64_t helper_pcmpgth(uint64_t fs, uint64_t ft) |
518 | { |
519 | LMIValue vs, vt; |
520 | unsigned i; |
521 | |
522 | vs.d = fs; |
523 | vt.d = ft; |
524 | for (i = 0; i < 4; i++) { |
525 | vs.uh[i] = -(vs.uh[i] > vt.uh[i]); |
526 | } |
527 | return vs.d; |
528 | } |
529 | |
530 | uint64_t helper_pcmpeqb(uint64_t fs, uint64_t ft) |
531 | { |
532 | LMIValue vs, vt; |
533 | unsigned i; |
534 | |
535 | vs.d = fs; |
536 | vt.d = ft; |
537 | for (i = 0; i < 8; i++) { |
538 | vs.ub[i] = -(vs.ub[i] == vt.ub[i]); |
539 | } |
540 | return vs.d; |
541 | } |
542 | |
543 | uint64_t helper_pcmpgtb(uint64_t fs, uint64_t ft) |
544 | { |
545 | LMIValue vs, vt; |
546 | unsigned i; |
547 | |
548 | vs.d = fs; |
549 | vt.d = ft; |
550 | for (i = 0; i < 8; i++) { |
551 | vs.ub[i] = -(vs.ub[i] > vt.ub[i]); |
552 | } |
553 | return vs.d; |
554 | } |
555 | |
556 | uint64_t helper_psllw(uint64_t fs, uint64_t ft) |
557 | { |
558 | LMIValue vs; |
559 | unsigned i; |
560 | |
561 | ft &= 0x7f; |
562 | if (ft > 31) { |
563 | return 0; |
564 | } |
565 | vs.d = fs; |
566 | for (i = 0; i < 2; ++i) { |
567 | vs.uw[i] <<= ft; |
568 | } |
569 | return vs.d; |
570 | } |
571 | |
572 | uint64_t helper_psrlw(uint64_t fs, uint64_t ft) |
573 | { |
574 | LMIValue vs; |
575 | unsigned i; |
576 | |
577 | ft &= 0x7f; |
578 | if (ft > 31) { |
579 | return 0; |
580 | } |
581 | vs.d = fs; |
582 | for (i = 0; i < 2; ++i) { |
583 | vs.uw[i] >>= ft; |
584 | } |
585 | return vs.d; |
586 | } |
587 | |
588 | uint64_t helper_psraw(uint64_t fs, uint64_t ft) |
589 | { |
590 | LMIValue vs; |
591 | unsigned i; |
592 | |
593 | ft &= 0x7f; |
594 | if (ft > 31) { |
595 | ft = 31; |
596 | } |
597 | vs.d = fs; |
598 | for (i = 0; i < 2; ++i) { |
599 | vs.sw[i] >>= ft; |
600 | } |
601 | return vs.d; |
602 | } |
603 | |
604 | uint64_t helper_psllh(uint64_t fs, uint64_t ft) |
605 | { |
606 | LMIValue vs; |
607 | unsigned i; |
608 | |
609 | ft &= 0x7f; |
610 | if (ft > 15) { |
611 | return 0; |
612 | } |
613 | vs.d = fs; |
614 | for (i = 0; i < 4; ++i) { |
615 | vs.uh[i] <<= ft; |
616 | } |
617 | return vs.d; |
618 | } |
619 | |
620 | uint64_t helper_psrlh(uint64_t fs, uint64_t ft) |
621 | { |
622 | LMIValue vs; |
623 | unsigned i; |
624 | |
625 | ft &= 0x7f; |
626 | if (ft > 15) { |
627 | return 0; |
628 | } |
629 | vs.d = fs; |
630 | for (i = 0; i < 4; ++i) { |
631 | vs.uh[i] >>= ft; |
632 | } |
633 | return vs.d; |
634 | } |
635 | |
636 | uint64_t helper_psrah(uint64_t fs, uint64_t ft) |
637 | { |
638 | LMIValue vs; |
639 | unsigned i; |
640 | |
641 | ft &= 0x7f; |
642 | if (ft > 15) { |
643 | ft = 15; |
644 | } |
645 | vs.d = fs; |
646 | for (i = 0; i < 4; ++i) { |
647 | vs.sh[i] >>= ft; |
648 | } |
649 | return vs.d; |
650 | } |
651 | |
652 | uint64_t helper_pmullh(uint64_t fs, uint64_t ft) |
653 | { |
654 | LMIValue vs, vt; |
655 | unsigned i; |
656 | |
657 | vs.d = fs; |
658 | vt.d = ft; |
659 | for (i = 0; i < 4; ++i) { |
660 | vs.sh[i] *= vt.sh[i]; |
661 | } |
662 | return vs.d; |
663 | } |
664 | |
665 | uint64_t helper_pmulhh(uint64_t fs, uint64_t ft) |
666 | { |
667 | LMIValue vs, vt; |
668 | unsigned i; |
669 | |
670 | vs.d = fs; |
671 | vt.d = ft; |
672 | for (i = 0; i < 4; ++i) { |
673 | int32_t r = vs.sh[i] * vt.sh[i]; |
674 | vs.sh[i] = r >> 16; |
675 | } |
676 | return vs.d; |
677 | } |
678 | |
/*
 * Lane-wise multiply of four unsigned halfwords, keeping bits [31:16]
 * of each 32-bit unsigned product.
 *
 * The operands are widened to uint32_t before multiplying: multiplying
 * two uint16_t values directly promotes them to int, and products above
 * INT_MAX (e.g. 0xffff * 0xffff) would overflow a signed int.  Lanes are
 * extracted with shifts, so the result does not depend on host byte
 * order.
 */
uint64_t helper_pmulhuh(uint64_t fs, uint64_t ft)
{
    uint64_t fd = 0;
    unsigned int i;

    for (i = 0; i < 4; ++i) {
        unsigned int shift = i * 16;
        uint32_t a = (fs >> shift) & 0xffff;
        uint32_t b = (ft >> shift) & 0xffff;

        fd |= (uint64_t)((a * b) >> 16) << shift;
    }
    return fd;
}
692 | |
693 | uint64_t helper_pmaddhw(uint64_t fs, uint64_t ft) |
694 | { |
695 | unsigned host = BYTE_ORDER_XOR(3); |
696 | LMIValue vs, vt; |
697 | uint32_t p0, p1; |
698 | |
699 | vs.d = fs; |
700 | vt.d = ft; |
701 | p0 = vs.sh[0 ^ host] * vt.sh[0 ^ host]; |
702 | p0 += vs.sh[1 ^ host] * vt.sh[1 ^ host]; |
703 | p1 = vs.sh[2 ^ host] * vt.sh[2 ^ host]; |
704 | p1 += vs.sh[3 ^ host] * vt.sh[3 ^ host]; |
705 | |
706 | return ((uint64_t)p1 << 32) | p0; |
707 | } |
708 | |
709 | uint64_t helper_pasubub(uint64_t fs, uint64_t ft) |
710 | { |
711 | LMIValue vs, vt; |
712 | unsigned i; |
713 | |
714 | vs.d = fs; |
715 | vt.d = ft; |
716 | for (i = 0; i < 8; ++i) { |
717 | int r = vs.ub[i] - vt.ub[i]; |
718 | vs.ub[i] = (r < 0 ? -r : r); |
719 | } |
720 | return vs.d; |
721 | } |
722 | |
/*
 * Sum the eight unsigned bytes of fs and return the total masked to
 * 16 bits.
 */
uint64_t helper_biadd(uint64_t fs)
{
    unsigned total = 0;
    uint64_t rest;

    /* Consume one byte per iteration; remaining zero bytes add nothing. */
    for (rest = fs; rest != 0; rest >>= 8) {
        total += rest & 0xff;
    }
    return total & 0xffff;
}
732 | |
/*
 * Gather the most significant bit of each of the eight bytes of fs into
 * an 8-bit mask: bit n of the result is bit 7 of byte n, counting bytes
 * from the least significant.
 */
uint64_t helper_pmovmskb(uint64_t fs)
{
    unsigned mask = 0;
    unsigned n;

    for (n = 0; n < 8; n++) {
        unsigned top_bit = (fs >> (8 * n + 7)) & 1;

        mask |= top_bit << n;
    }

    return mask & 0xff;
}
748 | |