1 | /* |
2 | * Generic vectorized operation runtime |
3 | * |
4 | * Copyright (c) 2018 Linaro |
5 | * |
6 | * This library is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Lesser General Public |
8 | * License as published by the Free Software Foundation; either |
9 | * version 2.1 of the License, or (at your option) any later version. |
10 | * |
11 | * This library is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * Lesser General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Lesser General Public |
17 | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
18 | */ |
19 | |
20 | #include "qemu/osdep.h" |
21 | #include "qemu/host-utils.h" |
22 | #include "cpu.h" |
23 | #include "exec/helper-proto.h" |
24 | #include "tcg-gvec-desc.h" |
25 | |
26 | |
27 | /* Virtually all hosts support 16-byte vectors. Those that don't can emulate |
28 | * them via GCC's generic vector extension. This turns out to be simpler and |
29 | * more reliable than getting the compiler to autovectorize. |
30 | * |
31 | * In tcg-op-gvec.c, we asserted that both the size and alignment of the data |
32 | * are multiples of 16. |
33 | * |
34 | * When the compiler does not support all of the operations we require, the |
35 | * loops are written so that we can always fall back on the base types. |
36 | */ |
#ifdef CONFIG_VECTOR16
/* 16-byte vectors of unsigned 8/16/32/64-bit lanes (GCC vector extension). */
typedef uint8_t vec8 __attribute__((vector_size(16)));
typedef uint16_t vec16 __attribute__((vector_size(16)));
typedef uint32_t vec32 __attribute__((vector_size(16)));
typedef uint64_t vec64 __attribute__((vector_size(16)));

/* Signed counterparts, used where the operation depends on signedness. */
typedef int8_t svec8 __attribute__((vector_size(16)));
typedef int16_t svec16 __attribute__((vector_size(16)));
typedef int32_t svec32 __attribute__((vector_size(16)));
typedef int64_t svec64 __attribute__((vector_size(16)));

/* Brace initializers replicating X into every lane of a 16-byte vector. */
#define DUP16(X) { X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X }
#define DUP8(X) { X, X, X, X, X, X, X, X }
#define DUP4(X) { X, X, X, X }
#define DUP2(X) { X, X }
#else
/*
 * Fallback: each "vector" type is just its scalar element type, so a
 * loop stepping by sizeof(vecN) visits one element per iteration.
 */
typedef uint8_t vec8;
typedef uint16_t vec16;
typedef uint32_t vec32;
typedef uint64_t vec64;

typedef int8_t svec8;
typedef int16_t svec16;
typedef int32_t svec32;
typedef int64_t svec64;

/* With scalar "vectors" there is nothing to replicate: pass X through. */
#define DUP16(X) X
#define DUP8(X) X
#define DUP4(X) X
#define DUP2(X) X
#endif /* CONFIG_VECTOR16 */
68 | |
69 | static inline void clear_high(void *d, intptr_t oprsz, uint32_t desc) |
70 | { |
71 | intptr_t maxsz = simd_maxsz(desc); |
72 | intptr_t i; |
73 | |
74 | if (unlikely(maxsz > oprsz)) { |
75 | for (i = oprsz; i < maxsz; i += sizeof(uint64_t)) { |
76 | *(uint64_t *)(d + i) = 0; |
77 | } |
78 | } |
79 | } |
80 | |
81 | void HELPER(gvec_add8)(void *d, void *a, void *b, uint32_t desc) |
82 | { |
83 | intptr_t oprsz = simd_oprsz(desc); |
84 | intptr_t i; |
85 | |
86 | for (i = 0; i < oprsz; i += sizeof(vec8)) { |
87 | *(vec8 *)(d + i) = *(vec8 *)(a + i) + *(vec8 *)(b + i); |
88 | } |
89 | clear_high(d, oprsz, desc); |
90 | } |
91 | |
92 | void HELPER(gvec_add16)(void *d, void *a, void *b, uint32_t desc) |
93 | { |
94 | intptr_t oprsz = simd_oprsz(desc); |
95 | intptr_t i; |
96 | |
97 | for (i = 0; i < oprsz; i += sizeof(vec16)) { |
98 | *(vec16 *)(d + i) = *(vec16 *)(a + i) + *(vec16 *)(b + i); |
99 | } |
100 | clear_high(d, oprsz, desc); |
101 | } |
102 | |
103 | void HELPER(gvec_add32)(void *d, void *a, void *b, uint32_t desc) |
104 | { |
105 | intptr_t oprsz = simd_oprsz(desc); |
106 | intptr_t i; |
107 | |
108 | for (i = 0; i < oprsz; i += sizeof(vec32)) { |
109 | *(vec32 *)(d + i) = *(vec32 *)(a + i) + *(vec32 *)(b + i); |
110 | } |
111 | clear_high(d, oprsz, desc); |
112 | } |
113 | |
114 | void HELPER(gvec_add64)(void *d, void *a, void *b, uint32_t desc) |
115 | { |
116 | intptr_t oprsz = simd_oprsz(desc); |
117 | intptr_t i; |
118 | |
119 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
120 | *(vec64 *)(d + i) = *(vec64 *)(a + i) + *(vec64 *)(b + i); |
121 | } |
122 | clear_high(d, oprsz, desc); |
123 | } |
124 | |
125 | void HELPER(gvec_adds8)(void *d, void *a, uint64_t b, uint32_t desc) |
126 | { |
127 | intptr_t oprsz = simd_oprsz(desc); |
128 | vec8 vecb = (vec8)DUP16(b); |
129 | intptr_t i; |
130 | |
131 | for (i = 0; i < oprsz; i += sizeof(vec8)) { |
132 | *(vec8 *)(d + i) = *(vec8 *)(a + i) + vecb; |
133 | } |
134 | clear_high(d, oprsz, desc); |
135 | } |
136 | |
137 | void HELPER(gvec_adds16)(void *d, void *a, uint64_t b, uint32_t desc) |
138 | { |
139 | intptr_t oprsz = simd_oprsz(desc); |
140 | vec16 vecb = (vec16)DUP8(b); |
141 | intptr_t i; |
142 | |
143 | for (i = 0; i < oprsz; i += sizeof(vec16)) { |
144 | *(vec16 *)(d + i) = *(vec16 *)(a + i) + vecb; |
145 | } |
146 | clear_high(d, oprsz, desc); |
147 | } |
148 | |
149 | void HELPER(gvec_adds32)(void *d, void *a, uint64_t b, uint32_t desc) |
150 | { |
151 | intptr_t oprsz = simd_oprsz(desc); |
152 | vec32 vecb = (vec32)DUP4(b); |
153 | intptr_t i; |
154 | |
155 | for (i = 0; i < oprsz; i += sizeof(vec32)) { |
156 | *(vec32 *)(d + i) = *(vec32 *)(a + i) + vecb; |
157 | } |
158 | clear_high(d, oprsz, desc); |
159 | } |
160 | |
161 | void HELPER(gvec_adds64)(void *d, void *a, uint64_t b, uint32_t desc) |
162 | { |
163 | intptr_t oprsz = simd_oprsz(desc); |
164 | vec64 vecb = (vec64)DUP2(b); |
165 | intptr_t i; |
166 | |
167 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
168 | *(vec64 *)(d + i) = *(vec64 *)(a + i) + vecb; |
169 | } |
170 | clear_high(d, oprsz, desc); |
171 | } |
172 | |
173 | void HELPER(gvec_sub8)(void *d, void *a, void *b, uint32_t desc) |
174 | { |
175 | intptr_t oprsz = simd_oprsz(desc); |
176 | intptr_t i; |
177 | |
178 | for (i = 0; i < oprsz; i += sizeof(vec8)) { |
179 | *(vec8 *)(d + i) = *(vec8 *)(a + i) - *(vec8 *)(b + i); |
180 | } |
181 | clear_high(d, oprsz, desc); |
182 | } |
183 | |
184 | void HELPER(gvec_sub16)(void *d, void *a, void *b, uint32_t desc) |
185 | { |
186 | intptr_t oprsz = simd_oprsz(desc); |
187 | intptr_t i; |
188 | |
189 | for (i = 0; i < oprsz; i += sizeof(vec16)) { |
190 | *(vec16 *)(d + i) = *(vec16 *)(a + i) - *(vec16 *)(b + i); |
191 | } |
192 | clear_high(d, oprsz, desc); |
193 | } |
194 | |
195 | void HELPER(gvec_sub32)(void *d, void *a, void *b, uint32_t desc) |
196 | { |
197 | intptr_t oprsz = simd_oprsz(desc); |
198 | intptr_t i; |
199 | |
200 | for (i = 0; i < oprsz; i += sizeof(vec32)) { |
201 | *(vec32 *)(d + i) = *(vec32 *)(a + i) - *(vec32 *)(b + i); |
202 | } |
203 | clear_high(d, oprsz, desc); |
204 | } |
205 | |
206 | void HELPER(gvec_sub64)(void *d, void *a, void *b, uint32_t desc) |
207 | { |
208 | intptr_t oprsz = simd_oprsz(desc); |
209 | intptr_t i; |
210 | |
211 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
212 | *(vec64 *)(d + i) = *(vec64 *)(a + i) - *(vec64 *)(b + i); |
213 | } |
214 | clear_high(d, oprsz, desc); |
215 | } |
216 | |
217 | void HELPER(gvec_subs8)(void *d, void *a, uint64_t b, uint32_t desc) |
218 | { |
219 | intptr_t oprsz = simd_oprsz(desc); |
220 | vec8 vecb = (vec8)DUP16(b); |
221 | intptr_t i; |
222 | |
223 | for (i = 0; i < oprsz; i += sizeof(vec8)) { |
224 | *(vec8 *)(d + i) = *(vec8 *)(a + i) - vecb; |
225 | } |
226 | clear_high(d, oprsz, desc); |
227 | } |
228 | |
229 | void HELPER(gvec_subs16)(void *d, void *a, uint64_t b, uint32_t desc) |
230 | { |
231 | intptr_t oprsz = simd_oprsz(desc); |
232 | vec16 vecb = (vec16)DUP8(b); |
233 | intptr_t i; |
234 | |
235 | for (i = 0; i < oprsz; i += sizeof(vec16)) { |
236 | *(vec16 *)(d + i) = *(vec16 *)(a + i) - vecb; |
237 | } |
238 | clear_high(d, oprsz, desc); |
239 | } |
240 | |
241 | void HELPER(gvec_subs32)(void *d, void *a, uint64_t b, uint32_t desc) |
242 | { |
243 | intptr_t oprsz = simd_oprsz(desc); |
244 | vec32 vecb = (vec32)DUP4(b); |
245 | intptr_t i; |
246 | |
247 | for (i = 0; i < oprsz; i += sizeof(vec32)) { |
248 | *(vec32 *)(d + i) = *(vec32 *)(a + i) - vecb; |
249 | } |
250 | clear_high(d, oprsz, desc); |
251 | } |
252 | |
253 | void HELPER(gvec_subs64)(void *d, void *a, uint64_t b, uint32_t desc) |
254 | { |
255 | intptr_t oprsz = simd_oprsz(desc); |
256 | vec64 vecb = (vec64)DUP2(b); |
257 | intptr_t i; |
258 | |
259 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
260 | *(vec64 *)(d + i) = *(vec64 *)(a + i) - vecb; |
261 | } |
262 | clear_high(d, oprsz, desc); |
263 | } |
264 | |
265 | void HELPER(gvec_mul8)(void *d, void *a, void *b, uint32_t desc) |
266 | { |
267 | intptr_t oprsz = simd_oprsz(desc); |
268 | intptr_t i; |
269 | |
270 | for (i = 0; i < oprsz; i += sizeof(vec8)) { |
271 | *(vec8 *)(d + i) = *(vec8 *)(a + i) * *(vec8 *)(b + i); |
272 | } |
273 | clear_high(d, oprsz, desc); |
274 | } |
275 | |
276 | void HELPER(gvec_mul16)(void *d, void *a, void *b, uint32_t desc) |
277 | { |
278 | intptr_t oprsz = simd_oprsz(desc); |
279 | intptr_t i; |
280 | |
281 | for (i = 0; i < oprsz; i += sizeof(vec16)) { |
282 | *(vec16 *)(d + i) = *(vec16 *)(a + i) * *(vec16 *)(b + i); |
283 | } |
284 | clear_high(d, oprsz, desc); |
285 | } |
286 | |
287 | void HELPER(gvec_mul32)(void *d, void *a, void *b, uint32_t desc) |
288 | { |
289 | intptr_t oprsz = simd_oprsz(desc); |
290 | intptr_t i; |
291 | |
292 | for (i = 0; i < oprsz; i += sizeof(vec32)) { |
293 | *(vec32 *)(d + i) = *(vec32 *)(a + i) * *(vec32 *)(b + i); |
294 | } |
295 | clear_high(d, oprsz, desc); |
296 | } |
297 | |
298 | void HELPER(gvec_mul64)(void *d, void *a, void *b, uint32_t desc) |
299 | { |
300 | intptr_t oprsz = simd_oprsz(desc); |
301 | intptr_t i; |
302 | |
303 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
304 | *(vec64 *)(d + i) = *(vec64 *)(a + i) * *(vec64 *)(b + i); |
305 | } |
306 | clear_high(d, oprsz, desc); |
307 | } |
308 | |
309 | void HELPER(gvec_muls8)(void *d, void *a, uint64_t b, uint32_t desc) |
310 | { |
311 | intptr_t oprsz = simd_oprsz(desc); |
312 | vec8 vecb = (vec8)DUP16(b); |
313 | intptr_t i; |
314 | |
315 | for (i = 0; i < oprsz; i += sizeof(vec8)) { |
316 | *(vec8 *)(d + i) = *(vec8 *)(a + i) * vecb; |
317 | } |
318 | clear_high(d, oprsz, desc); |
319 | } |
320 | |
321 | void HELPER(gvec_muls16)(void *d, void *a, uint64_t b, uint32_t desc) |
322 | { |
323 | intptr_t oprsz = simd_oprsz(desc); |
324 | vec16 vecb = (vec16)DUP8(b); |
325 | intptr_t i; |
326 | |
327 | for (i = 0; i < oprsz; i += sizeof(vec16)) { |
328 | *(vec16 *)(d + i) = *(vec16 *)(a + i) * vecb; |
329 | } |
330 | clear_high(d, oprsz, desc); |
331 | } |
332 | |
333 | void HELPER(gvec_muls32)(void *d, void *a, uint64_t b, uint32_t desc) |
334 | { |
335 | intptr_t oprsz = simd_oprsz(desc); |
336 | vec32 vecb = (vec32)DUP4(b); |
337 | intptr_t i; |
338 | |
339 | for (i = 0; i < oprsz; i += sizeof(vec32)) { |
340 | *(vec32 *)(d + i) = *(vec32 *)(a + i) * vecb; |
341 | } |
342 | clear_high(d, oprsz, desc); |
343 | } |
344 | |
345 | void HELPER(gvec_muls64)(void *d, void *a, uint64_t b, uint32_t desc) |
346 | { |
347 | intptr_t oprsz = simd_oprsz(desc); |
348 | vec64 vecb = (vec64)DUP2(b); |
349 | intptr_t i; |
350 | |
351 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
352 | *(vec64 *)(d + i) = *(vec64 *)(a + i) * vecb; |
353 | } |
354 | clear_high(d, oprsz, desc); |
355 | } |
356 | |
357 | void HELPER(gvec_neg8)(void *d, void *a, uint32_t desc) |
358 | { |
359 | intptr_t oprsz = simd_oprsz(desc); |
360 | intptr_t i; |
361 | |
362 | for (i = 0; i < oprsz; i += sizeof(vec8)) { |
363 | *(vec8 *)(d + i) = -*(vec8 *)(a + i); |
364 | } |
365 | clear_high(d, oprsz, desc); |
366 | } |
367 | |
368 | void HELPER(gvec_neg16)(void *d, void *a, uint32_t desc) |
369 | { |
370 | intptr_t oprsz = simd_oprsz(desc); |
371 | intptr_t i; |
372 | |
373 | for (i = 0; i < oprsz; i += sizeof(vec16)) { |
374 | *(vec16 *)(d + i) = -*(vec16 *)(a + i); |
375 | } |
376 | clear_high(d, oprsz, desc); |
377 | } |
378 | |
379 | void HELPER(gvec_neg32)(void *d, void *a, uint32_t desc) |
380 | { |
381 | intptr_t oprsz = simd_oprsz(desc); |
382 | intptr_t i; |
383 | |
384 | for (i = 0; i < oprsz; i += sizeof(vec32)) { |
385 | *(vec32 *)(d + i) = -*(vec32 *)(a + i); |
386 | } |
387 | clear_high(d, oprsz, desc); |
388 | } |
389 | |
390 | void HELPER(gvec_neg64)(void *d, void *a, uint32_t desc) |
391 | { |
392 | intptr_t oprsz = simd_oprsz(desc); |
393 | intptr_t i; |
394 | |
395 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
396 | *(vec64 *)(d + i) = -*(vec64 *)(a + i); |
397 | } |
398 | clear_high(d, oprsz, desc); |
399 | } |
400 | |
401 | void HELPER(gvec_abs8)(void *d, void *a, uint32_t desc) |
402 | { |
403 | intptr_t oprsz = simd_oprsz(desc); |
404 | intptr_t i; |
405 | |
406 | for (i = 0; i < oprsz; i += sizeof(int8_t)) { |
407 | int8_t aa = *(int8_t *)(a + i); |
408 | *(int8_t *)(d + i) = aa < 0 ? -aa : aa; |
409 | } |
410 | clear_high(d, oprsz, desc); |
411 | } |
412 | |
413 | void HELPER(gvec_abs16)(void *d, void *a, uint32_t desc) |
414 | { |
415 | intptr_t oprsz = simd_oprsz(desc); |
416 | intptr_t i; |
417 | |
418 | for (i = 0; i < oprsz; i += sizeof(int16_t)) { |
419 | int16_t aa = *(int16_t *)(a + i); |
420 | *(int16_t *)(d + i) = aa < 0 ? -aa : aa; |
421 | } |
422 | clear_high(d, oprsz, desc); |
423 | } |
424 | |
425 | void HELPER(gvec_abs32)(void *d, void *a, uint32_t desc) |
426 | { |
427 | intptr_t oprsz = simd_oprsz(desc); |
428 | intptr_t i; |
429 | |
430 | for (i = 0; i < oprsz; i += sizeof(int32_t)) { |
431 | int32_t aa = *(int32_t *)(a + i); |
432 | *(int32_t *)(d + i) = aa < 0 ? -aa : aa; |
433 | } |
434 | clear_high(d, oprsz, desc); |
435 | } |
436 | |
437 | void HELPER(gvec_abs64)(void *d, void *a, uint32_t desc) |
438 | { |
439 | intptr_t oprsz = simd_oprsz(desc); |
440 | intptr_t i; |
441 | |
442 | for (i = 0; i < oprsz; i += sizeof(int64_t)) { |
443 | int64_t aa = *(int64_t *)(a + i); |
444 | *(int64_t *)(d + i) = aa < 0 ? -aa : aa; |
445 | } |
446 | clear_high(d, oprsz, desc); |
447 | } |
448 | |
449 | void HELPER(gvec_mov)(void *d, void *a, uint32_t desc) |
450 | { |
451 | intptr_t oprsz = simd_oprsz(desc); |
452 | |
453 | memcpy(d, a, oprsz); |
454 | clear_high(d, oprsz, desc); |
455 | } |
456 | |
457 | void HELPER(gvec_dup64)(void *d, uint32_t desc, uint64_t c) |
458 | { |
459 | intptr_t oprsz = simd_oprsz(desc); |
460 | intptr_t i; |
461 | |
462 | if (c == 0) { |
463 | oprsz = 0; |
464 | } else { |
465 | for (i = 0; i < oprsz; i += sizeof(uint64_t)) { |
466 | *(uint64_t *)(d + i) = c; |
467 | } |
468 | } |
469 | clear_high(d, oprsz, desc); |
470 | } |
471 | |
472 | void HELPER(gvec_dup32)(void *d, uint32_t desc, uint32_t c) |
473 | { |
474 | intptr_t oprsz = simd_oprsz(desc); |
475 | intptr_t i; |
476 | |
477 | if (c == 0) { |
478 | oprsz = 0; |
479 | } else { |
480 | for (i = 0; i < oprsz; i += sizeof(uint32_t)) { |
481 | *(uint32_t *)(d + i) = c; |
482 | } |
483 | } |
484 | clear_high(d, oprsz, desc); |
485 | } |
486 | |
487 | void HELPER(gvec_dup16)(void *d, uint32_t desc, uint32_t c) |
488 | { |
489 | HELPER(gvec_dup32)(d, desc, 0x00010001 * (c & 0xffff)); |
490 | } |
491 | |
492 | void HELPER(gvec_dup8)(void *d, uint32_t desc, uint32_t c) |
493 | { |
494 | HELPER(gvec_dup32)(d, desc, 0x01010101 * (c & 0xff)); |
495 | } |
496 | |
497 | void HELPER(gvec_not)(void *d, void *a, uint32_t desc) |
498 | { |
499 | intptr_t oprsz = simd_oprsz(desc); |
500 | intptr_t i; |
501 | |
502 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
503 | *(vec64 *)(d + i) = ~*(vec64 *)(a + i); |
504 | } |
505 | clear_high(d, oprsz, desc); |
506 | } |
507 | |
508 | void HELPER(gvec_and)(void *d, void *a, void *b, uint32_t desc) |
509 | { |
510 | intptr_t oprsz = simd_oprsz(desc); |
511 | intptr_t i; |
512 | |
513 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
514 | *(vec64 *)(d + i) = *(vec64 *)(a + i) & *(vec64 *)(b + i); |
515 | } |
516 | clear_high(d, oprsz, desc); |
517 | } |
518 | |
519 | void HELPER(gvec_or)(void *d, void *a, void *b, uint32_t desc) |
520 | { |
521 | intptr_t oprsz = simd_oprsz(desc); |
522 | intptr_t i; |
523 | |
524 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
525 | *(vec64 *)(d + i) = *(vec64 *)(a + i) | *(vec64 *)(b + i); |
526 | } |
527 | clear_high(d, oprsz, desc); |
528 | } |
529 | |
530 | void HELPER(gvec_xor)(void *d, void *a, void *b, uint32_t desc) |
531 | { |
532 | intptr_t oprsz = simd_oprsz(desc); |
533 | intptr_t i; |
534 | |
535 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
536 | *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ *(vec64 *)(b + i); |
537 | } |
538 | clear_high(d, oprsz, desc); |
539 | } |
540 | |
541 | void HELPER(gvec_andc)(void *d, void *a, void *b, uint32_t desc) |
542 | { |
543 | intptr_t oprsz = simd_oprsz(desc); |
544 | intptr_t i; |
545 | |
546 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
547 | *(vec64 *)(d + i) = *(vec64 *)(a + i) &~ *(vec64 *)(b + i); |
548 | } |
549 | clear_high(d, oprsz, desc); |
550 | } |
551 | |
552 | void HELPER(gvec_orc)(void *d, void *a, void *b, uint32_t desc) |
553 | { |
554 | intptr_t oprsz = simd_oprsz(desc); |
555 | intptr_t i; |
556 | |
557 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
558 | *(vec64 *)(d + i) = *(vec64 *)(a + i) |~ *(vec64 *)(b + i); |
559 | } |
560 | clear_high(d, oprsz, desc); |
561 | } |
562 | |
563 | void HELPER(gvec_nand)(void *d, void *a, void *b, uint32_t desc) |
564 | { |
565 | intptr_t oprsz = simd_oprsz(desc); |
566 | intptr_t i; |
567 | |
568 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
569 | *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) & *(vec64 *)(b + i)); |
570 | } |
571 | clear_high(d, oprsz, desc); |
572 | } |
573 | |
574 | void HELPER(gvec_nor)(void *d, void *a, void *b, uint32_t desc) |
575 | { |
576 | intptr_t oprsz = simd_oprsz(desc); |
577 | intptr_t i; |
578 | |
579 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
580 | *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) | *(vec64 *)(b + i)); |
581 | } |
582 | clear_high(d, oprsz, desc); |
583 | } |
584 | |
585 | void HELPER(gvec_eqv)(void *d, void *a, void *b, uint32_t desc) |
586 | { |
587 | intptr_t oprsz = simd_oprsz(desc); |
588 | intptr_t i; |
589 | |
590 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
591 | *(vec64 *)(d + i) = ~(*(vec64 *)(a + i) ^ *(vec64 *)(b + i)); |
592 | } |
593 | clear_high(d, oprsz, desc); |
594 | } |
595 | |
596 | void HELPER(gvec_ands)(void *d, void *a, uint64_t b, uint32_t desc) |
597 | { |
598 | intptr_t oprsz = simd_oprsz(desc); |
599 | vec64 vecb = (vec64)DUP2(b); |
600 | intptr_t i; |
601 | |
602 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
603 | *(vec64 *)(d + i) = *(vec64 *)(a + i) & vecb; |
604 | } |
605 | clear_high(d, oprsz, desc); |
606 | } |
607 | |
608 | void HELPER(gvec_xors)(void *d, void *a, uint64_t b, uint32_t desc) |
609 | { |
610 | intptr_t oprsz = simd_oprsz(desc); |
611 | vec64 vecb = (vec64)DUP2(b); |
612 | intptr_t i; |
613 | |
614 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
615 | *(vec64 *)(d + i) = *(vec64 *)(a + i) ^ vecb; |
616 | } |
617 | clear_high(d, oprsz, desc); |
618 | } |
619 | |
620 | void HELPER(gvec_ors)(void *d, void *a, uint64_t b, uint32_t desc) |
621 | { |
622 | intptr_t oprsz = simd_oprsz(desc); |
623 | vec64 vecb = (vec64)DUP2(b); |
624 | intptr_t i; |
625 | |
626 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
627 | *(vec64 *)(d + i) = *(vec64 *)(a + i) | vecb; |
628 | } |
629 | clear_high(d, oprsz, desc); |
630 | } |
631 | |
632 | void HELPER(gvec_shl8i)(void *d, void *a, uint32_t desc) |
633 | { |
634 | intptr_t oprsz = simd_oprsz(desc); |
635 | int shift = simd_data(desc); |
636 | intptr_t i; |
637 | |
638 | for (i = 0; i < oprsz; i += sizeof(vec8)) { |
639 | *(vec8 *)(d + i) = *(vec8 *)(a + i) << shift; |
640 | } |
641 | clear_high(d, oprsz, desc); |
642 | } |
643 | |
644 | void HELPER(gvec_shl16i)(void *d, void *a, uint32_t desc) |
645 | { |
646 | intptr_t oprsz = simd_oprsz(desc); |
647 | int shift = simd_data(desc); |
648 | intptr_t i; |
649 | |
650 | for (i = 0; i < oprsz; i += sizeof(vec16)) { |
651 | *(vec16 *)(d + i) = *(vec16 *)(a + i) << shift; |
652 | } |
653 | clear_high(d, oprsz, desc); |
654 | } |
655 | |
656 | void HELPER(gvec_shl32i)(void *d, void *a, uint32_t desc) |
657 | { |
658 | intptr_t oprsz = simd_oprsz(desc); |
659 | int shift = simd_data(desc); |
660 | intptr_t i; |
661 | |
662 | for (i = 0; i < oprsz; i += sizeof(vec32)) { |
663 | *(vec32 *)(d + i) = *(vec32 *)(a + i) << shift; |
664 | } |
665 | clear_high(d, oprsz, desc); |
666 | } |
667 | |
668 | void HELPER(gvec_shl64i)(void *d, void *a, uint32_t desc) |
669 | { |
670 | intptr_t oprsz = simd_oprsz(desc); |
671 | int shift = simd_data(desc); |
672 | intptr_t i; |
673 | |
674 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
675 | *(vec64 *)(d + i) = *(vec64 *)(a + i) << shift; |
676 | } |
677 | clear_high(d, oprsz, desc); |
678 | } |
679 | |
680 | void HELPER(gvec_shr8i)(void *d, void *a, uint32_t desc) |
681 | { |
682 | intptr_t oprsz = simd_oprsz(desc); |
683 | int shift = simd_data(desc); |
684 | intptr_t i; |
685 | |
686 | for (i = 0; i < oprsz; i += sizeof(vec8)) { |
687 | *(vec8 *)(d + i) = *(vec8 *)(a + i) >> shift; |
688 | } |
689 | clear_high(d, oprsz, desc); |
690 | } |
691 | |
692 | void HELPER(gvec_shr16i)(void *d, void *a, uint32_t desc) |
693 | { |
694 | intptr_t oprsz = simd_oprsz(desc); |
695 | int shift = simd_data(desc); |
696 | intptr_t i; |
697 | |
698 | for (i = 0; i < oprsz; i += sizeof(vec16)) { |
699 | *(vec16 *)(d + i) = *(vec16 *)(a + i) >> shift; |
700 | } |
701 | clear_high(d, oprsz, desc); |
702 | } |
703 | |
704 | void HELPER(gvec_shr32i)(void *d, void *a, uint32_t desc) |
705 | { |
706 | intptr_t oprsz = simd_oprsz(desc); |
707 | int shift = simd_data(desc); |
708 | intptr_t i; |
709 | |
710 | for (i = 0; i < oprsz; i += sizeof(vec32)) { |
711 | *(vec32 *)(d + i) = *(vec32 *)(a + i) >> shift; |
712 | } |
713 | clear_high(d, oprsz, desc); |
714 | } |
715 | |
716 | void HELPER(gvec_shr64i)(void *d, void *a, uint32_t desc) |
717 | { |
718 | intptr_t oprsz = simd_oprsz(desc); |
719 | int shift = simd_data(desc); |
720 | intptr_t i; |
721 | |
722 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
723 | *(vec64 *)(d + i) = *(vec64 *)(a + i) >> shift; |
724 | } |
725 | clear_high(d, oprsz, desc); |
726 | } |
727 | |
728 | void HELPER(gvec_sar8i)(void *d, void *a, uint32_t desc) |
729 | { |
730 | intptr_t oprsz = simd_oprsz(desc); |
731 | int shift = simd_data(desc); |
732 | intptr_t i; |
733 | |
734 | for (i = 0; i < oprsz; i += sizeof(vec8)) { |
735 | *(svec8 *)(d + i) = *(svec8 *)(a + i) >> shift; |
736 | } |
737 | clear_high(d, oprsz, desc); |
738 | } |
739 | |
740 | void HELPER(gvec_sar16i)(void *d, void *a, uint32_t desc) |
741 | { |
742 | intptr_t oprsz = simd_oprsz(desc); |
743 | int shift = simd_data(desc); |
744 | intptr_t i; |
745 | |
746 | for (i = 0; i < oprsz; i += sizeof(vec16)) { |
747 | *(svec16 *)(d + i) = *(svec16 *)(a + i) >> shift; |
748 | } |
749 | clear_high(d, oprsz, desc); |
750 | } |
751 | |
752 | void HELPER(gvec_sar32i)(void *d, void *a, uint32_t desc) |
753 | { |
754 | intptr_t oprsz = simd_oprsz(desc); |
755 | int shift = simd_data(desc); |
756 | intptr_t i; |
757 | |
758 | for (i = 0; i < oprsz; i += sizeof(vec32)) { |
759 | *(svec32 *)(d + i) = *(svec32 *)(a + i) >> shift; |
760 | } |
761 | clear_high(d, oprsz, desc); |
762 | } |
763 | |
764 | void HELPER(gvec_sar64i)(void *d, void *a, uint32_t desc) |
765 | { |
766 | intptr_t oprsz = simd_oprsz(desc); |
767 | int shift = simd_data(desc); |
768 | intptr_t i; |
769 | |
770 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
771 | *(svec64 *)(d + i) = *(svec64 *)(a + i) >> shift; |
772 | } |
773 | clear_high(d, oprsz, desc); |
774 | } |
775 | |
776 | void HELPER(gvec_shl8v)(void *d, void *a, void *b, uint32_t desc) |
777 | { |
778 | intptr_t oprsz = simd_oprsz(desc); |
779 | intptr_t i; |
780 | |
781 | for (i = 0; i < oprsz; i += sizeof(uint8_t)) { |
782 | uint8_t sh = *(uint8_t *)(b + i) & 7; |
783 | *(uint8_t *)(d + i) = *(uint8_t *)(a + i) << sh; |
784 | } |
785 | clear_high(d, oprsz, desc); |
786 | } |
787 | |
788 | void HELPER(gvec_shl16v)(void *d, void *a, void *b, uint32_t desc) |
789 | { |
790 | intptr_t oprsz = simd_oprsz(desc); |
791 | intptr_t i; |
792 | |
793 | for (i = 0; i < oprsz; i += sizeof(uint16_t)) { |
794 | uint8_t sh = *(uint16_t *)(b + i) & 15; |
795 | *(uint16_t *)(d + i) = *(uint16_t *)(a + i) << sh; |
796 | } |
797 | clear_high(d, oprsz, desc); |
798 | } |
799 | |
800 | void HELPER(gvec_shl32v)(void *d, void *a, void *b, uint32_t desc) |
801 | { |
802 | intptr_t oprsz = simd_oprsz(desc); |
803 | intptr_t i; |
804 | |
805 | for (i = 0; i < oprsz; i += sizeof(uint32_t)) { |
806 | uint8_t sh = *(uint32_t *)(b + i) & 31; |
807 | *(uint32_t *)(d + i) = *(uint32_t *)(a + i) << sh; |
808 | } |
809 | clear_high(d, oprsz, desc); |
810 | } |
811 | |
812 | void HELPER(gvec_shl64v)(void *d, void *a, void *b, uint32_t desc) |
813 | { |
814 | intptr_t oprsz = simd_oprsz(desc); |
815 | intptr_t i; |
816 | |
817 | for (i = 0; i < oprsz; i += sizeof(uint64_t)) { |
818 | uint8_t sh = *(uint64_t *)(b + i) & 63; |
819 | *(uint64_t *)(d + i) = *(uint64_t *)(a + i) << sh; |
820 | } |
821 | clear_high(d, oprsz, desc); |
822 | } |
823 | |
824 | void HELPER(gvec_shr8v)(void *d, void *a, void *b, uint32_t desc) |
825 | { |
826 | intptr_t oprsz = simd_oprsz(desc); |
827 | intptr_t i; |
828 | |
829 | for (i = 0; i < oprsz; i += sizeof(uint8_t)) { |
830 | uint8_t sh = *(uint8_t *)(b + i) & 7; |
831 | *(uint8_t *)(d + i) = *(uint8_t *)(a + i) >> sh; |
832 | } |
833 | clear_high(d, oprsz, desc); |
834 | } |
835 | |
836 | void HELPER(gvec_shr16v)(void *d, void *a, void *b, uint32_t desc) |
837 | { |
838 | intptr_t oprsz = simd_oprsz(desc); |
839 | intptr_t i; |
840 | |
841 | for (i = 0; i < oprsz; i += sizeof(uint16_t)) { |
842 | uint8_t sh = *(uint16_t *)(b + i) & 15; |
843 | *(uint16_t *)(d + i) = *(uint16_t *)(a + i) >> sh; |
844 | } |
845 | clear_high(d, oprsz, desc); |
846 | } |
847 | |
848 | void HELPER(gvec_shr32v)(void *d, void *a, void *b, uint32_t desc) |
849 | { |
850 | intptr_t oprsz = simd_oprsz(desc); |
851 | intptr_t i; |
852 | |
853 | for (i = 0; i < oprsz; i += sizeof(uint32_t)) { |
854 | uint8_t sh = *(uint32_t *)(b + i) & 31; |
855 | *(uint32_t *)(d + i) = *(uint32_t *)(a + i) >> sh; |
856 | } |
857 | clear_high(d, oprsz, desc); |
858 | } |
859 | |
860 | void HELPER(gvec_shr64v)(void *d, void *a, void *b, uint32_t desc) |
861 | { |
862 | intptr_t oprsz = simd_oprsz(desc); |
863 | intptr_t i; |
864 | |
865 | for (i = 0; i < oprsz; i += sizeof(uint64_t)) { |
866 | uint8_t sh = *(uint64_t *)(b + i) & 63; |
867 | *(uint64_t *)(d + i) = *(uint64_t *)(a + i) >> sh; |
868 | } |
869 | clear_high(d, oprsz, desc); |
870 | } |
871 | |
872 | void HELPER(gvec_sar8v)(void *d, void *a, void *b, uint32_t desc) |
873 | { |
874 | intptr_t oprsz = simd_oprsz(desc); |
875 | intptr_t i; |
876 | |
877 | for (i = 0; i < oprsz; i += sizeof(int8_t)) { |
878 | uint8_t sh = *(uint8_t *)(b + i) & 7; |
879 | *(int8_t *)(d + i) = *(int8_t *)(a + i) >> sh; |
880 | } |
881 | clear_high(d, oprsz, desc); |
882 | } |
883 | |
884 | void HELPER(gvec_sar16v)(void *d, void *a, void *b, uint32_t desc) |
885 | { |
886 | intptr_t oprsz = simd_oprsz(desc); |
887 | intptr_t i; |
888 | |
889 | for (i = 0; i < oprsz; i += sizeof(int16_t)) { |
890 | uint8_t sh = *(uint16_t *)(b + i) & 15; |
891 | *(int16_t *)(d + i) = *(int16_t *)(a + i) >> sh; |
892 | } |
893 | clear_high(d, oprsz, desc); |
894 | } |
895 | |
896 | void HELPER(gvec_sar32v)(void *d, void *a, void *b, uint32_t desc) |
897 | { |
898 | intptr_t oprsz = simd_oprsz(desc); |
899 | intptr_t i; |
900 | |
901 | for (i = 0; i < oprsz; i += sizeof(int32_t)) { |
902 | uint8_t sh = *(uint32_t *)(b + i) & 31; |
903 | *(int32_t *)(d + i) = *(int32_t *)(a + i) >> sh; |
904 | } |
905 | clear_high(d, oprsz, desc); |
906 | } |
907 | |
908 | void HELPER(gvec_sar64v)(void *d, void *a, void *b, uint32_t desc) |
909 | { |
910 | intptr_t oprsz = simd_oprsz(desc); |
911 | intptr_t i; |
912 | |
913 | for (i = 0; i < oprsz; i += sizeof(int64_t)) { |
914 | uint8_t sh = *(uint64_t *)(b + i) & 63; |
915 | *(int64_t *)(d + i) = *(int64_t *)(a + i) >> sh; |
916 | } |
917 | clear_high(d, oprsz, desc); |
918 | } |
919 | |
920 | /* If vectors are enabled, the compiler fills in -1 for true. |
921 | Otherwise, we must take care of this by hand. */ |
922 | #ifdef CONFIG_VECTOR16 |
923 | # define DO_CMP0(X) X |
924 | #else |
925 | # define DO_CMP0(X) -(X) |
926 | #endif |
927 | |
928 | #define DO_CMP1(NAME, TYPE, OP) \ |
929 | void HELPER(NAME)(void *d, void *a, void *b, uint32_t desc) \ |
930 | { \ |
931 | intptr_t oprsz = simd_oprsz(desc); \ |
932 | intptr_t i; \ |
933 | for (i = 0; i < oprsz; i += sizeof(TYPE)) { \ |
934 | *(TYPE *)(d + i) = DO_CMP0(*(TYPE *)(a + i) OP *(TYPE *)(b + i)); \ |
935 | } \ |
936 | clear_high(d, oprsz, desc); \ |
937 | } |
938 | |
939 | #define DO_CMP2(SZ) \ |
940 | DO_CMP1(gvec_eq##SZ, vec##SZ, ==) \ |
941 | DO_CMP1(gvec_ne##SZ, vec##SZ, !=) \ |
942 | DO_CMP1(gvec_lt##SZ, svec##SZ, <) \ |
943 | DO_CMP1(gvec_le##SZ, svec##SZ, <=) \ |
944 | DO_CMP1(gvec_ltu##SZ, vec##SZ, <) \ |
945 | DO_CMP1(gvec_leu##SZ, vec##SZ, <=) |
946 | |
947 | DO_CMP2(8) |
948 | DO_CMP2(16) |
949 | DO_CMP2(32) |
950 | DO_CMP2(64) |
951 | |
952 | #undef DO_CMP0 |
953 | #undef DO_CMP1 |
954 | #undef DO_CMP2 |
955 | |
956 | void HELPER(gvec_ssadd8)(void *d, void *a, void *b, uint32_t desc) |
957 | { |
958 | intptr_t oprsz = simd_oprsz(desc); |
959 | intptr_t i; |
960 | |
961 | for (i = 0; i < oprsz; i += sizeof(int8_t)) { |
962 | int r = *(int8_t *)(a + i) + *(int8_t *)(b + i); |
963 | if (r > INT8_MAX) { |
964 | r = INT8_MAX; |
965 | } else if (r < INT8_MIN) { |
966 | r = INT8_MIN; |
967 | } |
968 | *(int8_t *)(d + i) = r; |
969 | } |
970 | clear_high(d, oprsz, desc); |
971 | } |
972 | |
973 | void HELPER(gvec_ssadd16)(void *d, void *a, void *b, uint32_t desc) |
974 | { |
975 | intptr_t oprsz = simd_oprsz(desc); |
976 | intptr_t i; |
977 | |
978 | for (i = 0; i < oprsz; i += sizeof(int16_t)) { |
979 | int r = *(int16_t *)(a + i) + *(int16_t *)(b + i); |
980 | if (r > INT16_MAX) { |
981 | r = INT16_MAX; |
982 | } else if (r < INT16_MIN) { |
983 | r = INT16_MIN; |
984 | } |
985 | *(int16_t *)(d + i) = r; |
986 | } |
987 | clear_high(d, oprsz, desc); |
988 | } |
989 | |
990 | void HELPER(gvec_ssadd32)(void *d, void *a, void *b, uint32_t desc) |
991 | { |
992 | intptr_t oprsz = simd_oprsz(desc); |
993 | intptr_t i; |
994 | |
995 | for (i = 0; i < oprsz; i += sizeof(int32_t)) { |
996 | int32_t ai = *(int32_t *)(a + i); |
997 | int32_t bi = *(int32_t *)(b + i); |
998 | int32_t di = ai + bi; |
999 | if (((di ^ ai) &~ (ai ^ bi)) < 0) { |
1000 | /* Signed overflow. */ |
1001 | di = (di < 0 ? INT32_MAX : INT32_MIN); |
1002 | } |
1003 | *(int32_t *)(d + i) = di; |
1004 | } |
1005 | clear_high(d, oprsz, desc); |
1006 | } |
1007 | |
1008 | void HELPER(gvec_ssadd64)(void *d, void *a, void *b, uint32_t desc) |
1009 | { |
1010 | intptr_t oprsz = simd_oprsz(desc); |
1011 | intptr_t i; |
1012 | |
1013 | for (i = 0; i < oprsz; i += sizeof(int64_t)) { |
1014 | int64_t ai = *(int64_t *)(a + i); |
1015 | int64_t bi = *(int64_t *)(b + i); |
1016 | int64_t di = ai + bi; |
1017 | if (((di ^ ai) &~ (ai ^ bi)) < 0) { |
1018 | /* Signed overflow. */ |
1019 | di = (di < 0 ? INT64_MAX : INT64_MIN); |
1020 | } |
1021 | *(int64_t *)(d + i) = di; |
1022 | } |
1023 | clear_high(d, oprsz, desc); |
1024 | } |
1025 | |
1026 | void HELPER(gvec_sssub8)(void *d, void *a, void *b, uint32_t desc) |
1027 | { |
1028 | intptr_t oprsz = simd_oprsz(desc); |
1029 | intptr_t i; |
1030 | |
1031 | for (i = 0; i < oprsz; i += sizeof(uint8_t)) { |
1032 | int r = *(int8_t *)(a + i) - *(int8_t *)(b + i); |
1033 | if (r > INT8_MAX) { |
1034 | r = INT8_MAX; |
1035 | } else if (r < INT8_MIN) { |
1036 | r = INT8_MIN; |
1037 | } |
1038 | *(uint8_t *)(d + i) = r; |
1039 | } |
1040 | clear_high(d, oprsz, desc); |
1041 | } |
1042 | |
1043 | void HELPER(gvec_sssub16)(void *d, void *a, void *b, uint32_t desc) |
1044 | { |
1045 | intptr_t oprsz = simd_oprsz(desc); |
1046 | intptr_t i; |
1047 | |
1048 | for (i = 0; i < oprsz; i += sizeof(int16_t)) { |
1049 | int r = *(int16_t *)(a + i) - *(int16_t *)(b + i); |
1050 | if (r > INT16_MAX) { |
1051 | r = INT16_MAX; |
1052 | } else if (r < INT16_MIN) { |
1053 | r = INT16_MIN; |
1054 | } |
1055 | *(int16_t *)(d + i) = r; |
1056 | } |
1057 | clear_high(d, oprsz, desc); |
1058 | } |
1059 | |
1060 | void HELPER(gvec_sssub32)(void *d, void *a, void *b, uint32_t desc) |
1061 | { |
1062 | intptr_t oprsz = simd_oprsz(desc); |
1063 | intptr_t i; |
1064 | |
1065 | for (i = 0; i < oprsz; i += sizeof(int32_t)) { |
1066 | int32_t ai = *(int32_t *)(a + i); |
1067 | int32_t bi = *(int32_t *)(b + i); |
1068 | int32_t di = ai - bi; |
1069 | if (((di ^ ai) & (ai ^ bi)) < 0) { |
1070 | /* Signed overflow. */ |
1071 | di = (di < 0 ? INT32_MAX : INT32_MIN); |
1072 | } |
1073 | *(int32_t *)(d + i) = di; |
1074 | } |
1075 | clear_high(d, oprsz, desc); |
1076 | } |
1077 | |
1078 | void HELPER(gvec_sssub64)(void *d, void *a, void *b, uint32_t desc) |
1079 | { |
1080 | intptr_t oprsz = simd_oprsz(desc); |
1081 | intptr_t i; |
1082 | |
1083 | for (i = 0; i < oprsz; i += sizeof(int64_t)) { |
1084 | int64_t ai = *(int64_t *)(a + i); |
1085 | int64_t bi = *(int64_t *)(b + i); |
1086 | int64_t di = ai - bi; |
1087 | if (((di ^ ai) & (ai ^ bi)) < 0) { |
1088 | /* Signed overflow. */ |
1089 | di = (di < 0 ? INT64_MAX : INT64_MIN); |
1090 | } |
1091 | *(int64_t *)(d + i) = di; |
1092 | } |
1093 | clear_high(d, oprsz, desc); |
1094 | } |
1095 | |
1096 | void HELPER(gvec_usadd8)(void *d, void *a, void *b, uint32_t desc) |
1097 | { |
1098 | intptr_t oprsz = simd_oprsz(desc); |
1099 | intptr_t i; |
1100 | |
1101 | for (i = 0; i < oprsz; i += sizeof(uint8_t)) { |
1102 | unsigned r = *(uint8_t *)(a + i) + *(uint8_t *)(b + i); |
1103 | if (r > UINT8_MAX) { |
1104 | r = UINT8_MAX; |
1105 | } |
1106 | *(uint8_t *)(d + i) = r; |
1107 | } |
1108 | clear_high(d, oprsz, desc); |
1109 | } |
1110 | |
1111 | void HELPER(gvec_usadd16)(void *d, void *a, void *b, uint32_t desc) |
1112 | { |
1113 | intptr_t oprsz = simd_oprsz(desc); |
1114 | intptr_t i; |
1115 | |
1116 | for (i = 0; i < oprsz; i += sizeof(uint16_t)) { |
1117 | unsigned r = *(uint16_t *)(a + i) + *(uint16_t *)(b + i); |
1118 | if (r > UINT16_MAX) { |
1119 | r = UINT16_MAX; |
1120 | } |
1121 | *(uint16_t *)(d + i) = r; |
1122 | } |
1123 | clear_high(d, oprsz, desc); |
1124 | } |
1125 | |
1126 | void HELPER(gvec_usadd32)(void *d, void *a, void *b, uint32_t desc) |
1127 | { |
1128 | intptr_t oprsz = simd_oprsz(desc); |
1129 | intptr_t i; |
1130 | |
1131 | for (i = 0; i < oprsz; i += sizeof(uint32_t)) { |
1132 | uint32_t ai = *(uint32_t *)(a + i); |
1133 | uint32_t bi = *(uint32_t *)(b + i); |
1134 | uint32_t di = ai + bi; |
1135 | if (di < ai) { |
1136 | di = UINT32_MAX; |
1137 | } |
1138 | *(uint32_t *)(d + i) = di; |
1139 | } |
1140 | clear_high(d, oprsz, desc); |
1141 | } |
1142 | |
1143 | void HELPER(gvec_usadd64)(void *d, void *a, void *b, uint32_t desc) |
1144 | { |
1145 | intptr_t oprsz = simd_oprsz(desc); |
1146 | intptr_t i; |
1147 | |
1148 | for (i = 0; i < oprsz; i += sizeof(uint64_t)) { |
1149 | uint64_t ai = *(uint64_t *)(a + i); |
1150 | uint64_t bi = *(uint64_t *)(b + i); |
1151 | uint64_t di = ai + bi; |
1152 | if (di < ai) { |
1153 | di = UINT64_MAX; |
1154 | } |
1155 | *(uint64_t *)(d + i) = di; |
1156 | } |
1157 | clear_high(d, oprsz, desc); |
1158 | } |
1159 | |
1160 | void HELPER(gvec_ussub8)(void *d, void *a, void *b, uint32_t desc) |
1161 | { |
1162 | intptr_t oprsz = simd_oprsz(desc); |
1163 | intptr_t i; |
1164 | |
1165 | for (i = 0; i < oprsz; i += sizeof(uint8_t)) { |
1166 | int r = *(uint8_t *)(a + i) - *(uint8_t *)(b + i); |
1167 | if (r < 0) { |
1168 | r = 0; |
1169 | } |
1170 | *(uint8_t *)(d + i) = r; |
1171 | } |
1172 | clear_high(d, oprsz, desc); |
1173 | } |
1174 | |
1175 | void HELPER(gvec_ussub16)(void *d, void *a, void *b, uint32_t desc) |
1176 | { |
1177 | intptr_t oprsz = simd_oprsz(desc); |
1178 | intptr_t i; |
1179 | |
1180 | for (i = 0; i < oprsz; i += sizeof(uint16_t)) { |
1181 | int r = *(uint16_t *)(a + i) - *(uint16_t *)(b + i); |
1182 | if (r < 0) { |
1183 | r = 0; |
1184 | } |
1185 | *(uint16_t *)(d + i) = r; |
1186 | } |
1187 | clear_high(d, oprsz, desc); |
1188 | } |
1189 | |
1190 | void HELPER(gvec_ussub32)(void *d, void *a, void *b, uint32_t desc) |
1191 | { |
1192 | intptr_t oprsz = simd_oprsz(desc); |
1193 | intptr_t i; |
1194 | |
1195 | for (i = 0; i < oprsz; i += sizeof(uint32_t)) { |
1196 | uint32_t ai = *(uint32_t *)(a + i); |
1197 | uint32_t bi = *(uint32_t *)(b + i); |
1198 | uint32_t di = ai - bi; |
1199 | if (ai < bi) { |
1200 | di = 0; |
1201 | } |
1202 | *(uint32_t *)(d + i) = di; |
1203 | } |
1204 | clear_high(d, oprsz, desc); |
1205 | } |
1206 | |
1207 | void HELPER(gvec_ussub64)(void *d, void *a, void *b, uint32_t desc) |
1208 | { |
1209 | intptr_t oprsz = simd_oprsz(desc); |
1210 | intptr_t i; |
1211 | |
1212 | for (i = 0; i < oprsz; i += sizeof(uint64_t)) { |
1213 | uint64_t ai = *(uint64_t *)(a + i); |
1214 | uint64_t bi = *(uint64_t *)(b + i); |
1215 | uint64_t di = ai - bi; |
1216 | if (ai < bi) { |
1217 | di = 0; |
1218 | } |
1219 | *(uint64_t *)(d + i) = di; |
1220 | } |
1221 | clear_high(d, oprsz, desc); |
1222 | } |
1223 | |
1224 | void HELPER(gvec_smin8)(void *d, void *a, void *b, uint32_t desc) |
1225 | { |
1226 | intptr_t oprsz = simd_oprsz(desc); |
1227 | intptr_t i; |
1228 | |
1229 | for (i = 0; i < oprsz; i += sizeof(int8_t)) { |
1230 | int8_t aa = *(int8_t *)(a + i); |
1231 | int8_t bb = *(int8_t *)(b + i); |
1232 | int8_t dd = aa < bb ? aa : bb; |
1233 | *(int8_t *)(d + i) = dd; |
1234 | } |
1235 | clear_high(d, oprsz, desc); |
1236 | } |
1237 | |
1238 | void HELPER(gvec_smin16)(void *d, void *a, void *b, uint32_t desc) |
1239 | { |
1240 | intptr_t oprsz = simd_oprsz(desc); |
1241 | intptr_t i; |
1242 | |
1243 | for (i = 0; i < oprsz; i += sizeof(int16_t)) { |
1244 | int16_t aa = *(int16_t *)(a + i); |
1245 | int16_t bb = *(int16_t *)(b + i); |
1246 | int16_t dd = aa < bb ? aa : bb; |
1247 | *(int16_t *)(d + i) = dd; |
1248 | } |
1249 | clear_high(d, oprsz, desc); |
1250 | } |
1251 | |
1252 | void HELPER(gvec_smin32)(void *d, void *a, void *b, uint32_t desc) |
1253 | { |
1254 | intptr_t oprsz = simd_oprsz(desc); |
1255 | intptr_t i; |
1256 | |
1257 | for (i = 0; i < oprsz; i += sizeof(int32_t)) { |
1258 | int32_t aa = *(int32_t *)(a + i); |
1259 | int32_t bb = *(int32_t *)(b + i); |
1260 | int32_t dd = aa < bb ? aa : bb; |
1261 | *(int32_t *)(d + i) = dd; |
1262 | } |
1263 | clear_high(d, oprsz, desc); |
1264 | } |
1265 | |
1266 | void HELPER(gvec_smin64)(void *d, void *a, void *b, uint32_t desc) |
1267 | { |
1268 | intptr_t oprsz = simd_oprsz(desc); |
1269 | intptr_t i; |
1270 | |
1271 | for (i = 0; i < oprsz; i += sizeof(int64_t)) { |
1272 | int64_t aa = *(int64_t *)(a + i); |
1273 | int64_t bb = *(int64_t *)(b + i); |
1274 | int64_t dd = aa < bb ? aa : bb; |
1275 | *(int64_t *)(d + i) = dd; |
1276 | } |
1277 | clear_high(d, oprsz, desc); |
1278 | } |
1279 | |
1280 | void HELPER(gvec_smax8)(void *d, void *a, void *b, uint32_t desc) |
1281 | { |
1282 | intptr_t oprsz = simd_oprsz(desc); |
1283 | intptr_t i; |
1284 | |
1285 | for (i = 0; i < oprsz; i += sizeof(int8_t)) { |
1286 | int8_t aa = *(int8_t *)(a + i); |
1287 | int8_t bb = *(int8_t *)(b + i); |
1288 | int8_t dd = aa > bb ? aa : bb; |
1289 | *(int8_t *)(d + i) = dd; |
1290 | } |
1291 | clear_high(d, oprsz, desc); |
1292 | } |
1293 | |
1294 | void HELPER(gvec_smax16)(void *d, void *a, void *b, uint32_t desc) |
1295 | { |
1296 | intptr_t oprsz = simd_oprsz(desc); |
1297 | intptr_t i; |
1298 | |
1299 | for (i = 0; i < oprsz; i += sizeof(int16_t)) { |
1300 | int16_t aa = *(int16_t *)(a + i); |
1301 | int16_t bb = *(int16_t *)(b + i); |
1302 | int16_t dd = aa > bb ? aa : bb; |
1303 | *(int16_t *)(d + i) = dd; |
1304 | } |
1305 | clear_high(d, oprsz, desc); |
1306 | } |
1307 | |
1308 | void HELPER(gvec_smax32)(void *d, void *a, void *b, uint32_t desc) |
1309 | { |
1310 | intptr_t oprsz = simd_oprsz(desc); |
1311 | intptr_t i; |
1312 | |
1313 | for (i = 0; i < oprsz; i += sizeof(int32_t)) { |
1314 | int32_t aa = *(int32_t *)(a + i); |
1315 | int32_t bb = *(int32_t *)(b + i); |
1316 | int32_t dd = aa > bb ? aa : bb; |
1317 | *(int32_t *)(d + i) = dd; |
1318 | } |
1319 | clear_high(d, oprsz, desc); |
1320 | } |
1321 | |
1322 | void HELPER(gvec_smax64)(void *d, void *a, void *b, uint32_t desc) |
1323 | { |
1324 | intptr_t oprsz = simd_oprsz(desc); |
1325 | intptr_t i; |
1326 | |
1327 | for (i = 0; i < oprsz; i += sizeof(int64_t)) { |
1328 | int64_t aa = *(int64_t *)(a + i); |
1329 | int64_t bb = *(int64_t *)(b + i); |
1330 | int64_t dd = aa > bb ? aa : bb; |
1331 | *(int64_t *)(d + i) = dd; |
1332 | } |
1333 | clear_high(d, oprsz, desc); |
1334 | } |
1335 | |
1336 | void HELPER(gvec_umin8)(void *d, void *a, void *b, uint32_t desc) |
1337 | { |
1338 | intptr_t oprsz = simd_oprsz(desc); |
1339 | intptr_t i; |
1340 | |
1341 | for (i = 0; i < oprsz; i += sizeof(uint8_t)) { |
1342 | uint8_t aa = *(uint8_t *)(a + i); |
1343 | uint8_t bb = *(uint8_t *)(b + i); |
1344 | uint8_t dd = aa < bb ? aa : bb; |
1345 | *(uint8_t *)(d + i) = dd; |
1346 | } |
1347 | clear_high(d, oprsz, desc); |
1348 | } |
1349 | |
1350 | void HELPER(gvec_umin16)(void *d, void *a, void *b, uint32_t desc) |
1351 | { |
1352 | intptr_t oprsz = simd_oprsz(desc); |
1353 | intptr_t i; |
1354 | |
1355 | for (i = 0; i < oprsz; i += sizeof(uint16_t)) { |
1356 | uint16_t aa = *(uint16_t *)(a + i); |
1357 | uint16_t bb = *(uint16_t *)(b + i); |
1358 | uint16_t dd = aa < bb ? aa : bb; |
1359 | *(uint16_t *)(d + i) = dd; |
1360 | } |
1361 | clear_high(d, oprsz, desc); |
1362 | } |
1363 | |
1364 | void HELPER(gvec_umin32)(void *d, void *a, void *b, uint32_t desc) |
1365 | { |
1366 | intptr_t oprsz = simd_oprsz(desc); |
1367 | intptr_t i; |
1368 | |
1369 | for (i = 0; i < oprsz; i += sizeof(uint32_t)) { |
1370 | uint32_t aa = *(uint32_t *)(a + i); |
1371 | uint32_t bb = *(uint32_t *)(b + i); |
1372 | uint32_t dd = aa < bb ? aa : bb; |
1373 | *(uint32_t *)(d + i) = dd; |
1374 | } |
1375 | clear_high(d, oprsz, desc); |
1376 | } |
1377 | |
1378 | void HELPER(gvec_umin64)(void *d, void *a, void *b, uint32_t desc) |
1379 | { |
1380 | intptr_t oprsz = simd_oprsz(desc); |
1381 | intptr_t i; |
1382 | |
1383 | for (i = 0; i < oprsz; i += sizeof(uint64_t)) { |
1384 | uint64_t aa = *(uint64_t *)(a + i); |
1385 | uint64_t bb = *(uint64_t *)(b + i); |
1386 | uint64_t dd = aa < bb ? aa : bb; |
1387 | *(uint64_t *)(d + i) = dd; |
1388 | } |
1389 | clear_high(d, oprsz, desc); |
1390 | } |
1391 | |
1392 | void HELPER(gvec_umax8)(void *d, void *a, void *b, uint32_t desc) |
1393 | { |
1394 | intptr_t oprsz = simd_oprsz(desc); |
1395 | intptr_t i; |
1396 | |
1397 | for (i = 0; i < oprsz; i += sizeof(uint8_t)) { |
1398 | uint8_t aa = *(uint8_t *)(a + i); |
1399 | uint8_t bb = *(uint8_t *)(b + i); |
1400 | uint8_t dd = aa > bb ? aa : bb; |
1401 | *(uint8_t *)(d + i) = dd; |
1402 | } |
1403 | clear_high(d, oprsz, desc); |
1404 | } |
1405 | |
1406 | void HELPER(gvec_umax16)(void *d, void *a, void *b, uint32_t desc) |
1407 | { |
1408 | intptr_t oprsz = simd_oprsz(desc); |
1409 | intptr_t i; |
1410 | |
1411 | for (i = 0; i < oprsz; i += sizeof(uint16_t)) { |
1412 | uint16_t aa = *(uint16_t *)(a + i); |
1413 | uint16_t bb = *(uint16_t *)(b + i); |
1414 | uint16_t dd = aa > bb ? aa : bb; |
1415 | *(uint16_t *)(d + i) = dd; |
1416 | } |
1417 | clear_high(d, oprsz, desc); |
1418 | } |
1419 | |
1420 | void HELPER(gvec_umax32)(void *d, void *a, void *b, uint32_t desc) |
1421 | { |
1422 | intptr_t oprsz = simd_oprsz(desc); |
1423 | intptr_t i; |
1424 | |
1425 | for (i = 0; i < oprsz; i += sizeof(uint32_t)) { |
1426 | uint32_t aa = *(uint32_t *)(a + i); |
1427 | uint32_t bb = *(uint32_t *)(b + i); |
1428 | uint32_t dd = aa > bb ? aa : bb; |
1429 | *(uint32_t *)(d + i) = dd; |
1430 | } |
1431 | clear_high(d, oprsz, desc); |
1432 | } |
1433 | |
1434 | void HELPER(gvec_umax64)(void *d, void *a, void *b, uint32_t desc) |
1435 | { |
1436 | intptr_t oprsz = simd_oprsz(desc); |
1437 | intptr_t i; |
1438 | |
1439 | for (i = 0; i < oprsz; i += sizeof(uint64_t)) { |
1440 | uint64_t aa = *(uint64_t *)(a + i); |
1441 | uint64_t bb = *(uint64_t *)(b + i); |
1442 | uint64_t dd = aa > bb ? aa : bb; |
1443 | *(uint64_t *)(d + i) = dd; |
1444 | } |
1445 | clear_high(d, oprsz, desc); |
1446 | } |
1447 | |
1448 | void HELPER(gvec_bitsel)(void *d, void *a, void *b, void *c, uint32_t desc) |
1449 | { |
1450 | intptr_t oprsz = simd_oprsz(desc); |
1451 | intptr_t i; |
1452 | |
1453 | for (i = 0; i < oprsz; i += sizeof(vec64)) { |
1454 | vec64 aa = *(vec64 *)(a + i); |
1455 | vec64 bb = *(vec64 *)(b + i); |
1456 | vec64 cc = *(vec64 *)(c + i); |
1457 | *(vec64 *)(d + i) = (bb & aa) | (cc & ~aa); |
1458 | } |
1459 | clear_high(d, oprsz, desc); |
1460 | } |
1461 | |