1 | /* |
2 | * ARM AdvSIMD / SVE Vector Operations |
3 | * |
4 | * Copyright (c) 2018 Linaro |
5 | * |
6 | * This library is free software; you can redistribute it and/or |
7 | * modify it under the terms of the GNU Lesser General Public |
8 | * License as published by the Free Software Foundation; either |
 * version 2.1 of the License, or (at your option) any later version.
10 | * |
11 | * This library is distributed in the hope that it will be useful, |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
14 | * Lesser General Public License for more details. |
15 | * |
16 | * You should have received a copy of the GNU Lesser General Public |
17 | * License along with this library; if not, see <http://www.gnu.org/licenses/>. |
18 | */ |
19 | |
20 | #include "qemu/osdep.h" |
21 | #include "cpu.h" |
22 | #include "exec/helper-proto.h" |
23 | #include "tcg/tcg-gvec-desc.h" |
24 | #include "fpu/softfloat.h" |
25 | |
26 | |
/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup. */
29 | #ifdef HOST_WORDS_BIGENDIAN |
30 | #define H1(x) ((x) ^ 7) |
31 | #define H2(x) ((x) ^ 3) |
32 | #define H4(x) ((x) ^ 1) |
33 | #else |
34 | #define H1(x) (x) |
35 | #define H2(x) (x) |
36 | #define H4(x) (x) |
37 | #endif |
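
/*
 * For example, with 16-bit elements on a big-endian host, H2 reverses
 * the four element indices within each 64-bit chunk: H2(0) = 3,
 * H2(1) = 2, H2(2) = 1, H2(3) = 0.  On a little-endian host all three
 * macros compile away to the identity.
 */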
38 | |
39 | #define SET_QC() env->vfp.qc[0] = 1 |
40 | |
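/*
 * Zero the destination from opr_sz up to max_sz, in 64-bit chunks.
 * This implements the architectural zeroing of the portion of the
 * register beyond the bytes written by the operation.
 */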
41 | static void clear_tail(void *vd, uintptr_t opr_sz, uintptr_t max_sz) |
42 | { |
43 | uint64_t *d = vd + opr_sz; |
44 | uintptr_t i; |
45 | |
46 | for (i = opr_sz; i < max_sz; i += 8) { |
47 | *d++ = 0; |
48 | } |
49 | } |
50 | |
51 | /* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */ |
52 | static uint16_t inl_qrdmlah_s16(CPUARMState *env, int16_t src1, |
53 | int16_t src2, int16_t src3) |
54 | { |
55 | /* Simplify: |
56 | * = ((a3 << 16) + ((e1 * e2) << 1) + (1 << 15)) >> 16 |
57 | * = ((a3 << 15) + (e1 * e2) + (1 << 14)) >> 15 |
58 | */ |
59 | int32_t ret = (int32_t)src1 * src2; |
60 | ret = ((int32_t)src3 << 15) + ret + (1 << 14); |
61 | ret >>= 15; |
62 | if (ret != (int16_t)ret) { |
63 | SET_QC(); |
64 | ret = (ret < 0 ? -0x8000 : 0x7fff); |
65 | } |
66 | return ret; |
67 | } |
68 | |
69 | uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1, |
70 | uint32_t src2, uint32_t src3) |
71 | { |
72 | uint16_t e1 = inl_qrdmlah_s16(env, src1, src2, src3); |
73 | uint16_t e2 = inl_qrdmlah_s16(env, src1 >> 16, src2 >> 16, src3 >> 16); |
74 | return deposit32(e1, 16, 16, e2); |
75 | } |
76 | |
77 | void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm, |
78 | void *ve, uint32_t desc) |
79 | { |
80 | uintptr_t opr_sz = simd_oprsz(desc); |
81 | int16_t *d = vd; |
82 | int16_t *n = vn; |
83 | int16_t *m = vm; |
84 | CPUARMState *env = ve; |
85 | uintptr_t i; |
86 | |
87 | for (i = 0; i < opr_sz / 2; ++i) { |
88 | d[i] = inl_qrdmlah_s16(env, n[i], m[i], d[i]); |
89 | } |
90 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
91 | } |
92 | |
93 | /* Signed saturating rounding doubling multiply-subtract high half, 16-bit */ |
94 | static uint16_t inl_qrdmlsh_s16(CPUARMState *env, int16_t src1, |
95 | int16_t src2, int16_t src3) |
96 | { |
97 | /* Similarly, using subtraction: |
98 | * = ((a3 << 16) - ((e1 * e2) << 1) + (1 << 15)) >> 16 |
99 | * = ((a3 << 15) - (e1 * e2) + (1 << 14)) >> 15 |
100 | */ |
101 | int32_t ret = (int32_t)src1 * src2; |
102 | ret = ((int32_t)src3 << 15) - ret + (1 << 14); |
103 | ret >>= 15; |
104 | if (ret != (int16_t)ret) { |
105 | SET_QC(); |
106 | ret = (ret < 0 ? -0x8000 : 0x7fff); |
107 | } |
108 | return ret; |
109 | } |
110 | |
111 | uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1, |
112 | uint32_t src2, uint32_t src3) |
113 | { |
114 | uint16_t e1 = inl_qrdmlsh_s16(env, src1, src2, src3); |
115 | uint16_t e2 = inl_qrdmlsh_s16(env, src1 >> 16, src2 >> 16, src3 >> 16); |
116 | return deposit32(e1, 16, 16, e2); |
117 | } |
118 | |
119 | void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm, |
120 | void *ve, uint32_t desc) |
121 | { |
122 | uintptr_t opr_sz = simd_oprsz(desc); |
123 | int16_t *d = vd; |
124 | int16_t *n = vn; |
125 | int16_t *m = vm; |
126 | CPUARMState *env = ve; |
127 | uintptr_t i; |
128 | |
129 | for (i = 0; i < opr_sz / 2; ++i) { |
130 | d[i] = inl_qrdmlsh_s16(env, n[i], m[i], d[i]); |
131 | } |
132 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
133 | } |
134 | |
135 | /* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */ |
136 | uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1, |
137 | int32_t src2, int32_t src3) |
138 | { |
    /* Simplify similarly to inl_qrdmlah_s16 above. */
140 | int64_t ret = (int64_t)src1 * src2; |
141 | ret = ((int64_t)src3 << 31) + ret + (1 << 30); |
142 | ret >>= 31; |
143 | if (ret != (int32_t)ret) { |
144 | SET_QC(); |
145 | ret = (ret < 0 ? INT32_MIN : INT32_MAX); |
146 | } |
147 | return ret; |
148 | } |
149 | |
150 | void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm, |
151 | void *ve, uint32_t desc) |
152 | { |
153 | uintptr_t opr_sz = simd_oprsz(desc); |
154 | int32_t *d = vd; |
155 | int32_t *n = vn; |
156 | int32_t *m = vm; |
157 | CPUARMState *env = ve; |
158 | uintptr_t i; |
159 | |
160 | for (i = 0; i < opr_sz / 4; ++i) { |
161 | d[i] = helper_neon_qrdmlah_s32(env, n[i], m[i], d[i]); |
162 | } |
163 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
164 | } |
165 | |
166 | /* Signed saturating rounding doubling multiply-subtract high half, 32-bit */ |
167 | uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1, |
168 | int32_t src2, int32_t src3) |
169 | { |
    /* Simplify similarly to inl_qrdmlsh_s16 above. */
171 | int64_t ret = (int64_t)src1 * src2; |
172 | ret = ((int64_t)src3 << 31) - ret + (1 << 30); |
173 | ret >>= 31; |
174 | if (ret != (int32_t)ret) { |
175 | SET_QC(); |
176 | ret = (ret < 0 ? INT32_MIN : INT32_MAX); |
177 | } |
178 | return ret; |
179 | } |
180 | |
181 | void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm, |
182 | void *ve, uint32_t desc) |
183 | { |
184 | uintptr_t opr_sz = simd_oprsz(desc); |
185 | int32_t *d = vd; |
186 | int32_t *n = vn; |
187 | int32_t *m = vm; |
188 | CPUARMState *env = ve; |
189 | uintptr_t i; |
190 | |
191 | for (i = 0; i < opr_sz / 4; ++i) { |
192 | d[i] = helper_neon_qrdmlsh_s32(env, n[i], m[i], d[i]); |
193 | } |
194 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
195 | } |
196 | |
197 | /* Integer 8 and 16-bit dot-product. |
198 | * |
199 | * Note that for the loops herein, host endianness does not matter |
200 | * with respect to the ordering of data within the 64-bit lanes. |
201 | * All elements are treated equally, no matter where they are. |
202 | */ |
203 | |
204 | void HELPER(gvec_sdot_b)(void *vd, void *vn, void *vm, uint32_t desc) |
205 | { |
206 | intptr_t i, opr_sz = simd_oprsz(desc); |
207 | uint32_t *d = vd; |
208 | int8_t *n = vn, *m = vm; |
209 | |
210 | for (i = 0; i < opr_sz / 4; ++i) { |
211 | d[i] += n[i * 4 + 0] * m[i * 4 + 0] |
212 | + n[i * 4 + 1] * m[i * 4 + 1] |
213 | + n[i * 4 + 2] * m[i * 4 + 2] |
214 | + n[i * 4 + 3] * m[i * 4 + 3]; |
215 | } |
216 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
217 | } |
218 | |
219 | void HELPER(gvec_udot_b)(void *vd, void *vn, void *vm, uint32_t desc) |
220 | { |
221 | intptr_t i, opr_sz = simd_oprsz(desc); |
222 | uint32_t *d = vd; |
223 | uint8_t *n = vn, *m = vm; |
224 | |
225 | for (i = 0; i < opr_sz / 4; ++i) { |
226 | d[i] += n[i * 4 + 0] * m[i * 4 + 0] |
227 | + n[i * 4 + 1] * m[i * 4 + 1] |
228 | + n[i * 4 + 2] * m[i * 4 + 2] |
229 | + n[i * 4 + 3] * m[i * 4 + 3]; |
230 | } |
231 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
232 | } |
233 | |
234 | void HELPER(gvec_sdot_h)(void *vd, void *vn, void *vm, uint32_t desc) |
235 | { |
236 | intptr_t i, opr_sz = simd_oprsz(desc); |
237 | uint64_t *d = vd; |
238 | int16_t *n = vn, *m = vm; |
239 | |
240 | for (i = 0; i < opr_sz / 8; ++i) { |
241 | d[i] += (int64_t)n[i * 4 + 0] * m[i * 4 + 0] |
242 | + (int64_t)n[i * 4 + 1] * m[i * 4 + 1] |
243 | + (int64_t)n[i * 4 + 2] * m[i * 4 + 2] |
244 | + (int64_t)n[i * 4 + 3] * m[i * 4 + 3]; |
245 | } |
246 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
247 | } |
248 | |
249 | void HELPER(gvec_udot_h)(void *vd, void *vn, void *vm, uint32_t desc) |
250 | { |
251 | intptr_t i, opr_sz = simd_oprsz(desc); |
252 | uint64_t *d = vd; |
253 | uint16_t *n = vn, *m = vm; |
254 | |
255 | for (i = 0; i < opr_sz / 8; ++i) { |
256 | d[i] += (uint64_t)n[i * 4 + 0] * m[i * 4 + 0] |
257 | + (uint64_t)n[i * 4 + 1] * m[i * 4 + 1] |
258 | + (uint64_t)n[i * 4 + 2] * m[i * 4 + 2] |
259 | + (uint64_t)n[i * 4 + 3] * m[i * 4 + 3]; |
260 | } |
261 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
262 | } |
263 | |
264 | void HELPER(gvec_sdot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc) |
265 | { |
266 | intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4; |
267 | intptr_t index = simd_data(desc); |
268 | uint32_t *d = vd; |
269 | int8_t *n = vn; |
270 | int8_t *m_indexed = (int8_t *)vm + index * 4; |
271 | |
272 | /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd. |
273 | * Otherwise opr_sz is a multiple of 16. |
274 | */ |
275 | segend = MIN(4, opr_sz_4); |
276 | i = 0; |
277 | do { |
278 | int8_t m0 = m_indexed[i * 4 + 0]; |
279 | int8_t m1 = m_indexed[i * 4 + 1]; |
280 | int8_t m2 = m_indexed[i * 4 + 2]; |
281 | int8_t m3 = m_indexed[i * 4 + 3]; |
282 | |
283 | do { |
284 | d[i] += n[i * 4 + 0] * m0 |
285 | + n[i * 4 + 1] * m1 |
286 | + n[i * 4 + 2] * m2 |
287 | + n[i * 4 + 3] * m3; |
288 | } while (++i < segend); |
289 | segend = i + 4; |
290 | } while (i < opr_sz_4); |
291 | |
292 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
293 | } |
294 | |
295 | void HELPER(gvec_udot_idx_b)(void *vd, void *vn, void *vm, uint32_t desc) |
296 | { |
297 | intptr_t i, segend, opr_sz = simd_oprsz(desc), opr_sz_4 = opr_sz / 4; |
298 | intptr_t index = simd_data(desc); |
299 | uint32_t *d = vd; |
300 | uint8_t *n = vn; |
301 | uint8_t *m_indexed = (uint8_t *)vm + index * 4; |
302 | |
303 | /* Notice the special case of opr_sz == 8, from aa64/aa32 advsimd. |
304 | * Otherwise opr_sz is a multiple of 16. |
305 | */ |
306 | segend = MIN(4, opr_sz_4); |
307 | i = 0; |
308 | do { |
309 | uint8_t m0 = m_indexed[i * 4 + 0]; |
310 | uint8_t m1 = m_indexed[i * 4 + 1]; |
311 | uint8_t m2 = m_indexed[i * 4 + 2]; |
312 | uint8_t m3 = m_indexed[i * 4 + 3]; |
313 | |
314 | do { |
315 | d[i] += n[i * 4 + 0] * m0 |
316 | + n[i * 4 + 1] * m1 |
317 | + n[i * 4 + 2] * m2 |
318 | + n[i * 4 + 3] * m3; |
319 | } while (++i < segend); |
320 | segend = i + 4; |
321 | } while (i < opr_sz_4); |
322 | |
323 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
324 | } |
325 | |
326 | void HELPER(gvec_sdot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) |
327 | { |
328 | intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8; |
329 | intptr_t index = simd_data(desc); |
330 | uint64_t *d = vd; |
331 | int16_t *n = vn; |
332 | int16_t *m_indexed = (int16_t *)vm + index * 4; |
333 | |
334 | /* This is supported by SVE only, so opr_sz is always a multiple of 16. |
335 | * Process the entire segment all at once, writing back the results |
336 | * only after we've consumed all of the inputs. |
337 | */ |
    for (i = 0; i < opr_sz_8; i += 2) {
339 | uint64_t d0, d1; |
340 | |
341 | d0 = n[i * 4 + 0] * (int64_t)m_indexed[i * 4 + 0]; |
342 | d0 += n[i * 4 + 1] * (int64_t)m_indexed[i * 4 + 1]; |
343 | d0 += n[i * 4 + 2] * (int64_t)m_indexed[i * 4 + 2]; |
344 | d0 += n[i * 4 + 3] * (int64_t)m_indexed[i * 4 + 3]; |
345 | d1 = n[i * 4 + 4] * (int64_t)m_indexed[i * 4 + 0]; |
346 | d1 += n[i * 4 + 5] * (int64_t)m_indexed[i * 4 + 1]; |
347 | d1 += n[i * 4 + 6] * (int64_t)m_indexed[i * 4 + 2]; |
348 | d1 += n[i * 4 + 7] * (int64_t)m_indexed[i * 4 + 3]; |
349 | |
350 | d[i + 0] += d0; |
351 | d[i + 1] += d1; |
352 | } |
353 | |
354 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
355 | } |
356 | |
357 | void HELPER(gvec_udot_idx_h)(void *vd, void *vn, void *vm, uint32_t desc) |
358 | { |
359 | intptr_t i, opr_sz = simd_oprsz(desc), opr_sz_8 = opr_sz / 8; |
360 | intptr_t index = simd_data(desc); |
361 | uint64_t *d = vd; |
362 | uint16_t *n = vn; |
363 | uint16_t *m_indexed = (uint16_t *)vm + index * 4; |
364 | |
365 | /* This is supported by SVE only, so opr_sz is always a multiple of 16. |
366 | * Process the entire segment all at once, writing back the results |
367 | * only after we've consumed all of the inputs. |
368 | */ |
    for (i = 0; i < opr_sz_8; i += 2) {
370 | uint64_t d0, d1; |
371 | |
372 | d0 = n[i * 4 + 0] * (uint64_t)m_indexed[i * 4 + 0]; |
373 | d0 += n[i * 4 + 1] * (uint64_t)m_indexed[i * 4 + 1]; |
374 | d0 += n[i * 4 + 2] * (uint64_t)m_indexed[i * 4 + 2]; |
375 | d0 += n[i * 4 + 3] * (uint64_t)m_indexed[i * 4 + 3]; |
376 | d1 = n[i * 4 + 4] * (uint64_t)m_indexed[i * 4 + 0]; |
377 | d1 += n[i * 4 + 5] * (uint64_t)m_indexed[i * 4 + 1]; |
378 | d1 += n[i * 4 + 6] * (uint64_t)m_indexed[i * 4 + 2]; |
379 | d1 += n[i * 4 + 7] * (uint64_t)m_indexed[i * 4 + 3]; |
380 | |
381 | d[i + 0] += d0; |
382 | d[i + 1] += d1; |
383 | } |
384 | |
385 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
386 | } |
387 | |
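/*
 * Floating-point complex add with rotate.  Elements are paired as
 * (real, imaginary); the single data bit in the descriptor selects
 * which m input is negated before the add: 0 negates the imaginary
 * element (the FCADD #90 rotation), 1 negates the real element (#270).
 */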
388 | void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm, |
389 | void *vfpst, uint32_t desc) |
390 | { |
391 | uintptr_t opr_sz = simd_oprsz(desc); |
392 | float16 *d = vd; |
393 | float16 *n = vn; |
394 | float16 *m = vm; |
395 | float_status *fpst = vfpst; |
396 | uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); |
397 | uint32_t neg_imag = neg_real ^ 1; |
398 | uintptr_t i; |
399 | |
400 | /* Shift boolean to the sign bit so we can xor to negate. */ |
401 | neg_real <<= 15; |
402 | neg_imag <<= 15; |
403 | |
404 | for (i = 0; i < opr_sz / 2; i += 2) { |
405 | float16 e0 = n[H2(i)]; |
406 | float16 e1 = m[H2(i + 1)] ^ neg_imag; |
407 | float16 e2 = n[H2(i + 1)]; |
408 | float16 e3 = m[H2(i)] ^ neg_real; |
409 | |
410 | d[H2(i)] = float16_add(e0, e1, fpst); |
411 | d[H2(i + 1)] = float16_add(e2, e3, fpst); |
412 | } |
413 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
414 | } |
415 | |
416 | void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm, |
417 | void *vfpst, uint32_t desc) |
418 | { |
419 | uintptr_t opr_sz = simd_oprsz(desc); |
420 | float32 *d = vd; |
421 | float32 *n = vn; |
422 | float32 *m = vm; |
423 | float_status *fpst = vfpst; |
424 | uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1); |
425 | uint32_t neg_imag = neg_real ^ 1; |
426 | uintptr_t i; |
427 | |
428 | /* Shift boolean to the sign bit so we can xor to negate. */ |
429 | neg_real <<= 31; |
430 | neg_imag <<= 31; |
431 | |
432 | for (i = 0; i < opr_sz / 4; i += 2) { |
433 | float32 e0 = n[H4(i)]; |
434 | float32 e1 = m[H4(i + 1)] ^ neg_imag; |
435 | float32 e2 = n[H4(i + 1)]; |
436 | float32 e3 = m[H4(i)] ^ neg_real; |
437 | |
438 | d[H4(i)] = float32_add(e0, e1, fpst); |
439 | d[H4(i + 1)] = float32_add(e2, e3, fpst); |
440 | } |
441 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
442 | } |
443 | |
444 | void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm, |
445 | void *vfpst, uint32_t desc) |
446 | { |
447 | uintptr_t opr_sz = simd_oprsz(desc); |
448 | float64 *d = vd; |
449 | float64 *n = vn; |
450 | float64 *m = vm; |
451 | float_status *fpst = vfpst; |
452 | uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1); |
453 | uint64_t neg_imag = neg_real ^ 1; |
454 | uintptr_t i; |
455 | |
456 | /* Shift boolean to the sign bit so we can xor to negate. */ |
457 | neg_real <<= 63; |
458 | neg_imag <<= 63; |
459 | |
460 | for (i = 0; i < opr_sz / 8; i += 2) { |
461 | float64 e0 = n[i]; |
462 | float64 e1 = m[i + 1] ^ neg_imag; |
463 | float64 e2 = n[i + 1]; |
464 | float64 e3 = m[i] ^ neg_real; |
465 | |
466 | d[i] = float64_add(e0, e1, fpst); |
467 | d[i + 1] = float64_add(e2, e3, fpst); |
468 | } |
469 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
470 | } |
471 | |
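/*
 * Floating-point complex multiply-add with rotate.  Two data bits in
 * the descriptor encode the four FCMLA rotations: FLIP selects whether
 * the real or imaginary element of n feeds both products, while
 * NEG_REAL and NEG_IMAG negate the m element feeding the real or
 * imaginary accumulation respectively.
 */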
472 | void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, |
473 | void *vfpst, uint32_t desc) |
474 | { |
475 | uintptr_t opr_sz = simd_oprsz(desc); |
476 | float16 *d = vd; |
477 | float16 *n = vn; |
478 | float16 *m = vm; |
479 | float_status *fpst = vfpst; |
480 | intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); |
481 | uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); |
482 | uint32_t neg_real = flip ^ neg_imag; |
483 | uintptr_t i; |
484 | |
485 | /* Shift boolean to the sign bit so we can xor to negate. */ |
486 | neg_real <<= 15; |
487 | neg_imag <<= 15; |
488 | |
489 | for (i = 0; i < opr_sz / 2; i += 2) { |
490 | float16 e2 = n[H2(i + flip)]; |
491 | float16 e1 = m[H2(i + flip)] ^ neg_real; |
492 | float16 e4 = e2; |
493 | float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; |
494 | |
495 | d[H2(i)] = float16_muladd(e2, e1, d[H2(i)], 0, fpst); |
496 | d[H2(i + 1)] = float16_muladd(e4, e3, d[H2(i + 1)], 0, fpst); |
497 | } |
498 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
499 | } |
500 | |
501 | void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, |
502 | void *vfpst, uint32_t desc) |
503 | { |
504 | uintptr_t opr_sz = simd_oprsz(desc); |
505 | float16 *d = vd; |
506 | float16 *n = vn; |
507 | float16 *m = vm; |
508 | float_status *fpst = vfpst; |
509 | intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); |
510 | uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); |
511 | intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); |
512 | uint32_t neg_real = flip ^ neg_imag; |
513 | intptr_t elements = opr_sz / sizeof(float16); |
514 | intptr_t eltspersegment = 16 / sizeof(float16); |
515 | intptr_t i, j; |
516 | |
517 | /* Shift boolean to the sign bit so we can xor to negate. */ |
518 | neg_real <<= 15; |
519 | neg_imag <<= 15; |
520 | |
521 | for (i = 0; i < elements; i += eltspersegment) { |
522 | float16 mr = m[H2(i + 2 * index + 0)]; |
523 | float16 mi = m[H2(i + 2 * index + 1)]; |
524 | float16 e1 = neg_real ^ (flip ? mi : mr); |
525 | float16 e3 = neg_imag ^ (flip ? mr : mi); |
526 | |
527 | for (j = i; j < i + eltspersegment; j += 2) { |
528 | float16 e2 = n[H2(j + flip)]; |
529 | float16 e4 = e2; |
530 | |
531 | d[H2(j)] = float16_muladd(e2, e1, d[H2(j)], 0, fpst); |
532 | d[H2(j + 1)] = float16_muladd(e4, e3, d[H2(j + 1)], 0, fpst); |
533 | } |
534 | } |
535 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
536 | } |
537 | |
538 | void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, |
539 | void *vfpst, uint32_t desc) |
540 | { |
541 | uintptr_t opr_sz = simd_oprsz(desc); |
542 | float32 *d = vd; |
543 | float32 *n = vn; |
544 | float32 *m = vm; |
545 | float_status *fpst = vfpst; |
546 | intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); |
547 | uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); |
548 | uint32_t neg_real = flip ^ neg_imag; |
549 | uintptr_t i; |
550 | |
551 | /* Shift boolean to the sign bit so we can xor to negate. */ |
552 | neg_real <<= 31; |
553 | neg_imag <<= 31; |
554 | |
555 | for (i = 0; i < opr_sz / 4; i += 2) { |
556 | float32 e2 = n[H4(i + flip)]; |
557 | float32 e1 = m[H4(i + flip)] ^ neg_real; |
558 | float32 e4 = e2; |
559 | float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; |
560 | |
561 | d[H4(i)] = float32_muladd(e2, e1, d[H4(i)], 0, fpst); |
562 | d[H4(i + 1)] = float32_muladd(e4, e3, d[H4(i + 1)], 0, fpst); |
563 | } |
564 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
565 | } |
566 | |
567 | void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, |
568 | void *vfpst, uint32_t desc) |
569 | { |
570 | uintptr_t opr_sz = simd_oprsz(desc); |
571 | float32 *d = vd; |
572 | float32 *n = vn; |
573 | float32 *m = vm; |
574 | float_status *fpst = vfpst; |
575 | intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); |
576 | uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); |
577 | intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); |
578 | uint32_t neg_real = flip ^ neg_imag; |
579 | intptr_t elements = opr_sz / sizeof(float32); |
580 | intptr_t eltspersegment = 16 / sizeof(float32); |
581 | intptr_t i, j; |
582 | |
583 | /* Shift boolean to the sign bit so we can xor to negate. */ |
584 | neg_real <<= 31; |
585 | neg_imag <<= 31; |
586 | |
587 | for (i = 0; i < elements; i += eltspersegment) { |
588 | float32 mr = m[H4(i + 2 * index + 0)]; |
589 | float32 mi = m[H4(i + 2 * index + 1)]; |
590 | float32 e1 = neg_real ^ (flip ? mi : mr); |
591 | float32 e3 = neg_imag ^ (flip ? mr : mi); |
592 | |
593 | for (j = i; j < i + eltspersegment; j += 2) { |
594 | float32 e2 = n[H4(j + flip)]; |
595 | float32 e4 = e2; |
596 | |
597 | d[H4(j)] = float32_muladd(e2, e1, d[H4(j)], 0, fpst); |
598 | d[H4(j + 1)] = float32_muladd(e4, e3, d[H4(j + 1)], 0, fpst); |
599 | } |
600 | } |
601 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
602 | } |
603 | |
604 | void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, |
605 | void *vfpst, uint32_t desc) |
606 | { |
607 | uintptr_t opr_sz = simd_oprsz(desc); |
608 | float64 *d = vd; |
609 | float64 *n = vn; |
610 | float64 *m = vm; |
611 | float_status *fpst = vfpst; |
612 | intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); |
613 | uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); |
614 | uint64_t neg_real = flip ^ neg_imag; |
615 | uintptr_t i; |
616 | |
617 | /* Shift boolean to the sign bit so we can xor to negate. */ |
618 | neg_real <<= 63; |
619 | neg_imag <<= 63; |
620 | |
621 | for (i = 0; i < opr_sz / 8; i += 2) { |
622 | float64 e2 = n[i + flip]; |
623 | float64 e1 = m[i + flip] ^ neg_real; |
624 | float64 e4 = e2; |
625 | float64 e3 = m[i + 1 - flip] ^ neg_imag; |
626 | |
627 | d[i] = float64_muladd(e2, e1, d[i], 0, fpst); |
628 | d[i + 1] = float64_muladd(e4, e3, d[i + 1], 0, fpst); |
629 | } |
630 | clear_tail(d, opr_sz, simd_maxsz(desc)); |
631 | } |
632 | |
633 | #define DO_2OP(NAME, FUNC, TYPE) \ |
634 | void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ |
635 | { \ |
636 | intptr_t i, oprsz = simd_oprsz(desc); \ |
637 | TYPE *d = vd, *n = vn; \ |
638 | for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ |
639 | d[i] = FUNC(n[i], stat); \ |
640 | } \ |
641 | clear_tail(d, oprsz, simd_maxsz(desc)); \ |
642 | } |
643 | |
644 | DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) |
645 | DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) |
646 | DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) |
647 | |
648 | DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) |
649 | DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) |
650 | DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) |
651 | |
652 | #undef DO_2OP |
653 | |
654 | /* Floating-point trigonometric starting value. |
655 | * See the ARM ARM pseudocode function FPTrigSMul. |
656 | */ |
657 | static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) |
658 | { |
659 | float16 result = float16_mul(op1, op1, stat); |
660 | if (!float16_is_any_nan(result)) { |
661 | result = float16_set_sign(result, op2 & 1); |
662 | } |
663 | return result; |
664 | } |
665 | |
666 | static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) |
667 | { |
668 | float32 result = float32_mul(op1, op1, stat); |
669 | if (!float32_is_any_nan(result)) { |
670 | result = float32_set_sign(result, op2 & 1); |
671 | } |
672 | return result; |
673 | } |
674 | |
675 | static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) |
676 | { |
677 | float64 result = float64_mul(op1, op1, stat); |
678 | if (!float64_is_any_nan(result)) { |
679 | result = float64_set_sign(result, op2 & 1); |
680 | } |
681 | return result; |
682 | } |
683 | |
684 | #define DO_3OP(NAME, FUNC, TYPE) \ |
685 | void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ |
686 | { \ |
687 | intptr_t i, oprsz = simd_oprsz(desc); \ |
688 | TYPE *d = vd, *n = vn, *m = vm; \ |
689 | for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ |
690 | d[i] = FUNC(n[i], m[i], stat); \ |
691 | } \ |
692 | clear_tail(d, oprsz, simd_maxsz(desc)); \ |
693 | } |
694 | |
695 | DO_3OP(gvec_fadd_h, float16_add, float16) |
696 | DO_3OP(gvec_fadd_s, float32_add, float32) |
697 | DO_3OP(gvec_fadd_d, float64_add, float64) |
698 | |
699 | DO_3OP(gvec_fsub_h, float16_sub, float16) |
700 | DO_3OP(gvec_fsub_s, float32_sub, float32) |
701 | DO_3OP(gvec_fsub_d, float64_sub, float64) |
702 | |
703 | DO_3OP(gvec_fmul_h, float16_mul, float16) |
704 | DO_3OP(gvec_fmul_s, float32_mul, float32) |
705 | DO_3OP(gvec_fmul_d, float64_mul, float64) |
706 | |
707 | DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) |
708 | DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) |
709 | DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) |
710 | |
711 | #ifdef TARGET_AARCH64 |
712 | |
713 | DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) |
714 | DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) |
715 | DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) |
716 | |
717 | DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) |
718 | DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) |
719 | DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) |
720 | |
721 | #endif |
722 | #undef DO_3OP |
723 | |
724 | /* For the indexed ops, SVE applies the index per 128-bit vector segment. |
725 | * For AdvSIMD, there is of course only one such vector segment. |
726 | */ |
727 | |
728 | #define DO_MUL_IDX(NAME, TYPE, H) \ |
729 | void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ |
730 | { \ |
731 | intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ |
732 | intptr_t idx = simd_data(desc); \ |
733 | TYPE *d = vd, *n = vn, *m = vm; \ |
734 | for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ |
735 | TYPE mm = m[H(i + idx)]; \ |
736 | for (j = 0; j < segment; j++) { \ |
737 | d[i + j] = TYPE##_mul(n[i + j], mm, stat); \ |
738 | } \ |
739 | } \ |
740 | } |
741 | |
742 | DO_MUL_IDX(gvec_fmul_idx_h, float16, H2) |
743 | DO_MUL_IDX(gvec_fmul_idx_s, float32, H4) |
744 | DO_MUL_IDX(gvec_fmul_idx_d, float64, ) |
745 | |
746 | #undef DO_MUL_IDX |
747 | |
748 | #define DO_FMLA_IDX(NAME, TYPE, H) \ |
749 | void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ |
750 | void *stat, uint32_t desc) \ |
751 | { \ |
752 | intptr_t i, j, oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \ |
753 | TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ |
754 | intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ |
755 | TYPE *d = vd, *n = vn, *m = vm, *a = va; \ |
756 | op1_neg <<= (8 * sizeof(TYPE) - 1); \ |
757 | for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ |
758 | TYPE mm = m[H(i + idx)]; \ |
759 | for (j = 0; j < segment; j++) { \ |
760 | d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ |
761 | mm, a[i + j], 0, stat); \ |
762 | } \ |
763 | } \ |
764 | } |
765 | |
766 | DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) |
767 | DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) |
768 | DO_FMLA_IDX(gvec_fmla_idx_d, float64, ) |
769 | |
770 | #undef DO_FMLA_IDX |
771 | |
772 | #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ |
773 | void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ |
774 | { \ |
775 | intptr_t i, oprsz = simd_oprsz(desc); \ |
776 | TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ |
777 | bool q = false; \ |
778 | for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ |
779 | WTYPE dd = (WTYPE)n[i] OP m[i]; \ |
780 | if (dd < MIN) { \ |
781 | dd = MIN; \ |
782 | q = true; \ |
783 | } else if (dd > MAX) { \ |
784 | dd = MAX; \ |
785 | q = true; \ |
786 | } \ |
787 | d[i] = dd; \ |
788 | } \ |
789 | if (q) { \ |
790 | uint32_t *qc = vq; \ |
791 | qc[0] = 1; \ |
792 | } \ |
793 | clear_tail(d, oprsz, simd_maxsz(desc)); \ |
794 | } |
795 | |
796 | DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) |
797 | DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) |
798 | DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) |
799 | |
800 | DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) |
801 | DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) |
802 | DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) |
803 | |
804 | DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) |
805 | DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) |
806 | DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) |
807 | |
808 | DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) |
809 | DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) |
810 | DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) |
811 | |
812 | #undef DO_SAT |
813 | |
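/*
 * The 64-bit saturating operations cannot widen to a larger type,
 * so overflow is detected directly: by the carry/borrow comparisons
 * for the unsigned forms and by sign-bit logic for the signed forms.
 */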
814 | void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, |
815 | void *vm, uint32_t desc) |
816 | { |
817 | intptr_t i, oprsz = simd_oprsz(desc); |
818 | uint64_t *d = vd, *n = vn, *m = vm; |
819 | bool q = false; |
820 | |
821 | for (i = 0; i < oprsz / 8; i++) { |
822 | uint64_t nn = n[i], mm = m[i], dd = nn + mm; |
823 | if (dd < nn) { |
824 | dd = UINT64_MAX; |
825 | q = true; |
826 | } |
827 | d[i] = dd; |
828 | } |
829 | if (q) { |
830 | uint32_t *qc = vq; |
831 | qc[0] = 1; |
832 | } |
833 | clear_tail(d, oprsz, simd_maxsz(desc)); |
834 | } |
835 | |
836 | void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, |
837 | void *vm, uint32_t desc) |
838 | { |
839 | intptr_t i, oprsz = simd_oprsz(desc); |
840 | uint64_t *d = vd, *n = vn, *m = vm; |
841 | bool q = false; |
842 | |
843 | for (i = 0; i < oprsz / 8; i++) { |
844 | uint64_t nn = n[i], mm = m[i], dd = nn - mm; |
845 | if (nn < mm) { |
846 | dd = 0; |
847 | q = true; |
848 | } |
849 | d[i] = dd; |
850 | } |
851 | if (q) { |
852 | uint32_t *qc = vq; |
853 | qc[0] = 1; |
854 | } |
855 | clear_tail(d, oprsz, simd_maxsz(desc)); |
856 | } |
857 | |
858 | void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, |
859 | void *vm, uint32_t desc) |
860 | { |
861 | intptr_t i, oprsz = simd_oprsz(desc); |
862 | int64_t *d = vd, *n = vn, *m = vm; |
863 | bool q = false; |
864 | |
865 | for (i = 0; i < oprsz / 8; i++) { |
866 | int64_t nn = n[i], mm = m[i], dd = nn + mm; |
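        /*
         * Signed overflow iff the operands have the same sign and the
         * result's sign differs from them; saturate toward the sign
         * of nn.
         */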
867 | if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { |
868 | dd = (nn >> 63) ^ ~INT64_MIN; |
869 | q = true; |
870 | } |
871 | d[i] = dd; |
872 | } |
873 | if (q) { |
874 | uint32_t *qc = vq; |
875 | qc[0] = 1; |
876 | } |
877 | clear_tail(d, oprsz, simd_maxsz(desc)); |
878 | } |
879 | |
880 | void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, |
881 | void *vm, uint32_t desc) |
882 | { |
883 | intptr_t i, oprsz = simd_oprsz(desc); |
884 | int64_t *d = vd, *n = vn, *m = vm; |
885 | bool q = false; |
886 | |
887 | for (i = 0; i < oprsz / 8; i++) { |
888 | int64_t nn = n[i], mm = m[i], dd = nn - mm; |
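        /*
         * Signed overflow iff the operands differ in sign and the
         * result's sign differs from nn.
         */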
889 | if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { |
890 | dd = (nn >> 63) ^ ~INT64_MIN; |
891 | q = true; |
892 | } |
893 | d[i] = dd; |
894 | } |
895 | if (q) { |
896 | uint32_t *qc = vq; |
897 | qc[0] = 1; |
898 | } |
899 | clear_tail(d, oprsz, simd_maxsz(desc)); |
900 | } |
901 | |
902 | /* |
903 | * Convert float16 to float32, raising no exceptions and |
904 | * preserving exceptional values, including SNaN. |
905 | * This is effectively an unpack+repack operation. |
906 | */ |
907 | static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16) |
908 | { |
909 | const int f16_bias = 15; |
910 | const int f32_bias = 127; |
911 | uint32_t sign = extract32(f16, 15, 1); |
912 | uint32_t exp = extract32(f16, 10, 5); |
913 | uint32_t frac = extract32(f16, 0, 10); |
914 | |
915 | if (exp == 0x1f) { |
916 | /* Inf or NaN */ |
917 | exp = 0xff; |
918 | } else if (exp == 0) { |
919 | /* Zero or denormal. */ |
920 | if (frac != 0) { |
921 | if (fz16) { |
922 | frac = 0; |
923 | } else { |
924 | /* |
925 | * Denormal; these are all normal float32. |
                 * Shift the fraction so that the msb is at bit 10,
                 * then remove bit 10 as the implicit bit of the
928 | * normalized float32. Note that we still go through |
929 | * the shift for normal numbers below, to put the |
930 | * float32 fraction at the right place. |
931 | */ |
932 | int shift = clz32(frac) - 21; |
933 | frac = (frac << shift) & 0x3ff; |
934 | exp = f32_bias - f16_bias - shift + 1; |
935 | } |
936 | } |
937 | } else { |
938 | /* Normal number; adjust the bias. */ |
939 | exp += f32_bias - f16_bias; |
940 | } |
941 | sign <<= 31; |
942 | exp <<= 23; |
943 | frac <<= 23 - 10; |
944 | |
945 | return sign | exp | frac; |
946 | } |
947 | |
948 | static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2) |
949 | { |
950 | /* |
951 | * Branchless load of u32[0], u64[0], u32[1], or u64[1]. |
952 | * Load the 2nd qword iff is_q & is_2. |
953 | * Shift to the 2nd dword iff !is_q & is_2. |
954 | * For !is_q & !is_2, the upper bits of the result are garbage. |
955 | */ |
956 | return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5); |
957 | } |
958 | |
959 | /* |
960 | * Note that FMLAL requires oprsz == 8 or oprsz == 16, |
 * as there are not yet SVE versions that might use blocking.
962 | */ |
963 | |
964 | static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst, |
965 | uint32_t desc, bool fz16) |
966 | { |
967 | intptr_t i, oprsz = simd_oprsz(desc); |
968 | int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); |
969 | int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); |
970 | int is_q = oprsz == 16; |
971 | uint64_t n_4, m_4; |
972 | |
973 | /* Pre-load all of the f16 data, avoiding overlap issues. */ |
974 | n_4 = load4_f16(vn, is_q, is_2); |
975 | m_4 = load4_f16(vm, is_q, is_2); |
976 | |
977 | /* Negate all inputs for FMLSL at once. */ |
978 | if (is_s) { |
979 | n_4 ^= 0x8000800080008000ull; |
980 | } |
981 | |
982 | for (i = 0; i < oprsz / 4; i++) { |
983 | float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); |
984 | float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16); |
985 | d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); |
986 | } |
987 | clear_tail(d, oprsz, simd_maxsz(desc)); |
988 | } |
989 | |
990 | void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm, |
991 | void *venv, uint32_t desc) |
992 | { |
993 | CPUARMState *env = venv; |
994 | do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc, |
995 | get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); |
996 | } |
997 | |
998 | void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm, |
999 | void *venv, uint32_t desc) |
1000 | { |
1001 | CPUARMState *env = venv; |
1002 | do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc, |
1003 | get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); |
1004 | } |
1005 | |
1006 | static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst, |
1007 | uint32_t desc, bool fz16) |
1008 | { |
1009 | intptr_t i, oprsz = simd_oprsz(desc); |
1010 | int is_s = extract32(desc, SIMD_DATA_SHIFT, 1); |
1011 | int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1); |
1012 | int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3); |
1013 | int is_q = oprsz == 16; |
1014 | uint64_t n_4; |
1015 | float32 m_1; |
1016 | |
1017 | /* Pre-load all of the f16 data, avoiding overlap issues. */ |
1018 | n_4 = load4_f16(vn, is_q, is_2); |
1019 | |
1020 | /* Negate all inputs for FMLSL at once. */ |
1021 | if (is_s) { |
1022 | n_4 ^= 0x8000800080008000ull; |
1023 | } |
1024 | |
1025 | m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16); |
1026 | |
1027 | for (i = 0; i < oprsz / 4; i++) { |
1028 | float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16); |
1029 | d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst); |
1030 | } |
1031 | clear_tail(d, oprsz, simd_maxsz(desc)); |
1032 | } |
1033 | |
1034 | void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm, |
1035 | void *venv, uint32_t desc) |
1036 | { |
1037 | CPUARMState *env = venv; |
1038 | do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc, |
1039 | get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); |
1040 | } |
1041 | |
1042 | void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm, |
1043 | void *venv, uint32_t desc) |
1044 | { |
1045 | CPUARMState *env = venv; |
1046 | do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc, |
1047 | get_flush_inputs_to_zero(&env->vfp.fp_status_f16)); |
1048 | } |
1049 | |