/* Copyright (C) 2016 Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MUL_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_MUL_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/extract.h>
#include <simdpp/core/f_mul.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/make_uint.h>
#include <simdpp/core/to_int32.h>
#include <simdpp/detail/mem_block.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {

// forward declarations
template<unsigned N, class E> SIMDPP_INL
int32_t reduce_mul(const int16<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint32_t reduce_mul(const uint16<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
int32_t reduce_mul(const int32<N,E>& a);
template<unsigned N, class E> SIMDPP_INL
uint32_t reduce_mul(const uint32<N,E>& a);

namespace detail {
namespace insn {

static SIMDPP_INL
uint32_t i_reduce_mul(const uint16x8& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r *= a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
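    // Treat the vector as four 32-bit lanes, each holding a pair of 16-bit
    // elements. The shifted copies isolate one element of each pair with the
    // vacated bits zeroed, so the 16-bit mullo/mulhi below yield the low and
    // high halves of each pairwise product and can simply be OR'ed together.
    // The four 32-bit partial products are then folded into lane 0 with
    // _mm_mul_epu32; only the low 32 bits of the result are needed.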
    uint32x4 ca = (uint32x4) a;
    // shift the data, filling the vacated bits with zeros
    uint32x4 al = shift_r<16>(ca);
    uint32x4 ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32x4 lo = _mm_mullo_epi16(ca.native(), al.native());
    uint32x4 hi = _mm_mulhi_epu16(ca.native(), ah.native());
    uint32x4 r = bit_or(lo, hi);
    r = _mm_mul_epu32(r.native(), move4_l<1>(r).eval().native());
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
#elif SIMDPP_USE_NEON
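    // vmull_u16 multiplies the low half of the vector by the high half,
    // widening to four 32-bit products, which are then folded into lane 0.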
    uint32x4 r = vmull_u16(vget_low_u16(a.native()),
                           vget_high_u16(a.native()));
    r = mul_lo(r, move4_l<2>(r));
    r = mul_lo(r, move4_l<1>(r));
    return extract<0>(r);
#elif SIMDPP_USE_ALTIVEC
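    // vec_mule widens and multiplies every other 16-bit element of its
    // operands; pairing the vector with a copy shifted by one element yields
    // the four products of adjacent elements, which are combined in scalar
    // code through a memory block.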
    uint16x8 a2 = move8_l<1>(a);
    uint32x4 r = vec_mule(a2.native(), a.native());
    mem_block<uint32x4> b = r;
    return b[0] * b[1] * b[2] * b[3];
#elif SIMDPP_USE_MSA
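    // Widen to two uint32x4 vectors, multiply them element-wise and fold the
    // four 32-bit partial products into lane 0.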
    uint32<8> a32 = to_uint32(a);
    uint32<4> r = mul_lo(a32.vec(0), a32.vec(1));
    r = mul_lo(r, move4_l<2>(r));
    r = mul_lo(r, move4_l<1>(r));
    return extract<0>(r);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32_t i_reduce_mul(const uint16x16& a)
{
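    // Same pairwise 16-bit trick as the 128-bit SSE2 path above, extended to
    // 256 bits; the two 128-bit halves of the partial product are combined
    // with _mm_mul_epu32 at the end.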
    uint32x8 ca = (uint32x8) a;
    // shift the data, filling the vacated bits with zeros
    uint32x8 al = shift_r<16>(ca);
    uint32x8 ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32x8 lo = _mm256_mullo_epi16(ca.native(), al.native());
    uint32x8 hi = _mm256_mulhi_epu16(ca.native(), ah.native());
    uint32x8 r = bit_or(lo, hi);
    r = _mm256_mul_epu32(r.native(), move4_l<1>(r).eval().native());
    uint32x4 r2 = _mm_mul_epu32(detail::extract128<0>(r).native(),
                                detail::extract128<1>(r).native());
    r2 = _mm_mul_epu32(r2.native(), move4_l<2>(r2).eval().native());
    return extract<0>(r2);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint32_t i_reduce_mul(const uint16<32>& a)
{
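    // Same pairwise 16-bit trick as above; the vector of 32-bit partial
    // products is handed to the 32-bit overload of reduce_mul.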
    uint32<16> ca = (uint32<16>) a;
    // shift the data, filling the vacated bits with zeros
    uint32<16> al = shift_r<16>(ca);
    uint32<16> ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32<16> lo = _mm512_mullo_epi16(ca.native(), al.native());
    uint32<16> hi = _mm512_mulhi_epu16(ca.native(), ah.native());
    uint32<16> r = bit_or(lo, hi);
    return reduce_mul(r);
}
#endif

template<unsigned N>
SIMDPP_INL uint32_t i_reduce_mul(const uint16<N>& a)
{
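    // Multi-vector case: the SIMD paths accumulate a vector of partial
    // products across the sub-vectors and fold to a scalar only once at the
    // end; the NULL and Altivec paths reduce directly.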
#if SIMDPP_USE_NULL
    uint32_t r = 1;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r *= a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    uint32<16> prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32<16> ca = (uint32<16>) a.vec(j);
        // shift the data, filling the vacated bits with zeros
        uint32<16> al = shift_r<16>(ca);
        uint32<16> ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32<16> lo = _mm512_mullo_epi16(ca.native(), al.native());
        uint32<16> hi = _mm512_mulhi_epu16(ca.native(), ah.native());
        uint32<16> iprod = bit_or(lo, hi);
        iprod = _mm512_mul_epu32(iprod.native(), move4_l<1>(iprod).eval().native());
        prod = _mm512_mul_epu32(prod.native(), iprod.native());
    }
    return reduce_mul(prod);
#elif SIMDPP_USE_AVX2
    uint32x8 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x8 ca = (uint32x8) a.vec(j);
        // shift the data, filling the vacated bits with zeros
        uint32x8 al = shift_r<16>(ca);
        uint32x8 ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32x8 lo = _mm256_mullo_epi16(ca.native(), al.native());
        uint32x8 hi = _mm256_mulhi_epu16(ca.native(), ah.native());
        uint32x8 iprod = bit_or(lo, hi);
        iprod = _mm256_mul_epu32(iprod.native(), move4_l<1>(iprod).eval().native());
        prod = _mm256_mul_epu32(prod.native(), iprod.native());
    }
    uint32x4 r2 = _mm_mul_epu32(detail::extract128<0>(prod).native(),
                                detail::extract128<1>(prod).native());
    r2 = _mm_mul_epu32(r2.native(), move4_l<2>(r2).eval().native());
    return extract<0>(r2);
#elif SIMDPP_USE_SSE2
    uint32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 ca = (uint32x4) a.vec(j);
        // shift the data, filling the vacated bits with zeros
        uint32x4 al = shift_r<16>(ca);
        uint32x4 ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32x4 lo = _mm_mullo_epi16(ca.native(), al.native());
        uint32x4 hi = _mm_mulhi_epu16(ca.native(), ah.native());
        uint32x4 r = bit_or(lo, hi);
        r = _mm_mul_epu32(r.native(), move4_l<1>(r).eval().native());
        prod = _mm_mul_epu32(prod.native(), r.native());
    }
    prod = _mm_mul_epu32(prod.native(), move4_l<2>(prod).eval().native());
    return extract<0>(prod);
#elif SIMDPP_USE_NEON
    uint32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 r = vmull_u16(vget_low_u16(a.vec(j).native()),
                               vget_high_u16(a.vec(j).native()));
        prod = mul_lo(prod, r);
    }
    prod = mul_lo(prod, move4_l<2>(prod));
    prod = mul_lo(prod, move4_l<1>(prod));
    return extract<0>(prod);
#elif SIMDPP_USE_ALTIVEC
    uint32_t r = i_reduce_mul(a.vec(0));
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r *= i_reduce_mul(a.vec(j));
    }
    return r;
#elif SIMDPP_USE_MSA
    uint32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32<8> a32 = to_uint32(a.vec(j));
        prod = mul_lo(prod, mul_lo(a32.vec(0), a32.vec(1)));
    }
    prod = mul_lo(prod, move4_l<2>(prod));
    prod = mul_lo(prod, move4_l<1>(prod));
    return extract<0>(prod);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
int32_t i_reduce_mul(const int16x8& a)
{
#if SIMDPP_USE_NULL
    int32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r *= a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
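    // Same layout trick as the unsigned version, with _mm_mulhi_epi16
    // supplying the signed high halves. Only the low 32 bits of the final
    // product are returned, so the unsigned folding steps remain valid for
    // signed inputs.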
    uint32x4 ca = (uint32x4) a;
    // shift the data, filling the vacated bits with zeros
    uint32x4 al = shift_r<16>(ca);
    uint32x4 ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32x4 lo = _mm_mullo_epi16(ca.native(), al.native());
    uint32x4 hi = _mm_mulhi_epi16(ca.native(), ah.native());
    uint32x4 r = bit_or(lo, hi);
    r = _mm_mul_epu32(r.native(), move4_l<1>(r).eval().native());
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
#elif SIMDPP_USE_NEON
    int32x4 r = vmull_s16(vget_low_s16(a.native()), vget_high_s16(a.native()));
    r = mul_lo(r, move4_l<2>(r));
    r = mul_lo(r, move4_l<1>(r));
    return extract<0>(r);
#elif SIMDPP_USE_ALTIVEC
    int16x8 a2 = move8_l<1>(a);
    int32x4 r = vec_mule(a2.native(), a.native());
    mem_block<int32x4> b = r;
    return b[0] * b[1] * b[2] * b[3];
#elif SIMDPP_USE_MSA
    int32<8> a32 = to_int32(a);
    int32<4> r = mul_lo(a32.vec(0), a32.vec(1));
    r = mul_lo(r, move4_l<2>(r));
    r = mul_lo(r, move4_l<1>(r));
    return extract<0>(r);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32_t i_reduce_mul(const int16x16& a)
{
    uint32x8 ca = (uint32x8) a;
    // shift the data, filling the vacated bits with zeros
    uint32x8 al = shift_r<16>(ca);
    uint32x8 ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32x8 lo = _mm256_mullo_epi16(ca.native(), al.native());
    uint32x8 hi = _mm256_mulhi_epi16(ca.native(), ah.native());
    uint32x8 r = bit_or(lo, hi);
    r = _mm256_mul_epu32(r.native(), move4_l<1>(r).eval().native());
    uint32x4 r2 = _mm_mul_epu32(detail::extract128<0>(r).native(),
                                detail::extract128<1>(r).native());
    r2 = _mm_mul_epu32(r2.native(), move4_l<2>(r2).eval().native());
    return extract<0>(r2);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int32_t i_reduce_mul(const int16<32>& a)
{
    uint32<16> ca = (uint32<16>) a;
    // shift the data, filling the vacated bits with zeros
    uint32<16> al = shift_r<16>(ca);
    uint32<16> ah = shift_l<16>(ca);
    // due to zeroing bits in previous steps, the result can be OR'ed
    uint32<16> lo = _mm512_mullo_epi16(ca.native(), al.native());
    uint32<16> hi = _mm512_mulhi_epi16(ca.native(), ah.native());
    uint32<16> r = bit_or(lo, hi);
    return reduce_mul(r);
}
#endif

template<unsigned N>
SIMDPP_INL int32_t i_reduce_mul(const int16<N>& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = 1;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r *= a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX512BW
    uint32<16> prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32<16> ca = (uint32<16>) a.vec(j);
        // shift the data, filling the vacated bits with zeros
        uint32<16> al = shift_r<16>(ca);
        uint32<16> ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32<16> lo = _mm512_mullo_epi16(ca.native(), al.native());
        uint32<16> hi = _mm512_mulhi_epi16(ca.native(), ah.native());
        uint32<16> iprod = bit_or(lo, hi);
        iprod = _mm512_mul_epu32(iprod.native(), move4_l<1>(iprod).eval().native());
        prod = _mm512_mul_epu32(prod.native(), iprod.native());
    }
    return reduce_mul(prod);
#elif SIMDPP_USE_AVX2
    uint32x8 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x8 ca = (uint32x8) a.vec(j);
        // shift the data, filling the vacated bits with zeros
        uint32x8 al = shift_r<16>(ca);
        uint32x8 ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32x8 lo = _mm256_mullo_epi16(ca.native(), al.native());
        uint32x8 hi = _mm256_mulhi_epi16(ca.native(), ah.native());
        uint32x8 iprod = bit_or(lo, hi);
        iprod = _mm256_mul_epu32(iprod.native(), move4_l<1>(iprod).eval().native());
        prod = _mm256_mul_epu32(prod.native(), iprod.native());
    }
    uint32x4 r2 = _mm_mul_epu32(detail::extract128<0>(prod).native(),
                                detail::extract128<1>(prod).native());
    r2 = _mm_mul_epu32(r2.native(), move4_l<2>(r2).eval().native());
    return extract<0>(r2);
#elif SIMDPP_USE_SSE2
    uint32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 ca = (uint32x4) a.vec(j);
        // shift the data, filling the vacated bits with zeros
        uint32x4 al = shift_r<16>(ca);
        uint32x4 ah = shift_l<16>(ca);
        // due to zeroing bits in previous steps, the result can be OR'ed
        uint32x4 lo = _mm_mullo_epi16(ca.native(), al.native());
        uint32x4 hi = _mm_mulhi_epi16(ca.native(), ah.native());
        uint32x4 r = bit_or(lo, hi);
        r = _mm_mul_epu32(r.native(), move4_l<1>(r).eval().native());
        prod = _mm_mul_epu32(prod.native(), r.native());
    }
    prod = _mm_mul_epu32(prod.native(), move4_l<2>(prod).eval().native());
    return extract<0>(prod);
#elif SIMDPP_USE_NEON
    int32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32x4 r = vmull_s16(vget_low_s16(a.vec(j).native()),
                              vget_high_s16(a.vec(j).native()));
        prod = mul_lo(prod, r);
    }
    prod = mul_lo(prod, move4_l<2>(prod));
    prod = mul_lo(prod, move4_l<1>(prod));
    return extract<0>(prod);
#elif SIMDPP_USE_ALTIVEC
    int32_t r = i_reduce_mul(a.vec(0));
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r *= i_reduce_mul(a.vec(j));
    }
    return r;
#elif SIMDPP_USE_MSA
    int32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        int32<8> a32 = to_int32(a.vec(j));
        prod = mul_lo(prod, mul_lo(a32.vec(0), a32.vec(1)));
    }
    prod = mul_lo(prod, move4_l<2>(prod));
    prod = mul_lo(prod, move4_l<1>(prod));
    return extract<0>(prod);
#endif
}

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32_t i_reduce_mul(const uint32x4& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = a.el(0);
    for (unsigned i = 1; i < a.length; i++) {
        r *= a.el(i);
    }
    return r;
#elif SIMDPP_USE_SSE2
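    // _mm_mul_epu32 multiplies elements 0 and 2 of its operands; pairing the
    // vector with copies of itself shifted by one and then by two elements
    // folds all four 32-bit lanes into lane 0.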
    uint32x4 r = _mm_mul_epu32(a.native(), move4_l<1>(a).eval().native());
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
#elif SIMDPP_USE_NEON || SIMDPP_USE_MSA
    uint32x4 r = a;
    r = mul_lo(r, move4_l<2>(r));
    r = mul_lo(r, move4_l<1>(r));
    return extract<0>(r);
#elif SIMDPP_USE_ALTIVEC
    mem_block<uint32x4> b = a;
    return b[0] * b[1] * b[2] * b[3];
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32_t i_reduce_mul(const uint32x8& a)
{
    uint32x8 ra = _mm256_mul_epu32(a.native(), move4_l<1>(a).eval().native());
    uint32x4 r = _mm_mul_epu32(detail::extract128<0>(ra).native(),
                               detail::extract128<1>(ra).native());
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32_t i_reduce_mul(const uint32<16>& a)
{
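    // Multiply the two 256-bit halves element-wise and reduce the result with
    // the 256-bit overload above.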
    return i_reduce_mul(mul_lo(extract256<0>(a), extract256<1>(a)));
}
#endif

template<unsigned N>
SIMDPP_INL uint32_t i_reduce_mul(const uint32<N>& a)
{
#if SIMDPP_USE_NULL
    uint32_t r = 1;
    for (unsigned j = 0; j < a.vec_length; ++j) {
        for (unsigned i = 0; i < a.base_length; i++) {
            r *= a.vec(j).el(i);
        }
    }
    return r;
#elif SIMDPP_USE_AVX2
    uint32x8 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x8 ai = a.vec(j);
        uint32x8 ra = _mm256_mul_epu32(ai.native(), move4_l<1>(ai).eval().native());
        prod = _mm256_mul_epu32(prod.native(), ra.native());
    }
    uint32x4 r = _mm_mul_epu32(detail::extract128<0>(prod).native(),
                               detail::extract128<1>(prod).native());
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
#elif SIMDPP_USE_SSE2
    uint32x4 r = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        uint32x4 ai = a.vec(j);
        uint32x4 ra = _mm_mul_epu32(ai.native(), move4_l<1>(ai).eval().native());
        r = _mm_mul_epu32(r.native(), ra.native());
    }
    r = _mm_mul_epu32(r.native(), move4_l<2>(r).eval().native());
    return extract<0>(r);
#elif SIMDPP_USE_NEON || SIMDPP_USE_MSA
    uint32x4 prod = make_uint(1);
    for (unsigned j = 0; j < a.vec_length; ++j) {
        prod = mul_lo(prod, a.vec(j));
    }
    prod = mul_lo(prod, move4_l<2>(prod));
    prod = mul_lo(prod, move4_l<1>(prod));
    return extract<0>(prod);
#elif SIMDPP_USE_ALTIVEC
    uint32_t r = i_reduce_mul(a.vec(0));
    for (unsigned j = 1; j < a.vec_length; ++j) {
        r *= i_reduce_mul(a.vec(j));
    }
    return r;
#endif
}

// -----------------------------------------------------------------------------


} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif