/* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt>

   Distributed under the Boost Software License, Version 1.0.
   (See accompanying file LICENSE_1_0.txt or copy at
   http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_LAST_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_LAST_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/align.h>
#include <simdpp/core/blend.h>
#include <simdpp/core/cmp_gt.h>
#include <simdpp/core/load.h>
#include <simdpp/core/load_u.h>
#include <simdpp/core/move_l.h>
#include <simdpp/core/store.h>
#include <simdpp/core/zip_hi.h>
#include <simdpp/detail/neon/memory_store.h>
#include <simdpp/detail/null/memory.h>
#include <simdpp/detail/extract128.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

// -----------------------------------------------------------------------------

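// Stores the last n elements of a vector to the last n element positions of
// the memory block at p, leaving the preceding bytes of that block unchanged.
// p must be aligned to the size of the full vector.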
static SIMDPP_INL
void i_store_last(char* p, const uint8x16& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_last(p, a, n);
#elif SIMDPP_USE_ALTIVEC && SIMDPP_BIG_ENDIAN
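    // vec_lvsl(n, NULL) produces the byte sequence {n, n+1, ..., n+15};
    // comparing it against 0x0f leaves the mask set only in the last n lanes.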
    uint8x16 mask = vec_lvsl(n, (const uint8_t*)NULL);
    mask = cmp_gt(mask, 0x0f);
    uint8x16 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#elif SIMDPP_USE_ALTIVEC && SIMDPP_LITTLE_ENDIAN
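    // Shift an all-ones vector right by n octets: the last n byte lanes of the
    // mask become zero, so the blend below takes those lanes from a and the
    // remaining lanes from the data already in memory.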
    uint8<16> mask = make_ones();
    uint8<16> shift = vec_splats((unsigned char)(n << 3));
    mask = vec_sro(mask.native(), shift.native());

    uint8x16 b = load(p);
    b = blend(b, a, mask);
    store(p, b);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON
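    // Sliding-window mask: loading 16 bytes at mask_d + n yields a mask whose
    // last n bytes are 0xff, so the blend takes those lanes from a and keeps
    // the existing memory contents elsewhere.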
    static const uint8_t mask_d[32] = {0,0,0,0,0,0,0,0,
                                       0,0,0,0,0,0,0,0,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};

    uint8x16 mask = load_u(mask_d + n);
    uint8x16 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#elif SIMDPP_USE_MSA
    int8x16 mask = make_ones();
    int8x16 zero = make_zero();
    mask = __msa_sld_b(mask.native(), zero.native(), n);
    uint8x16 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_last(char* p, const uint8x32& a, unsigned n)
{
    p = detail::assume_aligned(p, 32);
    static const uint8_t mask_d[64] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                       0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                       0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff};
    uint8x32 mask = load_u(mask_d + n);
    uint8x32 b = load(p);
    b = blend(a, b, mask);
    store(p, b);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL void i_store_last(char* p, const uint8<64>& a, unsigned n)
{
    static const uint8_t mask_d[128] = {0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                        0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,
                                        0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff };

    uint8<64> mask = load_u(mask_d + n);
    uint8<64> b = load(p);
    b = blend(a, b, mask);
    store(p, b);
}
#endif


// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_last(char* p, const uint16x8& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_last(p, a, n);
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    i_store_last(p, (uint8x16)a, n*2);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_last(char* p, const uint16x16& a, unsigned n)
{
    i_store_last(p, uint8x32(a), n*2);
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL void i_store_last(char* p, const uint16<32>& a, unsigned n)
{
    i_store_last(p, uint8<64>(a), n*2);
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_last(char* p, const uint32x4& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL
    detail::null::store_last(p, a, n);
#elif SIMDPP_USE_AVX2
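    // _mm_maskstore_epi32 writes only the lanes whose mask element has the
    // sign bit set, so the skipped elements are neither read nor written.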
    static const int32_t mask_d[8] = {0, 0, 0, 0, -1, -1, -1, -1};
    uint32x4 mask = load_u(mask_d + n);
    _mm_maskstore_epi32(reinterpret_cast<int*>(p), mask.native(), a.native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    i_store_last(p, (uint8x16)a, n*4);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_last(char* p, const uint32x8& a, unsigned n)
{
    static const int32_t mask_d[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                       -1, -1, -1, -1, -1, -1, -1, -1};
    uint32<8> mask = load_u(mask_d + n);
    _mm256_maskstore_epi32(reinterpret_cast<int*>(p), mask.native(), a.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_last(char* p, const uint32<16>& a, unsigned n)
{
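    // 0xffff << (16-n), truncated to 16 mask bits, selects exactly the last
    // n lanes for the masked store.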
    _mm512_mask_store_epi32(p, 0xffff << (16-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_last(char* p, const uint64x2& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
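    // For a 2-element vector the only partial store is n == 1; the SSE2, NEON
    // and VSX branches simply write the high element to the second position.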
#if SIMDPP_USE_SSE2
    if (n == 1) {
        uint64x2 b = move2_l<1>(a);
        _mm_store_sd(reinterpret_cast<double*>(p+8), _mm_castsi128_pd(b.native()));
    }
#elif SIMDPP_USE_NEON
    if (n == 1) {
        neon::store_lane<1,1>(p+8, a);
    }
#elif SIMDPP_USE_VSX_207
    if (n == 1) {
        uint64_t* q = reinterpret_cast<uint64_t*>(p) + 1;
        *q = vec_extract(a.native(), 1);
    }
#elif SIMDPP_USE_MSA
    i_store_last(p, uint8<16>(a), n*8);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    detail::null::store_last(p, a, n);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_last(char* p, const uint64x4& a, unsigned n)
{
    static const int64_t mask_d[8] = { 0, 0, 0, 0, -1, -1, -1, -1 };
    uint64<4> mask = load_u(mask_d + n);
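    // ICC declares the pointer parameter of _mm256_maskstore_epi64 as
    // __int64*, while other compilers use long long*.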
#if __INTEL_COMPILER
    _mm256_maskstore_epi64(reinterpret_cast<__int64*>(p), mask.native(), a.native());
#else
    _mm256_maskstore_epi64(reinterpret_cast<long long*>(p), mask.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_last(char* p, const uint64<8>& a, unsigned n)
{
    _mm512_mask_store_epi64(p, 0xff << (8-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_last(char* p, const float32x4& ca, unsigned n)
{
    float32<4> a = ca;
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON_FLT_SP || SIMDPP_USE_MSA
    i_store_last(p, uint32x4(a), n);
#elif SIMDPP_USE_AVX && !SIMDPP_USE_AMD
    static const int32_t mask_d[8] = { 0, 0, 0, 0, -1, -1, -1, -1 };
    float32x4 mask = load_u(mask_d + n);
    _mm_maskstore_ps(reinterpret_cast<float*>(p),
                     _mm_castps_si128(mask.native()), a.native());
#elif SIMDPP_USE_SSE2
    static const int32_t mask_d[8] = { 0, 0, 0, 0, -1, -1, -1, -1 };
    float32x4 mask = load_u(mask_d + n);
    float32x4 old = load(p);
    a = blend(a, old, mask);
    store(p, a);
#elif SIMDPP_USE_NEON
    // + VFP
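    // Store the requested lanes one at a time, from the highest lane down.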
    if (n < 1) return;
    neon::store_lane<3,1>(p+12, a);
    if (n < 2) return;
    neon::store_lane<2,1>(p+8, a);
    if (n < 3) return;
    neon::store_lane<1,1>(p+4, a);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_store_last(char* p, const float32x8& ca, unsigned n)
{
    float32<8> a = ca;
    static const int32_t mask_d[16] = { 0, 0, 0, 0, 0, 0, 0, 0,
                                        -1, -1, -1, -1, -1, -1, -1, -1 };

    float32x8 mask = load_u(mask_d + n);
#if !SIMDPP_USE_AMD
    _mm256_maskstore_ps(reinterpret_cast<float*>(p),
                        _mm256_castps_si256(mask.native()), a.native());
#else
    float32x8 old = load(p);
    a = blend(a, old, mask);
    store(p, a);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_last(char* p, const float32<16>& a, unsigned n)
{
    _mm512_mask_store_ps(p, 0xffff << (16-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
void i_store_last(char* p, const float64x2& a, unsigned n)
{
    p = detail::assume_aligned(p, 16);
#if SIMDPP_USE_SSE2
    if (n == 1) {
        float64x2 b = zip2_hi(a, a);
        _mm_store_sd(reinterpret_cast<double*>(p)+1, b.native());
    }
#elif SIMDPP_USE_NEON64
    if (n == 1) {
        vst1_f64(reinterpret_cast<double*>(p)+1, vget_high_f64(a.native()));
    }
#elif SIMDPP_USE_VSX_206
    if (n == 1) {
        *(reinterpret_cast<double*>(p)+1) = vec_extract(a.native(), 1);
    }
#elif SIMDPP_USE_MSA
    i_store_last(p, uint64x2(a), n);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    detail::null::store_last(p, a, n);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_store_last(char* p, const float64x4& a, unsigned n)
{
    static const int64_t mask_d[8] = { 0, 0, 0, 0, -1, -1, -1, -1 };
    float64x4 mask = load_u(mask_d + n);

#if !SIMDPP_USE_AMD
    _mm256_maskstore_pd(reinterpret_cast<double*>(p),
                        _mm256_castpd_si256(mask.native()), a.native());
#else
    float64x4 old = load(p);
    float64x4 r = blend(a, old, mask);
    store(p, r);
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_last(char* p, const float64<8>& a, unsigned n)
{
    _mm512_mask_store_pd(p, 0xff << (8-n), a.native());
}
#endif

// -----------------------------------------------------------------------------

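// Generic case for multi-register vectors: skip the leading sub-vectors that
// hold no stored elements, perform a partial store_last into the first
// sub-vector that does (mid_vec_skip_count is the number of elements written
// there), then store the remaining sub-vectors in full.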
template<class V> SIMDPP_INL
void i_store_last(char* p, const V& ca, unsigned n)
{
    const unsigned veclen = V::base_vector_type::length_bytes;

    typename detail::remove_sign<V>::type a = ca;
    p = detail::assume_aligned(p, veclen);
    unsigned el_to_skip = V::length - n;

    unsigned n_empty_vec = el_to_skip / V::base_vector_type::length;
    unsigned mid_vec_skip_count = n % V::base_vector_type::length;
    unsigned curr_vec = 0;

    p += n_empty_vec * veclen;
    curr_vec += n_empty_vec;
    if (mid_vec_skip_count > 0) {
        i_store_last(p, a.vec(curr_vec), mid_vec_skip_count);
        p += veclen;
        curr_vec++;
    }

    for (; curr_vec < V::vec_length; ++curr_vec) {
        i_store(p, a.vec(curr_vec));
        p += veclen;
    }
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
