1 | /* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_FIRST_H |
9 | #define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_FIRST_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/detail/align.h> |
17 | #include <simdpp/core/blend.h> |
#include <simdpp/core/cmp_lt.h>
19 | #include <simdpp/core/load.h> |
20 | #include <simdpp/core/load_u.h> |
21 | #include <simdpp/core/store.h> |
22 | #include <simdpp/detail/neon/memory_store.h> |
23 | #include <simdpp/detail/null/memory.h> |
24 | #include <simdpp/detail/extract128.h> |
25 | |
26 | namespace simdpp { |
27 | namespace SIMDPP_ARCH_NAMESPACE { |
28 | namespace detail { |
29 | namespace insn { |
30 | |
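// The i_store_first() overloads below implement partial stores (as used by
// the public store_first() API): only the first n elements of the vector are
// written to the aligned block at p, and the remaining bytes of that block
// are left unchanged. Targets without a native masked store typically
// emulate this with a load / blend / store of the whole block, or with
// per-lane stores.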
31 | static SIMDPP_INL |
32 | void i_store_first(char* p, const uint8x16& a, unsigned n) |
33 | { |
34 | p = detail::assume_aligned(p, 16); |
35 | #if SIMDPP_USE_NULL |
36 | detail::null::store_first(p, a, n); |
37 | #elif SIMDPP_USE_ALTIVEC && SIMDPP_BIG_ENDIAN |
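    // vec_lvsr(n, NULL) yields the permute vector {16-n, 17-n, ..., 31-n}, so
    // comparing it against 0x10 produces a mask that is all-ones exactly in
    // the first n byte lanes.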
38 | uint8x16 mask = vec_lvsr(n, (const uint8_t*)NULL); |
39 | mask = cmp_lt(mask, 0x10); |
40 | uint8x16 b = load(p); |
41 | b = blend(a, b, mask); |
42 | store(p, b); |
43 | #elif SIMDPP_USE_ALTIVEC && SIMDPP_LITTLE_ENDIAN |
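    // Build a byte mask by shifting an all-ones vector by n octets so that
    // blend(b, a, mask) takes the first n bytes from a and the original
    // memory contents elsewhere.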
44 | uint8<16> mask = make_ones(); |
45 | uint8<16> shift = vec_splats((unsigned char)(n << 3)); |
46 | mask = vec_slo(mask.native(), shift.native()); |
47 | |
48 | uint8x16 b = load(p); |
49 | b = blend(b, a, mask); |
50 | store(p, b); |
51 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_MSA |
52 | // for MSA we can't use __msa_sld_b because it does not work with shift==16 |
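    // Sliding-window mask: loading 16 bytes at offset 16-n out of this table
    // yields a vector that is all-ones in the first n lanes and zero in the
    // rest.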
53 | static const uint8_t mask_d[32] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, |
54 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, |
55 | 0,0,0,0,0,0,0,0, |
56 | 0,0,0,0,0,0,0,0}; |
57 | |
58 | uint8x16 mask = load_u(mask_d + 16 - n); |
59 | uint8x16 b = load(p); |
60 | b = blend(a, b, mask); |
61 | store(p, b); |
62 | #endif |
63 | } |
64 | |
65 | #if SIMDPP_USE_AVX2 |
66 | static SIMDPP_INL |
67 | void i_store_first(char* p, const uint8x32& a, unsigned n) |
68 | { |
69 | static const uint8_t mask_d[64] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, |
70 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, |
71 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, |
72 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, |
73 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
74 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0}; |
75 | |
76 | uint8x32 mask = load_u(mask_d + 32 - n); |
77 | uint8x32 b = load(p); |
78 | b = blend(a, b, mask); |
79 | store(p, b); |
80 | } |
81 | #endif |
82 | |
83 | #if SIMDPP_USE_AVX512BW |
84 | SIMDPP_INL void i_store_first(char* p, const uint8<64>& a, unsigned n) |
85 | { |
86 | static const uint8_t mask_d[128] = {0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, |
87 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, |
88 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, |
89 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, |
90 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, |
91 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, |
92 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, |
93 | 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff, |
94 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
95 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
96 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0, |
97 | 0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0}; |
98 | |
99 | uint8<64> mask = load_u(mask_d + 64 - n); |
100 | uint8<64> b = load(p); |
101 | b = blend(a, b, mask); |
102 | store(p, b); |
103 | } |
104 | #endif |
105 | |
106 | // ----------------------------------------------------------------------------- |
107 | |
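// 16-bit elements are stored by bit-casting to bytes and scaling n by the
// element size.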
108 | static SIMDPP_INL |
109 | void i_store_first(char* p, const uint16x8& a, unsigned n) |
110 | { |
111 | p = detail::assume_aligned(p, 16); |
112 | #if SIMDPP_USE_NULL |
113 | detail::null::store_first(p, a, n); |
114 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
115 | i_store_first(p, (uint8x16)a, n*2); |
116 | #endif |
117 | } |
118 | |
119 | #if SIMDPP_USE_AVX2 |
120 | static SIMDPP_INL |
121 | void i_store_first(char* p, const uint16x16& a, unsigned n) |
122 | { |
123 | i_store_first(p, uint8x32(a), n*2); |
124 | } |
125 | #endif |
126 | |
127 | #if SIMDPP_USE_AVX512BW |
128 | SIMDPP_INL void i_store_first(char* p, const uint16<32>& a, unsigned n) |
129 | { |
130 | i_store_first(p, uint8<64>(a), n*2); |
131 | } |
132 | #endif |
133 | |
134 | // ----------------------------------------------------------------------------- |
135 | |
136 | static SIMDPP_INL |
137 | void i_store_first(char* p, const uint32x4& a, unsigned n) |
138 | { |
139 | p = detail::assume_aligned(p, 16); |
140 | #if SIMDPP_USE_NULL |
141 | detail::null::store_first(p, a, n); |
142 | #elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA |
143 | i_store_first(p, (uint8x16)a, n*4); |
144 | #endif |
145 | } |
146 | |
147 | #if SIMDPP_USE_AVX2 |
148 | static SIMDPP_INL |
149 | void i_store_first(char* p, const uint32x8& a, unsigned n) |
150 | { |
151 | static const int32_t mask_d[16] = { -1, -1, -1, -1, -1, -1, -1, -1, |
152 | 0, 0, 0, 0, 0, 0, 0, 0 }; |
153 | uint32<8> mask = load_u(mask_d + 8-n); |
154 | _mm256_maskstore_epi32(reinterpret_cast<int*>(p), mask.native(), a.native()); |
155 | } |
156 | #endif |
157 | |
158 | #if SIMDPP_USE_AVX512F |
159 | static SIMDPP_INL |
160 | void i_store_first(char* p, const uint32<16>& a, unsigned n) |
161 | { |
162 | _mm512_mask_store_epi32(p, 0xffff >> (16-n), a.native()); |
163 | } |
164 | #endif |
165 | |
166 | // ----------------------------------------------------------------------------- |
167 | |
168 | static SIMDPP_INL |
169 | void i_store_first(char* p, const uint64x2& a, unsigned n) |
170 | { |
171 | p = detail::assume_aligned(p, 16); |
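    // Only n == 1 requires a partial store of a two-element vector; for
    // n == 0 there is nothing to write.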
172 | #if SIMDPP_USE_SSE2 |
173 | if (n == 1) { |
174 | _mm_store_sd(reinterpret_cast<double*>(p), _mm_castsi128_pd(a.native())); |
175 | } |
176 | #elif SIMDPP_USE_NEON |
177 | if (n == 1) { |
178 | neon::store_lane<0,1>(p, a); |
179 | } |
180 | #elif SIMDPP_USE_VSX_207 |
181 | if (n == 1) { |
182 | uint64_t* q = reinterpret_cast<uint64_t*>(p); |
183 | *q = vec_extract(a.native(), 0); |
184 | } |
185 | #elif SIMDPP_USE_MSA |
186 | i_store_first(p, uint8<16>(a), n*8); |
187 | #elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC |
188 | detail::null::store_first(p, a, n); |
189 | #endif |
190 | } |
191 | |
192 | #if SIMDPP_USE_AVX2 |
193 | static SIMDPP_INL |
194 | void i_store_first(char* p, const uint64x4& a, unsigned n) |
195 | { |
196 | static const int64_t mask_d[8] = { -1, -1, -1, -1, 0, 0, 0, 0 }; |
197 | uint64<4> mask = load_u(mask_d + 4-n); |
198 | #if __INTEL_COMPILER |
199 | _mm256_maskstore_epi64(reinterpret_cast<__int64*>(p), mask.native(), a.native()); |
200 | #else |
201 | _mm256_maskstore_epi64(reinterpret_cast<long long*>(p), mask.native(), a.native()); |
202 | #endif |
203 | } |
204 | #endif |
205 | |
206 | #if SIMDPP_USE_AVX512F |
207 | static SIMDPP_INL |
208 | void i_store_first(char* p, const uint64<8>& a, unsigned n) |
209 | { |
210 | _mm512_mask_store_epi64(p, 0xff >> (8-n), a.native()); |
211 | } |
212 | #endif |
213 | |
214 | // ----------------------------------------------------------------------------- |
215 | |
216 | static SIMDPP_INL |
217 | void i_store_first(char* p, const float32x4& a, unsigned n) |
218 | { |
219 | p = detail::assume_aligned(p, 16); |
220 | #if SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC || SIMDPP_USE_NEON_NO_FLT_SP || SIMDPP_USE_MSA |
221 | i_store_first(p, int32x4(a), n); |
222 | #elif SIMDPP_USE_AVX && !SIMDPP_USE_AMD |
223 | static const int32_t mask_d[8] = { -1, -1, -1, -1, 0, 0, 0, 0 }; |
224 | float32x4 mask = load_u(mask_d + 4-n); |
225 | _mm_maskstore_ps(reinterpret_cast<float*>(p), _mm_castps_si128(mask.native()), a.native()); |
226 | #elif SIMDPP_USE_SSE2 |
227 | static const int32_t mask_d[8] = { -1, -1, -1, -1, 0, 0, 0, 0 }; |
228 | float32x4 mask = load_u(mask_d + 4-n); |
229 | float32x4 b = load(p); |
230 | b = blend(a, b, mask); |
231 | store(p, b); |
232 | #elif SIMDPP_USE_NEON |
233 | // + VFP |
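    // Store up to the first three lanes one at a time; lanes past n are not
    // written.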
234 | if (n < 1) return; |
235 | neon::store_lane<0,1>(p, a); |
236 | if (n < 2) return; |
237 | neon::store_lane<1,1>(p+4, a); |
238 | if (n < 3) return; |
239 | neon::store_lane<2,1>(p+8, a); |
240 | #endif |
241 | } |
242 | |
243 | #if SIMDPP_USE_AVX |
244 | static SIMDPP_INL |
245 | void i_store_first(char* p, const float32x8& a, unsigned n) |
246 | { |
247 | static const int32_t mask_d[16] = { -1, -1, -1, -1, -1, -1, -1, -1, |
248 | 0, 0, 0, 0, 0, 0, 0, 0 }; |
249 | |
250 | float32x8 mask = load_u(mask_d + 8-n); |
251 | #if !SIMDPP_USE_AMD |
252 | _mm256_maskstore_ps(reinterpret_cast<float*>(p), _mm256_castps_si256(mask.native()), a.native()); |
253 | #else |
254 | float32x8 b = load(p); |
255 | b = blend(a, b, mask); |
    store(p, b);
257 | #endif |
258 | } |
259 | #endif |
260 | |
261 | #if SIMDPP_USE_AVX512F |
262 | static SIMDPP_INL |
263 | void i_store_first(char* p, const float32<16>& a, unsigned n) |
264 | { |
265 | _mm512_mask_store_ps(p, 0xffff >> (16-n), a.native()); |
266 | } |
267 | #endif |
268 | |
269 | // ----------------------------------------------------------------------------- |
270 | |
271 | static SIMDPP_INL |
272 | void i_store_first(char* p, const float64x2& a, unsigned n) |
273 | { |
274 | p = detail::assume_aligned(p, 16); |
275 | #if SIMDPP_USE_SSE2 |
276 | if (n == 1) { |
277 | _mm_store_sd(reinterpret_cast<double*>(p), a.native()); |
278 | } |
279 | #elif SIMDPP_USE_NEON64 |
280 | if (n == 1) { |
281 | vst1_f64(reinterpret_cast<double*>(p), vget_low_f64(a.native())); |
282 | } |
283 | #elif SIMDPP_USE_VSX_206 |
284 | if (n == 1) { |
285 | double* q = reinterpret_cast<double*>(p); |
286 | *q = vec_extract(a.native(), 0); |
287 | } |
288 | #elif SIMDPP_USE_MSA |
289 | i_store_first(p, (uint64x2)a, n); |
290 | #elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC |
291 | detail::null::store_first(p, a, n); |
292 | #endif |
293 | } |
294 | |
295 | #if SIMDPP_USE_AVX |
296 | static SIMDPP_INL |
297 | void i_store_first(char* p, const float64x4& a, unsigned n) |
298 | { |
    static const int64_t mask_d[8] = { -1, -1, -1, -1, 0, 0, 0, 0 };
300 | float64x4 mask = load_u(mask_d + 4-n); |
301 | #if !SIMDPP_USE_AMD |
302 | _mm256_maskstore_pd(reinterpret_cast<double*>(p), _mm256_castpd_si256(mask.native()), a.native()); |
303 | #else |
304 | float64x4 b = load(p); |
305 | b = blend(a, b, mask); |
    store(p, b);
307 | #endif |
308 | } |
309 | #endif |
310 | |
311 | #if SIMDPP_USE_AVX512F |
312 | static SIMDPP_INL |
313 | void i_store_first(char* p, const float64<8>& a, unsigned n) |
314 | { |
315 | _mm512_mask_store_pd(p, 0xff >> (8-n), a.native()); |
316 | } |
317 | #endif |
318 | |
319 | // ----------------------------------------------------------------------------- |
320 | |
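// Generic case: vectors consisting of multiple native registers store their
// full sub-vectors directly and forward the remaining partial store to one
// of the overloads above.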
321 | template<class V> SIMDPP_INL |
322 | void i_store_first(char* p, const V& ca, unsigned n) |
323 | { |
324 | const unsigned veclen = V::base_vector_type::length_bytes; |
325 | |
326 | typename detail::remove_sign<V>::type a = ca; |
327 | p = detail::assume_aligned(p, veclen); |
328 | |
329 | unsigned n_full_vec = n / V::base_vector_type::length; |
330 | unsigned mid_vec_fill_count = n % V::base_vector_type::length; |
331 | unsigned curr_vec = 0; |
332 | |
333 | for (; curr_vec < n_full_vec; ++curr_vec) { |
334 | store(p, a.vec(curr_vec)); |
335 | p += veclen; |
336 | } |
337 | |
338 | if (mid_vec_fill_count > 0) { |
339 | i_store_first(p, a.vec(curr_vec), mid_vec_fill_count); |
340 | } |
341 | } |
342 | |
343 | } // namespace insn |
344 | } // namespace detail |
345 | } // namespace SIMDPP_ARCH_NAMESPACE |
346 | } // namespace simdpp |
347 | |
348 | #endif |
349 | |