/*  Copyright (C) 2016  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_MASKED_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_MASKED_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/blend.h>
#include <simdpp/core/load.h>
#include <simdpp/core/store.h>
#include <simdpp/detail/null/memory.h>
#include <simdpp/detail/align.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {

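// 32-bit element stores. AVX-512VL and AVX2 provide native masked stores;
// plain AVX only has a float mask-store, so integer data is bitcast to float.
// Targets without any masked store use a load-blend-store sequence instead.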
static SIMDPP_INL
void i_store_masked(char* p, const uint32<4>& a, const mask_int32<4>& mask)
{
#if SIMDPP_USE_NULL
    null::store_masked(p, a, mask);
#elif SIMDPP_USE_AVX512VL
    _mm_mask_store_epi32(p, mask.native(), a.native());
#elif SIMDPP_USE_AVX2
    _mm_maskstore_epi32(reinterpret_cast<int*>(p), mask.native(), a.native());
#elif SIMDPP_USE_AVX
    _mm_maskstore_ps(reinterpret_cast<float*>(p), mask.native(),
                     _mm_castsi128_ps(a.native()));
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
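    // No native masked store here: load the destination, blend in the
    // selected lanes and store the whole vector back. Note that this reads
    // and rewrites the masked-out lanes as well.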
    uint32<4> b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_masked(char* p, const uint32<8>& a, const mask_int32<8>& mask)
{
#if SIMDPP_USE_AVX512VL
    _mm256_mask_store_epi32(p, mask.native(), a.native());
#else
    _mm256_maskstore_epi32(reinterpret_cast<int*>(p), mask.native(), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_masked(char* p, const uint32<16>& a, const mask_int32<16>& mask)
{
    _mm512_mask_store_epi32(reinterpret_cast<int*>(p), mask.native(), a.native());
}
#endif

// -----------------------------------------------------------------------------

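// 64-bit element stores. The Intel compiler declares the 64-bit mask-store
// intrinsics with __int64* pointers rather than long long*, hence the
// compiler-dependent casts below.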
static SIMDPP_INL
void i_store_masked(char* p, const uint64<2>& a, const mask_int64<2>& mask)
{
#if SIMDPP_USE_AVX512VL
#if __INTEL_COMPILER
    _mm_mask_store_epi64(reinterpret_cast<__int64*>(p), mask.native(),
                         a.native());
#else
    _mm_mask_store_epi64(reinterpret_cast<long long*>(p), mask.native(),
                         a.native());
#endif
#elif SIMDPP_USE_AVX2
#if __INTEL_COMPILER
    _mm_maskstore_epi64(reinterpret_cast<__int64*>(p), mask.native(), a.native());
#else
    _mm_maskstore_epi64(reinterpret_cast<long long*>(p), mask.native(), a.native());
#endif
#elif SIMDPP_USE_AVX
    _mm_maskstore_pd(reinterpret_cast<double*>(p), mask.native(),
                     _mm_castsi128_pd(a.native()));
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
    uint64<2> b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#elif SIMDPP_USE_NULL || SIMDPP_USE_ALTIVEC
    null::store_masked(p, a, mask);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_masked(char* p, const uint64<4>& a, const mask_int64<4>& mask)
{
#if SIMDPP_USE_AVX512VL
#if __INTEL_COMPILER
    _mm256_mask_store_epi64(reinterpret_cast<__int64*>(p), mask.native(),
                            a.native());
#else
    _mm256_mask_store_epi64(reinterpret_cast<long long*>(p), mask.native(),
                            a.native());
#endif
#else
#if __INTEL_COMPILER
    _mm256_maskstore_epi64(reinterpret_cast<__int64*>(p), mask.native(), a.native());
#else
    _mm256_maskstore_epi64(reinterpret_cast<long long*>(p), mask.native(), a.native());
#endif
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_masked(char* p, const uint64<8>& a, const mask_int64<8>& mask)
{
#if __INTEL_COMPILER
    _mm512_mask_store_epi64(reinterpret_cast<__int64*>(p), mask.native(), a.native());
#else
    _mm512_mask_store_epi64(reinterpret_cast<long long*>(p), mask.native(), a.native());
#endif
}
#endif

// -----------------------------------------------------------------------------

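// 32-bit float stores. The AVX mask-store intrinsics take an integer mask
// vector, so the float mask is bitcast before the call.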
static SIMDPP_INL
void i_store_masked(char* p, const float32<4>& a, const mask_float32<4>& mask)
{
#if SIMDPP_USE_NULL
    null::store_masked(p, a, mask);
#elif SIMDPP_USE_AVX512VL
    _mm_mask_store_ps(reinterpret_cast<float*>(p), mask.native(), a.native());
#elif SIMDPP_USE_AVX
    _mm_maskstore_ps(reinterpret_cast<float*>(p),
                     _mm_castps_si128(mask.native()), a.native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC || SIMDPP_USE_MSA
    float32<4> b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_store_masked(char* p, const float32<8>& a, const mask_float32<8>& mask)
{
#if SIMDPP_USE_AVX512VL
    _mm256_mask_store_ps(reinterpret_cast<float*>(p), mask.native(),
                         a.native());
#else
    _mm256_maskstore_ps(reinterpret_cast<float*>(p),
                        _mm256_castps_si256(mask.native()), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_masked(char* p, const float32<16>& a, const mask_float32<16>& mask)
{
    _mm512_mask_store_ps(reinterpret_cast<float*>(p), mask.native(), a.native());
}
#endif

// -----------------------------------------------------------------------------

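// 64-bit float stores. NEON32 and Altivec without VSX 2.06 have no native
// float64 vectors, so those targets fall back to the scalar implementation.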
static SIMDPP_INL
void i_store_masked(char* p, const float64<2>& a, const mask_float64<2>& mask)
{
#if SIMDPP_USE_AVX512VL
    _mm_mask_store_pd(reinterpret_cast<double*>(p), mask.native(), a.native());
#elif SIMDPP_USE_AVX
    _mm_maskstore_pd(reinterpret_cast<double*>(p),
                     _mm_castpd_si128(mask.native()), a.native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_NEON64 || SIMDPP_USE_VSX_206 || SIMDPP_USE_MSA
    float64<2> b = load(p);
    b = blend(a, b, mask);
    store(p, b);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON32 || SIMDPP_USE_ALTIVEC
    null::store_masked(p, a, mask);
#endif
}

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_store_masked(char* p, const float64<4>& a, const mask_float64<4>& mask)
{
#if SIMDPP_USE_AVX512VL
    _mm256_mask_store_pd(reinterpret_cast<double*>(p), mask.native(),
                         a.native());
#else
    _mm256_maskstore_pd(reinterpret_cast<double*>(p),
                        _mm256_castpd_si256(mask.native()), a.native());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_masked(char* p, const float64<8>& a, const mask_float64<8>& mask)
{
    _mm512_mask_store_pd(reinterpret_cast<double*>(p), mask.native(), a.native());
}
#endif

// -----------------------------------------------------------------------------

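// Generic case: vectors wider than the native vector length are stored one
// native-sized sub-vector at a time.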
template<class V, class M>
void i_store_masked(char* p, const V& a, const M& mask)
{
    const unsigned veclen = V::base_vector_type::length_bytes;

    for (unsigned i = 0; i < a.vec_length; ++i) {
        i_store_masked(p, a.vec(i), mask.vec(i));
        p += veclen;
    }
}
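
// A minimal usage sketch (hypothetical buffer and values; assumes the public
// simdpp::store_masked wrapper, which dispatches to the overloads above):
//
//     SIMDPP_ALIGN(16) uint32_t buf[4] = { 9, 9, 9, 9 };
//     uint32<4> v = make_uint(1, 2, 3, 4);
//     mask_int32<4> m = cmp_gt(v, make_uint(2)); // lanes 2 and 3 selected
//     store_masked(buf, v, m);                   // buf becomes { 9, 9, 3, 4 }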

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif