/* Copyright (C) 2015

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_U_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_U_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/null/memory.h>
#include <simdpp/detail/align.h>
#include <simdpp/core/cast.h>

namespace simdpp {
#ifndef SIMDPP_DOXYGEN
namespace SIMDPP_ARCH_NAMESPACE {
#endif
namespace detail {
namespace insn {

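// Unaligned store implementations. Each i_store_u overload writes one full
// native-width vector to an address with no alignment requirement, selecting
// the appropriate mechanism for the enabled instruction set at compile time.
//
// Illustrative usage through the public API (a sketch; user code goes through
// simdpp::store_u rather than calling these internal functions directly):
//
//     #include <simdpp/simd.h>
//
//     void copy16(unsigned char* dst, const unsigned char* src)
//     {
//         simdpp::uint8<16> v = simdpp::load_u(src); // unaligned load
//         simdpp::store_u(dst, v);                   // dispatches to i_store_u
//     }
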
static SIMDPP_INL
void i_store_u(char* p, const uint8<16>& a)
{
#if SIMDPP_USE_NULL
    detail::null::store(p, a);
#elif SIMDPP_USE_SSE2
    _mm_storeu_si128(reinterpret_cast<__m128i*>(p), a.native());
#elif SIMDPP_USE_NEON
    vst1q_u8(reinterpret_cast<uint8_t*>(p), a.native());
#elif SIMDPP_USE_VSX_206
    uint8_t* q = reinterpret_cast<uint8_t*>(p);
    vec_vsx_st(a.native(), 0, q);
#elif SIMDPP_USE_ALTIVEC
    // From https://web.archive.org/web/20110305043420/http://developer.apple.com/hardwaredrivers/ve/alignment.html
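    // AltiVec has no unaligned store instruction, so the sequence below loads
    // the two aligned quadwords that the destination straddles, permutes the
    // new data into the correct positions within them, and stores both back.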
    uint8_t* q = reinterpret_cast<uint8_t*>(p);
    __vector uint8_t msq, lsq, edges;
    __vector uint8_t edge_align, align;
    msq = vec_ld(0, q);                         // most significant quadword
    lsq = vec_ld(15, q);                        // least significant quadword
    // The address offset is 15 to account for stores to addresses that are
    // already 16-byte aligned. If 16 were used, then when q is 16-byte
    // aligned we would touch the next 16-byte block, which could lie on a
    // different, inaccessible page.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated"
    edge_align = vec_lvsl(0, q);                // permute map to extract edges
    edges = vec_perm(lsq, msq, edge_align);     // extract the edges
    align = vec_lvsr(0, q);                     // permute map to misalign data
#pragma GCC diagnostic pop
    msq = vec_perm(edges, a.native(), align);   // misalign the data (msq)
    lsq = vec_perm(a.native(), edges, align);   // misalign the data (lsq)
    vec_st(lsq, 15, q);                         // Store the lsq part first
    vec_st(msq, 0, q);                          // Store the msq part
#elif SIMDPP_USE_MSA
    __msa_st_b((v16i8) a.native(), p, 0);
#endif
}

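// On targets without element-typed unaligned stores the overloads below
// forward to the uint8<16> store above; an unaligned store only moves bytes,
// so the element width does not matter.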
static SIMDPP_INL
void i_store_u(char* p, const uint16<8>& a)
{
#if SIMDPP_USE_NEON
    vst1q_u16(reinterpret_cast<uint16_t*>(p), a.native());
#elif SIMDPP_USE_MSA
    __msa_st_h((v8i16) a.native(), p, 0);
#else
    i_store_u(p, uint8<16>(a));
#endif
}

static SIMDPP_INL
void i_store_u(char* p, const uint32<4>& a)
{
#if SIMDPP_USE_NEON
    vst1q_u32(reinterpret_cast<uint32_t*>(p), a.native());
#elif SIMDPP_USE_VSX_206
    vec_vsx_st(a.native(), 0, reinterpret_cast<__vector uint32_t*>(p));
#elif SIMDPP_USE_MSA
    __msa_st_w((v4i32) a.native(), p, 0);
#else
    i_store_u(p, uint8<16>(a));
#endif
}

static SIMDPP_INL
void i_store_u(char* p, const uint64<2>& a)
{
#if SIMDPP_USE_NEON
    vst1q_u64(reinterpret_cast<uint64_t*>(p), a.native());
#elif SIMDPP_USE_VSX_207
    vec_vsx_st((__vector uint32_t) a.native(), 0,
               reinterpret_cast<__vector uint32_t*>(p));
#elif SIMDPP_USE_MSA
    __msa_st_d((v2i64) a.native(), p, 0);
#else
    i_store_u(p, uint8<16>(a));
#endif
}

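// Floating-point overloads use the native FP store where one exists; on plain
// AltiVec the float data is bit_cast to uint32 and reuses the integer path,
// and targets without native FP vectors fall back to the scalar null store.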
static SIMDPP_INL
void i_store_u(char* p, const float32x4& a)
{
    float* q = reinterpret_cast<float*>(p);
    (void) q;
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    detail::null::store(p, a);
#elif SIMDPP_USE_SSE2
    _mm_storeu_ps(q, a.native());
#elif SIMDPP_USE_NEON
    vst1q_f32(q, a.native());
#elif SIMDPP_USE_VSX_206
    vec_vsx_st(a.native(), 0, q);
#elif SIMDPP_USE_ALTIVEC
    uint32x4 b = bit_cast<uint32x4>(a.eval());
    i_store_u(reinterpret_cast<char*>(q), b);
#elif SIMDPP_USE_MSA
    __msa_st_w((v4i32) a.native(), q, 0);
#endif
}

static SIMDPP_INL
void i_store_u(char* p, const float64x2& a)
{
    double* q = reinterpret_cast<double*>(p);
    (void) q;
#if SIMDPP_USE_SSE2
    _mm_storeu_pd(q, a.native());
#elif SIMDPP_USE_NEON64
    vst1q_f64(q, a.native());
#elif SIMDPP_USE_VSX_206
    vec_vsx_st(a.native(), 0, q);
#elif SIMDPP_USE_MSA
    __msa_st_d((v2i64) a.native(), q, 0);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    detail::null::store(p, a);
#endif
}


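// 256-bit and 512-bit overloads are only defined when the corresponding ISA
// provides single-register vectors of that width; otherwise wider vectors are
// split into native-width parts and handled by v_store_u below.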
#if SIMDPP_USE_AVX2
static SIMDPP_INL
void i_store_u(char* p, const uint8<32>& a)
{
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.native());
}

static SIMDPP_INL
void i_store_u(char* p, const uint16<16>& a)
{
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.native());
}

static SIMDPP_INL
void i_store_u(char* p, const uint32<8>& a)
{
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.native());
}

static SIMDPP_INL
void i_store_u(char* p, const uint64<4>& a)
{
    _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.native());
}
#endif

#if SIMDPP_USE_AVX
static SIMDPP_INL
void i_store_u(char* p, const float32x8& a)
{
    _mm256_storeu_ps(reinterpret_cast<float*>(p), a.native());
}

static SIMDPP_INL
void i_store_u(char* p, const float64x4& a)
{
    _mm256_storeu_pd(reinterpret_cast<double*>(p), a.native());
}
#endif

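// With AVX-512F alone, 8- and 16-bit element vectors are not available as
// single 512-bit registers, so uint8<64> and uint16<32> get native overloads
// only when AVX-512BW is enabled.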
#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
void i_store_u(char* p, const uint8<64>& a)
{
    _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), a.native());
}

static SIMDPP_INL
void i_store_u(char* p, const uint16<32>& a)
{
    _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), a.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
void i_store_u(char* p, const uint32<16>& a)
{
    _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), a.native());
}

static SIMDPP_INL
void i_store_u(char* p, const uint64<8>& a)
{
    _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), a.native());
}

static SIMDPP_INL
void i_store_u(char* p, const float32<16>& a)
{
    _mm512_storeu_ps(p, a.native());
}

static SIMDPP_INL
void i_store_u(char* p, const float64<8>& a)
{
    _mm512_storeu_pd(p, a.native());
}
#endif

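// Generic case: a vector wider than the largest native register is stored as
// V::vec_length consecutive native sub-vectors, each length_bytes apart.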
template<class V> SIMDPP_INL
void v_store_u(char* p, const V& a)
{
    const unsigned veclen = V::base_vector_type::length_bytes;

    for (unsigned i = 0; i < V::vec_length; ++i) {
        i_store_u(p, a.vec(i));
        p += veclen;
    }
}

} // namespace insn
} // namespace detail
#ifndef SIMDPP_DOXYGEN
} // namespace SIMDPP_ARCH_NAMESPACE
#endif
} // namespace simdpp

#endif