1/* Copyright (C) 2015
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_U_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_STORE_U_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/detail/null/memory.h>
17#include <simdpp/detail/align.h>
18#include <simdpp/core/cast.h>
19
20namespace simdpp {
21#ifndef SIMDPP_DOXYGEN
22namespace SIMDPP_ARCH_NAMESPACE {
23#endif
24namespace detail {
25namespace insn {
26
static SIMDPP_INL
void i_store_u(char* p, const uint8<16>& a)
{
    // Stores 16 bytes of @a a to @a p. @a p does not need to be aligned.
#if SIMDPP_USE_NULL
    detail::null::store(p, a);
#elif SIMDPP_USE_SSE2
    _mm_storeu_si128(reinterpret_cast<__m128i*>(p), a.native());
#elif SIMDPP_USE_NEON
    vst1q_u8(reinterpret_cast<uint8_t*>(p), a.native());
#elif SIMDPP_USE_VSX_206
    // VSX 2.06 has a native unaligned store
    uint8_t* q = reinterpret_cast<uint8_t*>(p);
    vec_vsx_st(a.native(), 0, q);
#elif SIMDPP_USE_ALTIVEC
    // Classic AltiVec only has aligned stores, so an unaligned store is
    // emulated by loading the two aligned quadwords that overlap the
    // destination, merging in the new data and storing both back.
    // From https://web.archive.org/web/20110305043420/http://developer.apple.com/hardwaredrivers/ve/alignment.html
    uint8_t* q = reinterpret_cast<uint8_t*>(p);
    __vector uint8_t msq, lsq, edges;
    __vector uint8_t edge_align, align;
    msq = vec_ld(0, q);                         // most significant quadword
    lsq = vec_ld(15, q);                        // least significant quadword
    // The address offset is 15 to take into account storing into aligned
    // addresses. If 16 were used, then when q is 16-byte aligned we would
    // access the next second 16-byte block, which could be on different page
    // and inaccessible.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wdeprecated"
    // vec_lvsl/vec_lvsr are deprecated on newer compilers, but they are the
    // only way to build the permute maps on pre-VSX AltiVec.
    edge_align = vec_lvsl(0, q);                // permute map to extract edges
    edges = vec_perm(lsq, msq, edge_align);     // extract the edges
    align = vec_lvsr(0, q);                     // permute map to misalign data
#pragma GCC diagnostic pop
    msq = vec_perm(edges, a.native(), align);   // misalign the data (msq)
    lsq = vec_perm(a.native(), edges, align);   // misalign the data (lsq)
    vec_st(lsq, 15, q);                         // Store the lsq part first
    vec_st(msq, 0, q);                          // Store the msq part
#elif SIMDPP_USE_MSA
    __msa_st_b((v16i8) a.native(), p, 0);
#endif
}
64
static SIMDPP_INL
void i_store_u(char* p, const uint16<8>& a)
{
    // Stores 8x 16-bit elements to @a p; @a p does not need to be aligned.
#if SIMDPP_USE_NEON
    vst1q_u16(reinterpret_cast<uint16_t*>(p), a.native());
#elif SIMDPP_USE_MSA
    __msa_st_h((v8i16) a.native(), p, 0);
#else
    // On the remaining targets a byte-wise unaligned store is equivalent;
    // forward to the uint8<16> overload.
    i_store_u(p, uint8<16>(a));
#endif
}
76
static SIMDPP_INL
void i_store_u(char* p, const uint32<4>& a)
{
    // Stores 4x 32-bit elements to @a p; @a p does not need to be aligned.
#if SIMDPP_USE_NEON
    vst1q_u32(reinterpret_cast<uint32_t*>(p), a.native());
#elif SIMDPP_USE_VSX_206
    // VSX 2.06 has a native unaligned store
    vec_vsx_st(a.native(), 0, reinterpret_cast<__vector uint32_t*>(p));
#elif SIMDPP_USE_MSA
    __msa_st_w((v4i32) a.native(), p, 0);
#else
    // Byte-wise unaligned store is equivalent on the remaining targets.
    i_store_u(p, uint8<16>(a));
#endif
}
90
static SIMDPP_INL
void i_store_u(char* p, const uint64<2>& a)
{
    // Stores 2x 64-bit elements to @a p; @a p does not need to be aligned.
#if SIMDPP_USE_NEON
    vst1q_u64(reinterpret_cast<uint64_t*>(p), a.native());
#elif SIMDPP_USE_VSX_207
    // Note: gated on VSX 2.07 (not 2.06) — the store is issued through the
    // 32-bit element type, reinterpreting the 64-bit vector.
    vec_vsx_st((__vector uint32_t) a.native(), 0,
               reinterpret_cast<__vector uint32_t*>(p));
#elif SIMDPP_USE_MSA
    __msa_st_d((v2i64) a.native(), p, 0);
#else
    // Byte-wise unaligned store is equivalent on the remaining targets.
    i_store_u(p, uint8<16>(a));
#endif
}
105
static SIMDPP_INL
void i_store_u(char* p, const float32x4& a)
{
    // Stores 4x float elements to @a p; @a p does not need to be aligned.
    float* q = reinterpret_cast<float*>(p);
    (void) q; // q is unused on branches that store via p directly
#if SIMDPP_USE_NULL || SIMDPP_USE_NEON_NO_FLT_SP
    // Scalar fallback when no native single-precision support is used
    detail::null::store(p, a);
#elif SIMDPP_USE_SSE2
    _mm_storeu_ps(q, a.native());
#elif SIMDPP_USE_NEON
    vst1q_f32(q, a.native());
#elif SIMDPP_USE_VSX_206
    // VSX 2.06 has a native unaligned store
    vec_vsx_st(a.native(), 0, q);
#elif SIMDPP_USE_ALTIVEC
    // Pre-VSX AltiVec: reuse the emulated unaligned integer store by
    // bit-casting to uint32x4.
    uint32x4 b = bit_cast<uint32x4>(a.eval());
    i_store_u(reinterpret_cast<char*>(q), b);
#elif SIMDPP_USE_MSA
    __msa_st_w((v4i32) a.native(), q, 0);
#endif
}
126
static SIMDPP_INL
void i_store_u(char* p, const float64x2& a)
{
    // Stores 2x double elements to @a p; @a p does not need to be aligned.
    double* q = reinterpret_cast<double*>(p);
    (void) q; // q is unused on branches that store via p directly
#if SIMDPP_USE_SSE2
    _mm_storeu_pd(q, a.native());
#elif SIMDPP_USE_NEON64
    // 64-bit NEON only; 32-bit NEON falls through to the scalar path below
    vst1q_f64(q, a.native());
#elif SIMDPP_USE_VSX_206
    vec_vsx_st(a.native(), 0, q);
#elif SIMDPP_USE_MSA
    __msa_st_d((v2i64) a.native(), q, 0);
#elif SIMDPP_USE_NULL || SIMDPP_USE_NEON || SIMDPP_USE_ALTIVEC
    // Scalar fallback for targets without a native f64 vector store here
    detail::null::store(p, a);
#endif
}
144
145
146#if SIMDPP_USE_AVX2
147static SIMDPP_INL
148void i_store_u(char* p, const uint8<32>& a)
149{
150 _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.native());
151}
152
153static SIMDPP_INL
154void i_store_u(char* p, const uint16<16>& a)
155{
156 _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.native());
157}
158
159static SIMDPP_INL
160void i_store_u(char* p, const uint32<8>& a)
161{
162 _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.native());
163}
164
165static SIMDPP_INL
166void i_store_u(char* p, const uint64<4>& a)
167{
168 _mm256_storeu_si256(reinterpret_cast<__m256i*>(p), a.native());
169}
170#endif
171
172#if SIMDPP_USE_AVX
173static SIMDPP_INL
174void i_store_u(char* p, const float32x8& a)
175{
176 _mm256_storeu_ps(reinterpret_cast<float*>(p), a.native());
177}
178
179static SIMDPP_INL
180void i_store_u(char* p, const float64x4& a)
181{
182 _mm256_storeu_pd(reinterpret_cast<double*>(p), a.native());
183}
184#endif
185
186#if SIMDPP_USE_AVX512BW
187SIMDPP_INL void i_store_u(char* p, const uint8<64>& a)
188{
189 _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), a.native());
190}
191
192SIMDPP_INL void i_store_u(char* p, const uint16<32>& a)
193{
194 _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), a.native());
195}
196#endif
197
198#if SIMDPP_USE_AVX512F
199static SIMDPP_INL
200void i_store_u(char* p, const uint32<16>& a)
201{
202 _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), a.native());
203}
204
205static SIMDPP_INL
206void i_store_u(char* p, const uint64<8>& a)
207{
208 _mm512_storeu_si512(reinterpret_cast<__m512i*>(p), a.native());
209}
210
211static SIMDPP_INL
212void i_store_u(char* p, const float32<16>& a)
213{
214 _mm512_storeu_ps(p, a.native());
215}
216
217static SIMDPP_INL
218void i_store_u(char* p, const float64<8>& a)
219{
220 _mm512_storeu_pd(p, a.native());
221}
222#endif
223
224template<class V> SIMDPP_INL
225void v_store_u(char* p, const V& a)
226{
227 const unsigned veclen = V::base_vector_type::length_bytes;
228
229 for (unsigned i = 0; i < V::vec_length; ++i) {
230 i_store_u(p, a.vec(i));
231 p += veclen;
232 }
233}
234
235} // namespace insn
236} // namespace detail
237#ifndef SIMDPP_DOXYGEN
238} // namespace SIMDPP_ARCH_NAMESPACE
239#endif
240} // namespace simdpp
241
242#endif
243
244