/****************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_MEMORY_HPP
#define XSIMD_GENERIC_MEMORY_HPP

#include <algorithm>
#include <cassert>
#include <complex>
#include <stdexcept>

#include "../../types/xsimd_batch_constant.hpp"
#include "./xsimd_generic_details.hpp"

namespace xsimd
{
    template <class batch_type, typename batch_type::value_type... Values>
    struct batch_constant;

    namespace kernel
    {

        using namespace types;

        // extract_pair
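        // Builds the concatenation (self:other), with other in the low half and
        // self in the high half, shifted right by i elements: the result holds
        // other[i .. size-1] followed by self[0 .. i-1].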
        template <class A, class T>
        inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<generic>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            assert(i < size && "index in bounds");

            alignas(A::alignment()) T self_buffer[size];
            self.store_aligned(self_buffer);

            alignas(A::alignment()) T other_buffer[size];
            other.store_aligned(other_buffer);

            alignas(A::alignment()) T concat_buffer[size];

            // Low part comes from other, high part from self; the two loops
            // together write every element of concat_buffer.
            for (std::size_t j = 0; j < size - i; ++j)
            {
                concat_buffer[j] = other_buffer[i + j];
            }
            for (std::size_t j = 0; j < i; ++j)
            {
                concat_buffer[size - 1 - j] = self_buffer[i - 1 - j];
            }
            return batch<T, A>::load_aligned(concat_buffer);
        }

        // gather
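        // Generic gather: the detail helpers below recurse over the element
        // index I, reading src[index.get(I)] and placing it into slot I of the
        // result with insert.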
        namespace detail
        {
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            inline batch<T, A> gather(U const* src, batch<V, A> const& index,
                                      ::xsimd::index<N> I) noexcept
            {
                return insert(batch<T, A> {}, static_cast<T>(src[index.get(I)]), I);
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            inline batch<T, A>
            gather(U const* src, batch<V, A> const& index, ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                const auto test = gather<N - 1, T, A>(src, index, {});
                return insert(test, static_cast<T>(src[index.get(I)]), I);
            }
        } // namespace detail

        template <typename T, typename A, typename V>
        inline batch<T, A>
        gather(batch<T, A> const&, T const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and mismatched strides.
        template <typename T, typename A, typename U, typename V>
        inline detail::sizes_mismatch_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and matching strides.
        template <typename T, typename A, typename U, typename V>
        inline detail::stride_match_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return batch_cast<T>(kernel::gather(batch<U, A> {}, src, index, A {}));
        }

        // insert
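        // Sets element I of self to val by blending self with a broadcast of
        // val through a compile-time batch_bool mask that is false only at I.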
        template <class A, class T, size_t I>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept
        {
            struct index_mask
            {
                static constexpr bool get(size_t index, size_t /*size*/)
                {
                    return index != I;
                }
            };
            batch<T, A> tmp(val);
            return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp);
        }

        // get
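        // Spills the batch to an aligned stack buffer and reads back the
        // requested lane; overloads cover value, boolean and complex batches
        // with either a compile-time or a runtime index.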
        template <class A, size_t I, class T>
        inline T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        inline T get(batch_bool<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        inline auto get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            alignas(A::alignment()) T buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, class T>
        inline T get(batch<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        inline T get(batch_bool<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        inline auto get(batch<std::complex<T>, A> const& self, std::size_t i, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            using T2 = typename batch<std::complex<T>, A>::value_type;
            alignas(A::alignment()) T2 buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        // load_aligned
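        // Converting loads: the fast path loads a batch of T_in and converts it
        // in registers via fast_cast; the slow path converts element by element
        // through an aligned scalar buffer before loading.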
        namespace detail
        {
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {});
            }
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                using batch_type_out = batch<T_out, A>;
                alignas(A::alignment()) T_out buffer[batch_type_out::size];
                std::copy(mem, mem + batch_type_out::size, std::begin(buffer));
                return batch_type_out::load_aligned(buffer);
            }
        }
        template <class A, class T_in, class T_out>
        inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_aligned<A>(mem, cvt, A {}, detail::conversion_type<A, T_in, T_out> {});
        }

        // load_unaligned
        namespace detail
        {
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {});
            }

            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                return load_aligned<A>(mem, cvt, generic {}, with_slow_conversion {});
            }
        }
        template <class A, class T_in, class T_out>
        inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_unaligned<A>(mem, cvt, generic {}, detail::conversion_type<A, T_in, T_out> {});
        }

        namespace detail
        {
            // Scatter with runtime indexes.
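            // The helpers below recurse over the element index I and write
            // src.get(I) to dst[index.get(I)], converting to U on the way.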
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            inline void scatter(batch<T, A> const& src, U* dst,
                                batch<V, A> const& index,
                                ::xsimd::index<N> I) noexcept
            {
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            inline void
            scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index,
                    ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                kernel::detail::scatter<N - 1, T, A, U, V>(
                    src, dst, index, {});
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }
        } // namespace detail

        template <typename A, typename T, typename V>
        inline void
        scatter(batch<T, A> const& src, T* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, T, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        inline detail::sizes_mismatch_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, U, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        inline detail::stride_match_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            const auto tmp = batch_cast<U>(src);
            kernel::scatter<A>(tmp, dst, index, A {});
        }

        // store
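        // Stores a batch_bool as an array of bool by materialising it as a
        // batch of T first, then narrowing each element to bool.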
        template <class T, class A>
        inline void store(batch_bool<T, A> const& self, bool* mem, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            constexpr auto size = batch_bool<T, A>::size;
            alignas(A::alignment()) T buffer[size];
            kernel::store_aligned<A>(&buffer[0], batch_type(self), A {});
            for (std::size_t i = 0; i < size; ++i)
                mem[i] = bool(buffer[i]);
        }

        // store_aligned
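        // Converting stores: spill to an aligned buffer of T_in, then copy
        // element by element into the destination type.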
        template <class A, class T_in, class T_out>
        inline void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            alignas(A::alignment()) T_in buffer[batch<T_in, A>::size];
            store_aligned(&buffer[0], self);
            std::copy(std::begin(buffer), std::end(buffer), mem);
        }

        // store_unaligned
        template <class A, class T_in, class T_out>
        inline void store_unaligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            return store_aligned<A>(mem, self, generic {});
        }

        // swizzle
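        // Swizzle of a complex batch applies the same index mask to the real
        // and imaginary parts independently.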
        template <class A, class T, class ITy, ITy... Vs>
        inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
        {
            return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
        }

        namespace detail
        {
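            // Fallback stubs: architectures are expected to provide their own
            // load_complex / complex_high / complex_low; the static_asserts
            // below fire if one of these stubs is ever instantiated.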
            template <class A, class T>
            inline batch<std::complex<T>, A> load_complex(batch<T, A> const& /*hi*/, batch<T, A> const& /*lo*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "load_complex not implemented for the required architecture");
            }

            template <class A, class T>
            inline batch<T, A> complex_high(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_high not implemented for the required architecture");
            }

            template <class A, class T>
            inline batch<T, A> complex_low(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_low not implemented for the required architecture");
            }
        }

        // load_complex_aligned
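        // Loads interleaved (real, imag) pairs as two consecutive real batches
        // and lets the architecture-specific load_complex deinterleave them.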
        template <class A, class T_out, class T_in>
        inline batch<std::complex<T_out>, A> load_complex_aligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_aligned(buffer),
                       lo = real_batch::load_aligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }

        // load_complex_unaligned
        template <class A, class T_out, class T_in>
        inline batch<std::complex<T_out>, A> load_complex_unaligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_unaligned(buffer),
                       lo = real_batch::load_unaligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }

        // store_complex_aligned
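        // Stores a complex batch back as interleaved (real, imag) pairs: the
        // architecture-specific complex_high / complex_low produce the two
        // interleaved halves, written low half first.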
        template <class A, class T_out, class T_in>
        inline void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_aligned(buffer);
            hi.store_aligned(buffer + real_batch::size);
        }

        // store_complex_unaligned
        template <class A, class T_out, class T_in>
        inline void store_complex_unaligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_unaligned(buffer);
            hi.store_unaligned(buffer + real_batch::size);
        }

    }

}

#endif