/***************************************************************************
 * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
 * Martin Renou                                                             *
 * Copyright (c) QuantStack                                                 *
 * Copyright (c) Serge Guelton                                              *
 *                                                                          *
 * Distributed under the terms of the BSD 3-Clause License.                 *
 *                                                                          *
 * The full license is in the file LICENSE, distributed with this software. *
 ****************************************************************************/

#ifndef XSIMD_GENERIC_MEMORY_HPP
#define XSIMD_GENERIC_MEMORY_HPP

#include <algorithm>
#include <cassert>
#include <complex>
#include <stdexcept>

#include "../../types/xsimd_batch_constant.hpp"
#include "./xsimd_generic_details.hpp"

namespace xsimd
{
    template <class batch_type, typename batch_type::value_type... Values>
    struct batch_constant;

    namespace kernel
    {

        using namespace types;

        // extract_pair
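        // Concatenates the elements of `other` and `self` and returns the window
        // of batch<T, A>::size elements that starts at lane `i` of `other`.
        // For instance, with a batch size of 4 and i == 1:
        //   self = {s0, s1, s2, s3}, other = {o0, o1, o2, o3}
        //   extract_pair(self, other, 1) == {o1, o2, o3, s0}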
        template <class A, class T>
        inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<generic>) noexcept
        {
            constexpr std::size_t size = batch<T, A>::size;
            assert(i < size && "index in bounds");

            alignas(A::alignment()) T self_buffer[size];
            self.store_aligned(self_buffer);

            alignas(A::alignment()) T other_buffer[size];
            other.store_aligned(other_buffer);

            alignas(A::alignment()) T concat_buffer[size];

            for (std::size_t j = 0; j < (size - i); ++j)
            {
                concat_buffer[j] = other_buffer[i + j];
                if (j < i)
                {
                    concat_buffer[size - 1 - j] = self_buffer[i - 1 - j];
                }
            }
            return batch<T, A>::load_aligned(concat_buffer);
        }

        // gather
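        // The generic gather reads one scalar per lane through the index batch,
        // i.e. it computes dst[i] = src[index.get(i)] for every lane i. The
        // detail:: helpers below unroll this at compile time by recursing on
        // ::xsimd::index<N> and filling one lane per step with insert().
        // For example, with src = {10, 20, 30, 40, ...} and index = {3, 0, 0, 2},
        // the result is {src[3], src[0], src[0], src[2]} = {40, 10, 10, 30}.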
        namespace detail
        {
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            inline batch<T, A> gather(U const* src, batch<V, A> const& index,
                                      ::xsimd::index<N> I) noexcept
            {
                return insert(batch<T, A> {}, static_cast<T>(src[index.get(I)]), I);
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            inline batch<T, A>
            gather(U const* src, batch<V, A> const& index, ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                const auto test = gather<N - 1, T, A>(src, index, {});
                return insert(test, static_cast<T>(src[index.get(I)]), I);
            }
        } // namespace detail

        template <typename T, typename A, typename V>
        inline batch<T, A>
        gather(batch<T, A> const&, T const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and mismatched strides.
        template <typename T, typename A, typename U, typename V>
        inline detail::sizes_mismatch_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
        }

        // Gather with runtime indexes and matching strides.
        template <typename T, typename A, typename U, typename V>
        inline detail::stride_match_t<T, U, batch<T, A>>
        gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
               kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Index and destination sizes must match");

            return batch_cast<T>(kernel::gather(batch<U, A> {}, src, index, A {}));
        }

        // insert
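        // Replaces lane I of `self` with `val`. The generic version builds a
        // compile-time mask that is false only at lane I and blends `self` with
        // a broadcast of `val` through select().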
        template <class A, class T, size_t I>
        inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept
        {
            struct index_mask
            {
                static constexpr bool get(size_t index, size_t /*size*/)
                {
                    return index != I;
                }
            };
            batch<T, A> tmp(val);
            return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp);
        }

        // get
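        // Extracts a single lane from a batch. The generic fallback spills the
        // register to an aligned stack buffer and reads the requested element;
        // the lane is given either as a compile-time ::xsimd::index<I> or as a
        // runtime std::size_t.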
        template <class A, size_t I, class T>
        inline T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        inline T get(batch_bool<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, size_t I, class T>
        inline auto get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            using T2 = typename batch<std::complex<T>, A>::value_type;
            alignas(A::alignment()) T2 buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[I];
        }

        template <class A, class T>
        inline T get(batch<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) T buffer[batch<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        inline T get(batch_bool<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
        {
            alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        template <class A, class T>
        inline auto get(batch<std::complex<T>, A> const& self, std::size_t i, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
        {
            using T2 = typename batch<std::complex<T>, A>::value_type;
            alignas(A::alignment()) T2 buffer[batch<std::complex<T>, A>::size];
            self.store_aligned(&buffer[0]);
            return buffer[i];
        }

        // load_aligned
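        // Converting loads: reads batch<T_out, A>::size elements of type T_in
        // from memory and returns them as a batch<T_out, A>. The detail::
        // overloads are selected by conversion_type: with_fast_conversion goes
        // through fast_cast() on a loaded batch, with_slow_conversion converts
        // element by element through an aligned stack buffer.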
        namespace detail
        {
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {});
            }
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                using batch_type_out = batch<T_out, A>;
                alignas(A::alignment()) T_out buffer[batch_type_out::size];
                std::copy(mem, mem + batch_type_out::size, std::begin(buffer));
                return batch_type_out::load_aligned(buffer);
            }
        }
        template <class A, class T_in, class T_out>
        inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_aligned<A>(mem, cvt, A {}, detail::conversion_type<A, T_in, T_out> {});
        }

        // load_unaligned
        namespace detail
        {
            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
            {
                using batch_type_in = batch<T_in, A>;
                using batch_type_out = batch<T_out, A>;
                return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {});
            }

            template <class A, class T_in, class T_out>
            inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>, with_slow_conversion) noexcept
            {
                static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
                return load_aligned<A>(mem, cvt, generic {}, with_slow_conversion {});
            }
        }
        template <class A, class T_in, class T_out>
        inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
        {
            return detail::load_unaligned<A>(mem, cvt, generic {}, detail::conversion_type<A, T_in, T_out> {});
        }

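        // scatter
        // The generic scatter is the inverse of gather: it writes every lane of
        // `src` to memory through the index batch, dst[index.get(i)] = src.get(i).
        // The detail:: helpers unroll the stores lane by lane at compile time.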
        namespace detail
        {
            // Scatter with runtime indexes.
            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
            inline void scatter(batch<T, A> const& src, U* dst,
                                batch<V, A> const& index,
                                ::xsimd::index<N> I) noexcept
            {
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }

            template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
            inline void
            scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index,
                    ::xsimd::index<N> I) noexcept
            {
                static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");

                kernel::detail::scatter<N - 1, T, A, U, V>(
                    src, dst, index, {});
                dst[index.get(I)] = static_cast<U>(src.get(I));
            }
        } // namespace detail

        template <typename A, typename T, typename V>
        inline void
        scatter(batch<T, A> const& src, T* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, T, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        inline detail::sizes_mismatch_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            kernel::detail::scatter<batch<V, A>::size - 1, T, A, U, V>(
                src, dst, index, {});
        }

        template <typename A, typename T, typename U, typename V>
        inline detail::stride_match_t<T, U, void>
        scatter(batch<T, A> const& src, U* dst,
                batch<V, A> const& index,
                kernel::requires_arch<generic>) noexcept
        {
            static_assert(batch<T, A>::size == batch<V, A>::size,
                          "Source and index sizes must match");
            const auto tmp = batch_cast<U>(src);
            kernel::scatter<A>(tmp, dst, index, A {});
        }

        // store
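        // Stores a batch_bool as an array of bool: the mask is converted to a
        // batch<T, A>, spilled to an aligned buffer, and each element is then
        // narrowed to bool.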
        template <class T, class A>
        inline void store(batch_bool<T, A> const& self, bool* mem, requires_arch<generic>) noexcept
        {
            using batch_type = batch<T, A>;
            constexpr auto size = batch_bool<T, A>::size;
            alignas(A::alignment()) T buffer[size];
            kernel::store_aligned<A>(&buffer[0], batch_type(self), A {});
            for (std::size_t i = 0; i < size; ++i)
                mem[i] = bool(buffer[i]);
        }

        // store_aligned
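        // Converting stores: writes a batch<T_in, A> to memory of type T_out. The
        // batch is first stored to an aligned T_in buffer, then copied element by
        // element with the implicit T_in -> T_out conversion.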
        template <class A, class T_in, class T_out>
        inline void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            alignas(A::alignment()) T_in buffer[batch<T_in, A>::size];
            store_aligned(&buffer[0], self);
            std::copy(std::begin(buffer), std::end(buffer), mem);
        }

        // store_unaligned
        template <class A, class T_in, class T_out>
        inline void store_unaligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
        {
            static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
            return store_aligned<A>(mem, self, generic {});
        }

        // swizzle
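        // Swizzle of a complex batch: the same compile-time index mask is applied
        // independently to the real and imaginary parts, and the two shuffled
        // halves are recombined into a new complex batch.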
        template <class A, class T, class ITy, ITy... Vs>
        inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
        {
            return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
        }

        namespace detail
        {
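            // Fallback declarations for the complex (de)interleaving primitives.
            // Each architecture is expected to provide its own overloads; the
            // static_asserts below fire at compile time if a generic version is
            // ever instantiated.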
            template <class A, class T>
            inline batch<std::complex<T>, A> load_complex(batch<T, A> const& /*hi*/, batch<T, A> const& /*lo*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "load_complex not implemented for the required architecture");
            }

            template <class A, class T>
            inline batch<T, A> complex_high(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_high not implemented for the required architecture");
            }

            template <class A, class T>
            inline batch<T, A> complex_low(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
            {
                static_assert(std::is_same<T, void>::value, "complex_low not implemented for the required architecture");
            }
        }

        // load_complex_aligned
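        // Loads a batch of std::complex from memory, where real and imaginary
        // parts are interleaved as (re0, im0, re1, im1, ...). The two contiguous
        // halves of that storage are loaded as real batches `hi` and `lo`, and
        // detail::load_complex (provided per architecture) de-interleaves them.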
        template <class A, class T_out, class T_in>
        inline batch<std::complex<T_out>, A> load_complex_aligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_aligned(buffer),
                       lo = real_batch::load_aligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }

        // load_complex_unaligned
        template <class A, class T_out, class T_in>
        inline batch<std::complex<T_out>, A> load_complex_unaligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_out, A>;
            T_in const* buffer = reinterpret_cast<T_in const*>(mem);
            real_batch hi = real_batch::load_unaligned(buffer),
                       lo = real_batch::load_unaligned(buffer + real_batch::size);
            return detail::load_complex(hi, lo, A {});
        }

        // store_complex_aligned
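        // Stores a batch of std::complex back to interleaved (re, im) memory:
        // detail::complex_high / complex_low (provided per architecture) split the
        // batch into the two halves of the interleaved layout, which are then
        // written as plain real batches.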
        template <class A, class T_out, class T_in>
        inline void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_aligned(buffer);
            hi.store_aligned(buffer + real_batch::size);
        }

        // store_complex_unaligned
        template <class A, class T_out, class T_in>
        inline void store_complex_unaligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
        {
            using real_batch = batch<T_in, A>;
            real_batch hi = detail::complex_high(src, A {});
            real_batch lo = detail::complex_low(src, A {});
            T_out* buffer = reinterpret_cast<T_out*>(dst);
            lo.store_unaligned(buffer);
            hi.store_unaligned(buffer + real_batch::size);
        }

    }

}

#endif