1/* Copyright (C) 2017 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_DETAIL_FOR_EACH_H
9#define LIBSIMDPP_DETAIL_FOR_EACH_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
#include <simdpp/types.h>
#include <simdpp/core/extract.h>
#include <simdpp/detail/mem_block.h>
#include <functional>
#include <type_traits>
19
20namespace simdpp {
21namespace SIMDPP_ARCH_NAMESPACE {
22namespace detail {
23
24#if SIMDPP_USE_SSE2 || SIMDPP_USE_NEON || SIMDPP_USE_MSA
25template<class V, class F> SIMDPP_INL
26void foreach_impl(std::integral_constant<unsigned, 2>,
27 const V& v, F function)
28{
29 function(extract<0>(v));
30 function(extract<1>(v));
31}
32
33template<class V, class F> SIMDPP_INL
34void foreach_impl(std::integral_constant<unsigned, 4>,
35 const V& v, F function)
36{
37 function(extract<0>(v));
38 function(extract<1>(v));
39 function(extract<2>(v));
40 function(extract<3>(v));
41}
42
43template<unsigned N, class V, class F> SIMDPP_INL
44void foreach_impl(std::integral_constant<unsigned, N>,
45 const V& v, F function)
46{
47 // When we're operating on more than 4-5 elements it makes sense to move
48 // the vector to memory and load data from there. This has higher latency,
49 // but this is masked by extracting the first several elements directly
50 // from the SIMD register set. For the rest of elements it's very likely
51 // that loading through memory has higher throughput.
52 //
53 // Recent x86 (since Sandy Bridge) and NEON (since Cortex A73) processors
54 // are able to sustain more than one load memory access per cycle.
55 // All x86 processors (at least up to Skylake, newer not checked) are only
56 // able to sustain single cross domain data access instruction per cycle.
57
58 // TODO: needs tuning on ARM and MIPS
59 function(extract<0>(v));
60 function(extract<1>(v));
61 mem_block<V> mem(v);
62 for (unsigned i = 2; i < N; ++i)
63 function(mem[i]);
64}
65#else
66template<unsigned N, class V, class F> SIMDPP_INL
67void foreach_impl(std::integral_constant<unsigned, N>,
68 const V& v, F function)
69{
70 mem_block<V> mem(v);
71 for (unsigned i = 0; i < N; ++i)
72 function(mem[i]);
73}
74#endif
75
76template<unsigned N, class V, class F> SIMDPP_INL
77void for_each(const any_vec<N, V>& v, F function)
78{
79 using size_tag = std::integral_constant<unsigned, V::base_vector_type::length>;
80 for (unsigned i = 0; i < V::vec_length; ++i)
81 foreach_impl(size_tag(), v.wrapped().vec(i), function);
82}
83
84
85} // namespace detail
86} // namespace SIMDPP_ARCH_NAMESPACE
87} // namespace simdpp
88
89#endif
90