1 | /* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_CORE_PERMUTE4_H |
9 | #define LIBSIMDPP_SIMDPP_CORE_PERMUTE4_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/detail/insn/permute4.h> |
17 | |
18 | namespace simdpp { |
19 | namespace SIMDPP_ARCH_NAMESPACE { |
20 | |
21 | /** Permutes the 16-bit values within each 4 consecutive values of the vector. |
22 | The selector values must be in range [0; 3]. |
23 | |
24 | @code |
25 | r0 = a[s0] |
26 | ... |
27 | r3 = a[s3] |
28 | r4 = a[s0+4] |
29 | ... |
30 | r7 = a[s3+4] |
31 | |
32 | 256-bit version: |
33 | |
34 | r8 = a[s0+8] |
35 | ... |
36 | r11 = a[s3+8] |
37 | r12 = a[s0+12] |
38 | ... |
39 | r15 = a[s3+12] |
40 | @endcode |
41 | |
42 | @par: 128-bit version: |
43 | @icost{SSE2-AVX2, 2} |
44 | @icost{NEON, 1-5} |
45 | @icost{ALTIVEC, 1-2} |
46 | |
47 | @par: 256-bit version: |
48 | @icost{SSE2-AVX, 4} |
49 | @icost{AVX2, 2} |
50 | @icost{NEON, 2-10} |
51 | @icost{ALTIVEC, 2-3} |
52 | */ |
53 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, |
54 | unsigned N, class V> SIMDPP_INL |
55 | typename detail::get_expr_nomask<V>::empty |
56 | permute4(const any_vec16<N,V>& a) |
57 | { |
58 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
59 | typename detail::get_expr_nomask<V>::type ra; |
60 | ra = a.wrapped().eval(); |
61 | return detail::insn::i_permute4<s0,s1,s2,s3>(ra); |
62 | } |
63 | |
64 | /** Permutes the values of each set of four consecutive 32-bit values. The |
65 | selector values must be in range [0; 3]. |
66 | |
67 | @code |
68 | r0 = a[s0] |
69 | ... |
70 | r3 = a[s3] |
71 | |
72 | 256-bit version: |
73 | r4 = a[s0+4] |
74 | ... |
75 | r7 = a[s3+4] |
76 | @endcode |
77 | |
78 | @par integer |
79 | @par 128-bit version: |
80 | @icost{NEON, 1-4} |
81 | @icost{ALTIVEC, 1-2} |
82 | |
83 | @par 256-bit version: |
84 | @icost{SSE2-AVX, 2} |
85 | @icost{NEON, 2-8} |
86 | @icost{ALTIVEC, 2-3} |
87 | |
88 | @par floating-point |
89 | @par 128-bit version: |
90 | @icost{NEON, 1-4} |
91 | @icost{ALTIVEC, 1-2} |
92 | |
93 | @par 256-bit version: |
94 | @icost{SSE2-SSE4.1, 2} |
95 | @icost{NEON, 2-8} |
96 | @icost{ALTIVEC, 2-3} |
97 | */ |
98 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, |
99 | unsigned N, class V> SIMDPP_INL |
100 | typename detail::get_expr_nomask<V>::empty |
101 | permute4(const any_vec32<N,V>& a) |
102 | { |
103 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
104 | typename detail::get_expr_nomask<V>::type ra; |
105 | ra = a.wrapped().eval(); |
106 | return detail::insn::i_permute4<s0,s1,s2,s3>(ra); |
107 | } |
108 | |
109 | /** Permutes the values of each set of four consecutive 64-bit values. The |
110 | selector values must be in range [0; 3]. |
111 | |
112 | @code |
113 | r0 = a[s0] |
114 | r1 = a[s1] |
115 | r2 = a[s2] |
116 | r3 = a[s3] |
117 | @endcode |
118 | |
119 | @par integer |
120 | @icost{SSE2-AVX, 2} |
121 | |
122 | @par floating-point |
123 | @icost{SSE2-AVX, 1-2} |
124 | @icost{NEON, 1-4} |
125 | @icost{ALTIVEC, 1-4} |
126 | */ |
127 | template<unsigned s0, unsigned s1, unsigned s2, unsigned s3, |
128 | unsigned N, class V> SIMDPP_INL |
129 | typename detail::get_expr_nomask<V>::empty |
130 | permute4(const any_vec64<N,V>& a) |
131 | { |
132 | static_assert(s0 < 4 && s1 < 4 && s2 < 4 && s3 < 4, "Selector out of range" ); |
133 | typename detail::get_expr_nomask<V>::type ra; |
134 | ra = a.wrapped().eval(); |
135 | return detail::insn::i_permute4<s0,s1,s2,s3>(ra); |
136 | } |
137 | |
138 | } // namespace SIMDPP_ARCH_NAMESPACE |
139 | } // namespace simdpp |
140 | |
141 | #endif |
142 | |
143 | |