1 | /* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt> |
2 | |
3 | Distributed under the Boost Software License, Version 1.0. |
4 | (See accompanying file LICENSE_1_0.txt or copy at |
5 | http://www.boost.org/LICENSE_1_0.txt) |
6 | */ |
7 | |
8 | #ifndef LIBSIMDPP_SIMDPP_CORE_PERMUTE2_H |
9 | #define LIBSIMDPP_SIMDPP_CORE_PERMUTE2_H |
10 | |
11 | #ifndef LIBSIMDPP_SIMD_H |
12 | #error "This file must be included through simd.h" |
13 | #endif |
14 | |
15 | #include <simdpp/types.h> |
16 | #include <simdpp/detail/insn/permute2.h> |
17 | #include <simdpp/detail/get_expr.h> |
18 | |
19 | namespace simdpp { |
20 | namespace SIMDPP_ARCH_NAMESPACE { |
21 | |
22 | /** Permutes the 16-bit values within sets of two consecutive elements of the |
23 | vector. The selector values must be in range [0; 1]. |
24 | |
25 | @code |
26 | r0 = a[s0] |
27 | r1 = a[s1] |
28 | r2 = a[s0+2] |
29 | r3 = a[s1+2] |
30 | r4 = a[s0+4] |
31 | r5 = a[s1+4] |
32 | ... |
33 | @endcode |
34 | |
    @par 128-bit version:
36 | @icost{SSE2-AVX2, 2} |
37 | @icost{NEON, ALTIVEC, 1-2} |
38 | |
    @par 256-bit version:
40 | @icost{SSE2-AVX, 4} |
41 | @icost{AVX2, 2} |
42 | @icost{NEON, 2-4} |
43 | @icost{ALTIVEC, 2-3} |
44 | */ |
45 | template<unsigned s0, unsigned s1, unsigned N, class V> SIMDPP_INL |
46 | typename detail::get_expr_nomask<V>::empty |
47 | permute2(const any_vec16<N,V>& a) |
48 | { |
49 | static_assert(s0 < 2 && s1 < 2, "Selector out of range" ); |
50 | typename detail::get_expr_nomask<V>::type ra; |
51 | ra = a.wrapped().eval(); |
52 | return detail::insn::i_permute2<s0,s1>(ra); |
53 | } |
54 | |
/** Permutes the values of each set of two consecutive 32-bit values. The
    selector values must be in range [0; 1].
57 | |
58 | @code |
59 | r0 = a[s0] |
60 | r1 = a[s1] |
61 | r2 = a[s0+2] |
62 | r3 = a[s1+2] |
63 | 256-bit version: |
64 | r4 = a[s0+4] |
65 | r5 = a[s1+4] |
66 | r6 = a[s0+6] |
67 | r7 = a[s1+6] |
68 | @endcode |
69 | |
70 | @par integer |
71 | @par 128-bit version: |
72 | @icost{NEON, 2-4} |
73 | @icost{ALTIVEC, 1-2} |
74 | |
75 | @par 256-bit version: |
76 | @icost{SSE2-AVX, 2} |
77 | @icost{NEON, 4-8} |
78 | @icost{ALTIVEC, 2-3} |
79 | |
80 | @par floating-point |
81 | @par 128-bit version: |
82 | @icost{NEON, 2-4} |
83 | @icost{ALTIVEC, 1-2} |
84 | |
85 | @par 256-bit version: |
86 | @icost{SSE2-AVX, 2} |
87 | @icost{NEON, 4-8} |
88 | @icost{ALTIVEC, 2-3} |
89 | */ |
90 | template<unsigned s0, unsigned s1, unsigned N, class V> SIMDPP_INL |
91 | typename detail::get_expr_nomask<V>::empty |
92 | permute2(const any_vec32<N,V>& a) |
93 | { |
94 | static_assert(s0 < 2 && s1 < 2, "Selector out of range" ); |
95 | typename detail::get_expr_nomask<V>::type ra; |
96 | ra = a.wrapped().eval(); |
97 | return detail::insn::i_permute2<s0,s1>(ra); |
98 | } |
99 | |
/** Permutes the values of each set of two consecutive 64-bit values. The
    selector values must be in range [0; 1].
102 | |
103 | @code |
104 | r0 = a[s0] |
105 | r1 = a[s1] |
106 | |
107 | 256-bit version: |
108 | r2 = a[s0+2] |
109 | r3 = a[s1+2] |
110 | @endcode |
111 | |
112 | @par 128-bit version: |
113 | @icost{NEON, 1-2} |
114 | @icost{ALTIVEC, 1-2} |
115 | |
116 | @par 256-bit version: |
117 | @icost{SSE2-AVX, 2} |
118 | @icost{NEON, 2-4} |
119 | @icost{ALTIVEC, 2-4} |
120 | */ |
121 | template<unsigned s0, unsigned s1, unsigned N, class V> SIMDPP_INL |
122 | typename detail::get_expr_nomask<V>::empty |
123 | permute2(const any_vec64<N,V>& a) |
124 | { |
125 | static_assert(s0 < 2 && s1 < 2, "Selector out of range" ); |
126 | typename detail::get_expr_nomask<V>::type ra; |
127 | ra = a.wrapped().eval(); |
128 | return detail::insn::i_permute2<s0,s1>(ra); |
129 | } |
130 | |
131 | } // namespace SIMDPP_ARCH_NAMESPACE |
132 | } // namespace simdpp |
133 | |
134 | #endif |
135 | |
136 | |