1/* Copyright (C) 2013-2017 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_SHRINK_TO_INT16_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_CONV_SHRINK_TO_INT16_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/permute4.h>
17#include <simdpp/core/unzip_lo.h>
18#include <simdpp/detail/insn/conv_shrink_to_int32.h>
19
20namespace simdpp {
21namespace SIMDPP_ARCH_NAMESPACE {
22namespace detail {
23namespace insn {
24
25
26// -----------------------------------------------------------------------------
27
// Truncating conversion: narrows each unsigned 32-bit element of an
// 8-element vector to its low 16 bits (values are taken modulo 2^16,
// not saturated).
SIMDPP_INL uint16<8> i_to_uint16(const uint32<8>& a)
{
#if SIMDPP_USE_NULL
    // Portable scalar fallback: a is stored as two 4-element sub-vectors.
    uint16<8> r;
    for (unsigned i = 0; i < 8; i++) {
        r.el(i) = uint16_t(a.vec(i/4).el(i%4));
    }
    return r;
#elif SIMDPP_USE_AVX512VL
    // Single-instruction 32->16 truncating narrow of the 256-bit register.
    return _mm256_cvtepi32_epi16(a.native());
#elif SIMDPP_USE_SSSE3
    // make_shuffle_bytes16_mask writes into its argument (output-parameter
    // idiom). The mask gathers the low 16 bits of each 32-bit lane into the
    // low 64 bits of every 128-bit lane; the upper elements are don't-care.
    uint16<16> perm_mask = make_shuffle_bytes16_mask<0,2,4,6,0,0,0,0>(perm_mask);
    uint16<16> a16;
    uint64<4> a64;
    a16 = a;
    a64 = permute_bytes16(a16, perm_mask);
#if SIMDPP_USE_AVX2
    // Move the low qword of each 128-bit lane to positions 0 and 1, then
    // keep only the low 128 bits of the 256-bit register.
    a64 = permute4<0,2,0,2>(a64);
    return _mm256_castsi256_si128(a64.native());
#else
    // Two 128-bit sub-vectors: concatenate the low qword of each half.
    return (uint16<8>) zip2_lo(a64.vec(0), a64.vec(1));
#endif
#elif SIMDPP_USE_NEON64
    // AArch64: narrow the low half, then narrow the high half directly into
    // the upper lanes of the result.
    uint16x4_t low = vmovn_u32(a.vec(0).native());
    return vmovn_high_u32(low, a.vec(1).native());
#elif SIMDPP_USE_NEON
    // 32-bit NEON has no vmovn_high; narrow both halves and combine.
    uint16x4_t low = vmovn_u32(a.vec(0).native());
    uint16x4_t high = vmovn_u32(a.vec(1).native());
    return vcombine_u16(low, high);
#elif SIMDPP_USE_ALTIVEC
    // vec_pack performs a modulo (truncating) pack of both input vectors.
    return vec_pack(a.vec(0).native(), a.vec(1).native());
#elif SIMDPP_USE_SSE2 || SIMDPP_USE_MSA
    // Reinterpret as 16-bit lanes and keep the even-indexed (low-half)
    // elements from both sub-vectors.
    uint16<8> r1, r2;
    r1 = a.vec(0);
    r2 = a.vec(1);
    return unzip8_lo(r1, r2);
#endif
}
66
#if SIMDPP_USE_AVX2
// Truncating 32->16 conversion for 16-element vectors (modulo semantics).
SIMDPP_INL uint16<16> i_to_uint16(const uint32<16>& a)
{
#if SIMDPP_USE_AVX512BW
    // Single-instruction narrow of the whole 512-bit register.
    return _mm512_cvtepi32_epi16(a.native());
#elif SIMDPP_USE_AVX512F
    // Gather the truncated 16-bit values into the low 64 bits of each
    // 128-bit lane, split into two 256-bit halves, then interleave the
    // low qwords and reorder them into element order.
    uint16<32> perm_mask = make_shuffle_bytes16_mask<0,2,4,6,0,0,0,0>(perm_mask);
    uint64<8> a64;
    uint64<4> a64_0, a64_1;
    a64 = permute_bytes16((uint16<32>) a, perm_mask);
    split(a64, a64_0, a64_1);
    a64_0 = zip2_lo(a64_0, a64_1);
    a64_0 = permute4<0,2,1,3>(a64_0);
    return (uint16<16>) a64_0;
#else
    // Plain AVX2: same gather applied to each 256-bit sub-vector
    // separately, then merge the low qwords and put them in element order.
    uint16<16> perm_mask = make_shuffle_bytes16_mask<0,2,4,6,0,0,0,0>(perm_mask);
    uint64<4> a64_0, a64_1;
    a64_0 = permute_bytes16((uint16<16>) a.vec(0), perm_mask);
    a64_1 = permute_bytes16((uint16<16>) a.vec(1), perm_mask);
    a64_0 = zip2_lo(a64_0, a64_1);
    a64_0 = permute4<0,2,1,3>(a64_0);
    return (uint16<16>) a64_0;
#endif
}
#endif
92
#if SIMDPP_USE_AVX512BW
// Truncating 32->16 conversion for 32-element vectors: narrow each 512-bit
// sub-vector with a single instruction and concatenate the 256-bit results.
SIMDPP_INL uint16<32> i_to_uint16(const uint32<32>& a)
{
    uint16<16> r1 = _mm512_cvtepi32_epi16(a.vec(0).native());
    uint16<16> r2 = _mm512_cvtepi32_epi16(a.vec(1).native());
    return combine(r1, r2);
}
#endif
101
// Generic fallback for wider 32-bit vectors: applies the fixed-width
// overloads above to each sub-vector of the source.
template<unsigned N> SIMDPP_INL
uint16<N> i_to_uint16(const uint32<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(uint16<N>, i_to_uint16, a)
}
107
108// -----------------------------------------------------------------------------
109
// Truncating conversion: narrows each unsigned 64-bit element of an
// 8-element vector to its low 16 bits (modulo 2^16, not saturated).
SIMDPP_INL uint16<8> i_to_uint16(const uint64<8>& a)
{
#if SIMDPP_USE_NULL
    // Portable scalar fallback: a is stored as four 2-element sub-vectors.
    uint16<8> r;
    for (unsigned i = 0; i < 8; i++) {
        r.el(i) = uint16_t(a.vec(i/2).el(i%2));
    }
    return r;
#elif SIMDPP_USE_AVX512F
    // Single-instruction 64->16 truncating narrow of the 512-bit register.
    return _mm512_cvtepi64_epi16(a.native());
#elif SIMDPP_USE_AVX2
    // make_shuffle_bytes16_mask writes into its argument. The mask gathers
    // the low 16 bits of each 64-bit lane into the low 32 bits of every
    // 128-bit lane; the zip/permute sequence then compacts those 32-bit
    // pieces from both 256-bit sources down into one 128-bit result.
    uint16<16> perm_mask = make_shuffle_bytes16_mask<0,4,0,0,0,0,0,0>(perm_mask);
    uint32<8> a32_0, a32_1;
    uint64<4> a64_0;
    uint32<4> b32;
    a32_0 = permute_bytes16((uint16<16>) a.vec(0), perm_mask);
    a32_1 = permute_bytes16((uint16<16>) a.vec(1), perm_mask);
    a64_0 = zip4_lo(a32_0, a32_1);
    a32_0 = permute4<0,2,1,3>(a64_0);
    b32 = _mm256_castsi256_si128(a32_0.native());
    b32 = permute4<0,2,1,3>(b32);
    return (uint16<8>) b32;
#elif SIMDPP_USE_SSSE3
    // Same idea with four 128-bit sub-vectors: gather the low 16 bits of
    // each qword into lane 0 of each vector, then zip pairs of vectors
    // until all eight values land in a single register.
    uint16<8> perm_mask = make_shuffle_bytes16_mask<0,4,0,0,0,0,0,0>(perm_mask);
    uint32<4> a32_0, a32_1, a32_2, a32_3;
    uint64<2> a64_0, a64_1;
    a32_0 = permute_bytes16((uint16<8>) a.vec(0), perm_mask);
    a32_1 = permute_bytes16((uint16<8>) a.vec(1), perm_mask);
    a32_2 = permute_bytes16((uint16<8>) a.vec(2), perm_mask);
    a32_3 = permute_bytes16((uint16<8>) a.vec(3), perm_mask);
    a64_0 = zip4_lo(a32_0, a32_1);
    a64_1 = zip4_lo(a32_2, a32_3);
    a64_0 = zip2_lo(a64_0, a64_1);
    return (uint16<8>) a64_0;
#else
    // Remaining ISAs: narrow 64->32 first, then reuse the 32->16 path.
    uint32<8> a32 = i_to_uint32(a);
    return i_to_uint16(a32);
#endif
}
149
#if SIMDPP_USE_AVX2
// Truncating 64->16 conversion for 16-element vectors (modulo semantics).
SIMDPP_INL uint16<16> i_to_uint16(const uint64<16>& a)
{
#if SIMDPP_USE_AVX512F
    // Narrow each 512-bit sub-vector with a single instruction and
    // concatenate the two 128-bit results.
    uint16<8> r0 = _mm512_cvtepi64_epi16(a.vec(0).native());
    uint16<8> r1 = _mm512_cvtepi64_epi16(a.vec(1).native());
    return combine(r0, r1);
#else
    // Plain AVX2: gather the low 16 bits of each 64-bit lane into the low
    // 32 bits of every 128-bit lane, then zip/permute the four 256-bit
    // sub-vectors until the 16 results are in element order.
    uint16<16> perm_mask = make_shuffle_bytes16_mask<0,4,0,0,0,0,0,0>(perm_mask);
    uint32<8> a32_0, a32_1, a32_2, a32_3;
    uint64<4> a64_0, a64_1;
    a32_0 = permute_bytes16((uint16<16>) a.vec(0), perm_mask);
    a32_1 = permute_bytes16((uint16<16>) a.vec(1), perm_mask);
    a32_2 = permute_bytes16((uint16<16>) a.vec(2), perm_mask);
    a32_3 = permute_bytes16((uint16<16>) a.vec(3), perm_mask);
    a64_0 = zip4_lo(a32_0, a32_1);
    a64_1 = zip4_lo(a32_2, a32_3);
    a64_0 = zip2_lo(a64_0, a64_1);
    a32_0 = permute4<0,2,1,3>(a64_0);
    a32_0 = permute4<0,2,1,3>(a32_0);
    return (uint16<16>) a32_0;
#endif
}
#endif
174
#if SIMDPP_USE_AVX512BW
// Truncating 64->16 conversion for 32-element vectors: narrow each of the
// four 512-bit sub-vectors and concatenate the 128-bit results.
SIMDPP_INL uint16<32> i_to_uint16(const uint64<32>& a)
{
    uint16<8> r0 = _mm512_cvtepi64_epi16(a.vec(0).native());
    uint16<8> r1 = _mm512_cvtepi64_epi16(a.vec(1).native());
    uint16<8> r2 = _mm512_cvtepi64_epi16(a.vec(2).native());
    uint16<8> r3 = _mm512_cvtepi64_epi16(a.vec(3).native());
    return combine(combine(r0, r1), combine(r2, r3));
}
#endif
185
// Generic fallback for wider 64-bit vectors: applies the fixed-width
// overloads above to each sub-vector of the source.
template<unsigned N> SIMDPP_INL
uint16<N> i_to_uint16(const uint64<N>& a)
{
    SIMDPP_VEC_ARRAY_IMPL_CONV_EXTRACT(uint16<N>, i_to_uint16, a)
}
191
192} // namespace insn
193} // namespace detail
194} // namespace SIMDPP_ARCH_NAMESPACE
195} // namespace simdpp
196
197#endif
198
199
200