/* Copyright (C) 2017 Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
    (See accompanying file LICENSE_1_0.txt or copy at
    http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_CORE_I_SHIFT_L_V_H
#define LIBSIMDPP_SIMDPP_CORE_I_SHIFT_L_V_H

#ifndef LIBSIMDPP_SIMD_H
#error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/null/math.h>
#include <simdpp/detail/insn/i_shift.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/permute_bytes16.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {
// emulates 8-bit variable shift using 16-bit variable shift
template<class U8> SIMDPP_INL
U8 v_emul_shift_l_v8_using_v16(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

    U16 a16; a16 = a;
    U16 c16; c16 = count;

    // Process the two 8-bit halves of each 16-bit element separately. The
    // high half can be shifted in place, because bits shifted out of it
    // simply fall off the top of the 16-bit element.
    U16 select_mask = make_uint(0xff00);
    U16 a_lo = a16;
    U16 a_hi = bit_and(a16, select_mask);
    U16 c_lo = bit_andnot(c16, select_mask);
    U16 c_hi = shift_r<8>(c16);
    a_lo = shift_l(a_lo, c_lo);
    a_hi = shift_l(a_hi, c_hi);
    // clear the bits that the low half shifted into the high byte
    a_lo = bit_andnot(a_lo, select_mask);

    a16 = bit_or(a_lo, a_hi);
    return (U8) a16;
}
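
// For reference, both byte-shift emulations in this file compute the
// following per-lane operation (a minimal scalar sketch; the lane count N
// and shift counts in the range [0, 7] are assumptions of the illustration):
//
//     for (unsigned i = 0; i < N; ++i)
//         r[i] = uint8_t(a[i] << count[i]);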

// emulates 8-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U8> SIMDPP_INL
U8 v_emul_shift_l_v8_using_mul(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

    // Variable shift is implemented by obtaining 1 << countN for each element
    // of count and then multiplying the corresponding element of a by that
    // number. The implementation is complicated by the fact that only 16-bit
    // multiplication is available.
    U8 mulshift_mask = make_uint(0x01, 0x02, 0x04, 0x08,
                                 0x10, 0x20, 0x40, 0x80);
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, count);

    U16 a16; a16 = a;
    U16 a16_lo, a16_hi, mulshift_lo, mulshift_hi;
    U16 select_mask = make_uint(0x00ff);

    // Move the element values to the high byte of the 16-bit elements and the
    // shift values to the low byte. The products then have the low byte
    // clear, which helps composing the result back into a single vector.
    a16_lo = shift_l<8>(a16);
    mulshift_lo = bit_and(mulshift, select_mask);
    a16_hi = bit_andnot(a16, select_mask);
    mulshift_hi = shift_r<8>(mulshift);

    a16_lo = mul_lo(a16_lo, mulshift_lo);
    a16_hi = mul_lo(a16_hi, mulshift_hi);

    a16_lo = shift_r<8>(a16_lo);
    a16 = bit_or(a16_lo, a16_hi);
    return (U8) a16;
}
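
// The lookup-and-multiply idea above, written out in scalar form
// (illustration only; as in the vector code, the shift count is assumed to
// be in the range [0, 7]):
//
//     uint8_t mulshift = uint8_t(1) << count; // done via permute_bytes16
//     uint8_t r = uint8_t(a * mulshift);      // equals a << count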

static SIMDPP_INL
uint8<16> i_shift_l_v(const uint8<16>& a, const uint8<16>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_l_v8_using_v16(a, count);
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_l_v8_using_mul(a, count);
#elif SIMDPP_USE_NEON
    return vshlq_u8(a.native(), vreinterpretq_s8_u8(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_sll_b((v16i8)a.native(), (v16i8)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8<32> i_shift_l_v(const uint8<32>& a, const uint8<32>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_l_v8_using_v16(a, count);
#else
    return v_emul_shift_l_v8_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint8<64> i_shift_l_v(const uint8<64>& a, const uint8<64>& count)
{
    return v_emul_shift_l_v8_using_v16(a, count);
}
#endif

// -----------------------------------------------------------------------------

// emulates 16-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U16> SIMDPP_INL
U16 v_emul_shift_l_v16_using_mul(const U16& a, const U16& count)
{
    using U8 = typename same_width<U16>::u8;

    // Variable shift is implemented by obtaining 1 << countN for each element
    // of count and then multiplying the corresponding element of a by that
    // number. The implementation is complicated by the fact that
    // permute_bytes16 permutes 8-bit elements instead of the 16-bit elements
    // that would be optimal in this case.
    U8 mulshift_mask = make_uint(0x01, 0x02, 0x04, 0x08,
                                 0x10, 0x20, 0x40, 0x80,
                                 0x00, 0x00, 0x00, 0x00,
                                 0x00, 0x00, 0x00, 0x00);
    // Place the shift count into both bytes of each 16-bit element, then
    // toggle bit 3 of the count in the high byte. As a result, for counts in
    // [0, 7] the high byte selects zeros from mulshift_mask, and for counts
    // in [8, 15] the low byte selects zeros while the high byte selects
    // 1 << (count - 8). Either way the 16-bit element becomes 1 << count.
    U16 qcount = bit_or(count, shift_l<8>(count));
    qcount = bit_xor(qcount, 0x0800);
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, (U8) qcount);
    return mul_lo(a, mulshift);
}
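
// Worked example of the bit toggle above, for count = 3 and count = 11
// (values chosen for illustration):
//
//     count = 3:  low byte looks up index 3           -> 0x08
//                 high byte looks up index 3 ^ 8 = 11 -> 0x00
//                 mulshift = 0x0008 == 1 << 3
//     count = 11: low byte looks up index 11          -> 0x00
//                 high byte looks up index 11 ^ 8 = 3 -> 0x08
//                 mulshift = 0x0800 == 1 << 11
//
// In both cases mul_lo(a, mulshift) produces a << count.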

static SIMDPP_INL
uint16<8> i_shift_l_v(const uint16<8>& a, const uint16<8>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm_sllv_epi16(a.native(), count.native());
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_l_v16_using_mul(a, count);
#elif SIMDPP_USE_NEON
    return vshlq_u16(a.native(), vreinterpretq_s16_u16(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_sll_h((v8i16)a.native(), (v8i16)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16<16> i_shift_l_v(const uint16<16>& a, const uint16<16>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm256_sllv_epi16(a.native(), count.native());
#else
    return v_emul_shift_l_v16_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_shift_l_v(const uint16<32>& a, const uint16<32>& count)
{
    return _mm512_sllv_epi16(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32<4> i_shift_l_v(const uint32<4>& a, const uint32<4>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_l_v(a, count);
#elif SIMDPP_USE_AVX2
    return _mm_sllv_epi32(a.native(), count.native());
#elif SIMDPP_USE_SSE2
    // SSE2 only provides _mm_sll_epi32, which shifts all elements by the
    // count stored in the low 64 bits of its second operand. Build four
    // vectors that each carry one element's count in the low 64 bits, shift
    // by each of them, and blend the correct element out of each result.
    uint32<4> count0 = count;
#if SIMDPP_USE_SSE4_1
    uint32<4> zero = make_zero();
    count0 = _mm_blend_epi16(count0.native(), zero.native(), 0xcc);
#else
    uint32<4> mask = make_uint(0xffffffff, 0, 0xffffffff, 0);
    count0 = bit_and(count0, mask);
#endif
    uint32<4> count1 = _mm_srli_epi64(count.native(), 32);
    uint32<4> count2 = _mm_srli_si128(count0.native(), 8);
    uint32<4> count3 = _mm_srli_si128(count.native(), 12);

    __m128i a0 = _mm_sll_epi32(a.native(), count0.native());
    __m128i a1 = _mm_sll_epi32(a.native(), count1.native());
    __m128i a2 = _mm_sll_epi32(a.native(), count2.native());
    __m128i a3 = _mm_sll_epi32(a.native(), count3.native());
    // combine the results: element 0 from a0, element 1 from a1, element 2
    // from a2 and element 3 from a3
#if SIMDPP_USE_SSE4_1
    a0 = _mm_blend_epi16(a0, a1, 0x0c);
    a2 = _mm_blend_epi16(a2, a3, 0xc0);
    a0 = _mm_blend_epi16(a0, a2, 0xf0);
#else
    __m128 f0 = _mm_shuffle_ps(_mm_castsi128_ps(a0),
                               _mm_castsi128_ps(a1),
                               SIMDPP_SHUFFLE_MASK_4x4(0, 0, 1, 1));
    __m128 f1 = _mm_shuffle_ps(_mm_castsi128_ps(a2),
                               _mm_castsi128_ps(a3),
                               SIMDPP_SHUFFLE_MASK_4x4(2, 2, 3, 3));
    f0 = _mm_shuffle_ps(f0, f1, SIMDPP_SHUFFLE_MASK_4x4(0, 2, 0, 2));
    a0 = _mm_castps_si128(f0);
#endif
    return a0;
#elif SIMDPP_USE_NEON
    return vshlq_u32(a.native(), vreinterpretq_s32_u32(count.native()));
#elif SIMDPP_USE_ALTIVEC
    return vec_sl(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_sll_w((v4i32)a.native(), (v4i32)count.native());
#endif
}
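
// Usage sketch (hypothetical values): each 32-bit lane of the first operand
// is shifted left by the count in the corresponding lane of the second.
//
//     uint32<4> v = make_uint(1, 1, 1, 1);
//     uint32<4> c = make_uint(0, 1, 2, 3);
//     uint32<4> r = i_shift_l_v(v, c); // r = { 1, 2, 4, 8 }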

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<8> i_shift_l_v(const uint32<8>& a, const uint32<8>& count)
{
    return _mm256_sllv_epi32(a.native(), count.native());
}
#endif

#if SIMDPP_USE_AVX512F
SIMDPP_INL uint32<16> i_shift_l_v(const uint32<16>& a, const uint32<16>& count)
{
    return _mm512_sllv_epi32(a.native(), count.native());
}
#endif
// -----------------------------------------------------------------------------

// generic case: apply the operation to each native-width sub-vector of a
// wider vector
template<class V, class U> SIMDPP_INL
V i_shift_l_v(const V& a, const U& b)
{
    SIMDPP_VEC_ARRAY_IMPL2(V, i_shift_l_v, a, b);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif