1/* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_PERMUTE_BYTES16_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_PERMUTE_BYTES16_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/detail/not_implemented.h>
17#include <simdpp/detail/vector_array_macros.h>
18
19namespace simdpp {
20namespace SIMDPP_ARCH_NAMESPACE {
21namespace detail {
22namespace insn {
23
24static SIMDPP_INL
25uint8x16 i_permute_bytes16(const uint8x16& a, const uint8x16& mask)
26{
27#if SIMDPP_USE_NULL
28 uint8x16 r;
29
30 for (unsigned i = 0; i < 16; i++) {
31 unsigned j = mask.el(i) & 0x0f;
32 r.el(i) = a.el(j);
33 }
34 return r;
35#elif SIMDPP_USE_SSSE3
36 return _mm_shuffle_epi8(a.native(), mask.native());
37#elif SIMDPP_USE_NEON32
38 uint8x8x2_t table = {{vget_low_u8(a.native()), vget_high_u8(a.native())}};
39 uint8x8_t lo = vtbl2_u8(table, vget_low_u8(mask.native()));
40 uint8x8_t hi = vtbl2_u8(table, vget_high_u8(mask.native()));
41 return vcombine_u8(lo, hi);
42#elif SIMDPP_USE_NEON64
43 return vqtbl1q_u8(a.native(), mask.native());
44#elif SIMDPP_USE_ALTIVEC
45 return vec_perm(a.native(), a.native(), mask.native());
46#elif SIMDPP_USE_MSA
47 return (v16u8) __msa_vshf_b((v16i8)mask.native(),
48 (v16i8)a.native(),
49 (v16i8)a.native());
50#else
51 return SIMDPP_NOT_IMPLEMENTED2(a, mask);
52#endif
53}
54
#if SIMDPP_USE_AVX2
// 32-byte overload, available only when AVX2 is enabled.
// Forwards both operands to _mm256_shuffle_epi8.
static SIMDPP_INL
uint8x32 i_permute_bytes16(const uint8x32& a, const uint8x32& mask)
{
    const __m256i shuffled = _mm256_shuffle_epi8(a.native(), mask.native());
    return shuffled;
}
#endif
62
#if SIMDPP_USE_AVX512BW
// 64-byte overload, available only when AVX512BW is enabled.
// Forwards both operands to _mm512_shuffle_epi8.
SIMDPP_INL uint8<64> i_permute_bytes16(const uint8<64>& a, const uint8<64>& mask)
{
    const __m512i shuffled = _mm512_shuffle_epi8(a.native(), mask.native());
    return shuffled;
}
#endif
69
70template<unsigned N> SIMDPP_INL
71uint8<N> i_permute_bytes16(const uint8<N>& a, const uint8<N>& mask)
72{
73 SIMDPP_VEC_ARRAY_IMPL2(uint8<N>, i_permute_bytes16, a, mask)
74}
75template<unsigned N> SIMDPP_INL
76uint16<N> i_permute_bytes16(const uint16<N>& a, const uint16<N>& mask)
77{
78 return (uint16<N>) i_permute_bytes16(uint8<N*2>(a), uint8<N*2>(mask));
79}
80template<unsigned N> SIMDPP_INL
81uint32<N> i_permute_bytes16(const uint32<N>& a, const uint32<N>& mask)
82{
83 return (uint32<N>) i_permute_bytes16(uint8<N*4>(a), uint8<N*4>(mask));
84}
85template<unsigned N> SIMDPP_INL
86uint64<N> i_permute_bytes16(const uint64<N>& a, const uint64<N>& mask)
87{
88 return (uint64<N>) i_permute_bytes16(uint8<N*8>(a), uint8<N*8>(mask));
89}
90template<unsigned N> SIMDPP_INL
91float32<N> i_permute_bytes16(const float32<N>& a, const uint32<N>& mask)
92{
93 return float32<N>(i_permute_bytes16(uint32<N>(a), mask));
94}
95template<unsigned N> SIMDPP_INL
96float64<N> i_permute_bytes16(const float64<N>& a, const uint64<N>& mask)
97{
98 return float64<N>(i_permute_bytes16(uint64<N>(a), mask));
99}
100
101
102} // namespace insn
103} // namespace detail
104} // namespace SIMDPP_ARCH_NAMESPACE
105} // namespace simdpp
106
107#endif
108
109