1/* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_PERMUTE_ZBYTES16_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_PERMUTE_ZBYTES16_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/detail/not_implemented.h>
17#include <simdpp/core/permute_bytes16.h>
18#include <simdpp/detail/vector_array_macros.h>
19
20namespace simdpp {
21namespace SIMDPP_ARCH_NAMESPACE {
22namespace detail {
23namespace insn {
24
25#if _MSC_VER
26#pragma warning(push)
27#pragma warning(disable: 4800)
28#endif
29
30static SIMDPP_INL
31uint8x16 i_permute_zbytes16(const uint8x16& a, const uint8x16& mask)
32{
33#if SIMDPP_USE_NULL
34 uint8x16 r;
35
36 for (unsigned i = 0; i < 16; i++) {
37 unsigned j = mask.el(i) & 0x0f;
38 bool zero = mask.el(i) & 0x80;
39 r.el(i) = zero ? 0 : a.el(j);
40 }
41 return r;
42#elif SIMDPP_USE_SSSE3 || SIMDPP_USE_NEON
43 return permute_bytes16(a, mask);
44#elif SIMDPP_USE_ALTIVEC
45 int8x16 a0 = a;
46 int8x16 zero_mask = mask;
47 zero_mask = shift_r<7>(zero_mask); // shift in the sign bit
48 a0 = i_permute_bytes16(a0, mask);
49 a0 = bit_andnot(a0, zero_mask);
50 return a0;
51#elif SIMDPP_USE_MSA
52 return (v16u8) __msa_vshf_b((v16i8) mask.native(),
53 (v16i8) a.native(),
54 (v16i8) a.native());
55#else
56 return SIMDPP_NOT_IMPLEMENTED2(a, mask);
57#endif
58}
59
60#if _MSC_VER
61#pragma warning(pop)
62#endif
63
#if SIMDPP_USE_AVX2
// 32-byte overload, available only when AVX2 is enabled. Forwards both native
// registers straight to _mm256_shuffle_epi8.
// NOTE(review): presumably the instruction works per 128-bit lane and zeroes
// bytes whose selector has the top bit set - confirm against the Intel docs.
static SIMDPP_INL
uint8x32 i_permute_zbytes16(const uint8x32& a, const uint8x32& mask)
{
    const __m256i shuffled = _mm256_shuffle_epi8(a.native(), mask.native());
    return shuffled;
}
#endif
71
#if SIMDPP_USE_AVX512BW
// 64-byte overload, available only when AVX512BW is enabled. Forwards both
// native registers straight to _mm512_shuffle_epi8.
// NOTE(review): presumably the instruction works per 128-bit lane and zeroes
// bytes whose selector has the top bit set - confirm against the Intel docs.
SIMDPP_INL uint8<64> i_permute_zbytes16(const uint8<64>& a, const uint8<64>& mask)
{
    const __m512i shuffled = _mm512_shuffle_epi8(a.native(), mask.native());
    return shuffled;
}
#endif
78
// Generic-width uint8<N> overload. The whole body is the
// SIMDPP_VEC_ARRAY_IMPL2 macro, invoked with the result type, this function's
// own name, and the two operands.
// NOTE(review): the macro is defined outside this file; presumably it applies
// the native-width i_permute_zbytes16 overload to each sub-vector of `a` and
// `mask` and returns the assembled result - confirm in
// vector_array_macros.h.
template<unsigned N> SIMDPP_INL
uint8<N> i_permute_zbytes16(const uint8<N>& a, const uint8<N>& mask)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint8<N>, i_permute_zbytes16, a, mask);
}
84template<unsigned N> SIMDPP_INL
85uint16<N> i_permute_zbytes16(const uint16<N>& a, const uint16<N>& mask)
86{
87 return (uint16<N>) i_permute_zbytes16(uint8<N*2>(a), uint8<N*2>(mask));
88}
89template<unsigned N> SIMDPP_INL
90uint32<N> i_permute_zbytes16(const uint32<N>& a, const uint32<N>& mask)
91{
92 return (uint32<N>) i_permute_zbytes16(uint8<N*4>(a), uint8<N*4>(mask));
93}
94template<unsigned N> SIMDPP_INL
95uint64<N> i_permute_zbytes16(const uint64<N>& a, const uint64<N>& mask)
96{
97 return (uint64<N>) i_permute_zbytes16(uint8<N*8>(a), uint8<N*8>(mask));
98}
99template<unsigned N> SIMDPP_INL
100float32<N> i_permute_zbytes16(const float32<N>& a, const uint32<N>& mask)
101{
102 return float32<N>(i_permute_zbytes16(uint32<N>(a), mask));
103}
104template<unsigned N> SIMDPP_INL
105float64<N> i_permute_zbytes16(const float64<N>& a, const uint64<N>& mask)
106{
107 return float64<N>(i_permute_zbytes16(uint64<N>(a), mask));
108}
109
110
111} // namespace insn
112} // namespace detail
113} // namespace SIMDPP_ARCH_NAMESPACE
114} // namespace simdpp
115
116#endif
117
118