1/* Copyright (C) 2011-2017 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_BITS_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_BITS_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16#include <simdpp/core/bit_and.h>
17#include <simdpp/core/bit_or.h>
18#include <simdpp/core/extract.h>
19#include <simdpp/core/i_shift_l.h>
20#include <simdpp/core/i_sub.h>
21#include <simdpp/core/make_uint.h>
22#include <simdpp/core/move_l.h>
23
24namespace simdpp {
25namespace SIMDPP_ARCH_NAMESPACE {
26namespace detail {
27namespace insn {
28
29SIMDPP_INL uint16_t i_extract_bits_any(const uint8<16>& ca)
30{
31 uint8<16> a = ca;
32#if SIMDPP_USE_NULL
33 uint16_t r = 0;
34 for (unsigned i = 0; i < a.length; i++) {
35 uint8_t x = ca.el(i);
36 x = x & 1;
37 r = (r >> 1) | (uint16_t(x) << 15);
38 }
39 return r;
40#elif SIMDPP_USE_SSE2
41 // Note that i_extract_bits depends on the exact implementation of this
42 // function.
43 return _mm_movemask_epi8(a.native());
44#elif SIMDPP_USE_NEON
45 uint8x16 mask = make_uint(0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80);
46
47 a = bit_and(a, mask);
48 uint16<8> a16 = vpaddlq_u8(a.native());
49 uint32<4> a32 = vpaddlq_u16(a16.native());
50 uint8<16> a8 = vreinterpretq_u8_u64(vpaddlq_u32(a32.native()));
51 uint8x8_t r = vzip_u8(vget_low_u8(a8.native()), vget_high_u8(a8.native())).val[0];
52 return vget_lane_u16(vreinterpret_u16_u8(r), 0);
53#elif SIMDPP_USE_ALTIVEC
54 uint8x16 mask = make_uint(0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80);
55 a = bit_and(a, mask);
56 uint32<4> zero = make_zero();
57 uint32x4 s = vec_sum4s(a.native(), zero.native());
58 uint32x4 shifts = make_uint(0, 0, 8, 8);
59 s = (__vector uint32_t) vec_sl(s.native(), shifts.native());
60 s = (int32x4)vec_sums((__vector int32_t)s.native(),
61 (__vector int32_t)zero.native());
62#if SIMDPP_BIG_ENDIAN
63 return extract<7>(uint16x8(s));
64#else
65 return extract<6>(uint16x8(s));
66#endif
67#elif SIMDPP_USE_MSA
68 // Note: the implementation of extract_bits depends of the exact behavior
69 // of this function
70 uint8x16 mask = make_uint(0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80);
71
72 a = bit_and(a, mask);
73 uint16<8> a16 = __msa_hadd_u_h(a.native(), a.native());
74 uint32<4> a32 = __msa_hadd_u_w(a16.native(), a16.native());
75 a = (v16u8) __msa_hadd_u_d(a32.native(), a32.native());
76 a = bit_or(a, move16_l<7>(a));
77 return extract<0>((uint16<8>)a);
78#endif
79}
80
81SIMDPP_INL uint32_t i_extract_bits_any(const uint8<32>& ca)
82{
83 uint8<32> a = ca;
84#if SIMDPP_USE_AVX2
85 return _mm256_movemask_epi8(a.native());
86#else
87 uint8<16> lo, hi;
88 split(a, lo, hi);
89 return i_extract_bits_any(lo) | (i_extract_bits_any(hi) << 16);
90#endif
91}
92
93template<unsigned id> SIMDPP_INL
94uint16_t i_extract_bits(const uint8<16>& ca)
95{
96 uint8<16> a = ca;
97#if SIMDPP_USE_NULL
98 uint16_t r = 0;
99 for (unsigned i = 0; i < a.length; i++) {
100 uint8_t x = ca.el(i);
101 x = (x >> id) & 1;
102 r = (r >> 1) | (uint16_t(x) << 15);
103 }
104 return r;
105#elif SIMDPP_USE_SSE2
106 a = shift_l<7-id>((uint16x8) a);
107 return i_extract_bits_any(a);
108#elif SIMDPP_USE_NEON
109 int8x16 shift_mask = make_int(0-int(id), 1-int(id), 2-int(id), 3-int(id),
110 4-int(id), 5-int(id), 6-int(id), 7-int(id));
111
112 a = vshlq_u8(a.native(), shift_mask.native());
113 return i_extract_bits_any(a);
114#elif SIMDPP_USE_ALTIVEC
115 uint8x16 rot_mask = make_int(0-int(id), 1-int(id), 2-int(id), 3-int(id),
116 4-int(id), 5-int(id), 6-int(id), 7-int(id));
117 a = vec_rl(a.native(), rot_mask.native());
118 return i_extract_bits_any(a);
119#elif SIMDPP_USE_MSA
120 int8x16 shifts = make_int(0-int(id), 1-int(id), 2-int(id), 3-int(id),
121 4-int(id), 5-int(id), 6-int(id), 7-int(id));
122 uint8<16> a_l = (v16u8) __msa_sll_b((v16i8) a.native(), shifts.native());
123 shifts = sub((int8<16>) make_zero(), shifts);
124 uint8<16> a_r = (v16u8) __msa_srl_b((v16i8) a.native(), shifts.native());
125 a = bit_or(a_l, a_r);
126 return i_extract_bits_any(a);
127#endif
128}
129
130template<unsigned id> SIMDPP_INL
131uint32_t i_extract_bits(const uint8<32>& ca)
132{
133 uint8<32> a = ca;
134#if SIMDPP_USE_AVX2
135 a = shift_l<7-id>((uint16<16>) a);
136 return i_extract_bits_any(a);
137#else
138 uint8<16> lo, hi;
139 split(a, lo, hi);
140 return i_extract_bits<id>(lo) | (i_extract_bits<id>(hi) << 16);
141#endif
142}
143
144} // namespace insn
145} // namespace detail
146} // namespace SIMDPP_ARCH_NAMESPACE
147} // namespace simdpp
148
149#endif
150
151
152