1/* Copyright (C) 2011-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6*/
7
8#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_COMBINE_H
9#define LIBSIMDPP_SIMDPP_DETAIL_INSN_COMBINE_H
10
11#ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13#endif
14
15#include <simdpp/types.h>
16
17namespace simdpp {
18namespace SIMDPP_ARCH_NAMESPACE {
19namespace detail {
20namespace insn {
21
#if SIMDPP_USE_AVX2
// Joins two 128-bit uint8 vectors into one 256-bit vector:
// `a` becomes the lower half, `b` the upper half.
template<class Dummy> SIMDPP_INL
uint8<32> i_combine(const uint8<16>& a, const uint8<16>& b)
{
    // The cast leaves the upper 128 bits undefined; the insert below
    // overwrites them with `b`.
    const __m256i lower = _mm256_castsi128_si256(a.native());
    uint8<32> combined;
    combined = _mm256_inserti128_si256(lower, b.native(), 1);
    return combined;
}
#endif
32
#if SIMDPP_USE_AVX512BW
// Joins two 256-bit uint8 vectors into one 512-bit vector:
// `a` becomes the lower half, `b` the upper half.
template<class Dummy> SIMDPP_INL
uint8<64> i_combine(const uint8<32>& a, const uint8<32>& b)
{
    // The cast leaves the upper 256 bits undefined; the insert below
    // overwrites them with `b`.
    const __m512i lower = _mm512_castsi256_si512(a.native());
    uint8<64> combined;
    combined = _mm512_inserti64x4(lower, b.native(), 1);
    return combined;
}
#endif
43
44// -----------------------------------------------------------------------------
45
#if SIMDPP_USE_AVX2
// Joins two 128-bit uint16 vectors into one 256-bit vector:
// `a` becomes the lower half, `b` the upper half.
template<class Dummy> SIMDPP_INL
uint16<16> i_combine(const uint16<8>& a, const uint16<8>& b)
{
    // The cast leaves the upper 128 bits undefined; the insert below
    // overwrites them with `b`.
    const __m256i lower = _mm256_castsi128_si256(a.native());
    uint16<16> combined;
    combined = _mm256_inserti128_si256(lower, b.native(), 1);
    return combined;
}
#endif
56
#if SIMDPP_USE_AVX512BW
// Joins two 256-bit uint16 vectors into one 512-bit vector:
// `a` becomes the lower half, `b` the upper half.
template<class Dummy> SIMDPP_INL
uint16<32> i_combine(const uint16<16>& a, const uint16<16>& b)
{
    // The cast leaves the upper 256 bits undefined; the insert below
    // overwrites them with `b`.
    const __m512i lower = _mm512_castsi256_si512(a.native());
    uint16<32> combined;
    combined = _mm512_inserti64x4(lower, b.native(), 1);
    return combined;
}
#endif
67
68// -----------------------------------------------------------------------------
69
#if SIMDPP_USE_AVX2
// Joins two 128-bit uint32 vectors into one 256-bit vector:
// `a` becomes the lower half, `b` the upper half.
template<class Dummy> SIMDPP_INL
uint32<8> i_combine(const uint32<4>& a, const uint32<4>& b)
{
    // The cast leaves the upper 128 bits undefined; the insert below
    // overwrites them with `b`.
    const __m256i lower = _mm256_castsi128_si256(a.native());
    uint32<8> combined;
    combined = _mm256_inserti128_si256(lower, b.native(), 1);
    return combined;
}
#endif
80
#if SIMDPP_USE_AVX512F
// Joins two 256-bit uint32 vectors into one 512-bit vector:
// `a` becomes the lower half, `b` the upper half.
template<class Dummy> SIMDPP_INL
uint32<16> i_combine(const uint32<8>& a, const uint32<8>& b)
{
    // The cast leaves the upper 256 bits undefined; the insert below
    // overwrites them with `b`.
    const __m512i lower = _mm512_castsi256_si512(a.native());
    uint32<16> combined;
    combined = _mm512_inserti64x4(lower, b.native(), 1);
    return combined;
}
#endif
91
92// -----------------------------------------------------------------------------
93
#if SIMDPP_USE_AVX2
// Joins two 128-bit uint64 vectors into one 256-bit vector:
// `a` becomes the lower half, `b` the upper half.
template<class Dummy> SIMDPP_INL
uint64<4> i_combine(const uint64<2>& a, const uint64<2>& b)
{
    // The cast leaves the upper 128 bits undefined; the insert below
    // overwrites them with `b`.
    const __m256i lower = _mm256_castsi128_si256(a.native());
    uint64<4> combined;
    combined = _mm256_inserti128_si256(lower, b.native(), 1);
    return combined;
}
#endif
104
#if SIMDPP_USE_AVX512F
// Joins two 256-bit uint64 vectors into one 512-bit vector:
// `a` becomes the lower half, `b` the upper half.
template<class Dummy> SIMDPP_INL
uint64<8> i_combine(const uint64<4>& a, const uint64<4>& b)
{
    // The cast leaves the upper 256 bits undefined; the insert below
    // overwrites them with `b`.
    const __m512i lower = _mm512_castsi256_si512(a.native());
    uint64<8> combined;
    combined = _mm512_inserti64x4(lower, b.native(), 1);
    return combined;
}
#endif
115
116// -----------------------------------------------------------------------------
117
#if SIMDPP_USE_AVX
// Joins two 128-bit float32 vectors into one 256-bit vector:
// `a` becomes the lower half, `b` the upper half.
template<class Dummy> SIMDPP_INL
float32<8> i_combine(const float32<4>& a, const float32<4>& b)
{
    // The cast leaves the upper 128 bits undefined; the insert below
    // overwrites them with `b`.
    const __m256 lower = _mm256_castps128_ps256(a.native());
    float32<8> combined;
    combined = _mm256_insertf128_ps(lower, b.native(), 1);
    return combined;
}
#endif
128
#if SIMDPP_USE_AVX512F
// Joins two 256-bit float32 vectors into one 512-bit vector:
// `a` becomes the lower half, `b` the upper half.
template<class Dummy> SIMDPP_INL
float32<16> i_combine(const float32<8>& a, const float32<8>& b)
{
    // The cast leaves the upper 256 bits undefined; the insert below
    // overwrites them with `b`.
    const __m512 lower = _mm512_castps256_ps512(a.native());

    // The 256-bit insert is performed on the double-precision view of the
    // registers; the surrounding casts only reinterpret bits.
    // NOTE(review): presumably done because the float32 variant
    // (_mm512_insertf32x8) requires AVX512DQ rather than AVX512F - confirm.
    const __m512d merged = _mm512_insertf64x4(_mm512_castps_pd(lower),
                                              _mm256_castps_pd(b.native()), 1);

    float32<16> combined;
    combined = _mm512_castpd_ps(merged);
    return combined;
}
#endif
140
141// -----------------------------------------------------------------------------
142
#if SIMDPP_USE_AVX
// Joins two 128-bit float64 vectors into one 256-bit vector:
// `a` becomes the lower half, `b` the upper half.
template<class Dummy> SIMDPP_INL
float64<4> i_combine(const float64<2>& a, const float64<2>& b)
{
    // The cast leaves the upper 128 bits undefined; the insert below
    // overwrites them with `b`.
    const __m256d lower = _mm256_castpd128_pd256(a.native());
    float64<4> combined;
    combined = _mm256_insertf128_pd(lower, b.native(), 1);
    return combined;
}
#endif
153
#if SIMDPP_USE_AVX512F
// Joins two 256-bit float64 vectors into one 512-bit vector:
// `a` becomes the lower half, `b` the upper half.
template<class Dummy> SIMDPP_INL
float64<8> i_combine(const float64<4>& a, const float64<4>& b)
{
    // The cast leaves the upper 256 bits undefined; the insert below
    // overwrites them with `b`.
    const __m512d lower = _mm512_castpd256_pd512(a.native());
    float64<8> combined;
    combined = _mm512_insertf64x4(lower, b.native(), 1);
    return combined;
}
#endif
164
165// -----------------------------------------------------------------------------
166// generic implementation
167template<class V, class H> SIMDPP_INL
168V i_combine(const H& a1, const H& a2)
169{
170 V r;
171 unsigned h = H::vec_length;
172 for (unsigned i = 0; i < h; ++i) { r.vec(i) = a1.vec(i); }
173 for (unsigned i = 0; i < h; ++i) { r.vec(i+h) = a2.vec(i); }
174 return r;
175}
176
177} // namespace insn
178} // namespace detail
179} // namespace SIMDPP_ARCH_NAMESPACE
180} // namespace simdpp
181
182#endif
183