PacketMathHalf.h source code [NanoGUI/ext/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h]

1	// This file is part of Eigen, a lightweight C++ template library
2	// for linear algebra.
3	//
4	// Copyright (C) 2016 Benoit Steiner <benoit.steiner.goog@gmail.com>
5	//
6	// This Source Code Form is subject to the terms of the Mozilla
7	// Public License v. 2.0. If a copy of the MPL was not distributed
8	// with this file, You can obtain one at http://mozilla.org/MPL/2.0/.
9
10	#ifndef EIGEN_PACKET_MATH_HALF_CUDA_H
11	#define EIGEN_PACKET_MATH_HALF_CUDA_H
12
13
14	namespace Eigen {
15	namespace internal {
16
17	// Most of the following operations require arch >= 3.0
18	#if defined(EIGEN_HAS_CUDA_FP16) && defined(__CUDACC__) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 300
19
20	template<> struct is_arithmetic<half2> { enum { value = true }; };
21
22	template<> struct packet_traits<Eigen::half> : default_packet_traits
23	{
24	typedef half2 type;
25	typedef half2 half;
26	enum {
27	Vectorizable = `1`,
28	AlignedOnScalar = `1`,
29	size=`2`,
30	HasHalfPacket = `0`,
31	HasAdd = `1`,
32	HasMul = `1`,
33	HasDiv = `1`,
34	HasSqrt = `1`,
35	HasRsqrt = `1`,
36	HasExp = `1`,
37	HasLog = `1`,
38	HasLog1p = `1`
39	};
40	};
41
42	template<> struct unpacket_traits<half2> { typedef Eigen::half type; enum {size=`2`, alignment=Aligned16}; typedef half2 half; };
43
44	template<> __device__ EIGEN_STRONG_INLINE half2 pset1<half2>(const Eigen::half& from) {
45	return __half2half2(from);
46	}
47
48	template<> __device__ EIGEN_STRONG_INLINE half2 pload<half2>(const Eigen::half* from) {
49	return *reinterpret_cast<const half2*>(from);
50	}
51
52	template<> __device__ EIGEN_STRONG_INLINE half2 ploadu<half2>(const Eigen::half* from) {
53	return __halves2half2(from[`0`], from[`1`]);
54	}
55
56	template<> EIGEN_STRONG_INLINE half2 ploaddup<half2>(const Eigen::half* from) {
57	return __halves2half2(from[`0`], from[`0`]);
58	}
59
60	template<> __device__ EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const half2& from) {
61	*reinterpret_cast<half2*>(to) = from;
62	}
63
64	template<> __device__ EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const half2& from) {
65	to[`0`] = __low2half(from);
66	to[`1`] = __high2half(from);
67	}
68
69	template<>
70	__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Aligned>(const Eigen::half* from) {
71	#if __CUDA_ARCH__ >= 350
72	return __ldg((const half2*)from);
73	#else
74	return __halves2half2((from+`0`), (from+`1`));
75	#endif
76	}
77
78	template<>
79	__device__ EIGEN_ALWAYS_INLINE half2 ploadt_ro<half2, Unaligned>(const Eigen::half* from) {
80	#if __CUDA_ARCH__ >= 350
81	return __halves2half2(__ldg(from+`0`), __ldg(from+`1`));
82	#else
83	return __halves2half2((from+`0`), (from+`1`));
84	#endif
85	}
86
87	template<> __device__ EIGEN_STRONG_INLINE half2 pgather<Eigen::half, half2>(const Eigen::half* from, Index stride) {
88	return __halves2half2(from[`0`stride], from[`1`stride]);
89	}
90
91	template<> __device__ EIGEN_STRONG_INLINE void pscatter<Eigen::half, half2>(Eigen::half* to, const half2& from, Index stride) {
92	to[stride*`0`] = __low2half(from);
93	to[stride*`1`] = __high2half(from);
94	}
95
96	template<> __device__ EIGEN_STRONG_INLINE Eigen::half pfirst<half2>(const half2& a) {
97	return __low2half(a);
98	}
99
100	template<> __device__ EIGEN_STRONG_INLINE half2 pabs<half2>(const half2& a) {
101	half2 result;
102	unsigned temp = (reinterpret_cast<const* unsigned*>(&(a)));
103	(reinterpret_cast<unsigned**>(&(result))) = temp & `0x7FFF7FFF`;
104	return result;
105	}
106
107
108	__device__ EIGEN_STRONG_INLINE void
109	ptranspose(PacketBlock<half2,`2`>& kernel) {
110	__half a1 = __low2half(kernel.packet[`0`]);
111	__half a2 = __high2half(kernel.packet[`0`]);
112	__half b1 = __low2half(kernel.packet[`1`]);
113	__half b2 = __high2half(kernel.packet[`1`]);
114	kernel.packet[`0`] = __halves2half2(a1, b1);
115	kernel.packet[`1`] = __halves2half2(a2, b2);
116	}
117
118	template<> __device__ EIGEN_STRONG_INLINE half2 plset<half2>(const Eigen::half& a) {
119	#if __CUDA_ARCH__ >= 530
120	return __halves2half2(a, __hadd(a, __float2half(`1.0f`)));
121	#else
122	float f = __half2float(a) + `1.0f`;
123	return __halves2half2(a, __float2half(f));
124	#endif
125	}
126
127	template<> __device__ EIGEN_STRONG_INLINE half2 padd<half2>(const half2& a, const half2& b) {
128	#if __CUDA_ARCH__ >= 530
129	return __hadd2(a, b);
130	#else
131	float a1 = __low2float(a);
132	float a2 = __high2float(a);
133	float b1 = __low2float(b);
134	float b2 = __high2float(b);
135	float r1 = a1 + b1;
136	float r2 = a2 + b2;
137	return __floats2half2_rn(r1, r2);
138	#endif
139	}
140
141	template<> __device__ EIGEN_STRONG_INLINE half2 psub<half2>(const half2& a, const half2& b) {
142	#if __CUDA_ARCH__ >= 530
143	return __hsub2(a, b);
144	#else
145	float a1 = __low2float(a);
146	float a2 = __high2float(a);
147	float b1 = __low2float(b);
148	float b2 = __high2float(b);
149	float r1 = a1 - b1;
150	float r2 = a2 - b2;
151	return __floats2half2_rn(r1, r2);
152	#endif
153	}
154
155	template<> __device__ EIGEN_STRONG_INLINE half2 pnegate(const half2& a) {
156	#if __CUDA_ARCH__ >= 530
157	return __hneg2(a);
158	#else
159	float a1 = __low2float(a);
160	float a2 = __high2float(a);
161	return __floats2half2_rn(-a1, -a2);
162	#endif
163	}
164
165	template<> __device__ EIGEN_STRONG_INLINE half2 pconj(const half2& a) { return a; }
166
167	template<> __device__ EIGEN_STRONG_INLINE half2 pmul<half2>(const half2& a, const half2& b) {
168	#if __CUDA_ARCH__ >= 530
169	return __hmul2(a, b);
170	#else
171	float a1 = __low2float(a);
172	float a2 = __high2float(a);
173	float b1 = __low2float(b);
174	float b2 = __high2float(b);
175	float r1 = a1 * b1;
176	float r2 = a2 * b2;
177	return __floats2half2_rn(r1, r2);
178	#endif
179	}
180
181	template<> __device__ EIGEN_STRONG_INLINE half2 pmadd<half2>(const half2& a, const half2& b, const half2& c) {
182	#if __CUDA_ARCH__ >= 530
183	return __hfma2(a, b, c);
184	#else
185	float a1 = __low2float(a);
186	float a2 = __high2float(a);
187	float b1 = __low2float(b);
188	float b2 = __high2float(b);
189	float c1 = __low2float(c);
190	float c2 = __high2float(c);
191	float r1 = a1 * b1 + c1;
192	float r2 = a2 * b2 + c2;
193	return __floats2half2_rn(r1, r2);
194	#endif
195	}
196
197	template<> __device__ EIGEN_STRONG_INLINE half2 pdiv<half2>(const half2& a, const half2& b) {
198	float a1 = __low2float(a);
199	float a2 = __high2float(a);
200	float b1 = __low2float(b);
201	float b2 = __high2float(b);
202	float r1 = a1 / b1;
203	float r2 = a2 / b2;
204	return __floats2half2_rn(r1, r2);
205	}
206
207	template<> __device__ EIGEN_STRONG_INLINE half2 pmin<half2>(const half2& a, const half2& b) {
208	float a1 = __low2float(a);
209	float a2 = __high2float(a);
210	float b1 = __low2float(b);
211	float b2 = __high2float(b);
212	__half r1 = a1 < b1 ? __low2half(a) : __low2half(b);
213	__half r2 = a2 < b2 ? __high2half(a) : __high2half(b);
214	return __halves2half2(r1, r2);
215	}
216
217	template<> __device__ EIGEN_STRONG_INLINE half2 pmax<half2>(const half2& a, const half2& b) {
218	float a1 = __low2float(a);
219	float a2 = __high2float(a);
220	float b1 = __low2float(b);
221	float b2 = __high2float(b);
222	__half r1 = a1 > b1 ? __low2half(a) : __low2half(b);
223	__half r2 = a2 > b2 ? __high2half(a) : __high2half(b);
224	return __halves2half2(r1, r2);
225	}
226
227	template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux<half2>(const half2& a) {
228	#if __CUDA_ARCH__ >= 530
229	return __hadd(__low2half(a), __high2half(a));
230	#else
231	float a1 = __low2float(a);
232	float a2 = __high2float(a);
233	return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 + a2)));
234	#endif
235	}
236
237	template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_max<half2>(const half2& a) {
238	#if __CUDA_ARCH__ >= 530
239	__half first = __low2half(a);
240	__half second = __high2half(a);
241	return __hgt(first, second) ? first : second;
242	#else
243	float a1 = __low2float(a);
244	float a2 = __high2float(a);
245	return a1 > a2 ? __low2half(a) : __high2half(a);
246	#endif
247	}
248
249	template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_min<half2>(const half2& a) {
250	#if __CUDA_ARCH__ >= 530
251	__half first = __low2half(a);
252	__half second = __high2half(a);
253	return __hlt(first, second) ? first : second;
254	#else
255	float a1 = __low2float(a);
256	float a2 = __high2float(a);
257	return a1 < a2 ? __low2half(a) : __high2half(a);
258	#endif
259	}
260
261	template<> __device__ EIGEN_STRONG_INLINE Eigen::half predux_mul<half2>(const half2& a) {
262	#if __CUDA_ARCH__ >= 530
263	return __hmul(__low2half(a), __high2half(a));
264	#else
265	float a1 = __low2float(a);
266	float a2 = __high2float(a);
267	return Eigen::half(half_impl::raw_uint16_to_half(__float2half_rn(a1 * a2)));
268	#endif
269	}
270
271	template<> __device__ EIGEN_STRONG_INLINE half2 plog1p<half2>(const half2& a) {
272	float a1 = __low2float(a);
273	float a2 = __high2float(a);
274	float r1 = log1pf(a1);
275	float r2 = log1pf(a2);
276	return __floats2half2_rn(r1, r2);
277	}
278
279	#if EIGEN_CUDACC_VER >= 80000 && defined EIGEN_CUDA_ARCH && EIGEN_CUDA_ARCH >= 530
280
281	template<> __device__ EIGEN_STRONG_INLINE
282	half2 plog<half2>(const half2& a) {
283	return h2log(a);
284	}
285
286	template<> __device__ EIGEN_STRONG_INLINE
287	half2 pexp<half2>(const half2& a) {
288	return h2exp(a);
289	}
290
291	template<> __device__ EIGEN_STRONG_INLINE
292	half2 psqrt<half2>(const half2& a) {
293	return h2sqrt(a);
294	}
295
296	template<> __device__ EIGEN_STRONG_INLINE
297	half2 prsqrt<half2>(const half2& a) {
298	return h2rsqrt(a);
299	}
300
301	#else
302
303	template<> __device__ EIGEN_STRONG_INLINE half2 plog<half2>(const half2& a) {
304	float a1 = __low2float(a);
305	float a2 = __high2float(a);
306	float r1 = logf(a1);
307	float r2 = logf(a2);
308	return __floats2half2_rn(r1, r2);
309	}
310
311	template<> __device__ EIGEN_STRONG_INLINE half2 pexp<half2>(const half2& a) {
312	float a1 = __low2float(a);
313	float a2 = __high2float(a);
314	float r1 = expf(a1);
315	float r2 = expf(a2);
316	return __floats2half2_rn(r1, r2);
317	}
318
319	template<> __device__ EIGEN_STRONG_INLINE half2 psqrt<half2>(const half2& a) {
320	float a1 = __low2float(a);
321	float a2 = __high2float(a);
322	float r1 = sqrtf(a1);
323	float r2 = sqrtf(a2);
324	return __floats2half2_rn(r1, r2);
325	}
326
327	template<> __device__ EIGEN_STRONG_INLINE half2 prsqrt<half2>(const half2& a) {
328	float a1 = __low2float(a);
329	float a2 = __high2float(a);
330	float r1 = rsqrtf(a1);
331	float r2 = rsqrtf(a2);
332	return __floats2half2_rn(r1, r2);
333	}
334
335	#endif
336
337	#elif defined EIGEN_VECTORIZE_AVX512
338
339	typedef struct {
340	__m256i x;
341	} Packet16h;
342
343
344	template<> struct is_arithmetic<Packet16h> { enum { value = true }; };
345
346	template <>
347	struct packet_traits<half> : default_packet_traits {
348	typedef Packet16h type;
349	// There is no half-size packet for Packet16h.
350	typedef Packet16h half;
351	enum {
352	Vectorizable = `1`,
353	AlignedOnScalar = `1`,
354	size = `16`,
355	HasHalfPacket = `0`,
356	HasAdd = `0`,
357	HasSub = `0`,
358	HasMul = `0`,
359	HasNegate = `0`,
360	HasAbs = `0`,
361	HasAbs2 = `0`,
362	HasMin = `0`,
363	HasMax = `0`,
364	HasConj = `0`,
365	HasSetLinear = `0`,
366	HasDiv = `0`,
367	HasSqrt = `0`,
368	HasRsqrt = `0`,
369	HasExp = `0`,
370	HasLog = `0`,
371	HasBlend = `0`
372	};
373	};
374
375
376	template<> struct unpacket_traits<Packet16h> { typedef Eigen::half type; enum {size=`16`, alignment=Aligned32}; typedef Packet16h half; };
377
378	template<> EIGEN_STRONG_INLINE Packet16h pset1<Packet16h>(const Eigen::half& from) {
379	Packet16h result;
380	result.x = _mm256_set1_epi16(from.x);
381	return result;
382	}
383
384	template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet16h>(const Packet16h& from) {
385	return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm256_extract_epi16(from.x, `0`)));
386	}
387
388	template<> EIGEN_STRONG_INLINE Packet16h pload<Packet16h>(const Eigen::half* from) {
389	Packet16h result;
390	result.x = _mm256_load_si256(reinterpret_cast<const __m256i*>(from));
391	return result;
392	}
393
394	template<> EIGEN_STRONG_INLINE Packet16h ploadu<Packet16h>(const Eigen::half* from) {
395	Packet16h result;
396	result.x = _mm256_loadu_si256(reinterpret_cast<const __m256i*>(from));
397	return result;
398	}
399
400	template<> EIGEN_STRONG_INLINE void pstore<half>(Eigen::half* to, const Packet16h& from) {
401	_mm256_store_si256((__m256i*)to, from.x);
402	}
403
404	template<> EIGEN_STRONG_INLINE void pstoreu<half>(Eigen::half* to, const Packet16h& from) {
405	_mm256_storeu_si256((__m256i*)to, from.x);
406	}
407
408	template<> EIGEN_STRONG_INLINE Packet16h
409	ploadquad(const Eigen::half* from) {
410	Packet16h result;
411	unsigned short a = from[`0`].x;
412	unsigned short b = from[`1`].x;
413	unsigned short c = from[`2`].x;
414	unsigned short d = from[`3`].x;
415	result.x = _mm256_set_epi16(d, d, d, d, c, c, c, c, b, b, b, b, a, a, a, a);
416	return result;
417	}
418
419	EIGEN_STRONG_INLINE Packet16f half2float(const Packet16h& a) {
420	#ifdef EIGEN_HAS_FP16_C
421	return _mm512_cvtph_ps(a.x);
422	#else
423	EIGEN_ALIGN64 half aux[`16`];
424	pstore(aux, a);
425	float f0(aux[`0`]);
426	float f1(aux[`1`]);
427	float f2(aux[`2`]);
428	float f3(aux[`3`]);
429	float f4(aux[`4`]);
430	float f5(aux[`5`]);
431	float f6(aux[`6`]);
432	float f7(aux[`7`]);
433	float f8(aux[`8`]);
434	float f9(aux[`9`]);
435	float fa(aux[`10`]);
436	float fb(aux[`11`]);
437	float fc(aux[`12`]);
438	float fd(aux[`13`]);
439	float fe(aux[`14`]);
440	float ff(aux[`15`]);
441
442	return _mm512_set_ps(
443	ff, fe, fd, fc, fb, fa, f9, f8, f7, f6, f5, f4, f3, f2, f1, f0);
444	#endif
445	}
446
447	EIGEN_STRONG_INLINE Packet16h float2half(const Packet16f& a) {
448	#ifdef EIGEN_HAS_FP16_C
449	Packet16h result;
450	result.x = _mm512_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT\|_MM_FROUND_NO_EXC);
451	return result;
452	#else
453	EIGEN_ALIGN64 float aux[`16`];
454	pstore(aux, a);
455	half h0(aux[`0`]);
456	half h1(aux[`1`]);
457	half h2(aux[`2`]);
458	half h3(aux[`3`]);
459	half h4(aux[`4`]);
460	half h5(aux[`5`]);
461	half h6(aux[`6`]);
462	half h7(aux[`7`]);
463	half h8(aux[`8`]);
464	half h9(aux[`9`]);
465	half ha(aux[`10`]);
466	half hb(aux[`11`]);
467	half hc(aux[`12`]);
468	half hd(aux[`13`]);
469	half he(aux[`14`]);
470	half hf(aux[`15`]);
471
472	Packet16h result;
473	result.x = _mm256_set_epi16(
474	hf.x, he.x, hd.x, hc.x, hb.x, ha.x, h9.x, h8.x,
475	h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
476	return result;
477	#endif
478	}
479
480	template<> EIGEN_STRONG_INLINE Packet16h padd<Packet16h>(const Packet16h& a, const Packet16h& b) {
481	Packet16f af = half2float(a);
482	Packet16f bf = half2float(b);
483	Packet16f rf = padd(af, bf);
484	return float2half(rf);
485	}
486
487	template<> EIGEN_STRONG_INLINE Packet16h pmul<Packet16h>(const Packet16h& a, const Packet16h& b) {
488	Packet16f af = half2float(a);
489	Packet16f bf = half2float(b);
490	Packet16f rf = pmul(af, bf);
491	return float2half(rf);
492	}
493
494	template<> EIGEN_STRONG_INLINE half predux<Packet16h>(const Packet16h& from) {
495	Packet16f from_float = half2float(from);
496	return half(predux(from_float));
497	}
498
499	template<> EIGEN_STRONG_INLINE Packet16h pgather<Eigen::half, Packet16h>(const Eigen::half* from, Index stride)
500	{
501	Packet16h result;
502	result.x = _mm256_set_epi16(
503	from[`15`stride].x, from[`14`stride].x, from[`13`stride].x, from[`12`stride].x,
504	from[`11`stride].x, from[`10`stride].x, from[`9`stride].x, from[`8`stride].x,
505	from[`7`stride].x, from[`6`stride].x, from[`5`stride].x, from[`4`stride].x,
506	from[`3`stride].x, from[`2`stride].x, from[`1`stride].x, from[`0`stride].x);
507	return result;
508	}
509
510	template<> EIGEN_STRONG_INLINE void pscatter<half, Packet16h>(half* to, const Packet16h& from, Index stride)
511	{
512	EIGEN_ALIGN64 half aux[`16`];
513	pstore(aux, from);
514	to[stride*`0`].x = aux[`0`].x;
515	to[stride*`1`].x = aux[`1`].x;
516	to[stride*`2`].x = aux[`2`].x;
517	to[stride*`3`].x = aux[`3`].x;
518	to[stride*`4`].x = aux[`4`].x;
519	to[stride*`5`].x = aux[`5`].x;
520	to[stride*`6`].x = aux[`6`].x;
521	to[stride*`7`].x = aux[`7`].x;
522	to[stride*`8`].x = aux[`8`].x;
523	to[stride*`9`].x = aux[`9`].x;
524	to[stride*`10`].x = aux[`10`].x;
525	to[stride*`11`].x = aux[`11`].x;
526	to[stride*`12`].x = aux[`12`].x;
527	to[stride*`13`].x = aux[`13`].x;
528	to[stride*`14`].x = aux[`14`].x;
529	to[stride*`15`].x = aux[`15`].x;
530	}
531
532	EIGEN_STRONG_INLINE void
533	ptranspose(PacketBlock<Packet16h,`16`>& kernel) {
534	__m256i a = kernel.packet[`0`].x;
535	__m256i b = kernel.packet[`1`].x;
536	__m256i c = kernel.packet[`2`].x;
537	__m256i d = kernel.packet[`3`].x;
538	__m256i e = kernel.packet[`4`].x;
539	__m256i f = kernel.packet[`5`].x;
540	__m256i g = kernel.packet[`6`].x;
541	__m256i h = kernel.packet[`7`].x;
542	__m256i i = kernel.packet[`8`].x;
543	__m256i j = kernel.packet[`9`].x;
544	__m256i k = kernel.packet[`10`].x;
545	__m256i l = kernel.packet[`11`].x;
546	__m256i m = kernel.packet[`12`].x;
547	__m256i n = kernel.packet[`13`].x;
548	__m256i o = kernel.packet[`14`].x;
549	__m256i p = kernel.packet[`15`].x;
550
551	__m256i ab_07 = _mm256_unpacklo_epi16(a, b);
552	__m256i cd_07 = _mm256_unpacklo_epi16(c, d);
553	__m256i ef_07 = _mm256_unpacklo_epi16(e, f);
554	__m256i gh_07 = _mm256_unpacklo_epi16(g, h);
555	__m256i ij_07 = _mm256_unpacklo_epi16(i, j);
556	__m256i kl_07 = _mm256_unpacklo_epi16(k, l);
557	__m256i mn_07 = _mm256_unpacklo_epi16(m, n);
558	__m256i op_07 = _mm256_unpacklo_epi16(o, p);
559
560	__m256i ab_8f = _mm256_unpackhi_epi16(a, b);
561	__m256i cd_8f = _mm256_unpackhi_epi16(c, d);
562	__m256i ef_8f = _mm256_unpackhi_epi16(e, f);
563	__m256i gh_8f = _mm256_unpackhi_epi16(g, h);
564	__m256i ij_8f = _mm256_unpackhi_epi16(i, j);
565	__m256i kl_8f = _mm256_unpackhi_epi16(k, l);
566	__m256i mn_8f = _mm256_unpackhi_epi16(m, n);
567	__m256i op_8f = _mm256_unpackhi_epi16(o, p);
568
569	__m256i abcd_03 = _mm256_unpacklo_epi32(ab_07, cd_07);
570	__m256i abcd_47 = _mm256_unpackhi_epi32(ab_07, cd_07);
571	__m256i efgh_03 = _mm256_unpacklo_epi32(ef_07, gh_07);
572	__m256i efgh_47 = _mm256_unpackhi_epi32(ef_07, gh_07);
573	__m256i ijkl_03 = _mm256_unpacklo_epi32(ij_07, kl_07);
574	__m256i ijkl_47 = _mm256_unpackhi_epi32(ij_07, kl_07);
575	__m256i mnop_03 = _mm256_unpacklo_epi32(mn_07, op_07);
576	__m256i mnop_47 = _mm256_unpackhi_epi32(mn_07, op_07);
577
578	__m256i abcd_8b = _mm256_unpacklo_epi32(ab_8f, cd_8f);
579	__m256i abcd_cf = _mm256_unpackhi_epi32(ab_8f, cd_8f);
580	__m256i efgh_8b = _mm256_unpacklo_epi32(ef_8f, gh_8f);
581	__m256i efgh_cf = _mm256_unpackhi_epi32(ef_8f, gh_8f);
582	__m256i ijkl_8b = _mm256_unpacklo_epi32(ij_8f, kl_8f);
583	__m256i ijkl_cf = _mm256_unpackhi_epi32(ij_8f, kl_8f);
584	__m256i mnop_8b = _mm256_unpacklo_epi32(mn_8f, op_8f);
585	__m256i mnop_cf = _mm256_unpackhi_epi32(mn_8f, op_8f);
586
587	__m256i abcdefgh_01 = _mm256_unpacklo_epi64(abcd_03, efgh_03);
588	__m256i abcdefgh_23 = _mm256_unpackhi_epi64(abcd_03, efgh_03);
589	__m256i ijklmnop_01 = _mm256_unpacklo_epi64(ijkl_03, mnop_03);
590	__m256i ijklmnop_23 = _mm256_unpackhi_epi64(ijkl_03, mnop_03);
591	__m256i abcdefgh_45 = _mm256_unpacklo_epi64(abcd_47, efgh_47);
592	__m256i abcdefgh_67 = _mm256_unpackhi_epi64(abcd_47, efgh_47);
593	__m256i ijklmnop_45 = _mm256_unpacklo_epi64(ijkl_47, mnop_47);
594	__m256i ijklmnop_67 = _mm256_unpackhi_epi64(ijkl_47, mnop_47);
595	__m256i abcdefgh_89 = _mm256_unpacklo_epi64(abcd_8b, efgh_8b);
596	__m256i abcdefgh_ab = _mm256_unpackhi_epi64(abcd_8b, efgh_8b);
597	__m256i ijklmnop_89 = _mm256_unpacklo_epi64(ijkl_8b, mnop_8b);
598	__m256i ijklmnop_ab = _mm256_unpackhi_epi64(ijkl_8b, mnop_8b);
599	__m256i abcdefgh_cd = _mm256_unpacklo_epi64(abcd_cf, efgh_cf);
600	__m256i abcdefgh_ef = _mm256_unpackhi_epi64(abcd_cf, efgh_cf);
601	__m256i ijklmnop_cd = _mm256_unpacklo_epi64(ijkl_cf, mnop_cf);
602	__m256i ijklmnop_ef = _mm256_unpackhi_epi64(ijkl_cf, mnop_cf);
603
604	// NOTE: no unpacklo/hi instr in this case, so using permute instr.
605	__m256i a_p_0 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, `0x20`);
606	__m256i a_p_1 = _mm256_permute2x128_si256(abcdefgh_01, ijklmnop_01, `0x31`);
607	__m256i a_p_2 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, `0x20`);
608	__m256i a_p_3 = _mm256_permute2x128_si256(abcdefgh_23, ijklmnop_23, `0x31`);
609	__m256i a_p_4 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, `0x20`);
610	__m256i a_p_5 = _mm256_permute2x128_si256(abcdefgh_45, ijklmnop_45, `0x31`);
611	__m256i a_p_6 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, `0x20`);
612	__m256i a_p_7 = _mm256_permute2x128_si256(abcdefgh_67, ijklmnop_67, `0x31`);
613	__m256i a_p_8 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, `0x20`);
614	__m256i a_p_9 = _mm256_permute2x128_si256(abcdefgh_89, ijklmnop_89, `0x31`);
615	__m256i a_p_a = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, `0x20`);
616	__m256i a_p_b = _mm256_permute2x128_si256(abcdefgh_ab, ijklmnop_ab, `0x31`);
617	__m256i a_p_c = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, `0x20`);
618	__m256i a_p_d = _mm256_permute2x128_si256(abcdefgh_cd, ijklmnop_cd, `0x31`);
619	__m256i a_p_e = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, `0x20`);
620	__m256i a_p_f = _mm256_permute2x128_si256(abcdefgh_ef, ijklmnop_ef, `0x31`);
621
622	kernel.packet[`0`].x = a_p_0;
623	kernel.packet[`1`].x = a_p_1;
624	kernel.packet[`2`].x = a_p_2;
625	kernel.packet[`3`].x = a_p_3;
626	kernel.packet[`4`].x = a_p_4;
627	kernel.packet[`5`].x = a_p_5;
628	kernel.packet[`6`].x = a_p_6;
629	kernel.packet[`7`].x = a_p_7;
630	kernel.packet[`8`].x = a_p_8;
631	kernel.packet[`9`].x = a_p_9;
632	kernel.packet[`10`].x = a_p_a;
633	kernel.packet[`11`].x = a_p_b;
634	kernel.packet[`12`].x = a_p_c;
635	kernel.packet[`13`].x = a_p_d;
636	kernel.packet[`14`].x = a_p_e;
637	kernel.packet[`15`].x = a_p_f;
638	}
639
640	EIGEN_STRONG_INLINE void
641	ptranspose(PacketBlock<Packet16h,`8`>& kernel) {
642	EIGEN_ALIGN64 half in[`8`][`16`];
643	pstore<half>(in[`0`], kernel.packet[`0`]);
644	pstore<half>(in[`1`], kernel.packet[`1`]);
645	pstore<half>(in[`2`], kernel.packet[`2`]);
646	pstore<half>(in[`3`], kernel.packet[`3`]);
647	pstore<half>(in[`4`], kernel.packet[`4`]);
648	pstore<half>(in[`5`], kernel.packet[`5`]);
649	pstore<half>(in[`6`], kernel.packet[`6`]);
650	pstore<half>(in[`7`], kernel.packet[`7`]);
651
652	EIGEN_ALIGN64 half out[`8`][`16`];
653
654	for (int i = `0`; i < `8`; ++i) {
655	for (int j = `0`; j < `8`; ++j) {
656	out[i][j] = in[j][`2`*i];
657	}
658	for (int j = `0`; j < `8`; ++j) {
659	out[i][j+`8`] = in[j][`2`*i+`1`];
660	}
661	}
662
663	kernel.packet[`0`] = pload<Packet16h>(out[`0`]);
664	kernel.packet[`1`] = pload<Packet16h>(out[`1`]);
665	kernel.packet[`2`] = pload<Packet16h>(out[`2`]);
666	kernel.packet[`3`] = pload<Packet16h>(out[`3`]);
667	kernel.packet[`4`] = pload<Packet16h>(out[`4`]);
668	kernel.packet[`5`] = pload<Packet16h>(out[`5`]);
669	kernel.packet[`6`] = pload<Packet16h>(out[`6`]);
670	kernel.packet[`7`] = pload<Packet16h>(out[`7`]);
671	}
672
673	EIGEN_STRONG_INLINE void
674	ptranspose(PacketBlock<Packet16h,`4`>& kernel) {
675	EIGEN_ALIGN64 half in[`4`][`16`];
676	pstore<half>(in[`0`], kernel.packet[`0`]);
677	pstore<half>(in[`1`], kernel.packet[`1`]);
678	pstore<half>(in[`2`], kernel.packet[`2`]);
679	pstore<half>(in[`3`], kernel.packet[`3`]);
680
681	EIGEN_ALIGN64 half out[`4`][`16`];
682
683	for (int i = `0`; i < `4`; ++i) {
684	for (int j = `0`; j < `4`; ++j) {
685	out[i][j] = in[j][`4`*i];
686	}
687	for (int j = `0`; j < `4`; ++j) {
688	out[i][j+`4`] = in[j][`4`*i+`1`];
689	}
690	for (int j = `0`; j < `4`; ++j) {
691	out[i][j+`8`] = in[j][`4`*i+`2`];
692	}
693	for (int j = `0`; j < `4`; ++j) {
694	out[i][j+`12`] = in[j][`4`*i+`3`];
695	}
696	}
697
698	kernel.packet[`0`] = pload<Packet16h>(out[`0`]);
699	kernel.packet[`1`] = pload<Packet16h>(out[`1`]);
700	kernel.packet[`2`] = pload<Packet16h>(out[`2`]);
701	kernel.packet[`3`] = pload<Packet16h>(out[`3`]);
702	}
703
704
705	#elif defined EIGEN_VECTORIZE_AVX
706
707	typedef struct {
708	__m128i x;
709	} Packet8h;
710
711
712	template<> struct is_arithmetic<Packet8h> { enum { value = true }; };
713
714	template <>
715	struct packet_traits<Eigen::half> : default_packet_traits {
716	typedef Packet8h type;
717	// There is no half-size packet for Packet8h.
718	typedef Packet8h half;
719	enum {
720	Vectorizable = `1`,
721	AlignedOnScalar = `1`,
722	size = `8`,
723	HasHalfPacket = `0`,
724	HasAdd = `0`,
725	HasSub = `0`,
726	HasMul = `0`,
727	HasNegate = `0`,
728	HasAbs = `0`,
729	HasAbs2 = `0`,
730	HasMin = `0`,
731	HasMax = `0`,
732	HasConj = `0`,
733	HasSetLinear = `0`,
734	HasDiv = `0`,
735	HasSqrt = `0`,
736	HasRsqrt = `0`,
737	HasExp = `0`,
738	HasLog = `0`,
739	HasBlend = `0`
740	};
741	};
742
743
744	template<> struct unpacket_traits<Packet8h> { typedef Eigen::half type; enum {size=`8`, alignment=Aligned16}; typedef Packet8h half; };
745
746	template<> EIGEN_STRONG_INLINE Packet8h pset1<Packet8h>(const Eigen::half& from) {
747	Packet8h result;
748	result.x = _mm_set1_epi16(from.x);
749	return result;
750	}
751
752	template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet8h>(const Packet8h& from) {
753	return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_extract_epi16(from.x, `0`)));
754	}
755
756	template<> EIGEN_STRONG_INLINE Packet8h pload<Packet8h>(const Eigen::half* from) {
757	Packet8h result;
758	result.x = _mm_load_si128(reinterpret_cast<const __m128i*>(from));
759	return result;
760	}
761
762	template<> EIGEN_STRONG_INLINE Packet8h ploadu<Packet8h>(const Eigen::half* from) {
763	Packet8h result;
764	result.x = _mm_loadu_si128(reinterpret_cast<const __m128i*>(from));
765	return result;
766	}
767
768	template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet8h& from) {
769	_mm_store_si128(reinterpret_cast<__m128i*>(to), from.x);
770	}
771
772	template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet8h& from) {
773	_mm_storeu_si128(reinterpret_cast<__m128i*>(to), from.x);
774	}
775
776	template<> EIGEN_STRONG_INLINE Packet8h
777	ploadquad<Packet8h>(const Eigen::half* from) {
778	Packet8h result;
779	unsigned short a = from[`0`].x;
780	unsigned short b = from[`1`].x;
781	result.x = _mm_set_epi16(b, b, b, b, a, a, a, a);
782	return result;
783	}
784
785	EIGEN_STRONG_INLINE Packet8f half2float(const Packet8h& a) {
786	#ifdef EIGEN_HAS_FP16_C
787	return _mm256_cvtph_ps(a.x);
788	#else
789	EIGEN_ALIGN32 Eigen::half aux[`8`];
790	pstore(aux, a);
791	float f0(aux[`0`]);
792	float f1(aux[`1`]);
793	float f2(aux[`2`]);
794	float f3(aux[`3`]);
795	float f4(aux[`4`]);
796	float f5(aux[`5`]);
797	float f6(aux[`6`]);
798	float f7(aux[`7`]);
799
800	return _mm256_set_ps(f7, f6, f5, f4, f3, f2, f1, f0);
801	#endif
802	}
803
804	EIGEN_STRONG_INLINE Packet8h float2half(const Packet8f& a) {
805	#ifdef EIGEN_HAS_FP16_C
806	Packet8h result;
807	result.x = _mm256_cvtps_ph(a, _MM_FROUND_TO_NEAREST_INT\|_MM_FROUND_NO_EXC);
808	return result;
809	#else
810	EIGEN_ALIGN32 float aux[`8`];
811	pstore(aux, a);
812	Eigen::half h0(aux[`0`]);
813	Eigen::half h1(aux[`1`]);
814	Eigen::half h2(aux[`2`]);
815	Eigen::half h3(aux[`3`]);
816	Eigen::half h4(aux[`4`]);
817	Eigen::half h5(aux[`5`]);
818	Eigen::half h6(aux[`6`]);
819	Eigen::half h7(aux[`7`]);
820
821	Packet8h result;
822	result.x = _mm_set_epi16(h7.x, h6.x, h5.x, h4.x, h3.x, h2.x, h1.x, h0.x);
823	return result;
824	#endif
825	}
826
827	template<> EIGEN_STRONG_INLINE Packet8h pconj(const Packet8h& a) { return a; }
828
829	template<> EIGEN_STRONG_INLINE Packet8h padd<Packet8h>(const Packet8h& a, const Packet8h& b) {
830	Packet8f af = half2float(a);
831	Packet8f bf = half2float(b);
832	Packet8f rf = padd(af, bf);
833	return float2half(rf);
834	}
835
836	template<> EIGEN_STRONG_INLINE Packet8h pmul<Packet8h>(const Packet8h& a, const Packet8h& b) {
837	Packet8f af = half2float(a);
838	Packet8f bf = half2float(b);
839	Packet8f rf = pmul(af, bf);
840	return float2half(rf);
841	}
842
843	template<> EIGEN_STRONG_INLINE Packet8h pgather<Eigen::half, Packet8h>(const Eigen::half* from, Index stride)
844	{
845	Packet8h result;
846	result.x = _mm_set_epi16(from[`7`stride].x, from[`6`stride].x, from[`5`stride].x, from[`4`stride].x, from[`3`stride].x, from[`2`stride].x, from[`1`stride].x, from[`0`stride].x);
847	return result;
848	}
849
850	template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet8h>(Eigen::half* to, const Packet8h& from, Index stride)
851	{
852	EIGEN_ALIGN32 Eigen::half aux[`8`];
853	pstore(aux, from);
854	to[stride*`0`].x = aux[`0`].x;
855	to[stride*`1`].x = aux[`1`].x;
856	to[stride*`2`].x = aux[`2`].x;
857	to[stride*`3`].x = aux[`3`].x;
858	to[stride*`4`].x = aux[`4`].x;
859	to[stride*`5`].x = aux[`5`].x;
860	to[stride*`6`].x = aux[`6`].x;
861	to[stride*`7`].x = aux[`7`].x;
862	}
863
864	template<> EIGEN_STRONG_INLINE Eigen::half predux<Packet8h>(const Packet8h& a) {
865	Packet8f af = half2float(a);
866	float reduced = predux<Packet8f>(af);
867	return Eigen::half(reduced);
868	}
869
870	template<> EIGEN_STRONG_INLINE Eigen::half predux_max<Packet8h>(const Packet8h& a) {
871	Packet8f af = half2float(a);
872	float reduced = predux_max<Packet8f>(af);
873	return Eigen::half(reduced);
874	}
875
876	template<> EIGEN_STRONG_INLINE Eigen::half predux_min<Packet8h>(const Packet8h& a) {
877	Packet8f af = half2float(a);
878	float reduced = predux_min<Packet8f>(af);
879	return Eigen::half(reduced);
880	}
881
882	template<> EIGEN_STRONG_INLINE Eigen::half predux_mul<Packet8h>(const Packet8h& a) {
883	Packet8f af = half2float(a);
884	float reduced = predux_mul<Packet8f>(af);
885	return Eigen::half(reduced);
886	}
887
888	EIGEN_STRONG_INLINE void
889	ptranspose(PacketBlock<Packet8h,`8`>& kernel) {
890	__m128i a = kernel.packet[`0`].x;
891	__m128i b = kernel.packet[`1`].x;
892	__m128i c = kernel.packet[`2`].x;
893	__m128i d = kernel.packet[`3`].x;
894	__m128i e = kernel.packet[`4`].x;
895	__m128i f = kernel.packet[`5`].x;
896	__m128i g = kernel.packet[`6`].x;
897	__m128i h = kernel.packet[`7`].x;
898
899	__m128i a03b03 = _mm_unpacklo_epi16(a, b);
900	__m128i c03d03 = _mm_unpacklo_epi16(c, d);
901	__m128i e03f03 = _mm_unpacklo_epi16(e, f);
902	__m128i g03h03 = _mm_unpacklo_epi16(g, h);
903	__m128i a47b47 = _mm_unpackhi_epi16(a, b);
904	__m128i c47d47 = _mm_unpackhi_epi16(c, d);
905	__m128i e47f47 = _mm_unpackhi_epi16(e, f);
906	__m128i g47h47 = _mm_unpackhi_epi16(g, h);
907
908	__m128i a01b01c01d01 = _mm_unpacklo_epi32(a03b03, c03d03);
909	__m128i a23b23c23d23 = _mm_unpackhi_epi32(a03b03, c03d03);
910	__m128i e01f01g01h01 = _mm_unpacklo_epi32(e03f03, g03h03);
911	__m128i e23f23g23h23 = _mm_unpackhi_epi32(e03f03, g03h03);
912	__m128i a45b45c45d45 = _mm_unpacklo_epi32(a47b47, c47d47);
913	__m128i a67b67c67d67 = _mm_unpackhi_epi32(a47b47, c47d47);
914	__m128i e45f45g45h45 = _mm_unpacklo_epi32(e47f47, g47h47);
915	__m128i e67f67g67h67 = _mm_unpackhi_epi32(e47f47, g47h47);
916
917	__m128i a0b0c0d0e0f0g0h0 = _mm_unpacklo_epi64(a01b01c01d01, e01f01g01h01);
918	__m128i a1b1c1d1e1f1g1h1 = _mm_unpackhi_epi64(a01b01c01d01, e01f01g01h01);
919	__m128i a2b2c2d2e2f2g2h2 = _mm_unpacklo_epi64(a23b23c23d23, e23f23g23h23);
920	__m128i a3b3c3d3e3f3g3h3 = _mm_unpackhi_epi64(a23b23c23d23, e23f23g23h23);
921	__m128i a4b4c4d4e4f4g4h4 = _mm_unpacklo_epi64(a45b45c45d45, e45f45g45h45);
922	__m128i a5b5c5d5e5f5g5h5 = _mm_unpackhi_epi64(a45b45c45d45, e45f45g45h45);
923	__m128i a6b6c6d6e6f6g6h6 = _mm_unpacklo_epi64(a67b67c67d67, e67f67g67h67);
924	__m128i a7b7c7d7e7f7g7h7 = _mm_unpackhi_epi64(a67b67c67d67, e67f67g67h67);
925
926	kernel.packet[`0`].x = a0b0c0d0e0f0g0h0;
927	kernel.packet[`1`].x = a1b1c1d1e1f1g1h1;
928	kernel.packet[`2`].x = a2b2c2d2e2f2g2h2;
929	kernel.packet[`3`].x = a3b3c3d3e3f3g3h3;
930	kernel.packet[`4`].x = a4b4c4d4e4f4g4h4;
931	kernel.packet[`5`].x = a5b5c5d5e5f5g5h5;
932	kernel.packet[`6`].x = a6b6c6d6e6f6g6h6;
933	kernel.packet[`7`].x = a7b7c7d7e7f7g7h7;
934	}
935
936	EIGEN_STRONG_INLINE void
937	ptranspose(PacketBlock<Packet8h,`4`>& kernel) {
938	EIGEN_ALIGN32 Eigen::half in[`4`][`8`];
939	pstore<Eigen::half>(in[`0`], kernel.packet[`0`]);
940	pstore<Eigen::half>(in[`1`], kernel.packet[`1`]);
941	pstore<Eigen::half>(in[`2`], kernel.packet[`2`]);
942	pstore<Eigen::half>(in[`3`], kernel.packet[`3`]);
943
944	EIGEN_ALIGN32 Eigen::half out[`4`][`8`];
945
946	for (int i = `0`; i < `4`; ++i) {
947	for (int j = `0`; j < `4`; ++j) {
948	out[i][j] = in[j][`2`*i];
949	}
950	for (int j = `0`; j < `4`; ++j) {
951	out[i][j+`4`] = in[j][`2`*i+`1`];
952	}
953	}
954
955	kernel.packet[`0`] = pload<Packet8h>(out[`0`]);
956	kernel.packet[`1`] = pload<Packet8h>(out[`1`]);
957	kernel.packet[`2`] = pload<Packet8h>(out[`2`]);
958	kernel.packet[`3`] = pload<Packet8h>(out[`3`]);
959	}
960
961
962	// Disable the following code since it's broken on too many platforms / compilers.
963	//#elif defined(EIGEN_VECTORIZE_SSE) && (!EIGEN_ARCH_x86_64) && (!EIGEN_COMP_MSVC)
964	#elif 0
965
966	typedef struct {
967	__m64 x;
968	} Packet4h;
969
970
971	template<> struct is_arithmetic<Packet4h> { enum { value = true }; };
972
973	template <>
974	struct packet_traits<Eigen::half> : default_packet_traits {
975	typedef Packet4h type;
976	// There is no half-size packet for Packet4h.
977	typedef Packet4h half;
978	enum {
979	Vectorizable = `1`,
980	AlignedOnScalar = `1`,
981	size = `4`,
982	HasHalfPacket = `0`,
983	HasAdd = `0`,
984	HasSub = `0`,
985	HasMul = `0`,
986	HasNegate = `0`,
987	HasAbs = `0`,
988	HasAbs2 = `0`,
989	HasMin = `0`,
990	HasMax = `0`,
991	HasConj = `0`,
992	HasSetLinear = `0`,
993	HasDiv = `0`,
994	HasSqrt = `0`,
995	HasRsqrt = `0`,
996	HasExp = `0`,
997	HasLog = `0`,
998	HasBlend = `0`
999	};
1000	};
1001
1002
1003	template<> struct unpacket_traits<Packet4h> { typedef Eigen::half type; enum {size=`4`, alignment=Aligned16}; typedef Packet4h half; };
1004
1005	template<> EIGEN_STRONG_INLINE Packet4h pset1<Packet4h>(const Eigen::half& from) {
1006	Packet4h result;
1007	result.x = _mm_set1_pi16(from.x);
1008	return result;
1009	}
1010
1011	template<> EIGEN_STRONG_INLINE Eigen::half pfirst<Packet4h>(const Packet4h& from) {
1012	return half_impl::raw_uint16_to_half(static_cast<unsigned short>(_mm_cvtsi64_si32(from.x)));
1013	}
1014
1015	template<> EIGEN_STRONG_INLINE Packet4h pconj(const Packet4h& a) { return a; }
1016
1017	template<> EIGEN_STRONG_INLINE Packet4h padd<Packet4h>(const Packet4h& a, const Packet4h& b) {
1018	__int64_t a64 = _mm_cvtm64_si64(a.x);
1019	__int64_t b64 = _mm_cvtm64_si64(b.x);
1020
1021	Eigen::half h[`4`];
1022
1023	Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1024	Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1025	h[`0`] = ha + hb;
1026	ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> `16`));
1027	hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> `16`));
1028	h[`1`] = ha + hb;
1029	ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> `32`));
1030	hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> `32`));
1031	h[`2`] = ha + hb;
1032	ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> `48`));
1033	hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> `48`));
1034	h[`3`] = ha + hb;
1035	Packet4h result;
1036	result.x = _mm_set_pi16(h[`3`].x, h[`2`].x, h[`1`].x, h[`0`].x);
1037	return result;
1038	}
1039
1040	template<> EIGEN_STRONG_INLINE Packet4h pmul<Packet4h>(const Packet4h& a, const Packet4h& b) {
1041	__int64_t a64 = _mm_cvtm64_si64(a.x);
1042	__int64_t b64 = _mm_cvtm64_si64(b.x);
1043
1044	Eigen::half h[`4`];
1045
1046	Eigen::half ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64));
1047	Eigen::half hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64));
1048	h[`0`] = ha * hb;
1049	ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> `16`));
1050	hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> `16`));
1051	h[`1`] = ha * hb;
1052	ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> `32`));
1053	hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> `32`));
1054	h[`2`] = ha * hb;
1055	ha = half_impl::raw_uint16_to_half(static_cast<unsigned short>(a64 >> `48`));
1056	hb = half_impl::raw_uint16_to_half(static_cast<unsigned short>(b64 >> `48`));
1057	h[`3`] = ha * hb;
1058	Packet4h result;
1059	result.x = _mm_set_pi16(h[`3`].x, h[`2`].x, h[`1`].x, h[`0`].x);
1060	return result;
1061	}
1062
1063	template<> EIGEN_STRONG_INLINE Packet4h pload<Packet4h>(const Eigen::half* from) {
1064	Packet4h result;
1065	result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
1066	return result;
1067	}
1068
1069	template<> EIGEN_STRONG_INLINE Packet4h ploadu<Packet4h>(const Eigen::half* from) {
1070	Packet4h result;
1071	result.x = _mm_cvtsi64_m64(*reinterpret_cast<const __int64_t*>(from));
1072	return result;
1073	}
1074
1075	template<> EIGEN_STRONG_INLINE void pstore<Eigen::half>(Eigen::half* to, const Packet4h& from) {
1076	__int64_t r = _mm_cvtm64_si64(from.x);
1077	(reinterpret_cast<__int64_t>(to)) = r;
1078	}
1079
1080	template<> EIGEN_STRONG_INLINE void pstoreu<Eigen::half>(Eigen::half* to, const Packet4h& from) {
1081	__int64_t r = _mm_cvtm64_si64(from.x);
1082	(reinterpret_cast<__int64_t>(to)) = r;
1083	}
1084
1085	template<> EIGEN_STRONG_INLINE Packet4h
1086	ploadquad<Packet4h>(const Eigen::half* from) {
1087	return pset1<Packet4h>(*from);
1088	}
1089
1090	template<> EIGEN_STRONG_INLINE Packet4h pgather<Eigen::half, Packet4h>(const Eigen::half* from, Index stride)
1091	{
1092	Packet4h result;
1093	result.x = _mm_set_pi16(from[`3`stride].x, from[`2`stride].x, from[`1`stride].x, from[`0`stride].x);
1094	return result;
1095	}
1096
1097	template<> EIGEN_STRONG_INLINE void pscatter<Eigen::half, Packet4h>(Eigen::half* to, const Packet4h& from, Index stride)
1098	{
1099	__int64_t a = _mm_cvtm64_si64(from.x);
1100	to[stride`0`].x = static_cast<unsigned* short>(a);
1101	to[stride`1`].x = static_cast<unsigned* short>(a >> `16`);
1102	to[stride`2`].x = static_cast<unsigned* short>(a >> `32`);
1103	to[stride`3`].x = static_cast<unsigned* short>(a >> `48`);
1104	}
1105
1106	EIGEN_STRONG_INLINE void
1107	ptranspose(PacketBlock<Packet4h,`4`>& kernel) {
1108	__m64 T0 = _mm_unpacklo_pi16(kernel.packet[`0`].x, kernel.packet[`1`].x);
1109	__m64 T1 = _mm_unpacklo_pi16(kernel.packet[`2`].x, kernel.packet[`3`].x);
1110	__m64 T2 = _mm_unpackhi_pi16(kernel.packet[`0`].x, kernel.packet[`1`].x);
1111	__m64 T3 = _mm_unpackhi_pi16(kernel.packet[`2`].x, kernel.packet[`3`].x);
1112
1113	kernel.packet[`0`].x = _mm_unpacklo_pi32(T0, T1);
1114	kernel.packet[`1`].x = _mm_unpackhi_pi32(T0, T1);
1115	kernel.packet[`2`].x = _mm_unpacklo_pi32(T2, T3);
1116	kernel.packet[`3`].x = _mm_unpackhi_pi32(T2, T3);
1117	}
1118
1119	#endif
1120
1121	}
1122	}
1123
1124	#endif // EIGEN_PACKET_MATH_HALF_CUDA_H
1125

Browse the source code of NanoGUI/ext/eigen/Eigen/src/Core/arch/CUDA/PacketMathHalf.h