ConvectionKernels_ParallelMath.h source code [Godot/thirdparty/cvtt/ConvectionKernels_ParallelMath.h]

1	/*
2	Convection Texture Tools
3	Copyright (c) 2018-2019 Eric Lasota
4
5	Permission is hereby granted, free of charge, to any person obtaining
6	a copy of this software and associated documentation files (the
7	"Software"), to deal in the Software without restriction, including
8	without limitation the rights to use, copy, modify, merge, publish,
9	distribute, sublicense, and/or sell copies of the Software, and to
10	permit persons to whom the Software is furnished to do so, subject
11	to the following conditions:
12
13	The above copyright notice and this permission notice shall be included
14	in all copies or substantial portions of the Software.
15
16	THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17	OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18	MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19	IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20	CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21	TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22	SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24	*/
25	#pragma once
26	#ifndef __CVTT_PARALLELMATH_H__
27	#define __CVTT_PARALLELMATH_H__
28
29	#include "ConvectionKernels.h"
30	#include "ConvectionKernels_Config.h"
31
32	#ifdef CVTT_USE_SSE2
33	#include <emmintrin.h>
34	#endif
35
36	#include <float.h>
37	#include <assert.h>
38	#include <string.h>
39	#include <algorithm>
40	#include <math.h>
41
// Evaluates its argument and discards the result, marking a parameter as
// intentionally unused. The argument is parenthesized so that compound
// expressions are cast as a whole.
#define UNREFERENCED_PARAMETER(n) ((void)(n))
43
44	// Parallel math implementation
45	//
46	// After preprocessor defs are handled, what this should do is expose the following types:
47	// SInt16 - Signed 16-bit integer
48	// UInt16 - Unsigned 16-bit integer
49	// UInt15 - Unsigned 15-bit integer
50	// SInt32 - Signed 32-bit integer
51	// UInt31 - Unsigned 31-bit integer
52	// AInt16 - 16-bit integer of unknown signedness (only used for storage)
53	// Int16CompFlag - Comparison flags from comparing 16-bit integers
54	// Int32CompFlag - Comparison flags from comparing 32-bit integers
55	// FloatCompFlag - Comparison flags from comparing 32-bit floats
56	//
57	// The reason for these distinctions is that depending on the instruction set, signed or unsigned versions of certain ops
58	// (particularly max, min, compares, and right shift) may not be available. In cases where ops are not available, it's
59	// necessary to do high bit manipulations to accomplish the operation with 16-bit numbers. The 15-bit and 31-bit uint types
60	// can elide the bit flips if unsigned versions are not available.
61
62	namespace cvtt
63	{
64	#ifdef CVTT_USE_SSE2
65	// SSE2 version
66	struct ParallelMath
67	{
// Scalar counterparts of one lane of the 16-bit vector types.
typedef uint16_t ScalarUInt16;
typedef int16_t ScalarSInt16;
70
// Scope guard for the SSE rounding mode.
// The constructor saves the current MXCSR value and installs TRoundingMode in
// its rounding-control bits; the destructor writes the saved value back.
template<unsigned int TRoundingMode>
struct RoundForScope
{
    unsigned int m_oldCSR;  // MXCSR value captured at construction

    RoundForScope()
    {
        m_oldCSR = _mm_getcsr();
        // Clear the rounding-control field, then set the requested mode.
        _mm_setcsr((m_oldCSR & ~_MM_ROUND_MASK) | (TRoundingMode));
    }

    ~RoundForScope()
    {
        _mm_setcsr(m_oldCSR);
    }
};
87
88	struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO>
89	{
90	};
91
92	struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST>
93	{
94	};
95
96	struct RoundUpForScope : RoundForScope<_MM_ROUND_UP>
97	{
98	};
99
100	struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN>
101	{
102	};
103
// Number of lanes processed together by the vector types in this struct.
static const int ParallelSize = 8;
105
// Type-level tag used as the TSubtype argument of VInt16 / VInt32.
enum Int16Subtype
{
    IntSubtype_Signed = 0,
    IntSubtype_UnsignedFull = 1,
    IntSubtype_UnsignedTruncated = 2,
    IntSubtype_Abstract = 3,
};
113
// Eight 16-bit integer lanes stored in one SSE2 register.
// TSubtype only distinguishes the type; none of the operators below read it.
template<int TSubtype>
struct VInt16
{
    __m128i m_value;

    // Adds the scalar to every lane.
    inline VInt16 operator+(int16_t other) const
    {
        VInt16 result;
        result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(other));
        return result;
    }

    // Lane-wise addition.
    inline VInt16 operator+(const VInt16 &other) const
    {
        VInt16 result;
        result.m_value = _mm_add_epi16(m_value, other.m_value);
        return result;
    }

    // Bitwise OR.
    inline VInt16 operator|(const VInt16 &other) const
    {
        VInt16 result;
        result.m_value = _mm_or_si128(m_value, other.m_value);
        return result;
    }

    // Bitwise AND.
    inline VInt16 operator&(const VInt16 &other) const
    {
        VInt16 result;
        result.m_value = _mm_and_si128(m_value, other.m_value);
        return result;
    }

    // Lane-wise subtraction.
    inline VInt16 operator-(const VInt16 &other) const
    {
        VInt16 result;
        result.m_value = _mm_sub_epi16(m_value, other.m_value);
        return result;
    }

    // Shifts every lane left by the same bit count.
    inline VInt16 operator<<(int bits) const
    {
        VInt16 result;
        result.m_value = _mm_slli_epi16(m_value, bits);
        return result;
    }

    // Bitwise XOR.
    inline VInt16 operator^(const VInt16 &other) const
    {
        VInt16 result;
        result.m_value = _mm_xor_si128(m_value, other.m_value);
        return result;
    }
};
168
169	typedef VInt16<IntSubtype_Signed> SInt16;
170	typedef VInt16<IntSubtype_UnsignedFull> UInt16;
171	typedef VInt16<IntSubtype_UnsignedTruncated> UInt15;
172	typedef VInt16<IntSubtype_Abstract> AInt16;
173
// Eight 32-bit integer lanes stored as two SSE2 registers of four lanes each.
// TSubtype only distinguishes the type; none of the operators below read it.
template<int TSubtype>
struct VInt32
{
    __m128i m_values[2];

    // Lane-wise addition.
    inline VInt32 operator+(const VInt32& other) const
    {
        VInt32 result;
        result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]);
        result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]);
        return result;
    }

    // Lane-wise subtraction.
    inline VInt32 operator-(const VInt32& other) const
    {
        VInt32 result;
        result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]);
        result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]);
        return result;
    }

    // Shifts every lane left by the same bit count.
    inline VInt32 operator<<(const int other) const
    {
        VInt32 result;
        result.m_values[0] = _mm_slli_epi32(m_values[0], other);
        result.m_values[1] = _mm_slli_epi32(m_values[1], other);
        return result;
    }

    // Bitwise OR.
    inline VInt32 operator|(const VInt32& other) const
    {
        VInt32 result;
        result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]);
        result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]);
        return result;
    }
};
211
212	typedef VInt32<IntSubtype_Signed> SInt32;
213	typedef VInt32<IntSubtype_UnsignedTruncated> UInt31;
214	typedef VInt32<IntSubtype_UnsignedFull> UInt32;
215	typedef VInt32<IntSubtype_Abstract> AInt32;
216
217	template<class TTargetType>
218	struct LosslessCast
219	{
220	#ifdef CVTT_PERMIT_ALIASING
221	template<int TSrcSubtype>
222	static const TTargetType& Cast(const VInt32<TSrcSubtype> &src)
223	{
224	return reinterpret_cast<VInt32<TSubtype>&>(src);
225	}
226
227	template<int TSrcSubtype>
228	static const TTargetType& Cast(const VInt16<TSrcSubtype> &src)
229	{
230	return reinterpret_cast<VInt16<TSubtype>&>(src);
231	}
232	#else
233	template<int TSrcSubtype>
234	static TTargetType Cast(const VInt32<TSrcSubtype> &src)
235	{
236	TTargetType result;
237	result.m_values[`0`] = src.m_values[`0`];
238	result.m_values[`1`] = src.m_values[`1`];
239	return result;
240	}
241
242	template<int TSrcSubtype>
243	static TTargetType Cast(const VInt16<TSrcSubtype> &src)
244	{
245	TTargetType result;
246	result.m_value = src.m_value;
247	return result;
248	}
249	#endif
250	};
251
// Storage for 64-bit integer lanes: four SSE2 registers.
struct Int64
{
    __m128i m_values[4];
};
256
// Eight single-precision lanes stored as two SSE registers of four lanes each.
struct Float
{
    __m128 m_values[2];

    // Lane-wise addition.
    inline Float operator+(const Float &other) const
    {
        Float result;
        result.m_values[0] = _mm_add_ps(m_values[0], other.m_values[0]);
        result.m_values[1] = _mm_add_ps(m_values[1], other.m_values[1]);
        return result;
    }

    // Adds the scalar to every lane.
    inline Float operator+(float other) const
    {
        Float result;
        result.m_values[0] = _mm_add_ps(m_values[0], _mm_set1_ps(other));
        result.m_values[1] = _mm_add_ps(m_values[1], _mm_set1_ps(other));
        return result;
    }

    // Lane-wise subtraction.
    inline Float operator-(const Float& other) const
    {
        Float result;
        result.m_values[0] = _mm_sub_ps(m_values[0], other.m_values[0]);
        result.m_values[1] = _mm_sub_ps(m_values[1], other.m_values[1]);
        return result;
    }

    // Negation, computed as (0 - value) in every lane.
    inline Float operator-() const
    {
        Float result;
        result.m_values[0] = _mm_sub_ps(_mm_setzero_ps(), m_values[0]);
        result.m_values[1] = _mm_sub_ps(_mm_setzero_ps(), m_values[1]);
        return result;
    }

    // Lane-wise multiplication.
    inline Float operator*(const Float& other) const
    {
        Float result;
        result.m_values[0] = _mm_mul_ps(m_values[0], other.m_values[0]);
        result.m_values[1] = _mm_mul_ps(m_values[1], other.m_values[1]);
        return result;
    }

    // Multiplies every lane by the scalar.
    inline Float operator*(float other) const
    {
        Float result;
        result.m_values[0] = _mm_mul_ps(m_values[0], _mm_set1_ps(other));
        result.m_values[1] = _mm_mul_ps(m_values[1], _mm_set1_ps(other));
        return result;
    }

    // Lane-wise division.
    inline Float operator/(const Float &other) const
    {
        Float result;
        result.m_values[0] = _mm_div_ps(m_values[0], other.m_values[0]);
        result.m_values[1] = _mm_div_ps(m_values[1], other.m_values[1]);
        return result;
    }

    // Divides every lane by the scalar.
    inline Float operator/(float other) const
    {
        Float result;
        result.m_values[0] = _mm_div_ps(m_values[0], _mm_set1_ps(other));
        result.m_values[1] = _mm_div_ps(m_values[1], _mm_set1_ps(other));
        return result;
    }
};
325
// Per-lane boolean mask matching the layout of the 16-bit vectors.
struct Int16CompFlag
{
    __m128i m_value;

    // Bitwise AND of two masks.
    inline Int16CompFlag operator&(const Int16CompFlag &other) const
    {
        Int16CompFlag result;
        result.m_value = _mm_and_si128(m_value, other.m_value);
        return result;
    }

    // Bitwise OR of two masks.
    inline Int16CompFlag operator|(const Int16CompFlag &other) const
    {
        Int16CompFlag result;
        result.m_value = _mm_or_si128(m_value, other.m_value);
        return result;
    }
};
344
// Per-lane boolean mask matching the layout of the 32-bit vectors.
struct Int32CompFlag
{
    __m128i m_values[2];

    // Bitwise AND of two masks.
    inline Int32CompFlag operator&(const Int32CompFlag &other) const
    {
        Int32CompFlag result;
        result.m_values[0] = _mm_and_si128(m_values[0], other.m_values[0]);
        result.m_values[1] = _mm_and_si128(m_values[1], other.m_values[1]);
        return result;
    }

    // Bitwise OR of two masks.
    inline Int32CompFlag operator|(const Int32CompFlag &other) const
    {
        Int32CompFlag result;
        result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]);
        result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]);
        return result;
    }
};
365
// Per-lane boolean mask matching the layout of Float.
struct FloatCompFlag
{
    __m128 m_values[2];

    // Bitwise AND of two masks.
    inline FloatCompFlag operator&(const FloatCompFlag &other) const
    {
        FloatCompFlag result;
        result.m_values[0] = _mm_and_ps(m_values[0], other.m_values[0]);
        result.m_values[1] = _mm_and_ps(m_values[1], other.m_values[1]);
        return result;
    }

    // Bitwise OR of two masks.
    inline FloatCompFlag operator|(const FloatCompFlag &other) const
    {
        FloatCompFlag result;
        result.m_values[0] = _mm_or_ps(m_values[0], other.m_values[0]);
        result.m_values[1] = _mm_or_ps(m_values[1], other.m_values[1]);
        return result;
    }
};
386
387	template<int TSubtype>
388	static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
389	{
390	VInt16<TSubtype> result;
391	result.m_value = _mm_add_epi16(a.m_value, b.m_value);
392	return result;
393	}
394
395	template<int TSubtype>
396	static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
397	{
398	VInt16<TSubtype> result;
399	result.m_value = _mm_sub_epi16(a.m_value, b.m_value);
400	return result;
401	}
402
403	static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b)
404	{
405	Float result;
406	for (int i = `0`; i < `2`; i++)
407	result.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], a.m_values[i]), _mm_andnot_ps(flag.m_values[i], b.m_values[i]));
408	return result;
409	}
410
411	template<int TSubtype>
412	static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
413	{
414	VInt16<TSubtype> result;
415	result.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value));
416	return result;
417	}
418
419	template<int TSubtype>
420	static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a)
421	{
422	VInt16<TSubtype> result;
423	result.m_value = _mm_and_si128(flag.m_value, a.m_value);
424	return result;
425	}
426
427	template<int TSubtype>
428	static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
429	{
430	dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));
431	}
432
433	template<int TSubtype>
434	static void ConditionalSet(VInt32<TSubtype> &dest, const Int16CompFlag &flag, const VInt32<TSubtype> &src)
435	{
436	__m128i lowFlags = _mm_unpacklo_epi16(flag.m_value, flag.m_value);
437	__m128i highFlags = _mm_unpackhi_epi16(flag.m_value, flag.m_value);
438	dest.m_values[`0`] = _mm_or_si128(_mm_andnot_si128(lowFlags, dest.m_values[`0`]), _mm_and_si128(lowFlags, src.m_values[`0`]));
439	dest.m_values[`1`] = _mm_or_si128(_mm_andnot_si128(highFlags, dest.m_values[`1`]), _mm_and_si128(highFlags, src.m_values[`1`]));
440	}
441
442	static void ConditionalSet(ParallelMath::Int16CompFlag &dest, const Int16CompFlag &flag, const ParallelMath::Int16CompFlag &src)
443	{
444	dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));
445	}
446
447	static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v)
448	{
449	SInt16 result;
450	result.m_value = _mm_add_epi16(_mm_xor_si128(flag.m_value, v.m_value), _mm_srli_epi16(flag.m_value, `15`));
451	return result;
452	}
453
454	template<int TSubtype>
455	static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
456	{
457	dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, dest.m_value), _mm_andnot_si128(flag.m_value, src.m_value));
458	}
459
460	static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
461	{
462	for (int i = `0`; i < `2`; i++)
463	dest.m_values[i] = _mm_or_ps(_mm_andnot_ps(flag.m_values[i], dest.m_values[i]), _mm_and_ps(flag.m_values[i], src.m_values[i]));
464	}
465
466	static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
467	{
468	for (int i = `0`; i < `2`; i++)
469	dest.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], dest.m_values[i]), _mm_andnot_ps(flag.m_values[i], src.m_values[i]));
470	}
471
472	static void MakeSafeDenominator(Float& v)
473	{
474	ConditionalSet(v, Equal(v, MakeFloatZero()), MakeFloat(`1.0f`));
475	}
476
477	static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision)
478	{
479	int lostBits = `16` - precision;
480	if (lostBits == `0`)
481	return v;
482
483	SInt16 result;
484	result.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
485	return result;
486	}
487
488	static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision)
489	{
490	int lostBits = `16` - precision;
491	if (lostBits == `0`)
492	return v;
493
494	UInt16 result;
495	result.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
496	return result;
497	}
498
499	static UInt16 Min(const UInt16 &a, const UInt16 &b)
500	{
501	__m128i bitFlip = _mm_set1_epi16(-`32768`);
502
503	UInt16 result;
504	result.m_value = _mm_xor_si128(_mm_min_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
505	return result;
506	}
507
508	static SInt16 Min(const SInt16 &a, const SInt16 &b)
509	{
510	SInt16 result;
511	result.m_value = _mm_min_epi16(a.m_value, b.m_value);
512	return result;
513	}
514
515	static UInt15 Min(const UInt15 &a, const UInt15 &b)
516	{
517	UInt15 result;
518	result.m_value = _mm_min_epi16(a.m_value, b.m_value);
519	return result;
520	}
521
522	static Float Min(const Float &a, const Float &b)
523	{
524	Float result;
525	for (int i = `0`; i < `2`; i++)
526	result.m_values[i] = _mm_min_ps(a.m_values[i], b.m_values[i]);
527	return result;
528	}
529
530	static UInt16 Max(const UInt16 &a, const UInt16 &b)
531	{
532	__m128i bitFlip = _mm_set1_epi16(-`32768`);
533
534	UInt16 result;
535	result.m_value = _mm_xor_si128(_mm_max_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
536	return result;
537	}
538
539	static SInt16 Max(const SInt16 &a, const SInt16 &b)
540	{
541	SInt16 result;
542	result.m_value = _mm_max_epi16(a.m_value, b.m_value);
543	return result;
544	}
545
546	static UInt15 Max(const UInt15 &a, const UInt15 &b)
547	{
548	UInt15 result;
549	result.m_value = _mm_max_epi16(a.m_value, b.m_value);
550	return result;
551	}
552
553	static Float Max(const Float &a, const Float &b)
554	{
555	Float result;
556	for (int i = `0`; i < `2`; i++)
557	result.m_values[i] = _mm_max_ps(a.m_values[i], b.m_values[i]);
558	return result;
559	}
560
561	static Float Clamp(const Float &v, float min, float max)
562	{
563	Float result;
564	for (int i = `0`; i < `2`; i++)
565	result.m_values[i] = _mm_max_ps(_mm_min_ps(v.m_values[i], _mm_set1_ps(max)), _mm_set1_ps(min));
566	return result;
567	}
568
569	static Float Reciprocal(const Float &v)
570	{
571	Float result;
572	for (int i = `0`; i < `2`; i++)
573	result.m_values[i] = _mm_rcp_ps(v.m_values[i]);
574	return result;
575	}
576
577	static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut)
578	{
579	int16_t values[`8`];
580	for (int i = `0`; i < `8`; i++)
581	values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
582
583	chOut.m_value = _mm_set_epi16(values[`7`], values[`6`], values[`5`], values[`4`], values[`3`], values[`2`], values[`1`], values[`0`]);
584	}
585
586	static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut)
587	{
588	int16_t values[`8`];
589	for (int i = `0`; i < `8`; i++)
590	values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
591
592	chOut.m_value = _mm_set_epi16(values[`7`], values[`6`], values[`5`], values[`4`], values[`3`], values[`2`], values[`1`], values[`0`]);
593	}
594
595	static Float MakeFloat(float v)
596	{
597	Float f;
598	f.m_values[`0`] = f.m_values[`1`] = _mm_set1_ps(v);
599	return f;
600	}
601
602	static Float MakeFloatZero()
603	{
604	Float f;
605	f.m_values[`0`] = f.m_values[`1`] = _mm_setzero_ps();
606	return f;
607	}
608
609	static UInt16 MakeUInt16(uint16_t v)
610	{
611	UInt16 result;
612	result.m_value = _mm_set1_epi16(static_cast<short>(v));
613	return result;
614	}
615
616	static SInt16 MakeSInt16(int16_t v)
617	{
618	SInt16 result;
619	result.m_value = _mm_set1_epi16(static_cast<short>(v));
620	return result;
621	}
622
623	static AInt16 MakeAInt16(int16_t v)
624	{
625	AInt16 result;
626	result.m_value = _mm_set1_epi16(static_cast<short>(v));
627	return result;
628	}
629
630	static UInt15 MakeUInt15(uint16_t v)
631	{
632	UInt15 result;
633	result.m_value = _mm_set1_epi16(static_cast<short>(v));
634	return result;
635	}
636
637	static SInt32 MakeSInt32(int32_t v)
638	{
639	SInt32 result;
640	result.m_values[`0`] = _mm_set1_epi32(v);
641	result.m_values[`1`] = _mm_set1_epi32(v);
642	return result;
643	}
644
645	static UInt31 MakeUInt31(uint32_t v)
646	{
647	UInt31 result;
648	result.m_values[`0`] = _mm_set1_epi32(v);
649	result.m_values[`1`] = _mm_set1_epi32(v);
650	return result;
651	}
652
653	static uint16_t Extract(const UInt16 &v, int offset)
654	{
655	return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
656	}
657
658	static int16_t Extract(const SInt16 &v, int offset)
659	{
660	return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
661	}
662
663	static uint16_t Extract(const UInt15 &v, int offset)
664	{
665	return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
666	}
667
668	static int16_t Extract(const AInt16 &v, int offset)
669	{
670	return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
671	}
672
673	static int32_t Extract(const SInt32 &v, int offset)
674	{
675	return reinterpret_cast<const int32_t*>(&v.m_values[offset >> `2`])[offset & `3`];
676	}
677
678	static float Extract(const Float &v, int offset)
679	{
680	return reinterpret_cast<const float*>(&v.m_values[offset >> `2`])[offset & `3`];
681	}
682
683	static bool Extract(const ParallelMath::Int16CompFlag &v, int offset)
684	{
685	return reinterpret_cast<const int16_t*>(&v.m_value)[offset] != `0`;
686	}
687
688	static void PutUInt16(UInt16 &dest, int offset, uint16_t v)
689	{
690	reinterpret_cast<uint16_t*>(&dest)[offset] = v;
691	}
692
693	static void PutUInt15(UInt15 &dest, int offset, uint16_t v)
694	{
695	reinterpret_cast<uint16_t*>(&dest)[offset] = v;
696	}
697
698	static void PutSInt16(SInt16 &dest, int offset, int16_t v)
699	{
700	reinterpret_cast<int16_t*>(&dest)[offset] = v;
701	}
702
703	static float ExtractFloat(const Float& v, int offset)
704	{
705	return reinterpret_cast<const float*>(&v)[offset];
706	}
707
708	static void PutFloat(Float &dest, int offset, float v)
709	{
710	reinterpret_cast<float*>(&dest)[offset] = v;
711	}
712
713	static void PutBoolInt16(Int16CompFlag &dest, int offset, bool v)
714	{
715	reinterpret_cast<int16_t*>(&dest)[offset] = v ? -`1` : `0`;
716	}
717
718	static Int32CompFlag Less(const UInt31 &a, const UInt31 &b)
719	{
720	Int32CompFlag result;
721	result.m_values[`0`] = _mm_cmplt_epi32(a.m_values[`0`], b.m_values[`0`]);
722	result.m_values[`1`] = _mm_cmplt_epi32(a.m_values[`1`], b.m_values[`1`]);
723	return result;
724	}
725
726	static Int16CompFlag Less(const SInt16 &a, const SInt16 &b)
727	{
728	Int16CompFlag result;
729	result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
730	return result;
731	}
732
733	static Int16CompFlag Less(const UInt15 &a, const UInt15 &b)
734	{
735	Int16CompFlag result;
736	result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
737	return result;
738	}
739
740	static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b)
741	{
742	Int16CompFlag result;
743	result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
744	return result;
745	}
746
747	static FloatCompFlag Less(const Float &a, const Float &b)
748	{
749	FloatCompFlag result;
750	for (int i = `0`; i < `2`; i++)
751	result.m_values[i] = _mm_cmplt_ps(a.m_values[i], b.m_values[i]);
752	return result;
753	}
754
755	static FloatCompFlag LessOrEqual(const Float &a, const Float &b)
756	{
757	FloatCompFlag result;
758	for (int i = `0`; i < `2`; i++)
759	result.m_values[i] = _mm_cmple_ps(a.m_values[i], b.m_values[i]);
760	return result;
761	}
762
763	template<int TSubtype>
764	static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
765	{
766	Int16CompFlag result;
767	result.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value);
768	return result;
769	}
770
771	static FloatCompFlag Equal(const Float &a, const Float &b)
772	{
773	FloatCompFlag result;
774	for (int i = `0`; i < `2`; i++)
775	result.m_values[i] = _mm_cmpeq_ps(a.m_values[i], b.m_values[i]);
776	return result;
777	}
778
779	static Int16CompFlag Equal(const Int16CompFlag &a, const Int16CompFlag &b)
780	{
781	Int16CompFlag notResult;
782	notResult.m_value = _mm_xor_si128(a.m_value, b.m_value);
783	return Not(notResult);
784	}
785
786	static Float ToFloat(const UInt16 &v)
787	{
788	Float result;
789	result.m_values[`0`] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
790	result.m_values[`1`] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
791	return result;
792	}
793
794	static UInt31 ToUInt31(const UInt16 &v)
795	{
796	UInt31 result;
797	result.m_values[`0`] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
798	result.m_values[`1`] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
799	return result;
800	}
801
802	static SInt32 ToInt32(const UInt16 &v)
803	{
804	SInt32 result;
805	result.m_values[`0`] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
806	result.m_values[`1`] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
807	return result;
808	}
809
810	static SInt32 ToInt32(const UInt15 &v)
811	{
812	SInt32 result;
813	result.m_values[`0`] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
814	result.m_values[`1`] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
815	return result;
816	}
817
818	static SInt32 ToInt32(const SInt16 &v)
819	{
820	SInt32 result;
821	result.m_values[`0`] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), `16`);
822	result.m_values[`1`] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), `16`);
823	return result;
824	}
825
826	static Float ToFloat(const SInt16 &v)
827	{
828	Float result;
829	result.m_values[`0`] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), `16`));
830	result.m_values[`1`] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), `16`));
831	return result;
832	}
833
834	static Float ToFloat(const UInt15 &v)
835	{
836	Float result;
837	result.m_values[`0`] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
838	result.m_values[`1`] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
839	return result;
840	}
841
842	static Float ToFloat(const UInt31 &v)
843	{
844	Float result;
845	result.m_values[`0`] = _mm_cvtepi32_ps(v.m_values[`0`]);
846	result.m_values[`1`] = _mm_cvtepi32_ps(v.m_values[`1`]);
847	return result;
848	}
849
850	static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v)
851	{
852	__m128i lo = _mm_castps_si128(v.m_values[`0`]);
853	__m128i hi = _mm_castps_si128(v.m_values[`1`]);
854
855	Int16CompFlag result;
856	result.m_value = _mm_packs_epi32(lo, hi);
857	return result;
858	}
859
860	static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v)
861	{
862	__m128i lo = _mm_unpacklo_epi16(v.m_value, v.m_value);
863	__m128i hi = _mm_unpackhi_epi16(v.m_value, v.m_value);
864
865	FloatCompFlag result;
866	result.m_values[`0`] = _mm_castsi128_ps(lo);
867	result.m_values[`1`] = _mm_castsi128_ps(hi);
868	return result;
869	}
870
871	static Int16CompFlag Int32FlagToInt16(const Int32CompFlag &v)
872	{
873	__m128i lo = v.m_values[`0`];
874	__m128i hi = v.m_values[`1`];
875
876	Int16CompFlag result;
877	result.m_value = _mm_packs_epi32(lo, hi);
878	return result;
879	}
880
881	static Int16CompFlag MakeBoolInt16(bool b)
882	{
883	Int16CompFlag result;
884	if (b)
885	result.m_value = _mm_set1_epi16(-`1`);
886	else
887	result.m_value = _mm_setzero_si128();
888	return result;
889	}
890
891	static FloatCompFlag MakeBoolFloat(bool b)
892	{
893	FloatCompFlag result;
894	if (b)
895	result.m_values[`0`] = result.m_values[`1`] = _mm_castsi128_ps(_mm_set1_epi32(-`1`));
896	else
897	result.m_values[`0`] = result.m_values[`1`] = _mm_setzero_ps();
898	return result;
899	}
900
901	static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b)
902	{
903	Int16CompFlag result;
904	result.m_value = _mm_andnot_si128(b.m_value, a.m_value);
905	return result;
906	}
907
908	static Int16CompFlag Not(const Int16CompFlag &b)
909	{
910	Int16CompFlag result;
911	result.m_value = _mm_xor_si128(b.m_value, _mm_set1_epi32(-`1`));
912	return result;
913	}
914
915	static Int32CompFlag Not(const Int32CompFlag &b)
916	{
917	Int32CompFlag result;
918	result.m_values[`0`] = _mm_xor_si128(b.m_values[`0`], _mm_set1_epi32(-`1`));
919	result.m_values[`1`] = _mm_xor_si128(b.m_values[`1`], _mm_set1_epi32(-`1`));
920	return result;
921	}
922
923	static UInt16 RoundAndConvertToU16(const Float &v, const void* /roundingMode/)
924	{
925	__m128i lo = _mm_cvtps_epi32(_mm_add_ps(v.m_values[`0`], _mm_set1_ps(-`32768`)));
926	__m128i hi = _mm_cvtps_epi32(_mm_add_ps(v.m_values[`1`], _mm_set1_ps(-`32768`)));
927
928	__m128i packed = _mm_packs_epi32(lo, hi);
929
930	UInt16 result;
931	result.m_value = _mm_xor_si128(packed, _mm_set1_epi16(-`32768`));
932	return result;
933	}
934
935	static UInt15 RoundAndConvertToU15(const Float &v, const void* /roundingMode/)
936	{
937	__m128i lo = _mm_cvtps_epi32(v.m_values[`0`]);
938	__m128i hi = _mm_cvtps_epi32(v.m_values[`1`]);
939
940	__m128i packed = _mm_packs_epi32(lo, hi);
941
942	UInt15 result;
943	result.m_value = _mm_packs_epi32(lo, hi);
944	return result;
945	}
946
947	static SInt16 RoundAndConvertToS16(const Float &v, const void* /roundingMode/)
948	{
949	__m128i lo = _mm_cvtps_epi32(v.m_values[`0`]);
950	__m128i hi = _mm_cvtps_epi32(v.m_values[`1`]);
951
952	__m128i packed = _mm_packs_epi32(lo, hi);
953
954	SInt16 result;
955	result.m_value = _mm_packs_epi32(lo, hi);
956	return result;
957	}
958
959	static Float Sqrt(const Float &f)
960	{
961	Float result;
962	for (int i = `0`; i < `2`; i++)
963	result.m_values[i] = _mm_sqrt_ps(f.m_values[i]);
964	return result;
965	}
966
967	static UInt16 Abs(const SInt16 &a)
968	{
969	__m128i signBitsXor = _mm_srai_epi16(a.m_value, `15`);
970	__m128i signBitsAdd = _mm_srli_epi16(a.m_value, `15`);
971
972	UInt16 result;
973	result.m_value = _mm_add_epi16(_mm_xor_si128(a.m_value, signBitsXor), signBitsAdd);
974	return result;
975	}
976
977	static Float Abs(const Float& a)
978	{
979	__m128 invMask = _mm_set1_ps(-`0.0f`);
980
981	Float result;
982	result.m_values[`0`] = _mm_andnot_ps(invMask, a.m_values[`0`]);
983	result.m_values[`1`] = _mm_andnot_ps(invMask, a.m_values[`1`]);
984	return result;
985	}
986
987	static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b)
988	{
989	__m128i diff = _mm_sub_epi16(a.m_value, b.m_value);
990
991	UInt16 result;
992	result.m_value = _mm_mullo_epi16(diff, diff);
993	return result;
994	}
995
996	static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b)
997	{
998	__m128i diffU = _mm_sub_epi16(_mm_max_epi16(a.m_value, b.m_value), _mm_min_epi16(a.m_value, b.m_value));
999
1000	__m128i mulHi = _mm_mulhi_epu16(diffU, diffU);
1001	__m128i mulLo = _mm_mullo_epi16(diffU, diffU);
1002	__m128i sqDiffHi = _mm_unpackhi_epi16(mulLo, mulHi);
1003	__m128i sqDiffLo = _mm_unpacklo_epi16(mulLo, mulHi);
1004
1005	Float result;
1006	result.m_values[`0`] = _mm_cvtepi32_ps(sqDiffLo);
1007	result.m_values[`1`] = _mm_cvtepi32_ps(sqDiffHi);
1008
1009	return result;
1010	}
1011
1012	static Float TwosCLHalfToFloat(const SInt16 &v)
1013	{
1014	__m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, `15`)), _mm_srli_epi16(v.m_value, `15`));
1015
1016	__m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-`32768`));
1017	__m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(`0x03ff`));
1018	__m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(`0x7c00`));
1019
1020	__m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128());
1021
1022	// Convert exponent to high-bits
1023	exponent = _mm_add_epi16(_mm_srli_epi16(exponent, `3`), _mm_set1_epi16(`14336`));
1024
1025	__m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(`14336`)));
1026
1027	__m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, `3`)));
1028	__m128i lowBits = _mm_slli_epi16(mantissa, `13`);
1029
1030	__m128i flow = _mm_unpacklo_epi16(lowBits, highBits);
1031	__m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits);
1032
1033	__m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
1034	__m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
1035
1036	Float result;
1037	result.m_values[`0`] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow));
1038	result.m_values[`1`] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh));
1039
1040	return result;
1041	}
1042
1043	static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
1044	{
1045	Float fa = TwosCLHalfToFloat(a);
1046
1047	Float diff = fa - b;
1048	return diff * diff;
1049	}
1050
1051	static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
1052	{
1053	Float fa = TwosCLHalfToFloat(a);
1054	Float fb = TwosCLHalfToFloat(b);
1055
1056	Float diff = fa - fb;
1057	return diff * diff;
1058	}
1059
1060	static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
1061	{
1062	Float fa = TwosCLHalfToFloat(a) * aWeight;
1063
1064	Float diff = fa - b;
1065	return diff * diff;
1066	}
1067
1068	static UInt16 RightShift(const UInt16 &v, int bits)
1069	{
1070	UInt16 result;
1071	result.m_value = _mm_srli_epi16(v.m_value, bits);
1072	return result;
1073	}
1074
1075	static UInt31 RightShift(const UInt31 &v, int bits)
1076	{
1077	UInt31 result;
1078	result.m_values[`0`] = _mm_srli_epi32(v.m_values[`0`], bits);
1079	result.m_values[`1`] = _mm_srli_epi32(v.m_values[`1`], bits);
1080	return result;
1081	}
1082
1083	static SInt16 RightShift(const SInt16 &v, int bits)
1084	{
1085	SInt16 result;
1086	result.m_value = _mm_srai_epi16(v.m_value, bits);
1087	return result;
1088	}
1089
1090	static UInt15 RightShift(const UInt15 &v, int bits)
1091	{
1092	UInt15 result;
1093	result.m_value = _mm_srli_epi16(v.m_value, bits);
1094	return result;
1095	}
1096
1097	static SInt32 RightShift(const SInt32 &v, int bits)
1098	{
1099	SInt32 result;
1100	result.m_values[`0`] = _mm_srai_epi32(v.m_values[`0`], bits);
1101	result.m_values[`1`] = _mm_srai_epi32(v.m_values[`1`], bits);
1102	return result;
1103	}
1104
1105	static SInt16 ToSInt16(const SInt32 &v)
1106	{
1107	SInt16 result;
1108	result.m_value = _mm_packs_epi32(v.m_values[`0`], v.m_values[`1`]);
1109	return result;
1110	}
1111
1112	static SInt16 ToSInt16(const UInt16 &v)
1113	{
1114	SInt16 result;
1115	result.m_value = v.m_value;
1116	return result;
1117	}
1118
1119	static SInt16 ToSInt16(const UInt15 &v)
1120	{
1121	SInt16 result;
1122	result.m_value = v.m_value;
1123	return result;
1124	}
1125
1126	static UInt16 ToUInt16(const UInt32 &v)
1127	{
1128	__m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[`0`], `16`), `16`);
1129	__m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[`1`], `16`), `16`);
1130
1131	UInt16 result;
1132	result.m_value = _mm_packs_epi32(low, high);
1133	return result;
1134	}
1135
1136	static UInt16 ToUInt16(const UInt31 &v)
1137	{
1138	__m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[`0`], `16`), `16`);
1139	__m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[`1`], `16`), `16`);
1140
1141	UInt16 result;
1142	result.m_value = _mm_packs_epi32(low, high);
1143	return result;
1144	}
1145
1146	static UInt15 ToUInt15(const UInt31 &v)
1147	{
1148	UInt15 result;
1149	result.m_value = _mm_packs_epi32(v.m_values[`0`], v.m_values[`1`]);
1150	return result;
1151	}
1152
1153	static UInt15 ToUInt15(const SInt16 &v)
1154	{
1155	UInt15 result;
1156	result.m_value = v.m_value;
1157	return result;
1158	}
1159
1160	static UInt15 ToUInt15(const UInt16 &v)
1161	{
1162	UInt15 result;
1163	result.m_value = v.m_value;
1164	return result;
1165	}
1166
1167	static SInt32 XMultiply(const SInt16 &a, const SInt16 &b)
1168	{
1169	__m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
1170	__m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
1171
1172	SInt32 result;
1173	result.m_values[`0`] = _mm_unpacklo_epi16(low, high);
1174	result.m_values[`1`] = _mm_unpackhi_epi16(low, high);
1175	return result;
1176	}
1177
1178	static SInt32 XMultiply(const SInt16 &a, const UInt15 &b)
1179	{
1180	__m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
1181	__m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
1182
1183	SInt32 result;
1184	result.m_values[`0`] = _mm_unpacklo_epi16(low, high);
1185	result.m_values[`1`] = _mm_unpackhi_epi16(low, high);
1186	return result;
1187	}
1188
1189	static SInt32 XMultiply(const UInt15 &a, const SInt16 &b)
1190	{
1191	return XMultiply(b, a);
1192	}
1193
1194	static UInt32 XMultiply(const UInt16 &a, const UInt16 &b)
1195	{
1196	__m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
1197	__m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
1198
1199	UInt32 result;
1200	result.m_values[`0`] = _mm_unpacklo_epi16(low, high);
1201	result.m_values[`1`] = _mm_unpackhi_epi16(low, high);
1202	return result;
1203	}
1204
1205	static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b)
1206	{
1207	UInt16 result;
1208	result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
1209	return result;
1210	}
1211
1212	static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b)
1213	{
1214	UInt16 result;
1215	result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
1216	return result;
1217	}
1218
1219	static SInt16 CompactMultiply(const SInt16 &a, const UInt15 &b)
1220	{
1221	SInt16 result;
1222	result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
1223	return result;
1224	}
1225
1226	static SInt16 CompactMultiply(const SInt16 &a, const SInt16 &b)
1227	{
1228	SInt16 result;
1229	result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
1230	return result;
1231	}
1232
1233	static UInt31 XMultiply(const UInt15 &a, const UInt15 &b)
1234	{
1235	__m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
1236	__m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
1237
1238	UInt31 result;
1239	result.m_values[`0`] = _mm_unpacklo_epi16(low, high);
1240	result.m_values[`1`] = _mm_unpackhi_epi16(low, high);
1241	return result;
1242	}
1243
1244	static UInt31 XMultiply(const UInt16 &a, const UInt15 &b)
1245	{
1246	__m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
1247	__m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
1248
1249	UInt31 result;
1250	result.m_values[`0`] = _mm_unpacklo_epi16(low, high);
1251	result.m_values[`1`] = _mm_unpackhi_epi16(low, high);
1252	return result;
1253	}
1254
1255	static UInt31 XMultiply(const UInt15 &a, const UInt16 &b)
1256	{
1257	return XMultiply(b, a);
1258	}
1259
1260	static bool AnySet(const Int16CompFlag &v)
1261	{
1262	return _mm_movemask_epi8(v.m_value) != `0`;
1263	}
1264
1265	static bool AllSet(const Int16CompFlag &v)
1266	{
1267	return _mm_movemask_epi8(v.m_value) == `0xffff`;
1268	}
1269
1270	static bool AnySet(const FloatCompFlag &v)
1271	{
1272	return _mm_movemask_ps(v.m_values[`0`]) != `0` \|\| _mm_movemask_ps(v.m_values[`1`]) != `0`;
1273	}
1274
1275	static bool AllSet(const FloatCompFlag &v)
1276	{
1277	return _mm_movemask_ps(v.m_values[`0`]) == `0xf` && _mm_movemask_ps(v.m_values[`1`]) == `0xf`;
1278	}
1279	};
1280
1281	#else
1282	// Scalar version
1283	struct ParallelMath
1284	{
1285	struct RoundTowardZeroForScope
1286	{
1287	};
1288
1289	struct RoundTowardNearestForScope
1290	{
1291	};
1292
1293	struct RoundUpForScope
1294	{
1295	};
1296
1297	struct RoundDownForScope
1298	{
1299	};
1300
1301	static const int ParallelSize = `1`;
1302
1303	enum Int16Subtype
1304	{
1305	IntSubtype_Signed,
1306	IntSubtype_UnsignedFull,
1307	IntSubtype_UnsignedTruncated,
1308	IntSubtype_Abstract,
1309	};
1310
1311	typedef int32_t SInt16;
1312	typedef int32_t UInt15;
1313	typedef int32_t UInt16;
1314	typedef int32_t AInt16;
1315
1316	typedef int32_t SInt32;
1317	typedef int32_t UInt31;
1318	typedef int32_t UInt32;
1319	typedef int32_t AInt32;
1320
1321	typedef int32_t ScalarUInt16;
1322	typedef int32_t ScalarSInt16;
1323
1324	typedef float Float;
1325
1326	template<class TTargetType>
1327	struct LosslessCast
1328	{
1329	static const int32_t& Cast(const int32_t &src)
1330	{
1331	return src;
1332	}
1333	};
1334
1335	typedef bool Int16CompFlag;
1336	typedef bool FloatCompFlag;
1337
1338	static int32_t AbstractAdd(const int32_t &a, const int32_t &b)
1339	{
1340	return a + b;
1341	}
1342
1343	static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)
1344	{
1345	return a - b;
1346	}
1347
1348	static float Select(bool flag, float a, float b)
1349	{
1350	return flag ? a : b;
1351	}
1352
1353	static int32_t Select(bool flag, int32_t a, int32_t b)
1354	{
1355	return flag ? a : b;
1356	}
1357
1358	static int32_t SelectOrZero(bool flag, int32_t a)
1359	{
1360	return flag ? a : `0`;
1361	}
1362
1363	static void ConditionalSet(int32_t& dest, bool flag, int32_t src)
1364	{
1365	if (flag)
1366	dest = src;
1367	}
1368
1369	static void ConditionalSet(bool& dest, bool flag, bool src)
1370	{
1371	if (flag)
1372	dest = src;
1373	}
1374
1375	static int32_t ConditionalNegate(bool flag, int32_t v)
1376	{
1377	return (flag) ? -v : v;
1378	}
1379
1380	static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)
1381	{
1382	if (!flag)
1383	dest = src;
1384	}
1385
1386	static void ConditionalSet(float& dest, bool flag, float src)
1387	{
1388	if (flag)
1389	dest = src;
1390	}
1391
1392	static void NotConditionalSet(float& dest, bool flag, float src)
1393	{
1394	if (!flag)
1395	dest = src;
1396	}
1397
1398	static void MakeSafeDenominator(float& v)
1399	{
1400	if (v == `0.0f`)
1401	v = `1.0f`;
1402	}
1403
1404	static int32_t SignedRightShift(int32_t v, int bits)
1405	{
1406	return v >> bits;
1407	}
1408
1409	static int32_t TruncateToPrecisionSigned(int32_t v, int precision)
1410	{
1411	v = (v << (`32` - precision)) & `0xffffffff`;
1412	return SignedRightShift(v, `32` - precision);
1413	}
1414
1415	static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)
1416	{
1417	return v & ((`1` << precision) - `1`);
1418	}
1419
1420	static int32_t Min(int32_t a, int32_t b)
1421	{
1422	if (a < b)
1423	return a;
1424	return b;
1425	}
1426
1427	static float Min(float a, float b)
1428	{
1429	if (a < b)
1430	return a;
1431	return b;
1432	}
1433
1434	static int32_t Max(int32_t a, int32_t b)
1435	{
1436	if (a > b)
1437	return a;
1438	return b;
1439	}
1440
1441	static float Max(float a, float b)
1442	{
1443	if (a > b)
1444	return a;
1445	return b;
1446	}
1447
1448	static float Abs(float a)
1449	{
1450	return fabsf(a);
1451	}
1452
1453	static int32_t Abs(int32_t a)
1454	{
1455	if (a < `0`)
1456	return -a;
1457	return a;
1458	}
1459
1460	static float Clamp(float v, float min, float max)
1461	{
1462	if (v < min)
1463	return min;
1464	if (v > max)
1465	return max;
1466	return v;
1467	}
1468
1469	static float Reciprocal(float v)
1470	{
1471	return `1.0f` / v;
1472	}
1473
1474	static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut)
1475	{
1476	chOut = inputBlocks[`0`].m_pixels[pxOffset][channel];
1477	}
1478
1479	static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut)
1480	{
1481	chOut = inputBlocks[`0`].m_pixels[pxOffset][channel];
1482	}
1483
1484	static float MakeFloat(float v)
1485	{
1486	return v;
1487	}
1488
1489	static float MakeFloatZero()
1490	{
1491	return `0.0f`;
1492	}
1493
1494	static int32_t MakeUInt16(uint16_t v)
1495	{
1496	return v;
1497	}
1498
1499	static int32_t MakeSInt16(int16_t v)
1500	{
1501	return v;
1502	}
1503
1504	static int32_t MakeAInt16(int16_t v)
1505	{
1506	return v;
1507	}
1508
1509	static int32_t MakeUInt15(uint16_t v)
1510	{
1511	return v;
1512	}
1513
1514	static int32_t MakeSInt32(int32_t v)
1515	{
1516	return v;
1517	}
1518
1519	static int32_t MakeUInt31(int32_t v)
1520	{
1521	return v;
1522	}
1523
1524	static int32_t Extract(int32_t v, int offset)
1525	{
1526	UNREFERENCED_PARAMETER(offset);
1527	return v;
1528	}
1529
1530	static bool Extract(bool v, int offset)
1531	{
1532	UNREFERENCED_PARAMETER(offset);
1533	return v;
1534	}
1535
1536	static float Extract(float v, int offset)
1537	{
1538	UNREFERENCED_PARAMETER(offset);
1539	return v;
1540	}
1541
1542	static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
1543	{
1544	UNREFERENCED_PARAMETER(offset);
1545	dest = v;
1546	}
1547
1548	static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
1549	{
1550	UNREFERENCED_PARAMETER(offset);
1551	dest = v;
1552	}
1553
1554	static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v)
1555	{
1556	UNREFERENCED_PARAMETER(offset);
1557	dest = v;
1558	}
1559
1560	static float ExtractFloat(float v, int offset)
1561	{
1562	UNREFERENCED_PARAMETER(offset);
1563	return v;
1564	}
1565
1566	static void PutFloat(float &dest, int offset, float v)
1567	{
1568	UNREFERENCED_PARAMETER(offset);
1569	dest = v;
1570	}
1571
1572	static void PutBoolInt16(bool &dest, int offset, bool v)
1573	{
1574	UNREFERENCED_PARAMETER(offset);
1575	dest = v;
1576	}
1577
1578	static bool Less(int32_t a, int32_t b)
1579	{
1580	return a < b;
1581	}
1582
1583	static bool Less(float a, float b)
1584	{
1585	return a < b;
1586	}
1587
1588	static bool LessOrEqual(int32_t a, int32_t b)
1589	{
1590	return a < b;
1591	}
1592
1593	static bool LessOrEqual(float a, float b)
1594	{
1595	return a < b;
1596	}
1597
1598	static bool Equal(int32_t a, int32_t b)
1599	{
1600	return a == b;
1601	}
1602
1603	static bool Equal(float a, float b)
1604	{
1605	return a == b;
1606	}
1607
1608	static float ToFloat(int32_t v)
1609	{
1610	return static_cast<float>(v);
1611	}
1612
1613	static int32_t ToUInt31(int32_t v)
1614	{
1615	return v;
1616	}
1617
1618	static int32_t ToInt32(int32_t v)
1619	{
1620	return v;
1621	}
1622
1623	static bool FloatFlagToInt16(bool v)
1624	{
1625	return v;
1626	}
1627
1628	static bool Int32FlagToInt16(bool v)
1629	{
1630	return v;
1631	}
1632
1633	static bool Int16FlagToFloat(bool v)
1634	{
1635	return v;
1636	}
1637
1638	static bool MakeBoolInt16(bool b)
1639	{
1640	return b;
1641	}
1642
1643	static bool MakeBoolFloat(bool b)
1644	{
1645	return b;
1646	}
1647
1648	static bool AndNot(bool a, bool b)
1649	{
1650	return a && !b;
1651	}
1652
1653	static bool Not(bool b)
1654	{
1655	return !b;
1656	}
1657
1658	static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz)
1659	{
1660	UNREFERENCED_PARAMETER(rtz);
1661	return static_cast<int>(v);
1662	}
1663
1664	static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru)
1665	{
1666	UNREFERENCED_PARAMETER(ru);
1667	return static_cast<int>(ceilf(v));
1668	}
1669
1670	static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd)
1671	{
1672	UNREFERENCED_PARAMETER(rd);
1673	return static_cast<int>(floorf(v));
1674	}
1675
1676	static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn)
1677	{
1678	UNREFERENCED_PARAMETER(rtn);
1679	return static_cast<int>(floorf(v + `0.5f`));
1680	}
1681
1682	template<class TRoundMode>
1683	static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)
1684	{
1685	return RoundAndConvertToInt(v, roundingMode);
1686	}
1687
1688	template<class TRoundMode>
1689	static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)
1690	{
1691	return RoundAndConvertToInt(v, roundingMode);
1692	}
1693
1694	template<class TRoundMode>
1695	static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)
1696	{
1697	return RoundAndConvertToInt(v, roundingMode);
1698	}
1699
1700	static float Sqrt(float f)
1701	{
1702	return sqrtf(f);
1703	}
1704
1705	static int32_t SqDiffUInt8(int32_t a, int32_t b)
1706	{
1707	int32_t delta = a - b;
1708	return delta * delta;
1709	}
1710
1711	static int32_t SqDiffInt16(int32_t a, int32_t b)
1712	{
1713	int32_t delta = a - b;
1714	return delta * delta;
1715	}
1716
1717	static int32_t SqDiffSInt16(int32_t a, int32_t b)
1718	{
1719	int32_t delta = a - b;
1720	return delta * delta;
1721	}
1722
1723	static float TwosCLHalfToFloat(int32_t v)
1724	{
1725	int32_t absV = (v < `0`) ? -v : v;
1726
1727	int32_t signBits = (absV & -`32768`);
1728	int32_t mantissa = (absV & `0x03ff`);
1729	int32_t exponent = (absV & `0x7c00`);
1730
1731	bool isDenormal = (exponent == `0`);
1732
1733	// Convert exponent to high-bits
1734	exponent = (exponent >> `3`) + `14336`;
1735
1736	int32_t denormalCorrection = (isDenormal ? (signBits \| `14336`) : `0`) << `16`;
1737
1738	int32_t fBits = ((exponent \| signBits) << `16`) \| (mantissa << `13`);
1739
1740	float f, correction;
1741	memcpy(&f, &fBits, `4`);
1742	memcpy(&correction, &denormalCorrection, `4`);
1743
1744	return f - correction;
1745	}
1746
1747	static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
1748	{
1749	Float fa = TwosCLHalfToFloat(a);
1750
1751	Float diff = fa - b;
1752	return diff * diff;
1753	}
1754
1755	static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
1756	{
1757	Float fa = TwosCLHalfToFloat(a);
1758	Float fb = TwosCLHalfToFloat(b);
1759
1760	Float diff = fa - fb;
1761	return diff * diff;
1762	}
1763
1764	static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
1765	{
1766	Float fa = TwosCLHalfToFloat(a) * aWeight;
1767
1768	Float diff = fa - b;
1769	return diff * diff;
1770	}
1771
1772	static int32_t RightShift(int32_t v, int bits)
1773	{
1774	return SignedRightShift(v, bits);
1775	}
1776
1777	static int32_t ToSInt16(int32_t v)
1778	{
1779	return v;
1780	}
1781
1782	static int32_t ToUInt16(int32_t v)
1783	{
1784	return v;
1785	}
1786
1787	static int32_t ToUInt15(int32_t v)
1788	{
1789	return v;
1790	}
1791
1792	static int32_t XMultiply(int32_t a, int32_t b)
1793	{
1794	return a * b;
1795	}
1796
1797	static int32_t CompactMultiply(int32_t a, int32_t b)
1798	{
1799	return a * b;
1800	}
1801
1802	static bool AnySet(bool v)
1803	{
1804	return v;
1805	}
1806
1807	static bool AllSet(bool v)
1808	{
1809	return v;
1810	}
1811	};
1812
1813	#endif
1814	}
1815
1816	#endif
1817

Browse the source code of Godot/thirdparty/cvtt/ConvectionKernels_ParallelMath.h