1/*
2Convection Texture Tools
3Copyright (c) 2018-2019 Eric Lasota
4
5Permission is hereby granted, free of charge, to any person obtaining
6a copy of this software and associated documentation files (the
7"Software"), to deal in the Software without restriction, including
8without limitation the rights to use, copy, modify, merge, publish,
9distribute, sublicense, and/or sell copies of the Software, and to
10permit persons to whom the Software is furnished to do so, subject
11to the following conditions:
12
13The above copyright notice and this permission notice shall be included
14in all copies or substantial portions of the Software.
15
16THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23
24*/
25#pragma once
26#ifndef __CVTT_PARALLELMATH_H__
27#define __CVTT_PARALLELMATH_H__
28
29#include "ConvectionKernels.h"
30#include "ConvectionKernels_Config.h"
31
32#ifdef CVTT_USE_SSE2
33#include <emmintrin.h>
34#endif
35
36#include <float.h>
37#include <assert.h>
38#include <string.h>
39#include <algorithm>
40#include <math.h>
41
// Marks a parameter as intentionally unused by casting it to void.
// The argument is parenthesized so that a compound expression (e.g. a + b)
// is cast as a whole instead of only its first operand.
#define UNREFERENCED_PARAMETER(n) ((void)(n))
43
44// Parallel math implementation
45//
46// After preprocessor defs are handled, what this should do is expose the following types:
47// SInt16 - Signed 16-bit integer
// UInt16 - Unsigned 16-bit integer
49// UInt15 - Unsigned 15-bit integer
50// SInt32 - Signed 32-bit integer
51// UInt31 - Unsigned 31-bit integer
52// AInt16 - 16-bit integer of unknown signedness (only used for storage)
53// Int16CompFlag - Comparison flags from comparing 16-bit integers
54// Int32CompFlag - Comparison flags from comparing 32-bit integers
55// FloatCompFlag - Comparison flags from comparing 32-bit floats
56//
// The reason for these distinctions is that, depending on the instruction set, signed or unsigned versions of certain ops
58// (particularly max, min, compares, and right shift) may not be available. In cases where ops are not available, it's
59// necessary to do high bit manipulations to accomplish the operation with 16-bit numbers. The 15-bit and 31-bit uint types
60// can elide the bit flips if unsigned versions are not available.
61
62namespace cvtt
63{
64#ifdef CVTT_USE_SSE2
65 // SSE2 version
66 struct ParallelMath
67 {
68 typedef uint16_t ScalarUInt16;
69 typedef int16_t ScalarSInt16;
70
// Scoped rounding-mode guard. The constructor captures the current MXCSR
// value and writes it back with the rounding bits replaced by TRoundingMode;
// the destructor writes the captured value back unchanged.
template<unsigned int TRoundingMode>
struct RoundForScope
{
    unsigned int m_oldCSR;  // MXCSR value read at construction

    RoundForScope()
        : m_oldCSR(_mm_getcsr())
    {
        const unsigned int withoutRounding = m_oldCSR & ~_MM_ROUND_MASK;
        _mm_setcsr(withoutRounding | (TRoundingMode));
    }

    ~RoundForScope()
    {
        _mm_setcsr(m_oldCSR);
    }
};
87
88 struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO>
89 {
90 };
91
92 struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST>
93 {
94 };
95
96 struct RoundUpForScope : RoundForScope<_MM_ROUND_UP>
97 {
98 };
99
100 struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN>
101 {
102 };
103
// Number of values carried by each parallel type
// (eight 16-bit lanes in one __m128i, or two registers of four 32-bit lanes).
static const int ParallelSize = 8;

// Tags passed as the TSubtype argument of VInt16 / VInt32.
enum Int16Subtype
{
    IntSubtype_Signed = 0,          // signed (SInt16 / SInt32)
    IntSubtype_UnsignedFull,        // unsigned, full bit width (UInt16 / UInt32)
    IntSubtype_UnsignedTruncated,   // unsigned, top bit unused (UInt15 / UInt31)
    IntSubtype_Abstract             // unknown signedness (AInt16 / AInt32)
};
113
114 template<int TSubtype>
115 struct VInt16
116 {
117 __m128i m_value;
118
119 inline VInt16 operator+(int16_t other) const
120 {
121 VInt16 result;
122 result.m_value = _mm_add_epi16(m_value, _mm_set1_epi16(static_cast<int16_t>(other)));
123 return result;
124 }
125
126 inline VInt16 operator+(const VInt16 &other) const
127 {
128 VInt16 result;
129 result.m_value = _mm_add_epi16(m_value, other.m_value);
130 return result;
131 }
132
133 inline VInt16 operator|(const VInt16 &other) const
134 {
135 VInt16 result;
136 result.m_value = _mm_or_si128(m_value, other.m_value);
137 return result;
138 }
139
140 inline VInt16 operator&(const VInt16 &other) const
141 {
142 VInt16 result;
143 result.m_value = _mm_and_si128(m_value, other.m_value);
144 return result;
145 }
146
147 inline VInt16 operator-(const VInt16 &other) const
148 {
149 VInt16 result;
150 result.m_value = _mm_sub_epi16(m_value, other.m_value);
151 return result;
152 }
153
154 inline VInt16 operator<<(int bits) const
155 {
156 VInt16 result;
157 result.m_value = _mm_slli_epi16(m_value, bits);
158 return result;
159 }
160
161 inline VInt16 operator^(const VInt16 &other) const
162 {
163 VInt16 result;
164 result.m_value = _mm_xor_si128(m_value, other.m_value);
165 return result;
166 }
167 };
168
169 typedef VInt16<IntSubtype_Signed> SInt16;
170 typedef VInt16<IntSubtype_UnsignedFull> UInt16;
171 typedef VInt16<IntSubtype_UnsignedTruncated> UInt15;
172 typedef VInt16<IntSubtype_Abstract> AInt16;
173
174 template<int TSubtype>
175 struct VInt32
176 {
177 __m128i m_values[2];
178
179 inline VInt32 operator+(const VInt32& other) const
180 {
181 VInt32 result;
182 result.m_values[0] = _mm_add_epi32(m_values[0], other.m_values[0]);
183 result.m_values[1] = _mm_add_epi32(m_values[1], other.m_values[1]);
184 return result;
185 }
186
187 inline VInt32 operator-(const VInt32& other) const
188 {
189 VInt32 result;
190 result.m_values[0] = _mm_sub_epi32(m_values[0], other.m_values[0]);
191 result.m_values[1] = _mm_sub_epi32(m_values[1], other.m_values[1]);
192 return result;
193 }
194
195 inline VInt32 operator<<(const int other) const
196 {
197 VInt32 result;
198 result.m_values[0] = _mm_slli_epi32(m_values[0], other);
199 result.m_values[1] = _mm_slli_epi32(m_values[1], other);
200 return result;
201 }
202
203 inline VInt32 operator|(const VInt32& other) const
204 {
205 VInt32 result;
206 result.m_values[0] = _mm_or_si128(m_values[0], other.m_values[0]);
207 result.m_values[1] = _mm_or_si128(m_values[1], other.m_values[1]);
208 return result;
209 }
210 };
211
212 typedef VInt32<IntSubtype_Signed> SInt32;
213 typedef VInt32<IntSubtype_UnsignedTruncated> UInt31;
214 typedef VInt32<IntSubtype_UnsignedFull> UInt32;
215 typedef VInt32<IntSubtype_Abstract> AInt32;
216
217 template<class TTargetType>
218 struct LosslessCast
219 {
220#ifdef CVTT_PERMIT_ALIASING
221 template<int TSrcSubtype>
222 static const TTargetType& Cast(const VInt32<TSrcSubtype> &src)
223 {
224 return reinterpret_cast<VInt32<TSubtype>&>(src);
225 }
226
227 template<int TSrcSubtype>
228 static const TTargetType& Cast(const VInt16<TSrcSubtype> &src)
229 {
230 return reinterpret_cast<VInt16<TSubtype>&>(src);
231 }
232#else
233 template<int TSrcSubtype>
234 static TTargetType Cast(const VInt32<TSrcSubtype> &src)
235 {
236 TTargetType result;
237 result.m_values[0] = src.m_values[0];
238 result.m_values[1] = src.m_values[1];
239 return result;
240 }
241
242 template<int TSrcSubtype>
243 static TTargetType Cast(const VInt16<TSrcSubtype> &src)
244 {
245 TTargetType result;
246 result.m_value = src.m_value;
247 return result;
248 }
249#endif
250 };
251
// Storage for the 64-bit integer type: four SSE2 registers.
// No operations are defined on it in this struct.
struct Int64
{
    __m128i m_values[4];
};
256
// Eight single-precision lanes stored as two SSE registers of four lanes each.
struct Float
{
    __m128 m_values[2];

    // Lane-wise addition.
    inline Float operator+(const Float &other) const
    {
        Float sum;
        for (int half = 0; half < 2; half++)
            sum.m_values[half] = _mm_add_ps(m_values[half], other.m_values[half]);
        return sum;
    }

    // Adds the scalar to every lane.
    inline Float operator+(float other) const
    {
        const __m128 broadcast = _mm_set1_ps(other);

        Float sum;
        for (int half = 0; half < 2; half++)
            sum.m_values[half] = _mm_add_ps(m_values[half], broadcast);
        return sum;
    }

    // Lane-wise subtraction.
    inline Float operator-(const Float& other) const
    {
        Float difference;
        for (int half = 0; half < 2; half++)
            difference.m_values[half] = _mm_sub_ps(m_values[half], other.m_values[half]);
        return difference;
    }

    // Negation, computed as (0 - x) in every lane.
    inline Float operator-() const
    {
        const __m128 zero = _mm_setzero_ps();

        Float negated;
        for (int half = 0; half < 2; half++)
            negated.m_values[half] = _mm_sub_ps(zero, m_values[half]);
        return negated;
    }

    // Lane-wise multiplication.
    inline Float operator*(const Float& other) const
    {
        Float product;
        for (int half = 0; half < 2; half++)
            product.m_values[half] = _mm_mul_ps(m_values[half], other.m_values[half]);
        return product;
    }

    // Multiplies every lane by the scalar.
    inline Float operator*(float other) const
    {
        const __m128 broadcast = _mm_set1_ps(other);

        Float product;
        for (int half = 0; half < 2; half++)
            product.m_values[half] = _mm_mul_ps(m_values[half], broadcast);
        return product;
    }

    // Lane-wise division.
    inline Float operator/(const Float &other) const
    {
        Float quotient;
        for (int half = 0; half < 2; half++)
            quotient.m_values[half] = _mm_div_ps(m_values[half], other.m_values[half]);
        return quotient;
    }

    // Divides every lane by the scalar.
    inline Float operator/(float other) const
    {
        const __m128 broadcast = _mm_set1_ps(other);

        Float quotient;
        for (int half = 0; half < 2; half++)
            quotient.m_values[half] = _mm_div_ps(m_values[half], broadcast);
        return quotient;
    }
};
325
// Per-lane mask produced by 16-bit comparisons (one register, eight lanes).
struct Int16CompFlag
{
    __m128i m_value;

    // Bitwise AND of two masks.
    inline Int16CompFlag operator&(const Int16CompFlag &other) const
    {
        Int16CompFlag both;
        both.m_value = _mm_and_si128(m_value, other.m_value);
        return both;
    }

    // Bitwise OR of two masks.
    inline Int16CompFlag operator|(const Int16CompFlag &other) const
    {
        Int16CompFlag either;
        either.m_value = _mm_or_si128(m_value, other.m_value);
        return either;
    }
};
344
// Per-lane mask produced by 32-bit integer comparisons (two registers).
struct Int32CompFlag
{
    __m128i m_values[2];

    // Bitwise AND of two masks.
    inline Int32CompFlag operator&(const Int32CompFlag &other) const
    {
        Int32CompFlag both;
        for (int half = 0; half < 2; half++)
            both.m_values[half] = _mm_and_si128(m_values[half], other.m_values[half]);
        return both;
    }

    // Bitwise OR of two masks.
    inline Int32CompFlag operator|(const Int32CompFlag &other) const
    {
        Int32CompFlag either;
        for (int half = 0; half < 2; half++)
            either.m_values[half] = _mm_or_si128(m_values[half], other.m_values[half]);
        return either;
    }
};
365
// Per-lane mask produced by float comparisons (two float registers).
struct FloatCompFlag
{
    __m128 m_values[2];

    // Bitwise AND of two masks.
    inline FloatCompFlag operator&(const FloatCompFlag &other) const
    {
        FloatCompFlag both;
        for (int half = 0; half < 2; half++)
            both.m_values[half] = _mm_and_ps(m_values[half], other.m_values[half]);
        return both;
    }

    // Bitwise OR of two masks.
    inline FloatCompFlag operator|(const FloatCompFlag &other) const
    {
        FloatCompFlag either;
        for (int half = 0; half < 2; half++)
            either.m_values[half] = _mm_or_ps(m_values[half], other.m_values[half]);
        return either;
    }
};
386
387 template<int TSubtype>
388 static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
389 {
390 VInt16<TSubtype> result;
391 result.m_value = _mm_add_epi16(a.m_value, b.m_value);
392 return result;
393 }
394
395 template<int TSubtype>
396 static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
397 {
398 VInt16<TSubtype> result;
399 result.m_value = _mm_sub_epi16(a.m_value, b.m_value);
400 return result;
401 }
402
403 static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b)
404 {
405 Float result;
406 for (int i = 0; i < 2; i++)
407 result.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], a.m_values[i]), _mm_andnot_ps(flag.m_values[i], b.m_values[i]));
408 return result;
409 }
410
411 template<int TSubtype>
412 static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
413 {
414 VInt16<TSubtype> result;
415 result.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value));
416 return result;
417 }
418
419 template<int TSubtype>
420 static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a)
421 {
422 VInt16<TSubtype> result;
423 result.m_value = _mm_and_si128(flag.m_value, a.m_value);
424 return result;
425 }
426
427 template<int TSubtype>
428 static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
429 {
430 dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));
431 }
432
433 template<int TSubtype>
434 static void ConditionalSet(VInt32<TSubtype> &dest, const Int16CompFlag &flag, const VInt32<TSubtype> &src)
435 {
436 __m128i lowFlags = _mm_unpacklo_epi16(flag.m_value, flag.m_value);
437 __m128i highFlags = _mm_unpackhi_epi16(flag.m_value, flag.m_value);
438 dest.m_values[0] = _mm_or_si128(_mm_andnot_si128(lowFlags, dest.m_values[0]), _mm_and_si128(lowFlags, src.m_values[0]));
439 dest.m_values[1] = _mm_or_si128(_mm_andnot_si128(highFlags, dest.m_values[1]), _mm_and_si128(highFlags, src.m_values[1]));
440 }
441
442 static void ConditionalSet(ParallelMath::Int16CompFlag &dest, const Int16CompFlag &flag, const ParallelMath::Int16CompFlag &src)
443 {
444 dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value));
445 }
446
447 static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v)
448 {
449 SInt16 result;
450 result.m_value = _mm_add_epi16(_mm_xor_si128(flag.m_value, v.m_value), _mm_srli_epi16(flag.m_value, 15));
451 return result;
452 }
453
454 template<int TSubtype>
455 static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src)
456 {
457 dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, dest.m_value), _mm_andnot_si128(flag.m_value, src.m_value));
458 }
459
460 static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
461 {
462 for (int i = 0; i < 2; i++)
463 dest.m_values[i] = _mm_or_ps(_mm_andnot_ps(flag.m_values[i], dest.m_values[i]), _mm_and_ps(flag.m_values[i], src.m_values[i]));
464 }
465
466 static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src)
467 {
468 for (int i = 0; i < 2; i++)
469 dest.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], dest.m_values[i]), _mm_andnot_ps(flag.m_values[i], src.m_values[i]));
470 }
471
472 static void MakeSafeDenominator(Float& v)
473 {
474 ConditionalSet(v, Equal(v, MakeFloatZero()), MakeFloat(1.0f));
475 }
476
477 static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision)
478 {
479 int lostBits = 16 - precision;
480 if (lostBits == 0)
481 return v;
482
483 SInt16 result;
484 result.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
485 return result;
486 }
487
488 static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision)
489 {
490 int lostBits = 16 - precision;
491 if (lostBits == 0)
492 return v;
493
494 UInt16 result;
495 result.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits);
496 return result;
497 }
498
499 static UInt16 Min(const UInt16 &a, const UInt16 &b)
500 {
501 __m128i bitFlip = _mm_set1_epi16(-32768);
502
503 UInt16 result;
504 result.m_value = _mm_xor_si128(_mm_min_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
505 return result;
506 }
507
508 static SInt16 Min(const SInt16 &a, const SInt16 &b)
509 {
510 SInt16 result;
511 result.m_value = _mm_min_epi16(a.m_value, b.m_value);
512 return result;
513 }
514
515 static UInt15 Min(const UInt15 &a, const UInt15 &b)
516 {
517 UInt15 result;
518 result.m_value = _mm_min_epi16(a.m_value, b.m_value);
519 return result;
520 }
521
522 static Float Min(const Float &a, const Float &b)
523 {
524 Float result;
525 for (int i = 0; i < 2; i++)
526 result.m_values[i] = _mm_min_ps(a.m_values[i], b.m_values[i]);
527 return result;
528 }
529
530 static UInt16 Max(const UInt16 &a, const UInt16 &b)
531 {
532 __m128i bitFlip = _mm_set1_epi16(-32768);
533
534 UInt16 result;
535 result.m_value = _mm_xor_si128(_mm_max_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip);
536 return result;
537 }
538
539 static SInt16 Max(const SInt16 &a, const SInt16 &b)
540 {
541 SInt16 result;
542 result.m_value = _mm_max_epi16(a.m_value, b.m_value);
543 return result;
544 }
545
546 static UInt15 Max(const UInt15 &a, const UInt15 &b)
547 {
548 UInt15 result;
549 result.m_value = _mm_max_epi16(a.m_value, b.m_value);
550 return result;
551 }
552
553 static Float Max(const Float &a, const Float &b)
554 {
555 Float result;
556 for (int i = 0; i < 2; i++)
557 result.m_values[i] = _mm_max_ps(a.m_values[i], b.m_values[i]);
558 return result;
559 }
560
561 static Float Clamp(const Float &v, float min, float max)
562 {
563 Float result;
564 for (int i = 0; i < 2; i++)
565 result.m_values[i] = _mm_max_ps(_mm_min_ps(v.m_values[i], _mm_set1_ps(max)), _mm_set1_ps(min));
566 return result;
567 }
568
569 static Float Reciprocal(const Float &v)
570 {
571 Float result;
572 for (int i = 0; i < 2; i++)
573 result.m_values[i] = _mm_rcp_ps(v.m_values[i]);
574 return result;
575 }
576
577 static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut)
578 {
579 int16_t values[8];
580 for (int i = 0; i < 8; i++)
581 values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
582
583 chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
584 }
585
586 static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut)
587 {
588 int16_t values[8];
589 for (int i = 0; i < 8; i++)
590 values[i] = inputBlocks[i].m_pixels[pxOffset][channel];
591
592 chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);
593 }
594
595 static Float MakeFloat(float v)
596 {
597 Float f;
598 f.m_values[0] = f.m_values[1] = _mm_set1_ps(v);
599 return f;
600 }
601
602 static Float MakeFloatZero()
603 {
604 Float f;
605 f.m_values[0] = f.m_values[1] = _mm_setzero_ps();
606 return f;
607 }
608
609 static UInt16 MakeUInt16(uint16_t v)
610 {
611 UInt16 result;
612 result.m_value = _mm_set1_epi16(static_cast<short>(v));
613 return result;
614 }
615
616 static SInt16 MakeSInt16(int16_t v)
617 {
618 SInt16 result;
619 result.m_value = _mm_set1_epi16(static_cast<short>(v));
620 return result;
621 }
622
623 static AInt16 MakeAInt16(int16_t v)
624 {
625 AInt16 result;
626 result.m_value = _mm_set1_epi16(static_cast<short>(v));
627 return result;
628 }
629
630 static UInt15 MakeUInt15(uint16_t v)
631 {
632 UInt15 result;
633 result.m_value = _mm_set1_epi16(static_cast<short>(v));
634 return result;
635 }
636
637 static SInt32 MakeSInt32(int32_t v)
638 {
639 SInt32 result;
640 result.m_values[0] = _mm_set1_epi32(v);
641 result.m_values[1] = _mm_set1_epi32(v);
642 return result;
643 }
644
645 static UInt31 MakeUInt31(uint32_t v)
646 {
647 UInt31 result;
648 result.m_values[0] = _mm_set1_epi32(v);
649 result.m_values[1] = _mm_set1_epi32(v);
650 return result;
651 }
652
653 static uint16_t Extract(const UInt16 &v, int offset)
654 {
655 return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
656 }
657
658 static int16_t Extract(const SInt16 &v, int offset)
659 {
660 return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
661 }
662
663 static uint16_t Extract(const UInt15 &v, int offset)
664 {
665 return reinterpret_cast<const uint16_t*>(&v.m_value)[offset];
666 }
667
668 static int16_t Extract(const AInt16 &v, int offset)
669 {
670 return reinterpret_cast<const int16_t*>(&v.m_value)[offset];
671 }
672
673 static int32_t Extract(const SInt32 &v, int offset)
674 {
675 return reinterpret_cast<const int32_t*>(&v.m_values[offset >> 2])[offset & 3];
676 }
677
678 static float Extract(const Float &v, int offset)
679 {
680 return reinterpret_cast<const float*>(&v.m_values[offset >> 2])[offset & 3];
681 }
682
683 static bool Extract(const ParallelMath::Int16CompFlag &v, int offset)
684 {
685 return reinterpret_cast<const int16_t*>(&v.m_value)[offset] != 0;
686 }
687
688 static void PutUInt16(UInt16 &dest, int offset, uint16_t v)
689 {
690 reinterpret_cast<uint16_t*>(&dest)[offset] = v;
691 }
692
693 static void PutUInt15(UInt15 &dest, int offset, uint16_t v)
694 {
695 reinterpret_cast<uint16_t*>(&dest)[offset] = v;
696 }
697
698 static void PutSInt16(SInt16 &dest, int offset, int16_t v)
699 {
700 reinterpret_cast<int16_t*>(&dest)[offset] = v;
701 }
702
703 static float ExtractFloat(const Float& v, int offset)
704 {
705 return reinterpret_cast<const float*>(&v)[offset];
706 }
707
708 static void PutFloat(Float &dest, int offset, float v)
709 {
710 reinterpret_cast<float*>(&dest)[offset] = v;
711 }
712
713 static void PutBoolInt16(Int16CompFlag &dest, int offset, bool v)
714 {
715 reinterpret_cast<int16_t*>(&dest)[offset] = v ? -1 : 0;
716 }
717
718 static Int32CompFlag Less(const UInt31 &a, const UInt31 &b)
719 {
720 Int32CompFlag result;
721 result.m_values[0] = _mm_cmplt_epi32(a.m_values[0], b.m_values[0]);
722 result.m_values[1] = _mm_cmplt_epi32(a.m_values[1], b.m_values[1]);
723 return result;
724 }
725
726 static Int16CompFlag Less(const SInt16 &a, const SInt16 &b)
727 {
728 Int16CompFlag result;
729 result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
730 return result;
731 }
732
733 static Int16CompFlag Less(const UInt15 &a, const UInt15 &b)
734 {
735 Int16CompFlag result;
736 result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
737 return result;
738 }
739
740 static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b)
741 {
742 Int16CompFlag result;
743 result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value);
744 return result;
745 }
746
747 static FloatCompFlag Less(const Float &a, const Float &b)
748 {
749 FloatCompFlag result;
750 for (int i = 0; i < 2; i++)
751 result.m_values[i] = _mm_cmplt_ps(a.m_values[i], b.m_values[i]);
752 return result;
753 }
754
755 static FloatCompFlag LessOrEqual(const Float &a, const Float &b)
756 {
757 FloatCompFlag result;
758 for (int i = 0; i < 2; i++)
759 result.m_values[i] = _mm_cmple_ps(a.m_values[i], b.m_values[i]);
760 return result;
761 }
762
763 template<int TSubtype>
764 static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b)
765 {
766 Int16CompFlag result;
767 result.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value);
768 return result;
769 }
770
771 static FloatCompFlag Equal(const Float &a, const Float &b)
772 {
773 FloatCompFlag result;
774 for (int i = 0; i < 2; i++)
775 result.m_values[i] = _mm_cmpeq_ps(a.m_values[i], b.m_values[i]);
776 return result;
777 }
778
779 static Int16CompFlag Equal(const Int16CompFlag &a, const Int16CompFlag &b)
780 {
781 Int16CompFlag notResult;
782 notResult.m_value = _mm_xor_si128(a.m_value, b.m_value);
783 return Not(notResult);
784 }
785
786 static Float ToFloat(const UInt16 &v)
787 {
788 Float result;
789 result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
790 result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
791 return result;
792 }
793
794 static UInt31 ToUInt31(const UInt16 &v)
795 {
796 UInt31 result;
797 result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
798 result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
799 return result;
800 }
801
802 static SInt32 ToInt32(const UInt16 &v)
803 {
804 SInt32 result;
805 result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
806 result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
807 return result;
808 }
809
810 static SInt32 ToInt32(const UInt15 &v)
811 {
812 SInt32 result;
813 result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128());
814 result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128());
815 return result;
816 }
817
818 static SInt32 ToInt32(const SInt16 &v)
819 {
820 SInt32 result;
821 result.m_values[0] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16);
822 result.m_values[1] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16);
823 return result;
824 }
825
826 static Float ToFloat(const SInt16 &v)
827 {
828 Float result;
829 result.m_values[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16));
830 result.m_values[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16));
831 return result;
832 }
833
834 static Float ToFloat(const UInt15 &v)
835 {
836 Float result;
837 result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()));
838 result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()));
839 return result;
840 }
841
842 static Float ToFloat(const UInt31 &v)
843 {
844 Float result;
845 result.m_values[0] = _mm_cvtepi32_ps(v.m_values[0]);
846 result.m_values[1] = _mm_cvtepi32_ps(v.m_values[1]);
847 return result;
848 }
849
850 static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v)
851 {
852 __m128i lo = _mm_castps_si128(v.m_values[0]);
853 __m128i hi = _mm_castps_si128(v.m_values[1]);
854
855 Int16CompFlag result;
856 result.m_value = _mm_packs_epi32(lo, hi);
857 return result;
858 }
859
860 static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v)
861 {
862 __m128i lo = _mm_unpacklo_epi16(v.m_value, v.m_value);
863 __m128i hi = _mm_unpackhi_epi16(v.m_value, v.m_value);
864
865 FloatCompFlag result;
866 result.m_values[0] = _mm_castsi128_ps(lo);
867 result.m_values[1] = _mm_castsi128_ps(hi);
868 return result;
869 }
870
871 static Int16CompFlag Int32FlagToInt16(const Int32CompFlag &v)
872 {
873 __m128i lo = v.m_values[0];
874 __m128i hi = v.m_values[1];
875
876 Int16CompFlag result;
877 result.m_value = _mm_packs_epi32(lo, hi);
878 return result;
879 }
880
881 static Int16CompFlag MakeBoolInt16(bool b)
882 {
883 Int16CompFlag result;
884 if (b)
885 result.m_value = _mm_set1_epi16(-1);
886 else
887 result.m_value = _mm_setzero_si128();
888 return result;
889 }
890
891 static FloatCompFlag MakeBoolFloat(bool b)
892 {
893 FloatCompFlag result;
894 if (b)
895 result.m_values[0] = result.m_values[1] = _mm_castsi128_ps(_mm_set1_epi32(-1));
896 else
897 result.m_values[0] = result.m_values[1] = _mm_setzero_ps();
898 return result;
899 }
900
901 static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b)
902 {
903 Int16CompFlag result;
904 result.m_value = _mm_andnot_si128(b.m_value, a.m_value);
905 return result;
906 }
907
908 static Int16CompFlag Not(const Int16CompFlag &b)
909 {
910 Int16CompFlag result;
911 result.m_value = _mm_xor_si128(b.m_value, _mm_set1_epi32(-1));
912 return result;
913 }
914
915 static Int32CompFlag Not(const Int32CompFlag &b)
916 {
917 Int32CompFlag result;
918 result.m_values[0] = _mm_xor_si128(b.m_values[0], _mm_set1_epi32(-1));
919 result.m_values[1] = _mm_xor_si128(b.m_values[1], _mm_set1_epi32(-1));
920 return result;
921 }
922
923 static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/)
924 {
925 __m128i lo = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], _mm_set1_ps(-32768)));
926 __m128i hi = _mm_cvtps_epi32(_mm_add_ps(v.m_values[1], _mm_set1_ps(-32768)));
927
928 __m128i packed = _mm_packs_epi32(lo, hi);
929
930 UInt16 result;
931 result.m_value = _mm_xor_si128(packed, _mm_set1_epi16(-32768));
932 return result;
933 }
934
935 static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/)
936 {
937 __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
938 __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
939
940 __m128i packed = _mm_packs_epi32(lo, hi);
941
942 UInt15 result;
943 result.m_value = _mm_packs_epi32(lo, hi);
944 return result;
945 }
946
947 static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/)
948 {
949 __m128i lo = _mm_cvtps_epi32(v.m_values[0]);
950 __m128i hi = _mm_cvtps_epi32(v.m_values[1]);
951
952 __m128i packed = _mm_packs_epi32(lo, hi);
953
954 SInt16 result;
955 result.m_value = _mm_packs_epi32(lo, hi);
956 return result;
957 }
958
959 static Float Sqrt(const Float &f)
960 {
961 Float result;
962 for (int i = 0; i < 2; i++)
963 result.m_values[i] = _mm_sqrt_ps(f.m_values[i]);
964 return result;
965 }
966
967 static UInt16 Abs(const SInt16 &a)
968 {
969 __m128i signBitsXor = _mm_srai_epi16(a.m_value, 15);
970 __m128i signBitsAdd = _mm_srli_epi16(a.m_value, 15);
971
972 UInt16 result;
973 result.m_value = _mm_add_epi16(_mm_xor_si128(a.m_value, signBitsXor), signBitsAdd);
974 return result;
975 }
976
977 static Float Abs(const Float& a)
978 {
979 __m128 invMask = _mm_set1_ps(-0.0f);
980
981 Float result;
982 result.m_values[0] = _mm_andnot_ps(invMask, a.m_values[0]);
983 result.m_values[1] = _mm_andnot_ps(invMask, a.m_values[1]);
984 return result;
985 }
986
987 static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b)
988 {
989 __m128i diff = _mm_sub_epi16(a.m_value, b.m_value);
990
991 UInt16 result;
992 result.m_value = _mm_mullo_epi16(diff, diff);
993 return result;
994 }
995
996 static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b)
997 {
998 __m128i diffU = _mm_sub_epi16(_mm_max_epi16(a.m_value, b.m_value), _mm_min_epi16(a.m_value, b.m_value));
999
1000 __m128i mulHi = _mm_mulhi_epu16(diffU, diffU);
1001 __m128i mulLo = _mm_mullo_epi16(diffU, diffU);
1002 __m128i sqDiffHi = _mm_unpackhi_epi16(mulLo, mulHi);
1003 __m128i sqDiffLo = _mm_unpacklo_epi16(mulLo, mulHi);
1004
1005 Float result;
1006 result.m_values[0] = _mm_cvtepi32_ps(sqDiffLo);
1007 result.m_values[1] = _mm_cvtepi32_ps(sqDiffHi);
1008
1009 return result;
1010 }
1011
1012 static Float TwosCLHalfToFloat(const SInt16 &v)
1013 {
1014 __m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, 15)), _mm_srli_epi16(v.m_value, 15));
1015
1016 __m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768));
1017 __m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(0x03ff));
1018 __m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(0x7c00));
1019
1020 __m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128());
1021
1022 // Convert exponent to high-bits
1023 exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336));
1024
1025 __m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(14336)));
1026
1027 __m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3)));
1028 __m128i lowBits = _mm_slli_epi16(mantissa, 13);
1029
1030 __m128i flow = _mm_unpacklo_epi16(lowBits, highBits);
1031 __m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits);
1032
1033 __m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
1034 __m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh);
1035
1036 Float result;
1037 result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow));
1038 result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh));
1039
1040 return result;
1041 }
1042
1043 static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
1044 {
1045 Float fa = TwosCLHalfToFloat(a);
1046
1047 Float diff = fa - b;
1048 return diff * diff;
1049 }
1050
1051 static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
1052 {
1053 Float fa = TwosCLHalfToFloat(a);
1054 Float fb = TwosCLHalfToFloat(b);
1055
1056 Float diff = fa - fb;
1057 return diff * diff;
1058 }
1059
1060 static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
1061 {
1062 Float fa = TwosCLHalfToFloat(a) * aWeight;
1063
1064 Float diff = fa - b;
1065 return diff * diff;
1066 }
1067
1068 static UInt16 RightShift(const UInt16 &v, int bits)
1069 {
1070 UInt16 result;
1071 result.m_value = _mm_srli_epi16(v.m_value, bits);
1072 return result;
1073 }
1074
1075 static UInt31 RightShift(const UInt31 &v, int bits)
1076 {
1077 UInt31 result;
1078 result.m_values[0] = _mm_srli_epi32(v.m_values[0], bits);
1079 result.m_values[1] = _mm_srli_epi32(v.m_values[1], bits);
1080 return result;
1081 }
1082
1083 static SInt16 RightShift(const SInt16 &v, int bits)
1084 {
1085 SInt16 result;
1086 result.m_value = _mm_srai_epi16(v.m_value, bits);
1087 return result;
1088 }
1089
1090 static UInt15 RightShift(const UInt15 &v, int bits)
1091 {
1092 UInt15 result;
1093 result.m_value = _mm_srli_epi16(v.m_value, bits);
1094 return result;
1095 }
1096
1097 static SInt32 RightShift(const SInt32 &v, int bits)
1098 {
1099 SInt32 result;
1100 result.m_values[0] = _mm_srai_epi32(v.m_values[0], bits);
1101 result.m_values[1] = _mm_srai_epi32(v.m_values[1], bits);
1102 return result;
1103 }
1104
1105 static SInt16 ToSInt16(const SInt32 &v)
1106 {
1107 SInt16 result;
1108 result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
1109 return result;
1110 }
1111
1112 static SInt16 ToSInt16(const UInt16 &v)
1113 {
1114 SInt16 result;
1115 result.m_value = v.m_value;
1116 return result;
1117 }
1118
1119 static SInt16 ToSInt16(const UInt15 &v)
1120 {
1121 SInt16 result;
1122 result.m_value = v.m_value;
1123 return result;
1124 }
1125
1126 static UInt16 ToUInt16(const UInt32 &v)
1127 {
1128 __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
1129 __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
1130
1131 UInt16 result;
1132 result.m_value = _mm_packs_epi32(low, high);
1133 return result;
1134 }
1135
1136 static UInt16 ToUInt16(const UInt31 &v)
1137 {
1138 __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16);
1139 __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16);
1140
1141 UInt16 result;
1142 result.m_value = _mm_packs_epi32(low, high);
1143 return result;
1144 }
1145
1146 static UInt15 ToUInt15(const UInt31 &v)
1147 {
1148 UInt15 result;
1149 result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]);
1150 return result;
1151 }
1152
1153 static UInt15 ToUInt15(const SInt16 &v)
1154 {
1155 UInt15 result;
1156 result.m_value = v.m_value;
1157 return result;
1158 }
1159
1160 static UInt15 ToUInt15(const UInt16 &v)
1161 {
1162 UInt15 result;
1163 result.m_value = v.m_value;
1164 return result;
1165 }
1166
1167 static SInt32 XMultiply(const SInt16 &a, const SInt16 &b)
1168 {
1169 __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
1170 __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
1171
1172 SInt32 result;
1173 result.m_values[0] = _mm_unpacklo_epi16(low, high);
1174 result.m_values[1] = _mm_unpackhi_epi16(low, high);
1175 return result;
1176 }
1177
1178 static SInt32 XMultiply(const SInt16 &a, const UInt15 &b)
1179 {
1180 __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value);
1181 __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
1182
1183 SInt32 result;
1184 result.m_values[0] = _mm_unpacklo_epi16(low, high);
1185 result.m_values[1] = _mm_unpackhi_epi16(low, high);
1186 return result;
1187 }
1188
1189 static SInt32 XMultiply(const UInt15 &a, const SInt16 &b)
1190 {
1191 return XMultiply(b, a);
1192 }
1193
1194 static UInt32 XMultiply(const UInt16 &a, const UInt16 &b)
1195 {
1196 __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
1197 __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
1198
1199 UInt32 result;
1200 result.m_values[0] = _mm_unpacklo_epi16(low, high);
1201 result.m_values[1] = _mm_unpackhi_epi16(low, high);
1202 return result;
1203 }
1204
1205 static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b)
1206 {
1207 UInt16 result;
1208 result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
1209 return result;
1210 }
1211
1212 static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b)
1213 {
1214 UInt16 result;
1215 result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
1216 return result;
1217 }
1218
1219 static SInt16 CompactMultiply(const SInt16 &a, const UInt15 &b)
1220 {
1221 SInt16 result;
1222 result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
1223 return result;
1224 }
1225
1226 static SInt16 CompactMultiply(const SInt16 &a, const SInt16 &b)
1227 {
1228 SInt16 result;
1229 result.m_value = _mm_mullo_epi16(a.m_value, b.m_value);
1230 return result;
1231 }
1232
1233 static UInt31 XMultiply(const UInt15 &a, const UInt15 &b)
1234 {
1235 __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
1236 __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
1237
1238 UInt31 result;
1239 result.m_values[0] = _mm_unpacklo_epi16(low, high);
1240 result.m_values[1] = _mm_unpackhi_epi16(low, high);
1241 return result;
1242 }
1243
1244 static UInt31 XMultiply(const UInt16 &a, const UInt15 &b)
1245 {
1246 __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value);
1247 __m128i low = _mm_mullo_epi16(a.m_value, b.m_value);
1248
1249 UInt31 result;
1250 result.m_values[0] = _mm_unpacklo_epi16(low, high);
1251 result.m_values[1] = _mm_unpackhi_epi16(low, high);
1252 return result;
1253 }
1254
1255 static UInt31 XMultiply(const UInt15 &a, const UInt16 &b)
1256 {
1257 return XMultiply(b, a);
1258 }
1259
1260 static bool AnySet(const Int16CompFlag &v)
1261 {
1262 return _mm_movemask_epi8(v.m_value) != 0;
1263 }
1264
1265 static bool AllSet(const Int16CompFlag &v)
1266 {
1267 return _mm_movemask_epi8(v.m_value) == 0xffff;
1268 }
1269
1270 static bool AnySet(const FloatCompFlag &v)
1271 {
1272 return _mm_movemask_ps(v.m_values[0]) != 0 || _mm_movemask_ps(v.m_values[1]) != 0;
1273 }
1274
1275 static bool AllSet(const FloatCompFlag &v)
1276 {
1277 return _mm_movemask_ps(v.m_values[0]) == 0xf && _mm_movemask_ps(v.m_values[1]) == 0xf;
1278 }
1279 };
1280
1281#else
1282 // Scalar version
1283 struct ParallelMath
1284 {
1285 struct RoundTowardZeroForScope
1286 {
1287 };
1288
1289 struct RoundTowardNearestForScope
1290 {
1291 };
1292
1293 struct RoundUpForScope
1294 {
1295 };
1296
1297 struct RoundDownForScope
1298 {
1299 };
1300
1301 static const int ParallelSize = 1;
1302
1303 enum Int16Subtype
1304 {
1305 IntSubtype_Signed,
1306 IntSubtype_UnsignedFull,
1307 IntSubtype_UnsignedTruncated,
1308 IntSubtype_Abstract,
1309 };
1310
1311 typedef int32_t SInt16;
1312 typedef int32_t UInt15;
1313 typedef int32_t UInt16;
1314 typedef int32_t AInt16;
1315
1316 typedef int32_t SInt32;
1317 typedef int32_t UInt31;
1318 typedef int32_t UInt32;
1319 typedef int32_t AInt32;
1320
1321 typedef int32_t ScalarUInt16;
1322 typedef int32_t ScalarSInt16;
1323
1324 typedef float Float;
1325
1326 template<class TTargetType>
1327 struct LosslessCast
1328 {
1329 static const int32_t& Cast(const int32_t &src)
1330 {
1331 return src;
1332 }
1333 };
1334
1335 typedef bool Int16CompFlag;
1336 typedef bool FloatCompFlag;
1337
1338 static int32_t AbstractAdd(const int32_t &a, const int32_t &b)
1339 {
1340 return a + b;
1341 }
1342
1343 static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)
1344 {
1345 return a - b;
1346 }
1347
1348 static float Select(bool flag, float a, float b)
1349 {
1350 return flag ? a : b;
1351 }
1352
1353 static int32_t Select(bool flag, int32_t a, int32_t b)
1354 {
1355 return flag ? a : b;
1356 }
1357
1358 static int32_t SelectOrZero(bool flag, int32_t a)
1359 {
1360 return flag ? a : 0;
1361 }
1362
1363 static void ConditionalSet(int32_t& dest, bool flag, int32_t src)
1364 {
1365 if (flag)
1366 dest = src;
1367 }
1368
1369 static void ConditionalSet(bool& dest, bool flag, bool src)
1370 {
1371 if (flag)
1372 dest = src;
1373 }
1374
1375 static int32_t ConditionalNegate(bool flag, int32_t v)
1376 {
1377 return (flag) ? -v : v;
1378 }
1379
1380 static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)
1381 {
1382 if (!flag)
1383 dest = src;
1384 }
1385
1386 static void ConditionalSet(float& dest, bool flag, float src)
1387 {
1388 if (flag)
1389 dest = src;
1390 }
1391
1392 static void NotConditionalSet(float& dest, bool flag, float src)
1393 {
1394 if (!flag)
1395 dest = src;
1396 }
1397
1398 static void MakeSafeDenominator(float& v)
1399 {
1400 if (v == 0.0f)
1401 v = 1.0f;
1402 }
1403
1404 static int32_t SignedRightShift(int32_t v, int bits)
1405 {
1406 return v >> bits;
1407 }
1408
1409 static int32_t TruncateToPrecisionSigned(int32_t v, int precision)
1410 {
1411 v = (v << (32 - precision)) & 0xffffffff;
1412 return SignedRightShift(v, 32 - precision);
1413 }
1414
1415 static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)
1416 {
1417 return v & ((1 << precision) - 1);
1418 }
1419
1420 static int32_t Min(int32_t a, int32_t b)
1421 {
1422 if (a < b)
1423 return a;
1424 return b;
1425 }
1426
1427 static float Min(float a, float b)
1428 {
1429 if (a < b)
1430 return a;
1431 return b;
1432 }
1433
1434 static int32_t Max(int32_t a, int32_t b)
1435 {
1436 if (a > b)
1437 return a;
1438 return b;
1439 }
1440
1441 static float Max(float a, float b)
1442 {
1443 if (a > b)
1444 return a;
1445 return b;
1446 }
1447
1448 static float Abs(float a)
1449 {
1450 return fabsf(a);
1451 }
1452
1453 static int32_t Abs(int32_t a)
1454 {
1455 if (a < 0)
1456 return -a;
1457 return a;
1458 }
1459
1460 static float Clamp(float v, float min, float max)
1461 {
1462 if (v < min)
1463 return min;
1464 if (v > max)
1465 return max;
1466 return v;
1467 }
1468
1469 static float Reciprocal(float v)
1470 {
1471 return 1.0f / v;
1472 }
1473
1474 static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut)
1475 {
1476 chOut = inputBlocks[0].m_pixels[pxOffset][channel];
1477 }
1478
1479 static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut)
1480 {
1481 chOut = inputBlocks[0].m_pixels[pxOffset][channel];
1482 }
1483
1484 static float MakeFloat(float v)
1485 {
1486 return v;
1487 }
1488
1489 static float MakeFloatZero()
1490 {
1491 return 0.0f;
1492 }
1493
1494 static int32_t MakeUInt16(uint16_t v)
1495 {
1496 return v;
1497 }
1498
1499 static int32_t MakeSInt16(int16_t v)
1500 {
1501 return v;
1502 }
1503
1504 static int32_t MakeAInt16(int16_t v)
1505 {
1506 return v;
1507 }
1508
1509 static int32_t MakeUInt15(uint16_t v)
1510 {
1511 return v;
1512 }
1513
1514 static int32_t MakeSInt32(int32_t v)
1515 {
1516 return v;
1517 }
1518
1519 static int32_t MakeUInt31(int32_t v)
1520 {
1521 return v;
1522 }
1523
1524 static int32_t Extract(int32_t v, int offset)
1525 {
1526 UNREFERENCED_PARAMETER(offset);
1527 return v;
1528 }
1529
1530 static bool Extract(bool v, int offset)
1531 {
1532 UNREFERENCED_PARAMETER(offset);
1533 return v;
1534 }
1535
1536 static float Extract(float v, int offset)
1537 {
1538 UNREFERENCED_PARAMETER(offset);
1539 return v;
1540 }
1541
1542 static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
1543 {
1544 UNREFERENCED_PARAMETER(offset);
1545 dest = v;
1546 }
1547
1548 static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v)
1549 {
1550 UNREFERENCED_PARAMETER(offset);
1551 dest = v;
1552 }
1553
1554 static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v)
1555 {
1556 UNREFERENCED_PARAMETER(offset);
1557 dest = v;
1558 }
1559
1560 static float ExtractFloat(float v, int offset)
1561 {
1562 UNREFERENCED_PARAMETER(offset);
1563 return v;
1564 }
1565
1566 static void PutFloat(float &dest, int offset, float v)
1567 {
1568 UNREFERENCED_PARAMETER(offset);
1569 dest = v;
1570 }
1571
1572 static void PutBoolInt16(bool &dest, int offset, bool v)
1573 {
1574 UNREFERENCED_PARAMETER(offset);
1575 dest = v;
1576 }
1577
1578 static bool Less(int32_t a, int32_t b)
1579 {
1580 return a < b;
1581 }
1582
1583 static bool Less(float a, float b)
1584 {
1585 return a < b;
1586 }
1587
1588 static bool LessOrEqual(int32_t a, int32_t b)
1589 {
1590 return a < b;
1591 }
1592
1593 static bool LessOrEqual(float a, float b)
1594 {
1595 return a < b;
1596 }
1597
1598 static bool Equal(int32_t a, int32_t b)
1599 {
1600 return a == b;
1601 }
1602
1603 static bool Equal(float a, float b)
1604 {
1605 return a == b;
1606 }
1607
1608 static float ToFloat(int32_t v)
1609 {
1610 return static_cast<float>(v);
1611 }
1612
1613 static int32_t ToUInt31(int32_t v)
1614 {
1615 return v;
1616 }
1617
1618 static int32_t ToInt32(int32_t v)
1619 {
1620 return v;
1621 }
1622
1623 static bool FloatFlagToInt16(bool v)
1624 {
1625 return v;
1626 }
1627
1628 static bool Int32FlagToInt16(bool v)
1629 {
1630 return v;
1631 }
1632
1633 static bool Int16FlagToFloat(bool v)
1634 {
1635 return v;
1636 }
1637
1638 static bool MakeBoolInt16(bool b)
1639 {
1640 return b;
1641 }
1642
1643 static bool MakeBoolFloat(bool b)
1644 {
1645 return b;
1646 }
1647
1648 static bool AndNot(bool a, bool b)
1649 {
1650 return a && !b;
1651 }
1652
1653 static bool Not(bool b)
1654 {
1655 return !b;
1656 }
1657
1658 static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz)
1659 {
1660 UNREFERENCED_PARAMETER(rtz);
1661 return static_cast<int>(v);
1662 }
1663
1664 static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru)
1665 {
1666 UNREFERENCED_PARAMETER(ru);
1667 return static_cast<int>(ceilf(v));
1668 }
1669
1670 static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd)
1671 {
1672 UNREFERENCED_PARAMETER(rd);
1673 return static_cast<int>(floorf(v));
1674 }
1675
1676 static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn)
1677 {
1678 UNREFERENCED_PARAMETER(rtn);
1679 return static_cast<int>(floorf(v + 0.5f));
1680 }
1681
1682 template<class TRoundMode>
1683 static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)
1684 {
1685 return RoundAndConvertToInt(v, roundingMode);
1686 }
1687
1688 template<class TRoundMode>
1689 static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)
1690 {
1691 return RoundAndConvertToInt(v, roundingMode);
1692 }
1693
1694 template<class TRoundMode>
1695 static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)
1696 {
1697 return RoundAndConvertToInt(v, roundingMode);
1698 }
1699
1700 static float Sqrt(float f)
1701 {
1702 return sqrtf(f);
1703 }
1704
1705 static int32_t SqDiffUInt8(int32_t a, int32_t b)
1706 {
1707 int32_t delta = a - b;
1708 return delta * delta;
1709 }
1710
1711 static int32_t SqDiffInt16(int32_t a, int32_t b)
1712 {
1713 int32_t delta = a - b;
1714 return delta * delta;
1715 }
1716
1717 static int32_t SqDiffSInt16(int32_t a, int32_t b)
1718 {
1719 int32_t delta = a - b;
1720 return delta * delta;
1721 }
1722
1723 static float TwosCLHalfToFloat(int32_t v)
1724 {
1725 int32_t absV = (v < 0) ? -v : v;
1726
1727 int32_t signBits = (absV & -32768);
1728 int32_t mantissa = (absV & 0x03ff);
1729 int32_t exponent = (absV & 0x7c00);
1730
1731 bool isDenormal = (exponent == 0);
1732
1733 // Convert exponent to high-bits
1734 exponent = (exponent >> 3) + 14336;
1735
1736 int32_t denormalCorrection = (isDenormal ? (signBits | 14336) : 0) << 16;
1737
1738 int32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13);
1739
1740 float f, correction;
1741 memcpy(&f, &fBits, 4);
1742 memcpy(&correction, &denormalCorrection, 4);
1743
1744 return f - correction;
1745 }
1746
1747 static Float SqDiff2CLFloat(const SInt16 &a, const Float &b)
1748 {
1749 Float fa = TwosCLHalfToFloat(a);
1750
1751 Float diff = fa - b;
1752 return diff * diff;
1753 }
1754
1755 static Float SqDiff2CL(const SInt16 &a, const SInt16 &b)
1756 {
1757 Float fa = TwosCLHalfToFloat(a);
1758 Float fb = TwosCLHalfToFloat(b);
1759
1760 Float diff = fa - fb;
1761 return diff * diff;
1762 }
1763
1764 static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b)
1765 {
1766 Float fa = TwosCLHalfToFloat(a) * aWeight;
1767
1768 Float diff = fa - b;
1769 return diff * diff;
1770 }
1771
1772 static int32_t RightShift(int32_t v, int bits)
1773 {
1774 return SignedRightShift(v, bits);
1775 }
1776
1777 static int32_t ToSInt16(int32_t v)
1778 {
1779 return v;
1780 }
1781
1782 static int32_t ToUInt16(int32_t v)
1783 {
1784 return v;
1785 }
1786
1787 static int32_t ToUInt15(int32_t v)
1788 {
1789 return v;
1790 }
1791
1792 static int32_t XMultiply(int32_t a, int32_t b)
1793 {
1794 return a * b;
1795 }
1796
1797 static int32_t CompactMultiply(int32_t a, int32_t b)
1798 {
1799 return a * b;
1800 }
1801
1802 static bool AnySet(bool v)
1803 {
1804 return v;
1805 }
1806
1807 static bool AllSet(bool v)
1808 {
1809 return v;
1810 }
1811 };
1812
1813#endif
1814}
1815
1816#endif
1817