1 | /* |
2 | Convection Texture Tools |
3 | Copyright (c) 2018-2019 Eric Lasota |
4 | |
5 | Permission is hereby granted, free of charge, to any person obtaining |
6 | a copy of this software and associated documentation files (the |
7 | "Software"), to deal in the Software without restriction, including |
8 | without limitation the rights to use, copy, modify, merge, publish, |
9 | distribute, sublicense, and/or sell copies of the Software, and to |
10 | permit persons to whom the Software is furnished to do so, subject |
11 | to the following conditions: |
12 | |
13 | The above copyright notice and this permission notice shall be included |
14 | in all copies or substantial portions of the Software. |
15 | |
16 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
17 | OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
18 | MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. |
19 | IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY |
20 | CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
21 | TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
22 | SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
23 | |
24 | */ |
25 | #pragma once |
26 | #ifndef __CVTT_PARALLELMATH_H__ |
27 | #define __CVTT_PARALLELMATH_H__ |
28 | |
29 | #include "ConvectionKernels.h" |
30 | #include "ConvectionKernels_Config.h" |
31 | |
32 | #ifdef CVTT_USE_SSE2 |
33 | #include <emmintrin.h> |
34 | #endif |
35 | |
36 | #include <float.h> |
37 | #include <assert.h> |
38 | #include <string.h> |
39 | #include <algorithm> |
40 | #include <math.h> |
41 | |
// Marks a parameter as intentionally unused so the compiler does not warn.
// Guarded because <winnt.h> defines a macro with the same name; an
// unconditional #define triggers a macro-redefinition warning when Windows
// headers are included first. The argument is parenthesised for hygiene.
#ifndef UNREFERENCED_PARAMETER
#define UNREFERENCED_PARAMETER(n) ((void)(n))
#endif
43 | |
44 | // Parallel math implementation |
45 | // |
46 | // After preprocessor defs are handled, what this should do is expose the following types: |
47 | // SInt16 - Signed 16-bit integer |
48 | // UInt16 - Unsigned 16-bit integer |
49 | // UInt15 - Unsigned 15-bit integer |
50 | // SInt32 - Signed 32-bit integer |
51 | // UInt31 - Unsigned 31-bit integer |
52 | // AInt16 - 16-bit integer of unknown signedness (only used for storage) |
53 | // Int16CompFlag - Comparison flags from comparing 16-bit integers |
54 | // Int32CompFlag - Comparison flags from comparing 32-bit integers |
55 | // FloatCompFlag - Comparison flags from comparing 32-bit floats |
56 | // |
57 | // The reason for these distinctions is that depending on the instruction set, signed or unsigned versions of certain ops |
58 | // (particularly max, min, compares, and right shift) may not be available. In cases where ops are not available, it's |
59 | // necessary to do high bit manipulations to accomplish the operation with 16-bit numbers. The 15-bit and 31-bit uint types |
60 | // can elide the bit flips if unsigned versions are not available. |
61 | |
62 | namespace cvtt |
63 | { |
64 | #ifdef CVTT_USE_SSE2 |
65 | // SSE2 version |
66 | struct ParallelMath |
67 | { |
68 | typedef uint16_t ScalarUInt16; |
69 | typedef int16_t ScalarSInt16; |
70 | |
// Scope guard for the SSE rounding mode: the constructor saves the current
// MXCSR value and installs TRoundingMode (one of the _MM_ROUND_* constants)
// in its rounding-control bits; the destructor writes the saved value back.
template<unsigned int TRoundingMode>
struct RoundForScope
{
    unsigned int m_oldCSR;  // MXCSR value captured on entry

    RoundForScope()
        : m_oldCSR(_mm_getcsr())
    {
        const unsigned int withoutRounding = m_oldCSR & ~_MM_ROUND_MASK;
        _mm_setcsr(withoutRounding | TRoundingMode);
    }

    ~RoundForScope()
    {
        _mm_setcsr(m_oldCSR);
    }
};
87 | |
88 | struct RoundTowardZeroForScope : RoundForScope<_MM_ROUND_TOWARD_ZERO> |
89 | { |
90 | }; |
91 | |
92 | struct RoundTowardNearestForScope : RoundForScope<_MM_ROUND_NEAREST> |
93 | { |
94 | }; |
95 | |
96 | struct RoundUpForScope : RoundForScope<_MM_ROUND_UP> |
97 | { |
98 | }; |
99 | |
100 | struct RoundDownForScope : RoundForScope<_MM_ROUND_DOWN> |
101 | { |
102 | }; |
103 | |
// Number of values carried by one vector type in this implementation.
// NOTE(review): presumably matches the eight 16-bit lanes of the __m128i
// stored in VInt16 - confirm against the users of this constant.
static const int ParallelSize = 8;
105 | |
// Tags used as the template argument of VInt16/VInt32 to tell apart vectors
// with identical storage but different interpretation of their bits.
enum Int16Subtype
{
    IntSubtype_Signed = 0,             // signed lanes
    IntSubtype_UnsignedFull = 1,       // unsigned lanes using the full width
    IntSubtype_UnsignedTruncated = 2,  // unsigned lanes with the top bit unused
    IntSubtype_Abstract = 3            // signedness unknown
};
113 | |
// 16-bit integer lanes stored in a single SSE2 register. TSubtype is a tag
// that only changes the static type; every operator below performs the same
// lane-wise instruction regardless of the subtype.
template<int TSubtype>
struct VInt16
{
    __m128i m_value;

    // Adds the scalar `other` to every lane (wrapping arithmetic).
    inline VInt16 operator+(int16_t other) const
    {
        const __m128i broadcast = _mm_set1_epi16(other);
        VInt16 sum;
        sum.m_value = _mm_add_epi16(m_value, broadcast);
        return sum;
    }

    // Lane-wise wrapping addition.
    inline VInt16 operator+(const VInt16 &other) const
    {
        VInt16 sum;
        sum.m_value = _mm_add_epi16(m_value, other.m_value);
        return sum;
    }

    // Bitwise OR of the two registers.
    inline VInt16 operator|(const VInt16 &other) const
    {
        VInt16 merged;
        merged.m_value = _mm_or_si128(m_value, other.m_value);
        return merged;
    }

    // Bitwise AND of the two registers.
    inline VInt16 operator&(const VInt16 &other) const
    {
        VInt16 masked;
        masked.m_value = _mm_and_si128(m_value, other.m_value);
        return masked;
    }

    // Lane-wise wrapping subtraction.
    inline VInt16 operator-(const VInt16 &other) const
    {
        VInt16 difference;
        difference.m_value = _mm_sub_epi16(m_value, other.m_value);
        return difference;
    }

    // Shifts every lane left by `bits`, filling with zeros.
    inline VInt16 operator<<(int bits) const
    {
        VInt16 shifted;
        shifted.m_value = _mm_slli_epi16(m_value, bits);
        return shifted;
    }

    // Bitwise XOR of the two registers.
    inline VInt16 operator^(const VInt16 &other) const
    {
        VInt16 toggled;
        toggled.m_value = _mm_xor_si128(m_value, other.m_value);
        return toggled;
    }
};
168 | |
169 | typedef VInt16<IntSubtype_Signed> SInt16; |
170 | typedef VInt16<IntSubtype_UnsignedFull> UInt16; |
171 | typedef VInt16<IntSubtype_UnsignedTruncated> UInt15; |
172 | typedef VInt16<IntSubtype_Abstract> AInt16; |
173 | |
// 32-bit integer lanes stored in two SSE2 registers (m_values[0] and
// m_values[1]). TSubtype is a tag that only changes the static type.
template<int TSubtype>
struct VInt32
{
    __m128i m_values[2];

    // Lane-wise wrapping addition.
    inline VInt32 operator+(const VInt32& other) const
    {
        VInt32 sum;
        for (int half = 0; half < 2; half++)
            sum.m_values[half] = _mm_add_epi32(m_values[half], other.m_values[half]);
        return sum;
    }

    // Lane-wise wrapping subtraction.
    inline VInt32 operator-(const VInt32& other) const
    {
        VInt32 difference;
        for (int half = 0; half < 2; half++)
            difference.m_values[half] = _mm_sub_epi32(m_values[half], other.m_values[half]);
        return difference;
    }

    // Shifts every lane left by `other` bits, filling with zeros.
    inline VInt32 operator<<(const int other) const
    {
        VInt32 shifted;
        for (int half = 0; half < 2; half++)
            shifted.m_values[half] = _mm_slli_epi32(m_values[half], other);
        return shifted;
    }

    // Bitwise OR of both register halves.
    inline VInt32 operator|(const VInt32& other) const
    {
        VInt32 merged;
        for (int half = 0; half < 2; half++)
            merged.m_values[half] = _mm_or_si128(m_values[half], other.m_values[half]);
        return merged;
    }
};
211 | |
212 | typedef VInt32<IntSubtype_Signed> SInt32; |
213 | typedef VInt32<IntSubtype_UnsignedTruncated> UInt31; |
214 | typedef VInt32<IntSubtype_UnsignedFull> UInt32; |
215 | typedef VInt32<IntSubtype_Abstract> AInt32; |
216 | |
217 | template<class TTargetType> |
218 | struct LosslessCast |
219 | { |
220 | #ifdef CVTT_PERMIT_ALIASING |
221 | template<int TSrcSubtype> |
222 | static const TTargetType& Cast(const VInt32<TSrcSubtype> &src) |
223 | { |
224 | return reinterpret_cast<VInt32<TSubtype>&>(src); |
225 | } |
226 | |
227 | template<int TSrcSubtype> |
228 | static const TTargetType& Cast(const VInt16<TSrcSubtype> &src) |
229 | { |
230 | return reinterpret_cast<VInt16<TSubtype>&>(src); |
231 | } |
232 | #else |
233 | template<int TSrcSubtype> |
234 | static TTargetType Cast(const VInt32<TSrcSubtype> &src) |
235 | { |
236 | TTargetType result; |
237 | result.m_values[0] = src.m_values[0]; |
238 | result.m_values[1] = src.m_values[1]; |
239 | return result; |
240 | } |
241 | |
242 | template<int TSrcSubtype> |
243 | static TTargetType Cast(const VInt16<TSrcSubtype> &src) |
244 | { |
245 | TTargetType result; |
246 | result.m_value = src.m_value; |
247 | return result; |
248 | } |
249 | #endif |
250 | }; |
251 | |
// Storage-only vector made of four SSE2 registers; no operators are defined
// on it here.
// NOTE(review): presumably holds 64-bit lanes, two per register - confirm
// against the code that uses it.
struct Int64
{
    __m128i m_values[4];
};
256 | |
// 32-bit float lanes stored in two SSE registers (m_values[0] and
// m_values[1]). All operators work lane by lane.
struct Float
{
    __m128 m_values[2];

    // Lane-wise addition.
    inline Float operator+(const Float &other) const
    {
        Float sum;
        for (int half = 0; half < 2; half++)
            sum.m_values[half] = _mm_add_ps(m_values[half], other.m_values[half]);
        return sum;
    }

    // Adds the scalar `other` to every lane.
    inline Float operator+(float other) const
    {
        const __m128 broadcast = _mm_set1_ps(other);
        Float sum;
        for (int half = 0; half < 2; half++)
            sum.m_values[half] = _mm_add_ps(m_values[half], broadcast);
        return sum;
    }

    // Lane-wise subtraction.
    inline Float operator-(const Float& other) const
    {
        Float difference;
        for (int half = 0; half < 2; half++)
            difference.m_values[half] = _mm_sub_ps(m_values[half], other.m_values[half]);
        return difference;
    }

    // Negation, computed as (0 - x) per lane.
    inline Float operator-() const
    {
        const __m128 zero = _mm_setzero_ps();
        Float negated;
        for (int half = 0; half < 2; half++)
            negated.m_values[half] = _mm_sub_ps(zero, m_values[half]);
        return negated;
    }

    // Lane-wise multiplication.
    inline Float operator*(const Float& other) const
    {
        Float product;
        for (int half = 0; half < 2; half++)
            product.m_values[half] = _mm_mul_ps(m_values[half], other.m_values[half]);
        return product;
    }

    // Multiplies every lane by the scalar `other`.
    inline Float operator*(float other) const
    {
        const __m128 broadcast = _mm_set1_ps(other);
        Float product;
        for (int half = 0; half < 2; half++)
            product.m_values[half] = _mm_mul_ps(m_values[half], broadcast);
        return product;
    }

    // Lane-wise division.
    inline Float operator/(const Float &other) const
    {
        Float quotient;
        for (int half = 0; half < 2; half++)
            quotient.m_values[half] = _mm_div_ps(m_values[half], other.m_values[half]);
        return quotient;
    }

    // Divides every lane by the scalar `other`.
    inline Float operator/(float other) const
    {
        const __m128 broadcast = _mm_set1_ps(other);
        Float quotient;
        for (int half = 0; half < 2; half++)
            quotient.m_values[half] = _mm_div_ps(m_values[half], broadcast);
        return quotient;
    }
};
325 | |
// Per-lane boolean mask for 16-bit lanes, stored in one SSE2 register.
struct Int16CompFlag
{
    __m128i m_value;

    // Logical AND of two masks (bitwise AND of the registers).
    inline Int16CompFlag operator&(const Int16CompFlag &other) const
    {
        Int16CompFlag both;
        both.m_value = _mm_and_si128(m_value, other.m_value);
        return both;
    }

    // Logical OR of two masks (bitwise OR of the registers).
    inline Int16CompFlag operator|(const Int16CompFlag &other) const
    {
        Int16CompFlag either;
        either.m_value = _mm_or_si128(m_value, other.m_value);
        return either;
    }
};
344 | |
// Per-lane boolean mask for 32-bit integer lanes, stored in two SSE2 registers.
struct Int32CompFlag
{
    __m128i m_values[2];

    // Logical AND of two masks (bitwise AND of both register halves).
    inline Int32CompFlag operator&(const Int32CompFlag &other) const
    {
        Int32CompFlag both;
        for (int half = 0; half < 2; half++)
            both.m_values[half] = _mm_and_si128(m_values[half], other.m_values[half]);
        return both;
    }

    // Logical OR of two masks (bitwise OR of both register halves).
    inline Int32CompFlag operator|(const Int32CompFlag &other) const
    {
        Int32CompFlag either;
        for (int half = 0; half < 2; half++)
            either.m_values[half] = _mm_or_si128(m_values[half], other.m_values[half]);
        return either;
    }
};
365 | |
// Per-lane boolean mask for float lanes, stored in two SSE registers.
struct FloatCompFlag
{
    __m128 m_values[2];

    // Logical AND of two masks (bitwise AND of both register halves).
    inline FloatCompFlag operator&(const FloatCompFlag &other) const
    {
        FloatCompFlag both;
        for (int half = 0; half < 2; half++)
            both.m_values[half] = _mm_and_ps(m_values[half], other.m_values[half]);
        return both;
    }

    // Logical OR of two masks (bitwise OR of both register halves).
    inline FloatCompFlag operator|(const FloatCompFlag &other) const
    {
        FloatCompFlag either;
        for (int half = 0; half < 2; half++)
            either.m_values[half] = _mm_or_ps(m_values[half], other.m_values[half]);
        return either;
    }
};
386 | |
387 | template<int TSubtype> |
388 | static VInt16<TSubtype> AbstractAdd(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) |
389 | { |
390 | VInt16<TSubtype> result; |
391 | result.m_value = _mm_add_epi16(a.m_value, b.m_value); |
392 | return result; |
393 | } |
394 | |
395 | template<int TSubtype> |
396 | static VInt16<TSubtype> AbstractSubtract(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) |
397 | { |
398 | VInt16<TSubtype> result; |
399 | result.m_value = _mm_sub_epi16(a.m_value, b.m_value); |
400 | return result; |
401 | } |
402 | |
403 | static Float Select(const FloatCompFlag &flag, const Float &a, const Float &b) |
404 | { |
405 | Float result; |
406 | for (int i = 0; i < 2; i++) |
407 | result.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], a.m_values[i]), _mm_andnot_ps(flag.m_values[i], b.m_values[i])); |
408 | return result; |
409 | } |
410 | |
411 | template<int TSubtype> |
412 | static VInt16<TSubtype> Select(const Int16CompFlag &flag, const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) |
413 | { |
414 | VInt16<TSubtype> result; |
415 | result.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, a.m_value), _mm_andnot_si128(flag.m_value, b.m_value)); |
416 | return result; |
417 | } |
418 | |
419 | template<int TSubtype> |
420 | static VInt16<TSubtype> SelectOrZero(const Int16CompFlag &flag, const VInt16<TSubtype> &a) |
421 | { |
422 | VInt16<TSubtype> result; |
423 | result.m_value = _mm_and_si128(flag.m_value, a.m_value); |
424 | return result; |
425 | } |
426 | |
427 | template<int TSubtype> |
428 | static void ConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src) |
429 | { |
430 | dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value)); |
431 | } |
432 | |
433 | template<int TSubtype> |
434 | static void ConditionalSet(VInt32<TSubtype> &dest, const Int16CompFlag &flag, const VInt32<TSubtype> &src) |
435 | { |
436 | __m128i lowFlags = _mm_unpacklo_epi16(flag.m_value, flag.m_value); |
437 | __m128i highFlags = _mm_unpackhi_epi16(flag.m_value, flag.m_value); |
438 | dest.m_values[0] = _mm_or_si128(_mm_andnot_si128(lowFlags, dest.m_values[0]), _mm_and_si128(lowFlags, src.m_values[0])); |
439 | dest.m_values[1] = _mm_or_si128(_mm_andnot_si128(highFlags, dest.m_values[1]), _mm_and_si128(highFlags, src.m_values[1])); |
440 | } |
441 | |
442 | static void ConditionalSet(ParallelMath::Int16CompFlag &dest, const Int16CompFlag &flag, const ParallelMath::Int16CompFlag &src) |
443 | { |
444 | dest.m_value = _mm_or_si128(_mm_andnot_si128(flag.m_value, dest.m_value), _mm_and_si128(flag.m_value, src.m_value)); |
445 | } |
446 | |
447 | static SInt16 ConditionalNegate(const Int16CompFlag &flag, const SInt16 &v) |
448 | { |
449 | SInt16 result; |
450 | result.m_value = _mm_add_epi16(_mm_xor_si128(flag.m_value, v.m_value), _mm_srli_epi16(flag.m_value, 15)); |
451 | return result; |
452 | } |
453 | |
454 | template<int TSubtype> |
455 | static void NotConditionalSet(VInt16<TSubtype> &dest, const Int16CompFlag &flag, const VInt16<TSubtype> &src) |
456 | { |
457 | dest.m_value = _mm_or_si128(_mm_and_si128(flag.m_value, dest.m_value), _mm_andnot_si128(flag.m_value, src.m_value)); |
458 | } |
459 | |
460 | static void ConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src) |
461 | { |
462 | for (int i = 0; i < 2; i++) |
463 | dest.m_values[i] = _mm_or_ps(_mm_andnot_ps(flag.m_values[i], dest.m_values[i]), _mm_and_ps(flag.m_values[i], src.m_values[i])); |
464 | } |
465 | |
466 | static void NotConditionalSet(Float &dest, const FloatCompFlag &flag, const Float &src) |
467 | { |
468 | for (int i = 0; i < 2; i++) |
469 | dest.m_values[i] = _mm_or_ps(_mm_and_ps(flag.m_values[i], dest.m_values[i]), _mm_andnot_ps(flag.m_values[i], src.m_values[i])); |
470 | } |
471 | |
472 | static void MakeSafeDenominator(Float& v) |
473 | { |
474 | ConditionalSet(v, Equal(v, MakeFloatZero()), MakeFloat(1.0f)); |
475 | } |
476 | |
477 | static SInt16 TruncateToPrecisionSigned(const SInt16 &v, int precision) |
478 | { |
479 | int lostBits = 16 - precision; |
480 | if (lostBits == 0) |
481 | return v; |
482 | |
483 | SInt16 result; |
484 | result.m_value = _mm_srai_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits); |
485 | return result; |
486 | } |
487 | |
488 | static UInt16 TruncateToPrecisionUnsigned(const UInt16 &v, int precision) |
489 | { |
490 | int lostBits = 16 - precision; |
491 | if (lostBits == 0) |
492 | return v; |
493 | |
494 | UInt16 result; |
495 | result.m_value = _mm_srli_epi16(_mm_slli_epi16(v.m_value, lostBits), lostBits); |
496 | return result; |
497 | } |
498 | |
499 | static UInt16 Min(const UInt16 &a, const UInt16 &b) |
500 | { |
501 | __m128i bitFlip = _mm_set1_epi16(-32768); |
502 | |
503 | UInt16 result; |
504 | result.m_value = _mm_xor_si128(_mm_min_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip); |
505 | return result; |
506 | } |
507 | |
508 | static SInt16 Min(const SInt16 &a, const SInt16 &b) |
509 | { |
510 | SInt16 result; |
511 | result.m_value = _mm_min_epi16(a.m_value, b.m_value); |
512 | return result; |
513 | } |
514 | |
515 | static UInt15 Min(const UInt15 &a, const UInt15 &b) |
516 | { |
517 | UInt15 result; |
518 | result.m_value = _mm_min_epi16(a.m_value, b.m_value); |
519 | return result; |
520 | } |
521 | |
522 | static Float Min(const Float &a, const Float &b) |
523 | { |
524 | Float result; |
525 | for (int i = 0; i < 2; i++) |
526 | result.m_values[i] = _mm_min_ps(a.m_values[i], b.m_values[i]); |
527 | return result; |
528 | } |
529 | |
530 | static UInt16 Max(const UInt16 &a, const UInt16 &b) |
531 | { |
532 | __m128i bitFlip = _mm_set1_epi16(-32768); |
533 | |
534 | UInt16 result; |
535 | result.m_value = _mm_xor_si128(_mm_max_epi16(_mm_xor_si128(a.m_value, bitFlip), _mm_xor_si128(b.m_value, bitFlip)), bitFlip); |
536 | return result; |
537 | } |
538 | |
539 | static SInt16 Max(const SInt16 &a, const SInt16 &b) |
540 | { |
541 | SInt16 result; |
542 | result.m_value = _mm_max_epi16(a.m_value, b.m_value); |
543 | return result; |
544 | } |
545 | |
546 | static UInt15 Max(const UInt15 &a, const UInt15 &b) |
547 | { |
548 | UInt15 result; |
549 | result.m_value = _mm_max_epi16(a.m_value, b.m_value); |
550 | return result; |
551 | } |
552 | |
553 | static Float Max(const Float &a, const Float &b) |
554 | { |
555 | Float result; |
556 | for (int i = 0; i < 2; i++) |
557 | result.m_values[i] = _mm_max_ps(a.m_values[i], b.m_values[i]); |
558 | return result; |
559 | } |
560 | |
561 | static Float Clamp(const Float &v, float min, float max) |
562 | { |
563 | Float result; |
564 | for (int i = 0; i < 2; i++) |
565 | result.m_values[i] = _mm_max_ps(_mm_min_ps(v.m_values[i], _mm_set1_ps(max)), _mm_set1_ps(min)); |
566 | return result; |
567 | } |
568 | |
569 | static Float Reciprocal(const Float &v) |
570 | { |
571 | Float result; |
572 | for (int i = 0; i < 2; i++) |
573 | result.m_values[i] = _mm_rcp_ps(v.m_values[i]); |
574 | return result; |
575 | } |
576 | |
577 | static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, UInt15 &chOut) |
578 | { |
579 | int16_t values[8]; |
580 | for (int i = 0; i < 8; i++) |
581 | values[i] = inputBlocks[i].m_pixels[pxOffset][channel]; |
582 | |
583 | chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]); |
584 | } |
585 | |
586 | static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, SInt16 &chOut) |
587 | { |
588 | int16_t values[8]; |
589 | for (int i = 0; i < 8; i++) |
590 | values[i] = inputBlocks[i].m_pixels[pxOffset][channel]; |
591 | |
592 | chOut.m_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]); |
593 | } |
594 | |
595 | static Float MakeFloat(float v) |
596 | { |
597 | Float f; |
598 | f.m_values[0] = f.m_values[1] = _mm_set1_ps(v); |
599 | return f; |
600 | } |
601 | |
602 | static Float MakeFloatZero() |
603 | { |
604 | Float f; |
605 | f.m_values[0] = f.m_values[1] = _mm_setzero_ps(); |
606 | return f; |
607 | } |
608 | |
609 | static UInt16 MakeUInt16(uint16_t v) |
610 | { |
611 | UInt16 result; |
612 | result.m_value = _mm_set1_epi16(static_cast<short>(v)); |
613 | return result; |
614 | } |
615 | |
616 | static SInt16 MakeSInt16(int16_t v) |
617 | { |
618 | SInt16 result; |
619 | result.m_value = _mm_set1_epi16(static_cast<short>(v)); |
620 | return result; |
621 | } |
622 | |
623 | static AInt16 MakeAInt16(int16_t v) |
624 | { |
625 | AInt16 result; |
626 | result.m_value = _mm_set1_epi16(static_cast<short>(v)); |
627 | return result; |
628 | } |
629 | |
630 | static UInt15 MakeUInt15(uint16_t v) |
631 | { |
632 | UInt15 result; |
633 | result.m_value = _mm_set1_epi16(static_cast<short>(v)); |
634 | return result; |
635 | } |
636 | |
637 | static SInt32 MakeSInt32(int32_t v) |
638 | { |
639 | SInt32 result; |
640 | result.m_values[0] = _mm_set1_epi32(v); |
641 | result.m_values[1] = _mm_set1_epi32(v); |
642 | return result; |
643 | } |
644 | |
645 | static UInt31 MakeUInt31(uint32_t v) |
646 | { |
647 | UInt31 result; |
648 | result.m_values[0] = _mm_set1_epi32(v); |
649 | result.m_values[1] = _mm_set1_epi32(v); |
650 | return result; |
651 | } |
652 | |
653 | static uint16_t (const UInt16 &v, int offset) |
654 | { |
655 | return reinterpret_cast<const uint16_t*>(&v.m_value)[offset]; |
656 | } |
657 | |
658 | static int16_t (const SInt16 &v, int offset) |
659 | { |
660 | return reinterpret_cast<const int16_t*>(&v.m_value)[offset]; |
661 | } |
662 | |
663 | static uint16_t (const UInt15 &v, int offset) |
664 | { |
665 | return reinterpret_cast<const uint16_t*>(&v.m_value)[offset]; |
666 | } |
667 | |
668 | static int16_t (const AInt16 &v, int offset) |
669 | { |
670 | return reinterpret_cast<const int16_t*>(&v.m_value)[offset]; |
671 | } |
672 | |
673 | static int32_t (const SInt32 &v, int offset) |
674 | { |
675 | return reinterpret_cast<const int32_t*>(&v.m_values[offset >> 2])[offset & 3]; |
676 | } |
677 | |
678 | static float (const Float &v, int offset) |
679 | { |
680 | return reinterpret_cast<const float*>(&v.m_values[offset >> 2])[offset & 3]; |
681 | } |
682 | |
683 | static bool (const ParallelMath::Int16CompFlag &v, int offset) |
684 | { |
685 | return reinterpret_cast<const int16_t*>(&v.m_value)[offset] != 0; |
686 | } |
687 | |
688 | static void PutUInt16(UInt16 &dest, int offset, uint16_t v) |
689 | { |
690 | reinterpret_cast<uint16_t*>(&dest)[offset] = v; |
691 | } |
692 | |
693 | static void PutUInt15(UInt15 &dest, int offset, uint16_t v) |
694 | { |
695 | reinterpret_cast<uint16_t*>(&dest)[offset] = v; |
696 | } |
697 | |
698 | static void PutSInt16(SInt16 &dest, int offset, int16_t v) |
699 | { |
700 | reinterpret_cast<int16_t*>(&dest)[offset] = v; |
701 | } |
702 | |
703 | static float (const Float& v, int offset) |
704 | { |
705 | return reinterpret_cast<const float*>(&v)[offset]; |
706 | } |
707 | |
708 | static void PutFloat(Float &dest, int offset, float v) |
709 | { |
710 | reinterpret_cast<float*>(&dest)[offset] = v; |
711 | } |
712 | |
713 | static void PutBoolInt16(Int16CompFlag &dest, int offset, bool v) |
714 | { |
715 | reinterpret_cast<int16_t*>(&dest)[offset] = v ? -1 : 0; |
716 | } |
717 | |
718 | static Int32CompFlag Less(const UInt31 &a, const UInt31 &b) |
719 | { |
720 | Int32CompFlag result; |
721 | result.m_values[0] = _mm_cmplt_epi32(a.m_values[0], b.m_values[0]); |
722 | result.m_values[1] = _mm_cmplt_epi32(a.m_values[1], b.m_values[1]); |
723 | return result; |
724 | } |
725 | |
726 | static Int16CompFlag Less(const SInt16 &a, const SInt16 &b) |
727 | { |
728 | Int16CompFlag result; |
729 | result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value); |
730 | return result; |
731 | } |
732 | |
733 | static Int16CompFlag Less(const UInt15 &a, const UInt15 &b) |
734 | { |
735 | Int16CompFlag result; |
736 | result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value); |
737 | return result; |
738 | } |
739 | |
740 | static Int16CompFlag LessOrEqual(const UInt15 &a, const UInt15 &b) |
741 | { |
742 | Int16CompFlag result; |
743 | result.m_value = _mm_cmplt_epi16(a.m_value, b.m_value); |
744 | return result; |
745 | } |
746 | |
747 | static FloatCompFlag Less(const Float &a, const Float &b) |
748 | { |
749 | FloatCompFlag result; |
750 | for (int i = 0; i < 2; i++) |
751 | result.m_values[i] = _mm_cmplt_ps(a.m_values[i], b.m_values[i]); |
752 | return result; |
753 | } |
754 | |
755 | static FloatCompFlag LessOrEqual(const Float &a, const Float &b) |
756 | { |
757 | FloatCompFlag result; |
758 | for (int i = 0; i < 2; i++) |
759 | result.m_values[i] = _mm_cmple_ps(a.m_values[i], b.m_values[i]); |
760 | return result; |
761 | } |
762 | |
763 | template<int TSubtype> |
764 | static Int16CompFlag Equal(const VInt16<TSubtype> &a, const VInt16<TSubtype> &b) |
765 | { |
766 | Int16CompFlag result; |
767 | result.m_value = _mm_cmpeq_epi16(a.m_value, b.m_value); |
768 | return result; |
769 | } |
770 | |
771 | static FloatCompFlag Equal(const Float &a, const Float &b) |
772 | { |
773 | FloatCompFlag result; |
774 | for (int i = 0; i < 2; i++) |
775 | result.m_values[i] = _mm_cmpeq_ps(a.m_values[i], b.m_values[i]); |
776 | return result; |
777 | } |
778 | |
779 | static Int16CompFlag Equal(const Int16CompFlag &a, const Int16CompFlag &b) |
780 | { |
781 | Int16CompFlag notResult; |
782 | notResult.m_value = _mm_xor_si128(a.m_value, b.m_value); |
783 | return Not(notResult); |
784 | } |
785 | |
786 | static Float ToFloat(const UInt16 &v) |
787 | { |
788 | Float result; |
789 | result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128())); |
790 | result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128())); |
791 | return result; |
792 | } |
793 | |
794 | static UInt31 ToUInt31(const UInt16 &v) |
795 | { |
796 | UInt31 result; |
797 | result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()); |
798 | result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()); |
799 | return result; |
800 | } |
801 | |
802 | static SInt32 ToInt32(const UInt16 &v) |
803 | { |
804 | SInt32 result; |
805 | result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()); |
806 | result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()); |
807 | return result; |
808 | } |
809 | |
810 | static SInt32 ToInt32(const UInt15 &v) |
811 | { |
812 | SInt32 result; |
813 | result.m_values[0] = _mm_unpacklo_epi16(v.m_value, _mm_setzero_si128()); |
814 | result.m_values[1] = _mm_unpackhi_epi16(v.m_value, _mm_setzero_si128()); |
815 | return result; |
816 | } |
817 | |
818 | static SInt32 ToInt32(const SInt16 &v) |
819 | { |
820 | SInt32 result; |
821 | result.m_values[0] = _mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16); |
822 | result.m_values[1] = _mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16); |
823 | return result; |
824 | } |
825 | |
826 | static Float ToFloat(const SInt16 &v) |
827 | { |
828 | Float result; |
829 | result.m_values[0] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(_mm_setzero_si128(), v.m_value), 16)); |
830 | result.m_values[1] = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(_mm_setzero_si128(), v.m_value), 16)); |
831 | return result; |
832 | } |
833 | |
834 | static Float ToFloat(const UInt15 &v) |
835 | { |
836 | Float result; |
837 | result.m_values[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(v.m_value, _mm_setzero_si128())); |
838 | result.m_values[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(v.m_value, _mm_setzero_si128())); |
839 | return result; |
840 | } |
841 | |
842 | static Float ToFloat(const UInt31 &v) |
843 | { |
844 | Float result; |
845 | result.m_values[0] = _mm_cvtepi32_ps(v.m_values[0]); |
846 | result.m_values[1] = _mm_cvtepi32_ps(v.m_values[1]); |
847 | return result; |
848 | } |
849 | |
850 | static Int16CompFlag FloatFlagToInt16(const FloatCompFlag &v) |
851 | { |
852 | __m128i lo = _mm_castps_si128(v.m_values[0]); |
853 | __m128i hi = _mm_castps_si128(v.m_values[1]); |
854 | |
855 | Int16CompFlag result; |
856 | result.m_value = _mm_packs_epi32(lo, hi); |
857 | return result; |
858 | } |
859 | |
860 | static FloatCompFlag Int16FlagToFloat(const Int16CompFlag &v) |
861 | { |
862 | __m128i lo = _mm_unpacklo_epi16(v.m_value, v.m_value); |
863 | __m128i hi = _mm_unpackhi_epi16(v.m_value, v.m_value); |
864 | |
865 | FloatCompFlag result; |
866 | result.m_values[0] = _mm_castsi128_ps(lo); |
867 | result.m_values[1] = _mm_castsi128_ps(hi); |
868 | return result; |
869 | } |
870 | |
871 | static Int16CompFlag Int32FlagToInt16(const Int32CompFlag &v) |
872 | { |
873 | __m128i lo = v.m_values[0]; |
874 | __m128i hi = v.m_values[1]; |
875 | |
876 | Int16CompFlag result; |
877 | result.m_value = _mm_packs_epi32(lo, hi); |
878 | return result; |
879 | } |
880 | |
881 | static Int16CompFlag MakeBoolInt16(bool b) |
882 | { |
883 | Int16CompFlag result; |
884 | if (b) |
885 | result.m_value = _mm_set1_epi16(-1); |
886 | else |
887 | result.m_value = _mm_setzero_si128(); |
888 | return result; |
889 | } |
890 | |
891 | static FloatCompFlag MakeBoolFloat(bool b) |
892 | { |
893 | FloatCompFlag result; |
894 | if (b) |
895 | result.m_values[0] = result.m_values[1] = _mm_castsi128_ps(_mm_set1_epi32(-1)); |
896 | else |
897 | result.m_values[0] = result.m_values[1] = _mm_setzero_ps(); |
898 | return result; |
899 | } |
900 | |
901 | static Int16CompFlag AndNot(const Int16CompFlag &a, const Int16CompFlag &b) |
902 | { |
903 | Int16CompFlag result; |
904 | result.m_value = _mm_andnot_si128(b.m_value, a.m_value); |
905 | return result; |
906 | } |
907 | |
908 | static Int16CompFlag Not(const Int16CompFlag &b) |
909 | { |
910 | Int16CompFlag result; |
911 | result.m_value = _mm_xor_si128(b.m_value, _mm_set1_epi32(-1)); |
912 | return result; |
913 | } |
914 | |
915 | static Int32CompFlag Not(const Int32CompFlag &b) |
916 | { |
917 | Int32CompFlag result; |
918 | result.m_values[0] = _mm_xor_si128(b.m_values[0], _mm_set1_epi32(-1)); |
919 | result.m_values[1] = _mm_xor_si128(b.m_values[1], _mm_set1_epi32(-1)); |
920 | return result; |
921 | } |
922 | |
923 | static UInt16 RoundAndConvertToU16(const Float &v, const void* /*roundingMode*/) |
924 | { |
925 | __m128i lo = _mm_cvtps_epi32(_mm_add_ps(v.m_values[0], _mm_set1_ps(-32768))); |
926 | __m128i hi = _mm_cvtps_epi32(_mm_add_ps(v.m_values[1], _mm_set1_ps(-32768))); |
927 | |
928 | __m128i packed = _mm_packs_epi32(lo, hi); |
929 | |
930 | UInt16 result; |
931 | result.m_value = _mm_xor_si128(packed, _mm_set1_epi16(-32768)); |
932 | return result; |
933 | } |
934 | |
935 | static UInt15 RoundAndConvertToU15(const Float &v, const void* /*roundingMode*/) |
936 | { |
937 | __m128i lo = _mm_cvtps_epi32(v.m_values[0]); |
938 | __m128i hi = _mm_cvtps_epi32(v.m_values[1]); |
939 | |
940 | __m128i packed = _mm_packs_epi32(lo, hi); |
941 | |
942 | UInt15 result; |
943 | result.m_value = _mm_packs_epi32(lo, hi); |
944 | return result; |
945 | } |
946 | |
947 | static SInt16 RoundAndConvertToS16(const Float &v, const void* /*roundingMode*/) |
948 | { |
949 | __m128i lo = _mm_cvtps_epi32(v.m_values[0]); |
950 | __m128i hi = _mm_cvtps_epi32(v.m_values[1]); |
951 | |
952 | __m128i packed = _mm_packs_epi32(lo, hi); |
953 | |
954 | SInt16 result; |
955 | result.m_value = _mm_packs_epi32(lo, hi); |
956 | return result; |
957 | } |
958 | |
959 | static Float Sqrt(const Float &f) |
960 | { |
961 | Float result; |
962 | for (int i = 0; i < 2; i++) |
963 | result.m_values[i] = _mm_sqrt_ps(f.m_values[i]); |
964 | return result; |
965 | } |
966 | |
967 | static UInt16 Abs(const SInt16 &a) |
968 | { |
969 | __m128i signBitsXor = _mm_srai_epi16(a.m_value, 15); |
970 | __m128i signBitsAdd = _mm_srli_epi16(a.m_value, 15); |
971 | |
972 | UInt16 result; |
973 | result.m_value = _mm_add_epi16(_mm_xor_si128(a.m_value, signBitsXor), signBitsAdd); |
974 | return result; |
975 | } |
976 | |
977 | static Float Abs(const Float& a) |
978 | { |
979 | __m128 invMask = _mm_set1_ps(-0.0f); |
980 | |
981 | Float result; |
982 | result.m_values[0] = _mm_andnot_ps(invMask, a.m_values[0]); |
983 | result.m_values[1] = _mm_andnot_ps(invMask, a.m_values[1]); |
984 | return result; |
985 | } |
986 | |
987 | static UInt16 SqDiffUInt8(const UInt15 &a, const UInt15 &b) |
988 | { |
989 | __m128i diff = _mm_sub_epi16(a.m_value, b.m_value); |
990 | |
991 | UInt16 result; |
992 | result.m_value = _mm_mullo_epi16(diff, diff); |
993 | return result; |
994 | } |
995 | |
996 | static Float SqDiffSInt16(const SInt16 &a, const SInt16 &b) |
997 | { |
998 | __m128i diffU = _mm_sub_epi16(_mm_max_epi16(a.m_value, b.m_value), _mm_min_epi16(a.m_value, b.m_value)); |
999 | |
1000 | __m128i mulHi = _mm_mulhi_epu16(diffU, diffU); |
1001 | __m128i mulLo = _mm_mullo_epi16(diffU, diffU); |
1002 | __m128i sqDiffHi = _mm_unpackhi_epi16(mulLo, mulHi); |
1003 | __m128i sqDiffLo = _mm_unpacklo_epi16(mulLo, mulHi); |
1004 | |
1005 | Float result; |
1006 | result.m_values[0] = _mm_cvtepi32_ps(sqDiffLo); |
1007 | result.m_values[1] = _mm_cvtepi32_ps(sqDiffHi); |
1008 | |
1009 | return result; |
1010 | } |
1011 | |
1012 | static Float TwosCLHalfToFloat(const SInt16 &v) |
1013 | { |
1014 | __m128i absV = _mm_add_epi16(_mm_xor_si128(v.m_value, _mm_srai_epi16(v.m_value, 15)), _mm_srli_epi16(v.m_value, 15)); |
1015 | |
1016 | __m128i signBits = _mm_and_si128(v.m_value, _mm_set1_epi16(-32768)); |
1017 | __m128i mantissa = _mm_and_si128(v.m_value, _mm_set1_epi16(0x03ff)); |
1018 | __m128i exponent = _mm_and_si128(v.m_value, _mm_set1_epi16(0x7c00)); |
1019 | |
1020 | __m128i isDenormal = _mm_cmpeq_epi16(exponent, _mm_setzero_si128()); |
1021 | |
1022 | // Convert exponent to high-bits |
1023 | exponent = _mm_add_epi16(_mm_srli_epi16(exponent, 3), _mm_set1_epi16(14336)); |
1024 | |
1025 | __m128i denormalCorrectionHigh = _mm_and_si128(isDenormal, _mm_or_si128(signBits, _mm_set1_epi16(14336))); |
1026 | |
1027 | __m128i highBits = _mm_or_si128(signBits, _mm_or_si128(exponent, _mm_srli_epi16(mantissa, 3))); |
1028 | __m128i lowBits = _mm_slli_epi16(mantissa, 13); |
1029 | |
1030 | __m128i flow = _mm_unpacklo_epi16(lowBits, highBits); |
1031 | __m128i fhigh = _mm_unpackhi_epi16(lowBits, highBits); |
1032 | |
1033 | __m128i correctionLow = _mm_unpacklo_epi16(_mm_setzero_si128(), denormalCorrectionHigh); |
1034 | __m128i correctionHigh = _mm_unpackhi_epi16(_mm_setzero_si128(), denormalCorrectionHigh); |
1035 | |
1036 | Float result; |
1037 | result.m_values[0] = _mm_sub_ps(_mm_castsi128_ps(flow), _mm_castsi128_ps(correctionLow)); |
1038 | result.m_values[1] = _mm_sub_ps(_mm_castsi128_ps(fhigh), _mm_castsi128_ps(correctionHigh)); |
1039 | |
1040 | return result; |
1041 | } |
1042 | |
1043 | static Float SqDiff2CLFloat(const SInt16 &a, const Float &b) |
1044 | { |
1045 | Float fa = TwosCLHalfToFloat(a); |
1046 | |
1047 | Float diff = fa - b; |
1048 | return diff * diff; |
1049 | } |
1050 | |
1051 | static Float SqDiff2CL(const SInt16 &a, const SInt16 &b) |
1052 | { |
1053 | Float fa = TwosCLHalfToFloat(a); |
1054 | Float fb = TwosCLHalfToFloat(b); |
1055 | |
1056 | Float diff = fa - fb; |
1057 | return diff * diff; |
1058 | } |
1059 | |
1060 | static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b) |
1061 | { |
1062 | Float fa = TwosCLHalfToFloat(a) * aWeight; |
1063 | |
1064 | Float diff = fa - b; |
1065 | return diff * diff; |
1066 | } |
1067 | |
1068 | static UInt16 RightShift(const UInt16 &v, int bits) |
1069 | { |
1070 | UInt16 result; |
1071 | result.m_value = _mm_srli_epi16(v.m_value, bits); |
1072 | return result; |
1073 | } |
1074 | |
1075 | static UInt31 RightShift(const UInt31 &v, int bits) |
1076 | { |
1077 | UInt31 result; |
1078 | result.m_values[0] = _mm_srli_epi32(v.m_values[0], bits); |
1079 | result.m_values[1] = _mm_srli_epi32(v.m_values[1], bits); |
1080 | return result; |
1081 | } |
1082 | |
1083 | static SInt16 RightShift(const SInt16 &v, int bits) |
1084 | { |
1085 | SInt16 result; |
1086 | result.m_value = _mm_srai_epi16(v.m_value, bits); |
1087 | return result; |
1088 | } |
1089 | |
1090 | static UInt15 RightShift(const UInt15 &v, int bits) |
1091 | { |
1092 | UInt15 result; |
1093 | result.m_value = _mm_srli_epi16(v.m_value, bits); |
1094 | return result; |
1095 | } |
1096 | |
1097 | static SInt32 RightShift(const SInt32 &v, int bits) |
1098 | { |
1099 | SInt32 result; |
1100 | result.m_values[0] = _mm_srai_epi32(v.m_values[0], bits); |
1101 | result.m_values[1] = _mm_srai_epi32(v.m_values[1], bits); |
1102 | return result; |
1103 | } |
1104 | |
1105 | static SInt16 ToSInt16(const SInt32 &v) |
1106 | { |
1107 | SInt16 result; |
1108 | result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]); |
1109 | return result; |
1110 | } |
1111 | |
1112 | static SInt16 ToSInt16(const UInt16 &v) |
1113 | { |
1114 | SInt16 result; |
1115 | result.m_value = v.m_value; |
1116 | return result; |
1117 | } |
1118 | |
1119 | static SInt16 ToSInt16(const UInt15 &v) |
1120 | { |
1121 | SInt16 result; |
1122 | result.m_value = v.m_value; |
1123 | return result; |
1124 | } |
1125 | |
1126 | static UInt16 ToUInt16(const UInt32 &v) |
1127 | { |
1128 | __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16); |
1129 | __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16); |
1130 | |
1131 | UInt16 result; |
1132 | result.m_value = _mm_packs_epi32(low, high); |
1133 | return result; |
1134 | } |
1135 | |
1136 | static UInt16 ToUInt16(const UInt31 &v) |
1137 | { |
1138 | __m128i low = _mm_srai_epi32(_mm_slli_epi32(v.m_values[0], 16), 16); |
1139 | __m128i high = _mm_srai_epi32(_mm_slli_epi32(v.m_values[1], 16), 16); |
1140 | |
1141 | UInt16 result; |
1142 | result.m_value = _mm_packs_epi32(low, high); |
1143 | return result; |
1144 | } |
1145 | |
1146 | static UInt15 ToUInt15(const UInt31 &v) |
1147 | { |
1148 | UInt15 result; |
1149 | result.m_value = _mm_packs_epi32(v.m_values[0], v.m_values[1]); |
1150 | return result; |
1151 | } |
1152 | |
1153 | static UInt15 ToUInt15(const SInt16 &v) |
1154 | { |
1155 | UInt15 result; |
1156 | result.m_value = v.m_value; |
1157 | return result; |
1158 | } |
1159 | |
1160 | static UInt15 ToUInt15(const UInt16 &v) |
1161 | { |
1162 | UInt15 result; |
1163 | result.m_value = v.m_value; |
1164 | return result; |
1165 | } |
1166 | |
1167 | static SInt32 XMultiply(const SInt16 &a, const SInt16 &b) |
1168 | { |
1169 | __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value); |
1170 | __m128i low = _mm_mullo_epi16(a.m_value, b.m_value); |
1171 | |
1172 | SInt32 result; |
1173 | result.m_values[0] = _mm_unpacklo_epi16(low, high); |
1174 | result.m_values[1] = _mm_unpackhi_epi16(low, high); |
1175 | return result; |
1176 | } |
1177 | |
1178 | static SInt32 XMultiply(const SInt16 &a, const UInt15 &b) |
1179 | { |
1180 | __m128i high = _mm_mulhi_epi16(a.m_value, b.m_value); |
1181 | __m128i low = _mm_mullo_epi16(a.m_value, b.m_value); |
1182 | |
1183 | SInt32 result; |
1184 | result.m_values[0] = _mm_unpacklo_epi16(low, high); |
1185 | result.m_values[1] = _mm_unpackhi_epi16(low, high); |
1186 | return result; |
1187 | } |
1188 | |
1189 | static SInt32 XMultiply(const UInt15 &a, const SInt16 &b) |
1190 | { |
1191 | return XMultiply(b, a); |
1192 | } |
1193 | |
1194 | static UInt32 XMultiply(const UInt16 &a, const UInt16 &b) |
1195 | { |
1196 | __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value); |
1197 | __m128i low = _mm_mullo_epi16(a.m_value, b.m_value); |
1198 | |
1199 | UInt32 result; |
1200 | result.m_values[0] = _mm_unpacklo_epi16(low, high); |
1201 | result.m_values[1] = _mm_unpackhi_epi16(low, high); |
1202 | return result; |
1203 | } |
1204 | |
1205 | static UInt16 CompactMultiply(const UInt16 &a, const UInt15 &b) |
1206 | { |
1207 | UInt16 result; |
1208 | result.m_value = _mm_mullo_epi16(a.m_value, b.m_value); |
1209 | return result; |
1210 | } |
1211 | |
1212 | static UInt16 CompactMultiply(const UInt15 &a, const UInt15 &b) |
1213 | { |
1214 | UInt16 result; |
1215 | result.m_value = _mm_mullo_epi16(a.m_value, b.m_value); |
1216 | return result; |
1217 | } |
1218 | |
1219 | static SInt16 CompactMultiply(const SInt16 &a, const UInt15 &b) |
1220 | { |
1221 | SInt16 result; |
1222 | result.m_value = _mm_mullo_epi16(a.m_value, b.m_value); |
1223 | return result; |
1224 | } |
1225 | |
1226 | static SInt16 CompactMultiply(const SInt16 &a, const SInt16 &b) |
1227 | { |
1228 | SInt16 result; |
1229 | result.m_value = _mm_mullo_epi16(a.m_value, b.m_value); |
1230 | return result; |
1231 | } |
1232 | |
1233 | static UInt31 XMultiply(const UInt15 &a, const UInt15 &b) |
1234 | { |
1235 | __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value); |
1236 | __m128i low = _mm_mullo_epi16(a.m_value, b.m_value); |
1237 | |
1238 | UInt31 result; |
1239 | result.m_values[0] = _mm_unpacklo_epi16(low, high); |
1240 | result.m_values[1] = _mm_unpackhi_epi16(low, high); |
1241 | return result; |
1242 | } |
1243 | |
1244 | static UInt31 XMultiply(const UInt16 &a, const UInt15 &b) |
1245 | { |
1246 | __m128i high = _mm_mulhi_epu16(a.m_value, b.m_value); |
1247 | __m128i low = _mm_mullo_epi16(a.m_value, b.m_value); |
1248 | |
1249 | UInt31 result; |
1250 | result.m_values[0] = _mm_unpacklo_epi16(low, high); |
1251 | result.m_values[1] = _mm_unpackhi_epi16(low, high); |
1252 | return result; |
1253 | } |
1254 | |
1255 | static UInt31 XMultiply(const UInt15 &a, const UInt16 &b) |
1256 | { |
1257 | return XMultiply(b, a); |
1258 | } |
1259 | |
1260 | static bool AnySet(const Int16CompFlag &v) |
1261 | { |
1262 | return _mm_movemask_epi8(v.m_value) != 0; |
1263 | } |
1264 | |
1265 | static bool AllSet(const Int16CompFlag &v) |
1266 | { |
1267 | return _mm_movemask_epi8(v.m_value) == 0xffff; |
1268 | } |
1269 | |
1270 | static bool AnySet(const FloatCompFlag &v) |
1271 | { |
1272 | return _mm_movemask_ps(v.m_values[0]) != 0 || _mm_movemask_ps(v.m_values[1]) != 0; |
1273 | } |
1274 | |
1275 | static bool AllSet(const FloatCompFlag &v) |
1276 | { |
1277 | return _mm_movemask_ps(v.m_values[0]) == 0xf && _mm_movemask_ps(v.m_values[1]) == 0xf; |
1278 | } |
1279 | }; |
1280 | |
1281 | #else |
1282 | // Scalar version |
1283 | struct ParallelMath |
1284 | { |
// Rounding-mode tag types. In the scalar build they hold no state; the
// RoundAndConvertToInt overloads pick their behavior from the pointer type.

// Tag: truncate toward zero.
struct RoundTowardZeroForScope
{
};

// Tag: round to nearest.
struct RoundTowardNearestForScope
{
};

// Tag: round up (ceiling).
struct RoundUpForScope
{
};

// Tag: round down (floor).
struct RoundDownForScope
{
};
1300 | |
// Number of values handled per operation; the scalar build handles one at a time.
static const int ParallelSize = 1;

// Labels for the interpretations of a 16-bit integer value.
enum Int16Subtype
{
    IntSubtype_Signed,
    IntSubtype_UnsignedFull,
    IntSubtype_UnsignedTruncated,
    IntSubtype_Abstract,
};

// Every 16-bit integer flavor is stored in a plain int32_t in the scalar build.
typedef int32_t SInt16;
typedef int32_t UInt15;
typedef int32_t UInt16;
typedef int32_t AInt16;

// The 32-bit integer flavors are likewise all int32_t.
typedef int32_t SInt32;
typedef int32_t UInt31;
typedef int32_t UInt32;
typedef int32_t AInt32;

// Single-value (per-lane) integer types.
typedef int32_t ScalarUInt16;
typedef int32_t ScalarSInt16;

typedef float Float;
1325 | |
// All scalar integer types are int32_t, so the cast hands back the same
// reference regardless of TTargetType.
template<class TTargetType>
struct LosslessCast
{
    static const int32_t& Cast(const int32_t &src) { return src; }
};
1334 | |
// Comparison results are plain bools in the scalar build.
typedef bool Int16CompFlag;
typedef bool FloatCompFlag;
1337 | |
// Plain integer addition.
static int32_t AbstractAdd(const int32_t &a, const int32_t &b)
{
    const int32_t sum = a + b;
    return sum;
}
1342 | |
// Plain integer subtraction (a minus b).
static int32_t AbstractSubtract(const int32_t &a, const int32_t &b)
{
    const int32_t difference = a - b;
    return difference;
}
1347 | |
// Returns a when flag is set, otherwise b.
static float Select(bool flag, float a, float b)
{
    if (flag)
        return a;
    return b;
}

// Returns a when flag is set, otherwise b.
static int32_t Select(bool flag, int32_t a, int32_t b)
{
    if (flag)
        return a;
    return b;
}
1357 | |
// Returns a when flag is set, otherwise zero.
static int32_t SelectOrZero(bool flag, int32_t a)
{
    if (!flag)
        return 0;
    return a;
}
1362 | |
// Overwrites dest with src only when flag is set.
static void ConditionalSet(int32_t& dest, bool flag, int32_t src)
{
    if (!flag)
        return;
    dest = src;
}
1368 | |
// Overwrites the flag dest with src only when flag is set.
static void ConditionalSet(bool& dest, bool flag, bool src)
{
    if (!flag)
        return;
    dest = src;
}
1374 | |
// Returns -v when flag is set, otherwise v unchanged.
static int32_t ConditionalNegate(bool flag, int32_t v)
{
    if (flag)
        return -v;
    return v;
}
1379 | |
// Overwrites dest with src only when flag is NOT set.
static void NotConditionalSet(int32_t& dest, bool flag, int32_t src)
{
    if (flag)
        return;
    dest = src;
}
1385 | |
// Overwrites dest with src only when flag is set.
static void ConditionalSet(float& dest, bool flag, float src)
{
    if (!flag)
        return;
    dest = src;
}
1391 | |
// Overwrites dest with src only when flag is NOT set.
static void NotConditionalSet(float& dest, bool flag, float src)
{
    if (flag)
        return;
    dest = src;
}
1397 | |
// Replaces a zero value with 1.0f so it can be used as a divisor.
static void MakeSafeDenominator(float& v)
{
    const bool isZero = (v == 0.0f);
    if (isZero)
        v = 1.0f;
}
1403 | |
// Right shift of a signed value using the built-in >> operator.
static int32_t SignedRightShift(int32_t v, int bits)
{
    const int32_t shifted = v >> bits;
    return shifted;
}
1408 | |
1409 | static int32_t TruncateToPrecisionSigned(int32_t v, int precision) |
1410 | { |
1411 | v = (v << (32 - precision)) & 0xffffffff; |
1412 | return SignedRightShift(v, 32 - precision); |
1413 | } |
1414 | |
// Keeps only the low `precision` bits of v.
static int32_t TruncateToPrecisionUnsigned(int32_t v, int precision)
{
    const int32_t lowBitsMask = (1 << precision) - 1;
    return v & lowBitsMask;
}
1419 | |
// Smaller of two integers.
static int32_t Min(int32_t a, int32_t b)
{
    return (a < b) ? a : b;
}

// Smaller of two floats; returns b whenever (a < b) is false.
static float Min(float a, float b)
{
    return (a < b) ? a : b;
}
1433 | |
// Larger of two integers.
static int32_t Max(int32_t a, int32_t b)
{
    return (a > b) ? a : b;
}

// Larger of two floats; returns b whenever (a > b) is false.
static float Max(float a, float b)
{
    return (a > b) ? a : b;
}
1447 | |
// Absolute value of a float, via fabsf.
static float Abs(float a)
{
    const float magnitude = fabsf(a);
    return magnitude;
}

// Absolute value of an integer.
static int32_t Abs(int32_t a)
{
    return (a < 0) ? -a : a;
}
1459 | |
// Limits v to [min, max]; the lower bound is checked first.
static float Clamp(float v, float min, float max)
{
    return (v < min) ? min : ((v > max) ? max : v);
}
1468 | |
// Returns 1 / v using a full-precision float divide.
static float Reciprocal(float v)
{
    const float one = 1.0f;
    return one / v;
}
1473 | |
1474 | static void ConvertLDRInputs(const PixelBlockU8* inputBlocks, int pxOffset, int channel, int32_t& chOut) |
1475 | { |
1476 | chOut = inputBlocks[0].m_pixels[pxOffset][channel]; |
1477 | } |
1478 | |
1479 | static void ConvertHDRInputs(const PixelBlockF16* inputBlocks, int pxOffset, int channel, int32_t& chOut) |
1480 | { |
1481 | chOut = inputBlocks[0].m_pixels[pxOffset][channel]; |
1482 | } |
1483 | |
// Builds a Float from a constant; in the scalar build this is the value itself.
static float MakeFloat(float v)
{
    const float lane = v;
    return lane;
}
1488 | |
// Returns a Float holding zero.
static float MakeFloatZero()
{
    const float zero = 0.0f;
    return zero;
}
1493 | |
// Widens an unsigned 16-bit constant to the int32_t storage type.
static int32_t MakeUInt16(uint16_t v)
{
    return static_cast<int32_t>(v);
}
1498 | |
// Widens a signed 16-bit constant to the int32_t storage type.
static int32_t MakeSInt16(int16_t v)
{
    return static_cast<int32_t>(v);
}
1503 | |
// Widens a 16-bit constant to the int32_t storage type used for AInt16.
static int32_t MakeAInt16(int16_t v)
{
    return static_cast<int32_t>(v);
}
1508 | |
// Widens an unsigned 16-bit constant to the int32_t storage type used for UInt15.
static int32_t MakeUInt15(uint16_t v)
{
    return static_cast<int32_t>(v);
}
1513 | |
// Builds an SInt32 from a constant; in the scalar build this is the value itself.
static int32_t MakeSInt32(int32_t v)
{
    const int32_t lane = v;
    return lane;
}
1518 | |
// Builds a UInt31 from a constant; in the scalar build this is the value itself.
static int32_t MakeUInt31(int32_t v)
{
    const int32_t lane = v;
    return lane;
}
1523 | |
1524 | static int32_t Extract(int32_t v, int offset) |
1525 | { |
1526 | UNREFERENCED_PARAMETER(offset); |
1527 | return v; |
1528 | } |
1529 | |
1530 | static bool Extract(bool v, int offset) |
1531 | { |
1532 | UNREFERENCED_PARAMETER(offset); |
1533 | return v; |
1534 | } |
1535 | |
1536 | static float Extract(float v, int offset) |
1537 | { |
1538 | UNREFERENCED_PARAMETER(offset); |
1539 | return v; |
1540 | } |
1541 | |
1542 | static void PutUInt16(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v) |
1543 | { |
1544 | UNREFERENCED_PARAMETER(offset); |
1545 | dest = v; |
1546 | } |
1547 | |
1548 | static void PutUInt15(int32_t &dest, int offset, ParallelMath::ScalarUInt16 v) |
1549 | { |
1550 | UNREFERENCED_PARAMETER(offset); |
1551 | dest = v; |
1552 | } |
1553 | |
1554 | static void PutSInt16(int32_t &dest, int offset, ParallelMath::ScalarSInt16 v) |
1555 | { |
1556 | UNREFERENCED_PARAMETER(offset); |
1557 | dest = v; |
1558 | } |
1559 | |
1560 | static float ExtractFloat(float v, int offset) |
1561 | { |
1562 | UNREFERENCED_PARAMETER(offset); |
1563 | return v; |
1564 | } |
1565 | |
1566 | static void PutFloat(float &dest, int offset, float v) |
1567 | { |
1568 | UNREFERENCED_PARAMETER(offset); |
1569 | dest = v; |
1570 | } |
1571 | |
1572 | static void PutBoolInt16(bool &dest, int offset, bool v) |
1573 | { |
1574 | UNREFERENCED_PARAMETER(offset); |
1575 | dest = v; |
1576 | } |
1577 | |
// Strict less-than for integers.
static bool Less(int32_t a, int32_t b)
{
    const bool isLess = (a < b);
    return isLess;
}

// Strict less-than for floats.
static bool Less(float a, float b)
{
    const bool isLess = (a < b);
    return isLess;
}
1587 | |
// Less-than-or-equal for integers. Equal operands compare true
// (the old body used '<' and returned false for them).
static bool LessOrEqual(int32_t a, int32_t b)
{
    return a <= b;
}

// Less-than-or-equal for floats. Equal operands compare true; any comparison
// involving NaN is false, as with the built-in operator.
static bool LessOrEqual(float a, float b)
{
    return a <= b;
}
1597 | |
// Exact equality for integers.
static bool Equal(int32_t a, int32_t b)
{
    const bool isEqual = (a == b);
    return isEqual;
}

// Exact equality for floats (no tolerance).
static bool Equal(float a, float b)
{
    const bool isEqual = (a == b);
    return isEqual;
}
1607 | |
// Converts an integer value to float.
static float ToFloat(int32_t v)
{
    const float converted = static_cast<float>(v);
    return converted;
}
1612 | |
// Relabels a value as UInt31; the value is returned unchanged.
static int32_t ToUInt31(int32_t v)
{
    const int32_t relabeled = v;
    return relabeled;
}
1617 | |
// Relabels a value as a 32-bit integer; the value is returned unchanged.
static int32_t ToInt32(int32_t v)
{
    const int32_t relabeled = v;
    return relabeled;
}
1622 | |
// Flag conversion; both flag types are bool here, so the value passes through.
static bool FloatFlagToInt16(bool v)
{
    const bool flag = v;
    return flag;
}
1627 | |
// Flag conversion; flags are plain bools here, so the value passes through.
static bool Int32FlagToInt16(bool v)
{
    const bool flag = v;
    return flag;
}
1632 | |
// Flag conversion; both flag types are bool here, so the value passes through.
static bool Int16FlagToFloat(bool v)
{
    const bool flag = v;
    return flag;
}
1637 | |
// Builds an Int16CompFlag from a constant; it is the bool itself here.
static bool MakeBoolInt16(bool b)
{
    const bool flag = b;
    return flag;
}
1642 | |
// Builds a FloatCompFlag from a constant; it is the bool itself here.
static bool MakeBoolFloat(bool b)
{
    const bool flag = b;
    return flag;
}
1647 | |
// Logical (a AND NOT b).
static bool AndNot(bool a, bool b)
{
    if (b)
        return false;
    return a;
}
1652 | |
// Logical negation of a flag.
static bool Not(bool b)
{
    return b ? false : true;
}
1657 | |
1658 | static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardZeroForScope *rtz) |
1659 | { |
1660 | UNREFERENCED_PARAMETER(rtz); |
1661 | return static_cast<int>(v); |
1662 | } |
1663 | |
1664 | static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundUpForScope *ru) |
1665 | { |
1666 | UNREFERENCED_PARAMETER(ru); |
1667 | return static_cast<int>(ceilf(v)); |
1668 | } |
1669 | |
1670 | static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundDownForScope *rd) |
1671 | { |
1672 | UNREFERENCED_PARAMETER(rd); |
1673 | return static_cast<int>(floorf(v)); |
1674 | } |
1675 | |
1676 | static int32_t RoundAndConvertToInt(float v, const ParallelMath::RoundTowardNearestForScope *rtn) |
1677 | { |
1678 | UNREFERENCED_PARAMETER(rtn); |
1679 | return static_cast<int>(floorf(v + 0.5f)); |
1680 | } |
1681 | |
// Rounds v per the rounding-mode tag; forwards to RoundAndConvertToInt.
template<class TRoundMode>
static int32_t RoundAndConvertToU16(float v, const TRoundMode *roundingMode)
{
    const int32_t rounded = RoundAndConvertToInt(v, roundingMode);
    return rounded;
}
1687 | |
// Rounds v per the rounding-mode tag; forwards to RoundAndConvertToInt.
template<class TRoundMode>
static int32_t RoundAndConvertToU15(float v, const TRoundMode *roundingMode)
{
    const int32_t rounded = RoundAndConvertToInt(v, roundingMode);
    return rounded;
}
1693 | |
// Rounds v per the rounding-mode tag; forwards to RoundAndConvertToInt.
template<class TRoundMode>
static int32_t RoundAndConvertToS16(float v, const TRoundMode *roundingMode)
{
    const int32_t rounded = RoundAndConvertToInt(v, roundingMode);
    return rounded;
}
1699 | |
// Square root, via sqrtf.
static float Sqrt(float f)
{
    const float root = sqrtf(f);
    return root;
}
1704 | |
// Squared difference of two integer values.
static int32_t SqDiffUInt8(int32_t a, int32_t b)
{
    const int32_t diff = a - b;
    const int32_t squared = diff * diff;
    return squared;
}
1710 | |
// Squared difference of two integer values.
static int32_t SqDiffInt16(int32_t a, int32_t b)
{
    const int32_t diff = a - b;
    const int32_t squared = diff * diff;
    return squared;
}
1716 | |
// Squared difference of two integer values.
static int32_t SqDiffSInt16(int32_t a, int32_t b)
{
    const int32_t diff = a - b;
    const int32_t squared = diff * diff;
    return squared;
}
1722 | |
// Converts a signed integer to a float, treating |v| as half-float
// exponent/mantissa bits and the sign of v as the float sign.
// NOTE(review): zero-exponent (denormal) inputs come out as mantissa * 2^-25,
// while IEEE binary16 denormals are mantissa * 2^-24 - confirm this is intended.
static float TwosCLHalfToFloat(int32_t v)
{
    // Work in unsigned so the 16-bit shifts below cannot overflow a signed int.
    const uint32_t absV = static_cast<uint32_t>((v < 0) ? -v : v);

    // The sign must come from v itself; masking it out of the magnitude
    // (as the old code did) loses it for every negative input.
    const uint32_t signBits = (v < 0) ? 0x8000u : 0u;
    const uint32_t mantissa = (absV & 0x03ffu);
    uint32_t exponent = (absV & 0x7c00u);

    const bool isDenormal = (exponent == 0);

    // Convert exponent to high-bits: move it into float position within the
    // upper halfword and rebias (14336 == (127 - 15) << 7).
    exponent = (exponent >> 3) + 14336u;

    // Zero-exponent inputs pick up an implicit leading one from the rebias;
    // this constant is subtracted below to remove it.
    const uint32_t denormalCorrection = (isDenormal ? (signBits | 14336u) : 0u) << 16;

    const uint32_t fBits = ((exponent | signBits) << 16) | (mantissa << 13);

    float f, correction;
    memcpy(&f, &fBits, 4);
    memcpy(&correction, &denormalCorrection, 4);

    return f - correction;
}
1746 | |
1747 | static Float SqDiff2CLFloat(const SInt16 &a, const Float &b) |
1748 | { |
1749 | Float fa = TwosCLHalfToFloat(a); |
1750 | |
1751 | Float diff = fa - b; |
1752 | return diff * diff; |
1753 | } |
1754 | |
1755 | static Float SqDiff2CL(const SInt16 &a, const SInt16 &b) |
1756 | { |
1757 | Float fa = TwosCLHalfToFloat(a); |
1758 | Float fb = TwosCLHalfToFloat(b); |
1759 | |
1760 | Float diff = fa - fb; |
1761 | return diff * diff; |
1762 | } |
1763 | |
1764 | static Float SqDiff2CLFloat(const SInt16 &a, float aWeight, const Float &b) |
1765 | { |
1766 | Float fa = TwosCLHalfToFloat(a) * aWeight; |
1767 | |
1768 | Float diff = fa - b; |
1769 | return diff * diff; |
1770 | } |
1771 | |
1772 | static int32_t RightShift(int32_t v, int bits) |
1773 | { |
1774 | return SignedRightShift(v, bits); |
1775 | } |
1776 | |
// Relabels a value as SInt16; the value is returned unchanged.
static int32_t ToSInt16(int32_t v)
{
    const int32_t relabeled = v;
    return relabeled;
}
1781 | |
// Relabels a value as UInt16; the value is returned unchanged (no masking).
static int32_t ToUInt16(int32_t v)
{
    const int32_t relabeled = v;
    return relabeled;
}
1786 | |
// Relabels a value as UInt15; the value is returned unchanged (no masking).
static int32_t ToUInt15(int32_t v)
{
    const int32_t relabeled = v;
    return relabeled;
}
1791 | |
// Widening multiply; with int32_t storage this is an ordinary product.
static int32_t XMultiply(int32_t a, int32_t b)
{
    const int32_t product = a * b;
    return product;
}
1796 | |
// Non-widening multiply; with int32_t storage this is an ordinary product.
static int32_t CompactMultiply(int32_t a, int32_t b)
{
    const int32_t product = a * b;
    return product;
}
1801 | |
// With a single lane, "any set" is the flag itself.
static bool AnySet(bool v)
{
    const bool anyLane = v;
    return anyLane;
}
1806 | |
// With a single lane, "all set" is the flag itself.
static bool AllSet(bool v)
{
    const bool everyLane = v;
    return everyLane;
}
1811 | }; |
1812 | |
1813 | #endif |
1814 | } |
1815 | |
1816 | #endif |
1817 | |