1 | /* |
2 | Copyright (c) 2012, Broadcom Europe Ltd |
3 | All rights reserved. |
4 | |
5 | Redistribution and use in source and binary forms, with or without |
6 | modification, are permitted provided that the following conditions are met: |
7 | * Redistributions of source code must retain the above copyright |
8 | notice, this list of conditions and the following disclaimer. |
9 | * Redistributions in binary form must reproduce the above copyright |
10 | notice, this list of conditions and the following disclaimer in the |
11 | documentation and/or other materials provided with the distribution. |
12 | * Neither the name of the copyright holder nor the |
13 | names of its contributors may be used to endorse or promote products |
14 | derived from this software without specific prior written permission. |
15 | |
16 | THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND |
17 | ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED |
18 | WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
19 | DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY |
20 | DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES |
21 | (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; |
22 | LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
23 | ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT |
24 | (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
25 | SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
26 | */ |
27 | |
28 | #ifndef KHRN_INT_UTIL_H |
29 | #define KHRN_INT_UTIL_H |
30 | |
31 | #include <ctype.h> |
32 | #include <float.h> |
33 | #include <math.h> |
34 | |
35 | #include "interface/khronos/common/khrn_int_common.h" |
#if !defined(__VIDEOCORE__) && !defined(WIN32) // threadsx/nucleus define LONG which clashes
37 | #include "interface/vcos/vcos.h" |
38 | #endif |
39 | |
40 | /****************************************************************************** |
41 | replacements for videocore intrinsics |
42 | ******************************************************************************/ |
43 | |
44 | #ifdef _VIDEOCORE |
45 | #include <vc/intrinsics.h> |
/*
   float min/max for the _VIDEOCORE build: both arguments are cast to float
   and passed on to _min/_max
   NOTE(review): _min/_max are presumably the intrinsics pulled in from
   <vc/intrinsics.h> above -- confirm
*/
#define _minf(x,y) _min((float)(x),(float)(y))
#define _maxf(x,y) _max((float)(x),(float)(y))
48 | #else |
/*
   keep the low (y & 31) bits of x and clear the rest

   only the bottom 5 bits of y are used, so a count of 32 behaves like 0 and
   yields 0

   the mask is built with unsigned arithmetic: for a count of 31 the signed
   expression 1 << 31 overflows int, which is undefined behaviour
*/

static INLINE int32_t _bmask(int32_t x, int32_t y)
{
   const uint32_t keep = ((uint32_t)1 << ((uint32_t)y & 0x1fu)) - 1u;
   return (int32_t)((uint32_t)x & keep);
}
53 | |
/* signed 32-bit minimum: x when it is strictly smaller, otherwise y */
static INLINE int32_t _min(int32_t x, int32_t y)
{
   if (x < y) {
      return x;
   }
   return y;
}
58 | |
/* signed 32-bit maximum: x when it is strictly larger, otherwise y */
static INLINE int32_t _max(int32_t x, int32_t y)
{
   if (x > y) {
      return x;
   }
   return y;
}
63 | |
64 | #if defined(_MSC_VER) |
/*
   index of the most significant set bit of x (msvc inline assembly version)

   l starts at -1 and is only overwritten, with the result of the x86 bsr
   instruction, when x is non-zero; so 0 maps to -1
*/
static INLINE int32_t _msb(uint32_t x)
{
   int32_t l = -1;

   if (x)
      __asm {
         bsr eax, x
         mov l, eax
      }

   return l;
}
77 | #elif defined __CC_ARM |
/*
   index of the most significant set bit of x (armcc version): 31 minus the
   value returned by the __clz intrinsic
   NOTE(review): the other variants in this header return -1 for x == 0; this
   one matches only if __clz(0) yields 32 -- confirm against the armcc
   documentation
*/
static INLINE int32_t _msb(uint32_t x)
{
   return 31 - __clz(x);
}
82 | #elif defined(__GNUC__) |
/*
   index of the most significant set bit of x (gcc version), or -1 when x is 0

   zero is handled before __builtin_clz is called, so the builtin never sees
   a zero argument
*/
static INLINE int32_t _msb(uint32_t x)
{
   if (x == 0) {
      return -1;
   }
   return 31 - __builtin_clz(x);
}
87 | #else |
/*
   index of the most significant set bit of x (portable version), or -1 when
   x is 0

   x is unsigned to get lsr: the loop shifts it right one bit at a time and
   counts the steps until nothing is left
*/
static INLINE int32_t _msb(uint32_t x)
{
   int32_t index;
   for (index = -1; x != 0; x >>= 1) {
      ++index;
   }
   return index;
}
97 | #endif |
98 | |
/*
   number of set bits in x

   every pass clears the lowest set bit (x &= x - 1), so the loop runs once
   per set bit
*/
static INLINE uint32_t _count(uint32_t x)
{
   uint32_t set_bits;
   for (set_bits = 0; x != 0; ++set_bits) {
      x &= x - 1;
   }
   return set_bits;
}
108 | |
109 | #if defined __CC_ARM && __TARGET_ARCH_THUMB >= 4 |
110 | static INLINE uint32_t _bitrev(uint32_t x, uint32_t y) |
111 | { |
112 | return __rbit(x) >> (32-y); |
113 | } |
114 | #else |
/*
   reverse the order of the low y bits of x (portable version)

   bit i of x becomes bit (y - 1 - i) of the result for every i below y; bits
   of x at or above position y do not contribute. y == 0 gives 0
*/
static INLINE uint32_t _bitrev(uint32_t x, uint32_t y)
{
   uint32_t reversed = 0;
   uint32_t src_bit = 0;

   while (src_bit != y) {
      const uint32_t bit = (x >> src_bit) & 1;
      reversed |= bit << (y - src_bit - 1);
      ++src_bit;
   }
   return reversed;
}
124 | #endif |
125 | |
126 | #ifdef __CC_ARM |
/*
   armcc version of _adds: forwards to the __qadd intrinsic
   NOTE(review): presumably a saturating add, matching the portable fallback
   in the #else branch -- confirm against the armcc documentation
*/
static INLINE int32_t _adds(int32_t x, int32_t y)
{
   return __qadd(x, y);
}
131 | |
/*
   armcc version of _subs: forwards to the __qsub intrinsic
   NOTE(review): presumably a saturating subtract (x - y), matching the
   portable fallback in the #else branch -- confirm against the armcc
   documentation
*/
static INLINE int32_t _subs(int32_t x, int32_t y)
{
   return __qsub(x, y);
}
136 | |
/*
   armcc version of _ror: forwards to the __ror intrinsic
   NOTE(review): presumably rotates x right by y bits, matching the portable
   fallback in the #else branch -- confirm against the armcc documentation
*/
static INLINE uint32_t _ror(uint32_t x, uint32_t y)
{
   return __ror(x, y);
}
141 | #else |
/*
   saturating signed add: x + y clamped to [-0x80000000, 0x7fffffff]

   the sum is formed in 64 bits and clamped afterwards. detecting the overflow
   from a wrapped 32-bit sum relies on signed overflow, which is undefined
   behaviour
*/
static INLINE int32_t _adds(int32_t x, int32_t y)
{
   const int64_t sum = (int64_t)x + (int64_t)y;

   if (sum > (int64_t)0x7fffffff) {
      return (int32_t)0x7fffffff;
   }
   if (sum < (-(int64_t)0x7fffffff - 1)) {
      return (int32_t)(-0x7fffffff - 1);
   }
   return (int32_t)sum;
}
147 | |
/*
   saturating signed subtract: x - y clamped to [-0x80000000, 0x7fffffff]

   the difference is formed in 64 bits and clamped afterwards. detecting the
   overflow from a wrapped 32-bit difference relies on signed overflow, which
   is undefined behaviour
*/
static INLINE int32_t _subs(int32_t x, int32_t y)
{
   const int64_t diff = (int64_t)x - (int64_t)y;

   if (diff > (int64_t)0x7fffffff) {
      return (int32_t)0x7fffffff;
   }
   if (diff < (-(int64_t)0x7fffffff - 1)) {
      return (int32_t)(-0x7fffffff - 1);
   }
   return (int32_t)diff;
}
153 | |
/*
   rotate x right by y bits

   the count is reduced modulo 32, so 0 and multiples of 32 return x
   unchanged. without the masking those counts shift a 32-bit value by 32 or
   more, which is undefined behaviour
*/
static INLINE uint32_t _ror(uint32_t x, uint32_t y)
{
   const uint32_t right = y & 31u;
   const uint32_t left = (32u - right) & 31u;

   return (x >> right) | (x << left);
}
158 | #endif // __CC_ARM |
159 | |
/*
   absolute value of x

   the negate is done on the unsigned bit pattern: -x overflows int for the
   most negative value, which is undefined behaviour. that one input has no
   positive counterpart in int32_t and comes back unchanged on two's
   complement targets
*/
static INLINE int32_t _abs(int32_t x)
{
   const uint32_t bits = (uint32_t)x;
   return (int32_t)((x < 0) ? (0u - bits) : bits);
}
164 | |
/*
   float minimum: x when it compares strictly less than y, otherwise y
   (so y is returned whenever the comparison is false, including when either
   argument is a nan)
*/
static INLINE float _minf(float x, float y)
{
   if (x < y) {
      return x;
   }
   return y;
}
169 | |
/*
   float maximum: x when it compares strictly greater than y, otherwise y
   (so y is returned whenever the comparison is false, including when either
   argument is a nan)
*/
static INLINE float _maxf(float x, float y)
{
   if (x > y) {
      return x;
   }
   return y;
}
174 | |
175 | #endif // !_VIDEOCORE |
176 | |
177 | |
178 | /****************************************************************************** |
179 | misc stuff |
180 | ******************************************************************************/ |
181 | |
/*
   number of elements in ARR: total size divided by the size of one element
   only meaningful for a real array; given a pointer (for example an array
   function parameter) it divides the pointer size by the element size
*/
#define ARR_COUNT(ARR) (sizeof(ARR) / sizeof(*(ARR)))
183 | |
/*
   sign-extend 16-bit value with range [-0x4000, 0xbfff]

   the value is moved down by 0x4000, narrowed to int16_t (which does the
   wrapping) and moved back up by the same amount
*/
static INLINE int32_t s_ext_off16(int32_t x)
{
   const int32_t bias = 0x4000;
   const int16_t wrapped = (int16_t)(x - bias);
   return (int32_t)wrapped + bias;
}
189 | |
/*
   true when exactly one bit of x is set (0 is not a power of 2)

   x & (x - 1) clears the lowest set bit, so the result is 0 only when that
   was the only bit
*/
static INLINE bool is_power_of_2(uint32_t x)
{
   const uint32_t without_lowest_bit = x & (x - 1);
   return (x != 0) && (without_lowest_bit == 0);
}
194 | |
195 | static INLINE uint32_t next_power_of_2(uint32_t x) |
196 | { |
197 | return is_power_of_2(x) ? x : (uint32_t)(1 << (_msb(x) + 1)); |
198 | } |
199 | |
200 | static INLINE uint32_t round_up(uint32_t x, uint32_t y) |
201 | { |
202 | vcos_assert(is_power_of_2(y)); |
203 | return (x + (y - 1)) & ~(y - 1); |
204 | } |
205 | |
206 | static INLINE void *round_up_ptr(void *x, uint32_t y) |
207 | { |
208 | vcos_assert(is_power_of_2(y)); |
209 | return (void *)(((uintptr_t)x + (uintptr_t)(y - 1)) & ~(uintptr_t)(y - 1)); |
210 | } |
211 | |
/*
   remainder of x / y, with a negative remainder moved up by y
   for positive y this lands the result in [0, y). y must not be 0
*/
static INLINE uint32_t mod(int32_t x, int32_t y)
{
   int32_t remainder = x % y;

   if (remainder < 0) {
      remainder += y;
   }
   return remainder;
}
217 | |
/*
   declared here, defined elsewhere
   NOTE(review): presumably returns the size of the given GL type enum --
   confirm units and the result for unknown types against the definition
*/
extern int khrn_get_type_size(int type /* GLenum*/);
219 | |
220 | static INLINE int find_max(int count, int size, const void *indices) |
221 | { |
222 | int i; |
223 | int32_t max = -1; |
224 | |
225 | switch (size) { |
226 | case 1: |
227 | { |
228 | uint8_t *u = (uint8_t *)indices; |
229 | |
230 | for (i = 0; i < count; i++) |
231 | max = _max( max, (int32_t) u[i]); |
232 | |
233 | break; |
234 | } |
235 | case 2: |
236 | { |
237 | uint16_t *u = (uint16_t *)indices; |
238 | |
239 | for (i = 0; i < count; i++) |
240 | max = _max( max, (int32_t) u[i]); |
241 | |
242 | break; |
243 | } |
244 | default: |
245 | UNREACHABLE(); |
246 | break; |
247 | } |
248 | |
249 | return (int) max; |
250 | } |
251 | |
252 | /****************************************************************************** |
253 | for poking around inside floats (we assume ieee-754) |
254 | ******************************************************************************/ |
255 | |
/*
   overlay of a float and a 32-bit unsigned integer, used by float_to_bits
   and float_from_bits to move the raw bit pattern between the two types
*/
typedef union {
   float f;         /* the value seen as a float */
   uint32_t bits;   /* the same storage seen as raw bits */
} KHRN_FLOAT_BITS_T;
260 | |
261 | static INLINE uint32_t float_to_bits(float f) |
262 | { |
263 | KHRN_FLOAT_BITS_T t; |
264 | t.f = f; |
265 | return t.bits; |
266 | } |
267 | |
268 | static INLINE float float_from_bits(uint32_t bits) |
269 | { |
270 | KHRN_FLOAT_BITS_T t; |
271 | t.bits = bits; |
272 | return t.f; |
273 | } |
274 | |
275 | /****************************************************************************** |
276 | input cleaning stuff |
277 | ******************************************************************************/ |
278 | |
279 | #include "interface/khronos/common/khrn_int_util_cr.h" |
280 | |
281 | static INLINE void clean_floats(float *dst, const float *src, uint32_t count) |
282 | { |
283 | uint32_t i; |
284 | for (i = 0; i != count; ++i) { |
285 | dst[i] = clean_float(src[i]); |
286 | } |
287 | } |
288 | |
289 | /****************************************************************************** |
290 | float to int conversions |
291 | ******************************************************************************/ |
292 | |
293 | static INLINE float r2ni_to_r2n_bias(float f, int32_t shift) |
294 | { |
295 | vcos_assert((shift >= -129) && (shift <= 124)); |
296 | return f + float_from_bits(((127 - (shift + 2)) << 23) | 0x7fffff); |
297 | } |
298 | |
/*
   convert float to integer value with shift
   saturating, round to nearest

   the result is f * 2^shift as an int32_t, clamped to
   [-0x80000000, 0x7fffffff]

   on videocore, we support shifts in [-32, 31]. we only need to support shifts
   of 0 and 16 for client-side code (the non-videocore path asserts that shift
   is in [0, 31])

   the 2^shift factor and the sign-bit constant are built from an unsigned 1:
   the signed expression 1 << 31 overflows int, which is undefined behaviour
*/

static INLINE int32_t float_to_int_shift(float f, int32_t shift)
{
#ifdef _VIDEOCORE
   /* floattouint is wrapping, round to negative infinity. shift should be in [-32, 31] */
   vcos_assert((shift >= -32) && (shift <= 31));
   f = r2ni_to_r2n_bias(f, shift);
   if (f < float_from_bits(((uint32_t)1 << 31) | ((uint32_t)(127 + (31 - shift)) << 23))) { return 0x80000000; }
   if (f > float_from_bits(((uint32_t)(127 + (30 - shift)) << 23) | 0x7fffff)) { return 0x7fffffff; }
   return _floattouint(f, shift);
#else
   vcos_assert((shift >= 0) && (shift <= 31));
   f *= (float)((uint32_t)1 << shift);
   f += (f < 0.0f) ? -0.49999997f : 0.49999997f; /* assume float -> int conversion is round to zero */
   if (f < -2.14748365e9f) { return 0x80000000; }
   if (f > 2.14748352e9f) { return 0x7fffffff; }
   return (int32_t)f;
#endif
}
325 | |
326 | /* |
327 | convert float to 48-bit integer value with shift |
328 | saturating, round to nearest |
329 | |
330 | this is only supported on videocore. shift should be in [-16, 31] |
331 | */ |
332 | |
333 | #ifdef _VIDEOCORE |
334 | static INLINE int64_t float_to_int48_shift(float f, int32_t shift) |
335 | { |
336 | /* floattouint is wrapping, round to negative infinity. shift should be in [-32, 31] */ |
337 | vcos_assert((shift >= -16) && (shift <= 31)); |
338 | f = r2ni_to_r2n_bias(f, shift); |
339 | if (f < float_from_bits((1 << 31) | ((127 + (47 - shift)) << 23))) { return 0xffff800000000000ll; } |
340 | if (f > float_from_bits(((127 + (46 - shift)) << 23) | 0x7fffff)) { return 0x00007fffffffffffll; } |
341 | return ((int64_t)(int32_t)_floattouint(f, shift - 16) << 16) | _floattouint(f, shift); |
342 | } |
343 | #endif |
344 | |
/*
   convert float to integer value
   saturating, round to nearest

   forwards to float_to_int_shift with a shift of 0
*/

static INLINE int32_t float_to_int(float f)
{
   return float_to_int_shift(f, 0);
}
354 | |
355 | /* |
356 | convert float to integer value |
357 | saturating, round to negative inf |
358 | */ |
359 | |
360 | static INLINE int32_t float_to_int_floor(float f) |
361 | { |
362 | /* |
363 | special-case handling of small negative floats |
364 | this is so we return -1 for negative denormals (which the vg cts requires) |
365 | (we shouldn't need this if the fp library/hw properly handle denormals) |
366 | */ |
367 | |
368 | uint32_t u = float_to_bits(f); |
369 | if (((u & (1 << 31)) && (u + u)) && (f > -1.0f)) { |
370 | return -1; |
371 | } |
372 | |
373 | f = floorf(f); /* assume float -> int conversion is round to zero */ |
374 | if (f < -2.14748365e9f) { return 0x80000000; } |
375 | if (f > 2.14748352e9f) { return 0x7fffffff; } |
376 | return (int32_t)f; |
377 | } |
378 | |
/*
   convert float to integer value
   saturating, round to zero

   values beyond the two limits below are clamped to 0x7fffffff / 0x80000000;
   everything else goes through the plain float -> int conversion, which is
   assumed to round to zero
*/

static INLINE int32_t float_to_int_zero(float f)
{
   int32_t truncated;

   if (f < -2.14748365e9f) {
      truncated = 0x80000000;
   } else if (f > 2.14748352e9f) {
      truncated = 0x7fffffff;
   } else {
      truncated = (int32_t)f;
   }
   return truncated;
}
391 | |
/*
   convert float to 16.16 fixed point value
   saturating, round to nearest

   forwards to float_to_int_shift with a shift of 16

   Khronos documentation:

   If a value is so large in magnitude that it cannot be represented with the
   requested type, then the nearest value representable using the requested type
   is returned.
*/

static INLINE int32_t float_to_fixed(float f)
{
   return float_to_int_shift(f, 16);
}
407 | |
408 | /****************************************************************************** |
409 | exact float tests (in case fp library/hw don't handle denormals correctly) |
410 | ******************************************************************************/ |
411 | |
412 | static INLINE bool floats_identical(float x, float y) |
413 | { |
414 | return float_to_bits(x) == float_to_bits(y); |
415 | } |
416 | |
417 | static INLINE bool is_zero(float f) |
418 | { |
419 | uint32_t u = float_to_bits(f); |
420 | return !(u + u); |
421 | } |
422 | |
423 | static INLINE bool is_le_zero(float f) |
424 | { |
425 | uint32_t u = float_to_bits(f); |
426 | return (u & (1 << 31)) || !u; |
427 | } |
428 | |
429 | /****************************************************************************** |
430 | alignment stuff |
431 | ******************************************************************************/ |
432 | |
/*
   alignof(T): alignment of type T

   msvc and armcc use their built-in operators. every other compiler gets the
   struct trick: the size of a struct holding a T followed by a char, minus
   sizeof(T), i.e. the trailing char plus any tail padding

   NOTE(review): the macro shares its name with the C11 <stdalign.h> macro and
   the C++11 keyword -- confirm no translation unit mixes the two
*/
#ifdef _MSC_VER
#define alignof(T) __alignof(T)
#elif defined(__CC_ARM)
#define alignof(T) __alignof__(T)
#else
#define alignof(T) (sizeof(struct { T t; char ch; }) - sizeof(T))
#endif
440 | |
/*
   must use both ALIGNED and ALIGN_TO...
   ALIGNED(16) int align_me[10];
   ALIGN_TO(align_me, 16);

   each compiler acts on only one of the two and expands the other to nothing:
   msvc and gcc align through the declaration (ALIGNED), high c through a
   pragma that names the object (ALIGN_TO)
*/

#ifdef _MSC_VER
#define ALIGNED(ALIGNMENT) __declspec(align(ALIGNMENT))
#define ALIGN_TO(X, ALIGNMENT)
#elif defined(__GNUC__)
#define ALIGNED(ALIGNMENT) __attribute__ ((aligned(ALIGNMENT)))
#define ALIGN_TO(X, ALIGNMENT)
#elif defined(__HIGHC__)
#define ALIGNED(ALIGNMENT)
#define ALIGN_TO(X, ALIGNMENT) pragma Align_to(ALIGNMENT, X)
#else
/* leave undefined (will get error on use) */
#endif
459 | |
460 | /****************************************************************************** |
461 | range/rect intersect stuff |
462 | ******************************************************************************/ |
463 | |
/*
   clipping helpers, declared here and defined elsewhere

   the arguments suffixed 0 are pointers and the arguments suffixed 1 are
   passed by value
   NOTE(review): presumably the 0 values are clipped in place against the 1
   values, with x/y as origins and l/w/h as extents, and the "2" variants
   keep two origins (a and b) in step while sharing one size -- confirm
   against the definitions
*/

extern void khrn_clip_range(
   int32_t *x0, int32_t *l0,
   int32_t x1, int32_t l1);

extern void khrn_clip_range2(
   int32_t *ax0, int32_t *bx0, int32_t *l0,
   int32_t ax1, int32_t al1,
   int32_t bx1, int32_t bl1);

extern void khrn_clip_rect(
   int32_t *x0, int32_t *y0, int32_t *w0, int32_t *h0,
   int32_t x1, int32_t y1, int32_t w1, int32_t h1);

extern void khrn_clip_rect2(
   int32_t *ax0, int32_t *ay0, int32_t *bx0, int32_t *by0, int32_t *w0, int32_t *h0,
   int32_t ax1, int32_t ay1, int32_t aw1, int32_t ah1,
   int32_t bx1, int32_t by1, int32_t bw1, int32_t bh1);
481 | |
/*
   true when each range starts before the other one ends, where range 0 is
   [x0, x0 + l0) and range 1 is [x1, x1 + l1)
   the second comparison is only evaluated when the first one holds
*/
static INLINE bool khrn_ranges_intersect(
   int32_t x0, int32_t l0,
   int32_t x1, int32_t l1)
{
   const bool starts_before_end1 = x0 < (x1 + l1);

   return starts_before_end1 && (x1 < (x0 + l0));
}
488 | |
489 | static INLINE bool khrn_rects_intersect( |
490 | int32_t x0, int32_t y0, int32_t w0, int32_t h0, |
491 | int32_t x1, int32_t y1, int32_t w1, int32_t h1) |
492 | { |
493 | return khrn_ranges_intersect(x0, w0, x1, w1) && khrn_ranges_intersect(y0, h0, y1, h1); |
494 | } |
495 | |
496 | /****************************************************************************** |
497 | memory barrier |
498 | ******************************************************************************/ |
499 | |
/*
   khrn_barrier comes in one of three forms depending on the build:
   an empty inline function, an external function, or not defined at all
*/
#ifdef KHRN_SINGLE_THREADED
/* everything is done in one thread, no need for barriers */
static INLINE void khrn_barrier(void) {}
#elif defined(_VIDEOCORE)
/* don't need a real memory barrier
 * extern function should do as a compiler barrier, but todo: is there a better way?
 * (only declared here; the definition lives outside this header) */
extern void khrn_barrier(void);
#else
/* leave undefined (will get error on use) */
#endif
510 | |
511 | #endif |
512 | |