1 | // SPDX-License-Identifier: Apache-2.0 |
2 | // ---------------------------------------------------------------------------- |
3 | // Copyright 2011-2021 Arm Limited |
4 | // |
5 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
6 | // use this file except in compliance with the License. You may obtain a copy |
7 | // of the License at: |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, software |
12 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
13 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
14 | // License for the specific language governing permissions and limitations |
15 | // under the License. |
16 | // ---------------------------------------------------------------------------- |
17 | |
18 | /** |
19 | * @brief Soft-float library for IEEE-754. |
20 | */ |
21 | #if (ASTCENC_F16C == 0) && (ASTCENC_NEON == 0) |
22 | |
23 | #include "astcenc_mathlib.h" |
24 | |
25 | /* sized soft-float types. These are mapped to the sized integer |
26 | types of C99, instead of C's floating-point types; this is because |
27 | the library needs to maintain exact, bit-level control on all |
28 | operations on these data types. */ |
29 | typedef uint16_t sf16; |
30 | typedef uint32_t sf32; |
31 | |
32 | /****************************************** |
33 | helper functions and their lookup tables |
34 | ******************************************/ |
35 | /* count leading zeros functions. Only used when the input is nonzero. */ |
36 | |
37 | #if defined(__GNUC__) && (defined(__i386) || defined(__amd64)) |
38 | #elif defined(__arm__) && defined(__ARMCC_VERSION) |
39 | #elif defined(__arm__) && defined(__GNUC__) |
40 | #else |
41 | /* table used for the slow default versions. */ |
42 | static const uint8_t clz_table[256] = |
43 | { |
44 | 8, 7, 6, 6, 5, 5, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, |
45 | 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, |
46 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
47 | 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, |
48 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
49 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
50 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
51 | 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, |
52 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
53 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
54 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
55 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
56 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
57 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
58 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, |
59 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 |
60 | }; |
61 | #endif |
62 | |
63 | /* |
64 | 32-bit count-leading-zeros function: use the Assembly instruction whenever possible. */ |
65 | static uint32_t clz32(uint32_t inp) |
66 | { |
67 | #if defined(__GNUC__) && (defined(__i386) || defined(__amd64)) |
68 | uint32_t bsr; |
69 | __asm__("bsrl %1, %0" : "=r" (bsr):"r" (inp | 1)); |
70 | return 31 - bsr; |
71 | #else |
72 | #if defined(__arm__) && defined(__ARMCC_VERSION) |
73 | return __clz(inp); /* armcc builtin */ |
74 | #else |
75 | #if defined(__arm__) && defined(__GNUC__) |
76 | uint32_t lz; |
77 | __asm__("clz %0, %1" : "=r" (lz):"r" (inp)); |
78 | return lz; |
79 | #else |
80 | /* slow default version */ |
81 | uint32_t summa = 24; |
82 | if (inp >= UINT32_C(0x10000)) |
83 | { |
84 | inp >>= 16; |
85 | summa -= 16; |
86 | } |
87 | if (inp >= UINT32_C(0x100)) |
88 | { |
89 | inp >>= 8; |
90 | summa -= 8; |
91 | } |
92 | return summa + clz_table[inp]; |
93 | #endif |
94 | #endif |
95 | #endif |
96 | } |
97 | |
98 | /* the five rounding modes that IEEE-754r defines */ |
99 | typedef enum |
100 | { |
101 | SF_UP = 0, /* round towards positive infinity */ |
102 | SF_DOWN = 1, /* round towards negative infinity */ |
103 | SF_TOZERO = 2, /* round towards zero */ |
104 | SF_NEARESTEVEN = 3, /* round toward nearest value; if mid-between, round to even value */ |
105 | SF_NEARESTAWAY = 4 /* round toward nearest value; if mid-between, round away from zero */ |
106 | } roundmode; |
107 | |
108 | |
109 | static uint32_t rtne_shift32(uint32_t inp, uint32_t shamt) |
110 | { |
111 | uint32_t vl1 = UINT32_C(1) << shamt; |
112 | uint32_t inp2 = inp + (vl1 >> 1); /* added 0.5 ULP */ |
113 | uint32_t msk = (inp | UINT32_C(1)) & vl1; /* nonzero if odd. '| 1' forces it to 1 if the shamt is 0. */ |
114 | msk--; /* negative if even, nonnegative if odd. */ |
115 | inp2 -= (msk >> 31); /* subtract epsilon before shift if even. */ |
116 | inp2 >>= shamt; |
117 | return inp2; |
118 | } |
119 | |
120 | static uint32_t rtna_shift32(uint32_t inp, uint32_t shamt) |
121 | { |
122 | uint32_t vl1 = (UINT32_C(1) << shamt) >> 1; |
123 | inp += vl1; |
124 | inp >>= shamt; |
125 | return inp; |
126 | } |
127 | |
128 | static uint32_t rtup_shift32(uint32_t inp, uint32_t shamt) |
129 | { |
130 | uint32_t vl1 = UINT32_C(1) << shamt; |
131 | inp += vl1; |
132 | inp--; |
133 | inp >>= shamt; |
134 | return inp; |
135 | } |
136 | |
137 | /* convert from FP16 to FP32. */ |
138 | static sf32 sf16_to_sf32(sf16 inp) |
139 | { |
140 | uint32_t inpx = inp; |
141 | |
142 | /* |
143 | This table contains, for every FP16 sign/exponent value combination, |
144 | the difference between the input FP16 value and the value obtained |
145 | by shifting the correct FP32 result right by 13 bits. |
146 | This table allows us to handle every case except denormals and NaN |
147 | with just 1 table lookup, 2 shifts and 1 add. |
148 | */ |
149 | |
150 | #define WITH_MSB(a) (UINT32_C(a) | (1u << 31)) |
151 | static const uint32_t tbl[64] = |
152 | { |
153 | WITH_MSB(0x00000), 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, |
154 | 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, |
155 | 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, |
156 | 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, 0x1C000, WITH_MSB(0x38000), |
157 | WITH_MSB(0x38000), 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, |
158 | 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, |
159 | 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, |
160 | 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, 0x54000, WITH_MSB(0x70000) |
161 | }; |
162 | |
163 | uint32_t res = tbl[inpx >> 10]; |
164 | res += inpx; |
165 | |
166 | /* Normal cases: MSB of 'res' not set. */ |
167 | if ((res & WITH_MSB(0)) == 0) |
168 | { |
169 | return res << 13; |
170 | } |
171 | |
172 | /* Infinity and Zero: 10 LSB of 'res' not set. */ |
173 | if ((res & 0x3FF) == 0) |
174 | { |
175 | return res << 13; |
176 | } |
177 | |
178 | /* NaN: the exponent field of 'inp' is non-zero. */ |
179 | if ((inpx & 0x7C00) != 0) |
180 | { |
181 | /* All NaNs are quietened. */ |
182 | return (res << 13) | 0x400000; |
183 | } |
184 | |
185 | /* Denormal cases */ |
186 | uint32_t sign = (inpx & 0x8000) << 16; |
187 | uint32_t mskval = inpx & 0x7FFF; |
188 | uint32_t leadingzeroes = clz32(mskval); |
189 | mskval <<= leadingzeroes; |
190 | return (mskval >> 8) + ((0x85 - leadingzeroes) << 23) + sign; |
191 | } |
192 | |
193 | /* Conversion routine that converts from FP32 to FP16. It supports denormals and all rounding modes. If a NaN is given as input, it is quietened. */ |
194 | static sf16 sf32_to_sf16(sf32 inp, roundmode rmode) |
195 | { |
196 | /* for each possible sign/exponent combination, store a case index. This gives a 512-byte table */ |
197 | static const uint8_t tab[512] { |
198 | 0, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
199 | 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
200 | 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
201 | 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
202 | 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
203 | 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, |
204 | 10, 10, 10, 10, 10, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, |
205 | 20, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, |
206 | 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 30, 40, |
207 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, |
208 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, |
209 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, |
210 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, |
211 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, |
212 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, |
213 | 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 50, |
214 | |
215 | 5, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
216 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
217 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
218 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
219 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
220 | 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, |
221 | 15, 15, 15, 15, 15, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, |
222 | 25, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, |
223 | 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 35, 45, |
224 | 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, |
225 | 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, |
226 | 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, |
227 | 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, |
228 | 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, |
229 | 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, |
230 | 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 45, 55, |
231 | }; |
232 | |
233 | /* many of the cases below use a case-dependent magic constant. So we look up a magic constant before actually performing the switch. This table allows us to group cases, thereby minimizing code |
234 | size. */ |
235 | static const uint32_t tabx[60] { |
236 | UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x80000000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), |
237 | UINT32_C(1), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8001), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), |
238 | UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), UINT32_C(0x8000), |
239 | UINT32_C(0xC8001FFF), UINT32_C(0xC8000000), UINT32_C(0xC8000000), UINT32_C(0xC8000FFF), UINT32_C(0xC8001000), |
240 | UINT32_C(0x58000000), UINT32_C(0x38001FFF), UINT32_C(0x58000000), UINT32_C(0x58000FFF), UINT32_C(0x58001000), |
241 | UINT32_C(0x7C00), UINT32_C(0x7BFF), UINT32_C(0x7BFF), UINT32_C(0x7C00), UINT32_C(0x7C00), |
242 | UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFBFF), UINT32_C(0xFC00), UINT32_C(0xFC00), |
243 | UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), UINT32_C(0x90000000), |
244 | UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000), UINT32_C(0x20000000) |
245 | }; |
246 | |
247 | uint32_t p; |
248 | uint32_t idx = rmode + tab[inp >> 23]; |
249 | uint32_t vlx = tabx[idx]; |
250 | switch (idx) |
251 | { |
252 | /* |
253 | Positive number which may be Infinity or NaN. |
254 | We need to check whether it is NaN; if it is, quieten it by setting the top bit of the mantissa. |
255 | (If we don't do this quieting, then a NaN that is distinguished only by having |
256 | its low-order bits set, would be turned into an INF. */ |
257 | case 50: |
258 | case 51: |
259 | case 52: |
260 | case 53: |
261 | case 54: |
262 | case 55: |
263 | case 56: |
264 | case 57: |
265 | case 58: |
266 | case 59: |
267 | /* |
268 | the input value is 0x7F800000 or 0xFF800000 if it is INF. |
269 | By subtracting 1, we get 7F7FFFFF or FF7FFFFF, that is, bit 23 becomes zero. |
270 | For NaNs, however, this operation will keep bit 23 with the value 1. |
271 | We can then extract bit 23, and logical-OR bit 9 of the result with this |
272 | bit in order to quieten the NaN (a Quiet NaN is a NaN where the top bit |
273 | of the mantissa is set.) |
274 | */ |
275 | p = (inp - 1) & UINT32_C(0x800000); /* zero if INF, nonzero if NaN. */ |
276 | return static_cast<sf16>(((inp + vlx) >> 13) | (p >> 14)); |
277 | /* |
278 | positive, exponent = 0, round-mode == UP; need to check whether number actually is 0. |
279 | If it is, then return 0, else return 1 (the smallest representable nonzero number) |
280 | */ |
281 | case 0: |
282 | /* |
283 | -inp will set the MSB if the input number is nonzero. |
284 | Thus (-inp) >> 31 will turn into 0 if the input number is 0 and 1 otherwise. |
285 | */ |
286 | return static_cast<sf16>(static_cast<uint32_t>((-static_cast<int32_t>(inp))) >> 31); |
287 | |
288 | /* |
289 | negative, exponent = , round-mode == DOWN, need to check whether number is |
290 | actually 0. If it is, return 0x8000 ( float -0.0 ) |
291 | Else return the smallest negative number ( 0x8001 ) */ |
292 | case 6: |
293 | /* |
294 | in this case 'vlx' is 0x80000000. By subtracting the input value from it, |
295 | we obtain a value that is 0 if the input value is in fact zero and has |
296 | the MSB set if it isn't. We then right-shift the value by 31 places to |
297 | get a value that is 0 if the input is -0.0 and 1 otherwise. |
298 | */ |
299 | return static_cast<sf16>(((vlx - inp) >> 31) + UINT32_C(0x8000)); |
300 | |
301 | /* |
302 | for all other cases involving underflow/overflow, we don't need to |
303 | do actual tests; we just return 'vlx'. |
304 | */ |
305 | case 1: |
306 | case 2: |
307 | case 3: |
308 | case 4: |
309 | case 5: |
310 | case 7: |
311 | case 8: |
312 | case 9: |
313 | case 10: |
314 | case 11: |
315 | case 12: |
316 | case 13: |
317 | case 14: |
318 | case 15: |
319 | case 16: |
320 | case 17: |
321 | case 18: |
322 | case 19: |
323 | case 40: |
324 | case 41: |
325 | case 42: |
326 | case 43: |
327 | case 44: |
328 | case 45: |
329 | case 46: |
330 | case 47: |
331 | case 48: |
332 | case 49: |
333 | return static_cast<sf16>(vlx); |
334 | |
335 | /* |
336 | for normal numbers, 'vlx' is the difference between the FP32 value of a number and the |
337 | FP16 representation of the same number left-shifted by 13 places. In addition, a rounding constant is |
338 | baked into 'vlx': for rounding-away-from zero, the constant is 2^13 - 1, causing roundoff away |
339 | from zero. for round-to-nearest away, the constant is 2^12, causing roundoff away from zero. |
340 | for round-to-nearest-even, the constant is 2^12 - 1. This causes correct round-to-nearest-even |
341 | except for odd input numbers. For odd input numbers, we need to add 1 to the constant. */ |
342 | |
343 | /* normal number, all rounding modes except round-to-nearest-even: */ |
344 | case 30: |
345 | case 31: |
346 | case 32: |
347 | case 34: |
348 | case 35: |
349 | case 36: |
350 | case 37: |
351 | case 39: |
352 | return static_cast<sf16>((inp + vlx) >> 13); |
353 | |
354 | /* normal number, round-to-nearest-even. */ |
355 | case 33: |
356 | case 38: |
357 | p = inp + vlx; |
358 | p += (inp >> 13) & 1; |
359 | return static_cast<sf16>(p >> 13); |
360 | |
361 | /* |
362 | the various denormal cases. These are not expected to be common, so their performance is a bit |
363 | less important. For each of these cases, we need to extract an exponent and a mantissa |
364 | (including the implicit '1'!), and then right-shift the mantissa by a shift-amount that |
365 | depends on the exponent. The shift must apply the correct rounding mode. 'vlx' is used to supply the |
366 | sign of the resulting denormal number. |
367 | */ |
368 | case 21: |
369 | case 22: |
370 | case 25: |
371 | case 27: |
372 | /* denormal, round towards zero. */ |
373 | p = 126 - ((inp >> 23) & 0xFF); |
374 | return static_cast<sf16>((((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000)) >> p) | vlx); |
375 | case 20: |
376 | case 26: |
377 | /* denormal, round away from zero. */ |
378 | p = 126 - ((inp >> 23) & 0xFF); |
379 | return static_cast<sf16>(rtup_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx); |
380 | case 24: |
381 | case 29: |
382 | /* denormal, round to nearest-away */ |
383 | p = 126 - ((inp >> 23) & 0xFF); |
384 | return static_cast<sf16>(rtna_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx); |
385 | case 23: |
386 | case 28: |
387 | /* denormal, round to nearest-even. */ |
388 | p = 126 - ((inp >> 23) & 0xFF); |
389 | return static_cast<sf16>(rtne_shift32((inp & UINT32_C(0x7FFFFF)) + UINT32_C(0x800000), p) | vlx); |
390 | } |
391 | |
392 | return 0; |
393 | } |
394 | |
395 | /* convert from soft-float to native-float */ |
396 | float sf16_to_float(uint16_t p) |
397 | { |
398 | if32 i; |
399 | i.u = sf16_to_sf32(p); |
400 | return i.f; |
401 | } |
402 | |
403 | /* convert from native-float to soft-float */ |
404 | uint16_t float_to_sf16(float p) |
405 | { |
406 | if32 i; |
407 | i.f = p; |
408 | return sf32_to_sf16(i.u, SF_NEARESTEVEN); |
409 | } |
410 | |
411 | #endif |
412 | |