1 | // Copyright 2016 The SwiftShader Authors. All Rights Reserved. |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | // you may not use this file except in compliance with the License. |
5 | // You may obtain a copy of the License at |
6 | // |
7 | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | // |
9 | // Unless required by applicable law or agreed to in writing, software |
10 | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | // See the License for the specific language governing permissions and |
13 | // limitations under the License. |
14 | |
15 | #ifndef sw_Half_hpp |
16 | #define sw_Half_hpp |
17 | |
#include "Math.hpp"

#include <algorithm>
#include <cmath>
#include <cstring>
22 | |
23 | namespace sw |
24 | { |
// Half-precision (fp16) value kept as a raw bit pattern in an unsigned short.
// Only the interface is declared here; the member functions are not defined
// in this part of the file.
class half
{
public:
	// Trivial default constructor: default-initialization leaves the bits
	// indeterminate.
	half() = default;

	// Construction from single precision is explicit, so a float is never
	// turned into a half by accident.
	explicit half(float f);

	// Assignment from a single-precision value or from another half.
	half &operator=(float f);
	half &operator=(half h);

	// Implicit conversion back to single precision.
	operator float() const;

private:
	unsigned short fp16i;  // Raw storage for the encoded value.
};
39 | |
40 | inline half shortAsHalf(short s) |
41 | { |
42 | union |
43 | { |
44 | half h; |
45 | short s; |
46 | } hs; |
47 | |
48 | hs.s = s; |
49 | |
50 | return hs.h; |
51 | } |
52 | |
53 | class RGB9E5 |
54 | { |
55 | unsigned int R : 9; |
56 | unsigned int G : 9; |
57 | unsigned int B : 9; |
58 | unsigned int E : 5; |
59 | |
60 | public: |
61 | RGB9E5(float rgb[3]) : RGB9E5(rgb[0], rgb[1], rgb[2]) |
62 | { |
63 | } |
64 | |
65 | RGB9E5(float r, float g, float b) |
66 | { |
67 | // Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion |
68 | |
69 | // B is the exponent bias (15) |
70 | constexpr int g_sharedexp_bias = 15; |
71 | |
72 | // N is the number of mantissa bits per component (9) |
73 | constexpr int g_sharedexp_mantissabits = 9; |
74 | |
75 | // Emax is the maximum allowed biased exponent value (31) |
76 | constexpr int g_sharedexp_maxexponent = 31; |
77 | |
78 | constexpr float g_sharedexp_max = |
79 | ((static_cast<float>(1 << g_sharedexp_mantissabits) - 1) / |
80 | static_cast<float>(1 << g_sharedexp_mantissabits)) * |
81 | static_cast<float>(1 << (g_sharedexp_maxexponent - g_sharedexp_bias)); |
82 | |
83 | // Clamp components to valid range. NaN becomes 0. |
84 | const float red_c = std::min(!(r > 0) ? 0 : r, g_sharedexp_max); |
85 | const float green_c = std::min(!(g > 0) ? 0 : g, g_sharedexp_max); |
86 | const float blue_c = std::min(!(b > 0) ? 0 : b, g_sharedexp_max); |
87 | |
88 | // We're reducing the mantissa to 9 bits, so we must round up if the next |
89 | // bit is 1. In other words add 0.5 to the new mantissa's position and |
90 | // allow overflow into the exponent so we can scale correctly. |
91 | constexpr int half = 1 << (23 - g_sharedexp_mantissabits); |
92 | const float red_r = bit_cast<float>(bit_cast<int>(red_c) + half); |
93 | const float green_r = bit_cast<float>(bit_cast<int>(green_c) + half); |
94 | const float blue_r = bit_cast<float>(bit_cast<int>(blue_c) + half); |
95 | |
96 | // The largest component determines the shared exponent. It can't be lower |
97 | // than 0 (after bias subtraction) so also limit to the mimimum representable. |
98 | constexpr float min_s = 0.5f / (1 << g_sharedexp_bias); |
99 | float max_s = std::max(std::max(red_r, green_r), std::max(blue_r, min_s)); |
100 | |
101 | // Obtain the reciprocal of the shared exponent by inverting the bits, |
102 | // and scale by the new mantissa's size. Note that the IEEE-754 single-precision |
103 | // format has an implicit leading 1, but this shared component format does not. |
104 | float scale = bit_cast<float>((bit_cast<int>(max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (g_sharedexp_mantissabits - 2)); |
105 | |
106 | R = static_cast<unsigned int>(round(red_c * scale)); |
107 | G = static_cast<unsigned int>(round(green_c * scale)); |
108 | B = static_cast<unsigned int>(round(blue_c * scale)); |
109 | E = (bit_cast<unsigned int>(max_s) >> 23) - 127 + 15 + 1; |
110 | } |
111 | |
112 | operator unsigned int() const |
113 | { |
114 | return *reinterpret_cast<const unsigned int*>(this); |
115 | } |
116 | |
117 | void toRGB16F(half rgb[3]) const |
118 | { |
119 | constexpr int offset = 24; // Exponent bias (15) + number of mantissa bits per component (9) = 24 |
120 | |
121 | const float factor = (1u << E) * (1.0f / (1 << offset)); |
122 | rgb[0] = half(R * factor); |
123 | rgb[1] = half(G * factor); |
124 | rgb[2] = half(B * factor); |
125 | } |
126 | }; |
127 | |
128 | class R11G11B10F |
129 | { |
130 | unsigned int R : 11; |
131 | unsigned int G : 11; |
132 | unsigned int B : 10; |
133 | |
134 | static inline half float11ToFloat16(unsigned short fp11) |
135 | { |
136 | return shortAsHalf(fp11 << 4); // Sign bit 0 |
137 | } |
138 | |
139 | static inline half float10ToFloat16(unsigned short fp10) |
140 | { |
141 | return shortAsHalf(fp10 << 5); // Sign bit 0 |
142 | } |
143 | |
144 | inline unsigned short float32ToFloat11(float fp32) |
145 | { |
146 | const unsigned int float32MantissaMask = 0x7FFFFF; |
147 | const unsigned int float32ExponentMask = 0x7F800000; |
148 | const unsigned int float32SignMask = 0x80000000; |
149 | const unsigned int float32ValueMask = ~float32SignMask; |
150 | const unsigned int float32ExponentFirstBit = 23; |
151 | const unsigned int float32ExponentBias = 127; |
152 | |
153 | const unsigned short float11Max = 0x7BF; |
154 | const unsigned short float11MantissaMask = 0x3F; |
155 | const unsigned short float11ExponentMask = 0x7C0; |
156 | const unsigned short float11BitMask = 0x7FF; |
157 | const unsigned int float11ExponentBias = 14; |
158 | |
159 | const unsigned int float32Maxfloat11 = 0x477E0000; |
160 | const unsigned int float32Minfloat11 = 0x38800000; |
161 | |
162 | const unsigned int float32Bits = *reinterpret_cast<unsigned int*>(&fp32); |
163 | const bool float32Sign = (float32Bits & float32SignMask) == float32SignMask; |
164 | |
165 | unsigned int float32Val = float32Bits & float32ValueMask; |
166 | |
167 | if((float32Val & float32ExponentMask) == float32ExponentMask) |
168 | { |
169 | // INF or NAN |
170 | if((float32Val & float32MantissaMask) != 0) |
171 | { |
172 | return float11ExponentMask | |
173 | (((float32Val >> 17) | (float32Val >> 11) | (float32Val >> 6) | (float32Val)) & |
174 | float11MantissaMask); |
175 | } |
176 | else if(float32Sign) |
177 | { |
178 | // -INF is clamped to 0 since float11 is positive only |
179 | return 0; |
180 | } |
181 | else |
182 | { |
183 | return float11ExponentMask; |
184 | } |
185 | } |
186 | else if(float32Sign) |
187 | { |
188 | // float11 is positive only, so clamp to zero |
189 | return 0; |
190 | } |
191 | else if(float32Val > float32Maxfloat11) |
192 | { |
193 | // The number is too large to be represented as a float11, set to max |
194 | return float11Max; |
195 | } |
196 | else |
197 | { |
198 | if(float32Val < float32Minfloat11) |
199 | { |
200 | // The number is too small to be represented as a normalized float11 |
201 | // Convert it to a denormalized value. |
202 | const unsigned int shift = (float32ExponentBias - float11ExponentBias) - |
203 | (float32Val >> float32ExponentFirstBit); |
204 | float32Val = |
205 | ((1 << float32ExponentFirstBit) | (float32Val & float32MantissaMask)) >> shift; |
206 | } |
207 | else |
208 | { |
209 | // Rebias the exponent to represent the value as a normalized float11 |
210 | float32Val += 0xC8000000; |
211 | } |
212 | |
213 | return ((float32Val + 0xFFFF + ((float32Val >> 17) & 1)) >> 17) & float11BitMask; |
214 | } |
215 | } |
216 | |
217 | inline unsigned short float32ToFloat10(float fp32) |
218 | { |
219 | const unsigned int float32MantissaMask = 0x7FFFFF; |
220 | const unsigned int float32ExponentMask = 0x7F800000; |
221 | const unsigned int float32SignMask = 0x80000000; |
222 | const unsigned int float32ValueMask = ~float32SignMask; |
223 | const unsigned int float32ExponentFirstBit = 23; |
224 | const unsigned int float32ExponentBias = 127; |
225 | |
226 | const unsigned short float10Max = 0x3DF; |
227 | const unsigned short float10MantissaMask = 0x1F; |
228 | const unsigned short float10ExponentMask = 0x3E0; |
229 | const unsigned short float10BitMask = 0x3FF; |
230 | const unsigned int float10ExponentBias = 14; |
231 | |
232 | const unsigned int float32Maxfloat10 = 0x477C0000; |
233 | const unsigned int float32Minfloat10 = 0x38800000; |
234 | |
235 | const unsigned int float32Bits = *reinterpret_cast<unsigned int*>(&fp32); |
236 | const bool float32Sign = (float32Bits & float32SignMask) == float32SignMask; |
237 | |
238 | unsigned int float32Val = float32Bits & float32ValueMask; |
239 | |
240 | if((float32Val & float32ExponentMask) == float32ExponentMask) |
241 | { |
242 | // INF or NAN |
243 | if((float32Val & float32MantissaMask) != 0) |
244 | { |
245 | return float10ExponentMask | |
246 | (((float32Val >> 18) | (float32Val >> 13) | (float32Val >> 3) | (float32Val)) & |
247 | float10MantissaMask); |
248 | } |
249 | else if(float32Sign) |
250 | { |
251 | // -INF is clamped to 0 since float11 is positive only |
252 | return 0; |
253 | } |
254 | else |
255 | { |
256 | return float10ExponentMask; |
257 | } |
258 | } |
259 | else if(float32Sign) |
260 | { |
261 | // float10 is positive only, so clamp to zero |
262 | return 0; |
263 | } |
264 | else if(float32Val > float32Maxfloat10) |
265 | { |
266 | // The number is too large to be represented as a float11, set to max |
267 | return float10Max; |
268 | } |
269 | else |
270 | { |
271 | if(float32Val < float32Minfloat10) |
272 | { |
273 | // The number is too small to be represented as a normalized float11 |
274 | // Convert it to a denormalized value. |
275 | const unsigned int shift = (float32ExponentBias - float10ExponentBias) - |
276 | (float32Val >> float32ExponentFirstBit); |
277 | float32Val = |
278 | ((1 << float32ExponentFirstBit) | (float32Val & float32MantissaMask)) >> shift; |
279 | } |
280 | else |
281 | { |
282 | // Rebias the exponent to represent the value as a normalized float11 |
283 | float32Val += 0xC8000000; |
284 | } |
285 | |
286 | return ((float32Val + 0x1FFFF + ((float32Val >> 18) & 1)) >> 18) & float10BitMask; |
287 | } |
288 | } |
289 | |
290 | public: |
291 | R11G11B10F(float rgb[3]) |
292 | { |
293 | R = float32ToFloat11(rgb[0]); |
294 | G = float32ToFloat11(rgb[1]); |
295 | B = float32ToFloat10(rgb[2]); |
296 | } |
297 | |
298 | operator unsigned int() const |
299 | { |
300 | return *reinterpret_cast<const unsigned int*>(this); |
301 | } |
302 | |
303 | void toRGB16F(half rgb[3]) const |
304 | { |
305 | rgb[0] = float11ToFloat16(R); |
306 | rgb[1] = float11ToFloat16(G); |
307 | rgb[2] = float10ToFloat16(B); |
308 | } |
309 | }; |
310 | } |
311 | |
312 | #endif // sw_Half_hpp |
313 | |