// Copyright 2016 The SwiftShader Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//    http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
14
15#ifndef sw_Half_hpp
16#define sw_Half_hpp
17
#include "Math.hpp"

#include <algorithm>
#include <cmath>
#include <cstring>
22
23namespace sw
24{
// 16-bit floating-point value held as a raw bit pattern.
//
// Only the interface is declared here; the float conversions and the assignment
// operators are defined outside this header.
class half
{
public:
	// Leaves the stored bits uninitialized.
	half() = default;

	// Construction from float must be spelled out explicitly...
	explicit half(float f);

	// ...whereas assignment accepts either a half or a float.
	half &operator=(half h);
	half &operator=(float f);

	// Implicit conversion back to float.
	operator float() const;

private:
	// The 16 bits of the value.
	unsigned short fp16i;
};
39
40 inline half shortAsHalf(short s)
41 {
42 union
43 {
44 half h;
45 short s;
46 } hs;
47
48 hs.s = s;
49
50 return hs.h;
51 }
52
53 class RGB9E5
54 {
55 unsigned int R : 9;
56 unsigned int G : 9;
57 unsigned int B : 9;
58 unsigned int E : 5;
59
60 public:
61 RGB9E5(float rgb[3]) : RGB9E5(rgb[0], rgb[1], rgb[2])
62 {
63 }
64
65 RGB9E5(float r, float g, float b)
66 {
67 // Vulkan 1.1.117 section 15.2.1 RGB to Shared Exponent Conversion
68
69 // B is the exponent bias (15)
70 constexpr int g_sharedexp_bias = 15;
71
72 // N is the number of mantissa bits per component (9)
73 constexpr int g_sharedexp_mantissabits = 9;
74
75 // Emax is the maximum allowed biased exponent value (31)
76 constexpr int g_sharedexp_maxexponent = 31;
77
78 constexpr float g_sharedexp_max =
79 ((static_cast<float>(1 << g_sharedexp_mantissabits) - 1) /
80 static_cast<float>(1 << g_sharedexp_mantissabits)) *
81 static_cast<float>(1 << (g_sharedexp_maxexponent - g_sharedexp_bias));
82
83 // Clamp components to valid range. NaN becomes 0.
84 const float red_c = std::min(!(r > 0) ? 0 : r, g_sharedexp_max);
85 const float green_c = std::min(!(g > 0) ? 0 : g, g_sharedexp_max);
86 const float blue_c = std::min(!(b > 0) ? 0 : b, g_sharedexp_max);
87
88 // We're reducing the mantissa to 9 bits, so we must round up if the next
89 // bit is 1. In other words add 0.5 to the new mantissa's position and
90 // allow overflow into the exponent so we can scale correctly.
91 constexpr int half = 1 << (23 - g_sharedexp_mantissabits);
92 const float red_r = bit_cast<float>(bit_cast<int>(red_c) + half);
93 const float green_r = bit_cast<float>(bit_cast<int>(green_c) + half);
94 const float blue_r = bit_cast<float>(bit_cast<int>(blue_c) + half);
95
96 // The largest component determines the shared exponent. It can't be lower
97 // than 0 (after bias subtraction) so also limit to the mimimum representable.
98 constexpr float min_s = 0.5f / (1 << g_sharedexp_bias);
99 float max_s = std::max(std::max(red_r, green_r), std::max(blue_r, min_s));
100
101 // Obtain the reciprocal of the shared exponent by inverting the bits,
102 // and scale by the new mantissa's size. Note that the IEEE-754 single-precision
103 // format has an implicit leading 1, but this shared component format does not.
104 float scale = bit_cast<float>((bit_cast<int>(max_s) & 0x7F800000) ^ 0x7F800000) * (1 << (g_sharedexp_mantissabits - 2));
105
106 R = static_cast<unsigned int>(round(red_c * scale));
107 G = static_cast<unsigned int>(round(green_c * scale));
108 B = static_cast<unsigned int>(round(blue_c * scale));
109 E = (bit_cast<unsigned int>(max_s) >> 23) - 127 + 15 + 1;
110 }
111
112 operator unsigned int() const
113 {
114 return *reinterpret_cast<const unsigned int*>(this);
115 }
116
117 void toRGB16F(half rgb[3]) const
118 {
119 constexpr int offset = 24; // Exponent bias (15) + number of mantissa bits per component (9) = 24
120
121 const float factor = (1u << E) * (1.0f / (1 << offset));
122 rgb[0] = half(R * factor);
123 rgb[1] = half(G * factor);
124 rgb[2] = half(B * factor);
125 }
126 };
127
128 class R11G11B10F
129 {
130 unsigned int R : 11;
131 unsigned int G : 11;
132 unsigned int B : 10;
133
134 static inline half float11ToFloat16(unsigned short fp11)
135 {
136 return shortAsHalf(fp11 << 4); // Sign bit 0
137 }
138
139 static inline half float10ToFloat16(unsigned short fp10)
140 {
141 return shortAsHalf(fp10 << 5); // Sign bit 0
142 }
143
// Converts a 32-bit float to an unsigned 11-bit float (5 exponent bits,
// 6 mantissa bits, no sign bit), returned in the low 11 bits of the result.
//
//  - NaN stays NaN (all exponent bits set, non-zero mantissa).
//  - +INF maps to the float11 INF encoding.
//  - -INF and every other negative input clamp to 0.
//  - Finite inputs above the largest float11 clamp to that largest value.
//  - Inputs below the smallest normalized float11 become denormals, or 0 once
//    they are too small for that.
//  - Everything else is rounded to nearest, ties to even.
//
// Static because it reads no member state.
static inline unsigned short float32ToFloat11(float fp32)
{
	constexpr unsigned int float32MantissaMask = 0x7FFFFF;
	constexpr unsigned int float32ExponentMask = 0x7F800000;
	constexpr unsigned int float32SignMask = 0x80000000;
	constexpr unsigned int float32ValueMask = ~float32SignMask;
	constexpr unsigned int float32ExponentFirstBit = 23;
	constexpr unsigned int float32ExponentBias = 127;

	constexpr unsigned short float11Max = 0x7BF;
	constexpr unsigned short float11MantissaMask = 0x3F;
	constexpr unsigned short float11ExponentMask = 0x7C0;
	constexpr unsigned short float11BitMask = 0x7FF;
	constexpr unsigned int float11ExponentBias = 14;

	constexpr unsigned int float32Maxfloat11 = 0x477E0000;
	constexpr unsigned int float32Minfloat11 = 0x38800000;

	// Copy the bits out with memcpy; reading a float through an unsigned int
	// pointer violates strict aliasing.
	static_assert(sizeof(unsigned int) == sizeof(float), "float must be 32 bits wide");
	unsigned int float32Bits = 0;
	std::memcpy(&float32Bits, &fp32, sizeof(float32Bits));

	const bool float32Sign = (float32Bits & float32SignMask) == float32SignMask;

	unsigned int float32Val = float32Bits & float32ValueMask;

	if((float32Val & float32ExponentMask) == float32ExponentMask)
	{
		// INF or NAN
		if((float32Val & float32MantissaMask) != 0)
		{
			// Fold all 23 mantissa bits into the 6 available ones so that the
			// result keeps a non-zero mantissa, i.e. remains a NaN.
			return float11ExponentMask |
			       (((float32Val >> 17) | (float32Val >> 11) | (float32Val >> 6) | (float32Val)) &
			        float11MantissaMask);
		}
		else if(float32Sign)
		{
			// -INF is clamped to 0 since float11 is positive only
			return 0;
		}
		else
		{
			return float11ExponentMask;
		}
	}
	else if(float32Sign)
	{
		// float11 is positive only, so clamp to zero
		return 0;
	}
	else if(float32Val > float32Maxfloat11)
	{
		// The number is too large to be represented as a float11, set to max
		return float11Max;
	}
	else
	{
		if(float32Val < float32Minfloat11)
		{
			// The number is too small to be represented as a normalized float11.
			// Convert it to a denormalized value.
			const unsigned int shift = (float32ExponentBias - float11ExponentBias) -
			                           (float32Val >> float32ExponentFirstBit);
			const unsigned int significand =
			    (1u << float32ExponentFirstBit) | (float32Val & float32MantissaMask);

			// The significand is 24 bits wide, so a shift of 24 or more leaves
			// nothing. Handle that explicitly: shifting a 32-bit value by 32 or
			// more is undefined, and inputs near zero ask for shifts up to 113.
			float32Val = (shift < 24) ? (significand >> shift) : 0u;
		}
		else
		{
			// Rebias the exponent to represent the value as a normalized float11
			float32Val += 0xC8000000;
		}

		// Drop the 17 low bits, rounding to nearest with ties to even.
		return ((float32Val + 0xFFFF + ((float32Val >> 17) & 1)) >> 17) & float11BitMask;
	}
}
216
// Converts a 32-bit float to an unsigned 10-bit float (5 exponent bits,
// 5 mantissa bits, no sign bit), returned in the low 10 bits of the result.
//
//  - NaN stays NaN (all exponent bits set, non-zero mantissa).
//  - +INF maps to the float10 INF encoding.
//  - -INF and every other negative input clamp to 0.
//  - Finite inputs above the largest float10 clamp to that largest value.
//  - Inputs below the smallest normalized float10 become denormals, or 0 once
//    they are too small for that.
//  - Everything else is rounded to nearest, ties to even.
//
// Static because it reads no member state.
static inline unsigned short float32ToFloat10(float fp32)
{
	constexpr unsigned int float32MantissaMask = 0x7FFFFF;
	constexpr unsigned int float32ExponentMask = 0x7F800000;
	constexpr unsigned int float32SignMask = 0x80000000;
	constexpr unsigned int float32ValueMask = ~float32SignMask;
	constexpr unsigned int float32ExponentFirstBit = 23;
	constexpr unsigned int float32ExponentBias = 127;

	constexpr unsigned short float10Max = 0x3DF;
	constexpr unsigned short float10MantissaMask = 0x1F;
	constexpr unsigned short float10ExponentMask = 0x3E0;
	constexpr unsigned short float10BitMask = 0x3FF;
	constexpr unsigned int float10ExponentBias = 14;

	constexpr unsigned int float32Maxfloat10 = 0x477C0000;
	constexpr unsigned int float32Minfloat10 = 0x38800000;

	// Copy the bits out with memcpy; reading a float through an unsigned int
	// pointer violates strict aliasing.
	static_assert(sizeof(unsigned int) == sizeof(float), "float must be 32 bits wide");
	unsigned int float32Bits = 0;
	std::memcpy(&float32Bits, &fp32, sizeof(float32Bits));

	const bool float32Sign = (float32Bits & float32SignMask) == float32SignMask;

	unsigned int float32Val = float32Bits & float32ValueMask;

	if((float32Val & float32ExponentMask) == float32ExponentMask)
	{
		// INF or NAN
		if((float32Val & float32MantissaMask) != 0)
		{
			// Fold all 23 mantissa bits into the 5 available ones so that the
			// result keeps a non-zero mantissa, i.e. remains a NaN. The >> 8
			// term covers source bits 8-12; without it a NaN whose only set
			// mantissa bits lie there would be encoded as INF.
			return float10ExponentMask |
			       (((float32Val >> 18) | (float32Val >> 13) | (float32Val >> 8) |
			         (float32Val >> 3) | (float32Val)) &
			        float10MantissaMask);
		}
		else if(float32Sign)
		{
			// -INF is clamped to 0 since float10 is positive only
			return 0;
		}
		else
		{
			return float10ExponentMask;
		}
	}
	else if(float32Sign)
	{
		// float10 is positive only, so clamp to zero
		return 0;
	}
	else if(float32Val > float32Maxfloat10)
	{
		// The number is too large to be represented as a float10, set to max
		return float10Max;
	}
	else
	{
		if(float32Val < float32Minfloat10)
		{
			// The number is too small to be represented as a normalized float10.
			// Convert it to a denormalized value.
			const unsigned int shift = (float32ExponentBias - float10ExponentBias) -
			                           (float32Val >> float32ExponentFirstBit);
			const unsigned int significand =
			    (1u << float32ExponentFirstBit) | (float32Val & float32MantissaMask);

			// The significand is 24 bits wide, so a shift of 24 or more leaves
			// nothing. Handle that explicitly: shifting a 32-bit value by 32 or
			// more is undefined, and inputs near zero ask for shifts up to 113.
			float32Val = (shift < 24) ? (significand >> shift) : 0u;
		}
		else
		{
			// Rebias the exponent to represent the value as a normalized float10
			float32Val += 0xC8000000;
		}

		// Drop the 18 low bits, rounding to nearest with ties to even.
		return ((float32Val + 0x1FFFF + ((float32Val >> 18) & 1)) >> 18) & float10BitMask;
	}
}
289
290 public:
291 R11G11B10F(float rgb[3])
292 {
293 R = float32ToFloat11(rgb[0]);
294 G = float32ToFloat11(rgb[1]);
295 B = float32ToFloat10(rgb[2]);
296 }
297
298 operator unsigned int() const
299 {
300 return *reinterpret_cast<const unsigned int*>(this);
301 }
302
303 void toRGB16F(half rgb[3]) const
304 {
305 rgb[0] = float11ToFloat16(R);
306 rgb[1] = float11ToFloat16(G);
307 rgb[2] = float10ToFloat16(B);
308 }
309 };
310}
311
312#endif // sw_Half_hpp
313