1 | // SPDX-License-Identifier: Apache-2.0 |
2 | // ---------------------------------------------------------------------------- |
3 | // Copyright 2011-2022 Arm Limited |
4 | // |
5 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
6 | // use this file except in compliance with the License. You may obtain a copy |
7 | // of the License at: |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, software |
12 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
13 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
14 | // License for the specific language governing permissions and limitations |
15 | // under the License. |
16 | // ---------------------------------------------------------------------------- |
17 | |
18 | /** |
19 | * @brief Functions for creating in-memory ASTC image structures. |
20 | */ |
21 | |
22 | #include <cassert> |
23 | #include <cstring> |
24 | |
25 | #include "astcenc_internal.h" |
26 | |
27 | /** |
28 | * @brief Loader pipeline function type for data fetch from memory. |
29 | */ |
30 | using pixel_loader = vfloat4(*)(const void*, int); |
31 | |
32 | /** |
33 | * @brief Loader pipeline function type for swizzling data in a vector. |
34 | */ |
35 | using pixel_swizzler = vfloat4(*)(vfloat4, const astcenc_swizzle&); |
36 | |
37 | /** |
38 | * @brief Loader pipeline function type for converting data in a vector to LNS. |
39 | */ |
40 | using pixel_converter = vfloat4(*)(vfloat4, vmask4); |
41 | |
42 | /** |
43 | * @brief Load a 8-bit UNORM texel from a data array. |
44 | * |
45 | * @param data The data pointer. |
46 | * @param base_offset The index offset to the start of the pixel. |
47 | */ |
48 | static vfloat4 load_texel_u8( |
49 | const void* data, |
50 | int base_offset |
51 | ) { |
52 | const uint8_t* data8 = static_cast<const uint8_t*>(data); |
53 | return int_to_float(vint4(data8 + base_offset)) / 255.0f; |
54 | } |
55 | |
56 | /** |
57 | * @brief Load a 16-bit fp16 texel from a data array. |
58 | * |
59 | * @param data The data pointer. |
60 | * @param base_offset The index offset to the start of the pixel. |
61 | */ |
62 | static vfloat4 load_texel_f16( |
63 | const void* data, |
64 | int base_offset |
65 | ) { |
66 | const uint16_t* data16 = static_cast<const uint16_t*>(data); |
67 | int r = data16[base_offset ]; |
68 | int g = data16[base_offset + 1]; |
69 | int b = data16[base_offset + 2]; |
70 | int a = data16[base_offset + 3]; |
71 | return float16_to_float(vint4(r, g, b, a)); |
72 | } |
73 | |
74 | /** |
75 | * @brief Load a 32-bit float texel from a data array. |
76 | * |
77 | * @param data The data pointer. |
78 | * @param base_offset The index offset to the start of the pixel. |
79 | */ |
80 | static vfloat4 load_texel_f32( |
81 | const void* data, |
82 | int base_offset |
83 | ) { |
84 | const float* data32 = static_cast<const float*>(data); |
85 | return vfloat4(data32 + base_offset); |
86 | } |
87 | |
88 | /** |
89 | * @brief Dummy no-op swizzle function. |
90 | * |
91 | * @param data The source RGBA vector to swizzle. |
92 | * @param swz The swizzle to use. |
93 | */ |
94 | static vfloat4 swz_texel_skip( |
95 | vfloat4 data, |
96 | const astcenc_swizzle& swz |
97 | ) { |
98 | (void)swz; |
99 | return data; |
100 | } |
101 | |
102 | /** |
103 | * @brief Swizzle a texel into a new arrangement. |
104 | * |
105 | * @param data The source RGBA vector to swizzle. |
106 | * @param swz The swizzle to use. |
107 | */ |
108 | static vfloat4 swz_texel( |
109 | vfloat4 data, |
110 | const astcenc_swizzle& swz |
111 | ) { |
112 | alignas(16) float datas[6]; |
113 | |
114 | storea(data, datas); |
115 | datas[ASTCENC_SWZ_0] = 0.0f; |
116 | datas[ASTCENC_SWZ_1] = 1.0f; |
117 | |
118 | return vfloat4(datas[swz.r], datas[swz.g], datas[swz.b], datas[swz.a]); |
119 | } |
120 | |
121 | /** |
122 | * @brief Encode a texel that is entirely LDR linear. |
123 | * |
124 | * @param data The RGBA data to encode. |
125 | * @param lns_mask The mask for the HDR channels than need LNS encoding. |
126 | */ |
127 | static vfloat4 encode_texel_unorm( |
128 | vfloat4 data, |
129 | vmask4 lns_mask |
130 | ) { |
131 | (void)lns_mask; |
132 | return data * 65535.0f; |
133 | } |
134 | |
135 | /** |
136 | * @brief Encode a texel that includes at least some HDR LNS texels. |
137 | * |
138 | * @param data The RGBA data to encode. |
139 | * @param lns_mask The mask for the HDR channels than need LNS encoding. |
140 | */ |
141 | static vfloat4 encode_texel_lns( |
142 | vfloat4 data, |
143 | vmask4 lns_mask |
144 | ) { |
145 | vfloat4 datav_unorm = data * 65535.0f; |
146 | vfloat4 datav_lns = float_to_lns(data); |
147 | return select(datav_unorm, datav_lns, lns_mask); |
148 | } |
149 | |
/* See header for documentation. */
void load_image_block(
	astcenc_profile decode_mode,
	const astcenc_image& img,
	image_block& blk,
	const block_size_descriptor& bsd,
	unsigned int xpos,
	unsigned int ypos,
	unsigned int zpos,
	const astcenc_swizzle& swz
) {
	unsigned int xsize = img.dim_x;
	unsigned int ysize = img.dim_y;
	unsigned int zsize = img.dim_z;

	blk.xpos = xpos;
	blk.ypos = ypos;
	blk.zpos = zpos;

	// True if any non-identity swizzle
	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);

	int idx = 0;

	// Per-block metadata accumulators; the mean is accumulated pre-scaled by
	// 1/texel_count so no final division is needed
	vfloat4 data_min(1e38f);
	vfloat4 data_mean(0.0f);
	vfloat4 data_mean_scale(1.0f / static_cast<float>(bsd.texel_count));
	vfloat4 data_max(-1e38f);
	vmask4 grayscalev(true);

	// This works because we impose the same choice everywhere during encode
	uint8_t rgb_lns = (decode_mode == ASTCENC_PRF_HDR) ||
	                  (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A) ? 1 : 0;
	uint8_t a_lns = decode_mode == ASTCENC_PRF_HDR ? 1 : 0;
	vint4 use_lns(rgb_lns, rgb_lns, rgb_lns, a_lns);
	vmask4 lns_mask = use_lns != vint4::zero();

	// Set up the function pointers for loading pipeline as needed; the
	// pipeline is load -> swizzle -> convert, selected once per block so the
	// inner loop stays branch-free
	pixel_loader loader = load_texel_u8;
	if (img.data_type == ASTCENC_TYPE_F16)
	{
		loader = load_texel_f16;
	}
	else if (img.data_type == ASTCENC_TYPE_F32)
	{
		loader = load_texel_f32;
	}

	pixel_swizzler swizzler = swz_texel_skip;
	if (needs_swz)
	{
		swizzler = swz_texel;
	}

	pixel_converter converter = encode_texel_unorm;
	if (any(lns_mask))
	{
		converter = encode_texel_lns;
	}

	for (unsigned int z = 0; z < bsd.zdim; z++)
	{
		// Clamp to the last texel for blocks overhanging the image edge
		unsigned int zi = astc::min(zpos + z, zsize - 1);
		void* plane = img.data[zi];

		for (unsigned int y = 0; y < bsd.ydim; y++)
		{
			unsigned int yi = astc::min(ypos + y, ysize - 1);

			for (unsigned int x = 0; x < bsd.xdim; x++)
			{
				unsigned int xi = astc::min(xpos + x, xsize - 1);

				// Offsets are in component units; 4 components per texel
				vfloat4 datav = loader(plane, (4 * xsize * yi) + (4 * xi));
				datav = swizzler(datav, swz);
				datav = converter(datav, lns_mask);

				// Compute block metadata
				data_min = min(data_min, datav);
				data_mean += datav * data_mean_scale;
				data_max = max(data_max, datav);

				// Lanes compare r == g and r == b; the texel is gray only if
				// all lanes pass, tracked across the block in grayscalev
				grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());

				blk.data_r[idx] = datav.lane<0>();
				blk.data_g[idx] = datav.lane<1>();
				blk.data_b[idx] = datav.lane<2>();
				blk.data_a[idx] = datav.lane<3>();

				blk.rgb_lns[idx] = rgb_lns;
				blk.alpha_lns[idx] = a_lns;

				idx++;
			}
		}
	}

	// Reverse the encoding so we store origin block in the original format;
	// UNORM channels undo the 65535 scale, LNS channels round-trip via sf16
	vfloat4 data_enc = blk.texel(0);
	vfloat4 data_enc_unorm = data_enc / 65535.0f;
	vfloat4 data_enc_lns = vfloat4::zero();

	if (rgb_lns || a_lns)
	{
		data_enc_lns = float16_to_float(lns_to_sf16(float_to_int(data_enc)));
	}

	blk.origin_texel = select(data_enc_unorm, data_enc_lns, lns_mask);

	// Store block metadata
	blk.data_min = data_min;
	blk.data_mean = data_mean;
	blk.data_max = data_max;
	blk.grayscale = all(grayscalev);
}
266 | |
/* See header for documentation. */
void load_image_block_fast_ldr(
	astcenc_profile decode_mode,
	const astcenc_image& img,
	image_block& blk,
	const block_size_descriptor& bsd,
	unsigned int xpos,
	unsigned int ypos,
	unsigned int zpos,
	const astcenc_swizzle& swz
) {
	// Fast path: reads only plane 0 as 8-bit UNORM data with no swizzle or
	// LNS handling, so decode_mode and swz are unused here; presumably the
	// caller only selects this path for 2D LDR identity-swizzle inputs —
	// confirm against the call site
	(void)swz;
	(void)decode_mode;

	unsigned int xsize = img.dim_x;
	unsigned int ysize = img.dim_y;

	blk.xpos = xpos;
	blk.ypos = ypos;
	blk.zpos = zpos;

	// Per-block metadata accumulators
	vfloat4 data_min(1e38f);
	vfloat4 data_mean = vfloat4::zero();
	vfloat4 data_max(-1e38f);
	vmask4 grayscalev(true);
	int idx = 0;

	const uint8_t* plane = static_cast<const uint8_t*>(img.data[0]);
	for (unsigned int y = ypos; y < ypos + bsd.ydim; y++)
	{
		// Clamp to the last texel for blocks overhanging the image edge
		unsigned int yi = astc::min(y, ysize - 1);

		for (unsigned int x = xpos; x < xpos + bsd.xdim; x++)
		{
			unsigned int xi = astc::min(x, xsize - 1);

			// Load 4 bytes (RGBA) and scale 8-bit UNORM up to the 16-bit
			// UNORM working range in one multiply
			vint4 datavi = vint4(plane + (4 * xsize * yi) + (4 * xi));
			vfloat4 datav = int_to_float(datavi) * (65535.0f / 255.0f);

			// Compute block metadata
			data_min = min(data_min, datav);
			data_mean += datav;
			data_max = max(data_max, datav);

			// Lanes compare r == g and r == b; gray only if all lanes pass
			grayscalev = grayscalev & (datav.swz<0,0,0,0>() == datav.swz<1,1,2,2>());

			blk.data_r[idx] = datav.lane<0>();
			blk.data_g[idx] = datav.lane<1>();
			blk.data_b[idx] = datav.lane<2>();
			blk.data_a[idx] = datav.lane<3>();

			idx++;
		}
	}

	// Reverse the encoding so we store origin block in the original format
	blk.origin_texel = blk.texel(0) / 65535.0f;

	// Store block metadata; only entry 0 of the lns arrays is written as the
	// whole block is LDR here
	blk.rgb_lns[0] = 0;
	blk.alpha_lns[0] = 0;
	blk.data_min = data_min;
	blk.data_mean = data_mean / static_cast<float>(bsd.texel_count);
	blk.data_max = data_max;
	blk.grayscale = all(grayscalev);
}
333 | |
/* See header for documentation. */
void store_image_block(
	astcenc_image& img,
	const image_block& blk,
	const block_size_descriptor& bsd,
	unsigned int xpos,
	unsigned int ypos,
	unsigned int zpos,
	const astcenc_swizzle& swz
) {
	// Compute the in-bounds texel range for this block; blocks overhanging
	// the image edge store only the valid region, and the "nudge" values are
	// the block-local index increments needed to skip the clipped texels
	unsigned int x_size = img.dim_x;
	unsigned int x_start = xpos;
	unsigned int x_end = astc::min(x_size, xpos + bsd.xdim);
	unsigned int x_count = x_end - x_start;
	unsigned int x_nudge = bsd.xdim - x_count;

	unsigned int y_size = img.dim_y;
	unsigned int y_start = ypos;
	unsigned int y_end = astc::min(y_size, ypos + bsd.ydim);
	unsigned int y_count = y_end - y_start;
	unsigned int y_nudge = (bsd.ydim - y_count) * bsd.xdim;

	unsigned int z_size = img.dim_z;
	unsigned int z_start = zpos;
	unsigned int z_end = astc::min(z_size, zpos + bsd.zdim);

	// True if any non-identity swizzle
	bool needs_swz = (swz.r != ASTCENC_SWZ_R) || (swz.g != ASTCENC_SWZ_G) ||
	                 (swz.b != ASTCENC_SWZ_B) || (swz.a != ASTCENC_SWZ_A);

	// True if any swizzle uses Z reconstruct
	bool needs_z = (swz.r == ASTCENC_SWZ_Z) || (swz.g == ASTCENC_SWZ_Z) ||
	               (swz.b == ASTCENC_SWZ_Z) || (swz.a == ASTCENC_SWZ_Z);

	int idx = 0;
	if (img.data_type == ASTCENC_TYPE_U8)
	{
		// 8-bit UNORM output path: processed SIMD_WIDTH texels at a time
		for (unsigned int z = z_start; z < z_end; z++)
		{
			// Fetch the image plane
			uint8_t* data8 = static_cast<uint8_t*>(img.data[z]);

			for (unsigned int y = y_start; y < y_end; y++)
			{
				uint8_t* data8_row = data8 + (4 * x_size * y) + (4 * x_start);

				for (unsigned int x = 0; x < x_count; x += ASTCENC_SIMD_WIDTH)
				{
					unsigned int max_texels = ASTCENC_SIMD_WIDTH;
					unsigned int used_texels = astc::min(x_count - x, max_texels);

					// Unaligned load as rows are not always SIMD_WIDTH long
					vfloat data_r(blk.data_r + idx);
					vfloat data_g(blk.data_g + idx);
					vfloat data_b(blk.data_b + idx);
					vfloat data_a(blk.data_a + idx);

					// Clamp to 1.0 and quantize to 8-bit with round-to-nearest
					vint data_ri = float_to_int_rtn(min(data_r, 1.0f) * 255.0f);
					vint data_gi = float_to_int_rtn(min(data_g, 1.0f) * 255.0f);
					vint data_bi = float_to_int_rtn(min(data_b, 1.0f) * 255.0f);
					vint data_ai = float_to_int_rtn(min(data_a, 1.0f) * 255.0f);

					if (needs_swz)
					{
						// Table indexed by swizzle selector; slots for the
						// literal 0/255 selectors and (optionally) Z below
						vint swizzle_table[7];
						swizzle_table[ASTCENC_SWZ_0] = vint(0);
						swizzle_table[ASTCENC_SWZ_1] = vint(255);
						swizzle_table[ASTCENC_SWZ_R] = data_ri;
						swizzle_table[ASTCENC_SWZ_G] = data_gi;
						swizzle_table[ASTCENC_SWZ_B] = data_bi;
						swizzle_table[ASTCENC_SWZ_A] = data_ai;

						if (needs_z)
						{
							// Reconstruct unit-vector Z from X in the R
							// channel and Y in the A channel, remapping
							// [0,1] -> [-1,1] and back
							vfloat data_x = (data_r * vfloat(2.0f)) - vfloat(1.0f);
							vfloat data_y = (data_a * vfloat(2.0f)) - vfloat(1.0f);
							vfloat data_z = vfloat(1.0f) - (data_x * data_x) - (data_y * data_y);
							data_z = max(data_z, 0.0f);
							data_z = (sqrt(data_z) * vfloat(0.5f)) + vfloat(0.5f);

							swizzle_table[ASTCENC_SWZ_Z] = float_to_int_rtn(min(data_z, 1.0f) * 255.0f);
						}

						data_ri = swizzle_table[swz.r];
						data_gi = swizzle_table[swz.g];
						data_bi = swizzle_table[swz.b];
						data_ai = swizzle_table[swz.a];
					}

					// Errors are NaN encoded - convert to magenta error color
					// Branch is OK here - it is almost never true so predicts well
					vmask nan_mask = data_r != data_r;
					if (any(nan_mask))
					{
						data_ri = select(data_ri, vint(0xFF), nan_mask);
						data_gi = select(data_gi, vint(0x00), nan_mask);
						data_bi = select(data_bi, vint(0xFF), nan_mask);
						data_ai = select(data_ai, vint(0xFF), nan_mask);
					}

					// Pack to interleaved RGBA8 and store only the lanes
					// covering real texels in this row fragment
					vint data_rgbai = interleave_rgba8(data_ri, data_gi, data_bi, data_ai);
					vmask store_mask = vint::lane_id() < vint(used_texels);
					store_lanes_masked(reinterpret_cast<int*>(data8_row), data_rgbai, store_mask);

					data8_row += ASTCENC_SIMD_WIDTH * 4;
					idx += used_texels;
				}
				idx += x_nudge;
			}
			idx += y_nudge;
		}
	}
	else if (img.data_type == ASTCENC_TYPE_F16)
	{
		// 16-bit float output path: scalar, one texel at a time
		for (unsigned int z = z_start; z < z_end; z++)
		{
			// Fetch the image plane
			uint16_t* data16 = static_cast<uint16_t*>(img.data[z]);

			for (unsigned int y = y_start; y < y_end; y++)
			{
				uint16_t* data16_row = data16 + (4 * x_size * y) + (4 * x_start);

				for (unsigned int x = 0; x < x_count; x++)
				{
					vint4 color;

					// NaNs are handled inline - no need to special case
					if (needs_swz)
					{
						// Scalar swizzle lookup table; indices 0-3 are
						// R/G/B/A, plus slots for literal 0/1 and Z
						float data[7];
						data[ASTCENC_SWZ_0] = 0.0f;
						data[ASTCENC_SWZ_1] = 1.0f;
						data[ASTCENC_SWZ_R] = blk.data_r[idx];
						data[ASTCENC_SWZ_G] = blk.data_g[idx];
						data[ASTCENC_SWZ_B] = blk.data_b[idx];
						data[ASTCENC_SWZ_A] = blk.data_a[idx];

						if (needs_z)
						{
							// Reconstruct Z from X in data[0] (red) and Y in
							// data[3] (alpha), clamping the radicand at zero
							float xN = (data[0] * 2.0f) - 1.0f;
							float yN = (data[3] * 2.0f) - 1.0f;
							float zN = 1.0f - xN * xN - yN * yN;
							if (zN < 0.0f)
							{
								zN = 0.0f;
							}
							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
						}

						vfloat4 colorf(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
						color = float_to_float16(colorf);
					}
					else
					{
						vfloat4 colorf = blk.texel(idx);
						color = float_to_float16(colorf);
					}

					// TODO: Vectorize with store N shorts?
					data16_row[0] = static_cast<uint16_t>(color.lane<0>());
					data16_row[1] = static_cast<uint16_t>(color.lane<1>());
					data16_row[2] = static_cast<uint16_t>(color.lane<2>());
					data16_row[3] = static_cast<uint16_t>(color.lane<3>());
					data16_row += 4;
					idx++;
				}
				idx += x_nudge;
			}
			idx += y_nudge;
		}
	}
	else // if (img.data_type == ASTCENC_TYPE_F32)
	{
		assert(img.data_type == ASTCENC_TYPE_F32);

		// 32-bit float output path: scalar, one texel at a time
		for (unsigned int z = z_start; z < z_end; z++)
		{
			// Fetch the image plane
			float* data32 = static_cast<float*>(img.data[z]);

			for (unsigned int y = y_start; y < y_end; y++)
			{
				float* data32_row = data32 + (4 * x_size * y) + (4 * x_start);

				for (unsigned int x = 0; x < x_count; x++)
				{
					vfloat4 color = blk.texel(idx);

					// NaNs are handled inline - no need to special case
					if (needs_swz)
					{
						// Scalar swizzle lookup table; indices 0-3 are
						// R/G/B/A, plus slots for literal 0/1 and Z
						float data[7];
						data[ASTCENC_SWZ_0] = 0.0f;
						data[ASTCENC_SWZ_1] = 1.0f;
						data[ASTCENC_SWZ_R] = color.lane<0>();
						data[ASTCENC_SWZ_G] = color.lane<1>();
						data[ASTCENC_SWZ_B] = color.lane<2>();
						data[ASTCENC_SWZ_A] = color.lane<3>();

						if (needs_z)
						{
							// Reconstruct Z from X in data[0] (red) and Y in
							// data[3] (alpha), clamping the radicand at zero
							float xN = (data[0] * 2.0f) - 1.0f;
							float yN = (data[3] * 2.0f) - 1.0f;
							float zN = 1.0f - xN * xN - yN * yN;
							if (zN < 0.0f)
							{
								zN = 0.0f;
							}
							data[ASTCENC_SWZ_Z] = (astc::sqrt(zN) * 0.5f) + 0.5f;
						}

						color = vfloat4(data[swz.r], data[swz.g], data[swz.b], data[swz.a]);
					}

					store(color, data32_row);
					data32_row += 4;
					idx++;
				}
				idx += x_nudge;
			}
			idx += y_nudge;
		}
	}
}
559 | |