1 | // SPDX-License-Identifier: Apache-2.0 |
2 | // ---------------------------------------------------------------------------- |
3 | // Copyright 2011-2023 Arm Limited |
4 | // |
5 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
6 | // use this file except in compliance with the License. You may obtain a copy |
7 | // of the License at: |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, software |
12 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
13 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
14 | // License for the specific language governing permissions and limitations |
15 | // under the License. |
16 | // ---------------------------------------------------------------------------- |
17 | |
18 | /** |
19 | * @brief Functions for the library entrypoint. |
20 | */ |
21 | |
22 | #include <array> |
23 | #include <cstring> |
24 | #include <new> |
25 | |
26 | #include "astcenc.h" |
27 | #include "astcenc_internal_entry.h" |
28 | #include "astcenc_diagnostic_trace.h" |
29 | |
/**
 * @brief Record of the quality tuning parameter values.
 *
 * See the @c astcenc_config structure for detailed parameter documentation.
 *
 * Each record is one row of a preset table. @c astcenc_config_init() looks rows up by @c quality
 * and either copies a row or linearly interpolates between two adjacent rows. The field order is
 * significant because the preset tables use positional aggregate initialization.
 *
 * Note that the mse_overshoot entries are scaling factors relative to the base MSE to hit db_limit.
 * A 20% overshoot is harder to hit for a higher base db_limit, so we may actually use lower ratios
 * for the more thorough search presets because the underlying db_limit is so much higher.
 */
struct astcenc_preset_config
{
    /** The quality level of this row; the key used for preset lookup and interpolation. */
    float quality;

    // The fields below feed the same-named astcenc_config fields
    unsigned int tune_partition_count_limit;
    unsigned int tune_2partition_index_limit;
    unsigned int tune_3partition_index_limit;
    unsigned int tune_4partition_index_limit;
    unsigned int tune_block_mode_limit;
    unsigned int tune_refinement_limit;

    // The candidate limits are additionally capped by build-time maximums when transferred
    unsigned int tune_candidate_limit;
    unsigned int tune_2partitioning_candidate_limit;
    unsigned int tune_3partitioning_candidate_limit;
    unsigned int tune_4partitioning_candidate_limit;

    // The two dB bases are combined into astcenc_config::tune_db_limit as
    // max(a_base - 35 * log10(texels), b_base - 19 * log10(texels))
    float tune_db_limit_a_base;
    float tune_db_limit_b_base;

    // The fields below feed the same-named astcenc_config fields
    float tune_mse_overshoot;
    float tune_2partition_early_out_limit_factor;
    float tune_3partition_early_out_limit_factor;
    float tune_2plane_early_out_limit_correlation;
};
59 | |
/**
 * @brief The static presets for high bandwidth encodings (x < 25 texels per block).
 *
 * Each row is positionally initialized in @c astcenc_preset_config field order: the quality level
 * on the first line, followed by the remaining fields on the second line. Rows are listed from
 * @c ASTCENC_PRE_FASTEST to @c ASTCENC_PRE_EXHAUSTIVE, and @c astcenc_config_init() scans them in
 * this order looking for the first row at or above the requested quality.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_high {{
    {
        ASTCENC_PRE_FASTEST,
        2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f
    }, {
        ASTCENC_PRE_FAST,
        3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f
    }, {
        ASTCENC_PRE_MEDIUM,
        4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f
    }, {
        ASTCENC_PRE_THOROUGH,
        4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f
    }, {
        ASTCENC_PRE_VERYTHOROUGH,
        4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
    }, {
        ASTCENC_PRE_EXHAUSTIVE,
        4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
    }
}};
84 | |
/**
 * @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block).
 *
 * Each row is positionally initialized in @c astcenc_preset_config field order: the quality level
 * on the first line, followed by the remaining fields on the second line. Rows are listed from
 * @c ASTCENC_PRE_FASTEST to @c ASTCENC_PRE_EXHAUSTIVE, and @c astcenc_config_init() scans them in
 * this order looking for the first row at or above the requested quality.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
    {
        ASTCENC_PRE_FASTEST,
        2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f
    }, {
        ASTCENC_PRE_FAST,
        3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f
    }, {
        ASTCENC_PRE_MEDIUM,
        4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f
    }, {
        ASTCENC_PRE_THOROUGH,
        4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f
    }, {
        ASTCENC_PRE_VERYTHOROUGH,
        4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
    }, {
        ASTCENC_PRE_EXHAUSTIVE,
        4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
    }
}};
109 | |
/**
 * @brief The static presets for low bandwidth encodings (64 <= x texels per block).
 *
 * Each row is positionally initialized in @c astcenc_preset_config field order: the quality level
 * on the first line, followed by the remaining fields on the second line. Rows are listed from
 * @c ASTCENC_PRE_FASTEST to @c ASTCENC_PRE_EXHAUSTIVE, and @c astcenc_config_init() scans them in
 * this order looking for the first row at or above the requested quality.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_low {{
    {
        ASTCENC_PRE_FASTEST,
        2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f
    }, {
        ASTCENC_PRE_FAST,
        2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f
    }, {
        ASTCENC_PRE_MEDIUM,
        3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f
    }, {
        ASTCENC_PRE_THOROUGH,
        4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f
    }, {
        ASTCENC_PRE_VERYTHOROUGH,
        4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
    }, {
        ASTCENC_PRE_EXHAUSTIVE,
        4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
    }
}};
134 | |
135 | /** |
136 | * @brief Validate CPU floating point meets assumptions made in the codec. |
137 | * |
138 | * The codec is written with the assumption that a float threaded through the @c if32 union will be |
139 | * stored and reloaded as a 32-bit IEEE-754 float with round-to-nearest rounding. This is always the |
140 | * case in an IEEE-754 compliant system, however not every system or compilation mode is actually |
141 | * IEEE-754 compliant. This normally fails if the code is compiled with fast math enabled. |
142 | * |
143 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
144 | */ |
145 | static astcenc_error validate_cpu_float() |
146 | { |
147 | if32 p; |
148 | volatile float xprec_testval = 2.51f; |
149 | p.f = xprec_testval + 12582912.0f; |
150 | float q = p.f - 12582912.0f; |
151 | |
152 | if (q != 3.0f) |
153 | { |
154 | return ASTCENC_ERR_BAD_CPU_FLOAT; |
155 | } |
156 | |
157 | return ASTCENC_SUCCESS; |
158 | } |
159 | |
160 | /** |
161 | * @brief Validate config profile. |
162 | * |
163 | * @param profile The profile to check. |
164 | * |
165 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
166 | */ |
167 | static astcenc_error validate_profile( |
168 | astcenc_profile profile |
169 | ) { |
170 | // Values in this enum are from an external user, so not guaranteed to be |
171 | // bounded to the enum values |
172 | switch (static_cast<int>(profile)) |
173 | { |
174 | case ASTCENC_PRF_LDR_SRGB: |
175 | case ASTCENC_PRF_LDR: |
176 | case ASTCENC_PRF_HDR_RGB_LDR_A: |
177 | case ASTCENC_PRF_HDR: |
178 | return ASTCENC_SUCCESS; |
179 | default: |
180 | return ASTCENC_ERR_BAD_PROFILE; |
181 | } |
182 | } |
183 | |
184 | /** |
185 | * @brief Validate block size. |
186 | * |
187 | * @param block_x The block x dimensions. |
188 | * @param block_y The block y dimensions. |
189 | * @param block_z The block z dimensions. |
190 | * |
191 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
192 | */ |
193 | static astcenc_error validate_block_size( |
194 | unsigned int block_x, |
195 | unsigned int block_y, |
196 | unsigned int block_z |
197 | ) { |
198 | // Test if this is a legal block size at all |
199 | bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) || |
200 | ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z))); |
201 | if (!is_legal) |
202 | { |
203 | return ASTCENC_ERR_BAD_BLOCK_SIZE; |
204 | } |
205 | |
206 | // Test if this build has sufficient capacity for this block size |
207 | bool have_capacity = (block_x * block_y * block_z) <= BLOCK_MAX_TEXELS; |
208 | if (!have_capacity) |
209 | { |
210 | return ASTCENC_ERR_NOT_IMPLEMENTED; |
211 | } |
212 | |
213 | return ASTCENC_SUCCESS; |
214 | } |
215 | |
216 | /** |
217 | * @brief Validate flags. |
218 | * |
219 | * @param flags The flags to check. |
220 | * |
221 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
222 | */ |
223 | static astcenc_error validate_flags( |
224 | unsigned int flags |
225 | ) { |
226 | // Flags field must not contain any unknown flag bits |
227 | unsigned int exMask = ~ASTCENC_ALL_FLAGS; |
228 | if (popcount(flags & exMask) != 0) |
229 | { |
230 | return ASTCENC_ERR_BAD_FLAGS; |
231 | } |
232 | |
233 | // Flags field must only contain at most a single map type |
234 | exMask = ASTCENC_FLG_MAP_NORMAL |
235 | | ASTCENC_FLG_MAP_RGBM; |
236 | if (popcount(flags & exMask) > 1) |
237 | { |
238 | return ASTCENC_ERR_BAD_FLAGS; |
239 | } |
240 | |
241 | return ASTCENC_SUCCESS; |
242 | } |
243 | |
244 | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
245 | |
246 | /** |
247 | * @brief Validate single channel compression swizzle. |
248 | * |
249 | * @param swizzle The swizzle to check. |
250 | * |
251 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
252 | */ |
253 | static astcenc_error validate_compression_swz( |
254 | astcenc_swz swizzle |
255 | ) { |
256 | // Not all enum values are handled; SWZ_Z is invalid for compression |
257 | switch (static_cast<int>(swizzle)) |
258 | { |
259 | case ASTCENC_SWZ_R: |
260 | case ASTCENC_SWZ_G: |
261 | case ASTCENC_SWZ_B: |
262 | case ASTCENC_SWZ_A: |
263 | case ASTCENC_SWZ_0: |
264 | case ASTCENC_SWZ_1: |
265 | return ASTCENC_SUCCESS; |
266 | default: |
267 | return ASTCENC_ERR_BAD_SWIZZLE; |
268 | } |
269 | } |
270 | |
271 | /** |
272 | * @brief Validate overall compression swizzle. |
273 | * |
274 | * @param swizzle The swizzle to check. |
275 | * |
276 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
277 | */ |
278 | static astcenc_error validate_compression_swizzle( |
279 | const astcenc_swizzle& swizzle |
280 | ) { |
281 | if (validate_compression_swz(swizzle.r) || |
282 | validate_compression_swz(swizzle.g) || |
283 | validate_compression_swz(swizzle.b) || |
284 | validate_compression_swz(swizzle.a)) |
285 | { |
286 | return ASTCENC_ERR_BAD_SWIZZLE; |
287 | } |
288 | |
289 | return ASTCENC_SUCCESS; |
290 | } |
291 | #endif |
292 | |
293 | /** |
294 | * @brief Validate single channel decompression swizzle. |
295 | * |
296 | * @param swizzle The swizzle to check. |
297 | * |
298 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
299 | */ |
300 | static astcenc_error validate_decompression_swz( |
301 | astcenc_swz swizzle |
302 | ) { |
303 | // Values in this enum are from an external user, so not guaranteed to be |
304 | // bounded to the enum values |
305 | switch (static_cast<int>(swizzle)) |
306 | { |
307 | case ASTCENC_SWZ_R: |
308 | case ASTCENC_SWZ_G: |
309 | case ASTCENC_SWZ_B: |
310 | case ASTCENC_SWZ_A: |
311 | case ASTCENC_SWZ_0: |
312 | case ASTCENC_SWZ_1: |
313 | case ASTCENC_SWZ_Z: |
314 | return ASTCENC_SUCCESS; |
315 | default: |
316 | return ASTCENC_ERR_BAD_SWIZZLE; |
317 | } |
318 | } |
319 | |
320 | /** |
321 | * @brief Validate overall decompression swizzle. |
322 | * |
323 | * @param swizzle The swizzle to check. |
324 | * |
325 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
326 | */ |
327 | static astcenc_error validate_decompression_swizzle( |
328 | const astcenc_swizzle& swizzle |
329 | ) { |
330 | if (validate_decompression_swz(swizzle.r) || |
331 | validate_decompression_swz(swizzle.g) || |
332 | validate_decompression_swz(swizzle.b) || |
333 | validate_decompression_swz(swizzle.a)) |
334 | { |
335 | return ASTCENC_ERR_BAD_SWIZZLE; |
336 | } |
337 | |
338 | return ASTCENC_SUCCESS; |
339 | } |
340 | |
341 | /** |
342 | * Validate that an incoming configuration is in-spec. |
343 | * |
344 | * This function can respond in two ways: |
345 | * |
346 | * * Numerical inputs that have valid ranges are clamped to those valid ranges. No error is thrown |
347 | * for out-of-range inputs in this case. |
348 | * * Numerical inputs and logic inputs are are logically invalid and which make no sense |
349 | * algorithmically will return an error. |
350 | * |
351 | * @param[in,out] config The input compressor configuration. |
352 | * |
353 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
354 | */ |
355 | static astcenc_error validate_config( |
356 | astcenc_config &config |
357 | ) { |
358 | astcenc_error status; |
359 | |
360 | status = validate_profile(config.profile); |
361 | if (status != ASTCENC_SUCCESS) |
362 | { |
363 | return status; |
364 | } |
365 | |
366 | status = validate_flags(config.flags); |
367 | if (status != ASTCENC_SUCCESS) |
368 | { |
369 | return status; |
370 | } |
371 | |
372 | status = validate_block_size(config.block_x, config.block_y, config.block_z); |
373 | if (status != ASTCENC_SUCCESS) |
374 | { |
375 | return status; |
376 | } |
377 | |
378 | #if defined(ASTCENC_DECOMPRESS_ONLY) |
379 | // Decompress-only builds only support decompress-only contexts |
380 | if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)) |
381 | { |
382 | return ASTCENC_ERR_BAD_PARAM; |
383 | } |
384 | #endif |
385 | |
386 | config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f); |
387 | |
388 | config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u); |
389 | config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); |
390 | config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); |
391 | config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); |
392 | config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u); |
393 | config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u); |
394 | config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES); |
395 | config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES); |
396 | config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES); |
397 | config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES); |
398 | config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f); |
399 | config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f); |
400 | config.tune_2partition_early_out_limit_factor = astc::max(config.tune_2partition_early_out_limit_factor, 0.0f); |
401 | config.tune_3partition_early_out_limit_factor = astc::max(config.tune_3partition_early_out_limit_factor, 0.0f); |
402 | config.tune_2plane_early_out_limit_correlation = astc::max(config.tune_2plane_early_out_limit_correlation, 0.0f); |
403 | |
404 | // Specifying a zero weight color component is not allowed; force to small value |
405 | float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight), |
406 | astc::max(config.cw_b_weight, config.cw_a_weight)); |
407 | if (max_weight > 0.0f) |
408 | { |
409 | max_weight /= 1000.0f; |
410 | config.cw_r_weight = astc::max(config.cw_r_weight, max_weight); |
411 | config.cw_g_weight = astc::max(config.cw_g_weight, max_weight); |
412 | config.cw_b_weight = astc::max(config.cw_b_weight, max_weight); |
413 | config.cw_a_weight = astc::max(config.cw_a_weight, max_weight); |
414 | } |
415 | // If all color components error weights are zero then return an error |
416 | else |
417 | { |
418 | return ASTCENC_ERR_BAD_PARAM; |
419 | } |
420 | |
421 | return ASTCENC_SUCCESS; |
422 | } |
423 | |
424 | /* See header for documentation. */ |
425 | astcenc_error astcenc_config_init( |
426 | astcenc_profile profile, |
427 | unsigned int block_x, |
428 | unsigned int block_y, |
429 | unsigned int block_z, |
430 | float quality, |
431 | unsigned int flags, |
432 | astcenc_config* configp |
433 | ) { |
434 | astcenc_error status; |
435 | |
436 | status = validate_cpu_float(); |
437 | if (status != ASTCENC_SUCCESS) |
438 | { |
439 | return status; |
440 | } |
441 | |
442 | // Zero init all config fields; although most of will be over written |
443 | astcenc_config& config = *configp; |
444 | std::memset(&config, 0, sizeof(config)); |
445 | |
446 | // Process the block size |
447 | block_z = astc::max(block_z, 1u); // For 2D blocks Z==0 is accepted, but convert to 1 |
448 | status = validate_block_size(block_x, block_y, block_z); |
449 | if (status != ASTCENC_SUCCESS) |
450 | { |
451 | return status; |
452 | } |
453 | |
454 | config.block_x = block_x; |
455 | config.block_y = block_y; |
456 | config.block_z = block_z; |
457 | |
458 | float texels = static_cast<float>(block_x * block_y * block_z); |
459 | float ltexels = logf(texels) / logf(10.0f); |
460 | |
461 | // Process the performance quality level or preset; note that this must be done before we |
462 | // process any additional settings, such as color profile and flags, which may replace some of |
463 | // these settings with more use case tuned values |
464 | if (quality < ASTCENC_PRE_FASTEST || |
465 | quality > ASTCENC_PRE_EXHAUSTIVE) |
466 | { |
467 | return ASTCENC_ERR_BAD_QUALITY; |
468 | } |
469 | |
470 | static const std::array<astcenc_preset_config, 6>* preset_configs; |
471 | int texels_int = block_x * block_y * block_z; |
472 | if (texels_int < 25) |
473 | { |
474 | preset_configs = &preset_configs_high; |
475 | } |
476 | else if (texels_int < 64) |
477 | { |
478 | preset_configs = &preset_configs_mid; |
479 | } |
480 | else |
481 | { |
482 | preset_configs = &preset_configs_low; |
483 | } |
484 | |
485 | // Determine which preset to use, or which pair to interpolate |
486 | size_t start; |
487 | size_t end; |
488 | for (end = 0; end < preset_configs->size(); end++) |
489 | { |
490 | if ((*preset_configs)[end].quality >= quality) |
491 | { |
492 | break; |
493 | } |
494 | } |
495 | |
496 | start = end == 0 ? 0 : end - 1; |
497 | |
498 | // Start and end node are the same - so just transfer the values. |
499 | if (start == end) |
500 | { |
501 | config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit; |
502 | config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit; |
503 | config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit; |
504 | config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit; |
505 | config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit; |
506 | config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit; |
507 | config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, TUNE_MAX_TRIAL_CANDIDATES); |
508 | config.tune_2partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_2partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES); |
509 | config.tune_3partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_3partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES); |
510 | config.tune_4partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_4partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES); |
511 | config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels, |
512 | (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels); |
513 | |
514 | config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot; |
515 | |
516 | config.tune_2partition_early_out_limit_factor = (*preset_configs)[start].tune_2partition_early_out_limit_factor; |
517 | config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor; |
518 | config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation; |
519 | } |
520 | // Start and end node are not the same - so interpolate between them |
521 | else |
522 | { |
523 | auto& node_a = (*preset_configs)[start]; |
524 | auto& node_b = (*preset_configs)[end]; |
525 | |
526 | float wt_range = node_b.quality - node_a.quality; |
527 | assert(wt_range > 0); |
528 | |
529 | // Compute interpolation factors |
530 | float wt_node_a = (node_b.quality - quality) / wt_range; |
531 | float wt_node_b = (quality - node_a.quality) / wt_range; |
532 | |
533 | #define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b)) |
534 | #define LERPI(param) astc::flt2int_rtn(\ |
535 | (static_cast<float>(node_a.param) * wt_node_a) + \ |
536 | (static_cast<float>(node_b.param) * wt_node_b)) |
537 | #define LERPUI(param) static_cast<unsigned int>(LERPI(param)) |
538 | |
539 | config.tune_partition_count_limit = LERPI(tune_partition_count_limit); |
540 | config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit); |
541 | config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit); |
542 | config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit); |
543 | config.tune_block_mode_limit = LERPI(tune_block_mode_limit); |
544 | config.tune_refinement_limit = LERPI(tune_refinement_limit); |
545 | config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit), |
546 | TUNE_MAX_TRIAL_CANDIDATES); |
547 | config.tune_2partitioning_candidate_limit = astc::min(LERPUI(tune_2partitioning_candidate_limit), |
548 | BLOCK_MAX_PARTITIONINGS); |
549 | config.tune_3partitioning_candidate_limit = astc::min(LERPUI(tune_3partitioning_candidate_limit), |
550 | BLOCK_MAX_PARTITIONINGS); |
551 | config.tune_4partitioning_candidate_limit = astc::min(LERPUI(tune_4partitioning_candidate_limit), |
552 | BLOCK_MAX_PARTITIONINGS); |
553 | config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels, |
554 | LERP(tune_db_limit_b_base) - 19 * ltexels); |
555 | |
556 | config.tune_mse_overshoot = LERP(tune_mse_overshoot); |
557 | |
558 | config.tune_2partition_early_out_limit_factor = LERP(tune_2partition_early_out_limit_factor); |
559 | config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor); |
560 | config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation); |
561 | #undef LERP |
562 | #undef LERPI |
563 | #undef LERPUI |
564 | } |
565 | |
566 | // Set heuristics to the defaults for each color profile |
567 | config.cw_r_weight = 1.0f; |
568 | config.cw_g_weight = 1.0f; |
569 | config.cw_b_weight = 1.0f; |
570 | config.cw_a_weight = 1.0f; |
571 | |
572 | config.a_scale_radius = 0; |
573 | |
574 | config.rgbm_m_scale = 0.0f; |
575 | |
576 | config.profile = profile; |
577 | |
578 | // Values in this enum are from an external user, so not guaranteed to be |
579 | // bounded to the enum values |
580 | switch (static_cast<int>(profile)) |
581 | { |
582 | case ASTCENC_PRF_LDR: |
583 | case ASTCENC_PRF_LDR_SRGB: |
584 | break; |
585 | case ASTCENC_PRF_HDR_RGB_LDR_A: |
586 | case ASTCENC_PRF_HDR: |
587 | config.tune_db_limit = 999.0f; |
588 | break; |
589 | default: |
590 | return ASTCENC_ERR_BAD_PROFILE; |
591 | } |
592 | |
593 | // Flags field must not contain any unknown flag bits |
594 | status = validate_flags(flags); |
595 | if (status != ASTCENC_SUCCESS) |
596 | { |
597 | return status; |
598 | } |
599 | |
600 | if (flags & ASTCENC_FLG_MAP_NORMAL) |
601 | { |
602 | // Normal map encoding uses L+A blocks, so allow one more partitioning |
603 | // than normal. We need need fewer bits for endpoints, so more likely |
604 | // to be able to use more partitions than an RGB/RGBA block |
605 | config.tune_partition_count_limit = astc::min(config.tune_partition_count_limit + 1u, 4u); |
606 | |
607 | config.cw_g_weight = 0.0f; |
608 | config.cw_b_weight = 0.0f; |
609 | config.tune_2partition_early_out_limit_factor *= 1.5f; |
610 | config.tune_3partition_early_out_limit_factor *= 1.5f; |
611 | config.tune_2plane_early_out_limit_correlation = 0.99f; |
612 | |
613 | // Normals are prone to blocking artifacts on smooth curves |
614 | // so force compressor to try harder here ... |
615 | config.tune_db_limit *= 1.03f; |
616 | } |
617 | else if (flags & ASTCENC_FLG_MAP_RGBM) |
618 | { |
619 | config.rgbm_m_scale = 5.0f; |
620 | config.cw_a_weight = 2.0f * config.rgbm_m_scale; |
621 | } |
622 | else // (This is color data) |
623 | { |
624 | // This is a very basic perceptual metric for RGB color data, which weights error |
625 | // significance by the perceptual luminance contribution of each color channel. For |
626 | // luminance the usual weights to compute luminance from a linear RGB value are as |
627 | // follows: |
628 | // |
629 | // l = r * 0.3 + g * 0.59 + b * 0.11 |
630 | // |
631 | // ... but we scale these up to keep a better balance between color and alpha. Note |
632 | // that if the content is using alpha we'd recommend using the -a option to weight |
633 | // the color contribution by the alpha transparency. |
634 | if (flags & ASTCENC_FLG_USE_PERCEPTUAL) |
635 | { |
636 | config.cw_r_weight = 0.30f * 2.25f; |
637 | config.cw_g_weight = 0.59f * 2.25f; |
638 | config.cw_b_weight = 0.11f * 2.25f; |
639 | } |
640 | } |
641 | config.flags = flags; |
642 | |
643 | return ASTCENC_SUCCESS; |
644 | } |
645 | |
646 | /* See header for documentation. */ |
647 | astcenc_error astcenc_context_alloc( |
648 | const astcenc_config* configp, |
649 | unsigned int thread_count, |
650 | astcenc_context** context |
651 | ) { |
652 | astcenc_error status; |
653 | const astcenc_config& config = *configp; |
654 | |
655 | status = validate_cpu_float(); |
656 | if (status != ASTCENC_SUCCESS) |
657 | { |
658 | return status; |
659 | } |
660 | |
661 | if (thread_count == 0) |
662 | { |
663 | return ASTCENC_ERR_BAD_PARAM; |
664 | } |
665 | |
666 | #if defined(ASTCENC_DIAGNOSTICS) |
667 | // Force single threaded compressor use in diagnostic mode. |
668 | if (thread_count != 1) |
669 | { |
670 | return ASTCENC_ERR_BAD_PARAM; |
671 | } |
672 | #endif |
673 | |
674 | astcenc_context* ctxo = new astcenc_context; |
675 | astcenc_contexti* ctx = &ctxo->context; |
676 | ctx->thread_count = thread_count; |
677 | ctx->config = config; |
678 | ctx->working_buffers = nullptr; |
679 | |
680 | // These are allocated per-compress, as they depend on image size |
681 | ctx->input_alpha_averages = nullptr; |
682 | |
683 | // Copy the config first and validate the copy (we may modify it) |
684 | status = validate_config(ctx->config); |
685 | if (status != ASTCENC_SUCCESS) |
686 | { |
687 | delete ctxo; |
688 | return status; |
689 | } |
690 | |
691 | ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN); |
692 | bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY); |
693 | init_block_size_descriptor(config.block_x, config.block_y, config.block_z, |
694 | can_omit_modes, |
695 | config.tune_partition_count_limit, |
696 | static_cast<float>(config.tune_block_mode_limit) / 100.0f, |
697 | *ctx->bsd); |
698 | |
699 | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
700 | // Do setup only needed by compression |
701 | if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY)) |
702 | { |
703 | // Turn a dB limit into a per-texel error for faster use later |
704 | if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB)) |
705 | { |
706 | ctx->config.tune_db_limit = astc::pow(0.1f, ctx->config.tune_db_limit * 0.1f) * 65535.0f * 65535.0f; |
707 | } |
708 | else |
709 | { |
710 | ctx->config.tune_db_limit = 0.0f; |
711 | } |
712 | |
713 | size_t worksize = sizeof(compression_working_buffers) * thread_count; |
714 | ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN); |
715 | static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0, |
716 | "compression_working_buffers size must be multiple of vector alignment" ); |
717 | if (!ctx->working_buffers) |
718 | { |
719 | aligned_free<block_size_descriptor>(ctx->bsd); |
720 | delete ctxo; |
721 | *context = nullptr; |
722 | return ASTCENC_ERR_OUT_OF_MEM; |
723 | } |
724 | } |
725 | #endif |
726 | |
727 | #if defined(ASTCENC_DIAGNOSTICS) |
728 | ctx->trace_log = new TraceLog(ctx->config.trace_file_path); |
729 | if (!ctx->trace_log->m_file) |
730 | { |
731 | return ASTCENC_ERR_DTRACE_FAILURE; |
732 | } |
733 | |
734 | trace_add_data("block_x" , config.block_x); |
735 | trace_add_data("block_y" , config.block_y); |
736 | trace_add_data("block_z" , config.block_z); |
737 | #endif |
738 | |
739 | *context = ctxo; |
740 | |
741 | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
742 | prepare_angular_tables(); |
743 | #endif |
744 | |
745 | return ASTCENC_SUCCESS; |
746 | } |
747 | |
748 | /* See header dor documentation. */ |
749 | void astcenc_context_free( |
750 | astcenc_context* ctxo |
751 | ) { |
752 | if (ctxo) |
753 | { |
754 | astcenc_contexti* ctx = &ctxo->context; |
755 | aligned_free<compression_working_buffers>(ctx->working_buffers); |
756 | aligned_free<block_size_descriptor>(ctx->bsd); |
757 | #if defined(ASTCENC_DIAGNOSTICS) |
758 | delete ctx->trace_log; |
759 | #endif |
760 | delete ctxo; |
761 | } |
762 | } |
763 | |
764 | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
765 | |
766 | /** |
767 | * @brief Compress an image, after any preflight has completed. |
768 | * |
769 | * @param[out] ctxo The compressor context. |
770 | * @param thread_index The thread index. |
771 | * @param image The input image. |
772 | * @param swizzle The input swizzle. |
773 | * @param[out] buffer The output array for the compressed data. |
774 | */ |
775 | static void compress_image( |
776 | astcenc_context& ctxo, |
777 | unsigned int thread_index, |
778 | const astcenc_image& image, |
779 | const astcenc_swizzle& swizzle, |
780 | uint8_t* buffer |
781 | ) { |
782 | astcenc_contexti& ctx = ctxo.context; |
783 | const block_size_descriptor& bsd = *ctx.bsd; |
784 | astcenc_profile decode_mode = ctx.config.profile; |
785 | |
786 | image_block blk; |
787 | |
788 | int block_x = bsd.xdim; |
789 | int block_y = bsd.ydim; |
790 | int block_z = bsd.zdim; |
791 | blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z); |
792 | |
793 | int dim_x = image.dim_x; |
794 | int dim_y = image.dim_y; |
795 | int dim_z = image.dim_z; |
796 | |
797 | int xblocks = (dim_x + block_x - 1) / block_x; |
798 | int yblocks = (dim_y + block_y - 1) / block_y; |
799 | int zblocks = (dim_z + block_z - 1) / block_z; |
800 | int block_count = zblocks * yblocks * xblocks; |
801 | |
802 | int row_blocks = xblocks; |
803 | int plane_blocks = xblocks * yblocks; |
804 | |
805 | // Populate the block channel weights |
806 | blk.channel_weight = vfloat4(ctx.config.cw_r_weight, |
807 | ctx.config.cw_g_weight, |
808 | ctx.config.cw_b_weight, |
809 | ctx.config.cw_a_weight); |
810 | |
811 | // Use preallocated scratch buffer |
812 | auto& temp_buffers = ctx.working_buffers[thread_index]; |
813 | |
814 | // Only the first thread actually runs the initializer |
815 | ctxo.manage_compress.init(block_count); |
816 | |
817 | // Determine if we can use an optimized load function |
818 | bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) || |
819 | (swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A); |
820 | |
821 | bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) || |
822 | (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A); |
823 | |
824 | bool use_fast_load = !needs_swz && !needs_hdr && |
825 | block_z == 1 && image.data_type == ASTCENC_TYPE_U8; |
826 | |
827 | auto load_func = load_image_block; |
828 | if (use_fast_load) |
829 | { |
830 | load_func = load_image_block_fast_ldr; |
831 | } |
832 | |
833 | // All threads run this processing loop until there is no work remaining |
834 | while (true) |
835 | { |
836 | unsigned int count; |
837 | unsigned int base = ctxo.manage_compress.get_task_assignment(16, count); |
838 | if (!count) |
839 | { |
840 | break; |
841 | } |
842 | |
843 | for (unsigned int i = base; i < base + count; i++) |
844 | { |
845 | // Decode i into x, y, z block indices |
846 | int z = i / plane_blocks; |
847 | unsigned int rem = i - (z * plane_blocks); |
848 | int y = rem / row_blocks; |
849 | int x = rem - (y * row_blocks); |
850 | |
851 | // Test if we can apply some basic alpha-scale RDO |
852 | bool use_full_block = true; |
853 | if (ctx.config.a_scale_radius != 0 && block_z == 1) |
854 | { |
855 | int start_x = x * block_x; |
856 | int end_x = astc::min(dim_x, start_x + block_x); |
857 | |
858 | int start_y = y * block_y; |
859 | int end_y = astc::min(dim_y, start_y + block_y); |
860 | |
861 | // SATs accumulate error, so don't test exactly zero. Test for |
862 | // less than 1 alpha in the expanded block footprint that |
863 | // includes the alpha radius. |
864 | int = block_x + 2 * (ctx.config.a_scale_radius - 1); |
865 | |
866 | int = block_y + 2 * (ctx.config.a_scale_radius - 1); |
867 | |
868 | float = static_cast<float>(x_footprint * y_footprint); |
869 | float threshold = 0.9f / (255.0f * footprint); |
870 | |
871 | // Do we have any alpha values? |
872 | use_full_block = false; |
873 | for (int ay = start_y; ay < end_y; ay++) |
874 | { |
875 | for (int ax = start_x; ax < end_x; ax++) |
876 | { |
877 | float a_avg = ctx.input_alpha_averages[ay * dim_x + ax]; |
878 | if (a_avg > threshold) |
879 | { |
880 | use_full_block = true; |
881 | ax = end_x; |
882 | ay = end_y; |
883 | } |
884 | } |
885 | } |
886 | } |
887 | |
888 | // Fetch the full block for compression |
889 | if (use_full_block) |
890 | { |
891 | load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle); |
892 | |
893 | // Scale RGB error contribution by the maximum alpha in the block |
894 | // This encourages preserving alpha accuracy in regions with high |
895 | // transparency, and can buy up to 0.5 dB PSNR. |
896 | if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT) |
897 | { |
898 | float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f); |
899 | blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale, |
900 | ctx.config.cw_g_weight * alpha_scale, |
901 | ctx.config.cw_b_weight * alpha_scale, |
902 | ctx.config.cw_a_weight); |
903 | } |
904 | } |
905 | // Apply alpha scale RDO - substitute constant color block |
906 | else |
907 | { |
908 | blk.origin_texel = vfloat4::zero(); |
909 | blk.data_min = vfloat4::zero(); |
910 | blk.data_mean = vfloat4::zero(); |
911 | blk.data_max = vfloat4::zero(); |
912 | blk.grayscale = true; |
913 | } |
914 | |
915 | int offset = ((z * yblocks + y) * xblocks + x) * 16; |
916 | uint8_t *bp = buffer + offset; |
917 | physical_compressed_block* pcb = reinterpret_cast<physical_compressed_block*>(bp); |
918 | compress_block(ctx, blk, *pcb, temp_buffers); |
919 | } |
920 | |
921 | ctxo.manage_compress.complete_task_assignment(count); |
922 | } |
923 | } |
924 | |
925 | /** |
926 | * @brief Compute regional averages in an image. |
927 | * |
928 | * This function can be called by multiple threads, but only after a single |
929 | * thread calls the setup function @c init_compute_averages(). |
930 | * |
931 | * Results are written back into @c img->input_alpha_averages. |
932 | * |
933 | * @param[out] ctx The context. |
934 | * @param ag The average and variance arguments created during setup. |
935 | */ |
936 | static void compute_averages( |
937 | astcenc_context& ctx, |
938 | const avg_args &ag |
939 | ) { |
940 | pixel_region_args arg = ag.arg; |
941 | arg.work_memory = new vfloat4[ag.work_memory_size]; |
942 | |
943 | int size_x = ag.img_size_x; |
944 | int size_y = ag.img_size_y; |
945 | int size_z = ag.img_size_z; |
946 | |
947 | int step_xy = ag.blk_size_xy; |
948 | int step_z = ag.blk_size_z; |
949 | |
950 | int y_tasks = (size_y + step_xy - 1) / step_xy; |
951 | |
952 | // All threads run this processing loop until there is no work remaining |
953 | while (true) |
954 | { |
955 | unsigned int count; |
956 | unsigned int base = ctx.manage_avg.get_task_assignment(16, count); |
957 | if (!count) |
958 | { |
959 | break; |
960 | } |
961 | |
962 | for (unsigned int i = base; i < base + count; i++) |
963 | { |
964 | int z = (i / (y_tasks)) * step_z; |
965 | int y = (i - (z * y_tasks)) * step_xy; |
966 | |
967 | arg.size_z = astc::min(step_z, size_z - z); |
968 | arg.offset_z = z; |
969 | |
970 | arg.size_y = astc::min(step_xy, size_y - y); |
971 | arg.offset_y = y; |
972 | |
973 | for (int x = 0; x < size_x; x += step_xy) |
974 | { |
975 | arg.size_x = astc::min(step_xy, size_x - x); |
976 | arg.offset_x = x; |
977 | compute_pixel_region_variance(ctx.context, arg); |
978 | } |
979 | } |
980 | |
981 | ctx.manage_avg.complete_task_assignment(count); |
982 | } |
983 | |
984 | delete[] arg.work_memory; |
985 | } |
986 | |
987 | #endif |
988 | |
989 | /* See header for documentation. */ |
990 | astcenc_error astcenc_compress_image( |
991 | astcenc_context* ctxo, |
992 | astcenc_image* imagep, |
993 | const astcenc_swizzle* swizzle, |
994 | uint8_t* data_out, |
995 | size_t data_len, |
996 | unsigned int thread_index |
997 | ) { |
998 | #if defined(ASTCENC_DECOMPRESS_ONLY) |
999 | (void)ctxo; |
1000 | (void)imagep; |
1001 | (void)swizzle; |
1002 | (void)data_out; |
1003 | (void)data_len; |
1004 | (void)thread_index; |
1005 | return ASTCENC_ERR_BAD_CONTEXT; |
1006 | #else |
1007 | astcenc_contexti* ctx = &ctxo->context; |
1008 | astcenc_error status; |
1009 | astcenc_image& image = *imagep; |
1010 | |
1011 | if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY) |
1012 | { |
1013 | return ASTCENC_ERR_BAD_CONTEXT; |
1014 | } |
1015 | |
1016 | status = validate_compression_swizzle(*swizzle); |
1017 | if (status != ASTCENC_SUCCESS) |
1018 | { |
1019 | return status; |
1020 | } |
1021 | |
1022 | if (thread_index >= ctx->thread_count) |
1023 | { |
1024 | return ASTCENC_ERR_BAD_PARAM; |
1025 | } |
1026 | |
1027 | unsigned int block_x = ctx->config.block_x; |
1028 | unsigned int block_y = ctx->config.block_y; |
1029 | unsigned int block_z = ctx->config.block_z; |
1030 | |
1031 | unsigned int xblocks = (image.dim_x + block_x - 1) / block_x; |
1032 | unsigned int yblocks = (image.dim_y + block_y - 1) / block_y; |
1033 | unsigned int zblocks = (image.dim_z + block_z - 1) / block_z; |
1034 | |
1035 | // Check we have enough output space (16 bytes per block) |
1036 | size_t size_needed = xblocks * yblocks * zblocks * 16; |
1037 | if (data_len < size_needed) |
1038 | { |
1039 | return ASTCENC_ERR_OUT_OF_MEM; |
1040 | } |
1041 | |
1042 | // If context thread count is one then implicitly reset |
1043 | if (ctx->thread_count == 1) |
1044 | { |
1045 | astcenc_compress_reset(ctxo); |
1046 | } |
1047 | |
1048 | if (ctx->config.a_scale_radius != 0) |
1049 | { |
1050 | // First thread to enter will do setup, other threads will subsequently |
1051 | // enter the critical section but simply skip over the initialization |
1052 | auto init_avg = [ctx, &image, swizzle]() { |
1053 | // Perform memory allocations for the destination buffers |
1054 | size_t texel_count = image.dim_x * image.dim_y * image.dim_z; |
1055 | ctx->input_alpha_averages = new float[texel_count]; |
1056 | |
1057 | return init_compute_averages( |
1058 | image, ctx->config.a_scale_radius, *swizzle, |
1059 | ctx->avg_preprocess_args); |
1060 | }; |
1061 | |
1062 | // Only the first thread actually runs the initializer |
1063 | ctxo->manage_avg.init(init_avg); |
1064 | |
1065 | // All threads will enter this function and dynamically grab work |
1066 | compute_averages(*ctxo, ctx->avg_preprocess_args); |
1067 | } |
1068 | |
1069 | // Wait for compute_averages to complete before compressing |
1070 | ctxo->manage_avg.wait(); |
1071 | |
1072 | compress_image(*ctxo, thread_index, image, *swizzle, data_out); |
1073 | |
1074 | // Wait for compress to complete before freeing memory |
1075 | ctxo->manage_compress.wait(); |
1076 | |
1077 | auto term_compress = [ctx]() { |
1078 | delete[] ctx->input_alpha_averages; |
1079 | ctx->input_alpha_averages = nullptr; |
1080 | }; |
1081 | |
1082 | // Only the first thread to arrive actually runs the term |
1083 | ctxo->manage_compress.term(term_compress); |
1084 | |
1085 | return ASTCENC_SUCCESS; |
1086 | #endif |
1087 | } |
1088 | |
1089 | /* See header for documentation. */ |
1090 | astcenc_error astcenc_compress_reset( |
1091 | astcenc_context* ctxo |
1092 | ) { |
1093 | #if defined(ASTCENC_DECOMPRESS_ONLY) |
1094 | (void)ctxo; |
1095 | return ASTCENC_ERR_BAD_CONTEXT; |
1096 | #else |
1097 | astcenc_contexti* ctx = &ctxo->context; |
1098 | if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY) |
1099 | { |
1100 | return ASTCENC_ERR_BAD_CONTEXT; |
1101 | } |
1102 | |
1103 | ctxo->manage_avg.reset(); |
1104 | ctxo->manage_compress.reset(); |
1105 | return ASTCENC_SUCCESS; |
1106 | #endif |
1107 | } |
1108 | |
1109 | /* See header for documentation. */ |
1110 | astcenc_error astcenc_decompress_image( |
1111 | astcenc_context* ctxo, |
1112 | const uint8_t* data, |
1113 | size_t data_len, |
1114 | astcenc_image* image_outp, |
1115 | const astcenc_swizzle* swizzle, |
1116 | unsigned int thread_index |
1117 | ) { |
1118 | astcenc_error status; |
1119 | astcenc_image& image_out = *image_outp; |
1120 | astcenc_contexti* ctx = &ctxo->context; |
1121 | |
1122 | // Today this doesn't matter (working set on stack) but might in future ... |
1123 | if (thread_index >= ctx->thread_count) |
1124 | { |
1125 | return ASTCENC_ERR_BAD_PARAM; |
1126 | } |
1127 | |
1128 | status = validate_decompression_swizzle(*swizzle); |
1129 | if (status != ASTCENC_SUCCESS) |
1130 | { |
1131 | return status; |
1132 | } |
1133 | |
1134 | unsigned int block_x = ctx->config.block_x; |
1135 | unsigned int block_y = ctx->config.block_y; |
1136 | unsigned int block_z = ctx->config.block_z; |
1137 | |
1138 | unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x; |
1139 | unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y; |
1140 | unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z; |
1141 | |
1142 | int row_blocks = xblocks; |
1143 | int plane_blocks = xblocks * yblocks; |
1144 | |
1145 | // Check we have enough output space (16 bytes per block) |
1146 | size_t size_needed = xblocks * yblocks * zblocks * 16; |
1147 | if (data_len < size_needed) |
1148 | { |
1149 | return ASTCENC_ERR_OUT_OF_MEM; |
1150 | } |
1151 | |
1152 | image_block blk; |
1153 | blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z); |
1154 | |
1155 | // If context thread count is one then implicitly reset |
1156 | if (ctx->thread_count == 1) |
1157 | { |
1158 | astcenc_decompress_reset(ctxo); |
1159 | } |
1160 | |
1161 | // Only the first thread actually runs the initializer |
1162 | ctxo->manage_decompress.init(zblocks * yblocks * xblocks); |
1163 | |
1164 | // All threads run this processing loop until there is no work remaining |
1165 | while (true) |
1166 | { |
1167 | unsigned int count; |
1168 | unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count); |
1169 | if (!count) |
1170 | { |
1171 | break; |
1172 | } |
1173 | |
1174 | for (unsigned int i = base; i < base + count; i++) |
1175 | { |
1176 | // Decode i into x, y, z block indices |
1177 | int z = i / plane_blocks; |
1178 | unsigned int rem = i - (z * plane_blocks); |
1179 | int y = rem / row_blocks; |
1180 | int x = rem - (y * row_blocks); |
1181 | |
1182 | unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16; |
1183 | const uint8_t* bp = data + offset; |
1184 | |
1185 | const physical_compressed_block& pcb = *reinterpret_cast<const physical_compressed_block*>(bp); |
1186 | symbolic_compressed_block scb; |
1187 | |
1188 | physical_to_symbolic(*ctx->bsd, pcb, scb); |
1189 | |
1190 | decompress_symbolic_block(ctx->config.profile, *ctx->bsd, |
1191 | x * block_x, y * block_y, z * block_z, |
1192 | scb, blk); |
1193 | |
1194 | store_image_block(image_out, blk, *ctx->bsd, |
1195 | x * block_x, y * block_y, z * block_z, *swizzle); |
1196 | } |
1197 | |
1198 | ctxo->manage_decompress.complete_task_assignment(count); |
1199 | } |
1200 | |
1201 | return ASTCENC_SUCCESS; |
1202 | } |
1203 | |
1204 | /* See header for documentation. */ |
1205 | astcenc_error astcenc_decompress_reset( |
1206 | astcenc_context* ctxo |
1207 | ) { |
1208 | ctxo->manage_decompress.reset(); |
1209 | return ASTCENC_SUCCESS; |
1210 | } |
1211 | |
1212 | /* See header for documentation. */ |
1213 | astcenc_error astcenc_get_block_info( |
1214 | astcenc_context* ctxo, |
1215 | const uint8_t data[16], |
1216 | astcenc_block_info* info |
1217 | ) { |
1218 | #if defined(ASTCENC_DECOMPRESS_ONLY) |
1219 | (void)ctxo; |
1220 | (void)data; |
1221 | (void)info; |
1222 | return ASTCENC_ERR_BAD_CONTEXT; |
1223 | #else |
1224 | astcenc_contexti* ctx = &ctxo->context; |
1225 | |
1226 | // Decode the compressed data into a symbolic form |
1227 | const physical_compressed_block&pcb = *reinterpret_cast<const physical_compressed_block*>(data); |
1228 | symbolic_compressed_block scb; |
1229 | physical_to_symbolic(*ctx->bsd, pcb, scb); |
1230 | |
1231 | // Fetch the appropriate partition and decimation tables |
1232 | block_size_descriptor& bsd = *ctx->bsd; |
1233 | |
1234 | // Start from a clean slate |
1235 | memset(info, 0, sizeof(*info)); |
1236 | |
1237 | // Basic info we can always populate |
1238 | info->profile = ctx->config.profile; |
1239 | |
1240 | info->block_x = ctx->config.block_x; |
1241 | info->block_y = ctx->config.block_y; |
1242 | info->block_z = ctx->config.block_z; |
1243 | info->texel_count = bsd.texel_count; |
1244 | |
1245 | // Check for error blocks first |
1246 | info->is_error_block = scb.block_type == SYM_BTYPE_ERROR; |
1247 | if (info->is_error_block) |
1248 | { |
1249 | return ASTCENC_SUCCESS; |
1250 | } |
1251 | |
1252 | // Check for constant color blocks second |
1253 | info->is_constant_block = scb.block_type == SYM_BTYPE_CONST_F16 || |
1254 | scb.block_type == SYM_BTYPE_CONST_U16; |
1255 | if (info->is_constant_block) |
1256 | { |
1257 | return ASTCENC_SUCCESS; |
1258 | } |
1259 | |
1260 | // Otherwise handle a full block ; known to be valid after conditions above have been checked |
1261 | int partition_count = scb.partition_count; |
1262 | const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); |
1263 | |
1264 | const block_mode& bm = bsd.get_block_mode(scb.block_mode); |
1265 | const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); |
1266 | |
1267 | info->weight_x = di.weight_x; |
1268 | info->weight_y = di.weight_y; |
1269 | info->weight_z = di.weight_z; |
1270 | |
1271 | info->is_dual_plane_block = bm.is_dual_plane != 0; |
1272 | |
1273 | info->partition_count = scb.partition_count; |
1274 | info->partition_index = scb.partition_index; |
1275 | info->dual_plane_component = scb.plane2_component; |
1276 | |
1277 | info->color_level_count = get_quant_level(scb.get_color_quant_mode()); |
1278 | info->weight_level_count = get_quant_level(bm.get_weight_quant_mode()); |
1279 | |
1280 | // Unpack color endpoints for each active partition |
1281 | for (unsigned int i = 0; i < scb.partition_count; i++) |
1282 | { |
1283 | bool rgb_hdr; |
1284 | bool a_hdr; |
1285 | vint4 endpnt[2]; |
1286 | |
1287 | unpack_color_endpoints(ctx->config.profile, |
1288 | scb.color_formats[i], |
1289 | scb.color_values[i], |
1290 | rgb_hdr, a_hdr, |
1291 | endpnt[0], endpnt[1]); |
1292 | |
1293 | // Store the color endpoint mode info |
1294 | info->color_endpoint_modes[i] = scb.color_formats[i]; |
1295 | info->is_hdr_block = info->is_hdr_block || rgb_hdr || a_hdr; |
1296 | |
1297 | // Store the unpacked and decoded color endpoint |
1298 | vmask4 hdr_mask(rgb_hdr, rgb_hdr, rgb_hdr, a_hdr); |
1299 | for (int j = 0; j < 2; j++) |
1300 | { |
1301 | vint4 color_lns = lns_to_sf16(endpnt[j]); |
1302 | vint4 color_unorm = unorm16_to_sf16(endpnt[j]); |
1303 | vint4 datai = select(color_unorm, color_lns, hdr_mask); |
1304 | store(float16_to_float(datai), info->color_endpoints[i][j]); |
1305 | } |
1306 | } |
1307 | |
1308 | // Unpack weights for each texel |
1309 | int weight_plane1[BLOCK_MAX_TEXELS]; |
1310 | int weight_plane2[BLOCK_MAX_TEXELS]; |
1311 | |
1312 | unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2); |
1313 | for (unsigned int i = 0; i < bsd.texel_count; i++) |
1314 | { |
1315 | info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM); |
1316 | if (info->is_dual_plane_block) |
1317 | { |
1318 | info->weight_values_plane2[i] = static_cast<float>(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM); |
1319 | } |
1320 | } |
1321 | |
1322 | // Unpack partition assignments for each texel |
1323 | for (unsigned int i = 0; i < bsd.texel_count; i++) |
1324 | { |
1325 | info->partition_assignment[i] = pi.partition_of_texel[i]; |
1326 | } |
1327 | |
1328 | return ASTCENC_SUCCESS; |
1329 | #endif |
1330 | } |
1331 | |
1332 | /* See header for documentation. */ |
1333 | const char* astcenc_get_error_string( |
1334 | astcenc_error status |
1335 | ) { |
1336 | // Values in this enum are from an external user, so not guaranteed to be |
1337 | // bounded to the enum values |
1338 | switch (static_cast<int>(status)) |
1339 | { |
1340 | case ASTCENC_SUCCESS: |
1341 | return "ASTCENC_SUCCESS" ; |
1342 | case ASTCENC_ERR_OUT_OF_MEM: |
1343 | return "ASTCENC_ERR_OUT_OF_MEM" ; |
1344 | case ASTCENC_ERR_BAD_CPU_FLOAT: |
1345 | return "ASTCENC_ERR_BAD_CPU_FLOAT" ; |
1346 | case ASTCENC_ERR_BAD_PARAM: |
1347 | return "ASTCENC_ERR_BAD_PARAM" ; |
1348 | case ASTCENC_ERR_BAD_BLOCK_SIZE: |
1349 | return "ASTCENC_ERR_BAD_BLOCK_SIZE" ; |
1350 | case ASTCENC_ERR_BAD_PROFILE: |
1351 | return "ASTCENC_ERR_BAD_PROFILE" ; |
1352 | case ASTCENC_ERR_BAD_QUALITY: |
1353 | return "ASTCENC_ERR_BAD_QUALITY" ; |
1354 | case ASTCENC_ERR_BAD_FLAGS: |
1355 | return "ASTCENC_ERR_BAD_FLAGS" ; |
1356 | case ASTCENC_ERR_BAD_SWIZZLE: |
1357 | return "ASTCENC_ERR_BAD_SWIZZLE" ; |
1358 | case ASTCENC_ERR_BAD_CONTEXT: |
1359 | return "ASTCENC_ERR_BAD_CONTEXT" ; |
1360 | case ASTCENC_ERR_NOT_IMPLEMENTED: |
1361 | return "ASTCENC_ERR_NOT_IMPLEMENTED" ; |
1362 | #if defined(ASTCENC_DIAGNOSTICS) |
1363 | case ASTCENC_ERR_DTRACE_FAILURE: |
1364 | return "ASTCENC_ERR_DTRACE_FAILURE" ; |
1365 | #endif |
1366 | default: |
1367 | return nullptr; |
1368 | } |
1369 | } |
1370 | |