| 1 | // SPDX-License-Identifier: Apache-2.0 |
| 2 | // ---------------------------------------------------------------------------- |
| 3 | // Copyright 2011-2023 Arm Limited |
| 4 | // |
| 5 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not |
| 6 | // use this file except in compliance with the License. You may obtain a copy |
| 7 | // of the License at: |
| 8 | // |
| 9 | // http://www.apache.org/licenses/LICENSE-2.0 |
| 10 | // |
| 11 | // Unless required by applicable law or agreed to in writing, software |
| 12 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT |
| 13 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the |
| 14 | // License for the specific language governing permissions and limitations |
| 15 | // under the License. |
| 16 | // ---------------------------------------------------------------------------- |
| 17 | |
| 18 | /** |
| 19 | * @brief Functions for the library entrypoint. |
| 20 | */ |
| 21 | |
| 22 | #include <array> |
| 23 | #include <cstring> |
| 24 | #include <new> |
| 25 | |
| 26 | #include "astcenc.h" |
| 27 | #include "astcenc_internal_entry.h" |
| 28 | #include "astcenc_diagnostic_trace.h" |
| 29 | |
/**
 * @brief Record of the quality tuning parameter values.
 *
 * See the @c astcenc_config structure for detailed parameter documentation.
 *
 * Note that the mse_overshoot entries are scaling factors relative to the base MSE to hit db_limit.
 * A 20% overshoot is harder to hit for a higher base db_limit, so we may actually use lower ratios
 * for the more thorough search presets because the underlying db_limit is so much higher.
 *
 * The field order is significant: the preset tables in this file populate this structure using
 * positional aggregate initialization.
 */
struct astcenc_preset_config
{
    // Quality level of this preset; astcenc_config_init() compares the requested quality against
    // this value to select a preset, or a pair of presets to interpolate between
    float quality;

    // Values for the same-named astcenc_config fields
    unsigned int tune_partition_count_limit;
    unsigned int tune_2partition_index_limit;
    unsigned int tune_3partition_index_limit;
    unsigned int tune_4partition_index_limit;
    unsigned int tune_block_mode_limit;
    unsigned int tune_refinement_limit;

    // Values for the same-named astcenc_config fields; astcenc_config_init() limits these to the
    // build-time TUNE_MAX_* constants when it applies them
    unsigned int tune_candidate_limit;
    unsigned int tune_2partitioning_candidate_limit;
    unsigned int tune_3partitioning_candidate_limit;
    unsigned int tune_4partitioning_candidate_limit;

    // Bases used by astcenc_config_init() to derive astcenc_config::tune_db_limit as
    // max(a_base - 35 * log10(texels), b_base - 19 * log10(texels))
    float tune_db_limit_a_base;
    float tune_db_limit_b_base;

    // Values for the same-named astcenc_config fields
    float tune_mse_overshoot;
    float tune_2partition_early_out_limit_factor;
    float tune_3partition_early_out_limit_factor;
    float tune_2plane_early_out_limit_correlation;
};
| 59 | |
/**
 * @brief The static presets for high bandwidth encodings (x < 25 texels per block).
 *
 * Each entry starts with the preset quality level, followed by the remaining
 * @c astcenc_preset_config fields in declaration order:
 *
 *     partition_count_limit, 2/3/4partition_index_limit, block_mode_limit, refinement_limit,
 *     candidate_limit, 2/3/4partitioning_candidate_limit, db_limit_a_base, db_limit_b_base,
 *     mse_overshoot, 2/3partition_early_out_limit_factor, 2plane_early_out_limit_correlation
 *
 * astcenc_config_init() scans the entries in order and stops at the first one whose quality is
 * greater than or equal to the requested quality, so the order of the entries matters.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_high {{
    {
        ASTCENC_PRE_FASTEST,
        2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f
    }, {
        ASTCENC_PRE_FAST,
        3, 18, 10, 8, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.90f
    }, {
        ASTCENC_PRE_MEDIUM,
        4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 2.5f, 1.1f, 1.05f, 0.95f
    }, {
        ASTCENC_PRE_THOROUGH,
        4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.35f, 1.15f, 0.97f
    }, {
        ASTCENC_PRE_VERYTHOROUGH,
        4, 256, 128, 64, 98, 4, 6, 20, 14, 8, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
    }, {
        ASTCENC_PRE_EXHAUSTIVE,
        4, 512, 512, 512, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
    }
}};
| 84 | |
/**
 * @brief The static presets for medium bandwidth encodings (25 <= x < 64 texels per block).
 *
 * Each entry starts with the preset quality level, followed by the remaining
 * @c astcenc_preset_config fields in declaration order:
 *
 *     partition_count_limit, 2/3/4partition_index_limit, block_mode_limit, refinement_limit,
 *     candidate_limit, 2/3/4partitioning_candidate_limit, db_limit_a_base, db_limit_b_base,
 *     mse_overshoot, 2/3partition_early_out_limit_factor, 2plane_early_out_limit_correlation
 *
 * astcenc_config_init() scans the entries in order and stops at the first one whose quality is
 * greater than or equal to the requested quality, so the order of the entries matters.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_mid {{
    {
        ASTCENC_PRE_FASTEST,
        2, 10, 6, 4, 43, 2, 2, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.80f
    }, {
        ASTCENC_PRE_FAST,
        3, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.2f, 63.2f, 3.5f, 1.0f, 1.0f, 0.85f
    }, {
        ASTCENC_PRE_MEDIUM,
        4, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.0f, 1.1f, 1.05f, 0.90f
    }, {
        ASTCENC_PRE_THOROUGH,
        4, 82, 60, 30, 94, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.4f, 1.2f, 0.95f
    }, {
        ASTCENC_PRE_VERYTHOROUGH,
        4, 256, 128, 64, 98, 4, 6, 12, 8, 3, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
    }, {
        ASTCENC_PRE_EXHAUSTIVE,
        4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
    }
}};
| 109 | |
/**
 * @brief The static presets for low bandwidth encodings (64 <= x texels per block).
 *
 * Each entry starts with the preset quality level, followed by the remaining
 * @c astcenc_preset_config fields in declaration order:
 *
 *     partition_count_limit, 2/3/4partition_index_limit, block_mode_limit, refinement_limit,
 *     candidate_limit, 2/3/4partitioning_candidate_limit, db_limit_a_base, db_limit_b_base,
 *     mse_overshoot, 2/3partition_early_out_limit_factor, 2plane_early_out_limit_correlation
 *
 * astcenc_config_init() scans the entries in order and stops at the first one whose quality is
 * greater than or equal to the requested quality, so the order of the entries matters.
 */
static const std::array<astcenc_preset_config, 6> preset_configs_low {{
    {
        ASTCENC_PRE_FASTEST,
        2, 10, 6, 4, 40, 2, 2, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.80f
    }, {
        ASTCENC_PRE_FAST,
        2, 18, 12, 10, 55, 3, 3, 2, 2, 2, 85.0f, 63.0f, 3.5f, 1.0f, 1.0f, 0.85f
    }, {
        ASTCENC_PRE_MEDIUM,
        3, 34, 28, 16, 77, 3, 3, 2, 2, 2, 95.0f, 70.0f, 3.5f, 1.1f, 1.05f, 0.90f
    }, {
        ASTCENC_PRE_THOROUGH,
        4, 82, 60, 30, 93, 4, 4, 3, 2, 2, 105.0f, 77.0f, 10.0f, 1.3f, 1.2f, 0.97f
    }, {
        ASTCENC_PRE_VERYTHOROUGH,
        4, 256, 128, 64, 98, 4, 6, 9, 5, 2, 200.0f, 200.0f, 10.0f, 1.6f, 1.4f, 0.98f
    }, {
        ASTCENC_PRE_EXHAUSTIVE,
        4, 256, 256, 256, 100, 4, 8, 32, 32, 32, 200.0f, 200.0f, 10.0f, 2.0f, 2.0f, 0.99f
    }
}};
| 134 | |
| 135 | /** |
| 136 | * @brief Validate CPU floating point meets assumptions made in the codec. |
| 137 | * |
| 138 | * The codec is written with the assumption that a float threaded through the @c if32 union will be |
| 139 | * stored and reloaded as a 32-bit IEEE-754 float with round-to-nearest rounding. This is always the |
| 140 | * case in an IEEE-754 compliant system, however not every system or compilation mode is actually |
| 141 | * IEEE-754 compliant. This normally fails if the code is compiled with fast math enabled. |
| 142 | * |
| 143 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
| 144 | */ |
| 145 | static astcenc_error validate_cpu_float() |
| 146 | { |
| 147 | if32 p; |
| 148 | volatile float xprec_testval = 2.51f; |
| 149 | p.f = xprec_testval + 12582912.0f; |
| 150 | float q = p.f - 12582912.0f; |
| 151 | |
| 152 | if (q != 3.0f) |
| 153 | { |
| 154 | return ASTCENC_ERR_BAD_CPU_FLOAT; |
| 155 | } |
| 156 | |
| 157 | return ASTCENC_SUCCESS; |
| 158 | } |
| 159 | |
| 160 | /** |
| 161 | * @brief Validate config profile. |
| 162 | * |
| 163 | * @param profile The profile to check. |
| 164 | * |
| 165 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
| 166 | */ |
| 167 | static astcenc_error validate_profile( |
| 168 | astcenc_profile profile |
| 169 | ) { |
| 170 | // Values in this enum are from an external user, so not guaranteed to be |
| 171 | // bounded to the enum values |
| 172 | switch (static_cast<int>(profile)) |
| 173 | { |
| 174 | case ASTCENC_PRF_LDR_SRGB: |
| 175 | case ASTCENC_PRF_LDR: |
| 176 | case ASTCENC_PRF_HDR_RGB_LDR_A: |
| 177 | case ASTCENC_PRF_HDR: |
| 178 | return ASTCENC_SUCCESS; |
| 179 | default: |
| 180 | return ASTCENC_ERR_BAD_PROFILE; |
| 181 | } |
| 182 | } |
| 183 | |
| 184 | /** |
| 185 | * @brief Validate block size. |
| 186 | * |
| 187 | * @param block_x The block x dimensions. |
| 188 | * @param block_y The block y dimensions. |
| 189 | * @param block_z The block z dimensions. |
| 190 | * |
| 191 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
| 192 | */ |
| 193 | static astcenc_error validate_block_size( |
| 194 | unsigned int block_x, |
| 195 | unsigned int block_y, |
| 196 | unsigned int block_z |
| 197 | ) { |
| 198 | // Test if this is a legal block size at all |
| 199 | bool is_legal = (((block_z <= 1) && is_legal_2d_block_size(block_x, block_y)) || |
| 200 | ((block_z >= 2) && is_legal_3d_block_size(block_x, block_y, block_z))); |
| 201 | if (!is_legal) |
| 202 | { |
| 203 | return ASTCENC_ERR_BAD_BLOCK_SIZE; |
| 204 | } |
| 205 | |
| 206 | // Test if this build has sufficient capacity for this block size |
| 207 | bool have_capacity = (block_x * block_y * block_z) <= BLOCK_MAX_TEXELS; |
| 208 | if (!have_capacity) |
| 209 | { |
| 210 | return ASTCENC_ERR_NOT_IMPLEMENTED; |
| 211 | } |
| 212 | |
| 213 | return ASTCENC_SUCCESS; |
| 214 | } |
| 215 | |
| 216 | /** |
| 217 | * @brief Validate flags. |
| 218 | * |
| 219 | * @param flags The flags to check. |
| 220 | * |
| 221 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
| 222 | */ |
| 223 | static astcenc_error validate_flags( |
| 224 | unsigned int flags |
| 225 | ) { |
| 226 | // Flags field must not contain any unknown flag bits |
| 227 | unsigned int exMask = ~ASTCENC_ALL_FLAGS; |
| 228 | if (popcount(flags & exMask) != 0) |
| 229 | { |
| 230 | return ASTCENC_ERR_BAD_FLAGS; |
| 231 | } |
| 232 | |
| 233 | // Flags field must only contain at most a single map type |
| 234 | exMask = ASTCENC_FLG_MAP_NORMAL |
| 235 | | ASTCENC_FLG_MAP_RGBM; |
| 236 | if (popcount(flags & exMask) > 1) |
| 237 | { |
| 238 | return ASTCENC_ERR_BAD_FLAGS; |
| 239 | } |
| 240 | |
| 241 | return ASTCENC_SUCCESS; |
| 242 | } |
| 243 | |
| 244 | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
| 245 | |
| 246 | /** |
| 247 | * @brief Validate single channel compression swizzle. |
| 248 | * |
| 249 | * @param swizzle The swizzle to check. |
| 250 | * |
| 251 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
| 252 | */ |
| 253 | static astcenc_error validate_compression_swz( |
| 254 | astcenc_swz swizzle |
| 255 | ) { |
| 256 | // Not all enum values are handled; SWZ_Z is invalid for compression |
| 257 | switch (static_cast<int>(swizzle)) |
| 258 | { |
| 259 | case ASTCENC_SWZ_R: |
| 260 | case ASTCENC_SWZ_G: |
| 261 | case ASTCENC_SWZ_B: |
| 262 | case ASTCENC_SWZ_A: |
| 263 | case ASTCENC_SWZ_0: |
| 264 | case ASTCENC_SWZ_1: |
| 265 | return ASTCENC_SUCCESS; |
| 266 | default: |
| 267 | return ASTCENC_ERR_BAD_SWIZZLE; |
| 268 | } |
| 269 | } |
| 270 | |
| 271 | /** |
| 272 | * @brief Validate overall compression swizzle. |
| 273 | * |
| 274 | * @param swizzle The swizzle to check. |
| 275 | * |
| 276 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
| 277 | */ |
| 278 | static astcenc_error validate_compression_swizzle( |
| 279 | const astcenc_swizzle& swizzle |
| 280 | ) { |
| 281 | if (validate_compression_swz(swizzle.r) || |
| 282 | validate_compression_swz(swizzle.g) || |
| 283 | validate_compression_swz(swizzle.b) || |
| 284 | validate_compression_swz(swizzle.a)) |
| 285 | { |
| 286 | return ASTCENC_ERR_BAD_SWIZZLE; |
| 287 | } |
| 288 | |
| 289 | return ASTCENC_SUCCESS; |
| 290 | } |
| 291 | #endif |
| 292 | |
| 293 | /** |
| 294 | * @brief Validate single channel decompression swizzle. |
| 295 | * |
| 296 | * @param swizzle The swizzle to check. |
| 297 | * |
| 298 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
| 299 | */ |
| 300 | static astcenc_error validate_decompression_swz( |
| 301 | astcenc_swz swizzle |
| 302 | ) { |
| 303 | // Values in this enum are from an external user, so not guaranteed to be |
| 304 | // bounded to the enum values |
| 305 | switch (static_cast<int>(swizzle)) |
| 306 | { |
| 307 | case ASTCENC_SWZ_R: |
| 308 | case ASTCENC_SWZ_G: |
| 309 | case ASTCENC_SWZ_B: |
| 310 | case ASTCENC_SWZ_A: |
| 311 | case ASTCENC_SWZ_0: |
| 312 | case ASTCENC_SWZ_1: |
| 313 | case ASTCENC_SWZ_Z: |
| 314 | return ASTCENC_SUCCESS; |
| 315 | default: |
| 316 | return ASTCENC_ERR_BAD_SWIZZLE; |
| 317 | } |
| 318 | } |
| 319 | |
| 320 | /** |
| 321 | * @brief Validate overall decompression swizzle. |
| 322 | * |
| 323 | * @param swizzle The swizzle to check. |
| 324 | * |
| 325 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
| 326 | */ |
| 327 | static astcenc_error validate_decompression_swizzle( |
| 328 | const astcenc_swizzle& swizzle |
| 329 | ) { |
| 330 | if (validate_decompression_swz(swizzle.r) || |
| 331 | validate_decompression_swz(swizzle.g) || |
| 332 | validate_decompression_swz(swizzle.b) || |
| 333 | validate_decompression_swz(swizzle.a)) |
| 334 | { |
| 335 | return ASTCENC_ERR_BAD_SWIZZLE; |
| 336 | } |
| 337 | |
| 338 | return ASTCENC_SUCCESS; |
| 339 | } |
| 340 | |
| 341 | /** |
| 342 | * Validate that an incoming configuration is in-spec. |
| 343 | * |
| 344 | * This function can respond in two ways: |
| 345 | * |
| 346 | * * Numerical inputs that have valid ranges are clamped to those valid ranges. No error is thrown |
| 347 | * for out-of-range inputs in this case. |
| 348 | * * Numerical inputs and logic inputs are are logically invalid and which make no sense |
| 349 | * algorithmically will return an error. |
| 350 | * |
| 351 | * @param[in,out] config The input compressor configuration. |
| 352 | * |
| 353 | * @return Return @c ASTCENC_SUCCESS if validated, otherwise an error on failure. |
| 354 | */ |
| 355 | static astcenc_error validate_config( |
| 356 | astcenc_config &config |
| 357 | ) { |
| 358 | astcenc_error status; |
| 359 | |
| 360 | status = validate_profile(config.profile); |
| 361 | if (status != ASTCENC_SUCCESS) |
| 362 | { |
| 363 | return status; |
| 364 | } |
| 365 | |
| 366 | status = validate_flags(config.flags); |
| 367 | if (status != ASTCENC_SUCCESS) |
| 368 | { |
| 369 | return status; |
| 370 | } |
| 371 | |
| 372 | status = validate_block_size(config.block_x, config.block_y, config.block_z); |
| 373 | if (status != ASTCENC_SUCCESS) |
| 374 | { |
| 375 | return status; |
| 376 | } |
| 377 | |
| 378 | #if defined(ASTCENC_DECOMPRESS_ONLY) |
| 379 | // Decompress-only builds only support decompress-only contexts |
| 380 | if (!(config.flags & ASTCENC_FLG_DECOMPRESS_ONLY)) |
| 381 | { |
| 382 | return ASTCENC_ERR_BAD_PARAM; |
| 383 | } |
| 384 | #endif |
| 385 | |
| 386 | config.rgbm_m_scale = astc::max(config.rgbm_m_scale, 1.0f); |
| 387 | |
| 388 | config.tune_partition_count_limit = astc::clamp(config.tune_partition_count_limit, 1u, 4u); |
| 389 | config.tune_2partition_index_limit = astc::clamp(config.tune_2partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); |
| 390 | config.tune_3partition_index_limit = astc::clamp(config.tune_3partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); |
| 391 | config.tune_4partition_index_limit = astc::clamp(config.tune_4partition_index_limit, 1u, BLOCK_MAX_PARTITIONINGS); |
| 392 | config.tune_block_mode_limit = astc::clamp(config.tune_block_mode_limit, 1u, 100u); |
| 393 | config.tune_refinement_limit = astc::max(config.tune_refinement_limit, 1u); |
| 394 | config.tune_candidate_limit = astc::clamp(config.tune_candidate_limit, 1u, TUNE_MAX_TRIAL_CANDIDATES); |
| 395 | config.tune_2partitioning_candidate_limit = astc::clamp(config.tune_2partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES); |
| 396 | config.tune_3partitioning_candidate_limit = astc::clamp(config.tune_3partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES); |
| 397 | config.tune_4partitioning_candidate_limit = astc::clamp(config.tune_4partitioning_candidate_limit, 1u, TUNE_MAX_PARTITIONING_CANDIDATES); |
| 398 | config.tune_db_limit = astc::max(config.tune_db_limit, 0.0f); |
| 399 | config.tune_mse_overshoot = astc::max(config.tune_mse_overshoot, 1.0f); |
| 400 | config.tune_2partition_early_out_limit_factor = astc::max(config.tune_2partition_early_out_limit_factor, 0.0f); |
| 401 | config.tune_3partition_early_out_limit_factor = astc::max(config.tune_3partition_early_out_limit_factor, 0.0f); |
| 402 | config.tune_2plane_early_out_limit_correlation = astc::max(config.tune_2plane_early_out_limit_correlation, 0.0f); |
| 403 | |
| 404 | // Specifying a zero weight color component is not allowed; force to small value |
| 405 | float max_weight = astc::max(astc::max(config.cw_r_weight, config.cw_g_weight), |
| 406 | astc::max(config.cw_b_weight, config.cw_a_weight)); |
| 407 | if (max_weight > 0.0f) |
| 408 | { |
| 409 | max_weight /= 1000.0f; |
| 410 | config.cw_r_weight = astc::max(config.cw_r_weight, max_weight); |
| 411 | config.cw_g_weight = astc::max(config.cw_g_weight, max_weight); |
| 412 | config.cw_b_weight = astc::max(config.cw_b_weight, max_weight); |
| 413 | config.cw_a_weight = astc::max(config.cw_a_weight, max_weight); |
| 414 | } |
| 415 | // If all color components error weights are zero then return an error |
| 416 | else |
| 417 | { |
| 418 | return ASTCENC_ERR_BAD_PARAM; |
| 419 | } |
| 420 | |
| 421 | return ASTCENC_SUCCESS; |
| 422 | } |
| 423 | |
| 424 | /* See header for documentation. */ |
| 425 | astcenc_error astcenc_config_init( |
| 426 | astcenc_profile profile, |
| 427 | unsigned int block_x, |
| 428 | unsigned int block_y, |
| 429 | unsigned int block_z, |
| 430 | float quality, |
| 431 | unsigned int flags, |
| 432 | astcenc_config* configp |
| 433 | ) { |
| 434 | astcenc_error status; |
| 435 | |
| 436 | status = validate_cpu_float(); |
| 437 | if (status != ASTCENC_SUCCESS) |
| 438 | { |
| 439 | return status; |
| 440 | } |
| 441 | |
| 442 | // Zero init all config fields; although most of will be over written |
| 443 | astcenc_config& config = *configp; |
| 444 | std::memset(&config, 0, sizeof(config)); |
| 445 | |
| 446 | // Process the block size |
| 447 | block_z = astc::max(block_z, 1u); // For 2D blocks Z==0 is accepted, but convert to 1 |
| 448 | status = validate_block_size(block_x, block_y, block_z); |
| 449 | if (status != ASTCENC_SUCCESS) |
| 450 | { |
| 451 | return status; |
| 452 | } |
| 453 | |
| 454 | config.block_x = block_x; |
| 455 | config.block_y = block_y; |
| 456 | config.block_z = block_z; |
| 457 | |
| 458 | float texels = static_cast<float>(block_x * block_y * block_z); |
| 459 | float ltexels = logf(texels) / logf(10.0f); |
| 460 | |
| 461 | // Process the performance quality level or preset; note that this must be done before we |
| 462 | // process any additional settings, such as color profile and flags, which may replace some of |
| 463 | // these settings with more use case tuned values |
| 464 | if (quality < ASTCENC_PRE_FASTEST || |
| 465 | quality > ASTCENC_PRE_EXHAUSTIVE) |
| 466 | { |
| 467 | return ASTCENC_ERR_BAD_QUALITY; |
| 468 | } |
| 469 | |
| 470 | static const std::array<astcenc_preset_config, 6>* preset_configs; |
| 471 | int texels_int = block_x * block_y * block_z; |
| 472 | if (texels_int < 25) |
| 473 | { |
| 474 | preset_configs = &preset_configs_high; |
| 475 | } |
| 476 | else if (texels_int < 64) |
| 477 | { |
| 478 | preset_configs = &preset_configs_mid; |
| 479 | } |
| 480 | else |
| 481 | { |
| 482 | preset_configs = &preset_configs_low; |
| 483 | } |
| 484 | |
| 485 | // Determine which preset to use, or which pair to interpolate |
| 486 | size_t start; |
| 487 | size_t end; |
| 488 | for (end = 0; end < preset_configs->size(); end++) |
| 489 | { |
| 490 | if ((*preset_configs)[end].quality >= quality) |
| 491 | { |
| 492 | break; |
| 493 | } |
| 494 | } |
| 495 | |
| 496 | start = end == 0 ? 0 : end - 1; |
| 497 | |
| 498 | // Start and end node are the same - so just transfer the values. |
| 499 | if (start == end) |
| 500 | { |
| 501 | config.tune_partition_count_limit = (*preset_configs)[start].tune_partition_count_limit; |
| 502 | config.tune_2partition_index_limit = (*preset_configs)[start].tune_2partition_index_limit; |
| 503 | config.tune_3partition_index_limit = (*preset_configs)[start].tune_3partition_index_limit; |
| 504 | config.tune_4partition_index_limit = (*preset_configs)[start].tune_4partition_index_limit; |
| 505 | config.tune_block_mode_limit = (*preset_configs)[start].tune_block_mode_limit; |
| 506 | config.tune_refinement_limit = (*preset_configs)[start].tune_refinement_limit; |
| 507 | config.tune_candidate_limit = astc::min((*preset_configs)[start].tune_candidate_limit, TUNE_MAX_TRIAL_CANDIDATES); |
| 508 | config.tune_2partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_2partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES); |
| 509 | config.tune_3partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_3partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES); |
| 510 | config.tune_4partitioning_candidate_limit = astc::min((*preset_configs)[start].tune_4partitioning_candidate_limit, TUNE_MAX_PARTITIONING_CANDIDATES); |
| 511 | config.tune_db_limit = astc::max((*preset_configs)[start].tune_db_limit_a_base - 35 * ltexels, |
| 512 | (*preset_configs)[start].tune_db_limit_b_base - 19 * ltexels); |
| 513 | |
| 514 | config.tune_mse_overshoot = (*preset_configs)[start].tune_mse_overshoot; |
| 515 | |
| 516 | config.tune_2partition_early_out_limit_factor = (*preset_configs)[start].tune_2partition_early_out_limit_factor; |
| 517 | config.tune_3partition_early_out_limit_factor = (*preset_configs)[start].tune_3partition_early_out_limit_factor; |
| 518 | config.tune_2plane_early_out_limit_correlation = (*preset_configs)[start].tune_2plane_early_out_limit_correlation; |
| 519 | } |
| 520 | // Start and end node are not the same - so interpolate between them |
| 521 | else |
| 522 | { |
| 523 | auto& node_a = (*preset_configs)[start]; |
| 524 | auto& node_b = (*preset_configs)[end]; |
| 525 | |
| 526 | float wt_range = node_b.quality - node_a.quality; |
| 527 | assert(wt_range > 0); |
| 528 | |
| 529 | // Compute interpolation factors |
| 530 | float wt_node_a = (node_b.quality - quality) / wt_range; |
| 531 | float wt_node_b = (quality - node_a.quality) / wt_range; |
| 532 | |
| 533 | #define LERP(param) ((node_a.param * wt_node_a) + (node_b.param * wt_node_b)) |
| 534 | #define LERPI(param) astc::flt2int_rtn(\ |
| 535 | (static_cast<float>(node_a.param) * wt_node_a) + \ |
| 536 | (static_cast<float>(node_b.param) * wt_node_b)) |
| 537 | #define LERPUI(param) static_cast<unsigned int>(LERPI(param)) |
| 538 | |
| 539 | config.tune_partition_count_limit = LERPI(tune_partition_count_limit); |
| 540 | config.tune_2partition_index_limit = LERPI(tune_2partition_index_limit); |
| 541 | config.tune_3partition_index_limit = LERPI(tune_3partition_index_limit); |
| 542 | config.tune_4partition_index_limit = LERPI(tune_4partition_index_limit); |
| 543 | config.tune_block_mode_limit = LERPI(tune_block_mode_limit); |
| 544 | config.tune_refinement_limit = LERPI(tune_refinement_limit); |
| 545 | config.tune_candidate_limit = astc::min(LERPUI(tune_candidate_limit), |
| 546 | TUNE_MAX_TRIAL_CANDIDATES); |
| 547 | config.tune_2partitioning_candidate_limit = astc::min(LERPUI(tune_2partitioning_candidate_limit), |
| 548 | BLOCK_MAX_PARTITIONINGS); |
| 549 | config.tune_3partitioning_candidate_limit = astc::min(LERPUI(tune_3partitioning_candidate_limit), |
| 550 | BLOCK_MAX_PARTITIONINGS); |
| 551 | config.tune_4partitioning_candidate_limit = astc::min(LERPUI(tune_4partitioning_candidate_limit), |
| 552 | BLOCK_MAX_PARTITIONINGS); |
| 553 | config.tune_db_limit = astc::max(LERP(tune_db_limit_a_base) - 35 * ltexels, |
| 554 | LERP(tune_db_limit_b_base) - 19 * ltexels); |
| 555 | |
| 556 | config.tune_mse_overshoot = LERP(tune_mse_overshoot); |
| 557 | |
| 558 | config.tune_2partition_early_out_limit_factor = LERP(tune_2partition_early_out_limit_factor); |
| 559 | config.tune_3partition_early_out_limit_factor = LERP(tune_3partition_early_out_limit_factor); |
| 560 | config.tune_2plane_early_out_limit_correlation = LERP(tune_2plane_early_out_limit_correlation); |
| 561 | #undef LERP |
| 562 | #undef LERPI |
| 563 | #undef LERPUI |
| 564 | } |
| 565 | |
| 566 | // Set heuristics to the defaults for each color profile |
| 567 | config.cw_r_weight = 1.0f; |
| 568 | config.cw_g_weight = 1.0f; |
| 569 | config.cw_b_weight = 1.0f; |
| 570 | config.cw_a_weight = 1.0f; |
| 571 | |
| 572 | config.a_scale_radius = 0; |
| 573 | |
| 574 | config.rgbm_m_scale = 0.0f; |
| 575 | |
| 576 | config.profile = profile; |
| 577 | |
| 578 | // Values in this enum are from an external user, so not guaranteed to be |
| 579 | // bounded to the enum values |
| 580 | switch (static_cast<int>(profile)) |
| 581 | { |
| 582 | case ASTCENC_PRF_LDR: |
| 583 | case ASTCENC_PRF_LDR_SRGB: |
| 584 | break; |
| 585 | case ASTCENC_PRF_HDR_RGB_LDR_A: |
| 586 | case ASTCENC_PRF_HDR: |
| 587 | config.tune_db_limit = 999.0f; |
| 588 | break; |
| 589 | default: |
| 590 | return ASTCENC_ERR_BAD_PROFILE; |
| 591 | } |
| 592 | |
| 593 | // Flags field must not contain any unknown flag bits |
| 594 | status = validate_flags(flags); |
| 595 | if (status != ASTCENC_SUCCESS) |
| 596 | { |
| 597 | return status; |
| 598 | } |
| 599 | |
| 600 | if (flags & ASTCENC_FLG_MAP_NORMAL) |
| 601 | { |
| 602 | // Normal map encoding uses L+A blocks, so allow one more partitioning |
| 603 | // than normal. We need need fewer bits for endpoints, so more likely |
| 604 | // to be able to use more partitions than an RGB/RGBA block |
| 605 | config.tune_partition_count_limit = astc::min(config.tune_partition_count_limit + 1u, 4u); |
| 606 | |
| 607 | config.cw_g_weight = 0.0f; |
| 608 | config.cw_b_weight = 0.0f; |
| 609 | config.tune_2partition_early_out_limit_factor *= 1.5f; |
| 610 | config.tune_3partition_early_out_limit_factor *= 1.5f; |
| 611 | config.tune_2plane_early_out_limit_correlation = 0.99f; |
| 612 | |
| 613 | // Normals are prone to blocking artifacts on smooth curves |
| 614 | // so force compressor to try harder here ... |
| 615 | config.tune_db_limit *= 1.03f; |
| 616 | } |
| 617 | else if (flags & ASTCENC_FLG_MAP_RGBM) |
| 618 | { |
| 619 | config.rgbm_m_scale = 5.0f; |
| 620 | config.cw_a_weight = 2.0f * config.rgbm_m_scale; |
| 621 | } |
| 622 | else // (This is color data) |
| 623 | { |
| 624 | // This is a very basic perceptual metric for RGB color data, which weights error |
| 625 | // significance by the perceptual luminance contribution of each color channel. For |
| 626 | // luminance the usual weights to compute luminance from a linear RGB value are as |
| 627 | // follows: |
| 628 | // |
| 629 | // l = r * 0.3 + g * 0.59 + b * 0.11 |
| 630 | // |
| 631 | // ... but we scale these up to keep a better balance between color and alpha. Note |
| 632 | // that if the content is using alpha we'd recommend using the -a option to weight |
| 633 | // the color contribution by the alpha transparency. |
| 634 | if (flags & ASTCENC_FLG_USE_PERCEPTUAL) |
| 635 | { |
| 636 | config.cw_r_weight = 0.30f * 2.25f; |
| 637 | config.cw_g_weight = 0.59f * 2.25f; |
| 638 | config.cw_b_weight = 0.11f * 2.25f; |
| 639 | } |
| 640 | } |
| 641 | config.flags = flags; |
| 642 | |
| 643 | return ASTCENC_SUCCESS; |
| 644 | } |
| 645 | |
| 646 | /* See header for documentation. */ |
| 647 | astcenc_error astcenc_context_alloc( |
| 648 | const astcenc_config* configp, |
| 649 | unsigned int thread_count, |
| 650 | astcenc_context** context |
| 651 | ) { |
| 652 | astcenc_error status; |
| 653 | const astcenc_config& config = *configp; |
| 654 | |
| 655 | status = validate_cpu_float(); |
| 656 | if (status != ASTCENC_SUCCESS) |
| 657 | { |
| 658 | return status; |
| 659 | } |
| 660 | |
| 661 | if (thread_count == 0) |
| 662 | { |
| 663 | return ASTCENC_ERR_BAD_PARAM; |
| 664 | } |
| 665 | |
| 666 | #if defined(ASTCENC_DIAGNOSTICS) |
| 667 | // Force single threaded compressor use in diagnostic mode. |
| 668 | if (thread_count != 1) |
| 669 | { |
| 670 | return ASTCENC_ERR_BAD_PARAM; |
| 671 | } |
| 672 | #endif |
| 673 | |
| 674 | astcenc_context* ctxo = new astcenc_context; |
| 675 | astcenc_contexti* ctx = &ctxo->context; |
| 676 | ctx->thread_count = thread_count; |
| 677 | ctx->config = config; |
| 678 | ctx->working_buffers = nullptr; |
| 679 | |
| 680 | // These are allocated per-compress, as they depend on image size |
| 681 | ctx->input_alpha_averages = nullptr; |
| 682 | |
| 683 | // Copy the config first and validate the copy (we may modify it) |
| 684 | status = validate_config(ctx->config); |
| 685 | if (status != ASTCENC_SUCCESS) |
| 686 | { |
| 687 | delete ctxo; |
| 688 | return status; |
| 689 | } |
| 690 | |
| 691 | ctx->bsd = aligned_malloc<block_size_descriptor>(sizeof(block_size_descriptor), ASTCENC_VECALIGN); |
| 692 | bool can_omit_modes = static_cast<bool>(config.flags & ASTCENC_FLG_SELF_DECOMPRESS_ONLY); |
| 693 | init_block_size_descriptor(config.block_x, config.block_y, config.block_z, |
| 694 | can_omit_modes, |
| 695 | config.tune_partition_count_limit, |
| 696 | static_cast<float>(config.tune_block_mode_limit) / 100.0f, |
| 697 | *ctx->bsd); |
| 698 | |
| 699 | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
| 700 | // Do setup only needed by compression |
| 701 | if (!(status & ASTCENC_FLG_DECOMPRESS_ONLY)) |
| 702 | { |
| 703 | // Turn a dB limit into a per-texel error for faster use later |
| 704 | if ((ctx->config.profile == ASTCENC_PRF_LDR) || (ctx->config.profile == ASTCENC_PRF_LDR_SRGB)) |
| 705 | { |
| 706 | ctx->config.tune_db_limit = astc::pow(0.1f, ctx->config.tune_db_limit * 0.1f) * 65535.0f * 65535.0f; |
| 707 | } |
| 708 | else |
| 709 | { |
| 710 | ctx->config.tune_db_limit = 0.0f; |
| 711 | } |
| 712 | |
| 713 | size_t worksize = sizeof(compression_working_buffers) * thread_count; |
| 714 | ctx->working_buffers = aligned_malloc<compression_working_buffers>(worksize, ASTCENC_VECALIGN); |
| 715 | static_assert((sizeof(compression_working_buffers) % ASTCENC_VECALIGN) == 0, |
| 716 | "compression_working_buffers size must be multiple of vector alignment" ); |
| 717 | if (!ctx->working_buffers) |
| 718 | { |
| 719 | aligned_free<block_size_descriptor>(ctx->bsd); |
| 720 | delete ctxo; |
| 721 | *context = nullptr; |
| 722 | return ASTCENC_ERR_OUT_OF_MEM; |
| 723 | } |
| 724 | } |
| 725 | #endif |
| 726 | |
| 727 | #if defined(ASTCENC_DIAGNOSTICS) |
| 728 | ctx->trace_log = new TraceLog(ctx->config.trace_file_path); |
| 729 | if (!ctx->trace_log->m_file) |
| 730 | { |
| 731 | return ASTCENC_ERR_DTRACE_FAILURE; |
| 732 | } |
| 733 | |
| 734 | trace_add_data("block_x" , config.block_x); |
| 735 | trace_add_data("block_y" , config.block_y); |
| 736 | trace_add_data("block_z" , config.block_z); |
| 737 | #endif |
| 738 | |
| 739 | *context = ctxo; |
| 740 | |
| 741 | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
| 742 | prepare_angular_tables(); |
| 743 | #endif |
| 744 | |
| 745 | return ASTCENC_SUCCESS; |
| 746 | } |
| 747 | |
| 748 | /* See header dor documentation. */ |
| 749 | void astcenc_context_free( |
| 750 | astcenc_context* ctxo |
| 751 | ) { |
| 752 | if (ctxo) |
| 753 | { |
| 754 | astcenc_contexti* ctx = &ctxo->context; |
| 755 | aligned_free<compression_working_buffers>(ctx->working_buffers); |
| 756 | aligned_free<block_size_descriptor>(ctx->bsd); |
| 757 | #if defined(ASTCENC_DIAGNOSTICS) |
| 758 | delete ctx->trace_log; |
| 759 | #endif |
| 760 | delete ctxo; |
| 761 | } |
| 762 | } |
| 763 | |
| 764 | #if !defined(ASTCENC_DECOMPRESS_ONLY) |
| 765 | |
| 766 | /** |
| 767 | * @brief Compress an image, after any preflight has completed. |
| 768 | * |
| 769 | * @param[out] ctxo The compressor context. |
| 770 | * @param thread_index The thread index. |
| 771 | * @param image The input image. |
| 772 | * @param swizzle The input swizzle. |
| 773 | * @param[out] buffer The output array for the compressed data. |
| 774 | */ |
| 775 | static void compress_image( |
| 776 | astcenc_context& ctxo, |
| 777 | unsigned int thread_index, |
| 778 | const astcenc_image& image, |
| 779 | const astcenc_swizzle& swizzle, |
| 780 | uint8_t* buffer |
| 781 | ) { |
| 782 | astcenc_contexti& ctx = ctxo.context; |
| 783 | const block_size_descriptor& bsd = *ctx.bsd; |
| 784 | astcenc_profile decode_mode = ctx.config.profile; |
| 785 | |
| 786 | image_block blk; |
| 787 | |
| 788 | int block_x = bsd.xdim; |
| 789 | int block_y = bsd.ydim; |
| 790 | int block_z = bsd.zdim; |
| 791 | blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z); |
| 792 | |
| 793 | int dim_x = image.dim_x; |
| 794 | int dim_y = image.dim_y; |
| 795 | int dim_z = image.dim_z; |
| 796 | |
| 797 | int xblocks = (dim_x + block_x - 1) / block_x; |
| 798 | int yblocks = (dim_y + block_y - 1) / block_y; |
| 799 | int zblocks = (dim_z + block_z - 1) / block_z; |
| 800 | int block_count = zblocks * yblocks * xblocks; |
| 801 | |
| 802 | int row_blocks = xblocks; |
| 803 | int plane_blocks = xblocks * yblocks; |
| 804 | |
| 805 | // Populate the block channel weights |
| 806 | blk.channel_weight = vfloat4(ctx.config.cw_r_weight, |
| 807 | ctx.config.cw_g_weight, |
| 808 | ctx.config.cw_b_weight, |
| 809 | ctx.config.cw_a_weight); |
| 810 | |
| 811 | // Use preallocated scratch buffer |
| 812 | auto& temp_buffers = ctx.working_buffers[thread_index]; |
| 813 | |
| 814 | // Only the first thread actually runs the initializer |
| 815 | ctxo.manage_compress.init(block_count); |
| 816 | |
| 817 | // Determine if we can use an optimized load function |
| 818 | bool needs_swz = (swizzle.r != ASTCENC_SWZ_R) || (swizzle.g != ASTCENC_SWZ_G) || |
| 819 | (swizzle.b != ASTCENC_SWZ_B) || (swizzle.a != ASTCENC_SWZ_A); |
| 820 | |
| 821 | bool needs_hdr = (decode_mode == ASTCENC_PRF_HDR) || |
| 822 | (decode_mode == ASTCENC_PRF_HDR_RGB_LDR_A); |
| 823 | |
| 824 | bool use_fast_load = !needs_swz && !needs_hdr && |
| 825 | block_z == 1 && image.data_type == ASTCENC_TYPE_U8; |
| 826 | |
| 827 | auto load_func = load_image_block; |
| 828 | if (use_fast_load) |
| 829 | { |
| 830 | load_func = load_image_block_fast_ldr; |
| 831 | } |
| 832 | |
| 833 | // All threads run this processing loop until there is no work remaining |
| 834 | while (true) |
| 835 | { |
| 836 | unsigned int count; |
| 837 | unsigned int base = ctxo.manage_compress.get_task_assignment(16, count); |
| 838 | if (!count) |
| 839 | { |
| 840 | break; |
| 841 | } |
| 842 | |
| 843 | for (unsigned int i = base; i < base + count; i++) |
| 844 | { |
| 845 | // Decode i into x, y, z block indices |
| 846 | int z = i / plane_blocks; |
| 847 | unsigned int rem = i - (z * plane_blocks); |
| 848 | int y = rem / row_blocks; |
| 849 | int x = rem - (y * row_blocks); |
| 850 | |
| 851 | // Test if we can apply some basic alpha-scale RDO |
| 852 | bool use_full_block = true; |
| 853 | if (ctx.config.a_scale_radius != 0 && block_z == 1) |
| 854 | { |
| 855 | int start_x = x * block_x; |
| 856 | int end_x = astc::min(dim_x, start_x + block_x); |
| 857 | |
| 858 | int start_y = y * block_y; |
| 859 | int end_y = astc::min(dim_y, start_y + block_y); |
| 860 | |
| 861 | // SATs accumulate error, so don't test exactly zero. Test for |
| 862 | // less than 1 alpha in the expanded block footprint that |
| 863 | // includes the alpha radius. |
| 864 | int = block_x + 2 * (ctx.config.a_scale_radius - 1); |
| 865 | |
| 866 | int = block_y + 2 * (ctx.config.a_scale_radius - 1); |
| 867 | |
| 868 | float = static_cast<float>(x_footprint * y_footprint); |
| 869 | float threshold = 0.9f / (255.0f * footprint); |
| 870 | |
| 871 | // Do we have any alpha values? |
| 872 | use_full_block = false; |
| 873 | for (int ay = start_y; ay < end_y; ay++) |
| 874 | { |
| 875 | for (int ax = start_x; ax < end_x; ax++) |
| 876 | { |
| 877 | float a_avg = ctx.input_alpha_averages[ay * dim_x + ax]; |
| 878 | if (a_avg > threshold) |
| 879 | { |
| 880 | use_full_block = true; |
| 881 | ax = end_x; |
| 882 | ay = end_y; |
| 883 | } |
| 884 | } |
| 885 | } |
| 886 | } |
| 887 | |
| 888 | // Fetch the full block for compression |
| 889 | if (use_full_block) |
| 890 | { |
| 891 | load_func(decode_mode, image, blk, bsd, x * block_x, y * block_y, z * block_z, swizzle); |
| 892 | |
| 893 | // Scale RGB error contribution by the maximum alpha in the block |
| 894 | // This encourages preserving alpha accuracy in regions with high |
| 895 | // transparency, and can buy up to 0.5 dB PSNR. |
| 896 | if (ctx.config.flags & ASTCENC_FLG_USE_ALPHA_WEIGHT) |
| 897 | { |
| 898 | float alpha_scale = blk.data_max.lane<3>() * (1.0f / 65535.0f); |
| 899 | blk.channel_weight = vfloat4(ctx.config.cw_r_weight * alpha_scale, |
| 900 | ctx.config.cw_g_weight * alpha_scale, |
| 901 | ctx.config.cw_b_weight * alpha_scale, |
| 902 | ctx.config.cw_a_weight); |
| 903 | } |
| 904 | } |
| 905 | // Apply alpha scale RDO - substitute constant color block |
| 906 | else |
| 907 | { |
| 908 | blk.origin_texel = vfloat4::zero(); |
| 909 | blk.data_min = vfloat4::zero(); |
| 910 | blk.data_mean = vfloat4::zero(); |
| 911 | blk.data_max = vfloat4::zero(); |
| 912 | blk.grayscale = true; |
| 913 | } |
| 914 | |
| 915 | int offset = ((z * yblocks + y) * xblocks + x) * 16; |
| 916 | uint8_t *bp = buffer + offset; |
| 917 | physical_compressed_block* pcb = reinterpret_cast<physical_compressed_block*>(bp); |
| 918 | compress_block(ctx, blk, *pcb, temp_buffers); |
| 919 | } |
| 920 | |
| 921 | ctxo.manage_compress.complete_task_assignment(count); |
| 922 | } |
| 923 | } |
| 924 | |
| 925 | /** |
| 926 | * @brief Compute regional averages in an image. |
| 927 | * |
| 928 | * This function can be called by multiple threads, but only after a single |
| 929 | * thread calls the setup function @c init_compute_averages(). |
| 930 | * |
| 931 | * Results are written back into @c img->input_alpha_averages. |
| 932 | * |
| 933 | * @param[out] ctx The context. |
| 934 | * @param ag The average and variance arguments created during setup. |
| 935 | */ |
| 936 | static void compute_averages( |
| 937 | astcenc_context& ctx, |
| 938 | const avg_args &ag |
| 939 | ) { |
| 940 | pixel_region_args arg = ag.arg; |
| 941 | arg.work_memory = new vfloat4[ag.work_memory_size]; |
| 942 | |
| 943 | int size_x = ag.img_size_x; |
| 944 | int size_y = ag.img_size_y; |
| 945 | int size_z = ag.img_size_z; |
| 946 | |
| 947 | int step_xy = ag.blk_size_xy; |
| 948 | int step_z = ag.blk_size_z; |
| 949 | |
| 950 | int y_tasks = (size_y + step_xy - 1) / step_xy; |
| 951 | |
| 952 | // All threads run this processing loop until there is no work remaining |
| 953 | while (true) |
| 954 | { |
| 955 | unsigned int count; |
| 956 | unsigned int base = ctx.manage_avg.get_task_assignment(16, count); |
| 957 | if (!count) |
| 958 | { |
| 959 | break; |
| 960 | } |
| 961 | |
| 962 | for (unsigned int i = base; i < base + count; i++) |
| 963 | { |
| 964 | int z = (i / (y_tasks)) * step_z; |
| 965 | int y = (i - (z * y_tasks)) * step_xy; |
| 966 | |
| 967 | arg.size_z = astc::min(step_z, size_z - z); |
| 968 | arg.offset_z = z; |
| 969 | |
| 970 | arg.size_y = astc::min(step_xy, size_y - y); |
| 971 | arg.offset_y = y; |
| 972 | |
| 973 | for (int x = 0; x < size_x; x += step_xy) |
| 974 | { |
| 975 | arg.size_x = astc::min(step_xy, size_x - x); |
| 976 | arg.offset_x = x; |
| 977 | compute_pixel_region_variance(ctx.context, arg); |
| 978 | } |
| 979 | } |
| 980 | |
| 981 | ctx.manage_avg.complete_task_assignment(count); |
| 982 | } |
| 983 | |
| 984 | delete[] arg.work_memory; |
| 985 | } |
| 986 | |
| 987 | #endif |
| 988 | |
| 989 | /* See header for documentation. */ |
| 990 | astcenc_error astcenc_compress_image( |
| 991 | astcenc_context* ctxo, |
| 992 | astcenc_image* imagep, |
| 993 | const astcenc_swizzle* swizzle, |
| 994 | uint8_t* data_out, |
| 995 | size_t data_len, |
| 996 | unsigned int thread_index |
| 997 | ) { |
| 998 | #if defined(ASTCENC_DECOMPRESS_ONLY) |
| 999 | (void)ctxo; |
| 1000 | (void)imagep; |
| 1001 | (void)swizzle; |
| 1002 | (void)data_out; |
| 1003 | (void)data_len; |
| 1004 | (void)thread_index; |
| 1005 | return ASTCENC_ERR_BAD_CONTEXT; |
| 1006 | #else |
| 1007 | astcenc_contexti* ctx = &ctxo->context; |
| 1008 | astcenc_error status; |
| 1009 | astcenc_image& image = *imagep; |
| 1010 | |
| 1011 | if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY) |
| 1012 | { |
| 1013 | return ASTCENC_ERR_BAD_CONTEXT; |
| 1014 | } |
| 1015 | |
| 1016 | status = validate_compression_swizzle(*swizzle); |
| 1017 | if (status != ASTCENC_SUCCESS) |
| 1018 | { |
| 1019 | return status; |
| 1020 | } |
| 1021 | |
| 1022 | if (thread_index >= ctx->thread_count) |
| 1023 | { |
| 1024 | return ASTCENC_ERR_BAD_PARAM; |
| 1025 | } |
| 1026 | |
| 1027 | unsigned int block_x = ctx->config.block_x; |
| 1028 | unsigned int block_y = ctx->config.block_y; |
| 1029 | unsigned int block_z = ctx->config.block_z; |
| 1030 | |
| 1031 | unsigned int xblocks = (image.dim_x + block_x - 1) / block_x; |
| 1032 | unsigned int yblocks = (image.dim_y + block_y - 1) / block_y; |
| 1033 | unsigned int zblocks = (image.dim_z + block_z - 1) / block_z; |
| 1034 | |
| 1035 | // Check we have enough output space (16 bytes per block) |
| 1036 | size_t size_needed = xblocks * yblocks * zblocks * 16; |
| 1037 | if (data_len < size_needed) |
| 1038 | { |
| 1039 | return ASTCENC_ERR_OUT_OF_MEM; |
| 1040 | } |
| 1041 | |
| 1042 | // If context thread count is one then implicitly reset |
| 1043 | if (ctx->thread_count == 1) |
| 1044 | { |
| 1045 | astcenc_compress_reset(ctxo); |
| 1046 | } |
| 1047 | |
| 1048 | if (ctx->config.a_scale_radius != 0) |
| 1049 | { |
| 1050 | // First thread to enter will do setup, other threads will subsequently |
| 1051 | // enter the critical section but simply skip over the initialization |
| 1052 | auto init_avg = [ctx, &image, swizzle]() { |
| 1053 | // Perform memory allocations for the destination buffers |
| 1054 | size_t texel_count = image.dim_x * image.dim_y * image.dim_z; |
| 1055 | ctx->input_alpha_averages = new float[texel_count]; |
| 1056 | |
| 1057 | return init_compute_averages( |
| 1058 | image, ctx->config.a_scale_radius, *swizzle, |
| 1059 | ctx->avg_preprocess_args); |
| 1060 | }; |
| 1061 | |
| 1062 | // Only the first thread actually runs the initializer |
| 1063 | ctxo->manage_avg.init(init_avg); |
| 1064 | |
| 1065 | // All threads will enter this function and dynamically grab work |
| 1066 | compute_averages(*ctxo, ctx->avg_preprocess_args); |
| 1067 | } |
| 1068 | |
| 1069 | // Wait for compute_averages to complete before compressing |
| 1070 | ctxo->manage_avg.wait(); |
| 1071 | |
| 1072 | compress_image(*ctxo, thread_index, image, *swizzle, data_out); |
| 1073 | |
| 1074 | // Wait for compress to complete before freeing memory |
| 1075 | ctxo->manage_compress.wait(); |
| 1076 | |
| 1077 | auto term_compress = [ctx]() { |
| 1078 | delete[] ctx->input_alpha_averages; |
| 1079 | ctx->input_alpha_averages = nullptr; |
| 1080 | }; |
| 1081 | |
| 1082 | // Only the first thread to arrive actually runs the term |
| 1083 | ctxo->manage_compress.term(term_compress); |
| 1084 | |
| 1085 | return ASTCENC_SUCCESS; |
| 1086 | #endif |
| 1087 | } |
| 1088 | |
| 1089 | /* See header for documentation. */ |
| 1090 | astcenc_error astcenc_compress_reset( |
| 1091 | astcenc_context* ctxo |
| 1092 | ) { |
| 1093 | #if defined(ASTCENC_DECOMPRESS_ONLY) |
| 1094 | (void)ctxo; |
| 1095 | return ASTCENC_ERR_BAD_CONTEXT; |
| 1096 | #else |
| 1097 | astcenc_contexti* ctx = &ctxo->context; |
| 1098 | if (ctx->config.flags & ASTCENC_FLG_DECOMPRESS_ONLY) |
| 1099 | { |
| 1100 | return ASTCENC_ERR_BAD_CONTEXT; |
| 1101 | } |
| 1102 | |
| 1103 | ctxo->manage_avg.reset(); |
| 1104 | ctxo->manage_compress.reset(); |
| 1105 | return ASTCENC_SUCCESS; |
| 1106 | #endif |
| 1107 | } |
| 1108 | |
| 1109 | /* See header for documentation. */ |
| 1110 | astcenc_error astcenc_decompress_image( |
| 1111 | astcenc_context* ctxo, |
| 1112 | const uint8_t* data, |
| 1113 | size_t data_len, |
| 1114 | astcenc_image* image_outp, |
| 1115 | const astcenc_swizzle* swizzle, |
| 1116 | unsigned int thread_index |
| 1117 | ) { |
| 1118 | astcenc_error status; |
| 1119 | astcenc_image& image_out = *image_outp; |
| 1120 | astcenc_contexti* ctx = &ctxo->context; |
| 1121 | |
| 1122 | // Today this doesn't matter (working set on stack) but might in future ... |
| 1123 | if (thread_index >= ctx->thread_count) |
| 1124 | { |
| 1125 | return ASTCENC_ERR_BAD_PARAM; |
| 1126 | } |
| 1127 | |
| 1128 | status = validate_decompression_swizzle(*swizzle); |
| 1129 | if (status != ASTCENC_SUCCESS) |
| 1130 | { |
| 1131 | return status; |
| 1132 | } |
| 1133 | |
| 1134 | unsigned int block_x = ctx->config.block_x; |
| 1135 | unsigned int block_y = ctx->config.block_y; |
| 1136 | unsigned int block_z = ctx->config.block_z; |
| 1137 | |
| 1138 | unsigned int xblocks = (image_out.dim_x + block_x - 1) / block_x; |
| 1139 | unsigned int yblocks = (image_out.dim_y + block_y - 1) / block_y; |
| 1140 | unsigned int zblocks = (image_out.dim_z + block_z - 1) / block_z; |
| 1141 | |
| 1142 | int row_blocks = xblocks; |
| 1143 | int plane_blocks = xblocks * yblocks; |
| 1144 | |
| 1145 | // Check we have enough output space (16 bytes per block) |
| 1146 | size_t size_needed = xblocks * yblocks * zblocks * 16; |
| 1147 | if (data_len < size_needed) |
| 1148 | { |
| 1149 | return ASTCENC_ERR_OUT_OF_MEM; |
| 1150 | } |
| 1151 | |
| 1152 | image_block blk; |
| 1153 | blk.texel_count = static_cast<uint8_t>(block_x * block_y * block_z); |
| 1154 | |
| 1155 | // If context thread count is one then implicitly reset |
| 1156 | if (ctx->thread_count == 1) |
| 1157 | { |
| 1158 | astcenc_decompress_reset(ctxo); |
| 1159 | } |
| 1160 | |
| 1161 | // Only the first thread actually runs the initializer |
| 1162 | ctxo->manage_decompress.init(zblocks * yblocks * xblocks); |
| 1163 | |
| 1164 | // All threads run this processing loop until there is no work remaining |
| 1165 | while (true) |
| 1166 | { |
| 1167 | unsigned int count; |
| 1168 | unsigned int base = ctxo->manage_decompress.get_task_assignment(128, count); |
| 1169 | if (!count) |
| 1170 | { |
| 1171 | break; |
| 1172 | } |
| 1173 | |
| 1174 | for (unsigned int i = base; i < base + count; i++) |
| 1175 | { |
| 1176 | // Decode i into x, y, z block indices |
| 1177 | int z = i / plane_blocks; |
| 1178 | unsigned int rem = i - (z * plane_blocks); |
| 1179 | int y = rem / row_blocks; |
| 1180 | int x = rem - (y * row_blocks); |
| 1181 | |
| 1182 | unsigned int offset = (((z * yblocks + y) * xblocks) + x) * 16; |
| 1183 | const uint8_t* bp = data + offset; |
| 1184 | |
| 1185 | const physical_compressed_block& pcb = *reinterpret_cast<const physical_compressed_block*>(bp); |
| 1186 | symbolic_compressed_block scb; |
| 1187 | |
| 1188 | physical_to_symbolic(*ctx->bsd, pcb, scb); |
| 1189 | |
| 1190 | decompress_symbolic_block(ctx->config.profile, *ctx->bsd, |
| 1191 | x * block_x, y * block_y, z * block_z, |
| 1192 | scb, blk); |
| 1193 | |
| 1194 | store_image_block(image_out, blk, *ctx->bsd, |
| 1195 | x * block_x, y * block_y, z * block_z, *swizzle); |
| 1196 | } |
| 1197 | |
| 1198 | ctxo->manage_decompress.complete_task_assignment(count); |
| 1199 | } |
| 1200 | |
| 1201 | return ASTCENC_SUCCESS; |
| 1202 | } |
| 1203 | |
| 1204 | /* See header for documentation. */ |
| 1205 | astcenc_error astcenc_decompress_reset( |
| 1206 | astcenc_context* ctxo |
| 1207 | ) { |
| 1208 | ctxo->manage_decompress.reset(); |
| 1209 | return ASTCENC_SUCCESS; |
| 1210 | } |
| 1211 | |
| 1212 | /* See header for documentation. */ |
| 1213 | astcenc_error astcenc_get_block_info( |
| 1214 | astcenc_context* ctxo, |
| 1215 | const uint8_t data[16], |
| 1216 | astcenc_block_info* info |
| 1217 | ) { |
| 1218 | #if defined(ASTCENC_DECOMPRESS_ONLY) |
| 1219 | (void)ctxo; |
| 1220 | (void)data; |
| 1221 | (void)info; |
| 1222 | return ASTCENC_ERR_BAD_CONTEXT; |
| 1223 | #else |
| 1224 | astcenc_contexti* ctx = &ctxo->context; |
| 1225 | |
| 1226 | // Decode the compressed data into a symbolic form |
| 1227 | const physical_compressed_block&pcb = *reinterpret_cast<const physical_compressed_block*>(data); |
| 1228 | symbolic_compressed_block scb; |
| 1229 | physical_to_symbolic(*ctx->bsd, pcb, scb); |
| 1230 | |
| 1231 | // Fetch the appropriate partition and decimation tables |
| 1232 | block_size_descriptor& bsd = *ctx->bsd; |
| 1233 | |
| 1234 | // Start from a clean slate |
| 1235 | memset(info, 0, sizeof(*info)); |
| 1236 | |
| 1237 | // Basic info we can always populate |
| 1238 | info->profile = ctx->config.profile; |
| 1239 | |
| 1240 | info->block_x = ctx->config.block_x; |
| 1241 | info->block_y = ctx->config.block_y; |
| 1242 | info->block_z = ctx->config.block_z; |
| 1243 | info->texel_count = bsd.texel_count; |
| 1244 | |
| 1245 | // Check for error blocks first |
| 1246 | info->is_error_block = scb.block_type == SYM_BTYPE_ERROR; |
| 1247 | if (info->is_error_block) |
| 1248 | { |
| 1249 | return ASTCENC_SUCCESS; |
| 1250 | } |
| 1251 | |
| 1252 | // Check for constant color blocks second |
| 1253 | info->is_constant_block = scb.block_type == SYM_BTYPE_CONST_F16 || |
| 1254 | scb.block_type == SYM_BTYPE_CONST_U16; |
| 1255 | if (info->is_constant_block) |
| 1256 | { |
| 1257 | return ASTCENC_SUCCESS; |
| 1258 | } |
| 1259 | |
| 1260 | // Otherwise handle a full block ; known to be valid after conditions above have been checked |
| 1261 | int partition_count = scb.partition_count; |
| 1262 | const auto& pi = bsd.get_partition_info(partition_count, scb.partition_index); |
| 1263 | |
| 1264 | const block_mode& bm = bsd.get_block_mode(scb.block_mode); |
| 1265 | const decimation_info& di = bsd.get_decimation_info(bm.decimation_mode); |
| 1266 | |
| 1267 | info->weight_x = di.weight_x; |
| 1268 | info->weight_y = di.weight_y; |
| 1269 | info->weight_z = di.weight_z; |
| 1270 | |
| 1271 | info->is_dual_plane_block = bm.is_dual_plane != 0; |
| 1272 | |
| 1273 | info->partition_count = scb.partition_count; |
| 1274 | info->partition_index = scb.partition_index; |
| 1275 | info->dual_plane_component = scb.plane2_component; |
| 1276 | |
| 1277 | info->color_level_count = get_quant_level(scb.get_color_quant_mode()); |
| 1278 | info->weight_level_count = get_quant_level(bm.get_weight_quant_mode()); |
| 1279 | |
| 1280 | // Unpack color endpoints for each active partition |
| 1281 | for (unsigned int i = 0; i < scb.partition_count; i++) |
| 1282 | { |
| 1283 | bool rgb_hdr; |
| 1284 | bool a_hdr; |
| 1285 | vint4 endpnt[2]; |
| 1286 | |
| 1287 | unpack_color_endpoints(ctx->config.profile, |
| 1288 | scb.color_formats[i], |
| 1289 | scb.color_values[i], |
| 1290 | rgb_hdr, a_hdr, |
| 1291 | endpnt[0], endpnt[1]); |
| 1292 | |
| 1293 | // Store the color endpoint mode info |
| 1294 | info->color_endpoint_modes[i] = scb.color_formats[i]; |
| 1295 | info->is_hdr_block = info->is_hdr_block || rgb_hdr || a_hdr; |
| 1296 | |
| 1297 | // Store the unpacked and decoded color endpoint |
| 1298 | vmask4 hdr_mask(rgb_hdr, rgb_hdr, rgb_hdr, a_hdr); |
| 1299 | for (int j = 0; j < 2; j++) |
| 1300 | { |
| 1301 | vint4 color_lns = lns_to_sf16(endpnt[j]); |
| 1302 | vint4 color_unorm = unorm16_to_sf16(endpnt[j]); |
| 1303 | vint4 datai = select(color_unorm, color_lns, hdr_mask); |
| 1304 | store(float16_to_float(datai), info->color_endpoints[i][j]); |
| 1305 | } |
| 1306 | } |
| 1307 | |
| 1308 | // Unpack weights for each texel |
| 1309 | int weight_plane1[BLOCK_MAX_TEXELS]; |
| 1310 | int weight_plane2[BLOCK_MAX_TEXELS]; |
| 1311 | |
| 1312 | unpack_weights(bsd, scb, di, bm.is_dual_plane, weight_plane1, weight_plane2); |
| 1313 | for (unsigned int i = 0; i < bsd.texel_count; i++) |
| 1314 | { |
| 1315 | info->weight_values_plane1[i] = static_cast<float>(weight_plane1[i]) * (1.0f / WEIGHTS_TEXEL_SUM); |
| 1316 | if (info->is_dual_plane_block) |
| 1317 | { |
| 1318 | info->weight_values_plane2[i] = static_cast<float>(weight_plane2[i]) * (1.0f / WEIGHTS_TEXEL_SUM); |
| 1319 | } |
| 1320 | } |
| 1321 | |
| 1322 | // Unpack partition assignments for each texel |
| 1323 | for (unsigned int i = 0; i < bsd.texel_count; i++) |
| 1324 | { |
| 1325 | info->partition_assignment[i] = pi.partition_of_texel[i]; |
| 1326 | } |
| 1327 | |
| 1328 | return ASTCENC_SUCCESS; |
| 1329 | #endif |
| 1330 | } |
| 1331 | |
| 1332 | /* See header for documentation. */ |
| 1333 | const char* astcenc_get_error_string( |
| 1334 | astcenc_error status |
| 1335 | ) { |
| 1336 | // Values in this enum are from an external user, so not guaranteed to be |
| 1337 | // bounded to the enum values |
| 1338 | switch (static_cast<int>(status)) |
| 1339 | { |
| 1340 | case ASTCENC_SUCCESS: |
| 1341 | return "ASTCENC_SUCCESS" ; |
| 1342 | case ASTCENC_ERR_OUT_OF_MEM: |
| 1343 | return "ASTCENC_ERR_OUT_OF_MEM" ; |
| 1344 | case ASTCENC_ERR_BAD_CPU_FLOAT: |
| 1345 | return "ASTCENC_ERR_BAD_CPU_FLOAT" ; |
| 1346 | case ASTCENC_ERR_BAD_PARAM: |
| 1347 | return "ASTCENC_ERR_BAD_PARAM" ; |
| 1348 | case ASTCENC_ERR_BAD_BLOCK_SIZE: |
| 1349 | return "ASTCENC_ERR_BAD_BLOCK_SIZE" ; |
| 1350 | case ASTCENC_ERR_BAD_PROFILE: |
| 1351 | return "ASTCENC_ERR_BAD_PROFILE" ; |
| 1352 | case ASTCENC_ERR_BAD_QUALITY: |
| 1353 | return "ASTCENC_ERR_BAD_QUALITY" ; |
| 1354 | case ASTCENC_ERR_BAD_FLAGS: |
| 1355 | return "ASTCENC_ERR_BAD_FLAGS" ; |
| 1356 | case ASTCENC_ERR_BAD_SWIZZLE: |
| 1357 | return "ASTCENC_ERR_BAD_SWIZZLE" ; |
| 1358 | case ASTCENC_ERR_BAD_CONTEXT: |
| 1359 | return "ASTCENC_ERR_BAD_CONTEXT" ; |
| 1360 | case ASTCENC_ERR_NOT_IMPLEMENTED: |
| 1361 | return "ASTCENC_ERR_NOT_IMPLEMENTED" ; |
| 1362 | #if defined(ASTCENC_DIAGNOSTICS) |
| 1363 | case ASTCENC_ERR_DTRACE_FAILURE: |
| 1364 | return "ASTCENC_ERR_DTRACE_FAILURE" ; |
| 1365 | #endif |
| 1366 | default: |
| 1367 | return nullptr; |
| 1368 | } |
| 1369 | } |
| 1370 | |