| 1 | // SPDX-License-Identifier: Apache-2.0 | 
|---|
| 2 | // ---------------------------------------------------------------------------- | 
|---|
| 3 | // Copyright 2011-2023 Arm Limited | 
|---|
| 4 | // | 
|---|
| 5 | // Licensed under the Apache License, Version 2.0 (the "License"); you may not | 
|---|
| 6 | // use this file except in compliance with the License. You may obtain a copy | 
|---|
| 7 | // of the License at: | 
|---|
| 8 | // | 
|---|
| 9 | //     http://www.apache.org/licenses/LICENSE-2.0 | 
|---|
| 10 | // | 
|---|
| 11 | // Unless required by applicable law or agreed to in writing, software | 
|---|
| 12 | // distributed under the License is distributed on an "AS IS" BASIS, WITHOUT | 
|---|
| 13 | // WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the | 
|---|
| 14 | // License for the specific language governing permissions and limitations | 
|---|
| 15 | // under the License. | 
|---|
| 16 | // ---------------------------------------------------------------------------- | 
|---|
| 17 |  | 
|---|
| 18 | /** | 
|---|
| 19 | * @brief Functions for finding dominant direction of a set of colors. | 
|---|
| 20 | */ | 
|---|
| 21 | #if !defined(ASTCENC_DECOMPRESS_ONLY) | 
|---|
| 22 |  | 
|---|
| 23 | #include "astcenc_internal.h" | 
|---|
| 24 |  | 
|---|
| 25 | #include <cassert> | 
|---|
| 26 |  | 
|---|
| 27 | /** | 
|---|
| 28 | * @brief Compute the average RGB color of each partition. | 
|---|
| 29 | * | 
|---|
| 30 | * The algorithm here uses a vectorized sequential scan and per-partition | 
|---|
| 31 | * color accumulators, using select() to mask texel lanes in other partitions. | 
|---|
| 32 | * | 
|---|
| 33 | * We only accumulate sums for N-1 partitions during the scan; the value for | 
|---|
| 34 | * the last partition can be computed given that we know the block-wide average | 
|---|
| 35 | * already. | 
|---|
| 36 | * | 
|---|
| 37 | * Because of this we could reduce the loop iteration count so it "just" spans | 
|---|
| 38 | * the max texel index needed for the N-1 partitions, which could need fewer | 
|---|
| 39 | * iterations than the full block texel count. However, this makes the loop | 
|---|
| 40 | * count erratic and causes more branch mispredictions so is a net loss. | 
|---|
| 41 | * | 
|---|
| 42 | * @param      pi         The partitioning to use. | 
|---|
| 43 | * @param      blk        The block data to process. | 
|---|
| 44 | * @param[out] averages   The output averages. Unused partition indices will | 
|---|
| 45 | *                        not be initialized, and lane<3> will be zero. | 
|---|
| 46 | */ | 
|---|
| 47 | static void compute_partition_averages_rgb( | 
|---|
| 48 | const partition_info& pi, | 
|---|
| 49 | const image_block& blk, | 
|---|
| 50 | vfloat4 averages[BLOCK_MAX_PARTITIONS] | 
|---|
| 51 | ) { | 
|---|
| 52 | unsigned int partition_count = pi.partition_count; | 
|---|
| 53 | unsigned int texel_count = blk.texel_count; | 
|---|
| 54 | promise(texel_count > 0); | 
|---|
| 55 |  | 
|---|
| 56 | // For 1 partition just use the precomputed mean | 
|---|
| 57 | if (partition_count == 1) | 
|---|
| 58 | { | 
|---|
| 59 | averages[0] = blk.data_mean.swz<0, 1, 2>(); | 
|---|
| 60 | } | 
|---|
| 61 | // For 2 partitions scan results for partition 0, compute partition 1 | 
|---|
| 62 | else if (partition_count == 2) | 
|---|
| 63 | { | 
|---|
| 64 | vfloatacc pp_avg_rgb[3] {}; | 
|---|
| 65 |  | 
|---|
| 66 | vint lane_id = vint::lane_id(); | 
|---|
| 67 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) | 
|---|
| 68 | { | 
|---|
| 69 | vint texel_partition(pi.partition_of_texel + i); | 
|---|
| 70 |  | 
|---|
| 71 | vmask lane_mask = lane_id < vint(texel_count); | 
|---|
| 72 | lane_id += vint(ASTCENC_SIMD_WIDTH); | 
|---|
| 73 |  | 
|---|
| 74 | vmask p0_mask = lane_mask & (texel_partition == vint(0)); | 
|---|
| 75 |  | 
|---|
| 76 | vfloat data_r = loada(blk.data_r + i); | 
|---|
| 77 | haccumulate(pp_avg_rgb[0], data_r, p0_mask); | 
|---|
| 78 |  | 
|---|
| 79 | vfloat data_g = loada(blk.data_g + i); | 
|---|
| 80 | haccumulate(pp_avg_rgb[1], data_g, p0_mask); | 
|---|
| 81 |  | 
|---|
| 82 | vfloat data_b = loada(blk.data_b + i); | 
|---|
| 83 | haccumulate(pp_avg_rgb[2], data_b, p0_mask); | 
|---|
| 84 | } | 
|---|
| 85 |  | 
|---|
| 86 | vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count); | 
|---|
| 87 |  | 
|---|
| 88 | vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0]), | 
|---|
| 89 | hadd_s(pp_avg_rgb[1]), | 
|---|
| 90 | hadd_s(pp_avg_rgb[2])); | 
|---|
| 91 |  | 
|---|
| 92 | vfloat4 p1_total = block_total - p0_total; | 
|---|
| 93 |  | 
|---|
| 94 | averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); | 
|---|
| 95 | averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); | 
|---|
| 96 | } | 
|---|
| 97 | // For 3 partitions scan results for partition 0/1, compute partition 2 | 
|---|
| 98 | else if (partition_count == 3) | 
|---|
| 99 | { | 
|---|
| 100 | vfloatacc pp_avg_rgb[2][3] {}; | 
|---|
| 101 |  | 
|---|
| 102 | vint lane_id = vint::lane_id(); | 
|---|
| 103 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) | 
|---|
| 104 | { | 
|---|
| 105 | vint texel_partition(pi.partition_of_texel + i); | 
|---|
| 106 |  | 
|---|
| 107 | vmask lane_mask = lane_id < vint(texel_count); | 
|---|
| 108 | lane_id += vint(ASTCENC_SIMD_WIDTH); | 
|---|
| 109 |  | 
|---|
| 110 | vmask p0_mask = lane_mask & (texel_partition == vint(0)); | 
|---|
| 111 | vmask p1_mask = lane_mask & (texel_partition == vint(1)); | 
|---|
| 112 |  | 
|---|
| 113 | vfloat data_r = loada(blk.data_r + i); | 
|---|
| 114 | haccumulate(pp_avg_rgb[0][0], data_r, p0_mask); | 
|---|
| 115 | haccumulate(pp_avg_rgb[1][0], data_r, p1_mask); | 
|---|
| 116 |  | 
|---|
| 117 | vfloat data_g = loada(blk.data_g + i); | 
|---|
| 118 | haccumulate(pp_avg_rgb[0][1], data_g, p0_mask); | 
|---|
| 119 | haccumulate(pp_avg_rgb[1][1], data_g, p1_mask); | 
|---|
| 120 |  | 
|---|
| 121 | vfloat data_b = loada(blk.data_b + i); | 
|---|
| 122 | haccumulate(pp_avg_rgb[0][2], data_b, p0_mask); | 
|---|
| 123 | haccumulate(pp_avg_rgb[1][2], data_b, p1_mask); | 
|---|
| 124 | } | 
|---|
| 125 |  | 
|---|
| 126 | vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count); | 
|---|
| 127 |  | 
|---|
| 128 | vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]), | 
|---|
| 129 | hadd_s(pp_avg_rgb[0][1]), | 
|---|
| 130 | hadd_s(pp_avg_rgb[0][2])); | 
|---|
| 131 |  | 
|---|
| 132 | vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]), | 
|---|
| 133 | hadd_s(pp_avg_rgb[1][1]), | 
|---|
| 134 | hadd_s(pp_avg_rgb[1][2])); | 
|---|
| 135 |  | 
|---|
| 136 | vfloat4 p2_total = block_total - p0_total - p1_total; | 
|---|
| 137 |  | 
|---|
| 138 | averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); | 
|---|
| 139 | averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); | 
|---|
| 140 | averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]); | 
|---|
| 141 | } | 
|---|
| 142 | else | 
|---|
| 143 | { | 
|---|
| 144 | // For 4 partitions scan results for partition 0/1/2, compute partition 3 | 
|---|
| 145 | vfloatacc pp_avg_rgb[3][3] {}; | 
|---|
| 146 |  | 
|---|
| 147 | vint lane_id = vint::lane_id(); | 
|---|
| 148 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) | 
|---|
| 149 | { | 
|---|
| 150 | vint texel_partition(pi.partition_of_texel + i); | 
|---|
| 151 |  | 
|---|
| 152 | vmask lane_mask = lane_id < vint(texel_count); | 
|---|
| 153 | lane_id += vint(ASTCENC_SIMD_WIDTH); | 
|---|
| 154 |  | 
|---|
| 155 | vmask p0_mask = lane_mask & (texel_partition == vint(0)); | 
|---|
| 156 | vmask p1_mask = lane_mask & (texel_partition == vint(1)); | 
|---|
| 157 | vmask p2_mask = lane_mask & (texel_partition == vint(2)); | 
|---|
| 158 |  | 
|---|
| 159 | vfloat data_r = loada(blk.data_r + i); | 
|---|
| 160 | haccumulate(pp_avg_rgb[0][0], data_r, p0_mask); | 
|---|
| 161 | haccumulate(pp_avg_rgb[1][0], data_r, p1_mask); | 
|---|
| 162 | haccumulate(pp_avg_rgb[2][0], data_r, p2_mask); | 
|---|
| 163 |  | 
|---|
| 164 | vfloat data_g = loada(blk.data_g + i); | 
|---|
| 165 | haccumulate(pp_avg_rgb[0][1], data_g, p0_mask); | 
|---|
| 166 | haccumulate(pp_avg_rgb[1][1], data_g, p1_mask); | 
|---|
| 167 | haccumulate(pp_avg_rgb[2][1], data_g, p2_mask); | 
|---|
| 168 |  | 
|---|
| 169 | vfloat data_b = loada(blk.data_b + i); | 
|---|
| 170 | haccumulate(pp_avg_rgb[0][2], data_b, p0_mask); | 
|---|
| 171 | haccumulate(pp_avg_rgb[1][2], data_b, p1_mask); | 
|---|
| 172 | haccumulate(pp_avg_rgb[2][2], data_b, p2_mask); | 
|---|
| 173 | } | 
|---|
| 174 |  | 
|---|
| 175 | vfloat4 block_total = blk.data_mean.swz<0, 1, 2>() * static_cast<float>(blk.texel_count); | 
|---|
| 176 |  | 
|---|
| 177 | vfloat4 p0_total = vfloat3(hadd_s(pp_avg_rgb[0][0]), | 
|---|
| 178 | hadd_s(pp_avg_rgb[0][1]), | 
|---|
| 179 | hadd_s(pp_avg_rgb[0][2])); | 
|---|
| 180 |  | 
|---|
| 181 | vfloat4 p1_total = vfloat3(hadd_s(pp_avg_rgb[1][0]), | 
|---|
| 182 | hadd_s(pp_avg_rgb[1][1]), | 
|---|
| 183 | hadd_s(pp_avg_rgb[1][2])); | 
|---|
| 184 |  | 
|---|
| 185 | vfloat4 p2_total = vfloat3(hadd_s(pp_avg_rgb[2][0]), | 
|---|
| 186 | hadd_s(pp_avg_rgb[2][1]), | 
|---|
| 187 | hadd_s(pp_avg_rgb[2][2])); | 
|---|
| 188 |  | 
|---|
| 189 | vfloat4 p3_total = block_total - p0_total - p1_total- p2_total; | 
|---|
| 190 |  | 
|---|
| 191 | averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); | 
|---|
| 192 | averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); | 
|---|
| 193 | averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]); | 
|---|
| 194 | averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]); | 
|---|
| 195 | } | 
|---|
| 196 | } | 
|---|
| 197 |  | 
|---|
| 198 | /** | 
|---|
| 199 | * @brief Compute the average RGBA color of each partition. | 
|---|
| 200 | * | 
|---|
| 201 | * The algorithm here uses a vectorized sequential scan and per-partition | 
|---|
| 202 | * color accumulators, using select() to mask texel lanes in other partitions. | 
|---|
| 203 | * | 
|---|
| 204 | * We only accumulate sums for N-1 partitions during the scan; the value for | 
|---|
| 205 | * the last partition can be computed given that we know the block-wide average | 
|---|
| 206 | * already. | 
|---|
| 207 | * | 
|---|
| 208 | * Because of this we could reduce the loop iteration count so it "just" spans | 
|---|
| 209 | * the max texel index needed for the N-1 partitions, which could need fewer | 
|---|
| 210 | * iterations than the full block texel count. However, this makes the loop | 
|---|
| 211 | * count erratic and causes more branch mispredictions so is a net loss. | 
|---|
| 212 | * | 
|---|
| 213 | * @param      pi         The partitioning to use. | 
|---|
| 214 | * @param      blk        The block data to process. | 
|---|
| 215 | * @param[out] averages   The output averages. Unused partition indices will | 
|---|
| 216 | *                        not be initialized. | 
|---|
| 217 | */ | 
|---|
| 218 | static void compute_partition_averages_rgba( | 
|---|
| 219 | const partition_info& pi, | 
|---|
| 220 | const image_block& blk, | 
|---|
| 221 | vfloat4 averages[BLOCK_MAX_PARTITIONS] | 
|---|
| 222 | ) { | 
|---|
| 223 | unsigned int partition_count = pi.partition_count; | 
|---|
| 224 | unsigned int texel_count = blk.texel_count; | 
|---|
| 225 | promise(texel_count > 0); | 
|---|
| 226 |  | 
|---|
| 227 | // For 1 partition just use the precomputed mean | 
|---|
| 228 | if (partition_count == 1) | 
|---|
| 229 | { | 
|---|
| 230 | averages[0] = blk.data_mean; | 
|---|
| 231 | } | 
|---|
| 232 | // For 2 partitions scan results for partition 0, compute partition 1 | 
|---|
| 233 | else if (partition_count == 2) | 
|---|
| 234 | { | 
|---|
| 235 | vfloat4 pp_avg_rgba[4] {}; | 
|---|
| 236 |  | 
|---|
| 237 | vint lane_id = vint::lane_id(); | 
|---|
| 238 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) | 
|---|
| 239 | { | 
|---|
| 240 | vint texel_partition(pi.partition_of_texel + i); | 
|---|
| 241 |  | 
|---|
| 242 | vmask lane_mask = lane_id < vint(texel_count); | 
|---|
| 243 | lane_id += vint(ASTCENC_SIMD_WIDTH); | 
|---|
| 244 |  | 
|---|
| 245 | vmask p0_mask = lane_mask & (texel_partition == vint(0)); | 
|---|
| 246 |  | 
|---|
| 247 | vfloat data_r = loada(blk.data_r + i); | 
|---|
| 248 | haccumulate(pp_avg_rgba[0], data_r, p0_mask); | 
|---|
| 249 |  | 
|---|
| 250 | vfloat data_g = loada(blk.data_g + i); | 
|---|
| 251 | haccumulate(pp_avg_rgba[1], data_g, p0_mask); | 
|---|
| 252 |  | 
|---|
| 253 | vfloat data_b = loada(blk.data_b + i); | 
|---|
| 254 | haccumulate(pp_avg_rgba[2], data_b, p0_mask); | 
|---|
| 255 |  | 
|---|
| 256 | vfloat data_a = loada(blk.data_a + i); | 
|---|
| 257 | haccumulate(pp_avg_rgba[3], data_a, p0_mask); | 
|---|
| 258 | } | 
|---|
| 259 |  | 
|---|
| 260 | vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count); | 
|---|
| 261 |  | 
|---|
| 262 | vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0]), | 
|---|
| 263 | hadd_s(pp_avg_rgba[1]), | 
|---|
| 264 | hadd_s(pp_avg_rgba[2]), | 
|---|
| 265 | hadd_s(pp_avg_rgba[3])); | 
|---|
| 266 |  | 
|---|
| 267 | vfloat4 p1_total = block_total - p0_total; | 
|---|
| 268 |  | 
|---|
| 269 | averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); | 
|---|
| 270 | averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); | 
|---|
| 271 | } | 
|---|
| 272 | // For 3 partitions scan results for partition 0/1, compute partition 2 | 
|---|
| 273 | else if (partition_count == 3) | 
|---|
| 274 | { | 
|---|
| 275 | vfloat4 pp_avg_rgba[2][4] {}; | 
|---|
| 276 |  | 
|---|
| 277 | vint lane_id = vint::lane_id(); | 
|---|
| 278 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) | 
|---|
| 279 | { | 
|---|
| 280 | vint texel_partition(pi.partition_of_texel + i); | 
|---|
| 281 |  | 
|---|
| 282 | vmask lane_mask = lane_id < vint(texel_count); | 
|---|
| 283 | lane_id += vint(ASTCENC_SIMD_WIDTH); | 
|---|
| 284 |  | 
|---|
| 285 | vmask p0_mask = lane_mask & (texel_partition == vint(0)); | 
|---|
| 286 | vmask p1_mask = lane_mask & (texel_partition == vint(1)); | 
|---|
| 287 |  | 
|---|
| 288 | vfloat data_r = loada(blk.data_r + i); | 
|---|
| 289 | haccumulate(pp_avg_rgba[0][0], data_r, p0_mask); | 
|---|
| 290 | haccumulate(pp_avg_rgba[1][0], data_r, p1_mask); | 
|---|
| 291 |  | 
|---|
| 292 | vfloat data_g = loada(blk.data_g + i); | 
|---|
| 293 | haccumulate(pp_avg_rgba[0][1], data_g, p0_mask); | 
|---|
| 294 | haccumulate(pp_avg_rgba[1][1], data_g, p1_mask); | 
|---|
| 295 |  | 
|---|
| 296 | vfloat data_b = loada(blk.data_b + i); | 
|---|
| 297 | haccumulate(pp_avg_rgba[0][2], data_b, p0_mask); | 
|---|
| 298 | haccumulate(pp_avg_rgba[1][2], data_b, p1_mask); | 
|---|
| 299 |  | 
|---|
| 300 | vfloat data_a = loada(blk.data_a + i); | 
|---|
| 301 | haccumulate(pp_avg_rgba[0][3], data_a, p0_mask); | 
|---|
| 302 | haccumulate(pp_avg_rgba[1][3], data_a, p1_mask); | 
|---|
| 303 | } | 
|---|
| 304 |  | 
|---|
| 305 | vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count); | 
|---|
| 306 |  | 
|---|
| 307 | vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]), | 
|---|
| 308 | hadd_s(pp_avg_rgba[0][1]), | 
|---|
| 309 | hadd_s(pp_avg_rgba[0][2]), | 
|---|
| 310 | hadd_s(pp_avg_rgba[0][3])); | 
|---|
| 311 |  | 
|---|
| 312 | vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]), | 
|---|
| 313 | hadd_s(pp_avg_rgba[1][1]), | 
|---|
| 314 | hadd_s(pp_avg_rgba[1][2]), | 
|---|
| 315 | hadd_s(pp_avg_rgba[1][3])); | 
|---|
| 316 |  | 
|---|
| 317 | vfloat4 p2_total = block_total - p0_total - p1_total; | 
|---|
| 318 |  | 
|---|
| 319 | averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); | 
|---|
| 320 | averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); | 
|---|
| 321 | averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]); | 
|---|
| 322 | } | 
|---|
| 323 | else | 
|---|
| 324 | { | 
|---|
| 325 | // For 4 partitions scan results for partition 0/1/2, compute partition 3 | 
|---|
| 326 | vfloat4 pp_avg_rgba[3][4] {}; | 
|---|
| 327 |  | 
|---|
| 328 | vint lane_id = vint::lane_id(); | 
|---|
| 329 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) | 
|---|
| 330 | { | 
|---|
| 331 | vint texel_partition(pi.partition_of_texel + i); | 
|---|
| 332 |  | 
|---|
| 333 | vmask lane_mask = lane_id < vint(texel_count); | 
|---|
| 334 | lane_id += vint(ASTCENC_SIMD_WIDTH); | 
|---|
| 335 |  | 
|---|
| 336 | vmask p0_mask = lane_mask & (texel_partition == vint(0)); | 
|---|
| 337 | vmask p1_mask = lane_mask & (texel_partition == vint(1)); | 
|---|
| 338 | vmask p2_mask = lane_mask & (texel_partition == vint(2)); | 
|---|
| 339 |  | 
|---|
| 340 | vfloat data_r = loada(blk.data_r + i); | 
|---|
| 341 | haccumulate(pp_avg_rgba[0][0], data_r, p0_mask); | 
|---|
| 342 | haccumulate(pp_avg_rgba[1][0], data_r, p1_mask); | 
|---|
| 343 | haccumulate(pp_avg_rgba[2][0], data_r, p2_mask); | 
|---|
| 344 |  | 
|---|
| 345 | vfloat data_g = loada(blk.data_g + i); | 
|---|
| 346 | haccumulate(pp_avg_rgba[0][1], data_g, p0_mask); | 
|---|
| 347 | haccumulate(pp_avg_rgba[1][1], data_g, p1_mask); | 
|---|
| 348 | haccumulate(pp_avg_rgba[2][1], data_g, p2_mask); | 
|---|
| 349 |  | 
|---|
| 350 | vfloat data_b = loada(blk.data_b + i); | 
|---|
| 351 | haccumulate(pp_avg_rgba[0][2], data_b, p0_mask); | 
|---|
| 352 | haccumulate(pp_avg_rgba[1][2], data_b, p1_mask); | 
|---|
| 353 | haccumulate(pp_avg_rgba[2][2], data_b, p2_mask); | 
|---|
| 354 |  | 
|---|
| 355 | vfloat data_a = loada(blk.data_a + i); | 
|---|
| 356 | haccumulate(pp_avg_rgba[0][3], data_a, p0_mask); | 
|---|
| 357 | haccumulate(pp_avg_rgba[1][3], data_a, p1_mask); | 
|---|
| 358 | haccumulate(pp_avg_rgba[2][3], data_a, p2_mask); | 
|---|
| 359 | } | 
|---|
| 360 |  | 
|---|
| 361 | vfloat4 block_total = blk.data_mean * static_cast<float>(blk.texel_count); | 
|---|
| 362 |  | 
|---|
| 363 | vfloat4 p0_total = vfloat4(hadd_s(pp_avg_rgba[0][0]), | 
|---|
| 364 | hadd_s(pp_avg_rgba[0][1]), | 
|---|
| 365 | hadd_s(pp_avg_rgba[0][2]), | 
|---|
| 366 | hadd_s(pp_avg_rgba[0][3])); | 
|---|
| 367 |  | 
|---|
| 368 | vfloat4 p1_total = vfloat4(hadd_s(pp_avg_rgba[1][0]), | 
|---|
| 369 | hadd_s(pp_avg_rgba[1][1]), | 
|---|
| 370 | hadd_s(pp_avg_rgba[1][2]), | 
|---|
| 371 | hadd_s(pp_avg_rgba[1][3])); | 
|---|
| 372 |  | 
|---|
| 373 | vfloat4 p2_total = vfloat4(hadd_s(pp_avg_rgba[2][0]), | 
|---|
| 374 | hadd_s(pp_avg_rgba[2][1]), | 
|---|
| 375 | hadd_s(pp_avg_rgba[2][2]), | 
|---|
| 376 | hadd_s(pp_avg_rgba[2][3])); | 
|---|
| 377 |  | 
|---|
| 378 | vfloat4 p3_total = block_total - p0_total - p1_total- p2_total; | 
|---|
| 379 |  | 
|---|
| 380 | averages[0] = p0_total / static_cast<float>(pi.partition_texel_count[0]); | 
|---|
| 381 | averages[1] = p1_total / static_cast<float>(pi.partition_texel_count[1]); | 
|---|
| 382 | averages[2] = p2_total / static_cast<float>(pi.partition_texel_count[2]); | 
|---|
| 383 | averages[3] = p3_total / static_cast<float>(pi.partition_texel_count[3]); | 
|---|
| 384 | } | 
|---|
| 385 | } | 
|---|
| 386 |  | 
|---|
| 387 | /* See header for documentation. */ | 
|---|
| 388 | void compute_avgs_and_dirs_4_comp( | 
|---|
| 389 | const partition_info& pi, | 
|---|
| 390 | const image_block& blk, | 
|---|
| 391 | partition_metrics pm[BLOCK_MAX_PARTITIONS] | 
|---|
| 392 | ) { | 
|---|
| 393 | int partition_count = pi.partition_count; | 
|---|
| 394 | promise(partition_count > 0); | 
|---|
| 395 |  | 
|---|
| 396 | // Pre-compute partition_averages | 
|---|
| 397 | vfloat4 partition_averages[BLOCK_MAX_PARTITIONS]; | 
|---|
| 398 | compute_partition_averages_rgba(pi, blk, partition_averages); | 
|---|
| 399 |  | 
|---|
| 400 | for (int partition = 0; partition < partition_count; partition++) | 
|---|
| 401 | { | 
|---|
| 402 | const uint8_t *texel_indexes = pi.texels_of_partition[partition]; | 
|---|
| 403 | unsigned int texel_count = pi.partition_texel_count[partition]; | 
|---|
| 404 | promise(texel_count > 0); | 
|---|
| 405 |  | 
|---|
| 406 | vfloat4 average = partition_averages[partition]; | 
|---|
| 407 | pm[partition].avg = average; | 
|---|
| 408 |  | 
|---|
| 409 | vfloat4 sum_xp = vfloat4::zero(); | 
|---|
| 410 | vfloat4 sum_yp = vfloat4::zero(); | 
|---|
| 411 | vfloat4 sum_zp = vfloat4::zero(); | 
|---|
| 412 | vfloat4 sum_wp = vfloat4::zero(); | 
|---|
| 413 |  | 
|---|
| 414 | for (unsigned int i = 0; i < texel_count; i++) | 
|---|
| 415 | { | 
|---|
| 416 | unsigned int iwt = texel_indexes[i]; | 
|---|
| 417 | vfloat4 texel_datum = blk.texel(iwt); | 
|---|
| 418 | texel_datum = texel_datum - average; | 
|---|
| 419 |  | 
|---|
| 420 | vfloat4 zero = vfloat4::zero(); | 
|---|
| 421 |  | 
|---|
| 422 | vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; | 
|---|
| 423 | sum_xp += select(zero, texel_datum, tdm0); | 
|---|
| 424 |  | 
|---|
| 425 | vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; | 
|---|
| 426 | sum_yp += select(zero, texel_datum, tdm1); | 
|---|
| 427 |  | 
|---|
| 428 | vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero; | 
|---|
| 429 | sum_zp += select(zero, texel_datum, tdm2); | 
|---|
| 430 |  | 
|---|
| 431 | vmask4 tdm3 = texel_datum.swz<3,3,3,3>() > zero; | 
|---|
| 432 | sum_wp += select(zero, texel_datum, tdm3); | 
|---|
| 433 | } | 
|---|
| 434 |  | 
|---|
| 435 | vfloat4 prod_xp = dot(sum_xp, sum_xp); | 
|---|
| 436 | vfloat4 prod_yp = dot(sum_yp, sum_yp); | 
|---|
| 437 | vfloat4 prod_zp = dot(sum_zp, sum_zp); | 
|---|
| 438 | vfloat4 prod_wp = dot(sum_wp, sum_wp); | 
|---|
| 439 |  | 
|---|
| 440 | vfloat4 best_vector = sum_xp; | 
|---|
| 441 | vfloat4 best_sum = prod_xp; | 
|---|
| 442 |  | 
|---|
| 443 | vmask4 mask = prod_yp > best_sum; | 
|---|
| 444 | best_vector = select(best_vector, sum_yp, mask); | 
|---|
| 445 | best_sum = select(best_sum, prod_yp, mask); | 
|---|
| 446 |  | 
|---|
| 447 | mask = prod_zp > best_sum; | 
|---|
| 448 | best_vector = select(best_vector, sum_zp, mask); | 
|---|
| 449 | best_sum = select(best_sum, prod_zp, mask); | 
|---|
| 450 |  | 
|---|
| 451 | mask = prod_wp > best_sum; | 
|---|
| 452 | best_vector = select(best_vector, sum_wp, mask); | 
|---|
| 453 |  | 
|---|
| 454 | pm[partition].dir = best_vector; | 
|---|
| 455 | } | 
|---|
| 456 | } | 
|---|
| 457 |  | 
|---|
| 458 | /* See header for documentation. */ | 
|---|
| 459 | void compute_avgs_and_dirs_3_comp( | 
|---|
| 460 | const partition_info& pi, | 
|---|
| 461 | const image_block& blk, | 
|---|
| 462 | unsigned int omitted_component, | 
|---|
| 463 | partition_metrics pm[BLOCK_MAX_PARTITIONS] | 
|---|
| 464 | ) { | 
|---|
| 465 | // Pre-compute partition_averages | 
|---|
| 466 | vfloat4 partition_averages[BLOCK_MAX_PARTITIONS]; | 
|---|
| 467 | compute_partition_averages_rgba(pi, blk, partition_averages); | 
|---|
| 468 |  | 
|---|
| 469 | const float* data_vr = blk.data_r; | 
|---|
| 470 | const float* data_vg = blk.data_g; | 
|---|
| 471 | const float* data_vb = blk.data_b; | 
|---|
| 472 |  | 
|---|
| 473 | // TODO: Data-driven permute would be useful to avoid this ... | 
|---|
| 474 | if (omitted_component == 0) | 
|---|
| 475 | { | 
|---|
| 476 | partition_averages[0] = partition_averages[0].swz<1, 2, 3>(); | 
|---|
| 477 | partition_averages[1] = partition_averages[1].swz<1, 2, 3>(); | 
|---|
| 478 | partition_averages[2] = partition_averages[2].swz<1, 2, 3>(); | 
|---|
| 479 | partition_averages[3] = partition_averages[3].swz<1, 2, 3>(); | 
|---|
| 480 |  | 
|---|
| 481 | data_vr = blk.data_g; | 
|---|
| 482 | data_vg = blk.data_b; | 
|---|
| 483 | data_vb = blk.data_a; | 
|---|
| 484 | } | 
|---|
| 485 | else if (omitted_component == 1) | 
|---|
| 486 | { | 
|---|
| 487 | partition_averages[0] = partition_averages[0].swz<0, 2, 3>(); | 
|---|
| 488 | partition_averages[1] = partition_averages[1].swz<0, 2, 3>(); | 
|---|
| 489 | partition_averages[2] = partition_averages[2].swz<0, 2, 3>(); | 
|---|
| 490 | partition_averages[3] = partition_averages[3].swz<0, 2, 3>(); | 
|---|
| 491 |  | 
|---|
| 492 | data_vg = blk.data_b; | 
|---|
| 493 | data_vb = blk.data_a; | 
|---|
| 494 | } | 
|---|
| 495 | else if (omitted_component == 2) | 
|---|
| 496 | { | 
|---|
| 497 | partition_averages[0] = partition_averages[0].swz<0, 1, 3>(); | 
|---|
| 498 | partition_averages[1] = partition_averages[1].swz<0, 1, 3>(); | 
|---|
| 499 | partition_averages[2] = partition_averages[2].swz<0, 1, 3>(); | 
|---|
| 500 | partition_averages[3] = partition_averages[3].swz<0, 1, 3>(); | 
|---|
| 501 |  | 
|---|
| 502 | data_vb = blk.data_a; | 
|---|
| 503 | } | 
|---|
| 504 | else | 
|---|
| 505 | { | 
|---|
| 506 | partition_averages[0] = partition_averages[0].swz<0, 1, 2>(); | 
|---|
| 507 | partition_averages[1] = partition_averages[1].swz<0, 1, 2>(); | 
|---|
| 508 | partition_averages[2] = partition_averages[2].swz<0, 1, 2>(); | 
|---|
| 509 | partition_averages[3] = partition_averages[3].swz<0, 1, 2>(); | 
|---|
| 510 | } | 
|---|
| 511 |  | 
|---|
| 512 | unsigned int partition_count = pi.partition_count; | 
|---|
| 513 | promise(partition_count > 0); | 
|---|
| 514 |  | 
|---|
| 515 | for (unsigned int partition = 0; partition < partition_count; partition++) | 
|---|
| 516 | { | 
|---|
| 517 | const uint8_t *texel_indexes = pi.texels_of_partition[partition]; | 
|---|
| 518 | unsigned int texel_count = pi.partition_texel_count[partition]; | 
|---|
| 519 | promise(texel_count > 0); | 
|---|
| 520 |  | 
|---|
| 521 | vfloat4 average = partition_averages[partition]; | 
|---|
| 522 | pm[partition].avg = average; | 
|---|
| 523 |  | 
|---|
| 524 | vfloat4 sum_xp = vfloat4::zero(); | 
|---|
| 525 | vfloat4 sum_yp = vfloat4::zero(); | 
|---|
| 526 | vfloat4 sum_zp = vfloat4::zero(); | 
|---|
| 527 |  | 
|---|
| 528 | for (unsigned int i = 0; i < texel_count; i++) | 
|---|
| 529 | { | 
|---|
| 530 | unsigned int iwt = texel_indexes[i]; | 
|---|
| 531 |  | 
|---|
| 532 | vfloat4 texel_datum = vfloat3(data_vr[iwt], | 
|---|
| 533 | data_vg[iwt], | 
|---|
| 534 | data_vb[iwt]); | 
|---|
| 535 | texel_datum = texel_datum - average; | 
|---|
| 536 |  | 
|---|
| 537 | vfloat4 zero = vfloat4::zero(); | 
|---|
| 538 |  | 
|---|
| 539 | vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; | 
|---|
| 540 | sum_xp += select(zero, texel_datum, tdm0); | 
|---|
| 541 |  | 
|---|
| 542 | vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; | 
|---|
| 543 | sum_yp += select(zero, texel_datum, tdm1); | 
|---|
| 544 |  | 
|---|
| 545 | vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero; | 
|---|
| 546 | sum_zp += select(zero, texel_datum, tdm2); | 
|---|
| 547 | } | 
|---|
| 548 |  | 
|---|
| 549 | vfloat4 prod_xp = dot(sum_xp, sum_xp); | 
|---|
| 550 | vfloat4 prod_yp = dot(sum_yp, sum_yp); | 
|---|
| 551 | vfloat4 prod_zp = dot(sum_zp, sum_zp); | 
|---|
| 552 |  | 
|---|
| 553 | vfloat4 best_vector = sum_xp; | 
|---|
| 554 | vfloat4 best_sum = prod_xp; | 
|---|
| 555 |  | 
|---|
| 556 | vmask4 mask = prod_yp > best_sum; | 
|---|
| 557 | best_vector = select(best_vector, sum_yp, mask); | 
|---|
| 558 | best_sum = select(best_sum, prod_yp, mask); | 
|---|
| 559 |  | 
|---|
| 560 | mask = prod_zp > best_sum; | 
|---|
| 561 | best_vector = select(best_vector, sum_zp, mask); | 
|---|
| 562 |  | 
|---|
| 563 | pm[partition].dir = best_vector; | 
|---|
| 564 | } | 
|---|
| 565 | } | 
|---|
| 566 |  | 
|---|
| 567 | /* See header for documentation. */ | 
|---|
| 568 | void compute_avgs_and_dirs_3_comp_rgb( | 
|---|
| 569 | const partition_info& pi, | 
|---|
| 570 | const image_block& blk, | 
|---|
| 571 | partition_metrics pm[BLOCK_MAX_PARTITIONS] | 
|---|
| 572 | ) { | 
|---|
| 573 | unsigned int partition_count = pi.partition_count; | 
|---|
| 574 | promise(partition_count > 0); | 
|---|
| 575 |  | 
|---|
| 576 | // Pre-compute partition_averages | 
|---|
| 577 | vfloat4 partition_averages[BLOCK_MAX_PARTITIONS]; | 
|---|
| 578 | compute_partition_averages_rgb(pi, blk, partition_averages); | 
|---|
| 579 |  | 
|---|
| 580 | for (unsigned int partition = 0; partition < partition_count; partition++) | 
|---|
| 581 | { | 
|---|
| 582 | const uint8_t *texel_indexes = pi.texels_of_partition[partition]; | 
|---|
| 583 | unsigned int texel_count = pi.partition_texel_count[partition]; | 
|---|
| 584 | promise(texel_count > 0); | 
|---|
| 585 |  | 
|---|
| 586 | vfloat4 average = partition_averages[partition]; | 
|---|
| 587 | pm[partition].avg = average; | 
|---|
| 588 |  | 
|---|
| 589 | vfloat4 sum_xp = vfloat4::zero(); | 
|---|
| 590 | vfloat4 sum_yp = vfloat4::zero(); | 
|---|
| 591 | vfloat4 sum_zp = vfloat4::zero(); | 
|---|
| 592 |  | 
|---|
| 593 | for (unsigned int i = 0; i < texel_count; i++) | 
|---|
| 594 | { | 
|---|
| 595 | unsigned int iwt = texel_indexes[i]; | 
|---|
| 596 |  | 
|---|
| 597 | vfloat4 texel_datum = blk.texel3(iwt); | 
|---|
| 598 | texel_datum = texel_datum - average; | 
|---|
| 599 |  | 
|---|
| 600 | vfloat4 zero = vfloat4::zero(); | 
|---|
| 601 |  | 
|---|
| 602 | vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; | 
|---|
| 603 | sum_xp += select(zero, texel_datum, tdm0); | 
|---|
| 604 |  | 
|---|
| 605 | vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; | 
|---|
| 606 | sum_yp += select(zero, texel_datum, tdm1); | 
|---|
| 607 |  | 
|---|
| 608 | vmask4 tdm2 = texel_datum.swz<2,2,2,2>() > zero; | 
|---|
| 609 | sum_zp += select(zero, texel_datum, tdm2); | 
|---|
| 610 | } | 
|---|
| 611 |  | 
|---|
| 612 | vfloat4 prod_xp = dot(sum_xp, sum_xp); | 
|---|
| 613 | vfloat4 prod_yp = dot(sum_yp, sum_yp); | 
|---|
| 614 | vfloat4 prod_zp = dot(sum_zp, sum_zp); | 
|---|
| 615 |  | 
|---|
| 616 | vfloat4 best_vector = sum_xp; | 
|---|
| 617 | vfloat4 best_sum = prod_xp; | 
|---|
| 618 |  | 
|---|
| 619 | vmask4 mask = prod_yp > best_sum; | 
|---|
| 620 | best_vector = select(best_vector, sum_yp, mask); | 
|---|
| 621 | best_sum = select(best_sum, prod_yp, mask); | 
|---|
| 622 |  | 
|---|
| 623 | mask = prod_zp > best_sum; | 
|---|
| 624 | best_vector = select(best_vector, sum_zp, mask); | 
|---|
| 625 |  | 
|---|
| 626 | pm[partition].dir = best_vector; | 
|---|
| 627 | } | 
|---|
| 628 | } | 
|---|
| 629 |  | 
|---|
| 630 | /* See header for documentation. */ | 
|---|
| 631 | void compute_avgs_and_dirs_2_comp( | 
|---|
| 632 | const partition_info& pt, | 
|---|
| 633 | const image_block& blk, | 
|---|
| 634 | unsigned int component1, | 
|---|
| 635 | unsigned int component2, | 
|---|
| 636 | partition_metrics pm[BLOCK_MAX_PARTITIONS] | 
|---|
| 637 | ) { | 
|---|
| 638 | vfloat4 average; | 
|---|
| 639 |  | 
|---|
| 640 | const float* data_vr = nullptr; | 
|---|
| 641 | const float* data_vg = nullptr; | 
|---|
| 642 |  | 
|---|
| 643 | if (component1 == 0 && component2 == 1) | 
|---|
| 644 | { | 
|---|
| 645 | average = blk.data_mean.swz<0, 1>(); | 
|---|
| 646 |  | 
|---|
| 647 | data_vr = blk.data_r; | 
|---|
| 648 | data_vg = blk.data_g; | 
|---|
| 649 | } | 
|---|
| 650 | else if (component1 == 0 && component2 == 2) | 
|---|
| 651 | { | 
|---|
| 652 | average = blk.data_mean.swz<0, 2>(); | 
|---|
| 653 |  | 
|---|
| 654 | data_vr = blk.data_r; | 
|---|
| 655 | data_vg = blk.data_b; | 
|---|
| 656 | } | 
|---|
| 657 | else // (component1 == 1 && component2 == 2) | 
|---|
| 658 | { | 
|---|
| 659 | assert(component1 == 1 && component2 == 2); | 
|---|
| 660 |  | 
|---|
| 661 | average = blk.data_mean.swz<1, 2>(); | 
|---|
| 662 |  | 
|---|
| 663 | data_vr = blk.data_g; | 
|---|
| 664 | data_vg = blk.data_b; | 
|---|
| 665 | } | 
|---|
| 666 |  | 
|---|
| 667 | unsigned int partition_count = pt.partition_count; | 
|---|
| 668 | promise(partition_count > 0); | 
|---|
| 669 |  | 
|---|
| 670 | for (unsigned int partition = 0; partition < partition_count; partition++) | 
|---|
| 671 | { | 
|---|
| 672 | const uint8_t *texel_indexes = pt.texels_of_partition[partition]; | 
|---|
| 673 | unsigned int texel_count = pt.partition_texel_count[partition]; | 
|---|
| 674 | promise(texel_count > 0); | 
|---|
| 675 |  | 
|---|
| 676 | // Only compute a partition mean if more than one partition | 
|---|
| 677 | if (partition_count > 1) | 
|---|
| 678 | { | 
|---|
| 679 | average = vfloat4::zero(); | 
|---|
| 680 | for (unsigned int i = 0; i < texel_count; i++) | 
|---|
| 681 | { | 
|---|
| 682 | unsigned int iwt = texel_indexes[i]; | 
|---|
| 683 | average += vfloat2(data_vr[iwt], data_vg[iwt]); | 
|---|
| 684 | } | 
|---|
| 685 |  | 
|---|
| 686 | average = average / static_cast<float>(texel_count); | 
|---|
| 687 | } | 
|---|
| 688 |  | 
|---|
| 689 | pm[partition].avg = average; | 
|---|
| 690 |  | 
|---|
| 691 | vfloat4 sum_xp = vfloat4::zero(); | 
|---|
| 692 | vfloat4 sum_yp = vfloat4::zero(); | 
|---|
| 693 |  | 
|---|
| 694 | for (unsigned int i = 0; i < texel_count; i++) | 
|---|
| 695 | { | 
|---|
| 696 | unsigned int iwt = texel_indexes[i]; | 
|---|
| 697 | vfloat4 texel_datum = vfloat2(data_vr[iwt], data_vg[iwt]); | 
|---|
| 698 | texel_datum = texel_datum - average; | 
|---|
| 699 |  | 
|---|
| 700 | vfloat4 zero = vfloat4::zero(); | 
|---|
| 701 |  | 
|---|
| 702 | vmask4 tdm0 = texel_datum.swz<0,0,0,0>() > zero; | 
|---|
| 703 | sum_xp += select(zero, texel_datum, tdm0); | 
|---|
| 704 |  | 
|---|
| 705 | vmask4 tdm1 = texel_datum.swz<1,1,1,1>() > zero; | 
|---|
| 706 | sum_yp += select(zero, texel_datum, tdm1); | 
|---|
| 707 | } | 
|---|
| 708 |  | 
|---|
| 709 | vfloat4 prod_xp = dot(sum_xp, sum_xp); | 
|---|
| 710 | vfloat4 prod_yp = dot(sum_yp, sum_yp); | 
|---|
| 711 |  | 
|---|
| 712 | vfloat4 best_vector = sum_xp; | 
|---|
| 713 | vfloat4 best_sum = prod_xp; | 
|---|
| 714 |  | 
|---|
| 715 | vmask4 mask = prod_yp > best_sum; | 
|---|
| 716 | best_vector = select(best_vector, sum_yp, mask); | 
|---|
| 717 |  | 
|---|
| 718 | pm[partition].dir = best_vector; | 
|---|
| 719 | } | 
|---|
| 720 | } | 
|---|
| 721 |  | 
|---|
| 722 | /* See header for documentation. */ | 
|---|
| 723 | void compute_error_squared_rgba( | 
|---|
| 724 | const partition_info& pi, | 
|---|
| 725 | const image_block& blk, | 
|---|
| 726 | const processed_line4 uncor_plines[BLOCK_MAX_PARTITIONS], | 
|---|
| 727 | const processed_line4 samec_plines[BLOCK_MAX_PARTITIONS], | 
|---|
| 728 | float line_lengths[BLOCK_MAX_PARTITIONS], | 
|---|
| 729 | float& uncor_error, | 
|---|
| 730 | float& samec_error | 
|---|
| 731 | ) { | 
|---|
| 732 | unsigned int partition_count = pi.partition_count; | 
|---|
| 733 | promise(partition_count > 0); | 
|---|
| 734 |  | 
|---|
| 735 | vfloatacc uncor_errorsumv = vfloatacc::zero(); | 
|---|
| 736 | vfloatacc samec_errorsumv = vfloatacc::zero(); | 
|---|
| 737 |  | 
|---|
| 738 | for (unsigned int partition = 0; partition < partition_count; partition++) | 
|---|
| 739 | { | 
|---|
| 740 | const uint8_t *texel_indexes = pi.texels_of_partition[partition]; | 
|---|
| 741 |  | 
|---|
| 742 | processed_line4 l_uncor = uncor_plines[partition]; | 
|---|
| 743 | processed_line4 l_samec = samec_plines[partition]; | 
|---|
| 744 |  | 
|---|
| 745 | unsigned int texel_count = pi.partition_texel_count[partition]; | 
|---|
| 746 | promise(texel_count > 0); | 
|---|
| 747 |  | 
|---|
| 748 | // Vectorize some useful scalar inputs | 
|---|
| 749 | vfloat l_uncor_bs0(l_uncor.bs.lane<0>()); | 
|---|
| 750 | vfloat l_uncor_bs1(l_uncor.bs.lane<1>()); | 
|---|
| 751 | vfloat l_uncor_bs2(l_uncor.bs.lane<2>()); | 
|---|
| 752 | vfloat l_uncor_bs3(l_uncor.bs.lane<3>()); | 
|---|
| 753 |  | 
|---|
| 754 | vfloat l_uncor_amod0(l_uncor.amod.lane<0>()); | 
|---|
| 755 | vfloat l_uncor_amod1(l_uncor.amod.lane<1>()); | 
|---|
| 756 | vfloat l_uncor_amod2(l_uncor.amod.lane<2>()); | 
|---|
| 757 | vfloat l_uncor_amod3(l_uncor.amod.lane<3>()); | 
|---|
| 758 |  | 
|---|
| 759 | vfloat l_samec_bs0(l_samec.bs.lane<0>()); | 
|---|
| 760 | vfloat l_samec_bs1(l_samec.bs.lane<1>()); | 
|---|
| 761 | vfloat l_samec_bs2(l_samec.bs.lane<2>()); | 
|---|
| 762 | vfloat l_samec_bs3(l_samec.bs.lane<3>()); | 
|---|
| 763 |  | 
|---|
| 764 | assert(all(l_samec.amod == vfloat4(0.0f))); | 
|---|
| 765 |  | 
|---|
| 766 | vfloat uncor_loparamv(1e10f); | 
|---|
| 767 | vfloat uncor_hiparamv(-1e10f); | 
|---|
| 768 |  | 
|---|
| 769 | vfloat ew_r(blk.channel_weight.lane<0>()); | 
|---|
| 770 | vfloat ew_g(blk.channel_weight.lane<1>()); | 
|---|
| 771 | vfloat ew_b(blk.channel_weight.lane<2>()); | 
|---|
| 772 | vfloat ew_a(blk.channel_weight.lane<3>()); | 
|---|
| 773 |  | 
|---|
| 774 | // This implementation over-shoots, but this is safe as we initialize the texel_indexes | 
|---|
| 775 | // array to extend the last value. This means min/max are not impacted, but we need to mask | 
|---|
| 776 | // out the dummy values when we compute the line weighting. | 
|---|
| 777 | vint lane_ids = vint::lane_id(); | 
|---|
| 778 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) | 
|---|
| 779 | { | 
|---|
| 780 | vmask mask = lane_ids < vint(texel_count); | 
|---|
| 781 | vint texel_idxs(texel_indexes + i); | 
|---|
| 782 |  | 
|---|
| 783 | vfloat data_r = gatherf(blk.data_r, texel_idxs); | 
|---|
| 784 | vfloat data_g = gatherf(blk.data_g, texel_idxs); | 
|---|
| 785 | vfloat data_b = gatherf(blk.data_b, texel_idxs); | 
|---|
| 786 | vfloat data_a = gatherf(blk.data_a, texel_idxs); | 
|---|
| 787 |  | 
|---|
| 788 | vfloat uncor_param = (data_r * l_uncor_bs0) | 
|---|
| 789 | + (data_g * l_uncor_bs1) | 
|---|
| 790 | + (data_b * l_uncor_bs2) | 
|---|
| 791 | + (data_a * l_uncor_bs3); | 
|---|
| 792 |  | 
|---|
| 793 | uncor_loparamv = min(uncor_param, uncor_loparamv); | 
|---|
| 794 | uncor_hiparamv = max(uncor_param, uncor_hiparamv); | 
|---|
| 795 |  | 
|---|
| 796 | vfloat uncor_dist0 = (l_uncor_amod0 - data_r) | 
|---|
| 797 | + (uncor_param * l_uncor_bs0); | 
|---|
| 798 | vfloat uncor_dist1 = (l_uncor_amod1 - data_g) | 
|---|
| 799 | + (uncor_param * l_uncor_bs1); | 
|---|
| 800 | vfloat uncor_dist2 = (l_uncor_amod2 - data_b) | 
|---|
| 801 | + (uncor_param * l_uncor_bs2); | 
|---|
| 802 | vfloat uncor_dist3 = (l_uncor_amod3 - data_a) | 
|---|
| 803 | + (uncor_param * l_uncor_bs3); | 
|---|
| 804 |  | 
|---|
| 805 | vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0) | 
|---|
| 806 | + (ew_g * uncor_dist1 * uncor_dist1) | 
|---|
| 807 | + (ew_b * uncor_dist2 * uncor_dist2) | 
|---|
| 808 | + (ew_a * uncor_dist3 * uncor_dist3); | 
|---|
| 809 |  | 
|---|
| 810 | haccumulate(uncor_errorsumv, uncor_err, mask); | 
|---|
| 811 |  | 
|---|
| 812 | // Process samechroma data | 
|---|
| 813 | vfloat samec_param = (data_r * l_samec_bs0) | 
|---|
| 814 | + (data_g * l_samec_bs1) | 
|---|
| 815 | + (data_b * l_samec_bs2) | 
|---|
| 816 | + (data_a * l_samec_bs3); | 
|---|
| 817 |  | 
|---|
| 818 | vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r; | 
|---|
| 819 | vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g; | 
|---|
| 820 | vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b; | 
|---|
| 821 | vfloat samec_dist3 = samec_param * l_samec_bs3 - data_a; | 
|---|
| 822 |  | 
|---|
| 823 | vfloat samec_err = (ew_r * samec_dist0 * samec_dist0) | 
|---|
| 824 | + (ew_g * samec_dist1 * samec_dist1) | 
|---|
| 825 | + (ew_b * samec_dist2 * samec_dist2) | 
|---|
| 826 | + (ew_a * samec_dist3 * samec_dist3); | 
|---|
| 827 |  | 
|---|
| 828 | haccumulate(samec_errorsumv, samec_err, mask); | 
|---|
| 829 |  | 
|---|
| 830 | lane_ids += vint(ASTCENC_SIMD_WIDTH); | 
|---|
| 831 | } | 
|---|
| 832 |  | 
|---|
| 833 | // Turn very small numbers and NaNs into a small number | 
|---|
| 834 | float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv); | 
|---|
| 835 | line_lengths[partition] = astc::max(uncor_linelen, 1e-7f); | 
|---|
| 836 | } | 
|---|
| 837 |  | 
|---|
| 838 | uncor_error = hadd_s(uncor_errorsumv); | 
|---|
| 839 | samec_error = hadd_s(samec_errorsumv); | 
|---|
| 840 | } | 
|---|
| 841 |  | 
|---|
| 842 | /* See header for documentation. */ | 
|---|
| 843 | void compute_error_squared_rgb( | 
|---|
| 844 | const partition_info& pi, | 
|---|
| 845 | const image_block& blk, | 
|---|
| 846 | partition_lines3 plines[BLOCK_MAX_PARTITIONS], | 
|---|
| 847 | float& uncor_error, | 
|---|
| 848 | float& samec_error | 
|---|
| 849 | ) { | 
|---|
| 850 | unsigned int partition_count = pi.partition_count; | 
|---|
| 851 | promise(partition_count > 0); | 
|---|
| 852 |  | 
|---|
| 853 | vfloatacc uncor_errorsumv = vfloatacc::zero(); | 
|---|
| 854 | vfloatacc samec_errorsumv = vfloatacc::zero(); | 
|---|
| 855 |  | 
|---|
| 856 | for (unsigned int partition = 0; partition < partition_count; partition++) | 
|---|
| 857 | { | 
|---|
| 858 | partition_lines3& pl = plines[partition]; | 
|---|
| 859 | const uint8_t *texel_indexes = pi.texels_of_partition[partition]; | 
|---|
| 860 | unsigned int texel_count = pi.partition_texel_count[partition]; | 
|---|
| 861 | promise(texel_count > 0); | 
|---|
| 862 |  | 
|---|
| 863 | processed_line3 l_uncor = pl.uncor_pline; | 
|---|
| 864 | processed_line3 l_samec = pl.samec_pline; | 
|---|
| 865 |  | 
|---|
| 866 | // Vectorize some useful scalar inputs | 
|---|
| 867 | vfloat l_uncor_bs0(l_uncor.bs.lane<0>()); | 
|---|
| 868 | vfloat l_uncor_bs1(l_uncor.bs.lane<1>()); | 
|---|
| 869 | vfloat l_uncor_bs2(l_uncor.bs.lane<2>()); | 
|---|
| 870 |  | 
|---|
| 871 | vfloat l_uncor_amod0(l_uncor.amod.lane<0>()); | 
|---|
| 872 | vfloat l_uncor_amod1(l_uncor.amod.lane<1>()); | 
|---|
| 873 | vfloat l_uncor_amod2(l_uncor.amod.lane<2>()); | 
|---|
| 874 |  | 
|---|
| 875 | vfloat l_samec_bs0(l_samec.bs.lane<0>()); | 
|---|
| 876 | vfloat l_samec_bs1(l_samec.bs.lane<1>()); | 
|---|
| 877 | vfloat l_samec_bs2(l_samec.bs.lane<2>()); | 
|---|
| 878 |  | 
|---|
| 879 | assert(all(l_samec.amod == vfloat4(0.0f))); | 
|---|
| 880 |  | 
|---|
| 881 | vfloat uncor_loparamv(1e10f); | 
|---|
| 882 | vfloat uncor_hiparamv(-1e10f); | 
|---|
| 883 |  | 
|---|
| 884 | vfloat ew_r(blk.channel_weight.lane<0>()); | 
|---|
| 885 | vfloat ew_g(blk.channel_weight.lane<1>()); | 
|---|
| 886 | vfloat ew_b(blk.channel_weight.lane<2>()); | 
|---|
| 887 |  | 
|---|
| 888 | // This implementation over-shoots, but this is safe as we initialize the weights array | 
|---|
| 889 | // to extend the last value. This means min/max are not impacted, but we need to mask | 
|---|
| 890 | // out the dummy values when we compute the line weighting. | 
|---|
| 891 | vint lane_ids = vint::lane_id(); | 
|---|
| 892 | for (unsigned int i = 0; i < texel_count; i += ASTCENC_SIMD_WIDTH) | 
|---|
| 893 | { | 
|---|
| 894 | vmask mask = lane_ids < vint(texel_count); | 
|---|
| 895 | vint texel_idxs(texel_indexes + i); | 
|---|
| 896 |  | 
|---|
| 897 | vfloat data_r = gatherf(blk.data_r, texel_idxs); | 
|---|
| 898 | vfloat data_g = gatherf(blk.data_g, texel_idxs); | 
|---|
| 899 | vfloat data_b = gatherf(blk.data_b, texel_idxs); | 
|---|
| 900 |  | 
|---|
| 901 | vfloat uncor_param = (data_r * l_uncor_bs0) | 
|---|
| 902 | + (data_g * l_uncor_bs1) | 
|---|
| 903 | + (data_b * l_uncor_bs2); | 
|---|
| 904 |  | 
|---|
| 905 | uncor_loparamv = min(uncor_param, uncor_loparamv); | 
|---|
| 906 | uncor_hiparamv = max(uncor_param, uncor_hiparamv); | 
|---|
| 907 |  | 
|---|
| 908 | vfloat uncor_dist0 = (l_uncor_amod0 - data_r) | 
|---|
| 909 | + (uncor_param * l_uncor_bs0); | 
|---|
| 910 | vfloat uncor_dist1 = (l_uncor_amod1 - data_g) | 
|---|
| 911 | + (uncor_param * l_uncor_bs1); | 
|---|
| 912 | vfloat uncor_dist2 = (l_uncor_amod2 - data_b) | 
|---|
| 913 | + (uncor_param * l_uncor_bs2); | 
|---|
| 914 |  | 
|---|
| 915 | vfloat uncor_err = (ew_r * uncor_dist0 * uncor_dist0) | 
|---|
| 916 | + (ew_g * uncor_dist1 * uncor_dist1) | 
|---|
| 917 | + (ew_b * uncor_dist2 * uncor_dist2); | 
|---|
| 918 |  | 
|---|
| 919 | haccumulate(uncor_errorsumv, uncor_err, mask); | 
|---|
| 920 |  | 
|---|
| 921 | // Process samechroma data | 
|---|
| 922 | vfloat samec_param = (data_r * l_samec_bs0) | 
|---|
| 923 | + (data_g * l_samec_bs1) | 
|---|
| 924 | + (data_b * l_samec_bs2); | 
|---|
| 925 |  | 
|---|
| 926 | vfloat samec_dist0 = samec_param * l_samec_bs0 - data_r; | 
|---|
| 927 | vfloat samec_dist1 = samec_param * l_samec_bs1 - data_g; | 
|---|
| 928 | vfloat samec_dist2 = samec_param * l_samec_bs2 - data_b; | 
|---|
| 929 |  | 
|---|
| 930 | vfloat samec_err = (ew_r * samec_dist0 * samec_dist0) | 
|---|
| 931 | + (ew_g * samec_dist1 * samec_dist1) | 
|---|
| 932 | + (ew_b * samec_dist2 * samec_dist2); | 
|---|
| 933 |  | 
|---|
| 934 | haccumulate(samec_errorsumv, samec_err, mask); | 
|---|
| 935 |  | 
|---|
| 936 | lane_ids += vint(ASTCENC_SIMD_WIDTH); | 
|---|
| 937 | } | 
|---|
| 938 |  | 
|---|
| 939 | // Turn very small numbers and NaNs into a small number | 
|---|
| 940 | float uncor_linelen = hmax_s(uncor_hiparamv) - hmin_s(uncor_loparamv); | 
|---|
| 941 | pl.line_length = astc::max(uncor_linelen, 1e-7f); | 
|---|
| 942 | } | 
|---|
| 943 |  | 
|---|
| 944 | uncor_error = hadd_s(uncor_errorsumv); | 
|---|
| 945 | samec_error = hadd_s(samec_errorsumv); | 
|---|
| 946 | } | 
|---|
| 947 |  | 
|---|
| 948 | #endif | 
|---|
| 949 |  | 
|---|