// Copyright 2015 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// MIPS version of lossless functions
//
// Author(s): Djordje Pesut    (djordje.pesut@imgtec.com)
//            Jovan Zelincevic (jovan.zelincevic@imgtec.com)

#include "./dsp.h"
#include "./lossless.h"
#include "./lossless_common.h"

#if defined(WEBP_USE_MIPS32)

#include <assert.h>
#include <math.h>
#include <stdlib.h>
#include <string.h>

static float FastSLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y, correction;
    const int c24 = 24;
    const float v_f = (float)v;
    uint32_t temp;

    // Split v as v = (2^log_cnt) * Xf, where y = 2^log_cnt and Xf < 256 (2^8).
    // log_cnt is the index of the leading one in the upper 24 bits of v.
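    // A C sketch of what the asm below computes in one pass (v is known to be
    // in [256, 2^24) here, so 'temp' lands in [128, 255]):
    //   log_cnt = 24 - __builtin_clz(v);
    //   y       = 1u << log_cnt;
    //   temp    = v >> log_cnt;   // lookup index into kLog2Table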
    __asm__ volatile(
      "clz      %[log_cnt], %[v]                       \n\t"
      "addiu    %[y],       $zero,        1            \n\t"
      "subu     %[log_cnt], %[c24],       %[log_cnt]   \n\t"
      "sllv     %[y],       %[y],         %[log_cnt]   \n\t"
      "srlv     %[temp],    %[v],         %[log_cnt]   \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
    // Xf = floor(Xf) * (1 + (v % y) / v), since y * floor(Xf) ~ v
    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
    // Correction factor: log(1 + d) ~ d for small d, so
    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y) / v
    // LOG_2_RECIPROCAL ~ 23/16
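    // Worked example: v = 1001 gives log_cnt = 2, y = 4, temp = 250 and
    // v & (y - 1) = 1, so correction = (23 * 1) >> 4 = 1; the result,
    // 1001 * (log2(250) + 2) + 1 ~ 9976.7, is close to the exact
    // 1001 * log2(1001) ~ 9977.2.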

    // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
    correction = (23 * (v & (y - 1))) >> 4;
    return v_f * (kLog2Table[temp] + log_cnt) + correction;
  } else {
    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
  }
}

static float FastLog2Slow(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y;
    const int c24 = 24;
    double log_2;
    uint32_t temp;

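    // Same normalization as in FastSLog2Slow above:
    //   log_cnt = 24 - clz(v);  y = 1 << log_cnt;  temp = v >> log_cnt;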
    __asm__ volatile(
      "clz      %[log_cnt], %[v]                       \n\t"
      "addiu    %[y],       $zero,        1            \n\t"
      "subu     %[log_cnt], %[c24],       %[log_cnt]   \n\t"
      "sllv     %[y],       %[y],         %[log_cnt]   \n\t"
      "srlv     %[temp],    %[v],         %[log_cnt]   \n\t"
      : [log_cnt]"=&r"(log_cnt), [y]"=&r"(y),
        [temp]"=r"(temp)
      : [c24]"r"(c24), [v]"r"(v)
    );

    log_2 = kLog2Table[temp] + log_cnt;
    if (v >= APPROX_LOG_MAX) {
      // Since the division is still expensive, add this correction factor
      // only for large values of 'v'.
      const uint32_t correction = (23 * (v & (y - 1))) >> 4;
      log_2 += (double)correction / v;
    }
    return (float)log_2;
  } else {
    return (float)(LOG_2_RECIPROCAL * log((double)v));
  }
}

// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pop = &population[4];
//   const uint32_t* LoopEnd = &population[length];
//   while (pop != LoopEnd) {
//     ++i;
//     cost += i * *pop;
//     cost += i * *(pop + 1);
//     pop += 2;
//   }
//   return (double)cost;
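// The asm below keeps 'cost' in the 64-bit HI/LO accumulator pair: 'madd'
// multiply-accumulates i * counts into it, and mfhi/mflo read it back.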
static double ExtraCost(const uint32_t* const population, int length) {
  int i, temp0, temp1;
  const uint32_t* pop = &population[4];
  const uint32_t* const LoopEnd = &population[length];

  __asm__ volatile(
    "mult   $zero,    $zero                  \n\t"
    "xor    %[i],     %[i],       %[i]       \n\t"
    "beq    %[pop],   %[LoopEnd], 2f         \n\t"
  "1:                                        \n\t"
    "lw     %[temp0], 0(%[pop])              \n\t"
    "lw     %[temp1], 4(%[pop])              \n\t"
    "addiu  %[i],     %[i],       1          \n\t"
    "addiu  %[pop],   %[pop],     8          \n\t"
    "madd   %[i],     %[temp0]               \n\t"
    "madd   %[i],     %[temp1]               \n\t"
    "bne    %[pop],   %[LoopEnd], 1b         \n\t"
  "2:                                        \n\t"
    "mfhi   %[temp0]                         \n\t"
    "mflo   %[temp1]                         \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [i]"=&r"(i), [pop]"+r"(pop)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );

  // Cast the low word to uint32_t so it is not sign-extended when widened.
  return (double)(((int64_t)temp0 << 32) | (uint32_t)temp1);
}

// C version of this function:
//   int i = 0;
//   int64_t cost = 0;
//   const uint32_t* pX = &X[4];
//   const uint32_t* pY = &Y[4];
//   const uint32_t* LoopEnd = &X[length];
//   while (pX != LoopEnd) {
//     const uint32_t xy0 = *pX + *pY;
//     const uint32_t xy1 = *(pX + 1) + *(pY + 1);
//     ++i;
//     cost += i * xy0;
//     cost += i * xy1;
//     pX += 2;
//     pY += 2;
//   }
//   return (double)cost;
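// Same HI/LO multiply-accumulate technique as in ExtraCost above, with the
// two histogram entries summed on the fly before each 'madd'.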
static double ExtraCostCombined(const uint32_t* const X,
                                const uint32_t* const Y, int length) {
  int i, temp0, temp1, temp2, temp3;
  const uint32_t* pX = &X[4];
  const uint32_t* pY = &Y[4];
  const uint32_t* const LoopEnd = &X[length];

  __asm__ volatile(
    "mult   $zero,    $zero                  \n\t"
    "xor    %[i],     %[i],       %[i]       \n\t"
    "beq    %[pX],    %[LoopEnd], 2f         \n\t"
  "1:                                        \n\t"
    "lw     %[temp0], 0(%[pX])               \n\t"
    "lw     %[temp1], 0(%[pY])               \n\t"
    "lw     %[temp2], 4(%[pX])               \n\t"
    "lw     %[temp3], 4(%[pY])               \n\t"
    "addiu  %[i],     %[i],       1          \n\t"
    "addu   %[temp0], %[temp0],   %[temp1]   \n\t"
    "addu   %[temp2], %[temp2],   %[temp3]   \n\t"
    "addiu  %[pX],    %[pX],      8          \n\t"
    "addiu  %[pY],    %[pY],      8          \n\t"
    "madd   %[i],     %[temp0]               \n\t"
    "madd   %[i],     %[temp2]               \n\t"
    "bne    %[pX],    %[LoopEnd], 1b         \n\t"
  "2:                                        \n\t"
    "mfhi   %[temp0]                         \n\t"
    "mflo   %[temp1]                         \n\t"
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),
      [i]"=&r"(i), [pX]"+r"(pX), [pY]"+r"(pY)
    : [LoopEnd]"r"(LoopEnd)
    : "memory", "hi", "lo"
  );

  // As in ExtraCost, avoid sign-extending the low word.
  return (double)(((int64_t)temp0 << 32) | (uint32_t)temp1);
}

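// A C sketch of HUFFMAN_COST_PASS below; 'temp0' is set by the caller to
// (*val_prev != 0) and selects the zero/non-zero bucket:
//   if (streak > 3) {
//     stats->streaks[temp0][1] += streak;
//     ++stats->counts[temp0];
//   } else {
//     stats->streaks[temp0][0] += streak;
//   }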
#define HUFFMAN_COST_PASS                                 \
  __asm__ volatile(                                       \
    "sll    %[temp1],  %[temp0],    3           \n\t"     \
    "addiu  %[temp3],  %[streak],   -3          \n\t"     \
    "addu   %[temp2],  %[pstreaks], %[temp1]    \n\t"     \
    "blez   %[temp3],  1f                       \n\t"     \
    "srl    %[temp1],  %[temp1],    1           \n\t"     \
    "addu   %[temp3],  %[pcnts],    %[temp1]    \n\t"     \
    "lw     %[temp0],  4(%[temp2])              \n\t"     \
    "lw     %[temp1],  0(%[temp3])              \n\t"     \
    "addu   %[temp0],  %[temp0],    %[streak]   \n\t"     \
    "addiu  %[temp1],  %[temp1],    1           \n\t"     \
    "sw     %[temp0],  4(%[temp2])              \n\t"     \
    "sw     %[temp1],  0(%[temp3])              \n\t"     \
    "b      2f                                  \n\t"     \
  "1:                                           \n\t"     \
    "lw     %[temp0],  0(%[temp2])              \n\t"     \
    "addu   %[temp0],  %[temp0],    %[streak]   \n\t"     \
    "sw     %[temp0],  0(%[temp2])              \n\t"     \
  "2:                                           \n\t"     \
    : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2),           \
      [temp3]"=&r"(temp3), [temp0]"+r"(temp0)             \
    : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts),         \
      [streak]"r"(streak)                                 \
    : "memory"                                            \
  );

// Accumulates the various RLE (streak) counts and bit-entropy statistics
// for the value streak that just ended.
static WEBP_INLINE void GetEntropyUnrefinedHelper(
    uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
  int* const pstreaks = &stats->streaks[0][0];
  int* const pcnts = &stats->counts[0];
  int temp0, temp1, temp2, temp3;
  const int streak = i - *i_prev;

  // Gather info for the bit entropy.
  if (*val_prev != 0) {
    bit_entropy->sum += (*val_prev) * streak;
    bit_entropy->nonzeros += streak;
    bit_entropy->nonzero_code = *i_prev;
    bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
    if (bit_entropy->max_val < *val_prev) {
      bit_entropy->max_val = *val_prev;
    }
  }

  // Gather info for the Huffman cost.
  temp0 = (*val_prev != 0);
  HUFFMAN_COST_PASS

  *val_prev = val;
  *i_prev = i;
}

static void GetEntropyUnrefined(const uint32_t X[], int length,
                                VP8LBitEntropy* const bit_entropy,
                                VP8LStreaks* const stats) {
  int i;
  int i_prev = 0;
  uint32_t x_prev = X[0];

  memset(stats, 0, sizeof(*stats));
  VP8LBitEntropyInit(bit_entropy);

  for (i = 1; i < length; ++i) {
    const uint32_t x = X[i];
    if (x != x_prev) {
      GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats);
    }
  }
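  // One final call flushes the last streak; the 0 passed as 'val' only seeds
  // *val_prev, which is never read again.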
  GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats);

  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}

static void GetCombinedEntropyUnrefined(const uint32_t X[], const uint32_t Y[],
                                        int length,
                                        VP8LBitEntropy* const bit_entropy,
                                        VP8LStreaks* const stats) {
  int i;
  int i_prev = 0;
  uint32_t xy_prev = X[0] + Y[0];

  memset(stats, 0, sizeof(*stats));
  VP8LBitEntropyInit(bit_entropy);

  for (i = 1; i < length; ++i) {
    const uint32_t xy = X[i] + Y[i];
    if (xy != xy_prev) {
      GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, bit_entropy, stats);
    }
  }
  GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, bit_entropy, stats);

  bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum);
}

#define ASM_START                                         \
  __asm__ volatile(                                       \
    ".set   push                                \n\t"     \
    ".set   at                                  \n\t"     \
    ".set   macro                               \n\t"     \
  "1:                                           \n\t"

// P2 = P0 + P1
// A..D - offsets
// E - flag telling the macro whether P1 should also be incremented
//     (P0 and P2 always advance; see ADD_VECTOR vs. ADD_VECTOR_EQ below)
// literal_ and the histogram arrays that follow it may be unaligned,
// so unaligned loads/stores (ulw/usw) must be used.
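// One iteration of the generated loop, as a C sketch (with E == 1, as used
// by ADD_VECTOR where P0 = pa, P1 = pb, P2 = pout):
//   pout[0] = pa[0] + pb[0];  pout[1] = pa[1] + pb[1];
//   pout[2] = pa[2] + pb[2];  pout[3] = pa[3] + pb[3];
//   pa += 4; pb += 4; pout += 4;
//   if (pa != LoopEnd) goto top;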
#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2)             \
    "ulw    %[temp0], " #A "(%[" #P0 "])        \n\t"     \
    "ulw    %[temp1], " #B "(%[" #P0 "])        \n\t"     \
    "ulw    %[temp2], " #C "(%[" #P0 "])        \n\t"     \
    "ulw    %[temp3], " #D "(%[" #P0 "])        \n\t"     \
    "ulw    %[temp4], " #A "(%[" #P1 "])        \n\t"     \
    "ulw    %[temp5], " #B "(%[" #P1 "])        \n\t"     \
    "ulw    %[temp6], " #C "(%[" #P1 "])        \n\t"     \
    "ulw    %[temp7], " #D "(%[" #P1 "])        \n\t"     \
    "addu   %[temp4], %[temp4],   %[temp0]      \n\t"     \
    "addu   %[temp5], %[temp5],   %[temp1]      \n\t"     \
    "addu   %[temp6], %[temp6],   %[temp2]      \n\t"     \
    "addu   %[temp7], %[temp7],   %[temp3]      \n\t"     \
    "addiu  %[" #P0 "], %[" #P0 "], 16          \n\t"     \
  ".if " #E " == 1                              \n\t"     \
    "addiu  %[" #P1 "], %[" #P1 "], 16          \n\t"     \
  ".endif                                       \n\t"     \
    "usw    %[temp4], " #A "(%[" #P2 "])        \n\t"     \
    "usw    %[temp5], " #B "(%[" #P2 "])        \n\t"     \
    "usw    %[temp6], " #C "(%[" #P2 "])        \n\t"     \
    "usw    %[temp7], " #D "(%[" #P2 "])        \n\t"     \
    "addiu  %[" #P2 "], %[" #P2 "], 16          \n\t"     \
    "bne    %[" #P0 "], %[LoopEnd], 1b          \n\t"     \
    ".set   pop                                 \n\t"     \

#define ASM_END_COMMON_0                                  \
    : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1),           \
      [temp2]"=&r"(temp2), [temp3]"=&r"(temp3),           \
      [temp4]"=&r"(temp4), [temp5]"=&r"(temp5),           \
      [temp6]"=&r"(temp6), [temp7]"=&r"(temp7),           \
      [pa]"+r"(pa), [pout]"+r"(pout)

#define ASM_END_COMMON_1                                  \
    : [LoopEnd]"r"(LoopEnd)                               \
    : "memory", "at"                                      \
  );

#define ASM_END_0                                         \
    ASM_END_COMMON_0                                      \
    , [pb]"+r"(pb)                                        \
    ASM_END_COMMON_1

#define ASM_END_1                                         \
    ASM_END_COMMON_0                                      \
    ASM_END_COMMON_1

#define ADD_VECTOR(A, B, OUT, SIZE, EXTRA_SIZE)  do {     \
    const uint32_t* pa = (const uint32_t*)(A);            \
    const uint32_t* pb = (const uint32_t*)(B);            \
    uint32_t* pout = (uint32_t*)(OUT);                    \
    const uint32_t* const LoopEnd = pa + (SIZE);          \
    assert((SIZE) % 4 == 0);                              \
    ASM_START                                             \
    ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)              \
    ASM_END_0                                             \
    if ((EXTRA_SIZE) > 0) {                               \
      const int last = (EXTRA_SIZE);                      \
      int i;                                              \
      for (i = 0; i < last; ++i) pout[i] = pa[i] + pb[i]; \
    }                                                     \
} while (0)

#define ADD_VECTOR_EQ(A, OUT, SIZE, EXTRA_SIZE)  do {     \
    const uint32_t* pa = (const uint32_t*)(A);            \
    uint32_t* pout = (uint32_t*)(OUT);                    \
    const uint32_t* const LoopEnd = pa + (SIZE);          \
    assert((SIZE) % 4 == 0);                              \
    ASM_START                                             \
    ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)            \
    ASM_END_1                                             \
    if ((EXTRA_SIZE) > 0) {                               \
      const int last = (EXTRA_SIZE);                      \
      int i;                                              \
      for (i = 0; i < last; ++i) pout[i] += pa[i];        \
    }                                                     \
} while (0)


static void HistogramAdd(const VP8LHistogram* const a,
                         const VP8LHistogram* const b,
                         VP8LHistogram* const out) {
  uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
  const int extra_cache_size = VP8LHistogramNumCodes(a->palette_code_bits_)
                             - (NUM_LITERAL_CODES + NUM_LENGTH_CODES);
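  // The asm loop adds 16 bytes (four counts) per iteration, so the
  // cache-indexed tail of literal_ (extra_cache_size entries) is handled by
  // the scalar EXTRA_SIZE loop inside ADD_VECTOR / ADD_VECTOR_EQ.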
  assert(a->palette_code_bits_ == b->palette_code_bits_);

  if (b != out) {
    ADD_VECTOR(a->literal_, b->literal_, out->literal_,
               NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR(a->distance_, b->distance_, out->distance_,
               NUM_DISTANCE_CODES, 0);
    ADD_VECTOR(a->red_, b->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->blue_, b->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR(a->alpha_, b->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  } else {
    ADD_VECTOR_EQ(a->literal_, out->literal_,
                  NUM_LITERAL_CODES + NUM_LENGTH_CODES, extra_cache_size);
    ADD_VECTOR_EQ(a->distance_, out->distance_, NUM_DISTANCE_CODES, 0);
    ADD_VECTOR_EQ(a->red_, out->red_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->blue_, out->blue_, NUM_LITERAL_CODES, 0);
    ADD_VECTOR_EQ(a->alpha_, out->alpha_, NUM_LITERAL_CODES, 0);
  }
}

#undef ADD_VECTOR_EQ
#undef ADD_VECTOR
#undef ASM_END_1
#undef ASM_END_0
#undef ASM_END_COMMON_1
#undef ASM_END_COMMON_0
#undef ADD_TO_OUT
#undef ASM_START

//------------------------------------------------------------------------------
// Entry point

extern void VP8LEncDspInitMIPS32(void);

WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) {
  VP8LFastSLog2Slow = FastSLog2Slow;
  VP8LFastLog2Slow = FastLog2Slow;
  VP8LExtraCost = ExtraCost;
  VP8LExtraCostCombined = ExtraCostCombined;
  VP8LGetEntropyUnrefined = GetEntropyUnrefined;
  VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined;
  VP8LHistogramAdd = HistogramAdd;
}

#else  // !WEBP_USE_MIPS32

WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32)

#endif  // WEBP_USE_MIPS32
| 432 | |