| 1 | // Copyright 2015 Google Inc. All Rights Reserved. |
| 2 | // |
| 3 | // Use of this source code is governed by a BSD-style license |
| 4 | // that can be found in the COPYING file in the root of the source |
| 5 | // tree. An additional intellectual property rights grant can be found |
| 6 | // in the file PATENTS. All contributing project authors may |
| 7 | // be found in the AUTHORS file in the root of the source tree. |
| 8 | // ----------------------------------------------------------------------------- |
| 9 | // |
| 10 | // MIPS version of lossless functions |
| 11 | // |
| 12 | // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) |
| 13 | // Jovan Zelincevic (jovan.zelincevic@imgtec.com) |
| 14 | |
| 15 | #include "src/dsp/dsp.h" |
| 16 | #include "src/dsp/lossless.h" |
| 17 | #include "src/dsp/lossless_common.h" |
| 18 | |
| 19 | #if defined(WEBP_USE_MIPS32) |
| 20 | |
| 21 | #include <assert.h> |
| 22 | #include <math.h> |
| 23 | #include <stdlib.h> |
| 24 | #include <string.h> |
| 25 | |
// Slow path of VP8LFastSLog2: approximates v * log2(v) for values too large
// for the direct kLog2Table lookup (v >= LOG_LOOKUP_IDX_MAX).
static float FastSLog2Slow_MIPS32(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y, correction;
    const int c24 = 24;
    const float v_f = (float)v;
    uint32_t temp;

    // Xf = 256 = 2^8
    // log_cnt is index of leading one in upper 24 bits
    // The asm computes: log_cnt = 24 - clz(v); y = 1 << log_cnt;
    // temp = v >> log_cnt. Since v < 2^32 and v >= LOG_LOOKUP_IDX_MAX,
    // temp fits in 8 bits and is a valid kLog2Table index.
    __asm__ volatile(
      "clz %[log_cnt], %[v] \n\t"
      "addiu %[y], $zero, 1 \n\t"
      "subu %[log_cnt], %[c24], %[log_cnt] \n\t"
      "sllv %[y], %[y], %[log_cnt] \n\t"
      "srlv %[temp], %[v], %[log_cnt] \n\t"
      : [log_cnt]"=&r" (log_cnt), [y]"=&r" (y),
        [temp]"=r" (temp)
      : [c24]"r" (c24), [v]"r" (v)
    );

    // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256
    // Xf = floor(Xf) * (1 + (v % y) / v)
    // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v)
    // The correction factor: log(1 + d) ~ d; for very small d values, so
    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y)/v
    // LOG_2_RECIPROCAL ~ 23/16

    // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1)
    correction = (23 * (v & (y - 1))) >> 4;
    return v_f * (kLog2Table[temp] + log_cnt) + correction;
  } else {
    // For very large v the approximation error matters: use the exact libm
    // form v * log2(v) = LOG_2_RECIPROCAL * v * ln(v).
    return (float)(LOG_2_RECIPROCAL * v * log((double)v));
  }
}
| 61 | |
// Slow path of VP8LFastLog2: approximates log2(v) for values too large for
// the direct kLog2Table lookup (v >= LOG_LOOKUP_IDX_MAX).
static float FastLog2Slow_MIPS32(uint32_t v) {
  assert(v >= LOG_LOOKUP_IDX_MAX);
  if (v < APPROX_LOG_WITH_CORRECTION_MAX) {
    uint32_t log_cnt, y;
    const int c24 = 24;
    double log_2;
    uint32_t temp;

    // Same decomposition as FastSLog2Slow_MIPS32:
    // log_cnt = 24 - clz(v); y = 1 << log_cnt; temp = v >> log_cnt,
    // so temp < 256 and indexes kLog2Table.
    __asm__ volatile(
      "clz %[log_cnt], %[v] \n\t"
      "addiu %[y], $zero, 1 \n\t"
      "subu %[log_cnt], %[c24], %[log_cnt] \n\t"
      "sllv %[y], %[y], %[log_cnt] \n\t"
      "srlv %[temp], %[v], %[log_cnt] \n\t"
      : [log_cnt]"=&r" (log_cnt), [y]"=&r" (y),
        [temp]"=r" (temp)
      : [c24]"r" (c24), [v]"r" (v)
    );

    // log2(v) ~ log2(v >> log_cnt) + log_cnt.
    log_2 = kLog2Table[temp] + log_cnt;
    if (v >= APPROX_LOG_MAX) {
      // Since the division is still expensive, add this correction factor only
      // for large values of 'v'.

      // correction ~ LOG_2_RECIPROCAL * (v % y), with LOG_2_RECIPROCAL ~ 23/16.
      const uint32_t correction = (23 * (v & (y - 1))) >> 4;
      log_2 += (double)correction / v;
    }
    return (float)log_2;
  } else {
    // Exact libm fallback: log2(v) = ln(v) / ln(2).
    return (float)(LOG_2_RECIPROCAL * log((double)v));
  }
}
| 94 | |
| 95 | // C version of this function: |
| 96 | // int i = 0; |
| 97 | // int64_t cost = 0; |
| 98 | // const uint32_t* pop = &population[4]; |
| 99 | // const uint32_t* LoopEnd = &population[length]; |
| 100 | // while (pop != LoopEnd) { |
| 101 | // ++i; |
| 102 | // cost += i * *pop; |
| 103 | // cost += i * *(pop + 1); |
| 104 | // pop += 2; |
| 105 | // } |
| 106 | // return (double)cost; |
// Computes sum(i * (population[2*i + 4] + population[2*i + 5])) using the
// MIPS hi/lo accumulator via madd (see the C reference version above).
// 'length' is expected to be even and >= 4 (pairs are consumed from index 4).
static double ExtraCost_MIPS32(const uint32_t* const population, int length) {
  int i, temp0, temp1;
  const uint32_t* pop = &population[4];
  const uint32_t* const LoopEnd = &population[length];

  __asm__ volatile(
    "mult $zero, $zero \n\t"
    "xor %[i], %[i], %[i] \n\t"
    "beq %[pop], %[LoopEnd], 2f \n\t"
    "1: \n\t"
    "lw %[temp0], 0(%[pop]) \n\t"
    "lw %[temp1], 4(%[pop]) \n\t"
    "addiu %[i], %[i], 1 \n\t"
    "addiu %[pop], %[pop], 8 \n\t"
    "madd %[i], %[temp0] \n\t"
    "madd %[i], %[temp1] \n\t"
    "bne %[pop], %[LoopEnd], 1b \n\t"
    "2: \n\t"
    "mfhi %[temp0] \n\t"
    "mflo %[temp1] \n\t"
    : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1),
      [i]"=&r" (i), [pop]"+r" (pop)
    : [LoopEnd]"r" (LoopEnd)
    : "memory" , "hi" , "lo"
  );

  // Reassemble the 64-bit accumulator from the hi (temp0) and lo (temp1)
  // words. The lo word must be treated as unsigned: without the uint32_t
  // cast, a lo value >= 2^31 would be sign-extended and the OR would
  // corrupt the upper 32 bits of the result.
  return (double)((int64_t)temp0 << 32 | (uint32_t)temp1);
}
| 135 | |
| 136 | // C version of this function: |
| 137 | // int i = 0; |
| 138 | // int64_t cost = 0; |
| 139 | // const uint32_t* pX = &X[4]; |
| 140 | // const uint32_t* pY = &Y[4]; |
| 141 | // const uint32_t* LoopEnd = &X[length]; |
| 142 | // while (pX != LoopEnd) { |
| 143 | // const uint32_t xy0 = *pX + *pY; |
| 144 | // const uint32_t xy1 = *(pX + 1) + *(pY + 1); |
| 145 | // ++i; |
| 146 | // cost += i * xy0; |
| 147 | // cost += i * xy1; |
| 148 | // pX += 2; |
| 149 | // pY += 2; |
| 150 | // } |
| 151 | // return (double)cost; |
// Computes sum(i * ((X[2*i + 4] + Y[2*i + 4]) + (X[2*i + 5] + Y[2*i + 5])))
// using the MIPS hi/lo accumulator via madd (see the C reference version
// above). 'length' is expected to be even and >= 4.
static double ExtraCostCombined_MIPS32(const uint32_t* const X,
                                       const uint32_t* const Y, int length) {
  int i, temp0, temp1, temp2, temp3;
  const uint32_t* pX = &X[4];
  const uint32_t* pY = &Y[4];
  const uint32_t* const LoopEnd = &X[length];

  __asm__ volatile(
    "mult $zero, $zero \n\t"
    "xor %[i], %[i], %[i] \n\t"
    "beq %[pX], %[LoopEnd], 2f \n\t"
    "1: \n\t"
    "lw %[temp0], 0(%[pX]) \n\t"
    "lw %[temp1], 0(%[pY]) \n\t"
    "lw %[temp2], 4(%[pX]) \n\t"
    "lw %[temp3], 4(%[pY]) \n\t"
    "addiu %[i], %[i], 1 \n\t"
    "addu %[temp0], %[temp0], %[temp1] \n\t"
    "addu %[temp2], %[temp2], %[temp3] \n\t"
    "addiu %[pX], %[pX], 8 \n\t"
    "addiu %[pY], %[pY], 8 \n\t"
    "madd %[i], %[temp0] \n\t"
    "madd %[i], %[temp2] \n\t"
    "bne %[pX], %[LoopEnd], 1b \n\t"
    "2: \n\t"
    "mfhi %[temp0] \n\t"
    "mflo %[temp1] \n\t"
    : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1),
      [temp2]"=&r" (temp2), [temp3]"=&r" (temp3),
      [i]"=&r" (i), [pX]"+r" (pX), [pY]"+r" (pY)
    : [LoopEnd]"r" (LoopEnd)
    : "memory" , "hi" , "lo"
  );

  // Reassemble the 64-bit accumulator from hi (temp0) and lo (temp1).
  // Cast the lo word to uint32_t so it is not sign-extended before the OR
  // (a lo value >= 2^31 would otherwise corrupt the upper 32 bits).
  return (double)((int64_t)temp0 << 32 | (uint32_t)temp1);
}
| 188 | |
// Updates the Huffman streak statistics for one finished run. Equivalent to:
//   if (streak > 3) {
//     pstreaks[2 * temp0 + 1] += streak;   // stats->streaks[temp0][1]
//     ++pcnts[temp0];                      // stats->counts[temp0]
//   } else {
//     pstreaks[2 * temp0] += streak;       // stats->streaks[temp0][0]
//   }
// Expects 'temp0' (0/1 nonzero flag), 'streak', 'pstreaks', 'pcnts' and the
// scratch variables temp1..temp3 to be in scope at the expansion site.
#define HUFFMAN_COST_PASS \
__asm__ volatile( \
"sll %[temp1], %[temp0], 3 \n\t" \
"addiu %[temp3], %[streak], -3 \n\t" \
"addu %[temp2], %[pstreaks], %[temp1] \n\t" \
"blez %[temp3], 1f \n\t" \
"srl %[temp1], %[temp1], 1 \n\t" \
"addu %[temp3], %[pcnts], %[temp1] \n\t" \
"lw %[temp0], 4(%[temp2]) \n\t" \
"lw %[temp1], 0(%[temp3]) \n\t" \
"addu %[temp0], %[temp0], %[streak] \n\t" \
"addiu %[temp1], %[temp1], 1 \n\t" \
"sw %[temp0], 4(%[temp2]) \n\t" \
"sw %[temp1], 0(%[temp3]) \n\t" \
"b 2f \n\t" \
"1: \n\t" \
"lw %[temp0], 0(%[temp2]) \n\t" \
"addu %[temp0], %[temp0], %[streak] \n\t" \
"sw %[temp0], 0(%[temp2]) \n\t" \
"2: \n\t" \
: [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \
[temp3]"=&r"(temp3), [temp0]"+r"(temp0) \
: [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts), \
[streak]"r"(streak) \
: "memory" \
);
| 215 | |
// Returns the various RLE counts
// Folds the run of equal values '*val_prev' that ended at index 'i' (length
// 'streak' = i - *i_prev) into the bit-entropy accumulators and the Huffman
// streak statistics, then starts a new run at (val, i).
static WEBP_INLINE void GetEntropyUnrefinedHelper(
    uint32_t val, int i, uint32_t* const val_prev, int* const i_prev,
    VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) {
  int* const pstreaks = &stats->streaks[0][0];
  int* const pcnts = &stats->counts[0];
  int temp0, temp1, temp2, temp3;  // scratch registers for HUFFMAN_COST_PASS
  const int streak = i - *i_prev;

  // Gather info for the bit entropy.
  if (*val_prev != 0) {
    bit_entropy->sum += (*val_prev) * streak;
    bit_entropy->nonzeros += streak;
    bit_entropy->nonzero_code = *i_prev;
    bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak;
    if (bit_entropy->max_val < *val_prev) {
      bit_entropy->max_val = *val_prev;
    }
  }

  // Gather info for the Huffman cost.
  temp0 = (*val_prev != 0);  // selects counts/streaks row 0 (zeros) or 1
  HUFFMAN_COST_PASS

  // Begin the next run.
  *val_prev = val;
  *i_prev = i;
}
| 243 | |
| 244 | static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length, |
| 245 | VP8LBitEntropy* const bit_entropy, |
| 246 | VP8LStreaks* const stats) { |
| 247 | int i; |
| 248 | int i_prev = 0; |
| 249 | uint32_t x_prev = X[0]; |
| 250 | |
| 251 | memset(stats, 0, sizeof(*stats)); |
| 252 | VP8LBitEntropyInit(bit_entropy); |
| 253 | |
| 254 | for (i = 1; i < length; ++i) { |
| 255 | const uint32_t x = X[i]; |
| 256 | if (x != x_prev) { |
| 257 | GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats); |
| 258 | } |
| 259 | } |
| 260 | GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats); |
| 261 | |
| 262 | bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum); |
| 263 | } |
| 264 | |
| 265 | static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[], |
| 266 | const uint32_t Y[], |
| 267 | int length, |
| 268 | VP8LBitEntropy* const entropy, |
| 269 | VP8LStreaks* const stats) { |
| 270 | int i = 1; |
| 271 | int i_prev = 0; |
| 272 | uint32_t xy_prev = X[0] + Y[0]; |
| 273 | |
| 274 | memset(stats, 0, sizeof(*stats)); |
| 275 | VP8LBitEntropyInit(entropy); |
| 276 | |
| 277 | for (i = 1; i < length; ++i) { |
| 278 | const uint32_t xy = X[i] + Y[i]; |
| 279 | if (xy != xy_prev) { |
| 280 | GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats); |
| 281 | } |
| 282 | } |
| 283 | GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats); |
| 284 | |
| 285 | entropy->entropy += VP8LFastSLog2(entropy->sum); |
| 286 | } |
| 287 | |
// Opens the asm block and places the "1:" loop head used by ADD_TO_OUT.
// NOTE: the loop is post-tested (the matching "bne ... 1b" is at the end of
// ADD_TO_OUT), so the body always runs at least once; callers must ensure
// there is at least one full 16-byte chunk, i.e. LoopEnd > P0.
#define ASM_START \
__asm__ volatile( \
".set push \n\t" \
".set at \n\t" \
".set macro \n\t" \
"1: \n\t"

// P2 = P0 + P1
// A..D - offsets
// E - temp variable to tell macro
// if pointer should be incremented
// literal_ and successive histograms could be unaligned
// so we must use ulw and usw
#define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2) \
"ulw %[temp0], " #A "(%[" #P0 "]) \n\t" \
"ulw %[temp1], " #B "(%[" #P0 "]) \n\t" \
"ulw %[temp2], " #C "(%[" #P0 "]) \n\t" \
"ulw %[temp3], " #D "(%[" #P0 "]) \n\t" \
"ulw %[temp4], " #A "(%[" #P1 "]) \n\t" \
"ulw %[temp5], " #B "(%[" #P1 "]) \n\t" \
"ulw %[temp6], " #C "(%[" #P1 "]) \n\t" \
"ulw %[temp7], " #D "(%[" #P1 "]) \n\t" \
"addu %[temp4], %[temp4], %[temp0] \n\t" \
"addu %[temp5], %[temp5], %[temp1] \n\t" \
"addu %[temp6], %[temp6], %[temp2] \n\t" \
"addu %[temp7], %[temp7], %[temp3] \n\t" \
"addiu %[" #P0 "], %[" #P0 "], 16 \n\t" \
".if " #E " == 1 \n\t" \
"addiu %[" #P1 "], %[" #P1 "], 16 \n\t" \
".endif \n\t" \
"usw %[temp4], " #A "(%[" #P2 "]) \n\t" \
"usw %[temp5], " #B "(%[" #P2 "]) \n\t" \
"usw %[temp6], " #C "(%[" #P2 "]) \n\t" \
"usw %[temp7], " #D "(%[" #P2 "]) \n\t" \
"addiu %[" #P2 "], %[" #P2 "], 16 \n\t" \
"bne %[" #P0 "], %[LoopEnd], 1b \n\t" \
".set pop \n\t" \

// Shared output-operand list for the ASM_END_* closers; expects 'pa' and
// 'pout' (and temp0..temp7) to be in scope at the expansion site.
#define ASM_END_COMMON_0 \
: [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \
[temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \
[temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \
[temp6]"=&r"(temp6), [temp7]"=&r"(temp7), \
[pa]"+r"(pa), [pout]"+r"(pout)

// Shared input/clobber lists; "at" is clobbered because ulw/usw expand to
// multi-instruction sequences using the assembler temporary.
#define ASM_END_COMMON_1 \
: [LoopEnd]"r"(LoopEnd) \
: "memory", "at" \
);

// Closer for the three-pointer form (pa, pb, pout).
#define ASM_END_0 \
ASM_END_COMMON_0 \
, [pb]"+r"(pb) \
ASM_END_COMMON_1

// Closer for the two-pointer (in-place) form (pa, pout).
#define ASM_END_1 \
ASM_END_COMMON_0 \
ASM_END_COMMON_1
| 346 | |
| 347 | static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb, |
| 348 | uint32_t* pout, int size) { |
| 349 | uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; |
| 350 | const uint32_t end = ((size) / 4) * 4; |
| 351 | const uint32_t* const LoopEnd = pa + end; |
| 352 | int i; |
| 353 | ASM_START |
| 354 | ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout) |
| 355 | ASM_END_0 |
| 356 | for (i = end; i < size; ++i) pout[i] = pa[i] + pb[i]; |
| 357 | } |
| 358 | |
| 359 | static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) { |
| 360 | uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; |
| 361 | const uint32_t end = ((size) / 4) * 4; |
| 362 | const uint32_t* const LoopEnd = pa + end; |
| 363 | int i; |
| 364 | ASM_START |
| 365 | ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout) |
| 366 | ASM_END_1 |
| 367 | for (i = end; i < size; ++i) pout[i] += pa[i]; |
| 368 | } |
| 369 | |
| 370 | #undef ASM_END_1 |
| 371 | #undef ASM_END_0 |
| 372 | #undef ASM_END_COMMON_1 |
| 373 | #undef ASM_END_COMMON_0 |
| 374 | #undef ADD_TO_OUT |
| 375 | #undef ASM_START |
| 376 | |
| 377 | //------------------------------------------------------------------------------ |
| 378 | // Entry point |
| 379 | |
| 380 | extern void VP8LEncDspInitMIPS32(void); |
| 381 | |
| 382 | WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) { |
| 383 | VP8LFastSLog2Slow = FastSLog2Slow_MIPS32; |
| 384 | VP8LFastLog2Slow = FastLog2Slow_MIPS32; |
| 385 | VP8LExtraCost = ExtraCost_MIPS32; |
| 386 | VP8LExtraCostCombined = ExtraCostCombined_MIPS32; |
| 387 | VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32; |
| 388 | VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32; |
| 389 | VP8LAddVector = AddVector_MIPS32; |
| 390 | VP8LAddVectorEq = AddVectorEq_MIPS32; |
| 391 | } |
| 392 | |
| 393 | #else // !WEBP_USE_MIPS32 |
| 394 | |
| 395 | WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32) |
| 396 | |
| 397 | #endif // WEBP_USE_MIPS32 |
| 398 | |