| 1 | // Copyright 2014 Google Inc. All Rights Reserved. | 
| 2 | // | 
| 3 | // Use of this source code is governed by a BSD-style license | 
| 4 | // that can be found in the COPYING file in the root of the source | 
| 5 | // tree. An additional intellectual property rights grant can be found | 
| 6 | // in the file PATENTS. All contributing project authors may | 
| 7 | // be found in the AUTHORS file in the root of the source tree. | 
| 8 | // ----------------------------------------------------------------------------- | 
| 9 | // | 
| 10 | // MIPS version of speed-critical encoding functions. | 
| 11 | // | 
| 12 | // Author(s): Djordje Pesut    (djordje.pesut@imgtec.com) | 
| 13 | //            Jovan Zelincevic (jovan.zelincevic@imgtec.com) | 
| 14 | //            Slobodan Prijic  (slobodan.prijic@imgtec.com) | 
| 15 |  | 
| 16 | #include "src/dsp/dsp.h" | 
| 17 |  | 
| 18 | #if defined(WEBP_USE_MIPS32) | 
| 19 |  | 
| 20 | #include "src/dsp/mips_macro.h" | 
| 21 | #include "src/enc/vp8i_enc.h" | 
| 22 | #include "src/enc/cost_enc.h" | 
| 23 |  | 
// Fixed-point multipliers used by the inverse transform passes below. Every
// product with kC1 / kC2 is shifted right by 16 after the multiply, so they
// act as the factors 85627 / 65536 (~1.30656) and 35468 / 65536 (~0.54120).
static const int kC1 = 20091 + (1 << 16);
static const int kC2 = 35468;
| 26 |  | 
// Macro for one vertical pass in ITransformOne (MUL macro inlined).
// Loads four 16-bit coefficients from the buffer addressed by temp20 and
// computes
//   a = in[A] + in[B]
//   b = in[A] - in[B]
//   c = ((in[C] * kC2) >> 16) - ((in[D] * kC1) >> 16)
//   d = ((in[C] * kC1) >> 16) + ((in[D] * kC2) >> 16)
// leaving TEMP0 = a + d, TEMP1 = b + c, TEMP2 = b - c, TEMP3 = a - d.
// temp0..temp15 hold tmp[0]..tmp[15]
// A..D - offsets in bytes to load from the in buffer
// TEMP0..TEMP3 - registers for corresponding tmp elements
// TEMP4 - temporary register (holds 'a'); it is written only after the four
//         loads, so it may be temp20 itself once the pointer is not needed
// temp16..temp19 are used as scratch registers.
// Every instruction is terminated with "\n\t" so that the generated assembly
// contains no blank lines.
#define VERTICAL_PASS(A, B, C, D, TEMP4, TEMP0, TEMP1, TEMP2, TEMP3)        \
  "lh      %[temp16],      " #A "(%[temp20])                 \n\t"          \
  "lh      %[temp18],      " #B "(%[temp20])                 \n\t"          \
  "lh      %[temp17],      " #C "(%[temp20])                 \n\t"          \
  "lh      %[temp19],      " #D "(%[temp20])                 \n\t"          \
  "addu    %[" #TEMP4 "],    %[temp16],      %[temp18]       \n\t"          \
  "subu    %[temp16],      %[temp16],      %[temp18]         \n\t"          \
  "mul     %[" #TEMP0 "],    %[temp17],      %[kC2]          \n\t"          \
  "mul     %[temp18],      %[temp19],      %[kC1]            \n\t"          \
  "mul     %[temp17],      %[temp17],      %[kC1]            \n\t"          \
  "mul     %[temp19],      %[temp19],      %[kC2]            \n\t"          \
  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16            \n\t"          \
  "sra     %[temp18],      %[temp18],      16                \n\t"          \
  "sra     %[temp17],      %[temp17],      16                \n\t"          \
  "sra     %[temp19],      %[temp19],      16                \n\t"          \
  "subu    %[" #TEMP2 "],    %[" #TEMP0 "],    %[temp18]     \n\t"          \
  "addu    %[" #TEMP3 "],    %[temp17],      %[temp19]       \n\t"          \
  "addu    %[" #TEMP0 "],    %[" #TEMP4 "],    %[" #TEMP3 "] \n\t"          \
  "addu    %[" #TEMP1 "],    %[temp16],      %[" #TEMP2 "]   \n\t"          \
  "subu    %[" #TEMP2 "],    %[temp16],      %[" #TEMP2 "]   \n\t"          \
  "subu    %[" #TEMP3 "],    %[" #TEMP4 "],    %[" #TEMP3 "] \n\t"
| 54 |  | 
// Macro for one horizontal pass in ITransformOne.
// MUL and STORE macros inlined.
// a = clip_8b(a) is replaced with: a = max(a, 0); a = min(a, 255)
// With dc = TEMP0 + 4 the macro computes
//   a = dc + TEMP8
//   b = dc - TEMP8
//   c = ((TEMP4 * kC2) >> 16) - ((TEMP12 * kC1) >> 16)
//   d = ((TEMP4 * kC1) >> 16) + ((TEMP12 * kC2) >> 16)
// then adds (a + d) >> 3, (b + c) >> 3, (b - c) >> 3 and (a - d) >> 3 to the
// four reference bytes of the row, clips each sum to [0, 255] and stores the
// four bytes to the destination row.
// temp0..temp15 hold tmp[0]..tmp[15]
// A - row index: bytes are loaded from offset BPS * A of the buffer whose
//     address is args[0] and stored at the same offset of the buffer whose
//     address is args[2]
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
// temp16..temp20 are used as scratch registers.
#define HORIZONTAL_PASS(A, TEMP0, TEMP4, TEMP8, TEMP12)                       \
  "addiu   %[" #TEMP0 "],    %[" #TEMP0 "],    4               \n\t"          \
  "addu    %[temp16],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
  "subu    %[temp17],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
  "mul     %[" #TEMP0 "],    %[" #TEMP4 "],    %[kC2]          \n\t"          \
  "mul     %[" #TEMP8 "],    %[" #TEMP12 "],   %[kC1]          \n\t"          \
  "mul     %[" #TEMP4 "],    %[" #TEMP4 "],    %[kC1]          \n\t"          \
  "mul     %[" #TEMP12 "],   %[" #TEMP12 "],   %[kC2]          \n\t"          \
  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    16              \n\t"          \
  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    16              \n\t"          \
  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    16              \n\t"          \
  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   16              \n\t"          \
  "subu    %[temp18],      %[" #TEMP0 "],    %[" #TEMP8 "]     \n\t"          \
  "addu    %[temp19],      %[" #TEMP4 "],    %[" #TEMP12 "]    \n\t"          \
  "addu    %[" #TEMP0 "],    %[temp16],      %[temp19]         \n\t"          \
  "addu    %[" #TEMP4 "],    %[temp17],      %[temp18]         \n\t"          \
  "subu    %[" #TEMP8 "],    %[temp17],      %[temp18]         \n\t"          \
  "subu    %[" #TEMP12 "],   %[temp16],      %[temp19]         \n\t"          \
  "lw      %[temp20],      0(%[args])                          \n\t"          \
  "sra     %[" #TEMP0 "],    %[" #TEMP0 "],    3               \n\t"          \
  "sra     %[" #TEMP4 "],    %[" #TEMP4 "],    3               \n\t"          \
  "sra     %[" #TEMP8 "],    %[" #TEMP8 "],    3               \n\t"          \
  "sra     %[" #TEMP12 "],   %[" #TEMP12 "],   3               \n\t"          \
  "lbu     %[temp16],      0+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
  "lbu     %[temp17],      1+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
  "lbu     %[temp18],      2+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
  "lbu     %[temp19],      3+" XSTR(BPS) "*" #A "(%[temp20])   \n\t"          \
  "addu    %[" #TEMP0 "],    %[temp16],      %[" #TEMP0 "]     \n\t"          \
  "addu    %[" #TEMP4 "],    %[temp17],      %[" #TEMP4 "]     \n\t"          \
  "addu    %[" #TEMP8 "],    %[temp18],      %[" #TEMP8 "]     \n\t"          \
  "addu    %[" #TEMP12 "],   %[temp19],      %[" #TEMP12 "]    \n\t"          \
  "slt     %[temp16],      %[" #TEMP0 "],    $zero             \n\t"          \
  "slt     %[temp17],      %[" #TEMP4 "],    $zero             \n\t"          \
  "slt     %[temp18],      %[" #TEMP8 "],    $zero             \n\t"          \
  "slt     %[temp19],      %[" #TEMP12 "],   $zero             \n\t"          \
  "movn    %[" #TEMP0 "],    $zero,          %[temp16]         \n\t"          \
  "movn    %[" #TEMP4 "],    $zero,          %[temp17]         \n\t"          \
  "movn    %[" #TEMP8 "],    $zero,          %[temp18]         \n\t"          \
  "movn    %[" #TEMP12 "],   $zero,          %[temp19]         \n\t"          \
  "addiu   %[temp20],      $zero,          255                 \n\t"          \
  "slt     %[temp16],      %[" #TEMP0 "],    %[temp20]         \n\t"          \
  "slt     %[temp17],      %[" #TEMP4 "],    %[temp20]         \n\t"          \
  "slt     %[temp18],      %[" #TEMP8 "],    %[temp20]         \n\t"          \
  "slt     %[temp19],      %[" #TEMP12 "],   %[temp20]         \n\t"          \
  "movz    %[" #TEMP0 "],    %[temp20],      %[temp16]         \n\t"          \
  "movz    %[" #TEMP4 "],    %[temp20],      %[temp17]         \n\t"          \
  "lw      %[temp16],      8(%[args])                          \n\t"          \
  "movz    %[" #TEMP8 "],    %[temp20],      %[temp18]         \n\t"          \
  "movz    %[" #TEMP12 "],   %[temp20],      %[temp19]         \n\t"          \
  "sb      %[" #TEMP0 "],    0+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"          \
  "sb      %[" #TEMP4 "],    1+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"          \
  "sb      %[" #TEMP8 "],    2+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"          \
  "sb      %[" #TEMP12 "],   3+" XSTR(BPS) "*" #A "(%[temp16]) \n\t"
| 114 |  | 
// Does one inverse transform: the 16 coefficients read from 'in' are
// transformed (vertical passes, then horizontal passes), the result is added
// to the 4x4 block of bytes at 'ref' (rows are BPS bytes apart), clipped to
// [0, 255] and stored to the 4x4 block of bytes at 'dst'.
static WEBP_INLINE void ITransformOne_MIPS32(const uint8_t* ref,
                                             const int16_t* in,
                                             uint8_t* dst) {
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6;
  int temp7, temp8, temp9, temp10, temp11, temp12, temp13;
  int temp14, temp15, temp16, temp17, temp18, temp19, temp20;
  // The three pointers are handed to the asm through one array and reloaded
  // from it when needed: args[0] = ref, args[1] = in, args[2] = dst.
  // NOTE(review): presumably done to limit register pressure -- confirm.
  const int* args[3] = {(const int*)ref, (const int*)in, (const int*)dst};

  __asm__ volatile(
    // temp20 = in
    "lw      %[temp20],      4(%[args])                      \n\t"
    // Vertical passes: column i of 'in' produces tmp[4 * i + 0..3], kept in
    // temp(4 * i)..temp(4 * i + 3). The TEMP4 scratch argument of each pass
    // is a register that the following pass overwrites anyway.
    VERTICAL_PASS(0, 16,  8, 24, temp4,  temp0,  temp1,  temp2,  temp3)
    VERTICAL_PASS(2, 18, 10, 26, temp8,  temp4,  temp5,  temp6,  temp7)
    VERTICAL_PASS(4, 20, 12, 28, temp12, temp8,  temp9,  temp10, temp11)
    // Last pass: temp20 (the 'in' pointer) doubles as the scratch register,
    // since nothing is loaded from 'in' afterwards.
    VERTICAL_PASS(6, 22, 14, 30, temp20, temp12, temp13, temp14, temp15)

    // Horizontal passes: row i uses tmp[i], tmp[4 + i], tmp[8 + i] and
    // tmp[12 + i] and writes row i of dst.
    HORIZONTAL_PASS(0, temp0, temp4, temp8,  temp12)
    HORIZONTAL_PASS(1, temp1, temp5, temp9,  temp13)
    HORIZONTAL_PASS(2, temp2, temp6, temp10, temp14)
    HORIZONTAL_PASS(3, temp3, temp7, temp11, temp15)

    : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2),
      [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5),
      [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8),
      [temp9]"=&r" (temp9), [temp10]"=&r" (temp10), [temp11]"=&r" (temp11),
      [temp12]"=&r" (temp12), [temp13]"=&r" (temp13), [temp14]"=&r" (temp14),
      [temp15]"=&r" (temp15), [temp16]"=&r" (temp16), [temp17]"=&r" (temp17),
      [temp18]"=&r" (temp18), [temp19]"=&r" (temp19), [temp20]"=&r" (temp20)
    : [args]"r" (args), [kC1]"r" (kC1), [kC2]"r" (kC2)
    // "memory": the asm reads args[], in[] and ref[] and writes dst[] through
    // pointers the compiler cannot see; "hi"/"lo" are listed as clobbered
    // because of the mul instructions.
    : "memory" , "hi" , "lo"
  );
}
| 147 |  | 
// Runs the inverse transform on the block described by (ref, in, dst). When
// 'do_two' is non-zero a second block is processed as well, located 4 bytes
// further in 'ref' and 'dst' and 16 coefficients further in 'in'.
static void ITransform_MIPS32(const uint8_t* ref, const int16_t* in,
                              uint8_t* dst, int do_two) {
  const int num_blocks = do_two ? 2 : 1;
  int n;
  for (n = 0; n < num_blocks; ++n) {
    ITransformOne_MIPS32(ref + 4 * n, in + 16 * n, dst + 4 * n);
  }
}
| 155 |  | 
| 156 | #undef VERTICAL_PASS | 
| 157 | #undef HORIZONTAL_PASS | 
| 158 |  | 
// Macro for one pass through the for loop in QuantizeBlock.
// QUANTDIV macro inlined.
// With j = kZigzag[n], the macro computes
//   sign  = in[j] >> 15                (0 or -1)
//   coeff = abs(in[j]) + sharpen[j]
// and, when coeff > zthresh[j]:
//   level = (coeff * iq[j] + bias[j]) >> 17, limited to max_level,
//           then negated if in[j] was negative
//   in[j] = level * q[j]
// otherwise level = 0 and in[j] = 0. In both cases level is stored to out[n].
// The numeric local label "2:" is redefined by every expansion; "2f" always
// branches forward to the one that belongs to the current expansion.
// J - offset in bytes (kZigzag[n] * 2)
// K - offset in bytes (kZigzag[n] * 4)
// N - offset in bytes (n * 2)
#define QUANTIZE_ONE(J, K, N)                                               \
  "lh           %[temp0],       " #J "(%[ppin])                     \n\t"   \
  "lhu          %[temp1],       " #J "(%[ppsharpen])                \n\t"   \
  "lw           %[temp2],       " #K "(%[ppzthresh])                \n\t"   \
  "sra          %[sign],        %[temp0],           15              \n\t"   \
  "xor          %[coeff],       %[temp0],           %[sign]         \n\t"   \
  "subu         %[coeff],       %[coeff],           %[sign]         \n\t"   \
  "addu         %[coeff],       %[coeff],           %[temp1]        \n\t"   \
  "slt          %[temp4],       %[temp2],           %[coeff]        \n\t"   \
  "addiu        %[temp5],       $zero,              0               \n\t"   \
  "addiu        %[level],       $zero,              0               \n\t"   \
  "beqz         %[temp4],       2f                                  \n\t"   \
  "lhu          %[temp1],       " #J "(%[ppiq])                     \n\t"   \
  "lw           %[temp2],       " #K "(%[ppbias])                   \n\t"   \
  "lhu          %[temp3],       " #J "(%[ppq])                      \n\t"   \
  "mul          %[level],       %[coeff],           %[temp1]        \n\t"   \
  "addu         %[level],       %[level],           %[temp2]        \n\t"   \
  "sra          %[level],       %[level],           17              \n\t"   \
  "slt          %[temp4],       %[max_level],       %[level]        \n\t"   \
  "movn         %[level],       %[max_level],       %[temp4]        \n\t"   \
  "xor          %[level],       %[level],           %[sign]         \n\t"   \
  "subu         %[level],       %[level],           %[sign]         \n\t"   \
  "mul          %[temp5],       %[level],           %[temp3]        \n\t"   \
"2:                                                                 \n\t"   \
  "sh           %[temp5],       " #J "(%[ppin])                     \n\t"   \
  "sh           %[level],       " #N "(%[pout])                     \n\t"
| 190 |  | 
// Quantizes the 16 coefficients of 'in' using the tables of 'mtx'.
// 'in' is overwritten in place (each entry becomes level * q, or 0) and the
// quantized levels are written to 'out' in the order given by the QUANTIZE_ONE
// invocations below. Returns 1 if at least one entry of 'out' is non-zero,
// 0 otherwise.
static int QuantizeBlock_MIPS32(int16_t in[16], int16_t out[16],
                                const VP8Matrix* const mtx) {
  int temp0, temp1, temp2, temp3, temp4, temp5;
  int sign, coeff, level, i;
  int max_level = MAX_LEVEL;

  // Base addresses of the buffers and tables indexed by the asm below.
  int16_t* ppin             = &in[0];
  int16_t* pout             = &out[0];
  const uint16_t* ppsharpen = &mtx->sharpen_[0];
  const uint32_t* ppzthresh = &mtx->zthresh_[0];
  const uint16_t* ppq       = &mtx->q_[0];
  const uint16_t* ppiq      = &mtx->iq_[0];
  const uint32_t* ppbias    = &mtx->bias_[0];

  __asm__ volatile(
    // Arguments are (J, K, N): byte offset into the 16-bit arrays, byte
    // offset into the 32-bit arrays (2 * J) and byte offset into out[].
    QUANTIZE_ONE( 0,  0,  0)
    QUANTIZE_ONE( 2,  4,  2)
    QUANTIZE_ONE( 8, 16,  4)
    QUANTIZE_ONE(16, 32,  6)
    QUANTIZE_ONE(10, 20,  8)
    QUANTIZE_ONE( 4,  8, 10)
    QUANTIZE_ONE( 6, 12, 12)
    QUANTIZE_ONE(12, 24, 14)
    QUANTIZE_ONE(18, 36, 16)
    QUANTIZE_ONE(24, 48, 18)
    QUANTIZE_ONE(26, 52, 20)
    QUANTIZE_ONE(20, 40, 22)
    QUANTIZE_ONE(14, 28, 24)
    QUANTIZE_ONE(22, 44, 26)
    QUANTIZE_ONE(28, 56, 28)
    QUANTIZE_ONE(30, 60, 30)

    : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1),
      [temp2]"=&r" (temp2), [temp3]"=&r" (temp3),
      [temp4]"=&r" (temp4), [temp5]"=&r" (temp5),
      [sign]"=&r" (sign), [coeff]"=&r" (coeff),
      [level]"=&r" (level)
    : [pout]"r" (pout), [ppin]"r" (ppin),
      [ppiq]"r" (ppiq), [max_level]"r" (max_level),
      [ppbias]"r" (ppbias), [ppzthresh]"r" (ppzthresh),
      [ppsharpen]"r" (ppsharpen), [ppq]"r" (ppq)
    // "memory": in[] and out[] are written through pointers inside the asm.
    : "memory" , "hi" , "lo"
  );

  // moved out from macro to increase possibility for earlier breaking
  // (out[] is scanned from the last entry to the first; the first non-zero
  // entry found ends the search)
  for (i = 15; i >= 0; i--) {
    if (out[i]) return 1;
  }
  return 0;
}
| 241 |  | 
| 242 | static int Quantize2Blocks_MIPS32(int16_t in[32], int16_t out[32], | 
| 243 |                                   const VP8Matrix* const mtx) { | 
| 244 |   int nz; | 
| 245 |   nz  = QuantizeBlock_MIPS32(in + 0 * 16, out + 0 * 16, mtx) << 0; | 
| 246 |   nz |= QuantizeBlock_MIPS32(in + 1 * 16, out + 1 * 16, mtx) << 1; | 
| 247 |   return nz; | 
| 248 | } | 
| 249 |  | 
| 250 | #undef QUANTIZE_ONE | 
| 251 |  | 
// Macro for one horizontal pass in Disto4x4 (TTransform).
// The two calls of function TTransform (one for buffer a, one for buffer b)
// are merged into a single pass. For each buffer, with in[0..3] the four
// bytes of the row, the macro computes
//   a0 = in[0] + in[2], a1 = in[1] + in[3]
//   a2 = in[1] - in[3], a3 = in[0] - in[2]
// and stores a0 + a1, a3 + a2, a3 - a2 and a0 - a1 as 32-bit words at the
// offsets E, F, G, H (results for a) and E1, F1, G1, H1 (results for b).
// A - row index: bytes are loaded from offset BPS * A of the a and b buffers
// E..H - offsets in bytes to store first results to tmp buffer
// E1..H1 - offsets in bytes to store second results to tmp buffer
#define HORIZONTAL_PASS(A, E, F, G, H, E1, F1, G1, H1)                  \
  "lbu    %[temp0],  0+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
  "lbu    %[temp1],  1+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
  "lbu    %[temp2],  2+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
  "lbu    %[temp3],  3+" XSTR(BPS) "*" #A "(%[a])  \n\t"                \
  "lbu    %[temp4],  0+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
  "lbu    %[temp5],  1+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
  "lbu    %[temp6],  2+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
  "lbu    %[temp7],  3+" XSTR(BPS) "*" #A "(%[b])  \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp2]         \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp2]         \n\t"                \
  "addu   %[temp2],  %[temp1],    %[temp3]         \n\t"                \
  "subu   %[temp1],  %[temp1],    %[temp3]         \n\t"                \
  "addu   %[temp3],  %[temp4],    %[temp6]         \n\t"                \
  "subu   %[temp4],  %[temp4],    %[temp6]         \n\t"                \
  "addu   %[temp6],  %[temp5],    %[temp7]         \n\t"                \
  "subu   %[temp5],  %[temp5],    %[temp7]         \n\t"                \
  "addu   %[temp7],  %[temp8],    %[temp2]         \n\t"                \
  "subu   %[temp2],  %[temp8],    %[temp2]         \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp1]         \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp1]         \n\t"                \
  "addu   %[temp1],  %[temp3],    %[temp6]         \n\t"                \
  "subu   %[temp3],  %[temp3],    %[temp6]         \n\t"                \
  "addu   %[temp6],  %[temp4],    %[temp5]         \n\t"                \
  "subu   %[temp4],  %[temp4],    %[temp5]         \n\t"                \
  "sw     %[temp7],  " #E "(%[tmp])                \n\t"                \
  "sw     %[temp2],  " #H "(%[tmp])                \n\t"                \
  "sw     %[temp8],  " #F "(%[tmp])                \n\t"                \
  "sw     %[temp0],  " #G "(%[tmp])                \n\t"                \
  "sw     %[temp1],  " #E1 "(%[tmp])               \n\t"                \
  "sw     %[temp3],  " #H1 "(%[tmp])               \n\t"                \
  "sw     %[temp6],  " #F1 "(%[tmp])               \n\t"                \
  "sw     %[temp4],  " #G1 "(%[tmp])               \n\t"
| 290 |  | 
// Macro for one vertical pass in Disto4x4 (TTransform).
// The two calls of function TTransform are merged into a single one:
//   const int sum1 = TTransform(a, w);
//   const int sum2 = TTransform(b, w);
//   return abs(sum2 - sum1) >> 5;
// Since only one accumulator is available in the mips32r1 instruction set,
// (sum2 - sum1) is accumulated directly in it: the terms of the second call
// (sum2) are processed first and added with madd, then the terms of the
// first call (sum1) are subtracted with msub.
// For each call, with t0..t3 the four words loaded from tmp:
//   a0 = t0 + t2, a1 = t1 + t3, a2 = t1 - t3, a3 = t0 - t2
// and the accumulated terms are
//   w[E] * abs(a0 + a1), w[F] * abs(a3 + a2),
//   w[G] * abs(a3 - a2), w[H] * abs(a0 - a1)
// (abs(x) is computed as (x ^ (x >> 31)) - (x >> 31)).
// A..D - offsets in bytes to load first results from tmp buffer
// A1..D1 - offsets in bytes to load second results from tmp buffer
// E..H - offsets in bytes to load from w buffer
#define VERTICAL_PASS(A, B, C, D, A1, B1, C1, D1, E, F, G, H)     \
  "lw     %[temp0],  " #A1 "(%[tmp])         \n\t"                \
  "lw     %[temp1],  " #C1 "(%[tmp])         \n\t"                \
  "lw     %[temp2],  " #B1 "(%[tmp])         \n\t"                \
  "lw     %[temp3],  " #D1 "(%[tmp])         \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
  "subu   %[temp2],  %[temp2],    %[temp3]   \n\t"                \
  "addu   %[temp3],  %[temp8],    %[temp1]   \n\t"                \
  "subu   %[temp8],  %[temp8],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp0],    %[temp2]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
  "sra    %[temp4],  %[temp3],    31         \n\t"                \
  "sra    %[temp5],  %[temp1],    31         \n\t"                \
  "sra    %[temp6],  %[temp0],    31         \n\t"                \
  "sra    %[temp7],  %[temp8],    31         \n\t"                \
  "xor    %[temp3],  %[temp3],    %[temp4]   \n\t"                \
  "xor    %[temp1],  %[temp1],    %[temp5]   \n\t"                \
  "xor    %[temp0],  %[temp0],    %[temp6]   \n\t"                \
  "xor    %[temp8],  %[temp8],    %[temp7]   \n\t"                \
  "subu   %[temp3],  %[temp3],    %[temp4]   \n\t"                \
  "subu   %[temp1],  %[temp1],    %[temp5]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp6]   \n\t"                \
  "subu   %[temp8],  %[temp8],    %[temp7]   \n\t"                \
  "lhu    %[temp4],  " #E "(%[w])            \n\t"                \
  "lhu    %[temp5],  " #F "(%[w])            \n\t"                \
  "lhu    %[temp6],  " #G "(%[w])            \n\t"                \
  "lhu    %[temp7],  " #H "(%[w])            \n\t"                \
  "madd   %[temp4],  %[temp3]                \n\t"                \
  "madd   %[temp5],  %[temp1]                \n\t"                \
  "madd   %[temp6],  %[temp0]                \n\t"                \
  "madd   %[temp7],  %[temp8]                \n\t"                \
  "lw     %[temp0],  " #A "(%[tmp])          \n\t"                \
  "lw     %[temp1],  " #C "(%[tmp])          \n\t"                \
  "lw     %[temp2],  " #B "(%[tmp])          \n\t"                \
  "lw     %[temp3],  " #D "(%[tmp])          \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp1]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp1]   \n\t"                \
  "addu   %[temp1],  %[temp2],    %[temp3]   \n\t"                \
  "subu   %[temp2],  %[temp2],    %[temp3]   \n\t"                \
  "addu   %[temp3],  %[temp8],    %[temp1]   \n\t"                \
  "subu   %[temp1],  %[temp8],    %[temp1]   \n\t"                \
  "addu   %[temp8],  %[temp0],    %[temp2]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp2]   \n\t"                \
  "sra    %[temp2],  %[temp3],    31         \n\t"                \
  "xor    %[temp3],  %[temp3],    %[temp2]   \n\t"                \
  "subu   %[temp3],  %[temp3],    %[temp2]   \n\t"                \
  "msub   %[temp4],  %[temp3]                \n\t"                \
  "sra    %[temp2],  %[temp8],    31         \n\t"                \
  "sra    %[temp3],  %[temp0],    31         \n\t"                \
  "sra    %[temp4],  %[temp1],    31         \n\t"                \
  "xor    %[temp8],  %[temp8],    %[temp2]   \n\t"                \
  "xor    %[temp0],  %[temp0],    %[temp3]   \n\t"                \
  "xor    %[temp1],  %[temp1],    %[temp4]   \n\t"                \
  "subu   %[temp8],  %[temp8],    %[temp2]   \n\t"                \
  "subu   %[temp0],  %[temp0],    %[temp3]   \n\t"                \
  "subu   %[temp1],  %[temp1],    %[temp4]   \n\t"                \
  "msub   %[temp5],  %[temp8]                \n\t"                \
  "msub   %[temp6],  %[temp0]                \n\t"                \
  "msub   %[temp7],  %[temp1]                \n\t"
| 364 |  | 
// Returns abs(sum2 - sum1) >> 5, where sum1 and sum2 are the sums, weighted
// by 'w', of the absolute values of the transformed 4x4 blocks of bytes at
// 'a' and 'b' respectively (rows are BPS bytes apart). Sixteen 16-bit weights
// are read from 'w'.
static int Disto4x4_MIPS32(const uint8_t* const a, const uint8_t* const b,
                           const uint16_t* const w) {
  // Results of the horizontal passes: bytes 0..63 hold the 16 words computed
  // from 'a', bytes 64..127 the 16 words computed from 'b'.
  int tmp[32];
  int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8;

  __asm__ volatile(
    HORIZONTAL_PASS(0,   0,  4,  8, 12,    64,  68,  72,  76)
    HORIZONTAL_PASS(1,  16, 20, 24, 28,    80,  84,  88,  92)
    HORIZONTAL_PASS(2,  32, 36, 40, 44,    96, 100, 104, 108)
    HORIZONTAL_PASS(3,  48, 52, 56, 60,   112, 116, 120, 124)
    // Clear the hi/lo accumulator before the madd/msub sequence.
    "mthi   $zero                             \n\t"
    "mtlo   $zero                             \n\t"
    VERTICAL_PASS( 0, 16, 32, 48,     64, 80,  96, 112,   0,  8, 16, 24)
    VERTICAL_PASS( 4, 20, 36, 52,     68, 84, 100, 116,   2, 10, 18, 26)
    VERTICAL_PASS( 8, 24, 40, 56,     72, 88, 104, 120,   4, 12, 20, 28)
    VERTICAL_PASS(12, 28, 44, 60,     76, 92, 108, 124,   6, 14, 22, 30)
    // temp0 = abs(lo) >> 5; only the low word of the accumulator is used.
    "mflo   %[temp0]                          \n\t"
    "sra    %[temp1],  %[temp0],  31          \n\t"
    "xor    %[temp0],  %[temp0],  %[temp1]    \n\t"
    "subu   %[temp0],  %[temp0],  %[temp1]    \n\t"
    "sra    %[temp0],  %[temp0],  5           \n\t"

    : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2),
      [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5),
      [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8)
    : [a]"r" (a), [b]"r" (b), [w]"r" (w), [tmp]"r" (tmp)
    // "memory": tmp[] is written and read back inside the asm through a
    // pointer that is only an input operand.
    : "memory" , "hi" , "lo"
  );

  return temp0;
}
| 396 |  | 
| 397 | #undef VERTICAL_PASS | 
| 398 | #undef HORIZONTAL_PASS | 
| 399 |  | 
| 400 | static int Disto16x16_MIPS32(const uint8_t* const a, const uint8_t* const b, | 
| 401 |                              const uint16_t* const w) { | 
| 402 |   int D = 0; | 
| 403 |   int x, y; | 
| 404 |   for (y = 0; y < 16 * BPS; y += 4 * BPS) { | 
| 405 |     for (x = 0; x < 16; x += 4) { | 
| 406 |       D += Disto4x4_MIPS32(a + x + y, b + x + y, w); | 
| 407 |     } | 
| 408 |   } | 
| 409 |   return D; | 
| 410 | } | 
| 411 |  | 
// Macro for one horizontal pass in FTransform.
// With d0..d3 the differences between the four bytes of the row read through
// args[0] and the four bytes of the row read through args[1]:
//   a0 = d0 + d3, a1 = d1 + d2, a2 = d1 - d2, a3 = d0 - d3
//   TEMP0 = (a0 + a1) << 3
//   TEMP1 = (a2 * c2217 + a3 * c5352 + 1812) >> 9
//   TEMP2 = (a0 - a1) << 3
//   TEMP3 = (a3 * c2217 - a2 * c5352 + 937) >> 9
// temp0..temp15 hold tmp[0]..tmp[15]
// A - row index: bytes are loaded from offset BPS * A of both buffers
// TEMP0..TEMP3 - registers for corresponding tmp elements; TEMP1 and TEMP2
//                first hold the two buffer addresses loaded from args
// temp16..temp20 are used as scratch registers.
#define HORIZONTAL_PASS(A, TEMP0, TEMP1, TEMP2, TEMP3)                  \
  "lw     %[" #TEMP1 "],  0(%[args])                           \n\t"    \
  "lw     %[" #TEMP2 "],  4(%[args])                           \n\t"    \
  "lbu    %[temp16],    0+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
  "lbu    %[temp17],    0+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
  "lbu    %[temp18],    1+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
  "lbu    %[temp19],    1+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
  "subu   %[temp20],    %[temp16],    %[temp17]                \n\t"    \
  "lbu    %[temp16],    2+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
  "lbu    %[temp17],    2+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
  "subu   %[" #TEMP0 "],  %[temp18],    %[temp19]              \n\t"    \
  "lbu    %[temp18],    3+" XSTR(BPS) "*" #A "(%[" #TEMP1 "])  \n\t"    \
  "lbu    %[temp19],    3+" XSTR(BPS) "*" #A "(%[" #TEMP2 "])  \n\t"    \
  "subu   %[" #TEMP1 "],  %[temp16],    %[temp17]              \n\t"    \
  "subu   %[" #TEMP2 "],  %[temp18],    %[temp19]              \n\t"    \
  "addu   %[" #TEMP3 "],  %[temp20],    %[" #TEMP2 "]          \n\t"    \
  "subu   %[" #TEMP2 "],  %[temp20],    %[" #TEMP2 "]          \n\t"    \
  "addu   %[temp20],    %[" #TEMP0 "],  %[" #TEMP1 "]          \n\t"    \
  "subu   %[" #TEMP0 "],  %[" #TEMP0 "],  %[" #TEMP1 "]        \n\t"    \
  "mul    %[temp16],    %[" #TEMP2 "],  %[c5352]               \n\t"    \
  "mul    %[temp17],    %[" #TEMP2 "],  %[c2217]               \n\t"    \
  "mul    %[temp18],    %[" #TEMP0 "],  %[c5352]               \n\t"    \
  "mul    %[temp19],    %[" #TEMP0 "],  %[c2217]               \n\t"    \
  "addu   %[" #TEMP1 "],  %[" #TEMP3 "],  %[temp20]            \n\t"    \
  "subu   %[temp20],    %[" #TEMP3 "],  %[temp20]              \n\t"    \
  "sll    %[" #TEMP0 "],  %[" #TEMP1 "],  3                    \n\t"    \
  "sll    %[" #TEMP2 "],  %[temp20],    3                      \n\t"    \
  "addiu  %[temp16],    %[temp16],    1812                     \n\t"    \
  "addiu  %[temp17],    %[temp17],    937                      \n\t"    \
  "addu   %[temp16],    %[temp16],    %[temp19]                \n\t"    \
  "subu   %[temp17],    %[temp17],    %[temp18]                \n\t"    \
  "sra    %[" #TEMP1 "],  %[temp16],    9                      \n\t"    \
  "sra    %[" #TEMP3 "],  %[temp17],    9                      \n\t"
| 449 |  | 
// Macro for one vertical pass in FTransform.
// With
//   a0 = TEMP0 + TEMP12, a1 = TEMP4 + TEMP8
//   a2 = TEMP4 - TEMP8,  a3 = TEMP0 - TEMP12
// the macro stores these 16-bit values to the buffer addressed by temp20:
//   at offset A: (a0 + a1 + 7) >> 4
//   at offset B: ((a2 * c2217 + a3 * c5352 + 12000) >> 16) + (a3 != 0)
//   at offset C: (a0 - a1 + 7) >> 4
//   at offset D: (a3 * c2217 - a2 * c5352 + 51000) >> 16
// The constant 51000 is added in two steps (30000 + 21000), each of which
// fits the signed 16-bit immediate of addiu.
// temp0..temp15 hold tmp[0]..tmp[15]
// A..D - offsets in bytes to store to out buffer
// TEMP0, TEMP4, TEMP8 and TEMP12 - registers for corresponding tmp elements
// temp16..temp19 are used as scratch registers.
#define VERTICAL_PASS(A, B, C, D, TEMP0, TEMP4, TEMP8, TEMP12)    \
  "addu   %[temp16],    %[" #TEMP0 "],  %[" #TEMP12 "]   \n\t"    \
  "subu   %[temp19],    %[" #TEMP0 "],  %[" #TEMP12 "]   \n\t"    \
  "addu   %[temp17],    %[" #TEMP4 "],  %[" #TEMP8 "]    \n\t"    \
  "subu   %[temp18],    %[" #TEMP4 "],  %[" #TEMP8 "]    \n\t"    \
  "mul    %[" #TEMP8 "],  %[temp19],    %[c2217]         \n\t"    \
  "mul    %[" #TEMP12 "], %[temp18],    %[c2217]         \n\t"    \
  "mul    %[" #TEMP4 "],  %[temp19],    %[c5352]         \n\t"    \
  "mul    %[temp18],    %[temp18],    %[c5352]           \n\t"    \
  "addiu  %[temp16],    %[temp16],    7                  \n\t"    \
  "addu   %[" #TEMP0 "],  %[temp16],    %[temp17]        \n\t"    \
  "sra    %[" #TEMP0 "],  %[" #TEMP0 "],  4              \n\t"    \
  "addu   %[" #TEMP12 "], %[" #TEMP12 "], %[" #TEMP4 "]  \n\t"    \
  "subu   %[" #TEMP4 "],  %[temp16],    %[temp17]        \n\t"    \
  "sra    %[" #TEMP4 "],  %[" #TEMP4 "],  4              \n\t"    \
  "addiu  %[" #TEMP8 "],  %[" #TEMP8 "],  30000          \n\t"    \
  "addiu  %[" #TEMP12 "], %[" #TEMP12 "], 12000          \n\t"    \
  "addiu  %[" #TEMP8 "],  %[" #TEMP8 "],  21000          \n\t"    \
  "subu   %[" #TEMP8 "],  %[" #TEMP8 "],  %[temp18]      \n\t"    \
  "sra    %[" #TEMP12 "], %[" #TEMP12 "], 16             \n\t"    \
  "sra    %[" #TEMP8 "],  %[" #TEMP8 "],  16             \n\t"    \
  "addiu  %[temp16],    %[" #TEMP12 "], 1                \n\t"    \
  "movn   %[" #TEMP12 "], %[temp16],    %[temp19]        \n\t"    \
  "sh     %[" #TEMP0 "],  " #A "(%[temp20])              \n\t"    \
  "sh     %[" #TEMP4 "],  " #C "(%[temp20])              \n\t"    \
  "sh     %[" #TEMP8 "],  " #D "(%[temp20])              \n\t"    \
  "sh     %[" #TEMP12 "], " #B "(%[temp20])              \n\t"
| 481 |  | 
| 482 | static void FTransform_MIPS32(const uint8_t* src, const uint8_t* ref, | 
| 483 |                               int16_t* out) { | 
| 484 |   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7, temp8; | 
| 485 |   int temp9, temp10, temp11, temp12, temp13, temp14, temp15, temp16; | 
| 486 |   int temp17, temp18, temp19, temp20; | 
| 487 |   const int c2217 = 2217; | 
| 488 |   const int c5352 = 5352; | 
| 489 |   const int* const args[3] = | 
| 490 |       { (const int*)src, (const int*)ref, (const int*)out }; | 
| 491 |  | 
| 492 |   __asm__ volatile( | 
| 493 |     HORIZONTAL_PASS(0, temp0,  temp1,  temp2,  temp3) | 
| 494 |     HORIZONTAL_PASS(1, temp4,  temp5,  temp6,  temp7) | 
| 495 |     HORIZONTAL_PASS(2, temp8,  temp9,  temp10, temp11) | 
| 496 |     HORIZONTAL_PASS(3, temp12, temp13, temp14, temp15) | 
| 497 |     "lw   %[temp20],    8(%[args])                     \n\t"  | 
| 498 |     VERTICAL_PASS(0,  8, 16, 24, temp0, temp4, temp8,  temp12) | 
| 499 |     VERTICAL_PASS(2, 10, 18, 26, temp1, temp5, temp9,  temp13) | 
| 500 |     VERTICAL_PASS(4, 12, 20, 28, temp2, temp6, temp10, temp14) | 
| 501 |     VERTICAL_PASS(6, 14, 22, 30, temp3, temp7, temp11, temp15) | 
| 502 |  | 
| 503 |     : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), | 
| 504 |       [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), | 
| 505 |       [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [temp8]"=&r" (temp8), | 
| 506 |       [temp9]"=&r" (temp9), [temp10]"=&r" (temp10), [temp11]"=&r" (temp11), | 
| 507 |       [temp12]"=&r" (temp12), [temp13]"=&r" (temp13), [temp14]"=&r" (temp14), | 
| 508 |       [temp15]"=&r" (temp15), [temp16]"=&r" (temp16), [temp17]"=&r" (temp17), | 
| 509 |       [temp18]"=&r" (temp18), [temp19]"=&r" (temp19), [temp20]"=&r" (temp20) | 
| 510 |     : [args]"r" (args), [c2217]"r" (c2217), [c5352]"r" (c5352) | 
| 511 |     : "memory" , "hi" , "lo"  | 
| 512 |   ); | 
| 513 | } | 
| 514 |  | 
| 515 | #undef VERTICAL_PASS | 
| 516 | #undef HORIZONTAL_PASS | 
| 517 |  | 
| 518 | #if !defined(WORK_AROUND_GCC) | 
| 519 |  | 
// Accumulates the squared differences of four byte pairs into the HI/LO
// multiply-accumulate registers (via madd).
// A..D - byte offsets applied to both the 'a' and the 'b' pointer operands.
//        Each argument is stringified into the instruction text, so it may
//        be any constant expression the assembler can evaluate (GET_SSE
//        passes forms such as "A + 1").
// Overwrites temp0..temp7. HI/LO are only added to, never cleared, so the
// caller has to zero them before the first expansion.
#define GET_SSE_INNER(A, B, C, D)                               \
  "lbu     %[temp0],    " #A "(%[a])                 \n\t"      \
  "lbu     %[temp1],    " #A "(%[b])                 \n\t"      \
  "lbu     %[temp2],    " #B "(%[a])                 \n\t"      \
  "lbu     %[temp3],    " #B "(%[b])                 \n\t"      \
  "lbu     %[temp4],    " #C "(%[a])                 \n\t"      \
  "lbu     %[temp5],    " #C "(%[b])                 \n\t"      \
  "lbu     %[temp6],    " #D "(%[a])                 \n\t"      \
  "lbu     %[temp7],    " #D "(%[b])                 \n\t"      \
  "subu    %[temp0],    %[temp0],     %[temp1]       \n\t"      \
  "subu    %[temp2],    %[temp2],     %[temp3]       \n\t"      \
  "subu    %[temp4],    %[temp4],     %[temp5]       \n\t"      \
  "subu    %[temp6],    %[temp6],     %[temp7]       \n\t"      \
  "madd    %[temp0],    %[temp0]                     \n\t"      \
  "madd    %[temp2],    %[temp2]                     \n\t"      \
  "madd    %[temp4],    %[temp4]                     \n\t"      \
  "madd    %[temp6],    %[temp6]                     \n\t"
| 537 |  | 
// Expands GET_SSE_INNER once per argument. Each argument is the byte offset
// of the first of four consecutive bytes, so a single GET_SSE accumulates
// 16 byte pairs. The arguments are macro-expanded before GET_SSE_INNER
// stringifies them.
#define GET_SSE(OFF0, OFF1, OFF2, OFF3)                 \
  GET_SSE_INNER(OFF0, OFF0 + 1, OFF0 + 2, OFF0 + 3)     \
  GET_SSE_INNER(OFF1, OFF1 + 1, OFF1 + 2, OFF1 + 3)     \
  GET_SSE_INNER(OFF2, OFF2 + 1, OFF2 + 2, OFF2 + 3)     \
  GET_SSE_INNER(OFF3, OFF3 + 1, OFF3 + 2, OFF3 + 3)
| 543 |  | 
| 544 | static int SSE16x16_MIPS32(const uint8_t* a, const uint8_t* b) { | 
| 545 |   int count; | 
| 546 |   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; | 
| 547 |  | 
| 548 |   __asm__ volatile( | 
| 549 |      "mult   $zero,    $zero                            \n\t"  | 
| 550 |  | 
| 551 |      GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS) | 
| 552 |      GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS) | 
| 553 |      GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS) | 
| 554 |      GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS) | 
| 555 |      GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS) | 
| 556 |      GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS) | 
| 557 |      GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS) | 
| 558 |      GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS) | 
| 559 |      GET_SSE( 8 * BPS, 4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS) | 
| 560 |      GET_SSE( 9 * BPS, 4 +  9 * BPS, 8 +  9 * BPS, 12 +  9 * BPS) | 
| 561 |      GET_SSE(10 * BPS, 4 + 10 * BPS, 8 + 10 * BPS, 12 + 10 * BPS) | 
| 562 |      GET_SSE(11 * BPS, 4 + 11 * BPS, 8 + 11 * BPS, 12 + 11 * BPS) | 
| 563 |      GET_SSE(12 * BPS, 4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS) | 
| 564 |      GET_SSE(13 * BPS, 4 + 13 * BPS, 8 + 13 * BPS, 12 + 13 * BPS) | 
| 565 |      GET_SSE(14 * BPS, 4 + 14 * BPS, 8 + 14 * BPS, 12 + 14 * BPS) | 
| 566 |      GET_SSE(15 * BPS, 4 + 15 * BPS, 8 + 15 * BPS, 12 + 15 * BPS) | 
| 567 |  | 
| 568 |     "mflo    %[count]                                   \n\t"  | 
| 569 |     : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), | 
| 570 |       [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), | 
| 571 |       [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [count]"=&r" (count) | 
| 572 |     : [a]"r" (a), [b]"r" (b) | 
| 573 |     : "memory" , "hi" , "lo"  | 
| 574 |   ); | 
| 575 |   return count; | 
| 576 | } | 
| 577 |  | 
| 578 | static int SSE16x8_MIPS32(const uint8_t* a, const uint8_t* b) { | 
| 579 |   int count; | 
| 580 |   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; | 
| 581 |  | 
| 582 |   __asm__ volatile( | 
| 583 |      "mult   $zero,    $zero                            \n\t"  | 
| 584 |  | 
| 585 |      GET_SSE( 0 * BPS, 4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS) | 
| 586 |      GET_SSE( 1 * BPS, 4 +  1 * BPS, 8 +  1 * BPS, 12 +  1 * BPS) | 
| 587 |      GET_SSE( 2 * BPS, 4 +  2 * BPS, 8 +  2 * BPS, 12 +  2 * BPS) | 
| 588 |      GET_SSE( 3 * BPS, 4 +  3 * BPS, 8 +  3 * BPS, 12 +  3 * BPS) | 
| 589 |      GET_SSE( 4 * BPS, 4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS) | 
| 590 |      GET_SSE( 5 * BPS, 4 +  5 * BPS, 8 +  5 * BPS, 12 +  5 * BPS) | 
| 591 |      GET_SSE( 6 * BPS, 4 +  6 * BPS, 8 +  6 * BPS, 12 +  6 * BPS) | 
| 592 |      GET_SSE( 7 * BPS, 4 +  7 * BPS, 8 +  7 * BPS, 12 +  7 * BPS) | 
| 593 |  | 
| 594 |     "mflo    %[count]                                   \n\t"  | 
| 595 |     : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), | 
| 596 |       [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), | 
| 597 |       [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [count]"=&r" (count) | 
| 598 |     : [a]"r" (a), [b]"r" (b) | 
| 599 |     : "memory" , "hi" , "lo"  | 
| 600 |   ); | 
| 601 |   return count; | 
| 602 | } | 
| 603 |  | 
| 604 | static int SSE8x8_MIPS32(const uint8_t* a, const uint8_t* b) { | 
| 605 |   int count; | 
| 606 |   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; | 
| 607 |  | 
| 608 |   __asm__ volatile( | 
| 609 |      "mult   $zero,    $zero                            \n\t"  | 
| 610 |  | 
| 611 |      GET_SSE(0 * BPS, 4 + 0 * BPS, 1 * BPS, 4 + 1 * BPS) | 
| 612 |      GET_SSE(2 * BPS, 4 + 2 * BPS, 3 * BPS, 4 + 3 * BPS) | 
| 613 |      GET_SSE(4 * BPS, 4 + 4 * BPS, 5 * BPS, 4 + 5 * BPS) | 
| 614 |      GET_SSE(6 * BPS, 4 + 6 * BPS, 7 * BPS, 4 + 7 * BPS) | 
| 615 |  | 
| 616 |     "mflo    %[count]                                   \n\t"  | 
| 617 |     : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), | 
| 618 |       [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), | 
| 619 |       [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [count]"=&r" (count) | 
| 620 |     : [a]"r" (a), [b]"r" (b) | 
| 621 |     : "memory" , "hi" , "lo"  | 
| 622 |   ); | 
| 623 |   return count; | 
| 624 | } | 
| 625 |  | 
| 626 | static int SSE4x4_MIPS32(const uint8_t* a, const uint8_t* b) { | 
| 627 |   int count; | 
| 628 |   int temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; | 
| 629 |  | 
| 630 |   __asm__ volatile( | 
| 631 |      "mult   $zero,    $zero                            \n\t"  | 
| 632 |  | 
| 633 |      GET_SSE(0 * BPS, 1 * BPS, 2 * BPS, 3 * BPS) | 
| 634 |  | 
| 635 |     "mflo    %[count]                                   \n\t"  | 
| 636 |     : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), [temp2]"=&r" (temp2), | 
| 637 |       [temp3]"=&r" (temp3), [temp4]"=&r" (temp4), [temp5]"=&r" (temp5), | 
| 638 |       [temp6]"=&r" (temp6), [temp7]"=&r" (temp7), [count]"=&r" (count) | 
| 639 |     : [a]"r" (a), [b]"r" (b) | 
| 640 |     : "memory" , "hi" , "lo"  | 
| 641 |   ); | 
| 642 |   return count; | 
| 643 | } | 
| 644 |  | 
| 645 | #undef GET_SSE | 
| 646 | #undef GET_SSE_INNER | 
| 647 |  | 
| 648 | #endif  // !WORK_AROUND_GCC | 
| 649 |  | 
| 650 | //------------------------------------------------------------------------------ | 
| 651 | // Entry point | 
| 652 |  | 
| 653 | extern void VP8EncDspInitMIPS32(void); | 
| 654 |  | 
| 655 | WEBP_TSAN_IGNORE_FUNCTION void VP8EncDspInitMIPS32(void) { | 
| 656 |   VP8ITransform = ITransform_MIPS32; | 
| 657 |   VP8FTransform = FTransform_MIPS32; | 
| 658 |  | 
| 659 |   VP8EncQuantizeBlock = QuantizeBlock_MIPS32; | 
| 660 |   VP8EncQuantize2Blocks = Quantize2Blocks_MIPS32; | 
| 661 |  | 
| 662 |   VP8TDisto4x4 = Disto4x4_MIPS32; | 
| 663 |   VP8TDisto16x16 = Disto16x16_MIPS32; | 
| 664 |  | 
| 665 | #if !defined(WORK_AROUND_GCC) | 
| 666 |   VP8SSE16x16 = SSE16x16_MIPS32; | 
| 667 |   VP8SSE8x8 = SSE8x8_MIPS32; | 
| 668 |   VP8SSE16x8 = SSE16x8_MIPS32; | 
| 669 |   VP8SSE4x4 = SSE4x4_MIPS32; | 
| 670 | #endif | 
| 671 | } | 
| 672 |  | 
| 673 | #else  // !WEBP_USE_MIPS32 | 
| 674 |  | 
// Builds without WEBP_USE_MIPS32: WEBP_DSP_INIT_STUB is not defined in this
// file (it comes in through src/dsp/dsp.h); presumably it emits a no-op
// VP8EncDspInitMIPS32 so the symbol still exists. TODO confirm against the
// macro definition.
WEBP_DSP_INIT_STUB(VP8EncDspInitMIPS32)
| 676 |  | 
| 677 | #endif  // WEBP_USE_MIPS32 | 
| 678 |  |