1 | // Copyright 2015 Google Inc. All Rights Reserved. |
2 | // |
3 | // Use of this source code is governed by a BSD-style license |
4 | // that can be found in the COPYING file in the root of the source |
5 | // tree. An additional intellectual property rights grant can be found |
6 | // in the file PATENTS. All contributing project authors may |
7 | // be found in the AUTHORS file in the root of the source tree. |
8 | // ----------------------------------------------------------------------------- |
9 | // |
10 | // MIPS version of lossless functions |
11 | // |
12 | // Author(s): Djordje Pesut (djordje.pesut@imgtec.com) |
13 | // Jovan Zelincevic (jovan.zelincevic@imgtec.com) |
14 | |
15 | #include "src/dsp/dsp.h" |
16 | #include "src/dsp/lossless.h" |
17 | #include "src/dsp/lossless_common.h" |
18 | |
19 | #if defined(WEBP_USE_MIPS32) |
20 | |
21 | #include <assert.h> |
22 | #include <math.h> |
23 | #include <stdlib.h> |
24 | #include <string.h> |
25 | |
26 | static float FastSLog2Slow_MIPS32(uint32_t v) { |
27 | assert(v >= LOG_LOOKUP_IDX_MAX); |
28 | if (v < APPROX_LOG_WITH_CORRECTION_MAX) { |
29 | uint32_t log_cnt, y, correction; |
30 | const int c24 = 24; |
31 | const float v_f = (float)v; |
32 | uint32_t temp; |
33 | |
    // The lookup table covers values below 256 = 2^8.
    // log_cnt is the index of the leading one in the upper 24 bits of v,
    // so that temp = (v >> log_cnt) < 256 indexes kLog2Table.
36 | __asm__ volatile( |
37 | "clz %[log_cnt], %[v] \n\t" |
38 | "addiu %[y], $zero, 1 \n\t" |
39 | "subu %[log_cnt], %[c24], %[log_cnt] \n\t" |
40 | "sllv %[y], %[y], %[log_cnt] \n\t" |
41 | "srlv %[temp], %[v], %[log_cnt] \n\t" |
42 | : [log_cnt]"=&r" (log_cnt), [y]"=&r" (y), |
43 | [temp]"=r" (temp) |
44 | : [c24]"r" (c24), [v]"r" (v) |
45 | ); |
46 | |
47 | // vf = (2^log_cnt) * Xf; where y = 2^log_cnt and Xf < 256 |
48 | // Xf = floor(Xf) * (1 + (v % y) / v) |
49 | // log2(Xf) = log2(floor(Xf)) + log2(1 + (v % y) / v) |
    // The correction factor: log(1 + d) ~ d for very small d, so
    // log2(1 + (v % y) / v) ~ LOG_2_RECIPROCAL * (v % y) / v
    // LOG_2_RECIPROCAL ~ 23/16
53 | |
54 | // (v % y) = (v % 2^log_cnt) = v & (2^log_cnt - 1) |
55 | correction = (23 * (v & (y - 1))) >> 4; |
56 | return v_f * (kLog2Table[temp] + log_cnt) + correction; |
57 | } else { |
58 | return (float)(LOG_2_RECIPROCAL * v * log((double)v)); |
59 | } |
60 | } |
61 | |
62 | static float FastLog2Slow_MIPS32(uint32_t v) { |
63 | assert(v >= LOG_LOOKUP_IDX_MAX); |
64 | if (v < APPROX_LOG_WITH_CORRECTION_MAX) { |
65 | uint32_t log_cnt, y; |
66 | const int c24 = 24; |
67 | double log_2; |
68 | uint32_t temp; |
69 | |
70 | __asm__ volatile( |
71 | "clz %[log_cnt], %[v] \n\t" |
72 | "addiu %[y], $zero, 1 \n\t" |
73 | "subu %[log_cnt], %[c24], %[log_cnt] \n\t" |
74 | "sllv %[y], %[y], %[log_cnt] \n\t" |
75 | "srlv %[temp], %[v], %[log_cnt] \n\t" |
76 | : [log_cnt]"=&r" (log_cnt), [y]"=&r" (y), |
77 | [temp]"=r" (temp) |
78 | : [c24]"r" (c24), [v]"r" (v) |
79 | ); |
80 | |
81 | log_2 = kLog2Table[temp] + log_cnt; |
82 | if (v >= APPROX_LOG_MAX) { |
83 | // Since the division is still expensive, add this correction factor only |
84 | // for large values of 'v'. |
85 | |
86 | const uint32_t correction = (23 * (v & (y - 1))) >> 4; |
87 | log_2 += (double)correction / v; |
88 | } |
89 | return (float)log_2; |
90 | } else { |
91 | return (float)(LOG_2_RECIPROCAL * log((double)v)); |
92 | } |
93 | } |
94 | |
95 | // C version of this function: |
96 | // int i = 0; |
97 | // int64_t cost = 0; |
98 | // const uint32_t* pop = &population[4]; |
99 | // const uint32_t* LoopEnd = &population[length]; |
100 | // while (pop != LoopEnd) { |
101 | // ++i; |
102 | // cost += i * *pop; |
103 | // cost += i * *(pop + 1); |
104 | // pop += 2; |
105 | // } |
106 | // return (double)cost; |
107 | static double ExtraCost_MIPS32(const uint32_t* const population, int length) { |
108 | int i, temp0, temp1; |
109 | const uint32_t* pop = &population[4]; |
110 | const uint32_t* const LoopEnd = &population[length]; |
111 | |
112 | __asm__ volatile( |
113 | "mult $zero, $zero \n\t" |
114 | "xor %[i], %[i], %[i] \n\t" |
115 | "beq %[pop], %[LoopEnd], 2f \n\t" |
116 | "1: \n\t" |
117 | "lw %[temp0], 0(%[pop]) \n\t" |
118 | "lw %[temp1], 4(%[pop]) \n\t" |
119 | "addiu %[i], %[i], 1 \n\t" |
120 | "addiu %[pop], %[pop], 8 \n\t" |
121 | "madd %[i], %[temp0] \n\t" |
122 | "madd %[i], %[temp1] \n\t" |
123 | "bne %[pop], %[LoopEnd], 1b \n\t" |
124 | "2: \n\t" |
125 | "mfhi %[temp0] \n\t" |
126 | "mflo %[temp1] \n\t" |
127 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), |
128 | [i]"=&r" (i), [pop]"+r" (pop) |
129 | : [LoopEnd]"r" (LoopEnd) |
130 | : "memory" , "hi" , "lo" |
131 | ); |
132 | |
  // Reassemble the 64-bit accumulator from the hi:lo pair; the low word is
  // cast to unsigned so that it is not sign-extended into the upper half.
  return (double)((int64_t)temp0 << 32 | (uint32_t)temp1);
134 | } |
135 | |
136 | // C version of this function: |
137 | // int i = 0; |
138 | // int64_t cost = 0; |
139 | // const uint32_t* pX = &X[4]; |
140 | // const uint32_t* pY = &Y[4]; |
141 | // const uint32_t* LoopEnd = &X[length]; |
142 | // while (pX != LoopEnd) { |
143 | // const uint32_t xy0 = *pX + *pY; |
144 | // const uint32_t xy1 = *(pX + 1) + *(pY + 1); |
145 | // ++i; |
146 | // cost += i * xy0; |
147 | // cost += i * xy1; |
148 | // pX += 2; |
149 | // pY += 2; |
150 | // } |
151 | // return (double)cost; |
152 | static double ExtraCostCombined_MIPS32(const uint32_t* const X, |
153 | const uint32_t* const Y, int length) { |
154 | int i, temp0, temp1, temp2, temp3; |
155 | const uint32_t* pX = &X[4]; |
156 | const uint32_t* pY = &Y[4]; |
157 | const uint32_t* const LoopEnd = &X[length]; |
158 | |
159 | __asm__ volatile( |
160 | "mult $zero, $zero \n\t" |
161 | "xor %[i], %[i], %[i] \n\t" |
162 | "beq %[pX], %[LoopEnd], 2f \n\t" |
163 | "1: \n\t" |
164 | "lw %[temp0], 0(%[pX]) \n\t" |
165 | "lw %[temp1], 0(%[pY]) \n\t" |
166 | "lw %[temp2], 4(%[pX]) \n\t" |
167 | "lw %[temp3], 4(%[pY]) \n\t" |
168 | "addiu %[i], %[i], 1 \n\t" |
169 | "addu %[temp0], %[temp0], %[temp1] \n\t" |
170 | "addu %[temp2], %[temp2], %[temp3] \n\t" |
171 | "addiu %[pX], %[pX], 8 \n\t" |
172 | "addiu %[pY], %[pY], 8 \n\t" |
173 | "madd %[i], %[temp0] \n\t" |
174 | "madd %[i], %[temp2] \n\t" |
175 | "bne %[pX], %[LoopEnd], 1b \n\t" |
176 | "2: \n\t" |
177 | "mfhi %[temp0] \n\t" |
178 | "mflo %[temp1] \n\t" |
179 | : [temp0]"=&r" (temp0), [temp1]"=&r" (temp1), |
180 | [temp2]"=&r" (temp2), [temp3]"=&r" (temp3), |
181 | [i]"=&r" (i), [pX]"+r" (pX), [pY]"+r" (pY) |
182 | : [LoopEnd]"r" (LoopEnd) |
183 | : "memory" , "hi" , "lo" |
184 | ); |
185 | |
  // As in ExtraCost_MIPS32: avoid sign-extending the low word.
  return (double)((int64_t)temp0 << 32 | (uint32_t)temp1);
187 | } |
188 | |
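// C version of this macro (a reference sketch; 'streak', 'pstreaks', 'pcnts'
// and 'temp0' are the locals of GetEntropyUnrefinedHelper below, with
// temp0 == (*val_prev != 0) on entry):
//   stats->counts[temp0] += (streak > 3);
//   stats->streaks[temp0][streak > 3] += streak;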
189 | #define HUFFMAN_COST_PASS \ |
190 | __asm__ volatile( \ |
191 | "sll %[temp1], %[temp0], 3 \n\t" \ |
192 | "addiu %[temp3], %[streak], -3 \n\t" \ |
193 | "addu %[temp2], %[pstreaks], %[temp1] \n\t" \ |
194 | "blez %[temp3], 1f \n\t" \ |
195 | "srl %[temp1], %[temp1], 1 \n\t" \ |
196 | "addu %[temp3], %[pcnts], %[temp1] \n\t" \ |
197 | "lw %[temp0], 4(%[temp2]) \n\t" \ |
198 | "lw %[temp1], 0(%[temp3]) \n\t" \ |
199 | "addu %[temp0], %[temp0], %[streak] \n\t" \ |
200 | "addiu %[temp1], %[temp1], 1 \n\t" \ |
201 | "sw %[temp0], 4(%[temp2]) \n\t" \ |
202 | "sw %[temp1], 0(%[temp3]) \n\t" \ |
203 | "b 2f \n\t" \ |
204 | "1: \n\t" \ |
205 | "lw %[temp0], 0(%[temp2]) \n\t" \ |
206 | "addu %[temp0], %[temp0], %[streak] \n\t" \ |
207 | "sw %[temp0], 0(%[temp2]) \n\t" \ |
208 | "2: \n\t" \ |
209 | : [temp1]"=&r"(temp1), [temp2]"=&r"(temp2), \ |
210 | [temp3]"=&r"(temp3), [temp0]"+r"(temp0) \ |
211 | : [pstreaks]"r"(pstreaks), [pcnts]"r"(pcnts), \ |
212 | [streak]"r"(streak) \ |
213 | : "memory" \ |
214 | ); |
215 | |
// Accumulates the various RLE (streak) counts and bit-entropy statistics.
217 | static WEBP_INLINE void GetEntropyUnrefinedHelper( |
218 | uint32_t val, int i, uint32_t* const val_prev, int* const i_prev, |
219 | VP8LBitEntropy* const bit_entropy, VP8LStreaks* const stats) { |
220 | int* const pstreaks = &stats->streaks[0][0]; |
221 | int* const pcnts = &stats->counts[0]; |
222 | int temp0, temp1, temp2, temp3; |
223 | const int streak = i - *i_prev; |
224 | |
225 | // Gather info for the bit entropy. |
226 | if (*val_prev != 0) { |
227 | bit_entropy->sum += (*val_prev) * streak; |
228 | bit_entropy->nonzeros += streak; |
229 | bit_entropy->nonzero_code = *i_prev; |
230 | bit_entropy->entropy -= VP8LFastSLog2(*val_prev) * streak; |
231 | if (bit_entropy->max_val < *val_prev) { |
232 | bit_entropy->max_val = *val_prev; |
233 | } |
234 | } |
235 | |
236 | // Gather info for the Huffman cost. |
237 | temp0 = (*val_prev != 0); |
238 | HUFFMAN_COST_PASS |
239 | |
240 | *val_prev = val; |
241 | *i_prev = i; |
242 | } |
243 | |
244 | static void GetEntropyUnrefined_MIPS32(const uint32_t X[], int length, |
245 | VP8LBitEntropy* const bit_entropy, |
246 | VP8LStreaks* const stats) { |
247 | int i; |
248 | int i_prev = 0; |
249 | uint32_t x_prev = X[0]; |
250 | |
251 | memset(stats, 0, sizeof(*stats)); |
252 | VP8LBitEntropyInit(bit_entropy); |
253 | |
254 | for (i = 1; i < length; ++i) { |
255 | const uint32_t x = X[i]; |
256 | if (x != x_prev) { |
257 | GetEntropyUnrefinedHelper(x, i, &x_prev, &i_prev, bit_entropy, stats); |
258 | } |
259 | } |
260 | GetEntropyUnrefinedHelper(0, i, &x_prev, &i_prev, bit_entropy, stats); |
261 | |
262 | bit_entropy->entropy += VP8LFastSLog2(bit_entropy->sum); |
263 | } |
264 | |
265 | static void GetCombinedEntropyUnrefined_MIPS32(const uint32_t X[], |
266 | const uint32_t Y[], |
267 | int length, |
268 | VP8LBitEntropy* const entropy, |
269 | VP8LStreaks* const stats) { |
270 | int i = 1; |
271 | int i_prev = 0; |
272 | uint32_t xy_prev = X[0] + Y[0]; |
273 | |
274 | memset(stats, 0, sizeof(*stats)); |
275 | VP8LBitEntropyInit(entropy); |
276 | |
277 | for (i = 1; i < length; ++i) { |
278 | const uint32_t xy = X[i] + Y[i]; |
279 | if (xy != xy_prev) { |
280 | GetEntropyUnrefinedHelper(xy, i, &xy_prev, &i_prev, entropy, stats); |
281 | } |
282 | } |
283 | GetEntropyUnrefinedHelper(0, i, &xy_prev, &i_prev, entropy, stats); |
284 | |
285 | entropy->entropy += VP8LFastSLog2(entropy->sum); |
286 | } |
287 | |
288 | #define ASM_START \ |
289 | __asm__ volatile( \ |
290 | ".set push \n\t" \ |
291 | ".set at \n\t" \ |
292 | ".set macro \n\t" \ |
293 | "1: \n\t" |
294 | |
295 | // P2 = P0 + P1 |
296 | // A..D - offsets |
// E - compile-time flag telling the macro
//     whether the P1 pointer should be incremented
// The literal_ and successive histogram arrays may be unaligned,
// so unaligned loads (ulw) and stores (usw) must be used.
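
// C version of one iteration with (A, B, C, D) = (0, 4, 8, 12), for
// reference; the ulw/usw pairs below perform the same unaligned accesses:
//   P2[0] = P0[0] + P1[0];  P2[1] = P0[1] + P1[1];
//   P2[2] = P0[2] + P1[2];  P2[3] = P0[3] + P1[3];
//   P0 += 4;  if (E) P1 += 4;  P2 += 4;   // loop back while P0 != LoopEnd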
301 | #define ADD_TO_OUT(A, B, C, D, E, P0, P1, P2) \ |
302 | "ulw %[temp0], " #A "(%[" #P0 "]) \n\t" \ |
303 | "ulw %[temp1], " #B "(%[" #P0 "]) \n\t" \ |
304 | "ulw %[temp2], " #C "(%[" #P0 "]) \n\t" \ |
305 | "ulw %[temp3], " #D "(%[" #P0 "]) \n\t" \ |
306 | "ulw %[temp4], " #A "(%[" #P1 "]) \n\t" \ |
307 | "ulw %[temp5], " #B "(%[" #P1 "]) \n\t" \ |
308 | "ulw %[temp6], " #C "(%[" #P1 "]) \n\t" \ |
309 | "ulw %[temp7], " #D "(%[" #P1 "]) \n\t" \ |
310 | "addu %[temp4], %[temp4], %[temp0] \n\t" \ |
311 | "addu %[temp5], %[temp5], %[temp1] \n\t" \ |
312 | "addu %[temp6], %[temp6], %[temp2] \n\t" \ |
313 | "addu %[temp7], %[temp7], %[temp3] \n\t" \ |
314 | "addiu %[" #P0 "], %[" #P0 "], 16 \n\t" \ |
315 | ".if " #E " == 1 \n\t" \ |
316 | "addiu %[" #P1 "], %[" #P1 "], 16 \n\t" \ |
317 | ".endif \n\t" \ |
318 | "usw %[temp4], " #A "(%[" #P2 "]) \n\t" \ |
319 | "usw %[temp5], " #B "(%[" #P2 "]) \n\t" \ |
320 | "usw %[temp6], " #C "(%[" #P2 "]) \n\t" \ |
321 | "usw %[temp7], " #D "(%[" #P2 "]) \n\t" \ |
322 | "addiu %[" #P2 "], %[" #P2 "], 16 \n\t" \ |
323 | "bne %[" #P0 "], %[LoopEnd], 1b \n\t" \ |
324 | ".set pop \n\t" \ |
325 | |
326 | #define ASM_END_COMMON_0 \ |
327 | : [temp0]"=&r"(temp0), [temp1]"=&r"(temp1), \ |
328 | [temp2]"=&r"(temp2), [temp3]"=&r"(temp3), \ |
329 | [temp4]"=&r"(temp4), [temp5]"=&r"(temp5), \ |
330 | [temp6]"=&r"(temp6), [temp7]"=&r"(temp7), \ |
331 | [pa]"+r"(pa), [pout]"+r"(pout) |
332 | |
333 | #define ASM_END_COMMON_1 \ |
334 | : [LoopEnd]"r"(LoopEnd) \ |
335 | : "memory", "at" \ |
336 | ); |
337 | |
338 | #define ASM_END_0 \ |
339 | ASM_END_COMMON_0 \ |
340 | , [pb]"+r"(pb) \ |
341 | ASM_END_COMMON_1 |
342 | |
343 | #define ASM_END_1 \ |
344 | ASM_END_COMMON_0 \ |
345 | ASM_END_COMMON_1 |
346 | |
347 | static void AddVector_MIPS32(const uint32_t* pa, const uint32_t* pb, |
348 | uint32_t* pout, int size) { |
349 | uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; |
  const int end = ((size) / 4) * 4;
  const uint32_t* const LoopEnd = pa + end;
  int i;
  // The asm loop is do-while style (the bne test is at the bottom), so it
  // must be skipped entirely when there is no full 4-word block to process.
  if (end > 0) {
    ASM_START
    ADD_TO_OUT(0, 4, 8, 12, 1, pa, pb, pout)
    ASM_END_0
  }
356 | for (i = end; i < size; ++i) pout[i] = pa[i] + pb[i]; |
357 | } |
358 | |
359 | static void AddVectorEq_MIPS32(const uint32_t* pa, uint32_t* pout, int size) { |
360 | uint32_t temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7; |
  const int end = ((size) / 4) * 4;
  const uint32_t* const LoopEnd = pa + end;
  int i;
  // Same do-while caveat as in AddVector_MIPS32: skip the asm loop when
  // there is no full 4-word block.
  if (end > 0) {
    ASM_START
    ADD_TO_OUT(0, 4, 8, 12, 0, pa, pout, pout)
    ASM_END_1
  }
367 | for (i = end; i < size; ++i) pout[i] += pa[i]; |
368 | } |
369 | |
370 | #undef ASM_END_1 |
371 | #undef ASM_END_0 |
372 | #undef ASM_END_COMMON_1 |
373 | #undef ASM_END_COMMON_0 |
374 | #undef ADD_TO_OUT |
375 | #undef ASM_START |
376 | |
377 | //------------------------------------------------------------------------------ |
378 | // Entry point |
379 | |
380 | extern void VP8LEncDspInitMIPS32(void); |
381 | |
382 | WEBP_TSAN_IGNORE_FUNCTION void VP8LEncDspInitMIPS32(void) { |
383 | VP8LFastSLog2Slow = FastSLog2Slow_MIPS32; |
384 | VP8LFastLog2Slow = FastLog2Slow_MIPS32; |
385 | VP8LExtraCost = ExtraCost_MIPS32; |
386 | VP8LExtraCostCombined = ExtraCostCombined_MIPS32; |
387 | VP8LGetEntropyUnrefined = GetEntropyUnrefined_MIPS32; |
388 | VP8LGetCombinedEntropyUnrefined = GetCombinedEntropyUnrefined_MIPS32; |
389 | VP8LAddVector = AddVector_MIPS32; |
390 | VP8LAddVectorEq = AddVectorEq_MIPS32; |
391 | } |
392 | |
393 | #else // !WEBP_USE_MIPS32 |
394 | |
395 | WEBP_DSP_INIT_STUB(VP8LEncDspInitMIPS32) |
396 | |
397 | #endif // WEBP_USE_MIPS32 |
398 | |