// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Quantization
//
// Author: Skal (pascal.massimino@gmail.com)

#include <assert.h>
#include <math.h>
#include <stdlib.h>  // for abs()

#include "src/dsp/quant.h"
#include "src/enc/vp8i_enc.h"
#include "src/enc/cost_enc.h"

#define DO_TRELLIS_I4 1
#define DO_TRELLIS_I16 1  // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV 0   // disable trellis for UV. Risky. Not worth it.
#define USE_TDISTO 1

#define MID_ALPHA 64   // neutral value for susceptibility
#define MIN_ALPHA 30   // lowest usable value for susceptibility
#define MAX_ALPHA 100  // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9  // Scaling constant between the sns value and the QP
                       // power-law modulation. Must be strictly less than 1.

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 0   // I16 mode (special case)
#define FLATNESS_LIMIT_I4 3    // I4 mode
#define FLATNESS_LIMIT_UV 2    // UV mode
#define FLATNESS_PENALTY 140   // roughly ~1bit per block

#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
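// MULT_8B() is a rounded fixed-point multiply by an 8.8 factor: it computes
// (a * b) / 256 with rounding. E.g. MULT_8B(64, 128) == (8192 + 128) >> 8
// == 32, i.e. 64 scaled by 0.5.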

#define RD_DISTO_MULT 256  // distortion multiplier (equivalent of lambda)

// #define DEBUG_BLOCK

//------------------------------------------------------------------------------

#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif  // DEBUG_BLOCK

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
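
// kZigzag[] maps the coding order 'n' to the raster position inside the 4x4
// block: e.g. kZigzag[2] == 4, i.e. the third coefficient visited lives at
// raster index 4 (column 0, row 1). Low frequencies are visited first.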

static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
   0, 30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
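
// For example, with q_[i] == 60 a coefficient with kFreqSharpening[] == 30
// gets a bias of (30 * 60) >> 11 == 0, while a high-frequency one with
// kFreqSharpening[] == 90 gets (90 * 60) >> 11 == 2 added to 'sharpen_'
// by ExpandMatrix() below. The effect is deliberately mild.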

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i, sum;
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq_[i] = (1 << QFIX) / m->q_[i];
    m->bias_[i] = BIAS(bias);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
  }
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
    m->iq_[i] = m->iq_[1];
    m->bias_[i] = m->bias_[1];
    m->zthresh_[i] = m->zthresh_[1];
  }
  for (sum = 0, i = 0; i < 16; ++i) {
    if (type == 0) {  // we only use sharpening for AC luma coeffs
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    } else {
      m->sharpen_[i] = 0;
    }
    sum += m->q_[i];
  }
  return (sum + 8) >> 4;
}
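
// Numerical sketch, assuming QFIX == 17 and
// QUANTDIV(n, iQ, B) == ((n) * (iQ) + (B)) >> QFIX as defined in vp8i_enc.h:
// with q_[0] == 4 and a DC bias of 96, iq_[0] == (1 << 17) / 4 == 32768,
// bias_[0] == BIAS(96) == 96 << 9 == 49152 and zthresh_[0] ==
// (131071 - 49152) / 32768 == 2. Indeed, a coefficient of 2 quantizes to
// (2 * 32768 + 49152) >> 17 == 0 while 3 quantizes to 1, matching the
// zthresh_ definition above.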

static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }

static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
      (enc->method_ >= 4) ? enc->config_->sns_strength
                          : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q_i4, q_i16, q_uv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q_i4  = ExpandMatrix(&m->y1_, 0);
    q_i16 = ExpandMatrix(&m->y2_, 1);
    q_uv  = ExpandMatrix(&m->uv_, 2);

    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16_         = (3 * q_i16 * q_i16);
    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
    m->tlambda_            = (tlambda_scale * q_i4) >> 5;

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4_);
    CheckLambdaValue(&m->lambda_i16_);
    CheckLambdaValue(&m->lambda_uv_);
    CheckLambdaValue(&m->lambda_mode_);
    CheckLambdaValue(&m->lambda_trellis_i4_);
    CheckLambdaValue(&m->lambda_trellis_i16_);
    CheckLambdaValue(&m->lambda_trellis_uv_);
    CheckLambdaValue(&m->tlambda_);

    m->min_disto_ = 20 * m->y1_.q_[0];  // quantization-aware min disto
    m->max_edge_  = 0;

    m->i4_penalty_ = 1000 * q_i4 * q_i4;
  }
}
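
// For instance (illustrative numbers only): with q_i4 == 40,
// lambda_i4_ == (3 * 40 * 40) >> 7 == 37 and, at sns_strength == 50 with
// method_ >= 4, tlambda_ == (50 * 40) >> 5 == 62. The lambdas thus grow
// roughly with the square of the quantizer, which keeps the rate and
// distortion terms of the RD score commensurate across quality levels.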

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 2

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}
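
// E.g. with filter_strength == 50 (level0 == 250), a segment whose
// base_strength came out as 20 (a hypothetical value; the real one comes
// from VP8FilterStrengthFromDelta()) and whose beta_ is 0 would get
// f == 20 * 250 / 256 == 19, which is kept since it exceeds
// FSTRENGTH_CUTOFF.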

//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map accordingly using a linear piece-wise function.
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}
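
// E.g. quality 75 maps to c == 0.5 (both linear pieces meet there), so the
// returned compressibility is pow(0.5, 1/3.) ~= 0.794; quality 100 maps to
// linear_c == 1.0 and hence to 1.0 (no compression penalty).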

static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with the same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' from exp_min to exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}
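
// E.g. for a mid-complexity source with alpha == 0.5, the interpolated
// exponent is 0.9 + (0.4 - 0.9) / (0.85 - 0.30) * (0.5 - 0.30) ~= 0.72,
// so a quality-75 'c' of 0.5 yields pow(0.5, 0.72) ~= 0.61.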

static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
}

static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr_.num_segments_
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to account for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are much more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);  // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // finally, initialize segments' filtering

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);  // finalize quantization matrices
}
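
// Worked example (illustrative, with emulate_jpeg_size off): quality == 75
// and sns_strength == 50 give amp == 0.9 * 50 / 100. / 128. ~= 0.0035 and
// c_base ~= 0.794 (see QualityToCompression() above). A segment with
// alpha_ == 30 then gets expn ~= 0.89, c ~= 0.81 and
// quant_ == (int)(127. * (1. - 0.81)) == 23, i.e. denser (higher-alpha)
// segments receive coarser quantizers.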

//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+----+
// |YYYY|UUVV| 0
// |YYYY|UUVV| 4
// |YYYY|....| 8
// |YYYY|....| 12
// +----+----+

const uint16_t VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,  4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,   // U
  8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS    // V
};

//------------------------------------------------------------------------------
// Distortion measurement

static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D = 0;
  rd->SD = 0;
  rd->R = 0;
  rd->H = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst,
                      const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->D = src->D;
  dst->SD = src->SD;
  dst->R = src->R;
  dst->H = src->H;
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst,
                     const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->D += src->D;
  dst->SD += src->SD;
  dst->R += src->R;
  dst->H += src->H;
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
  dst->score += src->score;
}

//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// -- GODOT start --
// Prevents Visual Studio debugger from using this Node struct in place of the
// Godot Node class.
#define Node Node_libwebp_quant
// -- GODOT end --

// Trellis node
typedef struct {
  int8_t prev;    // best previous node
  int8_t sign;    // sign of coeff_i
  int16_t level;  // level
} Node;

// Score state
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA].
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}
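
// Both scores combine rate and distortion on the same scale: e.g. with
// lambda == 100, spending 50 extra bit-cost units must save at least
// 50 * 100 / RD_DISTO_MULT ~= 20 distortion units to pay for itself.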

// Coefficient type.
enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };

static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* WEBP_RESTRICT const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);     // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      const int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score;
      int best_prev;
      score_t cost, score;

      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        ss_cur[m].score = MAX_COST;
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract from max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      // base_score is common to all predecessors, so it is only added once,
      // to the final value after the loop.
      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
      best_cur_score =
          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
      best_prev = -MIN_DELTA;
      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      best_cur_score += base_score;
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0 && best_cur_score < best_score) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;           // best eob position
          best_path[1] = m;           // best node index
          best_path[2] = best_prev;   // best predecessor
        }
      }
    }
  }

  // Fresh start
  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
  if (coeff_type == TYPE_I16_AC) {
    memset(in + 1, 0, 15 * sizeof(*in));
    memset(out + 1, 0, 15 * sizeof(*out));
  } else {
    memset(in, 0, 16 * sizeof(*in));
    memset(out, 0, 16 * sizeof(*out));
  }
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];
      best_node = node->prev;
    }
    return (nz != 0);
  }
}
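
// Note on complexity: the trellis explores at most 16 positions x NUM_NODES
// candidate levels, with NUM_NODES predecessors each, i.e. on the order of
// 16 * 2 * 2 == 64 level evaluations per block with the current
// MIN_DELTA/MAX_DELTA settings -- cheap compared to an exhaustive search.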

#undef NODE

//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.

static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                              VP8ModeScore* WEBP_RESTRICT const rd,
                              uint8_t* WEBP_RESTRICT const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero = TrellisQuantizeBlock(
            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
            dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}

static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                             int16_t levels[16],
                             const uint8_t* WEBP_RESTRICT const src,
                             uint8_t* WEBP_RESTRICT const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

//------------------------------------------------------------------------------
// DC-error diffusion

// Diffusion weights. We under-correct a bit (15/16th of the error is actually
// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
#define C1 7    // fraction of error sent to the 4x4 block below
#define C2 8    // fraction of error sent to the 4x4 block on the right
#define DSHIFT 4
#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
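
// With C1 == 7, C2 == 8 and DSHIFT == 4, a DC error passes (7 + 8) / 16 ==
// 15/16th of its magnitude to the neighboring blocks: e.g. an error of 32
// contributes 7 * 32 >> 4 == 14 to the block below and 8 * 32 >> 4 == 16 to
// the block on the right (before the DSCALE storage descaling).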

// Quantize as usual, but also compute and return the quantization error.
// Error is already divided by DSHIFT.
static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
                          const VP8Matrix* WEBP_RESTRICT const mtx) {
  int V = *v;
  const int sign = (V < 0);
  if (sign) V = -V;
  if (V > (int)mtx->zthresh_[0]) {
    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
    const int err = (V - qV);
    *v = sign ? -qV : qV;
    return (sign ? -err : err) >> DSCALE;
  }
  *v = 0;
  return (sign ? -V : V) >> DSCALE;
}

static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
                            const VP8Matrix* WEBP_RESTRICT const mtx,
                            int16_t tmp[][16],
                            VP8ModeScore* WEBP_RESTRICT const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->  err0 err1
  // left[1] | tmp[2]   tmp[3]       err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    const int8_t* const top = it->top_derr_[it->x_][ch];
    const int8_t* const left = it->left_derr_[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];
    int err0, err1, err2, err3;
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE >= 1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}

static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
                                 const VP8ModeScore* WEBP_RESTRICT const rd) {
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    int8_t* const top = it->top_derr_[it->x_][ch];
    int8_t* const left = it->left_derr_[ch];
    left[0] = rd->derr[ch][0];            // restore err1
    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
    top[0] = rd->derr[ch][1];             //     ... err2
    top[1] = rd->derr[ch][2] - left[1];   //     ... 1/4th of err3.
  }
}
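
// Note that the err3 split is lossless as far as the stored sum goes:
// left[1] + top[1] == (3 * err3 >> 2) + (err3 - (3 * err3 >> 2)) == err3,
// so the full error is redistributed between the two neighbors.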

#undef C1
#undef C2
#undef DSHIFT
#undef DSCALE

//------------------------------------------------------------------------------

static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
                         VP8ModeScore* WEBP_RESTRICT const rd,
                         uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero = TrellisQuantizeBlock(
              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
              dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.

static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine the average
  // delta between each sub-4x4 block.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[2]);
  const int v2 = abs(DCs[4]);
  int max_v = (v1 > v0) ? v1 : v0;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}

static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}

static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                            VP8ModeScore* WEBP_RESTRICT rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;
  int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (is_flat) {
      // refine the first impression (which was in pixel space)
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
      if (is_flat) {
        // Block is very flat. We put emphasis on the distortion being very low!
        rd_cur->D *= 2;
        rd_cur->SD *= 2;
      }
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // If we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record the max delta so we can later adjust the minimal
  // filtering strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}

//------------------------------------------------------------------------------

// return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}

static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                          VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add a flatness penalty, to avoid a flat area being mispredicted
      // by a complex mode.
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}

//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
                       VP8ModeScore* WEBP_RESTRICT const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    rd_uv.H = VP8FixedCostsUV[mode];
    rd_uv.R = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr_ != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.

static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
                           VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* WEBP_RESTRICT const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  // Some empirical constants, of roughly the right order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty_;
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }

      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    if (it->x_ == 0 || it->y_ == 0) {
      // avoid starting a checkerboard resonance from the border. See bug #432.
      if (IsFlatSource16(src)) {
        best_mode = (it->x_ == 0) ? 0 : 2;
        try_both_modes = 0;  // stick to i16
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}

//------------------------------------------------------------------------------
// Entry point

int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
                VP8ModeScore* WEBP_RESTRICT const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (a tad
    // slower). For method <= 1, we don't re-examine the decision but just go
    // ahead with quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}