quant_enc.c source code [engine/third_party/libwebp/src/enc/quant_enc.c]

1	// Copyright 2011 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// Quantization
11	//
12	// Author: Skal (pascal.massimino@gmail.com)
13
14	#include <assert.h>
15	#include <math.h>
16	#include <stdlib.h> // for abs()
17
18	#include "./vp8i_enc.h"
19	#include "./cost_enc.h"
20
21	#define DO_TRELLIS_I4 1
22	#define DO_TRELLIS_I16 1 // not a huge gain, but ok at low bitrate.
23	#define DO_TRELLIS_UV 0 // disable trellis for UV. Risky. Not worth.
24	#define USE_TDISTO 1
25
26	#define MID_ALPHA 64 // neutral value for susceptibility
27	#define MIN_ALPHA 30 // lowest usable value for susceptibility
28	#define MAX_ALPHA 100 // higher meaningful value for susceptibility
29
30	#define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP
31	// power-law modulation. Must be strictly less than 1.
32
33	// number of non-zero coeffs below which we consider the block very flat
34	// (and apply a penalty to complex predictions)
35	#define FLATNESS_LIMIT_I16 10 // I16 mode
36	#define FLATNESS_LIMIT_I4 3 // I4 mode
37	#define FLATNESS_LIMIT_UV 2 // UV mode
38	#define FLATNESS_PENALTY 140 // roughly ~1bit per block
39
40	#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
41
42	#define RD_DISTO_MULT 256 // distortion multiplier (equivalent of lambda)
43
44	// #define DEBUG_BLOCK
45
46	//------------------------------------------------------------------------------
47
48	#if defined(DEBUG_BLOCK)
49
50	#include <stdio.h>
51	#include <stdlib.h>
52
53	static void PrintBlockInfo(const VP8EncIterator* const it,
54	const VP8ModeScore* const rd) {
55	int i, j;
56	const int is_i16 = (it->mb_->type_ == `1`);
57	const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
58	const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
59	const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
60	const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
61	printf("SOURCE / OUTPUT / ABS DELTA\n");
62	for (j = `0`; j < `16`; ++j) {
63	for (i = `0`; i < `16`; ++i) printf("%3d ", y_in[i + j * BPS]);
64	printf(" ");
65	for (i = `0`; i < `16`; ++i) printf("%3d ", y_out[i + j * BPS]);
66	printf(" ");
67	for (i = `0`; i < `16`; ++i) {
68	printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
69	}
70	printf("\n");
71	}
72	printf("\n"); // newline before the U/V block
73	for (j = `0`; j < `8`; ++j) {
74	for (i = `0`; i < `8`; ++i) printf("%3d ", uv_in[i + j * BPS]);
75	printf(" ");
76	for (i = `8`; i < `16`; ++i) printf("%3d ", uv_in[i + j * BPS]);
77	printf(" ");
78	for (i = `0`; i < `8`; ++i) printf("%3d ", uv_out[i + j * BPS]);
79	printf(" ");
80	for (i = `8`; i < `16`; ++i) printf("%3d ", uv_out[i + j * BPS]);
81	printf(" ");
82	for (i = `0`; i < `8`; ++i) {
83	printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
84	}
85	printf(" ");
86	for (i = `8`; i < `16`; ++i) {
87	printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
88	}
89	printf("\n");
90	}
91	printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
92	(int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
93	(int)rd->score);
94	if (is_i16) {
95	printf("Mode: %d\n", rd->mode_i16);
96	printf("y_dc_levels:");
97	for (i = `0`; i < `16`; ++i) printf("%3d ", rd->y_dc_levels[i]);
98	printf("\n");
99	} else {
100	printf("Modes[16]: ");
101	for (i = `0`; i < `16`; ++i) printf("%d ", rd->modes_i4[i]);
102	printf("\n");
103	}
104	printf("y_ac_levels:\n");
105	for (j = `0`; j < `16`; ++j) {
106	for (i = is_i16 ? `1` : `0`; i < `16`; ++i) {
107	printf("%4d ", rd->y_ac_levels[j][i]);
108	}
109	printf("\n");
110	}
111	printf("\n");
112	printf("uv_levels (mode=%d):\n", rd->mode_uv);
113	for (j = `0`; j < `8`; ++j) {
114	for (i = `0`; i < `16`; ++i) {
115	printf("%4d ", rd->uv_levels[j][i]);
116	}
117	printf("\n");
118	}
119	}
120
121	#endif // DEBUG_BLOCK
122
123	//------------------------------------------------------------------------------
124
125	static WEBP_INLINE int clip(int v, int m, int M) {
126	return v < m ? m : v > M ? M : v;
127	}
128
129	static const uint8_t kZigzag[`16`] = {
130	`0`, `1`, `4`, `8`, `5`, `2`, `3`, `6`, `9`, `12`, `13`, `10`, `7`, `11`, `14`, `15`
131	};
132
133	static const uint8_t kDcTable[`128`] = {
134	`4`, `5`, `6`, `7`, `8`, `9`, `10`, `10`,
135	`11`, `12`, `13`, `14`, `15`, `16`, `17`, `17`,
136	`18`, `19`, `20`, `20`, `21`, `21`, `22`, `22`,
137	`23`, `23`, `24`, `25`, `25`, `26`, `27`, `28`,
138	`29`, `30`, `31`, `32`, `33`, `34`, `35`, `36`,
139	`37`, `37`, `38`, `39`, `40`, `41`, `42`, `43`,
140	`44`, `45`, `46`, `46`, `47`, `48`, `49`, `50`,
141	`51`, `52`, `53`, `54`, `55`, `56`, `57`, `58`,
142	`59`, `60`, `61`, `62`, `63`, `64`, `65`, `66`,
143	`67`, `68`, `69`, `70`, `71`, `72`, `73`, `74`,
144	`75`, `76`, `76`, `77`, `78`, `79`, `80`, `81`,
145	`82`, `83`, `84`, `85`, `86`, `87`, `88`, `89`,
146	`91`, `93`, `95`, `96`, `98`, `100`, `101`, `102`,
147	`104`, `106`, `108`, `110`, `112`, `114`, `116`, `118`,
148	`122`, `124`, `126`, `128`, `130`, `132`, `134`, `136`,
149	`138`, `140`, `143`, `145`, `148`, `151`, `154`, `157`
150	};
151
152	static const uint16_t kAcTable[`128`] = {
153	`4`, `5`, `6`, `7`, `8`, `9`, `10`, `11`,
154	`12`, `13`, `14`, `15`, `16`, `17`, `18`, `19`,
155	`20`, `21`, `22`, `23`, `24`, `25`, `26`, `27`,
156	`28`, `29`, `30`, `31`, `32`, `33`, `34`, `35`,
157	`36`, `37`, `38`, `39`, `40`, `41`, `42`, `43`,
158	`44`, `45`, `46`, `47`, `48`, `49`, `50`, `51`,
159	`52`, `53`, `54`, `55`, `56`, `57`, `58`, `60`,
160	`62`, `64`, `66`, `68`, `70`, `72`, `74`, `76`,
161	`78`, `80`, `82`, `84`, `86`, `88`, `90`, `92`,
162	`94`, `96`, `98`, `100`, `102`, `104`, `106`, `108`,
163	`110`, `112`, `114`, `116`, `119`, `122`, `125`, `128`,
164	`131`, `134`, `137`, `140`, `143`, `146`, `149`, `152`,
165	`155`, `158`, `161`, `164`, `167`, `170`, `173`, `177`,
166	`181`, `185`, `189`, `193`, `197`, `201`, `205`, `209`,
167	`213`, `217`, `221`, `225`, `229`, `234`, `239`, `245`,
168	`249`, `254`, `259`, `264`, `269`, `274`, `279`, `284`
169	};
170
171	static const uint16_t kAcTable2[`128`] = {
172	`8`, `8`, `9`, `10`, `12`, `13`, `15`, `17`,
173	`18`, `20`, `21`, `23`, `24`, `26`, `27`, `29`,
174	`31`, `32`, `34`, `35`, `37`, `38`, `40`, `41`,
175	`43`, `44`, `46`, `48`, `49`, `51`, `52`, `54`,
176	`55`, `57`, `58`, `60`, `62`, `63`, `65`, `66`,
177	`68`, `69`, `71`, `72`, `74`, `75`, `77`, `79`,
178	`80`, `82`, `83`, `85`, `86`, `88`, `89`, `93`,
179	`96`, `99`, `102`, `105`, `108`, `111`, `114`, `117`,
180	`120`, `124`, `127`, `130`, `133`, `136`, `139`, `142`,
181	`145`, `148`, `151`, `155`, `158`, `161`, `164`, `167`,
182	`170`, `173`, `176`, `179`, `184`, `189`, `193`, `198`,
183	`203`, `207`, `212`, `217`, `221`, `226`, `230`, `235`,
184	`240`, `244`, `249`, `254`, `258`, `263`, `268`, `274`,
185	`280`, `286`, `292`, `299`, `305`, `311`, `317`, `323`,
186	`330`, `336`, `342`, `348`, `354`, `362`, `370`, `379`,
187	`385`, `393`, `401`, `409`, `416`, `424`, `432`, `440`
188	};
189
190	static const uint8_t kBiasMatrices[`3`][`2`] = { // [luma-ac,luma-dc,chroma][dc,ac]
191	{ `96`, `110` }, { `96`, `108` }, { `110`, `115` }
192	};
193
194	// Sharpening by (slightly) raising the hi-frequency coeffs.
195	// Hack-ish but helpful for mid-bitrate range. Use with care.
196	#define SHARPEN_BITS 11 // number of descaling bits for sharpening bias
197	static const uint8_t kFreqSharpening[`16`] = {
198	`0`, `30`, `60`, `90`,
199	`30`, `60`, `90`, `90`,
200	`60`, `90`, `90`, `90`,
201	`90`, `90`, `90`, `90`
202	};
203
204	//------------------------------------------------------------------------------
205	// Initialize quantization parameters in VP8Matrix
206
207	// Returns the average quantizer
208	static int ExpandMatrix(VP8Matrix* const m, int type) {
209	int i, sum;
210	for (i = `0`; i < `2`; ++i) {
211	const int is_ac_coeff = (i > `0`);
212	const int bias = kBiasMatrices[type][is_ac_coeff];
213	m->iq_[i] = (`1` << QFIX) / m->q_[i];
214	m->bias_[i] = BIAS(bias);
215	// zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
216	// zero if coeff <= zthresh*
217	// non-zero if coeff > zthresh*
218	m->zthresh_[i] = ((`1` << QFIX) - `1` - m->bias_[i]) / m->iq_[i];
219	}
220	for (i = `2`; i < `16`; ++i) {
221	m->q_[i] = m->q_[`1`];
222	m->iq_[i] = m->iq_[`1`];
223	m->bias_[i] = m->bias_[`1`];
224	m->zthresh_[i] = m->zthresh_[`1`];
225	}
226	for (sum = `0`, i = `0`; i < `16`; ++i) {
227	if (type == `0`) { // we only use sharpening for AC luma coeffs
228	m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
229	} else {
230	m->sharpen_[i] = `0`;
231	}
232	sum += m->q_[i];
233	}
234	return (sum + `8`) >> `4`;
235	}
236
237	static void CheckLambdaValue(int* const v) { if (v < `1`) v = `1`; }
238
239	static void SetupMatrices(VP8Encoder* enc) {
240	int i;
241	const int tlambda_scale =
242	(enc->method_ >= `4`) ? enc->config_->sns_strength
243	: `0`;
244	const int num_segments = enc->segment_hdr_.num_segments_;
245	for (i = `0`; i < num_segments; ++i) {
246	VP8SegmentInfo* const m = &enc->dqm_[i];
247	const int q = m->quant_;
248	int q_i4, q_i16, q_uv;
249	m->y1_.q_[`0`] = kDcTable[clip(q + enc->dq_y1_dc_, `0`, `127`)];
250	m->y1_.q_[`1`] = kAcTable[clip(q, `0`, `127`)];
251
252	m->y2_.q_[`0`] = kDcTable[ clip(q + enc->dq_y2_dc_, `0`, `127`)] * `2`;
253	m->y2_.q_[`1`] = kAcTable2[clip(q + enc->dq_y2_ac_, `0`, `127`)];
254
255	m->uv_.q_[`0`] = kDcTable[clip(q + enc->dq_uv_dc_, `0`, `117`)];
256	m->uv_.q_[`1`] = kAcTable[clip(q + enc->dq_uv_ac_, `0`, `127`)];
257
258	q_i4 = ExpandMatrix(&m->y1_, `0`);
259	q_i16 = ExpandMatrix(&m->y2_, `1`);
260	q_uv = ExpandMatrix(&m->uv_, `2`);
261
262	m->lambda_i4_ = (`3` * q_i4 * q_i4) >> `7`;
263	m->lambda_i16_ = (`3` * q_i16 * q_i16);
264	m->lambda_uv_ = (`3` * q_uv * q_uv) >> `6`;
265	m->lambda_mode_ = (`1` * q_i4 * q_i4) >> `7`;
266	m->lambda_trellis_i4_ = (`7` * q_i4 * q_i4) >> `3`;
267	m->lambda_trellis_i16_ = (q_i16 * q_i16) >> `2`;
268	m->lambda_trellis_uv_ = (q_uv * q_uv) << `1`;
269	m->tlambda_ = (tlambda_scale * q_i4) >> `5`;
270
271	// none of these constants should be < 1
272	CheckLambdaValue(&m->lambda_i4_);
273	CheckLambdaValue(&m->lambda_i16_);
274	CheckLambdaValue(&m->lambda_uv_);
275	CheckLambdaValue(&m->lambda_mode_);
276	CheckLambdaValue(&m->lambda_trellis_i4_);
277	CheckLambdaValue(&m->lambda_trellis_i16_);
278	CheckLambdaValue(&m->lambda_trellis_uv_);
279	CheckLambdaValue(&m->tlambda_);
280
281	m->min_disto_ = `20` * m->y1_.q_[`0`]; // quantization-aware min disto
282	m->max_edge_ = `0`;
283
284	m->i4_penalty_ = `1000` * q_i4 * q_i4;
285	}
286	}
287
288	//------------------------------------------------------------------------------
289	// Initialize filtering parameters
290
291	// Very small filter-strength values have close to no visual effect. So we can
292	// save a little decoding-CPU by turning filtering off for these.
293	#define FSTRENGTH_CUTOFF 2
294
295	static void SetupFilterStrength(VP8Encoder* const enc) {
296	int i;
297	// level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
298	const int level0 = `5` * enc->config_->filter_strength;
299	for (i = `0`; i < NUM_MB_SEGMENTS; ++i) {
300	VP8SegmentInfo* const m = &enc->dqm_[i];
301	// We focus on the quantization of AC coeffs.
302	const int qstep = kAcTable[clip(m->quant_, `0`, `127`)] >> `2`;
303	const int base_strength =
304	VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
305	// Segments with lower complexity ('beta') will be less filtered.
306	const int f = base_strength * level0 / (`256` + m->beta_);
307	m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? `0` : (f > `63`) ? `63` : f;
308	}
309	// We record the initial strength (mainly for the case of 1-segment only).
310	enc->filter_hdr_.level_ = enc->dqm_[`0`].fstrength_;
311	enc->filter_hdr_.simple_ = (enc->config_->filter_type == `0`);
312	enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
313	}
314
315	//------------------------------------------------------------------------------
316
317	// Note: if you change the values below, remember that the max range
318	// allowed by the syntax for DQ_UV is [-16,16].
319	#define MAX_DQ_UV (6)
320	#define MIN_DQ_UV (-4)
321
322	// We want to emulate jpeg-like behaviour where the expected "good" quality
323	// is around q=75. Internally, our "good" middle is around c=50. So we
324	// map accordingly using linear piece-wise function
325	static double QualityToCompression(double c) {
326	const double linear_c = (c < `0.75`) ? c * (`2.` / `3.`) : `2.` * c - `1.`;
327	// The file size roughly scales as pow(quantizer, 3.). Actually, the
328	// exponent is somewhere between 2.8 and 3.2, but we're mostly interested
329	// in the mid-quant range. So we scale the compressibility inversely to
330	// this power-law: quant ~= compression ^ 1/3. This law holds well for
331	// low quant. Finer modeling for high-quant would make use of kAcTable[]
332	// more explicitly.
333	const double v = pow(linear_c, `1` / `3.`);
334	return v;
335	}
336
337	static double QualityToJPEGCompression(double c, double alpha) {
338	// We map the complexity 'alpha' and quality setting 'c' to a compression
339	// exponent empirically matched to the compression curve of libjpeg6b.
340	// On average, the WebP output size will be roughly similar to that of a
341	// JPEG file compressed with same quality factor.
342	const double amin = `0.30`;
343	const double amax = `0.85`;
344	const double exp_min = `0.4`;
345	const double exp_max = `0.9`;
346	const double slope = (exp_min - exp_max) / (amax - amin);
347	// Linearly interpolate 'expn' from exp_min to exp_max
348	// in the [amin, amax] range.
349	const double expn = (alpha > amax) ? exp_min
350	: (alpha < amin) ? exp_max
351	: exp_max + slope * (alpha - amin);
352	const double v = pow(c, expn);
353	return v;
354	}
355
356	static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
357	const VP8SegmentInfo* const S2) {
358	return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
359	}
360
361	static void SimplifySegments(VP8Encoder* const enc) {
362	int map[NUM_MB_SEGMENTS] = { `0`, `1`, `2`, `3` };
363	// 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
364	// explicit check is needed to avoid a spurious warning about 'i' exceeding
365	// array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
366	const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
367	? enc->segment_hdr_.num_segments_
368	: NUM_MB_SEGMENTS;
369	int num_final_segments = `1`;
370	int s1, s2;
371	for (s1 = `1`; s1 < num_segments; ++s1) { // find similar segments
372	const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
373	int found = `0`;
374	// check if we already have similar segment
375	for (s2 = `0`; s2 < num_final_segments; ++s2) {
376	const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
377	if (SegmentsAreEquivalent(S1, S2)) {
378	found = `1`;
379	break;
380	}
381	}
382	map[s1] = s2;
383	if (!found) {
384	if (num_final_segments != s1) {
385	enc->dqm_[num_final_segments] = enc->dqm_[s1];
386	}
387	++num_final_segments;
388	}
389	}
390	if (num_final_segments < num_segments) { // Remap
391	int i = enc->mb_w_ * enc->mb_h_;
392	while (i-- > `0`) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
393	enc->segment_hdr_.num_segments_ = num_final_segments;
394	// Replicate the trailing segment infos (it's mostly cosmetics)
395	for (i = num_final_segments; i < num_segments; ++i) {
396	enc->dqm_[i] = enc->dqm_[num_final_segments - `1`];
397	}
398	}
399	}
400
401	void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
402	int i;
403	int dq_uv_ac, dq_uv_dc;
404	const int num_segments = enc->segment_hdr_.num_segments_;
405	const double amp = SNS_TO_DQ * enc->config_->sns_strength / `100.` / `128.`;
406	const double Q = quality / `100.`;
407	const double c_base = enc->config_->emulate_jpeg_size ?
408	QualityToJPEGCompression(Q, enc->alpha_ / `255.`) :
409	QualityToCompression(Q);
410	for (i = `0`; i < num_segments; ++i) {
411	// We modulate the base coefficient to accommodate for the quantization
412	// susceptibility and allow denser segments to be quantized more.
413	const double expn = `1.` - amp * enc->dqm_[i].alpha_;
414	const double c = pow(c_base, expn);
415	const int q = (int)(`127.` * (`1.` - c));
416	assert(expn > `0.`);
417	enc->dqm_[i].quant_ = clip(q, `0`, `127`);
418	}
419
420	// purely indicative in the bitstream (except for the 1-segment case)
421	enc->base_quant_ = enc->dqm_[`0`].quant_;
422
423	// fill-in values for the unused segments (required by the syntax)
424	for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
425	enc->dqm_[i].quant_ = enc->base_quant_;
426	}
427
428	// uv_alpha_ is normally spread around ~60. The useful range is
429	// typically ~30 (quite bad) to ~100 (ok to decimate UV more).
430	// We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
431	dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
432	/ (MAX_ALPHA - MIN_ALPHA);
433	// we rescale by the user-defined strength of adaptation
434	dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / `100`;
435	// and make it safe.
436	dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
437	// We also boost the dc-uv-quant a little, based on sns-strength, since
438	// U/V channels are quite more reactive to high quants (flat DC-blocks
439	// tend to appear, and are unpleasant).
440	dq_uv_dc = -`4` * enc->config_->sns_strength / `100`;
441	dq_uv_dc = clip(dq_uv_dc, -`15`, `15`); // 4bit-signed max allowed
442
443	enc->dq_y1_dc_ = `0`; // TODO(skal): dq-lum
444	enc->dq_y2_dc_ = `0`;
445	enc->dq_y2_ac_ = `0`;
446	enc->dq_uv_dc_ = dq_uv_dc;
447	enc->dq_uv_ac_ = dq_uv_ac;
448
449	SetupFilterStrength(enc); // initialize segments' filtering, eventually
450
451	if (num_segments > `1`) SimplifySegments(enc);
452
453	SetupMatrices(enc); // finalize quantization matrices
454	}
455
456	//------------------------------------------------------------------------------
457	// Form the predictions in cache
458
459	// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
460	const int VP8I16ModeOffsets[`4`] = { I16DC16, I16TM16, I16VE16, I16HE16 };
461	const int VP8UVModeOffsets[`4`] = { C8DC8, C8TM8, C8VE8, C8HE8 };
462
463	// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
464	const int VP8I4ModeOffsets[NUM_BMODES] = {
465	I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
466	};
467
468	void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
469	const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
470	const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
471	VP8EncPredLuma16(it->yuv_p_, left, top);
472	}
473
474	void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
475	const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
476	const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
477	VP8EncPredChroma8(it->yuv_p_, left, top);
478	}
479
480	void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
481	VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
482	}
483
484	//------------------------------------------------------------------------------
485	// Quantize
486
487	// Layout:
488	// +----+----+
489	// \|YYYY\|UUVV\| 0
490	// \|YYYY\|UUVV\| 4
491	// \|YYYY\|....\| 8
492	// \|YYYY\|....\| 12
493	// +----+----+
494
495	const int VP8Scan[`16`] = { // Luma
496	`0` + `0` * BPS, `4` + `0` * BPS, `8` + `0` * BPS, `12` + `0` * BPS,
497	`0` + `4` * BPS, `4` + `4` * BPS, `8` + `4` * BPS, `12` + `4` * BPS,
498	`0` + `8` * BPS, `4` + `8` * BPS, `8` + `8` * BPS, `12` + `8` * BPS,
499	`0` + `12` * BPS, `4` + `12` * BPS, `8` + `12` * BPS, `12` + `12` * BPS,
500	};
501
502	static const int VP8ScanUV[`4` + `4`] = {
503	`0` + `0` * BPS, `4` + `0` * BPS, `0` + `4` * BPS, `4` + `4` * BPS, // U
504	`8` + `0` * BPS, `12` + `0` * BPS, `8` + `4` * BPS, `12` + `4` * BPS // V
505	};
506
507	//------------------------------------------------------------------------------
508	// Distortion measurement
509
510	static const uint16_t kWeightY[`16`] = {
511	`38`, `32`, `20`, `9`, `32`, `28`, `17`, `7`, `20`, `17`, `10`, `4`, `9`, `7`, `4`, `2`
512	};
513
514	static const uint16_t kWeightTrellis[`16`] = {
515	#if USE_TDISTO == 0
516	`16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`
517	#else
518	`30`, `27`, `19`, `11`,
519	`27`, `24`, `17`, `10`,
520	`19`, `17`, `12`, `8`,
521	`11`, `10`, `8`, `6`
522	#endif
523	};
524
525	// Init/Copy the common fields in score.
526	static void InitScore(VP8ModeScore* const rd) {
527	rd->D = `0`;
528	rd->SD = `0`;
529	rd->R = `0`;
530	rd->H = `0`;
531	rd->nz = `0`;
532	rd->score = MAX_COST;
533	}
534
535	static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
536	dst->D = src->D;
537	dst->SD = src->SD;
538	dst->R = src->R;
539	dst->H = src->H;
540	dst->nz = src->nz; // note that nz is not accumulated, but just copied.
541	dst->score = src->score;
542	}
543
544	static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
545	dst->D += src->D;
546	dst->SD += src->SD;
547	dst->R += src->R;
548	dst->H += src->H;
549	dst->nz \|= src->nz; // here, new nz bits are accumulated.
550	dst->score += src->score;
551	}
552
553	//------------------------------------------------------------------------------
554	// Performs trellis-optimized quantization.
555
556	// Trellis node
557	typedef struct {
558	int8_t prev; // best previous node
559	int8_t sign; // sign of coeff_i
560	int16_t level; // level
561	} Node;
562
563	// Score state
564	typedef struct {
565	score_t score; // partial RD score
566	const uint16_t* costs; // shortcut to cost tables
567	} ScoreState;
568
569	// If a coefficient was quantized to a value Q (using a neutral bias),
570	// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
571	// We don't test negative values though.
572	#define MIN_DELTA 0 // how much lower level to try
573	#define MAX_DELTA 1 // how much higher
574	#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
575	#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
576	#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
577
578	static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
579	rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
580	}
581
582	static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
583	score_t distortion) {
584	return rate * lambda + RD_DISTO_MULT * distortion;
585	}
586
587	static int TrellisQuantizeBlock(const VP8Encoder* const enc,
588	int16_t in[`16`], int16_t out[`16`],
589	int ctx0, int coeff_type,
590	const VP8Matrix* const mtx,
591	int lambda) {
592	const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
593	CostArrayPtr const costs =
594	(CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
595	const int first = (coeff_type == `0`) ? `1` : `0`;
596	Node nodes[`16`][NUM_NODES];
597	ScoreState score_states[`2`][NUM_NODES];
598	ScoreState* ss_cur = &SCORE_STATE(`0`, MIN_DELTA);
599	ScoreState* ss_prev = &SCORE_STATE(`1`, MIN_DELTA);
600	int best_path[`3`] = {-`1`, -`1`, -`1`}; // store best-last/best-level/best-previous
601	score_t best_score;
602	int n, m, p, last;
603
604	{
605	score_t cost;
606	const int thresh = mtx->q_[`1`] * mtx->q_[`1`] / `4`;
607	const int last_proba = probas[VP8EncBands[first]][ctx0][`0`];
608
609	// compute the position of the last interesting coefficient
610	last = first - `1`;
611	for (n = `15`; n >= first; --n) {
612	const int j = kZigzag[n];
613	const int err = in[j] * in[j];
614	if (err > thresh) {
615	last = n;
616	break;
617	}
618	}
619	// we don't need to go inspect up to n = 16 coeffs. We can just go up
620	// to last + 1 (inclusive) without losing much.
621	if (last < `15`) ++last;
622
623	// compute 'skip' score. This is the max score one can do.
624	cost = VP8BitCost(`0`, last_proba);
625	best_score = RDScoreTrellis(lambda, cost, `0`);
626
627	// initialize source node.
628	for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
629	const score_t rate = (ctx0 == `0`) ? VP8BitCost(`1`, last_proba) : `0`;
630	ss_cur[m].score = RDScoreTrellis(lambda, rate, `0`);
631	ss_cur[m].costs = costs[first][ctx0];
632	}
633	}
634
635	// traverse trellis.
636	for (n = first; n <= last; ++n) {
637	const int j = kZigzag[n];
638	const uint32_t Q = mtx->q_[j];
639	const uint32_t iQ = mtx->iq_[j];
640	const uint32_t B = BIAS(`0x00`); // neutral bias
641	// note: it's important to take sign of the _original_ coeff,
642	// so we don't have to consider level < 0 afterward.
643	const int sign = (in[j] < `0`);
644	const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
645	int level0 = QUANTDIV(coeff0, iQ, B);
646	int thresh_level = QUANTDIV(coeff0, iQ, BIAS(`0x80`));
647	if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
648	if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
649
650	{ // Swap current and previous score states
651	ScoreState* const tmp = ss_cur;
652	ss_cur = ss_prev;
653	ss_prev = tmp;
654	}
655
656	// test all alternate level values around level0.
657	for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
658	Node* const cur = &NODE(n, m);
659	int level = level0 + m;
660	const int ctx = (level > `2`) ? `2` : level;
661	const int band = VP8EncBands[n + `1`];
662	score_t base_score;
663	score_t best_cur_score = MAX_COST;
664	int best_prev = `0`; // default, in case
665
666	ss_cur[m].score = MAX_COST;
667	ss_cur[m].costs = costs[n + `1`][ctx];
668	if (level < `0` \|\| level > thresh_level) {
669	// Node is dead.
670	continue;
671	}
672
673	{
674	// Compute delta_error = how much coding this level will
675	// subtract to max_error as distortion.
676	// Here, distortion = sum of (\|coeff_i\| - level_i Q_i)^2*
677	const int new_error = coeff0 - level * Q;
678	const int delta_error =
679	kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
680	base_score = RDScoreTrellis(lambda, `0`, delta_error);
681	}
682
683	// Inspect all possible non-dead predecessors. Retain only the best one.
684	for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
685	// Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
686	// eliminated since their score can't be better than the current best.
687	const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
688	// Examine node assuming it's a non-terminal one.
689	const score_t score =
690	base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, `0`);
691	if (score < best_cur_score) {
692	best_cur_score = score;
693	best_prev = p;
694	}
695	}
696	// Store best finding in current node.
697	cur->sign = sign;
698	cur->level = level;
699	cur->prev = best_prev;
700	ss_cur[m].score = best_cur_score;
701
702	// Now, record best terminal node (and thus best entry in the graph).
703	if (level != `0`) {
704	const score_t last_pos_cost =
705	(n < `15`) ? VP8BitCost(`0`, probas[band][ctx][`0`]) : `0`;
706	const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, `0`);
707	const score_t score = best_cur_score + last_pos_score;
708	if (score < best_score) {
709	best_score = score;
710	best_path[`0`] = n; // best eob position
711	best_path[`1`] = m; // best node index
712	best_path[`2`] = best_prev; // best predecessor
713	}
714	}
715	}
716	}
717
718	// Fresh start
719	memset(in + first, `0`, (`16` - first) * sizeof(*in));
720	memset(out + first, `0`, (`16` - first) * sizeof(*out));
721	if (best_path[`0`] == -`1`) {
722	return `0`; // skip!
723	}
724
725	{
726	// Unwind the best path.
727	// Note: best-prev on terminal node is not necessarily equal to the
728	// best_prev for non-terminal. So we patch best_path[2] in.
729	int nz = `0`;
730	int best_node = best_path[`1`];
731	n = best_path[`0`];
732	NODE(n, best_node).prev = best_path[`2`]; // force best-prev for terminal
733
734	for (; n >= first; --n) {
735	const Node* const node = &NODE(n, best_node);
736	const int j = kZigzag[n];
737	out[n] = node->sign ? -node->level : node->level;
738	nz \|= node->level;
739	in[j] = out[n] * mtx->q_[j];
740	best_node = node->prev;
741	}
742	return (nz != `0`);
743	}
744	}
745
746	#undef NODE
747
748	//------------------------------------------------------------------------------
749	// Performs: difference, transform, quantize, back-transform, add
750	// all at once. Output is the reconstructed block in yuv_out, and the*
751	// quantized levels in levels.*
752
753	static int ReconstructIntra16(VP8EncIterator* const it,
754	VP8ModeScore* const rd,
755	uint8_t* const yuv_out,
756	int mode) {
757	const VP8Encoder* const enc = it->enc_;
758	const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
759	const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
760	const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
761	int nz = `0`;
762	int n;
763	int16_t tmp[`16`][`16`], dc_tmp[`16`];
764
765	for (n = `0`; n < `16`; n += `2`) {
766	VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
767	}
768	VP8FTransformWHT(tmp[`0`], dc_tmp);
769	nz \|= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << `24`;
770
771	if (DO_TRELLIS_I16 && it->do_trellis_) {
772	int x, y;
773	VP8IteratorNzToBytes(it);
774	for (y = `0`, n = `0`; y < `4`; ++y) {
775	for (x = `0`; x < `4`; ++x, ++n) {
776	const int ctx = it->top_nz_[x] + it->left_nz_[y];
777	const int non_zero =
778	TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, `0`,
779	&dqm->y1_, dqm->lambda_trellis_i16_);
780	it->top_nz_[x] = it->left_nz_[y] = non_zero;
781	rd->y_ac_levels[n][`0`] = `0`;
782	nz \|= non_zero << n;
783	}
784	}
785	} else {
786	for (n = `0`; n < `16`; n += `2`) {
787	// Zero-out the first coeff, so that: a) nz is correct below, and
788	// b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
789	tmp[n][`0`] = tmp[n + `1`][`0`] = `0`;
790	nz \|= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
791	assert(rd->y_ac_levels[n + `0`][`0`] == `0`);
792	assert(rd->y_ac_levels[n + `1`][`0`] == `0`);
793	}
794	}
795
796	// Transform back
797	VP8TransformWHT(dc_tmp, tmp[`0`]);
798	for (n = `0`; n < `16`; n += `2`) {
799	VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], `1`);
800	}
801
802	return nz;
803	}
804
805	static int ReconstructIntra4(VP8EncIterator* const it,
806	int16_t levels[`16`],
807	const uint8_t* const src,
808	uint8_t* const yuv_out,
809	int mode) {
810	const VP8Encoder* const enc = it->enc_;
811	const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
812	const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
813	int nz = `0`;
814	int16_t tmp[`16`];
815
816	VP8FTransform(src, ref, tmp);
817	if (DO_TRELLIS_I4 && it->do_trellis_) {
818	const int x = it->i4_ & `3`, y = it->i4_ >> `2`;
819	const int ctx = it->top_nz_[x] + it->left_nz_[y];
820	nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, `3`, &dqm->y1_,
821	dqm->lambda_trellis_i4_);
822	} else {
823	nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
824	}
825	VP8ITransform(ref, tmp, yuv_out, `0`);
826	return nz;
827	}
828
829	static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
830	uint8_t* const yuv_out, int mode) {
831	const VP8Encoder* const enc = it->enc_;
832	const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
833	const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
834	const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
835	int nz = `0`;
836	int n;
837	int16_t tmp[`8`][`16`];
838
839	for (n = `0`; n < `8`; n += `2`) {
840	VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
841	}
842	if (DO_TRELLIS_UV && it->do_trellis_) {
843	int ch, x, y;
844	for (ch = `0`, n = `0`; ch <= `2`; ch += `2`) {
845	for (y = `0`; y < `2`; ++y) {
846	for (x = `0`; x < `2`; ++x, ++n) {
847	const int ctx = it->top_nz_[`4` + ch + x] + it->left_nz_[`4` + ch + y];
848	const int non_zero =
849	TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, `2`,
850	&dqm->uv_, dqm->lambda_trellis_uv_);
851	it->top_nz_[`4` + ch + x] = it->left_nz_[`4` + ch + y] = non_zero;
852	nz \|= non_zero << n;
853	}
854	}
855	}
856	} else {
857	for (n = `0`; n < `8`; n += `2`) {
858	nz \|= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
859	}
860	}
861
862	for (n = `0`; n < `8`; n += `2`) {
863	VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], `1`);
864	}
865	return (nz << `16`);
866	}
867
868	//------------------------------------------------------------------------------
869	// RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
870	// Pick the mode is lower RD-cost = Rate + lambda Distortion.*
871
872	static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[`16`]) {
873	// We look at the first three AC coefficients to determine what is the average
874	// delta between each sub-4x4 block.
875	const int v0 = abs(DCs[`1`]);
876	const int v1 = abs(DCs[`2`]);
877	const int v2 = abs(DCs[`4`]);
878	int max_v = (v1 > v0) ? v1 : v0;
879	max_v = (v2 > max_v) ? v2 : max_v;
880	if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
881	}
882
883	static void SwapModeScore(VP8ModeScore a, VP8ModeScore b) {
884	VP8ModeScore* const tmp = *a;
885	a = b;
886	*b = tmp;
887	}
888
889	static void SwapPtr(uint8_t a, uint8_t b) {
890	uint8_t* const tmp = *a;
891	a = b;
892	*b = tmp;
893	}
894
895	static void SwapOut(VP8EncIterator* const it) {
896	SwapPtr(&it->yuv_out_, &it->yuv_out2_);
897	}
898
899	static score_t IsFlat(const int16_t* levels, int num_blocks, score_t thresh) {
900	score_t score = `0`;
901	while (num_blocks-- > `0`) { // TODO(skal): refine positional scoring?
902	int i;
903	for (i = `1`; i < `16`; ++i) { // omit DC, we're only interested in AC
904	score += (levels[i] != `0`);
905	if (score > thresh) return `0`;
906	}
907	levels += `16`;
908	}
909	return `1`;
910	}
911
912	static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
913	const int kNumBlocks = `16`;
914	VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
915	const int lambda = dqm->lambda_i16_;
916	const int tlambda = dqm->tlambda_;
917	const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
918	VP8ModeScore rd_tmp;
919	VP8ModeScore* rd_cur = &rd_tmp;
920	VP8ModeScore* rd_best = rd;
921	int mode;
922
923	rd->mode_i16 = -`1`;
924	for (mode = `0`; mode < NUM_PRED_MODES; ++mode) {
925	uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC; // scratch buffer
926	rd_cur->mode_i16 = mode;
927
928	// Reconstruct
929	rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);
930
931	// Measure RD-score
932	rd_cur->D = VP8SSE16x16(src, tmp_dst);
933	rd_cur->SD =
934	tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : `0`;
935	rd_cur->H = VP8FixedCostsI16[mode];
936	rd_cur->R = VP8GetCostLuma16(it, rd_cur);
937	if (mode > `0` &&
938	IsFlat(rd_cur->y_ac_levels[`0`], kNumBlocks, FLATNESS_LIMIT_I16)) {
939	// penalty to avoid flat area to be mispredicted by complex mode
940	rd_cur->R += FLATNESS_PENALTY * kNumBlocks;
941	}
942
943	// Since we always examine Intra16 first, we can overwrite rd directly.*
944	SetRDScore(lambda, rd_cur);
945	if (mode == `0` \|\| rd_cur->score < rd_best->score) {
946	SwapModeScore(&rd_cur, &rd_best);
947	SwapOut(it);
948	}
949	}
950	if (rd_best != rd) {
951	memcpy(rd, rd_best, sizeof(*rd));
952	}
953	SetRDScore(dqm->lambda_mode_, rd); // finalize score for mode decision.
954	VP8SetIntra16Mode(it, rd->mode_i16);
955
956	// we have a blocky macroblock (only DCs are non-zero) with fairly high
957	// distortion, record max delta so we can later adjust the minimal filtering
958	// strength needed to smooth these blocks out.
959	if ((rd->nz & `0x100ffff`) == `0x1000000` && rd->D > dqm->min_disto_) {
960	StoreMaxDelta(dqm, rd->y_dc_levels);
961	}
962	}
963
964	//------------------------------------------------------------------------------
965
966	// return the cost array corresponding to the surrounding prediction modes.
967	static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
968	const uint8_t modes[`16`]) {
969	const int preds_w = it->enc_->preds_w_;
970	const int x = (it->i4_ & `3`), y = it->i4_ >> `2`;
971	const int left = (x == `0`) ? it->preds_[y * preds_w - `1`] : modes[it->i4_ - `1`];
972	const int top = (y == `0`) ? it->preds_[-preds_w + x] : modes[it->i4_ - `4`];
973	return VP8FixedCostsI4[top][left];
974	}
975
976	static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
977	const VP8Encoder* const enc = it->enc_;
978	const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
979	const int lambda = dqm->lambda_i4_;
980	const int tlambda = dqm->tlambda_;
981	const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
982	uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
983	int total_header_bits = `0`;
984	VP8ModeScore rd_best;
985
986	if (enc->max_i4_header_bits_ == `0`) {
987	return `0`;
988	}
989
990	InitScore(&rd_best);
991	rd_best.H = `211`; // '211' is the value of VP8BitCost(0, 145)
992	SetRDScore(dqm->lambda_mode_, &rd_best);
993	VP8IteratorStartI4(it);
994	do {
995	const int kNumBlocks = `1`;
996	VP8ModeScore rd_i4;
997	int mode;
998	int best_mode = -`1`;
999	const uint8_t* const src = src0 + VP8Scan[it->i4_];
1000	const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1001	uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
1002	uint8_t* tmp_dst = it->yuv_p_ + I4TMP; // scratch buffer.
1003
1004	InitScore(&rd_i4);
1005	VP8MakeIntra4Preds(it);
1006	for (mode = `0`; mode < NUM_BMODES; ++mode) {
1007	VP8ModeScore rd_tmp;
1008	int16_t tmp_levels[`16`];
1009
1010	// Reconstruct
1011	rd_tmp.nz =
1012	ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
1013
1014	// Compute RD-score
1015	rd_tmp.D = VP8SSE4x4(src, tmp_dst);
1016	rd_tmp.SD =
1017	tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
1018	: `0`;
1019	rd_tmp.H = mode_costs[mode];
1020
1021	// Add flatness penalty
1022	if (mode > `0` && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
1023	rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
1024	} else {
1025	rd_tmp.R = `0`;
1026	}
1027
1028	// early-out check
1029	SetRDScore(lambda, &rd_tmp);
1030	if (best_mode >= `0` && rd_tmp.score >= rd_i4.score) continue;
1031
1032	// finish computing score
1033	rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
1034	SetRDScore(lambda, &rd_tmp);
1035
1036	if (best_mode < `0` \|\| rd_tmp.score < rd_i4.score) {
1037	CopyScore(&rd_i4, &rd_tmp);
1038	best_mode = mode;
1039	SwapPtr(&tmp_dst, &best_block);
1040	memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
1041	sizeof(rd_best.y_ac_levels[it->i4_]));
1042	}
1043	}
1044	SetRDScore(dqm->lambda_mode_, &rd_i4);
1045	AddScore(&rd_best, &rd_i4);
1046	if (rd_best.score >= rd->score) {
1047	return `0`;
1048	}
1049	total_header_bits += (int)rd_i4.H; // <- equal to mode_costs[best_mode];
1050	if (total_header_bits > enc->max_i4_header_bits_) {
1051	return `0`;
1052	}
1053	// Copy selected samples if not in the right place already.
1054	if (best_block != best_blocks + VP8Scan[it->i4_]) {
1055	VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
1056	}
1057	rd->modes_i4[it->i4_] = best_mode;
1058	it->top_nz_[it->i4_ & `3`] = it->left_nz_[it->i4_ >> `2`] = (rd_i4.nz ? `1` : `0`);
1059	} while (VP8IteratorRotateI4(it, best_blocks));
1060
1061	// finalize state
1062	CopyScore(rd, &rd_best);
1063	VP8SetIntra4Mode(it, rd->modes_i4);
1064	SwapOut(it);
1065	memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
1066	return `1`; // select intra4x4 over intra16x16
1067	}
1068
1069	//------------------------------------------------------------------------------
1070
1071	static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
1072	const int kNumBlocks = `8`;
1073	const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1074	const int lambda = dqm->lambda_uv_;
1075	const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1076	uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC; // scratch buffer
1077	uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
1078	uint8_t* dst = dst0;
1079	VP8ModeScore rd_best;
1080	int mode;
1081
1082	rd->mode_uv = -`1`;
1083	InitScore(&rd_best);
1084	for (mode = `0`; mode < NUM_PRED_MODES; ++mode) {
1085	VP8ModeScore rd_uv;
1086
1087	// Reconstruct
1088	rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
1089
1090	// Compute RD-score
1091	rd_uv.D = VP8SSE16x8(src, tmp_dst);
1092	rd_uv.SD = `0`; // not calling TDisto here: it tends to flatten areas.
1093	rd_uv.H = VP8FixedCostsUV[mode];
1094	rd_uv.R = VP8GetCostUV(it, &rd_uv);
1095	if (mode > `0` && IsFlat(rd_uv.uv_levels[`0`], kNumBlocks, FLATNESS_LIMIT_UV)) {
1096	rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
1097	}
1098
1099	SetRDScore(lambda, &rd_uv);
1100	if (mode == `0` \|\| rd_uv.score < rd_best.score) {
1101	CopyScore(&rd_best, &rd_uv);
1102	rd->mode_uv = mode;
1103	memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
1104	SwapPtr(&dst, &tmp_dst);
1105	}
1106	}
1107	VP8SetIntraUVMode(it, rd->mode_uv);
1108	AddScore(rd, &rd_best);
1109	if (dst != dst0) { // copy 16x8 block if needed
1110	VP8Copy16x8(dst, dst0);
1111	}
1112	}
1113
1114	//------------------------------------------------------------------------------
1115	// Final reconstruction and quantization.
1116
1117	static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
1118	const VP8Encoder* const enc = it->enc_;
1119	const int is_i16 = (it->mb_->type_ == `1`);
1120	int nz = `0`;
1121
1122	if (is_i16) {
1123	nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[`0`]);
1124	} else {
1125	VP8IteratorStartI4(it);
1126	do {
1127	const int mode =
1128	it->preds_[(it->i4_ & `3`) + (it->i4_ >> `2`) * enc->preds_w_];
1129	const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1130	uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
1131	VP8MakeIntra4Preds(it);
1132	nz \|= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1133	src, dst, mode) << it->i4_;
1134	} while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
1135	}
1136
1137	nz \|= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1138	rd->nz = nz;
1139	}
1140
1141	// Refine intra16/intra4 sub-modes based on distortion only (not rate).
1142	static void RefineUsingDistortion(VP8EncIterator* const it,
1143	int try_both_modes, int refine_uv_mode,
1144	VP8ModeScore* const rd) {
1145	score_t best_score = MAX_COST;
1146	int nz = `0`;
1147	int mode;
1148	int is_i16 = try_both_modes \|\| (it->mb_->type_ == `1`);
1149
1150	const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1151	// Some empiric constants, of approximate order of magnitude.
1152	const int lambda_d_i16 = `106`;
1153	const int lambda_d_i4 = `11`;
1154	const int lambda_d_uv = `120`;
1155	score_t score_i4 = dqm->i4_penalty_;
1156	score_t i4_bit_sum = `0`;
1157	const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
1158	: MAX_COST; // no early-out allowed
1159
1160	if (is_i16) { // First, evaluate Intra16 distortion
1161	int best_mode = -`1`;
1162	const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
1163	for (mode = `0`; mode < NUM_PRED_MODES; ++mode) {
1164	const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
1165	const score_t score = VP8SSE16x16(src, ref) * RD_DISTO_MULT
1166	+ VP8FixedCostsI16[mode] * lambda_d_i16;
1167	if (mode > `0` && VP8FixedCostsI16[mode] > bit_limit) {
1168	continue;
1169	}
1170	if (score < best_score) {
1171	best_mode = mode;
1172	best_score = score;
1173	}
1174	}
1175	VP8SetIntra16Mode(it, best_mode);
1176	// we'll reconstruct later, if i16 mode actually gets selected
1177	}
1178
1179	// Next, evaluate Intra4
1180	if (try_both_modes \|\| !is_i16) {
1181	// We don't evaluate the rate here, but just account for it through a
1182	// constant penalty (i4 mode usually needs more bits compared to i16).
1183	is_i16 = `0`;
1184	VP8IteratorStartI4(it);
1185	do {
1186	int best_i4_mode = -`1`;
1187	score_t best_i4_score = MAX_COST;
1188	const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1189	const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1190
1191	VP8MakeIntra4Preds(it);
1192	for (mode = `0`; mode < NUM_BMODES; ++mode) {
1193	const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
1194	const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
1195	+ mode_costs[mode] * lambda_d_i4;
1196	if (score < best_i4_score) {
1197	best_i4_mode = mode;
1198	best_i4_score = score;
1199	}
1200	}
1201	i4_bit_sum += mode_costs[best_i4_mode];
1202	rd->modes_i4[it->i4_] = best_i4_mode;
1203	score_i4 += best_i4_score;
1204	if (score_i4 >= best_score \|\| i4_bit_sum > bit_limit) {
1205	// Intra4 won't be better than Intra16. Bail out and pick Intra16.
1206	is_i16 = `1`;
1207	break;
1208	} else { // reconstruct partial block inside yuv_out2_ buffer
1209	uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
1210	nz \|= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1211	src, tmp_dst, best_i4_mode) << it->i4_;
1212	}
1213	} while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
1214	}
1215
1216	// Final reconstruction, depending on which mode is selected.
1217	if (!is_i16) {
1218	VP8SetIntra4Mode(it, rd->modes_i4);
1219	SwapOut(it);
1220	best_score = score_i4;
1221	} else {
1222	nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[`0`]);
1223	}
1224
1225	// ... and UV!
1226	if (refine_uv_mode) {
1227	int best_mode = -`1`;
1228	score_t best_uv_score = MAX_COST;
1229	const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1230	for (mode = `0`; mode < NUM_PRED_MODES; ++mode) {
1231	const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
1232	const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
1233	+ VP8FixedCostsUV[mode] * lambda_d_uv;
1234	if (score < best_uv_score) {
1235	best_mode = mode;
1236	best_uv_score = score;
1237	}
1238	}
1239	VP8SetIntraUVMode(it, best_mode);
1240	}
1241	nz \|= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1242
1243	rd->nz = nz;
1244	rd->score = best_score;
1245	}
1246
1247	//------------------------------------------------------------------------------
1248	// Entry point
1249
1250	int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
1251	VP8RDLevel rd_opt) {
1252	int is_skipped;
1253	const int method = it->enc_->method_;
1254
1255	InitScore(rd);
1256
1257	// We can perform predictions for Luma16x16 and Chroma8x8 already.
1258	// Luma4x4 predictions needs to be done as-we-go.
1259	VP8MakeLuma16Preds(it);
1260	VP8MakeChroma8Preds(it);
1261
1262	if (rd_opt > RD_OPT_NONE) {
1263	it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1264	PickBestIntra16(it, rd);
1265	if (method >= `2`) {
1266	PickBestIntra4(it, rd);
1267	}
1268	PickBestUV(it, rd);
1269	if (rd_opt == RD_OPT_TRELLIS) { // finish off with trellis-optim now
1270	it->do_trellis_ = `1`;
1271	SimpleQuantize(it, rd);
1272	}
1273	} else {
1274	// At this point we have heuristically decided intra16 / intra4.
1275	// For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
1276	// For method <= 1, we don't re-examine the decision but just go ahead with
1277	// quantization/reconstruction.
1278	RefineUsingDistortion(it, (method >= `2`), (method >= `1`), rd);
1279	}
1280	is_skipped = (rd->nz == `0`);
1281	VP8SetSkip(it, is_skipped);
1282	return is_skipped;
1283	}
1284

Browse the source code of engine/third_party/libwebp/src/enc/quant_enc.c