quant_enc.c source code [Skia/third_party/externals/libwebp/src/enc/quant_enc.c]

1	// Copyright 2011 Google Inc. All Rights Reserved.
2	//
3	// Use of this source code is governed by a BSD-style license
4	// that can be found in the COPYING file in the root of the source
5	// tree. An additional intellectual property rights grant can be found
6	// in the file PATENTS. All contributing project authors may
7	// be found in the AUTHORS file in the root of the source tree.
8	// -----------------------------------------------------------------------------
9	//
10	// Quantization
11	//
12	// Author: Skal (pascal.massimino@gmail.com)
13
14	#include <assert.h>
15	#include <math.h>
16	#include <stdlib.h> // for abs()
17
18	#include "src/dsp/quant.h"
19	#include "src/enc/vp8i_enc.h"
20	#include "src/enc/cost_enc.h"
21
22	#define DO_TRELLIS_I4 1
23	#define DO_TRELLIS_I16 1 // not a huge gain, but ok at low bitrate.
24	#define DO_TRELLIS_UV 0 // disable trellis for UV. Risky. Not worth.
25	#define USE_TDISTO 1
26
27	#define MID_ALPHA 64 // neutral value for susceptibility
28	#define MIN_ALPHA 30 // lowest usable value for susceptibility
29	#define MAX_ALPHA 100 // higher meaningful value for susceptibility
30
31	#define SNS_TO_DQ 0.9 // Scaling constant between the sns value and the QP
32	// power-law modulation. Must be strictly less than 1.
33
34	// number of non-zero coeffs below which we consider the block very flat
35	// (and apply a penalty to complex predictions)
36	#define FLATNESS_LIMIT_I16 0 // I16 mode (special case)
37	#define FLATNESS_LIMIT_I4 3 // I4 mode
38	#define FLATNESS_LIMIT_UV 2 // UV mode
39	#define FLATNESS_PENALTY 140 // roughly ~1bit per block
40
41	#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
42
43	#define RD_DISTO_MULT 256 // distortion multiplier (equivalent of lambda)
44
45	// #define DEBUG_BLOCK
46
47	//------------------------------------------------------------------------------
48
49	#if defined(DEBUG_BLOCK)
50
51	#include <stdio.h>
52	#include <stdlib.h>
53
54	static void PrintBlockInfo(const VP8EncIterator* const it,
55	const VP8ModeScore* const rd) {
56	int i, j;
57	const int is_i16 = (it->mb_->type_ == `1`);
58	const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
59	const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
60	const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
61	const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
62	printf("SOURCE / OUTPUT / ABS DELTA\n");
63	for (j = `0`; j < `16`; ++j) {
64	for (i = `0`; i < `16`; ++i) printf("%3d ", y_in[i + j * BPS]);
65	printf(" ");
66	for (i = `0`; i < `16`; ++i) printf("%3d ", y_out[i + j * BPS]);
67	printf(" ");
68	for (i = `0`; i < `16`; ++i) {
69	printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
70	}
71	printf("\n");
72	}
73	printf("\n"); // newline before the U/V block
74	for (j = `0`; j < `8`; ++j) {
75	for (i = `0`; i < `8`; ++i) printf("%3d ", uv_in[i + j * BPS]);
76	printf(" ");
77	for (i = `8`; i < `16`; ++i) printf("%3d ", uv_in[i + j * BPS]);
78	printf(" ");
79	for (i = `0`; i < `8`; ++i) printf("%3d ", uv_out[i + j * BPS]);
80	printf(" ");
81	for (i = `8`; i < `16`; ++i) printf("%3d ", uv_out[i + j * BPS]);
82	printf(" ");
83	for (i = `0`; i < `8`; ++i) {
84	printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
85	}
86	printf(" ");
87	for (i = `8`; i < `16`; ++i) {
88	printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
89	}
90	printf("\n");
91	}
92	printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
93	(int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
94	(int)rd->score);
95	if (is_i16) {
96	printf("Mode: %d\n", rd->mode_i16);
97	printf("y_dc_levels:");
98	for (i = `0`; i < `16`; ++i) printf("%3d ", rd->y_dc_levels[i]);
99	printf("\n");
100	} else {
101	printf("Modes[16]: ");
102	for (i = `0`; i < `16`; ++i) printf("%d ", rd->modes_i4[i]);
103	printf("\n");
104	}
105	printf("y_ac_levels:\n");
106	for (j = `0`; j < `16`; ++j) {
107	for (i = is_i16 ? `1` : `0`; i < `16`; ++i) {
108	printf("%4d ", rd->y_ac_levels[j][i]);
109	}
110	printf("\n");
111	}
112	printf("\n");
113	printf("uv_levels (mode=%d):\n", rd->mode_uv);
114	for (j = `0`; j < `8`; ++j) {
115	for (i = `0`; i < `16`; ++i) {
116	printf("%4d ", rd->uv_levels[j][i]);
117	}
118	printf("\n");
119	}
120	}
121
122	#endif // DEBUG_BLOCK
123
124	//------------------------------------------------------------------------------
125
126	static WEBP_INLINE int clip(int v, int m, int M) {
127	return v < m ? m : v > M ? M : v;
128	}
129
130	static const uint8_t kZigzag[`16`] = {
131	`0`, `1`, `4`, `8`, `5`, `2`, `3`, `6`, `9`, `12`, `13`, `10`, `7`, `11`, `14`, `15`
132	};
133
134	static const uint8_t kDcTable[`128`] = {
135	`4`, `5`, `6`, `7`, `8`, `9`, `10`, `10`,
136	`11`, `12`, `13`, `14`, `15`, `16`, `17`, `17`,
137	`18`, `19`, `20`, `20`, `21`, `21`, `22`, `22`,
138	`23`, `23`, `24`, `25`, `25`, `26`, `27`, `28`,
139	`29`, `30`, `31`, `32`, `33`, `34`, `35`, `36`,
140	`37`, `37`, `38`, `39`, `40`, `41`, `42`, `43`,
141	`44`, `45`, `46`, `46`, `47`, `48`, `49`, `50`,
142	`51`, `52`, `53`, `54`, `55`, `56`, `57`, `58`,
143	`59`, `60`, `61`, `62`, `63`, `64`, `65`, `66`,
144	`67`, `68`, `69`, `70`, `71`, `72`, `73`, `74`,
145	`75`, `76`, `76`, `77`, `78`, `79`, `80`, `81`,
146	`82`, `83`, `84`, `85`, `86`, `87`, `88`, `89`,
147	`91`, `93`, `95`, `96`, `98`, `100`, `101`, `102`,
148	`104`, `106`, `108`, `110`, `112`, `114`, `116`, `118`,
149	`122`, `124`, `126`, `128`, `130`, `132`, `134`, `136`,
150	`138`, `140`, `143`, `145`, `148`, `151`, `154`, `157`
151	};
152
153	static const uint16_t kAcTable[`128`] = {
154	`4`, `5`, `6`, `7`, `8`, `9`, `10`, `11`,
155	`12`, `13`, `14`, `15`, `16`, `17`, `18`, `19`,
156	`20`, `21`, `22`, `23`, `24`, `25`, `26`, `27`,
157	`28`, `29`, `30`, `31`, `32`, `33`, `34`, `35`,
158	`36`, `37`, `38`, `39`, `40`, `41`, `42`, `43`,
159	`44`, `45`, `46`, `47`, `48`, `49`, `50`, `51`,
160	`52`, `53`, `54`, `55`, `56`, `57`, `58`, `60`,
161	`62`, `64`, `66`, `68`, `70`, `72`, `74`, `76`,
162	`78`, `80`, `82`, `84`, `86`, `88`, `90`, `92`,
163	`94`, `96`, `98`, `100`, `102`, `104`, `106`, `108`,
164	`110`, `112`, `114`, `116`, `119`, `122`, `125`, `128`,
165	`131`, `134`, `137`, `140`, `143`, `146`, `149`, `152`,
166	`155`, `158`, `161`, `164`, `167`, `170`, `173`, `177`,
167	`181`, `185`, `189`, `193`, `197`, `201`, `205`, `209`,
168	`213`, `217`, `221`, `225`, `229`, `234`, `239`, `245`,
169	`249`, `254`, `259`, `264`, `269`, `274`, `279`, `284`
170	};
171
172	static const uint16_t kAcTable2[`128`] = {
173	`8`, `8`, `9`, `10`, `12`, `13`, `15`, `17`,
174	`18`, `20`, `21`, `23`, `24`, `26`, `27`, `29`,
175	`31`, `32`, `34`, `35`, `37`, `38`, `40`, `41`,
176	`43`, `44`, `46`, `48`, `49`, `51`, `52`, `54`,
177	`55`, `57`, `58`, `60`, `62`, `63`, `65`, `66`,
178	`68`, `69`, `71`, `72`, `74`, `75`, `77`, `79`,
179	`80`, `82`, `83`, `85`, `86`, `88`, `89`, `93`,
180	`96`, `99`, `102`, `105`, `108`, `111`, `114`, `117`,
181	`120`, `124`, `127`, `130`, `133`, `136`, `139`, `142`,
182	`145`, `148`, `151`, `155`, `158`, `161`, `164`, `167`,
183	`170`, `173`, `176`, `179`, `184`, `189`, `193`, `198`,
184	`203`, `207`, `212`, `217`, `221`, `226`, `230`, `235`,
185	`240`, `244`, `249`, `254`, `258`, `263`, `268`, `274`,
186	`280`, `286`, `292`, `299`, `305`, `311`, `317`, `323`,
187	`330`, `336`, `342`, `348`, `354`, `362`, `370`, `379`,
188	`385`, `393`, `401`, `409`, `416`, `424`, `432`, `440`
189	};
190
191	static const uint8_t kBiasMatrices[`3`][`2`] = { // [luma-ac,luma-dc,chroma][dc,ac]
192	{ `96`, `110` }, { `96`, `108` }, { `110`, `115` }
193	};
194
195	// Sharpening by (slightly) raising the hi-frequency coeffs.
196	// Hack-ish but helpful for mid-bitrate range. Use with care.
197	#define SHARPEN_BITS 11 // number of descaling bits for sharpening bias
198	static const uint8_t kFreqSharpening[`16`] = {
199	`0`, `30`, `60`, `90`,
200	`30`, `60`, `90`, `90`,
201	`60`, `90`, `90`, `90`,
202	`90`, `90`, `90`, `90`
203	};
204
205	//------------------------------------------------------------------------------
206	// Initialize quantization parameters in VP8Matrix
207
208	// Returns the average quantizer
209	static int ExpandMatrix(VP8Matrix* const m, int type) {
210	int i, sum;
211	for (i = `0`; i < `2`; ++i) {
212	const int is_ac_coeff = (i > `0`);
213	const int bias = kBiasMatrices[type][is_ac_coeff];
214	m->iq_[i] = (`1` << QFIX) / m->q_[i];
215	m->bias_[i] = BIAS(bias);
216	// zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
217	// zero if coeff <= zthresh*
218	// non-zero if coeff > zthresh*
219	m->zthresh_[i] = ((`1` << QFIX) - `1` - m->bias_[i]) / m->iq_[i];
220	}
221	for (i = `2`; i < `16`; ++i) {
222	m->q_[i] = m->q_[`1`];
223	m->iq_[i] = m->iq_[`1`];
224	m->bias_[i] = m->bias_[`1`];
225	m->zthresh_[i] = m->zthresh_[`1`];
226	}
227	for (sum = `0`, i = `0`; i < `16`; ++i) {
228	if (type == `0`) { // we only use sharpening for AC luma coeffs
229	m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
230	} else {
231	m->sharpen_[i] = `0`;
232	}
233	sum += m->q_[i];
234	}
235	return (sum + `8`) >> `4`;
236	}
237
238	static void CheckLambdaValue(int* const v) { if (v < `1`) v = `1`; }
239
240	static void SetupMatrices(VP8Encoder* enc) {
241	int i;
242	const int tlambda_scale =
243	(enc->method_ >= `4`) ? enc->config_->sns_strength
244	: `0`;
245	const int num_segments = enc->segment_hdr_.num_segments_;
246	for (i = `0`; i < num_segments; ++i) {
247	VP8SegmentInfo* const m = &enc->dqm_[i];
248	const int q = m->quant_;
249	int q_i4, q_i16, q_uv;
250	m->y1_.q_[`0`] = kDcTable[clip(q + enc->dq_y1_dc_, `0`, `127`)];
251	m->y1_.q_[`1`] = kAcTable[clip(q, `0`, `127`)];
252
253	m->y2_.q_[`0`] = kDcTable[ clip(q + enc->dq_y2_dc_, `0`, `127`)] * `2`;
254	m->y2_.q_[`1`] = kAcTable2[clip(q + enc->dq_y2_ac_, `0`, `127`)];
255
256	m->uv_.q_[`0`] = kDcTable[clip(q + enc->dq_uv_dc_, `0`, `117`)];
257	m->uv_.q_[`1`] = kAcTable[clip(q + enc->dq_uv_ac_, `0`, `127`)];
258
259	q_i4 = ExpandMatrix(&m->y1_, `0`);
260	q_i16 = ExpandMatrix(&m->y2_, `1`);
261	q_uv = ExpandMatrix(&m->uv_, `2`);
262
263	m->lambda_i4_ = (`3` * q_i4 * q_i4) >> `7`;
264	m->lambda_i16_ = (`3` * q_i16 * q_i16);
265	m->lambda_uv_ = (`3` * q_uv * q_uv) >> `6`;
266	m->lambda_mode_ = (`1` * q_i4 * q_i4) >> `7`;
267	m->lambda_trellis_i4_ = (`7` * q_i4 * q_i4) >> `3`;
268	m->lambda_trellis_i16_ = (q_i16 * q_i16) >> `2`;
269	m->lambda_trellis_uv_ = (q_uv * q_uv) << `1`;
270	m->tlambda_ = (tlambda_scale * q_i4) >> `5`;
271
272	// none of these constants should be < 1
273	CheckLambdaValue(&m->lambda_i4_);
274	CheckLambdaValue(&m->lambda_i16_);
275	CheckLambdaValue(&m->lambda_uv_);
276	CheckLambdaValue(&m->lambda_mode_);
277	CheckLambdaValue(&m->lambda_trellis_i4_);
278	CheckLambdaValue(&m->lambda_trellis_i16_);
279	CheckLambdaValue(&m->lambda_trellis_uv_);
280	CheckLambdaValue(&m->tlambda_);
281
282	m->min_disto_ = `20` * m->y1_.q_[`0`]; // quantization-aware min disto
283	m->max_edge_ = `0`;
284
285	m->i4_penalty_ = `1000` * q_i4 * q_i4;
286	}
287	}
288
289	//------------------------------------------------------------------------------
290	// Initialize filtering parameters
291
292	// Very small filter-strength values have close to no visual effect. So we can
293	// save a little decoding-CPU by turning filtering off for these.
294	#define FSTRENGTH_CUTOFF 2
295
296	static void SetupFilterStrength(VP8Encoder* const enc) {
297	int i;
298	// level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
299	const int level0 = `5` * enc->config_->filter_strength;
300	for (i = `0`; i < NUM_MB_SEGMENTS; ++i) {
301	VP8SegmentInfo* const m = &enc->dqm_[i];
302	// We focus on the quantization of AC coeffs.
303	const int qstep = kAcTable[clip(m->quant_, `0`, `127`)] >> `2`;
304	const int base_strength =
305	VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
306	// Segments with lower complexity ('beta') will be less filtered.
307	const int f = base_strength * level0 / (`256` + m->beta_);
308	m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? `0` : (f > `63`) ? `63` : f;
309	}
310	// We record the initial strength (mainly for the case of 1-segment only).
311	enc->filter_hdr_.level_ = enc->dqm_[`0`].fstrength_;
312	enc->filter_hdr_.simple_ = (enc->config_->filter_type == `0`);
313	enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
314	}
315
316	//------------------------------------------------------------------------------
317
318	// Note: if you change the values below, remember that the max range
319	// allowed by the syntax for DQ_UV is [-16,16].
320	#define MAX_DQ_UV (6)
321	#define MIN_DQ_UV (-4)
322
323	// We want to emulate jpeg-like behaviour where the expected "good" quality
324	// is around q=75. Internally, our "good" middle is around c=50. So we
325	// map accordingly using linear piece-wise function
326	static double QualityToCompression(double c) {
327	const double linear_c = (c < `0.75`) ? c * (`2.` / `3.`) : `2.` * c - `1.`;
328	// The file size roughly scales as pow(quantizer, 3.). Actually, the
329	// exponent is somewhere between 2.8 and 3.2, but we're mostly interested
330	// in the mid-quant range. So we scale the compressibility inversely to
331	// this power-law: quant ~= compression ^ 1/3. This law holds well for
332	// low quant. Finer modeling for high-quant would make use of kAcTable[]
333	// more explicitly.
334	const double v = pow(linear_c, `1` / `3.`);
335	return v;
336	}
337
338	static double QualityToJPEGCompression(double c, double alpha) {
339	// We map the complexity 'alpha' and quality setting 'c' to a compression
340	// exponent empirically matched to the compression curve of libjpeg6b.
341	// On average, the WebP output size will be roughly similar to that of a
342	// JPEG file compressed with same quality factor.
343	const double amin = `0.30`;
344	const double amax = `0.85`;
345	const double exp_min = `0.4`;
346	const double exp_max = `0.9`;
347	const double slope = (exp_min - exp_max) / (amax - amin);
348	// Linearly interpolate 'expn' from exp_min to exp_max
349	// in the [amin, amax] range.
350	const double expn = (alpha > amax) ? exp_min
351	: (alpha < amin) ? exp_max
352	: exp_max + slope * (alpha - amin);
353	const double v = pow(c, expn);
354	return v;
355	}
356
357	static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
358	const VP8SegmentInfo* const S2) {
359	return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
360	}
361
362	static void SimplifySegments(VP8Encoder* const enc) {
363	int map[NUM_MB_SEGMENTS] = { `0`, `1`, `2`, `3` };
364	// 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
365	// explicit check is needed to avoid a spurious warning about 'i' exceeding
366	// array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
367	const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
368	? enc->segment_hdr_.num_segments_
369	: NUM_MB_SEGMENTS;
370	int num_final_segments = `1`;
371	int s1, s2;
372	for (s1 = `1`; s1 < num_segments; ++s1) { // find similar segments
373	const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
374	int found = `0`;
375	// check if we already have similar segment
376	for (s2 = `0`; s2 < num_final_segments; ++s2) {
377	const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
378	if (SegmentsAreEquivalent(S1, S2)) {
379	found = `1`;
380	break;
381	}
382	}
383	map[s1] = s2;
384	if (!found) {
385	if (num_final_segments != s1) {
386	enc->dqm_[num_final_segments] = enc->dqm_[s1];
387	}
388	++num_final_segments;
389	}
390	}
391	if (num_final_segments < num_segments) { // Remap
392	int i = enc->mb_w_ * enc->mb_h_;
393	while (i-- > `0`) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
394	enc->segment_hdr_.num_segments_ = num_final_segments;
395	// Replicate the trailing segment infos (it's mostly cosmetics)
396	for (i = num_final_segments; i < num_segments; ++i) {
397	enc->dqm_[i] = enc->dqm_[num_final_segments - `1`];
398	}
399	}
400	}
401
402	void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
403	int i;
404	int dq_uv_ac, dq_uv_dc;
405	const int num_segments = enc->segment_hdr_.num_segments_;
406	const double amp = SNS_TO_DQ * enc->config_->sns_strength / `100.` / `128.`;
407	const double Q = quality / `100.`;
408	const double c_base = enc->config_->emulate_jpeg_size ?
409	QualityToJPEGCompression(Q, enc->alpha_ / `255.`) :
410	QualityToCompression(Q);
411	for (i = `0`; i < num_segments; ++i) {
412	// We modulate the base coefficient to accommodate for the quantization
413	// susceptibility and allow denser segments to be quantized more.
414	const double expn = `1.` - amp * enc->dqm_[i].alpha_;
415	const double c = pow(c_base, expn);
416	const int q = (int)(`127.` * (`1.` - c));
417	assert(expn > `0.`);
418	enc->dqm_[i].quant_ = clip(q, `0`, `127`);
419	}
420
421	// purely indicative in the bitstream (except for the 1-segment case)
422	enc->base_quant_ = enc->dqm_[`0`].quant_;
423
424	// fill-in values for the unused segments (required by the syntax)
425	for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
426	enc->dqm_[i].quant_ = enc->base_quant_;
427	}
428
429	// uv_alpha_ is normally spread around ~60. The useful range is
430	// typically ~30 (quite bad) to ~100 (ok to decimate UV more).
431	// We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
432	dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
433	/ (MAX_ALPHA - MIN_ALPHA);
434	// we rescale by the user-defined strength of adaptation
435	dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / `100`;
436	// and make it safe.
437	dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
438	// We also boost the dc-uv-quant a little, based on sns-strength, since
439	// U/V channels are quite more reactive to high quants (flat DC-blocks
440	// tend to appear, and are unpleasant).
441	dq_uv_dc = -`4` * enc->config_->sns_strength / `100`;
442	dq_uv_dc = clip(dq_uv_dc, -`15`, `15`); // 4bit-signed max allowed
443
444	enc->dq_y1_dc_ = `0`; // TODO(skal): dq-lum
445	enc->dq_y2_dc_ = `0`;
446	enc->dq_y2_ac_ = `0`;
447	enc->dq_uv_dc_ = dq_uv_dc;
448	enc->dq_uv_ac_ = dq_uv_ac;
449
450	SetupFilterStrength(enc); // initialize segments' filtering, eventually
451
452	if (num_segments > `1`) SimplifySegments(enc);
453
454	SetupMatrices(enc); // finalize quantization matrices
455	}
456
457	//------------------------------------------------------------------------------
458	// Form the predictions in cache
459
460	// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
461	const uint16_t VP8I16ModeOffsets[`4`] = { I16DC16, I16TM16, I16VE16, I16HE16 };
462	const uint16_t VP8UVModeOffsets[`4`] = { C8DC8, C8TM8, C8VE8, C8HE8 };
463
464	// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
465	const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
466	I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
467	};
468
469	void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
470	const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
471	const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
472	VP8EncPredLuma16(it->yuv_p_, left, top);
473	}
474
475	void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
476	const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
477	const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
478	VP8EncPredChroma8(it->yuv_p_, left, top);
479	}
480
481	void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
482	VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
483	}
484
485	//------------------------------------------------------------------------------
486	// Quantize
487
488	// Layout:
489	// +----+----+
490	// \|YYYY\|UUVV\| 0
491	// \|YYYY\|UUVV\| 4
492	// \|YYYY\|....\| 8
493	// \|YYYY\|....\| 12
494	// +----+----+
495
496	const uint16_t VP8Scan[`16`] = { // Luma
497	`0` + `0` * BPS, `4` + `0` * BPS, `8` + `0` * BPS, `12` + `0` * BPS,
498	`0` + `4` * BPS, `4` + `4` * BPS, `8` + `4` * BPS, `12` + `4` * BPS,
499	`0` + `8` * BPS, `4` + `8` * BPS, `8` + `8` * BPS, `12` + `8` * BPS,
500	`0` + `12` * BPS, `4` + `12` * BPS, `8` + `12` * BPS, `12` + `12` * BPS,
501	};
502
503	static const uint16_t VP8ScanUV[`4` + `4`] = {
504	`0` + `0` * BPS, `4` + `0` * BPS, `0` + `4` * BPS, `4` + `4` * BPS, // U
505	`8` + `0` * BPS, `12` + `0` * BPS, `8` + `4` * BPS, `12` + `4` * BPS // V
506	};
507
508	//------------------------------------------------------------------------------
509	// Distortion measurement
510
511	static const uint16_t kWeightY[`16`] = {
512	`38`, `32`, `20`, `9`, `32`, `28`, `17`, `7`, `20`, `17`, `10`, `4`, `9`, `7`, `4`, `2`
513	};
514
515	static const uint16_t kWeightTrellis[`16`] = {
516	#if USE_TDISTO == 0
517	`16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`, `16`
518	#else
519	`30`, `27`, `19`, `11`,
520	`27`, `24`, `17`, `10`,
521	`19`, `17`, `12`, `8`,
522	`11`, `10`, `8`, `6`
523	#endif
524	};
525
526	// Init/Copy the common fields in score.
527	static void InitScore(VP8ModeScore* const rd) {
528	rd->D = `0`;
529	rd->SD = `0`;
530	rd->R = `0`;
531	rd->H = `0`;
532	rd->nz = `0`;
533	rd->score = MAX_COST;
534	}
535
536	static void CopyScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
537	dst->D = src->D;
538	dst->SD = src->SD;
539	dst->R = src->R;
540	dst->H = src->H;
541	dst->nz = src->nz; // note that nz is not accumulated, but just copied.
542	dst->score = src->score;
543	}
544
545	static void AddScore(VP8ModeScore* const dst, const VP8ModeScore* const src) {
546	dst->D += src->D;
547	dst->SD += src->SD;
548	dst->R += src->R;
549	dst->H += src->H;
550	dst->nz \|= src->nz; // here, new nz bits are accumulated.
551	dst->score += src->score;
552	}
553
554	//------------------------------------------------------------------------------
555	// Performs trellis-optimized quantization.
556
557	// Trellis node
558	typedef struct {
559	int8_t prev; // best previous node
560	int8_t sign; // sign of coeff_i
561	int16_t level; // level
562	} Node;
563
564	// Score state
565	typedef struct {
566	score_t score; // partial RD score
567	const uint16_t* costs; // shortcut to cost tables
568	} ScoreState;
569
570	// If a coefficient was quantized to a value Q (using a neutral bias),
571	// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA]
572	// We don't test negative values though.
573	#define MIN_DELTA 0 // how much lower level to try
574	#define MAX_DELTA 1 // how much higher
575	#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
576	#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
577	#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])
578
579	static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
580	rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
581	}
582
583	static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
584	score_t distortion) {
585	return rate * lambda + RD_DISTO_MULT * distortion;
586	}
587
588	static int TrellisQuantizeBlock(const VP8Encoder* const enc,
589	int16_t in[`16`], int16_t out[`16`],
590	int ctx0, int coeff_type,
591	const VP8Matrix* const mtx,
592	int lambda) {
593	const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
594	CostArrayPtr const costs =
595	(CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
596	const int first = (coeff_type == `0`) ? `1` : `0`;
597	Node nodes[`16`][NUM_NODES];
598	ScoreState score_states[`2`][NUM_NODES];
599	ScoreState* ss_cur = &SCORE_STATE(`0`, MIN_DELTA);
600	ScoreState* ss_prev = &SCORE_STATE(`1`, MIN_DELTA);
601	int best_path[`3`] = {-`1`, -`1`, -`1`}; // store best-last/best-level/best-previous
602	score_t best_score;
603	int n, m, p, last;
604
605	{
606	score_t cost;
607	const int thresh = mtx->q_[`1`] * mtx->q_[`1`] / `4`;
608	const int last_proba = probas[VP8EncBands[first]][ctx0][`0`];
609
610	// compute the position of the last interesting coefficient
611	last = first - `1`;
612	for (n = `15`; n >= first; --n) {
613	const int j = kZigzag[n];
614	const int err = in[j] * in[j];
615	if (err > thresh) {
616	last = n;
617	break;
618	}
619	}
620	// we don't need to go inspect up to n = 16 coeffs. We can just go up
621	// to last + 1 (inclusive) without losing much.
622	if (last < `15`) ++last;
623
624	// compute 'skip' score. This is the max score one can do.
625	cost = VP8BitCost(`0`, last_proba);
626	best_score = RDScoreTrellis(lambda, cost, `0`);
627
628	// initialize source node.
629	for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
630	const score_t rate = (ctx0 == `0`) ? VP8BitCost(`1`, last_proba) : `0`;
631	ss_cur[m].score = RDScoreTrellis(lambda, rate, `0`);
632	ss_cur[m].costs = costs[first][ctx0];
633	}
634	}
635
636	// traverse trellis.
637	for (n = first; n <= last; ++n) {
638	const int j = kZigzag[n];
639	const uint32_t Q = mtx->q_[j];
640	const uint32_t iQ = mtx->iq_[j];
641	const uint32_t B = BIAS(`0x00`); // neutral bias
642	// note: it's important to take sign of the _original_ coeff,
643	// so we don't have to consider level < 0 afterward.
644	const int sign = (in[j] < `0`);
645	const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
646	int level0 = QUANTDIV(coeff0, iQ, B);
647	int thresh_level = QUANTDIV(coeff0, iQ, BIAS(`0x80`));
648	if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
649	if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;
650
651	{ // Swap current and previous score states
652	ScoreState* const tmp = ss_cur;
653	ss_cur = ss_prev;
654	ss_prev = tmp;
655	}
656
657	// test all alternate level values around level0.
658	for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
659	Node* const cur = &NODE(n, m);
660	int level = level0 + m;
661	const int ctx = (level > `2`) ? `2` : level;
662	const int band = VP8EncBands[n + `1`];
663	score_t base_score;
664	score_t best_cur_score = MAX_COST;
665	int best_prev = `0`; // default, in case
666
667	ss_cur[m].score = MAX_COST;
668	ss_cur[m].costs = costs[n + `1`][ctx];
669	if (level < `0` \|\| level > thresh_level) {
670	// Node is dead.
671	continue;
672	}
673
674	{
675	// Compute delta_error = how much coding this level will
676	// subtract to max_error as distortion.
677	// Here, distortion = sum of (\|coeff_i\| - level_i Q_i)^2*
678	const int new_error = coeff0 - level * Q;
679	const int delta_error =
680	kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
681	base_score = RDScoreTrellis(lambda, `0`, delta_error);
682	}
683
684	// Inspect all possible non-dead predecessors. Retain only the best one.
685	for (p = -MIN_DELTA; p <= MAX_DELTA; ++p) {
686	// Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
687	// eliminated since their score can't be better than the current best.
688	const score_t cost = VP8LevelCost(ss_prev[p].costs, level);
689	// Examine node assuming it's a non-terminal one.
690	const score_t score =
691	base_score + ss_prev[p].score + RDScoreTrellis(lambda, cost, `0`);
692	if (score < best_cur_score) {
693	best_cur_score = score;
694	best_prev = p;
695	}
696	}
697	// Store best finding in current node.
698	cur->sign = sign;
699	cur->level = level;
700	cur->prev = best_prev;
701	ss_cur[m].score = best_cur_score;
702
703	// Now, record best terminal node (and thus best entry in the graph).
704	if (level != `0`) {
705	const score_t last_pos_cost =
706	(n < `15`) ? VP8BitCost(`0`, probas[band][ctx][`0`]) : `0`;
707	const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, `0`);
708	const score_t score = best_cur_score + last_pos_score;
709	if (score < best_score) {
710	best_score = score;
711	best_path[`0`] = n; // best eob position
712	best_path[`1`] = m; // best node index
713	best_path[`2`] = best_prev; // best predecessor
714	}
715	}
716	}
717	}
718
719	// Fresh start
720	memset(in + first, `0`, (`16` - first) * sizeof(*in));
721	memset(out + first, `0`, (`16` - first) * sizeof(*out));
722	if (best_path[`0`] == -`1`) {
723	return `0`; // skip!
724	}
725
726	{
727	// Unwind the best path.
728	// Note: best-prev on terminal node is not necessarily equal to the
729	// best_prev for non-terminal. So we patch best_path[2] in.
730	int nz = `0`;
731	int best_node = best_path[`1`];
732	n = best_path[`0`];
733	NODE(n, best_node).prev = best_path[`2`]; // force best-prev for terminal
734
735	for (; n >= first; --n) {
736	const Node* const node = &NODE(n, best_node);
737	const int j = kZigzag[n];
738	out[n] = node->sign ? -node->level : node->level;
739	nz \|= node->level;
740	in[j] = out[n] * mtx->q_[j];
741	best_node = node->prev;
742	}
743	return (nz != `0`);
744	}
745	}
746
747	#undef NODE
748
749	//------------------------------------------------------------------------------
750	// Performs: difference, transform, quantize, back-transform, add
751	// all at once. Output is the reconstructed block in yuv_out, and the*
752	// quantized levels in levels.*
753
754	static int ReconstructIntra16(VP8EncIterator* const it,
755	VP8ModeScore* const rd,
756	uint8_t* const yuv_out,
757	int mode) {
758	const VP8Encoder* const enc = it->enc_;
759	const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
760	const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
761	const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
762	int nz = `0`;
763	int n;
764	int16_t tmp[`16`][`16`], dc_tmp[`16`];
765
766	for (n = `0`; n < `16`; n += `2`) {
767	VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
768	}
769	VP8FTransformWHT(tmp[`0`], dc_tmp);
770	nz \|= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << `24`;
771
772	if (DO_TRELLIS_I16 && it->do_trellis_) {
773	int x, y;
774	VP8IteratorNzToBytes(it);
775	for (y = `0`, n = `0`; y < `4`; ++y) {
776	for (x = `0`; x < `4`; ++x, ++n) {
777	const int ctx = it->top_nz_[x] + it->left_nz_[y];
778	const int non_zero =
779	TrellisQuantizeBlock(enc, tmp[n], rd->y_ac_levels[n], ctx, `0`,
780	&dqm->y1_, dqm->lambda_trellis_i16_);
781	it->top_nz_[x] = it->left_nz_[y] = non_zero;
782	rd->y_ac_levels[n][`0`] = `0`;
783	nz \|= non_zero << n;
784	}
785	}
786	} else {
787	for (n = `0`; n < `16`; n += `2`) {
788	// Zero-out the first coeff, so that: a) nz is correct below, and
789	// b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
790	tmp[n][`0`] = tmp[n + `1`][`0`] = `0`;
791	nz \|= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
792	assert(rd->y_ac_levels[n + `0`][`0`] == `0`);
793	assert(rd->y_ac_levels[n + `1`][`0`] == `0`);
794	}
795	}
796
797	// Transform back
798	VP8TransformWHT(dc_tmp, tmp[`0`]);
799	for (n = `0`; n < `16`; n += `2`) {
800	VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], `1`);
801	}
802
803	return nz;
804	}
805
806	static int ReconstructIntra4(VP8EncIterator* const it,
807	int16_t levels[`16`],
808	const uint8_t* const src,
809	uint8_t* const yuv_out,
810	int mode) {
811	const VP8Encoder* const enc = it->enc_;
812	const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
813	const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
814	int nz = `0`;
815	int16_t tmp[`16`];
816
817	VP8FTransform(src, ref, tmp);
818	if (DO_TRELLIS_I4 && it->do_trellis_) {
819	const int x = it->i4_ & `3`, y = it->i4_ >> `2`;
820	const int ctx = it->top_nz_[x] + it->left_nz_[y];
821	nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, `3`, &dqm->y1_,
822	dqm->lambda_trellis_i4_);
823	} else {
824	nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
825	}
826	VP8ITransform(ref, tmp, yuv_out, `0`);
827	return nz;
828	}
829
830	//------------------------------------------------------------------------------
831	// DC-error diffusion
832
833	// Diffusion weights. We under-correct a bit (15/16th of the error is actually
834	// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
835	#define C1 7 // fraction of error sent to the 4x4 block below
836	#define C2 8 // fraction of error sent to the 4x4 block on the right
837	#define DSHIFT 4
838	#define DSCALE 1 // storage descaling, needed to make the error fit int8_t
839
840	// Quantize as usual, but also compute and return the quantization error.
841	// Error is already divided by DSHIFT.
842	static int QuantizeSingle(int16_t* const v, const VP8Matrix* const mtx) {
843	int V = *v;
844	const int sign = (V < `0`);
845	if (sign) V = -V;
846	if (V > (int)mtx->zthresh_[`0`]) {
847	const int qV = QUANTDIV(V, mtx->iq_[`0`], mtx->bias_[`0`]) * mtx->q_[`0`];
848	const int err = (V - qV);
849	*v = sign ? -qV : qV;
850	return (sign ? -err : err) >> DSCALE;
851	}
852	*v = `0`;
853	return (sign ? -V : V) >> DSCALE;
854	}
855
856	static void CorrectDCValues(const VP8EncIterator* const it,
857	const VP8Matrix* const mtx,
858	int16_t tmp[][`16`], VP8ModeScore* const rd) {
859	// \| top[0] \| top[1]
860	// --------+--------+---------
861	// left[0] \| tmp[0] tmp[1] <-> err0 err1
862	// left[1] \| tmp[2] tmp[3] err2 err3
863	//
864	// Final errors {err1,err2,err3} are preserved and later restored
865	// as top[]/left[] on the next block.
866	int ch;
867	for (ch = `0`; ch <= `1`; ++ch) {
868	const int8_t* const top = it->top_derr_[it->x_][ch];
869	const int8_t* const left = it->left_derr_[ch];
870	int16_t (* const c)[`16`] = &tmp[ch * `4`];
871	int err0, err1, err2, err3;
872	c[`0`][`0`] += (C1 * top[`0`] + C2 * left[`0`]) >> (DSHIFT - DSCALE);
873	err0 = QuantizeSingle(&c[`0`][`0`], mtx);
874	c[`1`][`0`] += (C1 * top[`1`] + C2 * err0) >> (DSHIFT - DSCALE);
875	err1 = QuantizeSingle(&c[`1`][`0`], mtx);
876	c[`2`][`0`] += (C1 * err0 + C2 * left[`1`]) >> (DSHIFT - DSCALE);
877	err2 = QuantizeSingle(&c[`2`][`0`], mtx);
878	c[`3`][`0`] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
879	err3 = QuantizeSingle(&c[`3`][`0`], mtx);
880	// error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
881	// err >> DSCALE will fit in an int8_t type if DSCALE>=1.
882	assert(abs(err1) <= `127` && abs(err2) <= `127` && abs(err3) <= `127`);
883	rd->derr[ch][`0`] = (int8_t)err1;
884	rd->derr[ch][`1`] = (int8_t)err2;
885	rd->derr[ch][`2`] = (int8_t)err3;
886	}
887	}
888
889	static void StoreDiffusionErrors(VP8EncIterator* const it,
890	const VP8ModeScore* const rd) {
891	int ch;
892	for (ch = `0`; ch <= `1`; ++ch) {
893	int8_t* const top = it->top_derr_[it->x_][ch];
894	int8_t* const left = it->left_derr_[ch];
895	left[`0`] = rd->derr[ch][`0`]; // restore err1
896	left[`1`] = `3` * rd->derr[ch][`2`] >> `2`; // ... 3/4th of err3
897	top[`0`] = rd->derr[ch][`1`]; // ... err2
898	top[`1`] = rd->derr[ch][`2`] - left[`1`]; // ... 1/4th of err3.
899	}
900	}
901
902	#undef C1
903	#undef C2
904	#undef DSHIFT
905	#undef DSCALE
906
907	//------------------------------------------------------------------------------
908
909	static int ReconstructUV(VP8EncIterator* const it, VP8ModeScore* const rd,
910	uint8_t* const yuv_out, int mode) {
911	const VP8Encoder* const enc = it->enc_;
912	const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
913	const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
914	const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
915	int nz = `0`;
916	int n;
917	int16_t tmp[`8`][`16`];
918
919	for (n = `0`; n < `8`; n += `2`) {
920	VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
921	}
922	if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);
923
924	if (DO_TRELLIS_UV && it->do_trellis_) {
925	int ch, x, y;
926	for (ch = `0`, n = `0`; ch <= `2`; ch += `2`) {
927	for (y = `0`; y < `2`; ++y) {
928	for (x = `0`; x < `2`; ++x, ++n) {
929	const int ctx = it->top_nz_[`4` + ch + x] + it->left_nz_[`4` + ch + y];
930	const int non_zero =
931	TrellisQuantizeBlock(enc, tmp[n], rd->uv_levels[n], ctx, `2`,
932	&dqm->uv_, dqm->lambda_trellis_uv_);
933	it->top_nz_[`4` + ch + x] = it->left_nz_[`4` + ch + y] = non_zero;
934	nz \|= non_zero << n;
935	}
936	}
937	}
938	} else {
939	for (n = `0`; n < `8`; n += `2`) {
940	nz \|= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
941	}
942	}
943
944	for (n = `0`; n < `8`; n += `2`) {
945	VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], `1`);
946	}
947	return (nz << `16`);
948	}
949
950	//------------------------------------------------------------------------------
951	// RD-opt decision. Reconstruct each modes, evalue distortion and bit-cost.
952	// Pick the mode is lower RD-cost = Rate + lambda Distortion.*
953
954	static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[`16`]) {
955	// We look at the first three AC coefficients to determine what is the average
956	// delta between each sub-4x4 block.
957	const int v0 = abs(DCs[`1`]);
958	const int v1 = abs(DCs[`2`]);
959	const int v2 = abs(DCs[`4`]);
960	int max_v = (v1 > v0) ? v1 : v0;
961	max_v = (v2 > max_v) ? v2 : max_v;
962	if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
963	}
964
965	static void SwapModeScore(VP8ModeScore a, VP8ModeScore b) {
966	VP8ModeScore* const tmp = *a;
967	a = b;
968	*b = tmp;
969	}
970
971	static void SwapPtr(uint8_t a, uint8_t b) {
972	uint8_t* const tmp = *a;
973	a = b;
974	*b = tmp;
975	}
976
977	static void SwapOut(VP8EncIterator* const it) {
978	SwapPtr(&it->yuv_out_, &it->yuv_out2_);
979	}
980
981	static void PickBestIntra16(VP8EncIterator* const it, VP8ModeScore* rd) {
982	const int kNumBlocks = `16`;
983	VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
984	const int lambda = dqm->lambda_i16_;
985	const int tlambda = dqm->tlambda_;
986	const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
987	VP8ModeScore rd_tmp;
988	VP8ModeScore* rd_cur = &rd_tmp;
989	VP8ModeScore* rd_best = rd;
990	int mode;
991	int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);
992
993	rd->mode_i16 = -`1`;
994	for (mode = `0`; mode < NUM_PRED_MODES; ++mode) {
995	uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC; // scratch buffer
996	rd_cur->mode_i16 = mode;
997
998	// Reconstruct
999	rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);
1000
1001	// Measure RD-score
1002	rd_cur->D = VP8SSE16x16(src, tmp_dst);
1003	rd_cur->SD =
1004	tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : `0`;
1005	rd_cur->H = VP8FixedCostsI16[mode];
1006	rd_cur->R = VP8GetCostLuma16(it, rd_cur);
1007	if (is_flat) {
1008	// refine the first impression (which was in pixel space)
1009	is_flat = IsFlat(rd_cur->y_ac_levels[`0`], kNumBlocks, FLATNESS_LIMIT_I16);
1010	if (is_flat) {
1011	// Block is very flat. We put emphasis on the distortion being very low!
1012	rd_cur->D *= `2`;
1013	rd_cur->SD *= `2`;
1014	}
1015	}
1016
1017	// Since we always examine Intra16 first, we can overwrite rd directly.*
1018	SetRDScore(lambda, rd_cur);
1019	if (mode == `0` \|\| rd_cur->score < rd_best->score) {
1020	SwapModeScore(&rd_cur, &rd_best);
1021	SwapOut(it);
1022	}
1023	}
1024	if (rd_best != rd) {
1025	memcpy(rd, rd_best, sizeof(*rd));
1026	}
1027	SetRDScore(dqm->lambda_mode_, rd); // finalize score for mode decision.
1028	VP8SetIntra16Mode(it, rd->mode_i16);
1029
1030	// we have a blocky macroblock (only DCs are non-zero) with fairly high
1031	// distortion, record max delta so we can later adjust the minimal filtering
1032	// strength needed to smooth these blocks out.
1033	if ((rd->nz & `0x100ffff`) == `0x1000000` && rd->D > dqm->min_disto_) {
1034	StoreMaxDelta(dqm, rd->y_dc_levels);
1035	}
1036	}
1037
1038	//------------------------------------------------------------------------------
1039
1040	// return the cost array corresponding to the surrounding prediction modes.
1041	static const uint16_t* GetCostModeI4(VP8EncIterator* const it,
1042	const uint8_t modes[`16`]) {
1043	const int preds_w = it->enc_->preds_w_;
1044	const int x = (it->i4_ & `3`), y = it->i4_ >> `2`;
1045	const int left = (x == `0`) ? it->preds_[y * preds_w - `1`] : modes[it->i4_ - `1`];
1046	const int top = (y == `0`) ? it->preds_[-preds_w + x] : modes[it->i4_ - `4`];
1047	return VP8FixedCostsI4[top][left];
1048	}
1049
1050	static int PickBestIntra4(VP8EncIterator* const it, VP8ModeScore* const rd) {
1051	const VP8Encoder* const enc = it->enc_;
1052	const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
1053	const int lambda = dqm->lambda_i4_;
1054	const int tlambda = dqm->tlambda_;
1055	const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
1056	uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
1057	int total_header_bits = `0`;
1058	VP8ModeScore rd_best;
1059
1060	if (enc->max_i4_header_bits_ == `0`) {
1061	return `0`;
1062	}
1063
1064	InitScore(&rd_best);
1065	rd_best.H = `211`; // '211' is the value of VP8BitCost(0, 145)
1066	SetRDScore(dqm->lambda_mode_, &rd_best);
1067	VP8IteratorStartI4(it);
1068	do {
1069	const int kNumBlocks = `1`;
1070	VP8ModeScore rd_i4;
1071	int mode;
1072	int best_mode = -`1`;
1073	const uint8_t* const src = src0 + VP8Scan[it->i4_];
1074	const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1075	uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
1076	uint8_t* tmp_dst = it->yuv_p_ + I4TMP; // scratch buffer.
1077
1078	InitScore(&rd_i4);
1079	VP8MakeIntra4Preds(it);
1080	for (mode = `0`; mode < NUM_BMODES; ++mode) {
1081	VP8ModeScore rd_tmp;
1082	int16_t tmp_levels[`16`];
1083
1084	// Reconstruct
1085	rd_tmp.nz =
1086	ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;
1087
1088	// Compute RD-score
1089	rd_tmp.D = VP8SSE4x4(src, tmp_dst);
1090	rd_tmp.SD =
1091	tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
1092	: `0`;
1093	rd_tmp.H = mode_costs[mode];
1094
1095	// Add flatness penalty, to avoid flat area to be mispredicted
1096	// by a complex mode.
1097	if (mode > `0` && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
1098	rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
1099	} else {
1100	rd_tmp.R = `0`;
1101	}
1102
1103	// early-out check
1104	SetRDScore(lambda, &rd_tmp);
1105	if (best_mode >= `0` && rd_tmp.score >= rd_i4.score) continue;
1106
1107	// finish computing score
1108	rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
1109	SetRDScore(lambda, &rd_tmp);
1110
1111	if (best_mode < `0` \|\| rd_tmp.score < rd_i4.score) {
1112	CopyScore(&rd_i4, &rd_tmp);
1113	best_mode = mode;
1114	SwapPtr(&tmp_dst, &best_block);
1115	memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
1116	sizeof(rd_best.y_ac_levels[it->i4_]));
1117	}
1118	}
1119	SetRDScore(dqm->lambda_mode_, &rd_i4);
1120	AddScore(&rd_best, &rd_i4);
1121	if (rd_best.score >= rd->score) {
1122	return `0`;
1123	}
1124	total_header_bits += (int)rd_i4.H; // <- equal to mode_costs[best_mode];
1125	if (total_header_bits > enc->max_i4_header_bits_) {
1126	return `0`;
1127	}
1128	// Copy selected samples if not in the right place already.
1129	if (best_block != best_blocks + VP8Scan[it->i4_]) {
1130	VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
1131	}
1132	rd->modes_i4[it->i4_] = best_mode;
1133	it->top_nz_[it->i4_ & `3`] = it->left_nz_[it->i4_ >> `2`] = (rd_i4.nz ? `1` : `0`);
1134	} while (VP8IteratorRotateI4(it, best_blocks));
1135
1136	// finalize state
1137	CopyScore(rd, &rd_best);
1138	VP8SetIntra4Mode(it, rd->modes_i4);
1139	SwapOut(it);
1140	memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
1141	return `1`; // select intra4x4 over intra16x16
1142	}
1143
1144	//------------------------------------------------------------------------------
1145
1146	static void PickBestUV(VP8EncIterator* const it, VP8ModeScore* const rd) {
1147	const int kNumBlocks = `8`;
1148	const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1149	const int lambda = dqm->lambda_uv_;
1150	const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1151	uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC; // scratch buffer
1152	uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
1153	uint8_t* dst = dst0;
1154	VP8ModeScore rd_best;
1155	int mode;
1156
1157	rd->mode_uv = -`1`;
1158	InitScore(&rd_best);
1159	for (mode = `0`; mode < NUM_PRED_MODES; ++mode) {
1160	VP8ModeScore rd_uv;
1161
1162	// Reconstruct
1163	rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);
1164
1165	// Compute RD-score
1166	rd_uv.D = VP8SSE16x8(src, tmp_dst);
1167	rd_uv.SD = `0`; // not calling TDisto here: it tends to flatten areas.
1168	rd_uv.H = VP8FixedCostsUV[mode];
1169	rd_uv.R = VP8GetCostUV(it, &rd_uv);
1170	if (mode > `0` && IsFlat(rd_uv.uv_levels[`0`], kNumBlocks, FLATNESS_LIMIT_UV)) {
1171	rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
1172	}
1173
1174	SetRDScore(lambda, &rd_uv);
1175	if (mode == `0` \|\| rd_uv.score < rd_best.score) {
1176	CopyScore(&rd_best, &rd_uv);
1177	rd->mode_uv = mode;
1178	memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
1179	if (it->top_derr_ != NULL) {
1180	memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
1181	}
1182	SwapPtr(&dst, &tmp_dst);
1183	}
1184	}
1185	VP8SetIntraUVMode(it, rd->mode_uv);
1186	AddScore(rd, &rd_best);
1187	if (dst != dst0) { // copy 16x8 block if needed
1188	VP8Copy16x8(dst, dst0);
1189	}
1190	if (it->top_derr_ != NULL) { // store diffusion errors for next block
1191	StoreDiffusionErrors(it, rd);
1192	}
1193	}
1194
1195	//------------------------------------------------------------------------------
1196	// Final reconstruction and quantization.
1197
1198	static void SimpleQuantize(VP8EncIterator* const it, VP8ModeScore* const rd) {
1199	const VP8Encoder* const enc = it->enc_;
1200	const int is_i16 = (it->mb_->type_ == `1`);
1201	int nz = `0`;
1202
1203	if (is_i16) {
1204	nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[`0`]);
1205	} else {
1206	VP8IteratorStartI4(it);
1207	do {
1208	const int mode =
1209	it->preds_[(it->i4_ & `3`) + (it->i4_ >> `2`) * enc->preds_w_];
1210	const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1211	uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
1212	VP8MakeIntra4Preds(it);
1213	nz \|= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1214	src, dst, mode) << it->i4_;
1215	} while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
1216	}
1217
1218	nz \|= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1219	rd->nz = nz;
1220	}
1221
1222	// Refine intra16/intra4 sub-modes based on distortion only (not rate).
1223	static void RefineUsingDistortion(VP8EncIterator* const it,
1224	int try_both_modes, int refine_uv_mode,
1225	VP8ModeScore* const rd) {
1226	score_t best_score = MAX_COST;
1227	int nz = `0`;
1228	int mode;
1229	int is_i16 = try_both_modes \|\| (it->mb_->type_ == `1`);
1230
1231	const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
1232	// Some empiric constants, of approximate order of magnitude.
1233	const int lambda_d_i16 = `106`;
1234	const int lambda_d_i4 = `11`;
1235	const int lambda_d_uv = `120`;
1236	score_t score_i4 = dqm->i4_penalty_;
1237	score_t i4_bit_sum = `0`;
1238	const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
1239	: MAX_COST; // no early-out allowed
1240
1241	if (is_i16) { // First, evaluate Intra16 distortion
1242	int best_mode = -`1`;
1243	const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
1244	for (mode = `0`; mode < NUM_PRED_MODES; ++mode) {
1245	const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
1246	const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
1247	+ VP8FixedCostsI16[mode] * lambda_d_i16;
1248	if (mode > `0` && VP8FixedCostsI16[mode] > bit_limit) {
1249	continue;
1250	}
1251
1252	if (score < best_score) {
1253	best_mode = mode;
1254	best_score = score;
1255	}
1256	}
1257	if (it->x_ == `0` \|\| it->y_ == `0`) {
1258	// avoid starting a checkerboard resonance from the border. See bug #432.
1259	if (IsFlatSource16(src)) {
1260	best_mode = (it->x_ == `0`) ? `0` : `2`;
1261	try_both_modes = `0`; // stick to i16
1262	}
1263	}
1264	VP8SetIntra16Mode(it, best_mode);
1265	// we'll reconstruct later, if i16 mode actually gets selected
1266	}
1267
1268	// Next, evaluate Intra4
1269	if (try_both_modes \|\| !is_i16) {
1270	// We don't evaluate the rate here, but just account for it through a
1271	// constant penalty (i4 mode usually needs more bits compared to i16).
1272	is_i16 = `0`;
1273	VP8IteratorStartI4(it);
1274	do {
1275	int best_i4_mode = -`1`;
1276	score_t best_i4_score = MAX_COST;
1277	const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
1278	const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
1279
1280	VP8MakeIntra4Preds(it);
1281	for (mode = `0`; mode < NUM_BMODES; ++mode) {
1282	const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
1283	const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
1284	+ mode_costs[mode] * lambda_d_i4;
1285	if (score < best_i4_score) {
1286	best_i4_mode = mode;
1287	best_i4_score = score;
1288	}
1289	}
1290	i4_bit_sum += mode_costs[best_i4_mode];
1291	rd->modes_i4[it->i4_] = best_i4_mode;
1292	score_i4 += best_i4_score;
1293	if (score_i4 >= best_score \|\| i4_bit_sum > bit_limit) {
1294	// Intra4 won't be better than Intra16. Bail out and pick Intra16.
1295	is_i16 = `1`;
1296	break;
1297	} else { // reconstruct partial block inside yuv_out2_ buffer
1298	uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
1299	nz \|= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
1300	src, tmp_dst, best_i4_mode) << it->i4_;
1301	}
1302	} while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
1303	}
1304
1305	// Final reconstruction, depending on which mode is selected.
1306	if (!is_i16) {
1307	VP8SetIntra4Mode(it, rd->modes_i4);
1308	SwapOut(it);
1309	best_score = score_i4;
1310	} else {
1311	nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[`0`]);
1312	}
1313
1314	// ... and UV!
1315	if (refine_uv_mode) {
1316	int best_mode = -`1`;
1317	score_t best_uv_score = MAX_COST;
1318	const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
1319	for (mode = `0`; mode < NUM_PRED_MODES; ++mode) {
1320	const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
1321	const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
1322	+ VP8FixedCostsUV[mode] * lambda_d_uv;
1323	if (score < best_uv_score) {
1324	best_mode = mode;
1325	best_uv_score = score;
1326	}
1327	}
1328	VP8SetIntraUVMode(it, best_mode);
1329	}
1330	nz \|= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
1331
1332	rd->nz = nz;
1333	rd->score = best_score;
1334	}
1335
1336	//------------------------------------------------------------------------------
1337	// Entry point
1338
1339	int VP8Decimate(VP8EncIterator* const it, VP8ModeScore* const rd,
1340	VP8RDLevel rd_opt) {
1341	int is_skipped;
1342	const int method = it->enc_->method_;
1343
1344	InitScore(rd);
1345
1346	// We can perform predictions for Luma16x16 and Chroma8x8 already.
1347	// Luma4x4 predictions needs to be done as-we-go.
1348	VP8MakeLuma16Preds(it);
1349	VP8MakeChroma8Preds(it);
1350
1351	if (rd_opt > RD_OPT_NONE) {
1352	it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
1353	PickBestIntra16(it, rd);
1354	if (method >= `2`) {
1355	PickBestIntra4(it, rd);
1356	}
1357	PickBestUV(it, rd);
1358	if (rd_opt == RD_OPT_TRELLIS) { // finish off with trellis-optim now
1359	it->do_trellis_ = `1`;
1360	SimpleQuantize(it, rd);
1361	}
1362	} else {
1363	// At this point we have heuristically decided intra16 / intra4.
1364	// For method >= 2, pick the best intra4/intra16 based on SSE (~tad slower).
1365	// For method <= 1, we don't re-examine the decision but just go ahead with
1366	// quantization/reconstruction.
1367	RefineUsingDistortion(it, (method >= `2`), (method >= `1`), rd);
1368	}
1369	is_skipped = (rd->nz == `0`);
1370	VP8SetSkip(it, is_skipped);
1371	return is_skipped;
1372	}
1373

Browse the source code of Skia/third_party/externals/libwebp/src/enc/quant_enc.c