// Copyright 2011 Google Inc. All Rights Reserved.
//
// Use of this source code is governed by a BSD-style license
// that can be found in the COPYING file in the root of the source
// tree. An additional intellectual property rights grant can be found
// in the file PATENTS. All contributing project authors may
// be found in the AUTHORS file in the root of the source tree.
// -----------------------------------------------------------------------------
//
// Quantization
//
// Author: Skal (pascal.massimino@gmail.com)

#include <assert.h>
#include <math.h>
#include <stdlib.h>  // for abs()

#include "src/dsp/quant.h"
#include "src/enc/vp8i_enc.h"
#include "src/enc/cost_enc.h"

#define DO_TRELLIS_I4 1
#define DO_TRELLIS_I16 1  // not a huge gain, but ok at low bitrate.
#define DO_TRELLIS_UV 0   // disable trellis for UV. Risky. Not worth it.
#define USE_TDISTO 1

#define MID_ALPHA 64   // neutral value for susceptibility
#define MIN_ALPHA 30   // lowest usable value for susceptibility
#define MAX_ALPHA 100  // highest meaningful value for susceptibility

#define SNS_TO_DQ 0.9  // Scaling constant between the sns value and the QP
                       // power-law modulation. Must be strictly less than 1.

// number of non-zero coeffs below which we consider the block very flat
// (and apply a penalty to complex predictions)
#define FLATNESS_LIMIT_I16 0   // I16 mode (special case)
#define FLATNESS_LIMIT_I4 3    // I4 mode
#define FLATNESS_LIMIT_UV 2    // UV mode
#define FLATNESS_PENALTY 140   // roughly ~1bit per block

#define MULT_8B(a, b) (((a) * (b) + 128) >> 8)
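// MULT_8B() is a rounded fixed-point multiply by an 8.8 factor: it computes
// (a * b) / 256 with rounding. E.g. MULT_8B(64, 128) == (8192 + 128) >> 8
// == 32, i.e. 64 scaled by 0.5.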

#define RD_DISTO_MULT 256  // distortion multiplier (equivalent of lambda)

// #define DEBUG_BLOCK

//------------------------------------------------------------------------------

#if defined(DEBUG_BLOCK)

#include <stdio.h>
#include <stdlib.h>

static void PrintBlockInfo(const VP8EncIterator* const it,
                           const VP8ModeScore* const rd) {
  int i, j;
  const int is_i16 = (it->mb_->type_ == 1);
  const uint8_t* const y_in = it->yuv_in_ + Y_OFF_ENC;
  const uint8_t* const y_out = it->yuv_out_ + Y_OFF_ENC;
  const uint8_t* const uv_in = it->yuv_in_ + U_OFF_ENC;
  const uint8_t* const uv_out = it->yuv_out_ + U_OFF_ENC;
  printf("SOURCE / OUTPUT / ABS DELTA\n");
  for (j = 0; j < 16; ++j) {
    for (i = 0; i < 16; ++i) printf("%3d ", y_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) printf("%3d ", y_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 16; ++i) {
      printf("%1d ", abs(y_in[i + j * BPS] - y_out[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\n");   // newline before the U/V block
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 8; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_in[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 8; i < 16; ++i) printf("%3d ", uv_out[i + j * BPS]);
    printf(" ");
    for (i = 0; i < 8; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf(" ");
    for (i = 8; i < 16; ++i) {
      printf("%1d ", abs(uv_out[i + j * BPS] - uv_in[i + j * BPS]));
    }
    printf("\n");
  }
  printf("\nD:%d SD:%d R:%d H:%d nz:0x%x score:%d\n",
         (int)rd->D, (int)rd->SD, (int)rd->R, (int)rd->H, (int)rd->nz,
         (int)rd->score);
  if (is_i16) {
    printf("Mode: %d\n", rd->mode_i16);
    printf("y_dc_levels:");
    for (i = 0; i < 16; ++i) printf("%3d ", rd->y_dc_levels[i]);
    printf("\n");
  } else {
    printf("Modes[16]: ");
    for (i = 0; i < 16; ++i) printf("%d ", rd->modes_i4[i]);
    printf("\n");
  }
  printf("y_ac_levels:\n");
  for (j = 0; j < 16; ++j) {
    for (i = is_i16 ? 1 : 0; i < 16; ++i) {
      printf("%4d ", rd->y_ac_levels[j][i]);
    }
    printf("\n");
  }
  printf("\n");
  printf("uv_levels (mode=%d):\n", rd->mode_uv);
  for (j = 0; j < 8; ++j) {
    for (i = 0; i < 16; ++i) {
      printf("%4d ", rd->uv_levels[j][i]);
    }
    printf("\n");
  }
}

#endif  // DEBUG_BLOCK

//------------------------------------------------------------------------------

static WEBP_INLINE int clip(int v, int m, int M) {
  return v < m ? m : v > M ? M : v;
}

static const uint8_t kZigzag[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};
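
// kZigzag[] maps the coding order 'n' to the raster position inside the 4x4
// block: e.g. kZigzag[2] == 4, i.e. the third coefficient visited lives at
// raster index 4 (column 0, row 1). Low frequencies are visited first.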

static const uint8_t kDcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  10,
  11,   12,  13,  14,  15,  16,  17,  17,
  18,   19,  20,  20,  21,  21,  22,  22,
  23,   23,  24,  25,  25,  26,  27,  28,
  29,   30,  31,  32,  33,  34,  35,  36,
  37,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  46,  47,  48,  49,  50,
  51,   52,  53,  54,  55,  56,  57,  58,
  59,   60,  61,  62,  63,  64,  65,  66,
  67,   68,  69,  70,  71,  72,  73,  74,
  75,   76,  76,  77,  78,  79,  80,  81,
  82,   83,  84,  85,  86,  87,  88,  89,
  91,   93,  95,  96,  98, 100, 101, 102,
  104, 106, 108, 110, 112, 114, 116, 118,
  122, 124, 126, 128, 130, 132, 134, 136,
  138, 140, 143, 145, 148, 151, 154, 157
};

static const uint16_t kAcTable[128] = {
  4,     5,   6,   7,   8,   9,  10,  11,
  12,   13,  14,  15,  16,  17,  18,  19,
  20,   21,  22,  23,  24,  25,  26,  27,
  28,   29,  30,  31,  32,  33,  34,  35,
  36,   37,  38,  39,  40,  41,  42,  43,
  44,   45,  46,  47,  48,  49,  50,  51,
  52,   53,  54,  55,  56,  57,  58,  60,
  62,   64,  66,  68,  70,  72,  74,  76,
  78,   80,  82,  84,  86,  88,  90,  92,
  94,   96,  98, 100, 102, 104, 106, 108,
  110, 112, 114, 116, 119, 122, 125, 128,
  131, 134, 137, 140, 143, 146, 149, 152,
  155, 158, 161, 164, 167, 170, 173, 177,
  181, 185, 189, 193, 197, 201, 205, 209,
  213, 217, 221, 225, 229, 234, 239, 245,
  249, 254, 259, 264, 269, 274, 279, 284
};

static const uint16_t kAcTable2[128] = {
  8,     8,   9,  10,  12,  13,  15,  17,
  18,   20,  21,  23,  24,  26,  27,  29,
  31,   32,  34,  35,  37,  38,  40,  41,
  43,   44,  46,  48,  49,  51,  52,  54,
  55,   57,  58,  60,  62,  63,  65,  66,
  68,   69,  71,  72,  74,  75,  77,  79,
  80,   82,  83,  85,  86,  88,  89,  93,
  96,   99, 102, 105, 108, 111, 114, 117,
  120, 124, 127, 130, 133, 136, 139, 142,
  145, 148, 151, 155, 158, 161, 164, 167,
  170, 173, 176, 179, 184, 189, 193, 198,
  203, 207, 212, 217, 221, 226, 230, 235,
  240, 244, 249, 254, 258, 263, 268, 274,
  280, 286, 292, 299, 305, 311, 317, 323,
  330, 336, 342, 348, 354, 362, 370, 379,
  385, 393, 401, 409, 416, 424, 432, 440
};

static const uint8_t kBiasMatrices[3][2] = {  // [luma-ac,luma-dc,chroma][dc,ac]
  { 96, 110 }, { 96, 108 }, { 110, 115 }
};

// Sharpening by (slightly) raising the hi-frequency coeffs.
// Hack-ish but helpful for mid-bitrate range. Use with care.
#define SHARPEN_BITS 11  // number of descaling bits for sharpening bias
static const uint8_t kFreqSharpening[16] = {
   0, 30, 60, 90,
  30, 60, 90, 90,
  60, 90, 90, 90,
  90, 90, 90, 90
};
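
// For example, with q_[i] == 60 a coefficient with kFreqSharpening[] == 30
// gets a bias of (30 * 60) >> 11 == 0, while a high-frequency one with
// kFreqSharpening[] == 90 gets (90 * 60) >> 11 == 2 added to 'sharpen_'
// by ExpandMatrix() below. The effect is deliberately mild.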

//------------------------------------------------------------------------------
// Initialize quantization parameters in VP8Matrix

// Returns the average quantizer
static int ExpandMatrix(VP8Matrix* const m, int type) {
  int i, sum;
  for (i = 0; i < 2; ++i) {
    const int is_ac_coeff = (i > 0);
    const int bias = kBiasMatrices[type][is_ac_coeff];
    m->iq_[i] = (1 << QFIX) / m->q_[i];
    m->bias_[i] = BIAS(bias);
    // zthresh_ is the exact value such that QUANTDIV(coeff, iQ, B) is:
    //   * zero if coeff <= zthresh
    //   * non-zero if coeff > zthresh
    m->zthresh_[i] = ((1 << QFIX) - 1 - m->bias_[i]) / m->iq_[i];
  }
  for (i = 2; i < 16; ++i) {
    m->q_[i] = m->q_[1];
    m->iq_[i] = m->iq_[1];
    m->bias_[i] = m->bias_[1];
    m->zthresh_[i] = m->zthresh_[1];
  }
  for (sum = 0, i = 0; i < 16; ++i) {
    if (type == 0) {  // we only use sharpening for AC luma coeffs
      m->sharpen_[i] = (kFreqSharpening[i] * m->q_[i]) >> SHARPEN_BITS;
    } else {
      m->sharpen_[i] = 0;
    }
    sum += m->q_[i];
  }
  return (sum + 8) >> 4;
}
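
// Numerical sketch, assuming QFIX == 17 and
// QUANTDIV(n, iQ, B) == ((n) * (iQ) + (B)) >> QFIX as defined in vp8i_enc.h:
// with q_[0] == 4 and a DC bias of 96, iq_[0] == (1 << 17) / 4 == 32768,
// bias_[0] == BIAS(96) == 96 << 9 == 49152 and zthresh_[0] ==
// (131071 - 49152) / 32768 == 2. Indeed, a coefficient of 2 quantizes to
// (2 * 32768 + 49152) >> 17 == 0 while 3 quantizes to 1, matching the
// zthresh_ definition above.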

static void CheckLambdaValue(int* const v) { if (*v < 1) *v = 1; }

static void SetupMatrices(VP8Encoder* enc) {
  int i;
  const int tlambda_scale =
      (enc->method_ >= 4) ? enc->config_->sns_strength
                          : 0;
  const int num_segments = enc->segment_hdr_.num_segments_;
  for (i = 0; i < num_segments; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    const int q = m->quant_;
    int q_i4, q_i16, q_uv;
    m->y1_.q_[0] = kDcTable[clip(q + enc->dq_y1_dc_, 0, 127)];
    m->y1_.q_[1] = kAcTable[clip(q,                  0, 127)];

    m->y2_.q_[0] = kDcTable[ clip(q + enc->dq_y2_dc_, 0, 127)] * 2;
    m->y2_.q_[1] = kAcTable2[clip(q + enc->dq_y2_ac_, 0, 127)];

    m->uv_.q_[0] = kDcTable[clip(q + enc->dq_uv_dc_, 0, 117)];
    m->uv_.q_[1] = kAcTable[clip(q + enc->dq_uv_ac_, 0, 127)];

    q_i4  = ExpandMatrix(&m->y1_, 0);
    q_i16 = ExpandMatrix(&m->y2_, 1);
    q_uv  = ExpandMatrix(&m->uv_, 2);

    m->lambda_i4_          = (3 * q_i4 * q_i4) >> 7;
    m->lambda_i16_         = (3 * q_i16 * q_i16);
    m->lambda_uv_          = (3 * q_uv * q_uv) >> 6;
    m->lambda_mode_        = (1 * q_i4 * q_i4) >> 7;
    m->lambda_trellis_i4_  = (7 * q_i4 * q_i4) >> 3;
    m->lambda_trellis_i16_ = (q_i16 * q_i16) >> 2;
    m->lambda_trellis_uv_  = (q_uv * q_uv) << 1;
    m->tlambda_            = (tlambda_scale * q_i4) >> 5;

    // none of these constants should be < 1
    CheckLambdaValue(&m->lambda_i4_);
    CheckLambdaValue(&m->lambda_i16_);
    CheckLambdaValue(&m->lambda_uv_);
    CheckLambdaValue(&m->lambda_mode_);
    CheckLambdaValue(&m->lambda_trellis_i4_);
    CheckLambdaValue(&m->lambda_trellis_i16_);
    CheckLambdaValue(&m->lambda_trellis_uv_);
    CheckLambdaValue(&m->tlambda_);

    m->min_disto_ = 20 * m->y1_.q_[0];  // quantization-aware min disto
    m->max_edge_  = 0;

    m->i4_penalty_ = 1000 * q_i4 * q_i4;
  }
}
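
// For instance (illustrative numbers only): with q_i4 == 40,
// lambda_i4_ == (3 * 40 * 40) >> 7 == 37 and, at sns_strength == 50 with
// method_ >= 4, tlambda_ == (50 * 40) >> 5 == 62. The lambdas thus grow
// roughly with the square of the quantizer, which keeps the rate and
// distortion terms of the RD score commensurate across quality levels.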

//------------------------------------------------------------------------------
// Initialize filtering parameters

// Very small filter-strength values have close to no visual effect. So we can
// save a little decoding-CPU by turning filtering off for these.
#define FSTRENGTH_CUTOFF 2

static void SetupFilterStrength(VP8Encoder* const enc) {
  int i;
  // level0 is in [0..500]. Using '-f 50' as filter_strength is mid-filtering.
  const int level0 = 5 * enc->config_->filter_strength;
  for (i = 0; i < NUM_MB_SEGMENTS; ++i) {
    VP8SegmentInfo* const m = &enc->dqm_[i];
    // We focus on the quantization of AC coeffs.
    const int qstep = kAcTable[clip(m->quant_, 0, 127)] >> 2;
    const int base_strength =
        VP8FilterStrengthFromDelta(enc->filter_hdr_.sharpness_, qstep);
    // Segments with lower complexity ('beta') will be less filtered.
    const int f = base_strength * level0 / (256 + m->beta_);
    m->fstrength_ = (f < FSTRENGTH_CUTOFF) ? 0 : (f > 63) ? 63 : f;
  }
  // We record the initial strength (mainly for the case of 1-segment only).
  enc->filter_hdr_.level_ = enc->dqm_[0].fstrength_;
  enc->filter_hdr_.simple_ = (enc->config_->filter_type == 0);
  enc->filter_hdr_.sharpness_ = enc->config_->filter_sharpness;
}
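
// E.g. with filter_strength == 50 (level0 == 250), a segment whose
// base_strength came out as 20 (a hypothetical value; the real one comes
// from VP8FilterStrengthFromDelta()) and whose beta_ is 0 would get
// f == 20 * 250 / 256 == 19, which is kept since it exceeds
// FSTRENGTH_CUTOFF.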

//------------------------------------------------------------------------------

// Note: if you change the values below, remember that the max range
// allowed by the syntax for DQ_UV is [-16,16].
#define MAX_DQ_UV (6)
#define MIN_DQ_UV (-4)

// We want to emulate jpeg-like behaviour where the expected "good" quality
// is around q=75. Internally, our "good" middle is around c=50. So we
// map accordingly using a linear piece-wise function.
static double QualityToCompression(double c) {
  const double linear_c = (c < 0.75) ? c * (2. / 3.) : 2. * c - 1.;
  // The file size roughly scales as pow(quantizer, 3.). Actually, the
  // exponent is somewhere between 2.8 and 3.2, but we're mostly interested
  // in the mid-quant range. So we scale the compressibility inversely to
  // this power-law: quant ~= compression ^ 1/3. This law holds well for
  // low quant. Finer modeling for high-quant would make use of kAcTable[]
  // more explicitly.
  const double v = pow(linear_c, 1 / 3.);
  return v;
}
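
// E.g. quality 75 maps to c == 0.5 (both linear pieces meet there), so the
// returned compressibility is pow(0.5, 1/3.) ~= 0.794; quality 100 maps to
// linear_c == 1.0 and hence to 1.0 (no compression penalty).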

static double QualityToJPEGCompression(double c, double alpha) {
  // We map the complexity 'alpha' and quality setting 'c' to a compression
  // exponent empirically matched to the compression curve of libjpeg6b.
  // On average, the WebP output size will be roughly similar to that of a
  // JPEG file compressed with the same quality factor.
  const double amin = 0.30;
  const double amax = 0.85;
  const double exp_min = 0.4;
  const double exp_max = 0.9;
  const double slope = (exp_min - exp_max) / (amax - amin);
  // Linearly interpolate 'expn' from exp_min to exp_max
  // in the [amin, amax] range.
  const double expn = (alpha > amax) ? exp_min
                    : (alpha < amin) ? exp_max
                    : exp_max + slope * (alpha - amin);
  const double v = pow(c, expn);
  return v;
}
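
// E.g. for a mid-complexity source with alpha == 0.5, the interpolated
// exponent is 0.9 + (0.4 - 0.9) / (0.85 - 0.30) * (0.5 - 0.30) ~= 0.72,
// so a quality-75 'c' of 0.5 yields pow(0.5, 0.72) ~= 0.61.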

static int SegmentsAreEquivalent(const VP8SegmentInfo* const S1,
                                 const VP8SegmentInfo* const S2) {
  return (S1->quant_ == S2->quant_) && (S1->fstrength_ == S2->fstrength_);
}

static void SimplifySegments(VP8Encoder* const enc) {
  int map[NUM_MB_SEGMENTS] = { 0, 1, 2, 3 };
  // 'num_segments_' is previously validated and <= NUM_MB_SEGMENTS, but an
  // explicit check is needed to avoid a spurious warning about 'i' exceeding
  // array bounds of 'dqm_' with some compilers (noticed with gcc-4.9).
  const int num_segments = (enc->segment_hdr_.num_segments_ < NUM_MB_SEGMENTS)
                               ? enc->segment_hdr_.num_segments_
                               : NUM_MB_SEGMENTS;
  int num_final_segments = 1;
  int s1, s2;
  for (s1 = 1; s1 < num_segments; ++s1) {    // find similar segments
    const VP8SegmentInfo* const S1 = &enc->dqm_[s1];
    int found = 0;
    // check if we already have a similar segment
    for (s2 = 0; s2 < num_final_segments; ++s2) {
      const VP8SegmentInfo* const S2 = &enc->dqm_[s2];
      if (SegmentsAreEquivalent(S1, S2)) {
        found = 1;
        break;
      }
    }
    map[s1] = s2;
    if (!found) {
      if (num_final_segments != s1) {
        enc->dqm_[num_final_segments] = enc->dqm_[s1];
      }
      ++num_final_segments;
    }
  }
  if (num_final_segments < num_segments) {  // Remap
    int i = enc->mb_w_ * enc->mb_h_;
    while (i-- > 0) enc->mb_info_[i].segment_ = map[enc->mb_info_[i].segment_];
    enc->segment_hdr_.num_segments_ = num_final_segments;
    // Replicate the trailing segment infos (it's mostly cosmetics)
    for (i = num_final_segments; i < num_segments; ++i) {
      enc->dqm_[i] = enc->dqm_[num_final_segments - 1];
    }
  }
}

void VP8SetSegmentParams(VP8Encoder* const enc, float quality) {
  int i;
  int dq_uv_ac, dq_uv_dc;
  const int num_segments = enc->segment_hdr_.num_segments_;
  const double amp = SNS_TO_DQ * enc->config_->sns_strength / 100. / 128.;
  const double Q = quality / 100.;
  const double c_base = enc->config_->emulate_jpeg_size ?
      QualityToJPEGCompression(Q, enc->alpha_ / 255.) :
      QualityToCompression(Q);
  for (i = 0; i < num_segments; ++i) {
    // We modulate the base coefficient to account for the quantization
    // susceptibility and allow denser segments to be quantized more.
    const double expn = 1. - amp * enc->dqm_[i].alpha_;
    const double c = pow(c_base, expn);
    const int q = (int)(127. * (1. - c));
    assert(expn > 0.);
    enc->dqm_[i].quant_ = clip(q, 0, 127);
  }

  // purely indicative in the bitstream (except for the 1-segment case)
  enc->base_quant_ = enc->dqm_[0].quant_;

  // fill-in values for the unused segments (required by the syntax)
  for (i = num_segments; i < NUM_MB_SEGMENTS; ++i) {
    enc->dqm_[i].quant_ = enc->base_quant_;
  }

  // uv_alpha_ is normally spread around ~60. The useful range is
  // typically ~30 (quite bad) to ~100 (ok to decimate UV more).
  // We map it to the safe maximal range of MAX/MIN_DQ_UV for dq_uv.
  dq_uv_ac = (enc->uv_alpha_ - MID_ALPHA) * (MAX_DQ_UV - MIN_DQ_UV)
                                          / (MAX_ALPHA - MIN_ALPHA);
  // we rescale by the user-defined strength of adaptation
  dq_uv_ac = dq_uv_ac * enc->config_->sns_strength / 100;
  // and make it safe.
  dq_uv_ac = clip(dq_uv_ac, MIN_DQ_UV, MAX_DQ_UV);
  // We also boost the dc-uv-quant a little, based on sns-strength, since
  // U/V channels are much more reactive to high quants (flat DC-blocks
  // tend to appear, and are unpleasant).
  dq_uv_dc = -4 * enc->config_->sns_strength / 100;
  dq_uv_dc = clip(dq_uv_dc, -15, 15);  // 4bit-signed max allowed

  enc->dq_y1_dc_ = 0;       // TODO(skal): dq-lum
  enc->dq_y2_dc_ = 0;
  enc->dq_y2_ac_ = 0;
  enc->dq_uv_dc_ = dq_uv_dc;
  enc->dq_uv_ac_ = dq_uv_ac;

  SetupFilterStrength(enc);   // finally, initialize segments' filtering

  if (num_segments > 1) SimplifySegments(enc);

  SetupMatrices(enc);  // finalize quantization matrices
}
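
// Worked example (illustrative, with emulate_jpeg_size off): quality == 75
// and sns_strength == 50 give amp == 0.9 * 50 / 100. / 128. ~= 0.0035 and
// c_base ~= 0.794 (see QualityToCompression() above). A segment with
// alpha_ == 30 then gets expn ~= 0.89, c ~= 0.81 and
// quant_ == (int)(127. * (1. - 0.81)) == 23, i.e. denser (higher-alpha)
// segments receive coarser quantizers.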

//------------------------------------------------------------------------------
// Form the predictions in cache

// Must be ordered using {DC_PRED, TM_PRED, V_PRED, H_PRED} as index
const uint16_t VP8I16ModeOffsets[4] = { I16DC16, I16TM16, I16VE16, I16HE16 };
const uint16_t VP8UVModeOffsets[4] = { C8DC8, C8TM8, C8VE8, C8HE8 };

// Must be indexed using {B_DC_PRED -> B_HU_PRED} as index
const uint16_t VP8I4ModeOffsets[NUM_BMODES] = {
  I4DC4, I4TM4, I4VE4, I4HE4, I4RD4, I4VR4, I4LD4, I4VL4, I4HD4, I4HU4
};

void VP8MakeLuma16Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->y_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->y_top_ : NULL;
  VP8EncPredLuma16(it->yuv_p_, left, top);
}

void VP8MakeChroma8Preds(const VP8EncIterator* const it) {
  const uint8_t* const left = it->x_ ? it->u_left_ : NULL;
  const uint8_t* const top = it->y_ ? it->uv_top_ : NULL;
  VP8EncPredChroma8(it->yuv_p_, left, top);
}

void VP8MakeIntra4Preds(const VP8EncIterator* const it) {
  VP8EncPredLuma4(it->yuv_p_, it->i4_top_);
}

//------------------------------------------------------------------------------
// Quantize

// Layout:
// +----+----+
// |YYYY|UUVV| 0
// |YYYY|UUVV| 4
// |YYYY|....| 8
// |YYYY|....| 12
// +----+----+

const uint16_t VP8Scan[16] = {  // Luma
  0 +  0 * BPS,  4 +  0 * BPS, 8 +  0 * BPS, 12 +  0 * BPS,
  0 +  4 * BPS,  4 +  4 * BPS, 8 +  4 * BPS, 12 +  4 * BPS,
  0 +  8 * BPS,  4 +  8 * BPS, 8 +  8 * BPS, 12 +  8 * BPS,
  0 + 12 * BPS,  4 + 12 * BPS, 8 + 12 * BPS, 12 + 12 * BPS,
};

static const uint16_t VP8ScanUV[4 + 4] = {
  0 + 0 * BPS,  4 + 0 * BPS, 0 + 4 * BPS,  4 + 4 * BPS,   // U
  8 + 0 * BPS, 12 + 0 * BPS, 8 + 4 * BPS, 12 + 4 * BPS    // V
};

//------------------------------------------------------------------------------
// Distortion measurement

static const uint16_t kWeightY[16] = {
  38, 32, 20, 9, 32, 28, 17, 7, 20, 17, 10, 4, 9, 7, 4, 2
};

static const uint16_t kWeightTrellis[16] = {
#if USE_TDISTO == 0
  16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16, 16
#else
  30, 27, 19, 11,
  27, 24, 17, 10,
  19, 17, 12,  8,
  11, 10,  8,  6
#endif
};

// Init/Copy the common fields in score.
static void InitScore(VP8ModeScore* const rd) {
  rd->D = 0;
  rd->SD = 0;
  rd->R = 0;
  rd->H = 0;
  rd->nz = 0;
  rd->score = MAX_COST;
}

static void CopyScore(VP8ModeScore* WEBP_RESTRICT const dst,
                      const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->D = src->D;
  dst->SD = src->SD;
  dst->R = src->R;
  dst->H = src->H;
  dst->nz = src->nz;      // note that nz is not accumulated, but just copied.
  dst->score = src->score;
}

static void AddScore(VP8ModeScore* WEBP_RESTRICT const dst,
                     const VP8ModeScore* WEBP_RESTRICT const src) {
  dst->D += src->D;
  dst->SD += src->SD;
  dst->R += src->R;
  dst->H += src->H;
  dst->nz |= src->nz;     // here, new nz bits are accumulated.
  dst->score += src->score;
}

//------------------------------------------------------------------------------
// Performs trellis-optimized quantization.

// -- GODOT start --
// Prevents Visual Studio debugger from using this Node struct in place of the
// Godot Node class.
#define Node Node_libwebp_quant
// -- GODOT end --

// Trellis node
typedef struct {
  int8_t prev;    // best previous node
  int8_t sign;    // sign of coeff_i
  int16_t level;  // level
} Node;

// Score state
typedef struct {
  score_t score;          // partial RD score
  const uint16_t* costs;  // shortcut to cost tables
} ScoreState;

// If a coefficient was quantized to a value Q (using a neutral bias),
// we test all alternate possibilities between [Q-MIN_DELTA, Q+MAX_DELTA].
// We don't test negative values though.
#define MIN_DELTA 0   // how much lower level to try
#define MAX_DELTA 1   // how much higher
#define NUM_NODES (MIN_DELTA + 1 + MAX_DELTA)
#define NODE(n, l) (nodes[(n)][(l) + MIN_DELTA])
#define SCORE_STATE(n, l) (score_states[n][(l) + MIN_DELTA])

static WEBP_INLINE void SetRDScore(int lambda, VP8ModeScore* const rd) {
  rd->score = (rd->R + rd->H) * lambda + RD_DISTO_MULT * (rd->D + rd->SD);
}

static WEBP_INLINE score_t RDScoreTrellis(int lambda, score_t rate,
                                          score_t distortion) {
  return rate * lambda + RD_DISTO_MULT * distortion;
}
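
// Both scores combine rate and distortion on the same scale: e.g. with
// lambda == 100, spending 50 extra bit-cost units must save at least
// 50 * 100 / RD_DISTO_MULT ~= 20 distortion units to pay for itself.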

// Coefficient type.
enum { TYPE_I16_AC = 0, TYPE_I16_DC = 1, TYPE_CHROMA_A = 2, TYPE_I4_AC = 3 };

static int TrellisQuantizeBlock(const VP8Encoder* WEBP_RESTRICT const enc,
                                int16_t in[16], int16_t out[16],
                                int ctx0, int coeff_type,
                                const VP8Matrix* WEBP_RESTRICT const mtx,
                                int lambda) {
  const ProbaArray* const probas = enc->proba_.coeffs_[coeff_type];
  CostArrayPtr const costs =
      (CostArrayPtr)enc->proba_.remapped_costs_[coeff_type];
  const int first = (coeff_type == TYPE_I16_AC) ? 1 : 0;
  Node nodes[16][NUM_NODES];
  ScoreState score_states[2][NUM_NODES];
  ScoreState* ss_cur = &SCORE_STATE(0, MIN_DELTA);
  ScoreState* ss_prev = &SCORE_STATE(1, MIN_DELTA);
  int best_path[3] = {-1, -1, -1};   // store best-last/best-level/best-previous
  score_t best_score;
  int n, m, p, last;

  {
    score_t cost;
    const int thresh = mtx->q_[1] * mtx->q_[1] / 4;
    const int last_proba = probas[VP8EncBands[first]][ctx0][0];

    // compute the position of the last interesting coefficient
    last = first - 1;
    for (n = 15; n >= first; --n) {
      const int j = kZigzag[n];
      const int err = in[j] * in[j];
      if (err > thresh) {
        last = n;
        break;
      }
    }
    // we don't need to go inspect up to n = 16 coeffs. We can just go up
    // to last + 1 (inclusive) without losing much.
    if (last < 15) ++last;

    // compute 'skip' score. This is the max score one can do.
    cost = VP8BitCost(0, last_proba);
    best_score = RDScoreTrellis(lambda, cost, 0);

    // initialize source node.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      const score_t rate = (ctx0 == 0) ? VP8BitCost(1, last_proba) : 0;
      ss_cur[m].score = RDScoreTrellis(lambda, rate, 0);
      ss_cur[m].costs = costs[first][ctx0];
    }
  }

  // traverse trellis.
  for (n = first; n <= last; ++n) {
    const int j = kZigzag[n];
    const uint32_t Q = mtx->q_[j];
    const uint32_t iQ = mtx->iq_[j];
    const uint32_t B = BIAS(0x00);     // neutral bias
    // note: it's important to take the sign of the _original_ coeff,
    // so we don't have to consider level < 0 afterward.
    const int sign = (in[j] < 0);
    const uint32_t coeff0 = (sign ? -in[j] : in[j]) + mtx->sharpen_[j];
    int level0 = QUANTDIV(coeff0, iQ, B);
    int thresh_level = QUANTDIV(coeff0, iQ, BIAS(0x80));
    if (thresh_level > MAX_LEVEL) thresh_level = MAX_LEVEL;
    if (level0 > MAX_LEVEL) level0 = MAX_LEVEL;

    {   // Swap current and previous score states
      ScoreState* const tmp = ss_cur;
      ss_cur = ss_prev;
      ss_prev = tmp;
    }

    // test all alternate level values around level0.
    for (m = -MIN_DELTA; m <= MAX_DELTA; ++m) {
      Node* const cur = &NODE(n, m);
      const int level = level0 + m;
      const int ctx = (level > 2) ? 2 : level;
      const int band = VP8EncBands[n + 1];
      score_t base_score;
      score_t best_cur_score;
      int best_prev;
      score_t cost, score;

      ss_cur[m].costs = costs[n + 1][ctx];
      if (level < 0 || level > thresh_level) {
        ss_cur[m].score = MAX_COST;
        // Node is dead.
        continue;
      }

      {
        // Compute delta_error = how much coding this level will
        // subtract from max_error as distortion.
        // Here, distortion = sum of (|coeff_i| - level_i * Q_i)^2
        const int new_error = coeff0 - level * Q;
        const int delta_error =
            kWeightTrellis[j] * (new_error * new_error - coeff0 * coeff0);
        base_score = RDScoreTrellis(lambda, 0, delta_error);
      }

      // Inspect all possible non-dead predecessors. Retain only the best one.
      // base_score is common to all predecessors, so it is only added once,
      // to the final value after the loop.
      cost = VP8LevelCost(ss_prev[-MIN_DELTA].costs, level);
      best_cur_score =
          ss_prev[-MIN_DELTA].score + RDScoreTrellis(lambda, cost, 0);
      best_prev = -MIN_DELTA;
      for (p = -MIN_DELTA + 1; p <= MAX_DELTA; ++p) {
        // Dead nodes (with ss_prev[p].score >= MAX_COST) are automatically
        // eliminated since their score can't be better than the current best.
        cost = VP8LevelCost(ss_prev[p].costs, level);
        // Examine node assuming it's a non-terminal one.
        score = ss_prev[p].score + RDScoreTrellis(lambda, cost, 0);
        if (score < best_cur_score) {
          best_cur_score = score;
          best_prev = p;
        }
      }
      best_cur_score += base_score;
      // Store best finding in current node.
      cur->sign = sign;
      cur->level = level;
      cur->prev = best_prev;
      ss_cur[m].score = best_cur_score;

      // Now, record best terminal node (and thus best entry in the graph).
      if (level != 0 && best_cur_score < best_score) {
        const score_t last_pos_cost =
            (n < 15) ? VP8BitCost(0, probas[band][ctx][0]) : 0;
        const score_t last_pos_score = RDScoreTrellis(lambda, last_pos_cost, 0);
        score = best_cur_score + last_pos_score;
        if (score < best_score) {
          best_score = score;
          best_path[0] = n;           // best eob position
          best_path[1] = m;           // best node index
          best_path[2] = best_prev;   // best predecessor
        }
      }
    }
  }

  // Fresh start
  // Beware! We must preserve in[0]/out[0] value for TYPE_I16_AC case.
  if (coeff_type == TYPE_I16_AC) {
    memset(in + 1, 0, 15 * sizeof(*in));
    memset(out + 1, 0, 15 * sizeof(*out));
  } else {
    memset(in, 0, 16 * sizeof(*in));
    memset(out, 0, 16 * sizeof(*out));
  }
  if (best_path[0] == -1) {
    return 0;   // skip!
  }

  {
    // Unwind the best path.
    // Note: best-prev on terminal node is not necessarily equal to the
    // best_prev for non-terminal. So we patch best_path[2] in.
    int nz = 0;
    int best_node = best_path[1];
    n = best_path[0];
    NODE(n, best_node).prev = best_path[2];   // force best-prev for terminal

    for (; n >= first; --n) {
      const Node* const node = &NODE(n, best_node);
      const int j = kZigzag[n];
      out[n] = node->sign ? -node->level : node->level;
      nz |= node->level;
      in[j] = out[n] * mtx->q_[j];
      best_node = node->prev;
    }
    return (nz != 0);
  }
}
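
// Note on complexity: the trellis explores at most 16 positions x NUM_NODES
// candidate levels, with NUM_NODES predecessors each, i.e. on the order of
// 16 * 2 * 2 == 64 level evaluations per block with the current
// MIN_DELTA/MAX_DELTA settings -- cheap compared to an exhaustive search.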

#undef NODE

//------------------------------------------------------------------------------
// Performs: difference, transform, quantize, back-transform, add
// all at once. Output is the reconstructed block in *yuv_out, and the
// quantized levels in *levels.

static int ReconstructIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                              VP8ModeScore* WEBP_RESTRICT const rd,
                              uint8_t* WEBP_RESTRICT const yuv_out,
                              int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[16][16], dc_tmp[16];

  for (n = 0; n < 16; n += 2) {
    VP8FTransform2(src + VP8Scan[n], ref + VP8Scan[n], tmp[n]);
  }
  VP8FTransformWHT(tmp[0], dc_tmp);
  nz |= VP8EncQuantizeBlockWHT(dc_tmp, rd->y_dc_levels, &dqm->y2_) << 24;

  if (DO_TRELLIS_I16 && it->do_trellis_) {
    int x, y;
    VP8IteratorNzToBytes(it);
    for (y = 0, n = 0; y < 4; ++y) {
      for (x = 0; x < 4; ++x, ++n) {
        const int ctx = it->top_nz_[x] + it->left_nz_[y];
        const int non_zero = TrellisQuantizeBlock(
            enc, tmp[n], rd->y_ac_levels[n], ctx, TYPE_I16_AC, &dqm->y1_,
            dqm->lambda_trellis_i16_);
        it->top_nz_[x] = it->left_nz_[y] = non_zero;
        rd->y_ac_levels[n][0] = 0;
        nz |= non_zero << n;
      }
    }
  } else {
    for (n = 0; n < 16; n += 2) {
      // Zero-out the first coeff, so that: a) nz is correct below, and
      // b) finding 'last' non-zero coeffs in SetResidualCoeffs() is simplified.
      tmp[n][0] = tmp[n + 1][0] = 0;
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->y_ac_levels[n], &dqm->y1_) << n;
      assert(rd->y_ac_levels[n + 0][0] == 0);
      assert(rd->y_ac_levels[n + 1][0] == 0);
    }
  }

  // Transform back
  VP8TransformWHT(dc_tmp, tmp[0]);
  for (n = 0; n < 16; n += 2) {
    VP8ITransform(ref + VP8Scan[n], tmp[n], yuv_out + VP8Scan[n], 1);
  }

  return nz;
}

static int ReconstructIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                             int16_t levels[16],
                             const uint8_t* WEBP_RESTRICT const src,
                             uint8_t* WEBP_RESTRICT const yuv_out,
                             int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int16_t tmp[16];

  VP8FTransform(src, ref, tmp);
  if (DO_TRELLIS_I4 && it->do_trellis_) {
    const int x = it->i4_ & 3, y = it->i4_ >> 2;
    const int ctx = it->top_nz_[x] + it->left_nz_[y];
    nz = TrellisQuantizeBlock(enc, tmp, levels, ctx, TYPE_I4_AC, &dqm->y1_,
                              dqm->lambda_trellis_i4_);
  } else {
    nz = VP8EncQuantizeBlock(tmp, levels, &dqm->y1_);
  }
  VP8ITransform(ref, tmp, yuv_out, 0);
  return nz;
}

//------------------------------------------------------------------------------
// DC-error diffusion

// Diffusion weights. We under-correct a bit (15/16th of the error is actually
// diffused) to avoid 'rainbow' chessboard pattern of blocks at q~=0.
#define C1 7    // fraction of error sent to the 4x4 block below
#define C2 8    // fraction of error sent to the 4x4 block on the right
#define DSHIFT 4
#define DSCALE 1   // storage descaling, needed to make the error fit int8_t
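
// With C1 == 7, C2 == 8 and DSHIFT == 4, a DC error passes (7 + 8) / 16 ==
// 15/16th of its magnitude to the neighboring blocks: e.g. an error of 32
// contributes 7 * 32 >> 4 == 14 to the block below and 8 * 32 >> 4 == 16 to
// the block on the right (before the DSCALE storage descaling).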

// Quantize as usual, but also compute and return the quantization error.
// Error is already divided by DSHIFT.
static int QuantizeSingle(int16_t* WEBP_RESTRICT const v,
                          const VP8Matrix* WEBP_RESTRICT const mtx) {
  int V = *v;
  const int sign = (V < 0);
  if (sign) V = -V;
  if (V > (int)mtx->zthresh_[0]) {
    const int qV = QUANTDIV(V, mtx->iq_[0], mtx->bias_[0]) * mtx->q_[0];
    const int err = (V - qV);
    *v = sign ? -qV : qV;
    return (sign ? -err : err) >> DSCALE;
  }
  *v = 0;
  return (sign ? -V : V) >> DSCALE;
}

static void CorrectDCValues(const VP8EncIterator* WEBP_RESTRICT const it,
                            const VP8Matrix* WEBP_RESTRICT const mtx,
                            int16_t tmp[][16],
                            VP8ModeScore* WEBP_RESTRICT const rd) {
  //         | top[0] | top[1]
  // --------+--------+---------
  // left[0] | tmp[0]   tmp[1]  <->  err0 err1
  // left[1] | tmp[2]   tmp[3]       err2 err3
  //
  // Final errors {err1,err2,err3} are preserved and later restored
  // as top[]/left[] on the next block.
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    const int8_t* const top = it->top_derr_[it->x_][ch];
    const int8_t* const left = it->left_derr_[ch];
    int16_t (* const c)[16] = &tmp[ch * 4];
    int err0, err1, err2, err3;
    c[0][0] += (C1 * top[0] + C2 * left[0]) >> (DSHIFT - DSCALE);
    err0 = QuantizeSingle(&c[0][0], mtx);
    c[1][0] += (C1 * top[1] + C2 * err0) >> (DSHIFT - DSCALE);
    err1 = QuantizeSingle(&c[1][0], mtx);
    c[2][0] += (C1 * err0 + C2 * left[1]) >> (DSHIFT - DSCALE);
    err2 = QuantizeSingle(&c[2][0], mtx);
    c[3][0] += (C1 * err1 + C2 * err2) >> (DSHIFT - DSCALE);
    err3 = QuantizeSingle(&c[3][0], mtx);
    // error 'err' is bounded by mtx->q_[0] which is 132 at max. Hence
    // err >> DSCALE will fit in an int8_t type if DSCALE >= 1.
    assert(abs(err1) <= 127 && abs(err2) <= 127 && abs(err3) <= 127);
    rd->derr[ch][0] = (int8_t)err1;
    rd->derr[ch][1] = (int8_t)err2;
    rd->derr[ch][2] = (int8_t)err3;
  }
}

static void StoreDiffusionErrors(VP8EncIterator* WEBP_RESTRICT const it,
                                 const VP8ModeScore* WEBP_RESTRICT const rd) {
  int ch;
  for (ch = 0; ch <= 1; ++ch) {
    int8_t* const top = it->top_derr_[it->x_][ch];
    int8_t* const left = it->left_derr_[ch];
    left[0] = rd->derr[ch][0];            // restore err1
    left[1] = 3 * rd->derr[ch][2] >> 2;   //     ... 3/4th of err3
    top[0] = rd->derr[ch][1];             //     ... err2
    top[1] = rd->derr[ch][2] - left[1];   //     ... 1/4th of err3.
  }
}
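
// Note that the err3 split is lossless as far as the stored sum goes:
// left[1] + top[1] == (3 * err3 >> 2) + (err3 - (3 * err3 >> 2)) == err3,
// so the full error is redistributed between the two neighbors.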

#undef C1
#undef C2
#undef DSHIFT
#undef DSCALE

//------------------------------------------------------------------------------

static int ReconstructUV(VP8EncIterator* WEBP_RESTRICT const it,
                         VP8ModeScore* WEBP_RESTRICT const rd,
                         uint8_t* WEBP_RESTRICT const yuv_out, int mode) {
  const VP8Encoder* const enc = it->enc_;
  const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  int nz = 0;
  int n;
  int16_t tmp[8][16];

  for (n = 0; n < 8; n += 2) {
    VP8FTransform2(src + VP8ScanUV[n], ref + VP8ScanUV[n], tmp[n]);
  }
  if (it->top_derr_ != NULL) CorrectDCValues(it, &dqm->uv_, tmp, rd);

  if (DO_TRELLIS_UV && it->do_trellis_) {
    int ch, x, y;
    for (ch = 0, n = 0; ch <= 2; ch += 2) {
      for (y = 0; y < 2; ++y) {
        for (x = 0; x < 2; ++x, ++n) {
          const int ctx = it->top_nz_[4 + ch + x] + it->left_nz_[4 + ch + y];
          const int non_zero = TrellisQuantizeBlock(
              enc, tmp[n], rd->uv_levels[n], ctx, TYPE_CHROMA_A, &dqm->uv_,
              dqm->lambda_trellis_uv_);
          it->top_nz_[4 + ch + x] = it->left_nz_[4 + ch + y] = non_zero;
          nz |= non_zero << n;
        }
      }
    }
  } else {
    for (n = 0; n < 8; n += 2) {
      nz |= VP8EncQuantize2Blocks(tmp[n], rd->uv_levels[n], &dqm->uv_) << n;
    }
  }

  for (n = 0; n < 8; n += 2) {
    VP8ITransform(ref + VP8ScanUV[n], tmp[n], yuv_out + VP8ScanUV[n], 1);
  }
  return (nz << 16);
}

//------------------------------------------------------------------------------
// RD-opt decision. Reconstruct each mode, evaluate distortion and bit-cost.
// Pick the mode with the lowest RD-cost = Rate + lambda * Distortion.

static void StoreMaxDelta(VP8SegmentInfo* const dqm, const int16_t DCs[16]) {
  // We look at the first three AC coefficients to determine the average
  // delta between each sub-4x4 block.
  const int v0 = abs(DCs[1]);
  const int v1 = abs(DCs[2]);
  const int v2 = abs(DCs[4]);
  int max_v = (v1 > v0) ? v1 : v0;
  max_v = (v2 > max_v) ? v2 : max_v;
  if (max_v > dqm->max_edge_) dqm->max_edge_ = max_v;
}

static void SwapModeScore(VP8ModeScore** a, VP8ModeScore** b) {
  VP8ModeScore* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapPtr(uint8_t** a, uint8_t** b) {
  uint8_t* const tmp = *a;
  *a = *b;
  *b = tmp;
}

static void SwapOut(VP8EncIterator* const it) {
  SwapPtr(&it->yuv_out_, &it->yuv_out2_);
}

static void PickBestIntra16(VP8EncIterator* WEBP_RESTRICT const it,
                            VP8ModeScore* WEBP_RESTRICT rd) {
  const int kNumBlocks = 16;
  VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i16_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
  VP8ModeScore rd_tmp;
  VP8ModeScore* rd_cur = &rd_tmp;
  VP8ModeScore* rd_best = rd;
  int mode;
  int is_flat = IsFlatSource16(it->yuv_in_ + Y_OFF_ENC);

  rd->mode_i16 = -1;
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC;  // scratch buffer
    rd_cur->mode_i16 = mode;

    // Reconstruct
    rd_cur->nz = ReconstructIntra16(it, rd_cur, tmp_dst, mode);

    // Measure RD-score
    rd_cur->D = VP8SSE16x16(src, tmp_dst);
    rd_cur->SD =
        tlambda ? MULT_8B(tlambda, VP8TDisto16x16(src, tmp_dst, kWeightY)) : 0;
    rd_cur->H = VP8FixedCostsI16[mode];
    rd_cur->R = VP8GetCostLuma16(it, rd_cur);
    if (is_flat) {
      // refine the first impression (which was in pixel space)
      is_flat = IsFlat(rd_cur->y_ac_levels[0], kNumBlocks, FLATNESS_LIMIT_I16);
      if (is_flat) {
        // Block is very flat. We put emphasis on the distortion being very low!
        rd_cur->D *= 2;
        rd_cur->SD *= 2;
      }
    }

    // Since we always examine Intra16 first, we can overwrite *rd directly.
    SetRDScore(lambda, rd_cur);
    if (mode == 0 || rd_cur->score < rd_best->score) {
      SwapModeScore(&rd_cur, &rd_best);
      SwapOut(it);
    }
  }
  if (rd_best != rd) {
    memcpy(rd, rd_best, sizeof(*rd));
  }
  SetRDScore(dqm->lambda_mode_, rd);   // finalize score for mode decision.
  VP8SetIntra16Mode(it, rd->mode_i16);

  // If we have a blocky macroblock (only DCs are non-zero) with fairly high
  // distortion, record the max delta so we can later adjust the minimal
  // filtering strength needed to smooth these blocks out.
  if ((rd->nz & 0x100ffff) == 0x1000000 && rd->D > dqm->min_disto_) {
    StoreMaxDelta(dqm, rd->y_dc_levels);
  }
}

//------------------------------------------------------------------------------

// return the cost array corresponding to the surrounding prediction modes.
static const uint16_t* GetCostModeI4(VP8EncIterator* WEBP_RESTRICT const it,
                                     const uint8_t modes[16]) {
  const int preds_w = it->enc_->preds_w_;
  const int x = (it->i4_ & 3), y = it->i4_ >> 2;
  const int left = (x == 0) ? it->preds_[y * preds_w - 1] : modes[it->i4_ - 1];
  const int top = (y == 0) ? it->preds_[-preds_w + x] : modes[it->i4_ - 4];
  return VP8FixedCostsI4[top][left];
}

static int PickBestIntra4(VP8EncIterator* WEBP_RESTRICT const it,
                          VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  const VP8SegmentInfo* const dqm = &enc->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_i4_;
  const int tlambda = dqm->tlambda_;
  const uint8_t* const src0 = it->yuv_in_ + Y_OFF_ENC;
  uint8_t* const best_blocks = it->yuv_out2_ + Y_OFF_ENC;
  int total_header_bits = 0;
  VP8ModeScore rd_best;

  if (enc->max_i4_header_bits_ == 0) {
    return 0;
  }

  InitScore(&rd_best);
  rd_best.H = 211;  // '211' is the value of VP8BitCost(0, 145)
  SetRDScore(dqm->lambda_mode_, &rd_best);
  VP8IteratorStartI4(it);
  do {
    const int kNumBlocks = 1;
    VP8ModeScore rd_i4;
    int mode;
    int best_mode = -1;
    const uint8_t* const src = src0 + VP8Scan[it->i4_];
    const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);
    uint8_t* best_block = best_blocks + VP8Scan[it->i4_];
    uint8_t* tmp_dst = it->yuv_p_ + I4TMP;    // scratch buffer.

    InitScore(&rd_i4);
    VP8MakeIntra4Preds(it);
    for (mode = 0; mode < NUM_BMODES; ++mode) {
      VP8ModeScore rd_tmp;
      int16_t tmp_levels[16];

      // Reconstruct
      rd_tmp.nz =
          ReconstructIntra4(it, tmp_levels, src, tmp_dst, mode) << it->i4_;

      // Compute RD-score
      rd_tmp.D = VP8SSE4x4(src, tmp_dst);
      rd_tmp.SD =
          tlambda ? MULT_8B(tlambda, VP8TDisto4x4(src, tmp_dst, kWeightY))
                  : 0;
      rd_tmp.H = mode_costs[mode];

      // Add a flatness penalty, to avoid a flat area being mispredicted
      // by a complex mode.
      if (mode > 0 && IsFlat(tmp_levels, kNumBlocks, FLATNESS_LIMIT_I4)) {
        rd_tmp.R = FLATNESS_PENALTY * kNumBlocks;
      } else {
        rd_tmp.R = 0;
      }

      // early-out check
      SetRDScore(lambda, &rd_tmp);
      if (best_mode >= 0 && rd_tmp.score >= rd_i4.score) continue;

      // finish computing score
      rd_tmp.R += VP8GetCostLuma4(it, tmp_levels);
      SetRDScore(lambda, &rd_tmp);

      if (best_mode < 0 || rd_tmp.score < rd_i4.score) {
        CopyScore(&rd_i4, &rd_tmp);
        best_mode = mode;
        SwapPtr(&tmp_dst, &best_block);
        memcpy(rd_best.y_ac_levels[it->i4_], tmp_levels,
               sizeof(rd_best.y_ac_levels[it->i4_]));
      }
    }
    SetRDScore(dqm->lambda_mode_, &rd_i4);
    AddScore(&rd_best, &rd_i4);
    if (rd_best.score >= rd->score) {
      return 0;
    }
    total_header_bits += (int)rd_i4.H;   // <- equal to mode_costs[best_mode];
    if (total_header_bits > enc->max_i4_header_bits_) {
      return 0;
    }
    // Copy selected samples if not in the right place already.
    if (best_block != best_blocks + VP8Scan[it->i4_]) {
      VP8Copy4x4(best_block, best_blocks + VP8Scan[it->i4_]);
    }
    rd->modes_i4[it->i4_] = best_mode;
    it->top_nz_[it->i4_ & 3] = it->left_nz_[it->i4_ >> 2] = (rd_i4.nz ? 1 : 0);
  } while (VP8IteratorRotateI4(it, best_blocks));

  // finalize state
  CopyScore(rd, &rd_best);
  VP8SetIntra4Mode(it, rd->modes_i4);
  SwapOut(it);
  memcpy(rd->y_ac_levels, rd_best.y_ac_levels, sizeof(rd->y_ac_levels));
  return 1;   // select intra4x4 over intra16x16
}

//------------------------------------------------------------------------------

static void PickBestUV(VP8EncIterator* WEBP_RESTRICT const it,
                       VP8ModeScore* WEBP_RESTRICT const rd) {
  const int kNumBlocks = 8;
  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  const int lambda = dqm->lambda_uv_;
  const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
  uint8_t* tmp_dst = it->yuv_out2_ + U_OFF_ENC;  // scratch buffer
  uint8_t* dst0 = it->yuv_out_ + U_OFF_ENC;
  uint8_t* dst = dst0;
  VP8ModeScore rd_best;
  int mode;

  rd->mode_uv = -1;
  InitScore(&rd_best);
  for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
    VP8ModeScore rd_uv;

    // Reconstruct
    rd_uv.nz = ReconstructUV(it, &rd_uv, tmp_dst, mode);

    // Compute RD-score
    rd_uv.D = VP8SSE16x8(src, tmp_dst);
    rd_uv.SD = 0;    // not calling TDisto here: it tends to flatten areas.
    rd_uv.H = VP8FixedCostsUV[mode];
    rd_uv.R = VP8GetCostUV(it, &rd_uv);
    if (mode > 0 && IsFlat(rd_uv.uv_levels[0], kNumBlocks, FLATNESS_LIMIT_UV)) {
      rd_uv.R += FLATNESS_PENALTY * kNumBlocks;
    }

    SetRDScore(lambda, &rd_uv);
    if (mode == 0 || rd_uv.score < rd_best.score) {
      CopyScore(&rd_best, &rd_uv);
      rd->mode_uv = mode;
      memcpy(rd->uv_levels, rd_uv.uv_levels, sizeof(rd->uv_levels));
      if (it->top_derr_ != NULL) {
        memcpy(rd->derr, rd_uv.derr, sizeof(rd_uv.derr));
      }
      SwapPtr(&dst, &tmp_dst);
    }
  }
  VP8SetIntraUVMode(it, rd->mode_uv);
  AddScore(rd, &rd_best);
  if (dst != dst0) {   // copy 16x8 block if needed
    VP8Copy16x8(dst, dst0);
  }
  if (it->top_derr_ != NULL) {  // store diffusion errors for next block
    StoreDiffusionErrors(it, rd);
  }
}

//------------------------------------------------------------------------------
// Final reconstruction and quantization.

static void SimpleQuantize(VP8EncIterator* WEBP_RESTRICT const it,
                           VP8ModeScore* WEBP_RESTRICT const rd) {
  const VP8Encoder* const enc = it->enc_;
  const int is_i16 = (it->mb_->type_ == 1);
  int nz = 0;

  if (is_i16) {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  } else {
    VP8IteratorStartI4(it);
    do {
      const int mode =
          it->preds_[(it->i4_ & 3) + (it->i4_ >> 2) * enc->preds_w_];
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      uint8_t* const dst = it->yuv_out_ + Y_OFF_ENC + VP8Scan[it->i4_];
      VP8MakeIntra4Preds(it);
      nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                              src, dst, mode) << it->i4_;
    } while (VP8IteratorRotateI4(it, it->yuv_out_ + Y_OFF_ENC));
  }

  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);
  rd->nz = nz;
}

// Refine intra16/intra4 sub-modes based on distortion only (not rate).
static void RefineUsingDistortion(VP8EncIterator* WEBP_RESTRICT const it,
                                  int try_both_modes, int refine_uv_mode,
                                  VP8ModeScore* WEBP_RESTRICT const rd) {
  score_t best_score = MAX_COST;
  int nz = 0;
  int mode;
  int is_i16 = try_both_modes || (it->mb_->type_ == 1);

  const VP8SegmentInfo* const dqm = &it->enc_->dqm_[it->mb_->segment_];
  // Some empirical constants, of roughly the right order of magnitude.
  const int lambda_d_i16 = 106;
  const int lambda_d_i4 = 11;
  const int lambda_d_uv = 120;
  score_t score_i4 = dqm->i4_penalty_;
  score_t i4_bit_sum = 0;
  const score_t bit_limit = try_both_modes ? it->enc_->mb_header_limit_
                                           : MAX_COST;  // no early-out allowed

  if (is_i16) {   // First, evaluate Intra16 distortion
    int best_mode = -1;
    const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8I16ModeOffsets[mode];
      const score_t score = (score_t)VP8SSE16x16(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsI16[mode] * lambda_d_i16;
      if (mode > 0 && VP8FixedCostsI16[mode] > bit_limit) {
        continue;
      }

      if (score < best_score) {
        best_mode = mode;
        best_score = score;
      }
    }
    if (it->x_ == 0 || it->y_ == 0) {
      // avoid starting a checkerboard resonance from the border. See bug #432.
      if (IsFlatSource16(src)) {
        best_mode = (it->x_ == 0) ? 0 : 2;
        try_both_modes = 0;  // stick to i16
      }
    }
    VP8SetIntra16Mode(it, best_mode);
    // we'll reconstruct later, if i16 mode actually gets selected
  }

  // Next, evaluate Intra4
  if (try_both_modes || !is_i16) {
    // We don't evaluate the rate here, but just account for it through a
    // constant penalty (i4 mode usually needs more bits compared to i16).
    is_i16 = 0;
    VP8IteratorStartI4(it);
    do {
      int best_i4_mode = -1;
      score_t best_i4_score = MAX_COST;
      const uint8_t* const src = it->yuv_in_ + Y_OFF_ENC + VP8Scan[it->i4_];
      const uint16_t* const mode_costs = GetCostModeI4(it, rd->modes_i4);

      VP8MakeIntra4Preds(it);
      for (mode = 0; mode < NUM_BMODES; ++mode) {
        const uint8_t* const ref = it->yuv_p_ + VP8I4ModeOffsets[mode];
        const score_t score = VP8SSE4x4(src, ref) * RD_DISTO_MULT
                            + mode_costs[mode] * lambda_d_i4;
        if (score < best_i4_score) {
          best_i4_mode = mode;
          best_i4_score = score;
        }
      }
      i4_bit_sum += mode_costs[best_i4_mode];
      rd->modes_i4[it->i4_] = best_i4_mode;
      score_i4 += best_i4_score;
      if (score_i4 >= best_score || i4_bit_sum > bit_limit) {
        // Intra4 won't be better than Intra16. Bail out and pick Intra16.
        is_i16 = 1;
        break;
      } else {  // reconstruct partial block inside yuv_out2_ buffer
        uint8_t* const tmp_dst = it->yuv_out2_ + Y_OFF_ENC + VP8Scan[it->i4_];
        nz |= ReconstructIntra4(it, rd->y_ac_levels[it->i4_],
                                src, tmp_dst, best_i4_mode) << it->i4_;
      }
    } while (VP8IteratorRotateI4(it, it->yuv_out2_ + Y_OFF_ENC));
  }

  // Final reconstruction, depending on which mode is selected.
  if (!is_i16) {
    VP8SetIntra4Mode(it, rd->modes_i4);
    SwapOut(it);
    best_score = score_i4;
  } else {
    nz = ReconstructIntra16(it, rd, it->yuv_out_ + Y_OFF_ENC, it->preds_[0]);
  }

  // ... and UV!
  if (refine_uv_mode) {
    int best_mode = -1;
    score_t best_uv_score = MAX_COST;
    const uint8_t* const src = it->yuv_in_ + U_OFF_ENC;
    for (mode = 0; mode < NUM_PRED_MODES; ++mode) {
      const uint8_t* const ref = it->yuv_p_ + VP8UVModeOffsets[mode];
      const score_t score = VP8SSE16x8(src, ref) * RD_DISTO_MULT
                          + VP8FixedCostsUV[mode] * lambda_d_uv;
      if (score < best_uv_score) {
        best_mode = mode;
        best_uv_score = score;
      }
    }
    VP8SetIntraUVMode(it, best_mode);
  }
  nz |= ReconstructUV(it, rd, it->yuv_out_ + U_OFF_ENC, it->mb_->uv_mode_);

  rd->nz = nz;
  rd->score = best_score;
}

//------------------------------------------------------------------------------
// Entry point

int VP8Decimate(VP8EncIterator* WEBP_RESTRICT const it,
                VP8ModeScore* WEBP_RESTRICT const rd,
                VP8RDLevel rd_opt) {
  int is_skipped;
  const int method = it->enc_->method_;

  InitScore(rd);

  // We can perform predictions for Luma16x16 and Chroma8x8 already.
  // Luma4x4 predictions need to be done as-we-go.
  VP8MakeLuma16Preds(it);
  VP8MakeChroma8Preds(it);

  if (rd_opt > RD_OPT_NONE) {
    it->do_trellis_ = (rd_opt >= RD_OPT_TRELLIS_ALL);
    PickBestIntra16(it, rd);
    if (method >= 2) {
      PickBestIntra4(it, rd);
    }
    PickBestUV(it, rd);
    if (rd_opt == RD_OPT_TRELLIS) {   // finish off with trellis-optim now
      it->do_trellis_ = 1;
      SimpleQuantize(it, rd);
    }
  } else {
    // At this point we have heuristically decided intra16 / intra4.
    // For method >= 2, pick the best intra4/intra16 based on SSE (a tad
    // slower). For method <= 1, we don't re-examine the decision but just go
    // ahead with quantization/reconstruction.
    RefineUsingDistortion(it, (method >= 2), (method >= 1), rd);
  }
  is_skipped = (rd->nz == 0);
  VP8SetSkip(it, is_skipped);
  return is_skipped;
}