literal_cost.c source code [ClickHouse/contrib/brotli/c/enc/literal_cost.c]

1	/ Copyright 2013 Google Inc. All Rights Reserved.*
2
3	Distributed under MIT license.
4	See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5	*/
6
7	/ Literal cost model to allow backward reference replacement to be efficient.*
8	*/
9
10	#include "./literal_cost.h"
11
12	#include "../common/platform.h"
13	#include <brotli/types.h>
14	#include "./fast_log.h"
15	#include "./utf8_util.h"
16
17	#if defined(__cplusplus) \|\| defined(c_plusplus)
18	extern "C" {
19	#endif
20
21	static size_t UTF8Position(size_t last, size_t c, size_t clamp) {
22	if (c < `128`) {
23	return `0`; / Next one is the 'Byte 1' again. /
24	} else if (c >= `192`) { / Next one is the 'Byte 2' of utf-8 encoding. /
25	return BROTLI_MIN(size_t, `1`, clamp);
26	} else {
27	/ Let's decide over the last byte if this ends the sequence. /
28	if (last < `0xE0`) {
29	return `0`; / Completed two or three byte coding. /
30	} else { / Next one is the 'Byte 3' of utf-8 encoding. /
31	return BROTLI_MIN(size_t, `2`, clamp);
32	}
33	}
34	}
35
36	static size_t DecideMultiByteStatsLevel(size_t pos, size_t len, size_t mask,
37	const uint8_t* data) {
38	size_t counts[`3`] = { `0` };
39	size_t max_utf8 = `1`; / should be 2, but 1 compresses better. /
40	size_t last_c = `0`;
41	size_t i;
42	for (i = `0`; i < len; ++i) {
43	size_t c = data[(pos + i) & mask];
44	++counts[UTF8Position(last_c, c, `2`)];
45	last_c = c;
46	}
47	if (counts[`2`] < `500`) {
48	max_utf8 = `1`;
49	}
50	if (counts[`1`] + counts[`2`] < `25`) {
51	max_utf8 = `0`;
52	}
53	return max_utf8;
54	}
55
56	static void EstimateBitCostsForLiteralsUTF8(size_t pos, size_t len, size_t mask,
57	const uint8_t* data, float* cost) {
58	/ max_utf8 is 0 (normal ASCII single byte modeling),*
59	1 (for 2-byte UTF-8 modeling), or 2 (for 3-byte UTF-8 modeling). /*
60	const size_t max_utf8 = DecideMultiByteStatsLevel(pos, len, mask, data);
61	size_t histogram[`3`][`256`] = { { `0` } };
62	size_t window_half = `495`;
63	size_t in_window = BROTLI_MIN(size_t, window_half, len);
64	size_t in_window_utf8[`3`] = { `0` };
65
66	size_t i;
67	{ / Bootstrap histograms. /
68	size_t last_c = `0`;
69	size_t utf8_pos = `0`;
70	for (i = `0`; i < in_window; ++i) {
71	size_t c = data[(pos + i) & mask];
72	++histogram[utf8_pos][c];
73	++in_window_utf8[utf8_pos];
74	utf8_pos = UTF8Position(last_c, c, max_utf8);
75	last_c = c;
76	}
77	}
78
79	/ Compute bit costs with sliding window. /
80	for (i = `0`; i < len; ++i) {
81	if (i >= window_half) {
82	/ Remove a byte in the past. /
83	size_t c =
84	i < window_half + `1` ? `0` : data[(pos + i - window_half - `1`) & mask];
85	size_t last_c =
86	i < window_half + `2` ? `0` : data[(pos + i - window_half - `2`) & mask];
87	size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8);
88	--histogram[utf8_pos2][data[(pos + i - window_half) & mask]];
89	--in_window_utf8[utf8_pos2];
90	}
91	if (i + window_half < len) {
92	/ Add a byte in the future. /
93	size_t c = data[(pos + i + window_half - `1`) & mask];
94	size_t last_c = data[(pos + i + window_half - `2`) & mask];
95	size_t utf8_pos2 = UTF8Position(last_c, c, max_utf8);
96	++histogram[utf8_pos2][data[(pos + i + window_half) & mask]];
97	++in_window_utf8[utf8_pos2];
98	}
99	{
100	size_t c = i < `1` ? `0` : data[(pos + i - `1`) & mask];
101	size_t last_c = i < `2` ? `0` : data[(pos + i - `2`) & mask];
102	size_t utf8_pos = UTF8Position(last_c, c, max_utf8);
103	size_t masked_pos = (pos + i) & mask;
104	size_t histo = histogram[utf8_pos][data[masked_pos]];
105	double lit_cost;
106	if (histo == `0`) {
107	histo = `1`;
108	}
109	lit_cost = FastLog2(in_window_utf8[utf8_pos]) - FastLog2(histo);
110	lit_cost += `0.02905`;
111	if (lit_cost < `1.0`) {
112	lit_cost *= `0.5`;
113	lit_cost += `0.5`;
114	}
115	/ Make the first bytes more expensive -- seems to help, not sure why.*
116	Perhaps because the entropy source is changing its properties
117	rapidly in the beginning of the file, perhaps because the beginning
118	of the data is a statistical "anomaly". /*
119	if (i < `2000`) {
120	lit_cost += `0.7` - ((double)(`2000` - i) / `2000.0` * `0.35`);
121	}
122	cost[i] = (float)lit_cost;
123	}
124	}
125	}
126
127	void BrotliEstimateBitCostsForLiterals(size_t pos, size_t len, size_t mask,
128	const uint8_t* data, float* cost) {
129	if (BrotliIsMostlyUTF8(data, pos, mask, len, kMinUTF8Ratio)) {
130	EstimateBitCostsForLiteralsUTF8(pos, len, mask, data, cost);
131	return;
132	} else {
133	size_t histogram[`256`] = { `0` };
134	size_t window_half = `2000`;
135	size_t in_window = BROTLI_MIN(size_t, window_half, len);
136
137	/ Bootstrap histogram. /
138	size_t i;
139	for (i = `0`; i < in_window; ++i) {
140	++histogram[data[(pos + i) & mask]];
141	}
142
143	/ Compute bit costs with sliding window. /
144	for (i = `0`; i < len; ++i) {
145	size_t histo;
146	if (i >= window_half) {
147	/ Remove a byte in the past. /
148	--histogram[data[(pos + i - window_half) & mask]];
149	--in_window;
150	}
151	if (i + window_half < len) {
152	/ Add a byte in the future. /
153	++histogram[data[(pos + i + window_half) & mask]];
154	++in_window;
155	}
156	histo = histogram[data[(pos + i) & mask]];
157	if (histo == `0`) {
158	histo = `1`;
159	}
160	{
161	double lit_cost = FastLog2(in_window) - FastLog2(histo);
162	lit_cost += `0.029`;
163	if (lit_cost < `1.0`) {
164	lit_cost *= `0.5`;
165	lit_cost += `0.5`;
166	}
167	cost[i] = (float)lit_cost;
168	}
169	}
170	}
171	}
172
173	#if defined(__cplusplus) \|\| defined(c_plusplus)
174	} / extern "C" /
175	#endif
176

Browse the source code of ClickHouse/contrib/brotli/c/enc/literal_cost.c