1 | /* Copyright 2010 Google Inc. All Rights Reserved. |
2 | |
3 | Distributed under MIT license. |
4 | See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
5 | */ |
6 | |
7 | /* A (forgetful) hash table to the data seen by the compressor, to |
8 | help create backward references to previous data. */ |
9 | |
10 | #ifndef BROTLI_ENC_HASH_H_ |
11 | #define BROTLI_ENC_HASH_H_ |
12 | |
13 | #include <string.h> /* memcmp, memset */ |
14 | |
15 | #include "../common/constants.h" |
16 | #include "../common/dictionary.h" |
17 | #include "../common/platform.h" |
18 | #include <brotli/types.h> |
19 | #include "./encoder_dict.h" |
20 | #include "./fast_log.h" |
21 | #include "./find_match_length.h" |
22 | #include "./memory.h" |
23 | #include "./quality.h" |
24 | #include "./static_dict.h" |
25 | |
26 | #if defined(__cplusplus) || defined(c_plusplus) |
27 | extern "C" { |
28 | #endif |
29 | |
30 | /* Pointer to hasher data. |
31 | * |
32 | * Excluding initialization and destruction, hasher can be passed as |
33 | * HasherHandle by value. |
34 | * |
35 | * Typically hasher data consists of 3 sections: |
36 | * * HasherCommon structure |
37 | * * private structured hasher data, depending on hasher type |
38 | * * private dynamic hasher data, depending on hasher type and parameters |
39 | * |
40 | * Using "define" instead of "typedef", because on MSVC __restrict does not work |
41 | * on typedef pointer types. */ |
42 | #define HasherHandle uint8_t* |
43 | |
44 | typedef struct { |
45 | BrotliHasherParams params; |
46 | |
47 | /* False if hasher needs to be "prepared" before use. */ |
48 | BROTLI_BOOL is_prepared_; |
49 | |
50 | size_t dict_num_lookups; |
51 | size_t dict_num_matches; |
52 | } HasherCommon; |
53 | |
54 | static BROTLI_INLINE HasherCommon* GetHasherCommon(HasherHandle handle) { |
55 | return (HasherCommon*)handle; |
56 | } |
57 | |
58 | #define score_t size_t |
59 | |
60 | static const uint32_t kCutoffTransformsCount = 10; |
61 | /* 0, 12, 27, 23, 42, 63, 56, 48, 59, 64 */ |
62 | /* 0+0, 4+8, 8+19, 12+11, 16+26, 20+43, 24+32, 28+20, 32+27, 36+28 */ |
63 | static const uint64_t kCutoffTransforms = |
64 | BROTLI_MAKE_UINT64_T(0x071B520A, 0xDA2D3200); |
65 | |
66 | typedef struct HasherSearchResult { |
67 | size_t len; |
68 | size_t distance; |
69 | score_t score; |
70 | int len_code_delta; /* == len_code - len */ |
71 | } HasherSearchResult; |
72 | |
73 | /* kHashMul32 multiplier has these properties: |
74 | * The multiplier must be odd. Otherwise we may lose the highest bit. |
75 | * No long streaks of ones or zeros. |
76 | * There is no effort to ensure that it is a prime, the oddity is enough |
77 | for this use. |
78 | * The number has been tuned heuristically against compression benchmarks. */ |
79 | static const uint32_t kHashMul32 = 0x1E35A7BD; |
80 | static const uint64_t kHashMul64 = BROTLI_MAKE_UINT64_T(0x1E35A7BD, 0x1E35A7BD); |
81 | static const uint64_t kHashMul64Long = |
82 | BROTLI_MAKE_UINT64_T(0x1FE35A7Bu, 0xD3579BD3u); |
83 | |
84 | static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) { |
85 | uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kHashMul32; |
86 | /* The higher bits contain more mixture from the multiplication, |
87 | so we take our results from there. */ |
88 | return h >> (32 - 14); |
89 | } |
90 | |
91 | static BROTLI_INLINE void PrepareDistanceCache( |
92 | int* BROTLI_RESTRICT distance_cache, const int num_distances) { |
93 | if (num_distances > 4) { |
94 | int last_distance = distance_cache[0]; |
95 | distance_cache[4] = last_distance - 1; |
96 | distance_cache[5] = last_distance + 1; |
97 | distance_cache[6] = last_distance - 2; |
98 | distance_cache[7] = last_distance + 2; |
99 | distance_cache[8] = last_distance - 3; |
100 | distance_cache[9] = last_distance + 3; |
101 | if (num_distances > 10) { |
102 | int next_last_distance = distance_cache[1]; |
103 | distance_cache[10] = next_last_distance - 1; |
104 | distance_cache[11] = next_last_distance + 1; |
105 | distance_cache[12] = next_last_distance - 2; |
106 | distance_cache[13] = next_last_distance + 2; |
107 | distance_cache[14] = next_last_distance - 3; |
108 | distance_cache[15] = next_last_distance + 3; |
109 | } |
110 | } |
111 | } |
112 | |
113 | #define BROTLI_LITERAL_BYTE_SCORE 135 |
114 | #define BROTLI_DISTANCE_BIT_PENALTY 30 |
115 | /* Score must be positive after applying maximal penalty. */ |
116 | #define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t)) |
117 | |
118 | /* Usually, we always choose the longest backward reference. This function |
119 | allows for the exception of that rule. |
120 | |
121 | If we choose a backward reference that is further away, it will |
122 | usually be coded with more bits. We approximate this by assuming |
123 | log2(distance). If the distance can be expressed in terms of the |
124 | last four distances, we use some heuristic constants to estimate |
125 | the bits cost. For the first up to four literals we use the bit |
126 | cost of the literals from the literal cost model, after that we |
127 | use the average bit cost of the cost model. |
128 | |
129 | This function is used to sometimes discard a longer backward reference |
130 | when it is not much longer and the bit cost for encoding it is more |
131 | than the saved literals. |
132 | |
133 | backward_reference_offset MUST be positive. */ |
134 | static BROTLI_INLINE score_t BackwardReferenceScore( |
135 | size_t copy_length, size_t backward_reference_offset) { |
136 | return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length - |
137 | BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset); |
138 | } |
139 | |
140 | static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance( |
141 | size_t copy_length) { |
142 | return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length + |
143 | BROTLI_SCORE_BASE + 15; |
144 | } |
145 | |
146 | static BROTLI_INLINE score_t BackwardReferencePenaltyUsingLastDistance( |
147 | size_t distance_short_code) { |
148 | return (score_t)39 + ((0x1CA10 >> (distance_short_code & 0xE)) & 0xE); |
149 | } |
150 | |
151 | static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem( |
152 | const BrotliEncoderDictionary* dictionary, size_t item, |
153 | const uint8_t* data, size_t max_length, size_t max_backward, |
154 | size_t max_distance, HasherSearchResult* out) { |
155 | size_t len; |
156 | size_t word_idx; |
157 | size_t offset; |
158 | size_t matchlen; |
159 | size_t backward; |
160 | score_t score; |
161 | len = item & 0x1F; |
162 | word_idx = item >> 5; |
163 | offset = dictionary->words->offsets_by_length[len] + len * word_idx; |
164 | if (len > max_length) { |
165 | return BROTLI_FALSE; |
166 | } |
167 | |
168 | matchlen = |
169 | FindMatchLengthWithLimit(data, &dictionary->words->data[offset], len); |
170 | if (matchlen + dictionary->cutoffTransformsCount <= len || matchlen == 0) { |
171 | return BROTLI_FALSE; |
172 | } |
173 | { |
174 | size_t cut = len - matchlen; |
175 | size_t transform_id = (cut << 2) + |
176 | (size_t)((dictionary->cutoffTransforms >> (cut * 6)) & 0x3F); |
177 | backward = max_backward + 1 + word_idx + |
178 | (transform_id << dictionary->words->size_bits_by_length[len]); |
179 | } |
180 | if (backward > max_distance) { |
181 | return BROTLI_FALSE; |
182 | } |
183 | score = BackwardReferenceScore(matchlen, backward); |
184 | if (score < out->score) { |
185 | return BROTLI_FALSE; |
186 | } |
187 | out->len = matchlen; |
188 | out->len_code_delta = (int)len - (int)matchlen; |
189 | out->distance = backward; |
190 | out->score = score; |
191 | return BROTLI_TRUE; |
192 | } |
193 | |
194 | static BROTLI_INLINE void SearchInStaticDictionary( |
195 | const BrotliEncoderDictionary* dictionary, |
196 | HasherHandle handle, const uint8_t* data, size_t max_length, |
197 | size_t max_backward, size_t max_distance, |
198 | HasherSearchResult* out, BROTLI_BOOL shallow) { |
199 | size_t key; |
200 | size_t i; |
201 | HasherCommon* self = GetHasherCommon(handle); |
202 | if (self->dict_num_matches < (self->dict_num_lookups >> 7)) { |
203 | return; |
204 | } |
205 | key = Hash14(data) << 1; |
206 | for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) { |
207 | size_t item = dictionary->hash_table[key]; |
208 | self->dict_num_lookups++; |
209 | if (item != 0) { |
210 | BROTLI_BOOL item_matches = TestStaticDictionaryItem( |
211 | dictionary, item, data, |
212 | max_length, max_backward, max_distance, out); |
213 | if (item_matches) { |
214 | self->dict_num_matches++; |
215 | } |
216 | } |
217 | } |
218 | } |
219 | |
220 | typedef struct BackwardMatch { |
221 | uint32_t distance; |
222 | uint32_t length_and_code; |
223 | } BackwardMatch; |
224 | |
225 | static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self, |
226 | size_t dist, size_t len) { |
227 | self->distance = (uint32_t)dist; |
228 | self->length_and_code = (uint32_t)(len << 5); |
229 | } |
230 | |
231 | static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self, |
232 | size_t dist, size_t len, size_t len_code) { |
233 | self->distance = (uint32_t)dist; |
234 | self->length_and_code = |
235 | (uint32_t)((len << 5) | (len == len_code ? 0 : len_code)); |
236 | } |
237 | |
238 | static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) { |
239 | return self->length_and_code >> 5; |
240 | } |
241 | |
242 | static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) { |
243 | size_t code = self->length_and_code & 31; |
244 | return code ? code : BackwardMatchLength(self); |
245 | } |
246 | |
247 | #define EXPAND_CAT(a, b) CAT(a, b) |
248 | #define CAT(a, b) a ## b |
249 | #define FN(X) EXPAND_CAT(X, HASHER()) |
250 | |
251 | #define HASHER() H10 |
252 | #define BUCKET_BITS 17 |
253 | #define MAX_TREE_SEARCH_DEPTH 64 |
254 | #define MAX_TREE_COMP_LENGTH 128 |
255 | #include "./hash_to_binary_tree_inc.h" /* NOLINT(build/include) */ |
256 | #undef MAX_TREE_SEARCH_DEPTH |
257 | #undef MAX_TREE_COMP_LENGTH |
258 | #undef BUCKET_BITS |
259 | #undef HASHER |
260 | /* MAX_NUM_MATCHES == 64 + MAX_TREE_SEARCH_DEPTH */ |
261 | #define MAX_NUM_MATCHES_H10 128 |
262 | |
263 | /* For BUCKET_SWEEP == 1, enabling the dictionary lookup makes compression |
264 | a little faster (0.5% - 1%) and it compresses 0.15% better on small text |
265 | and HTML inputs. */ |
266 | |
267 | #define HASHER() H2 |
268 | #define BUCKET_BITS 16 |
269 | #define BUCKET_SWEEP 1 |
270 | #define HASH_LEN 5 |
271 | #define USE_DICTIONARY 1 |
272 | #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */ |
273 | #undef BUCKET_SWEEP |
274 | #undef USE_DICTIONARY |
275 | #undef HASHER |
276 | |
277 | #define HASHER() H3 |
278 | #define BUCKET_SWEEP 2 |
279 | #define USE_DICTIONARY 0 |
280 | #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */ |
281 | #undef USE_DICTIONARY |
282 | #undef BUCKET_SWEEP |
283 | #undef BUCKET_BITS |
284 | #undef HASHER |
285 | |
286 | #define HASHER() H4 |
287 | #define BUCKET_BITS 17 |
288 | #define BUCKET_SWEEP 4 |
289 | #define USE_DICTIONARY 1 |
290 | #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */ |
291 | #undef USE_DICTIONARY |
292 | #undef HASH_LEN |
293 | #undef BUCKET_SWEEP |
294 | #undef BUCKET_BITS |
295 | #undef HASHER |
296 | |
297 | #define HASHER() H5 |
298 | #include "./hash_longest_match_inc.h" /* NOLINT(build/include) */ |
299 | #undef HASHER |
300 | |
301 | #define HASHER() H6 |
302 | #include "./hash_longest_match64_inc.h" /* NOLINT(build/include) */ |
303 | #undef HASHER |
304 | |
305 | #define BUCKET_BITS 15 |
306 | |
307 | #define NUM_LAST_DISTANCES_TO_CHECK 4 |
308 | #define NUM_BANKS 1 |
309 | #define BANK_BITS 16 |
310 | #define HASHER() H40 |
311 | #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */ |
312 | #undef HASHER |
313 | #undef NUM_LAST_DISTANCES_TO_CHECK |
314 | |
315 | #define NUM_LAST_DISTANCES_TO_CHECK 10 |
316 | #define HASHER() H41 |
317 | #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */ |
318 | #undef HASHER |
319 | #undef NUM_LAST_DISTANCES_TO_CHECK |
320 | #undef NUM_BANKS |
321 | #undef BANK_BITS |
322 | |
323 | #define NUM_LAST_DISTANCES_TO_CHECK 16 |
324 | #define NUM_BANKS 512 |
325 | #define BANK_BITS 9 |
326 | #define HASHER() H42 |
327 | #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */ |
328 | #undef HASHER |
329 | #undef NUM_LAST_DISTANCES_TO_CHECK |
330 | #undef NUM_BANKS |
331 | #undef BANK_BITS |
332 | |
333 | #undef BUCKET_BITS |
334 | |
335 | #define HASHER() H54 |
336 | #define BUCKET_BITS 20 |
337 | #define BUCKET_SWEEP 4 |
338 | #define HASH_LEN 7 |
339 | #define USE_DICTIONARY 0 |
340 | #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */ |
341 | #undef USE_DICTIONARY |
342 | #undef HASH_LEN |
343 | #undef BUCKET_SWEEP |
344 | #undef BUCKET_BITS |
345 | #undef HASHER |
346 | |
347 | /* fast large window hashers */ |
348 | |
349 | #define HASHER() HROLLING_FAST |
350 | #define CHUNKLEN 32 |
351 | #define JUMP 4 |
352 | #define NUMBUCKETS 16777216 |
353 | #define MASK ((NUMBUCKETS * 64) - 1) |
354 | #include "./hash_rolling_inc.h" /* NOLINT(build/include) */ |
355 | #undef JUMP |
356 | #undef HASHER |
357 | |
358 | |
359 | #define HASHER() HROLLING |
360 | #define JUMP 1 |
361 | #include "./hash_rolling_inc.h" /* NOLINT(build/include) */ |
362 | #undef MASK |
363 | #undef NUMBUCKETS |
364 | #undef JUMP |
365 | #undef CHUNKLEN |
366 | #undef HASHER |
367 | |
368 | #define HASHER() H35 |
369 | #define HASHER_A H3 |
370 | #define HASHER_B HROLLING_FAST |
371 | #include "./hash_composite_inc.h" /* NOLINT(build/include) */ |
372 | #undef HASHER_A |
373 | #undef HASHER_B |
374 | #undef HASHER |
375 | |
376 | #define HASHER() H55 |
377 | #define HASHER_A H54 |
378 | #define HASHER_B HROLLING_FAST |
379 | #include "./hash_composite_inc.h" /* NOLINT(build/include) */ |
380 | #undef HASHER_A |
381 | #undef HASHER_B |
382 | #undef HASHER |
383 | |
384 | #define HASHER() H65 |
385 | #define HASHER_A H6 |
386 | #define HASHER_B HROLLING |
387 | #include "./hash_composite_inc.h" /* NOLINT(build/include) */ |
388 | #undef HASHER_A |
389 | #undef HASHER_B |
390 | #undef HASHER |
391 | |
392 | #undef FN |
393 | #undef CAT |
394 | #undef EXPAND_CAT |
395 | |
396 | #define FOR_GENERIC_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(40) H(41) H(42) H(54)\ |
397 | H(35) H(55) H(65) |
398 | #define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10) |
399 | |
400 | static BROTLI_INLINE void DestroyHasher( |
401 | MemoryManager* m, HasherHandle* handle) { |
402 | if (*handle == NULL) return; |
403 | BROTLI_FREE(m, *handle); |
404 | } |
405 | |
406 | static BROTLI_INLINE void HasherReset(HasherHandle handle) { |
407 | if (handle == NULL) return; |
408 | GetHasherCommon(handle)->is_prepared_ = BROTLI_FALSE; |
409 | } |
410 | |
411 | static BROTLI_INLINE size_t HasherSize(const BrotliEncoderParams* params, |
412 | BROTLI_BOOL one_shot, const size_t input_size) { |
413 | size_t result = sizeof(HasherCommon); |
414 | switch (params->hasher.type) { |
415 | #define SIZE_(N) \ |
416 | case N: \ |
417 | result += HashMemAllocInBytesH ## N(params, one_shot, input_size); \ |
418 | break; |
419 | FOR_ALL_HASHERS(SIZE_) |
420 | #undef SIZE_ |
421 | default: |
422 | break; |
423 | } |
424 | return result; |
425 | } |
426 | |
427 | static BROTLI_INLINE void HasherSetup(MemoryManager* m, HasherHandle* handle, |
428 | BrotliEncoderParams* params, const uint8_t* data, size_t position, |
429 | size_t input_size, BROTLI_BOOL is_last) { |
430 | HasherHandle self = NULL; |
431 | HasherCommon* common = NULL; |
432 | BROTLI_BOOL one_shot = (position == 0 && is_last); |
433 | if (*handle == NULL) { |
434 | size_t alloc_size; |
435 | ChooseHasher(params, ¶ms->hasher); |
436 | alloc_size = HasherSize(params, one_shot, input_size); |
437 | self = BROTLI_ALLOC(m, uint8_t, alloc_size); |
438 | if (BROTLI_IS_OOM(m)) return; |
439 | *handle = self; |
440 | common = GetHasherCommon(self); |
441 | common->params = params->hasher; |
442 | switch (common->params.type) { |
443 | #define INITIALIZE_(N) \ |
444 | case N: \ |
445 | InitializeH ## N(*handle, params); \ |
446 | break; |
447 | FOR_ALL_HASHERS(INITIALIZE_); |
448 | #undef INITIALIZE_ |
449 | default: |
450 | break; |
451 | } |
452 | HasherReset(*handle); |
453 | } |
454 | |
455 | self = *handle; |
456 | common = GetHasherCommon(self); |
457 | if (!common->is_prepared_) { |
458 | switch (common->params.type) { |
459 | #define PREPARE_(N) \ |
460 | case N: \ |
461 | PrepareH ## N(self, one_shot, input_size, data); \ |
462 | break; |
463 | FOR_ALL_HASHERS(PREPARE_) |
464 | #undef PREPARE_ |
465 | default: break; |
466 | } |
467 | if (position == 0) { |
468 | common->dict_num_lookups = 0; |
469 | common->dict_num_matches = 0; |
470 | } |
471 | common->is_prepared_ = BROTLI_TRUE; |
472 | } |
473 | } |
474 | |
475 | static BROTLI_INLINE void InitOrStitchToPreviousBlock( |
476 | MemoryManager* m, HasherHandle* handle, const uint8_t* data, size_t mask, |
477 | BrotliEncoderParams* params, size_t position, size_t input_size, |
478 | BROTLI_BOOL is_last) { |
479 | HasherHandle self; |
480 | HasherSetup(m, handle, params, data, position, input_size, is_last); |
481 | if (BROTLI_IS_OOM(m)) return; |
482 | self = *handle; |
483 | switch (GetHasherCommon(self)->params.type) { |
484 | #define INIT_(N) \ |
485 | case N: \ |
486 | StitchToPreviousBlockH ## N(self, input_size, position, data, mask); \ |
487 | break; |
488 | FOR_ALL_HASHERS(INIT_) |
489 | #undef INIT_ |
490 | default: break; |
491 | } |
492 | } |
493 | |
494 | #if defined(__cplusplus) || defined(c_plusplus) |
495 | } /* extern "C" */ |
496 | #endif |
497 | |
498 | #endif /* BROTLI_ENC_HASH_H_ */ |
499 | |