1/* Copyright 2010 Google Inc. All Rights Reserved.
2
3 Distributed under MIT license.
4 See file LICENSE for detail or copy at https://opensource.org/licenses/MIT
5*/
6
7/* A (forgetful) hash table to the data seen by the compressor, to
8 help create backward references to previous data. */
9
10#ifndef BROTLI_ENC_HASH_H_
11#define BROTLI_ENC_HASH_H_
12
13#include <string.h> /* memcmp, memset */
14
15#include "../common/constants.h"
16#include "../common/dictionary.h"
17#include "../common/platform.h"
18#include <brotli/types.h>
19#include "./encoder_dict.h"
20#include "./fast_log.h"
21#include "./find_match_length.h"
22#include "./memory.h"
23#include "./quality.h"
24#include "./static_dict.h"
25
26#if defined(__cplusplus) || defined(c_plusplus)
27extern "C" {
28#endif
29
30/* Pointer to hasher data.
31 *
32 * Excluding initialization and destruction, hasher can be passed as
33 * HasherHandle by value.
34 *
35 * Typically hasher data consists of 3 sections:
36 * * HasherCommon structure
37 * * private structured hasher data, depending on hasher type
38 * * private dynamic hasher data, depending on hasher type and parameters
39 *
40 * Using "define" instead of "typedef", because on MSVC __restrict does not work
41 * on typedef pointer types. */
42#define HasherHandle uint8_t*
43
44typedef struct {
45 BrotliHasherParams params;
46
47 /* False if hasher needs to be "prepared" before use. */
48 BROTLI_BOOL is_prepared_;
49
50 size_t dict_num_lookups;
51 size_t dict_num_matches;
52} HasherCommon;
53
54static BROTLI_INLINE HasherCommon* GetHasherCommon(HasherHandle handle) {
55 return (HasherCommon*)handle;
56}
57
58#define score_t size_t
59
60static const uint32_t kCutoffTransformsCount = 10;
61/* 0, 12, 27, 23, 42, 63, 56, 48, 59, 64 */
62/* 0+0, 4+8, 8+19, 12+11, 16+26, 20+43, 24+32, 28+20, 32+27, 36+28 */
63static const uint64_t kCutoffTransforms =
64 BROTLI_MAKE_UINT64_T(0x071B520A, 0xDA2D3200);
65
66typedef struct HasherSearchResult {
67 size_t len;
68 size_t distance;
69 score_t score;
70 int len_code_delta; /* == len_code - len */
71} HasherSearchResult;
72
73/* kHashMul32 multiplier has these properties:
74 * The multiplier must be odd. Otherwise we may lose the highest bit.
75 * No long streaks of ones or zeros.
76 * There is no effort to ensure that it is a prime, the oddity is enough
77 for this use.
78 * The number has been tuned heuristically against compression benchmarks. */
79static const uint32_t kHashMul32 = 0x1E35A7BD;
80static const uint64_t kHashMul64 = BROTLI_MAKE_UINT64_T(0x1E35A7BD, 0x1E35A7BD);
81static const uint64_t kHashMul64Long =
82 BROTLI_MAKE_UINT64_T(0x1FE35A7Bu, 0xD3579BD3u);
83
84static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) {
85 uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kHashMul32;
86 /* The higher bits contain more mixture from the multiplication,
87 so we take our results from there. */
88 return h >> (32 - 14);
89}
90
91static BROTLI_INLINE void PrepareDistanceCache(
92 int* BROTLI_RESTRICT distance_cache, const int num_distances) {
93 if (num_distances > 4) {
94 int last_distance = distance_cache[0];
95 distance_cache[4] = last_distance - 1;
96 distance_cache[5] = last_distance + 1;
97 distance_cache[6] = last_distance - 2;
98 distance_cache[7] = last_distance + 2;
99 distance_cache[8] = last_distance - 3;
100 distance_cache[9] = last_distance + 3;
101 if (num_distances > 10) {
102 int next_last_distance = distance_cache[1];
103 distance_cache[10] = next_last_distance - 1;
104 distance_cache[11] = next_last_distance + 1;
105 distance_cache[12] = next_last_distance - 2;
106 distance_cache[13] = next_last_distance + 2;
107 distance_cache[14] = next_last_distance - 3;
108 distance_cache[15] = next_last_distance + 3;
109 }
110 }
111}
112
113#define BROTLI_LITERAL_BYTE_SCORE 135
114#define BROTLI_DISTANCE_BIT_PENALTY 30
115/* Score must be positive after applying maximal penalty. */
116#define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t))
117
118/* Usually, we always choose the longest backward reference. This function
119 allows for the exception of that rule.
120
121 If we choose a backward reference that is further away, it will
122 usually be coded with more bits. We approximate this by assuming
123 log2(distance). If the distance can be expressed in terms of the
124 last four distances, we use some heuristic constants to estimate
125 the bits cost. For the first up to four literals we use the bit
126 cost of the literals from the literal cost model, after that we
127 use the average bit cost of the cost model.
128
129 This function is used to sometimes discard a longer backward reference
130 when it is not much longer and the bit cost for encoding it is more
131 than the saved literals.
132
133 backward_reference_offset MUST be positive. */
134static BROTLI_INLINE score_t BackwardReferenceScore(
135 size_t copy_length, size_t backward_reference_offset) {
136 return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length -
137 BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset);
138}
139
140static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance(
141 size_t copy_length) {
142 return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length +
143 BROTLI_SCORE_BASE + 15;
144}
145
146static BROTLI_INLINE score_t BackwardReferencePenaltyUsingLastDistance(
147 size_t distance_short_code) {
148 return (score_t)39 + ((0x1CA10 >> (distance_short_code & 0xE)) & 0xE);
149}
150
151static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem(
152 const BrotliEncoderDictionary* dictionary, size_t item,
153 const uint8_t* data, size_t max_length, size_t max_backward,
154 size_t max_distance, HasherSearchResult* out) {
155 size_t len;
156 size_t word_idx;
157 size_t offset;
158 size_t matchlen;
159 size_t backward;
160 score_t score;
161 len = item & 0x1F;
162 word_idx = item >> 5;
163 offset = dictionary->words->offsets_by_length[len] + len * word_idx;
164 if (len > max_length) {
165 return BROTLI_FALSE;
166 }
167
168 matchlen =
169 FindMatchLengthWithLimit(data, &dictionary->words->data[offset], len);
170 if (matchlen + dictionary->cutoffTransformsCount <= len || matchlen == 0) {
171 return BROTLI_FALSE;
172 }
173 {
174 size_t cut = len - matchlen;
175 size_t transform_id = (cut << 2) +
176 (size_t)((dictionary->cutoffTransforms >> (cut * 6)) & 0x3F);
177 backward = max_backward + 1 + word_idx +
178 (transform_id << dictionary->words->size_bits_by_length[len]);
179 }
180 if (backward > max_distance) {
181 return BROTLI_FALSE;
182 }
183 score = BackwardReferenceScore(matchlen, backward);
184 if (score < out->score) {
185 return BROTLI_FALSE;
186 }
187 out->len = matchlen;
188 out->len_code_delta = (int)len - (int)matchlen;
189 out->distance = backward;
190 out->score = score;
191 return BROTLI_TRUE;
192}
193
194static BROTLI_INLINE void SearchInStaticDictionary(
195 const BrotliEncoderDictionary* dictionary,
196 HasherHandle handle, const uint8_t* data, size_t max_length,
197 size_t max_backward, size_t max_distance,
198 HasherSearchResult* out, BROTLI_BOOL shallow) {
199 size_t key;
200 size_t i;
201 HasherCommon* self = GetHasherCommon(handle);
202 if (self->dict_num_matches < (self->dict_num_lookups >> 7)) {
203 return;
204 }
205 key = Hash14(data) << 1;
206 for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) {
207 size_t item = dictionary->hash_table[key];
208 self->dict_num_lookups++;
209 if (item != 0) {
210 BROTLI_BOOL item_matches = TestStaticDictionaryItem(
211 dictionary, item, data,
212 max_length, max_backward, max_distance, out);
213 if (item_matches) {
214 self->dict_num_matches++;
215 }
216 }
217 }
218}
219
220typedef struct BackwardMatch {
221 uint32_t distance;
222 uint32_t length_and_code;
223} BackwardMatch;
224
225static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self,
226 size_t dist, size_t len) {
227 self->distance = (uint32_t)dist;
228 self->length_and_code = (uint32_t)(len << 5);
229}
230
231static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self,
232 size_t dist, size_t len, size_t len_code) {
233 self->distance = (uint32_t)dist;
234 self->length_and_code =
235 (uint32_t)((len << 5) | (len == len_code ? 0 : len_code));
236}
237
238static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) {
239 return self->length_and_code >> 5;
240}
241
242static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) {
243 size_t code = self->length_and_code & 31;
244 return code ? code : BackwardMatchLength(self);
245}
246
247#define EXPAND_CAT(a, b) CAT(a, b)
248#define CAT(a, b) a ## b
249#define FN(X) EXPAND_CAT(X, HASHER())
250
251#define HASHER() H10
252#define BUCKET_BITS 17
253#define MAX_TREE_SEARCH_DEPTH 64
254#define MAX_TREE_COMP_LENGTH 128
255#include "./hash_to_binary_tree_inc.h" /* NOLINT(build/include) */
256#undef MAX_TREE_SEARCH_DEPTH
257#undef MAX_TREE_COMP_LENGTH
258#undef BUCKET_BITS
259#undef HASHER
260/* MAX_NUM_MATCHES == 64 + MAX_TREE_SEARCH_DEPTH */
261#define MAX_NUM_MATCHES_H10 128
262
263/* For BUCKET_SWEEP == 1, enabling the dictionary lookup makes compression
264 a little faster (0.5% - 1%) and it compresses 0.15% better on small text
265 and HTML inputs. */
266
267#define HASHER() H2
268#define BUCKET_BITS 16
269#define BUCKET_SWEEP 1
270#define HASH_LEN 5
271#define USE_DICTIONARY 1
272#include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */
273#undef BUCKET_SWEEP
274#undef USE_DICTIONARY
275#undef HASHER
276
277#define HASHER() H3
278#define BUCKET_SWEEP 2
279#define USE_DICTIONARY 0
280#include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */
281#undef USE_DICTIONARY
282#undef BUCKET_SWEEP
283#undef BUCKET_BITS
284#undef HASHER
285
286#define HASHER() H4
287#define BUCKET_BITS 17
288#define BUCKET_SWEEP 4
289#define USE_DICTIONARY 1
290#include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */
291#undef USE_DICTIONARY
292#undef HASH_LEN
293#undef BUCKET_SWEEP
294#undef BUCKET_BITS
295#undef HASHER
296
297#define HASHER() H5
298#include "./hash_longest_match_inc.h" /* NOLINT(build/include) */
299#undef HASHER
300
301#define HASHER() H6
302#include "./hash_longest_match64_inc.h" /* NOLINT(build/include) */
303#undef HASHER
304
305#define BUCKET_BITS 15
306
307#define NUM_LAST_DISTANCES_TO_CHECK 4
308#define NUM_BANKS 1
309#define BANK_BITS 16
310#define HASHER() H40
311#include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */
312#undef HASHER
313#undef NUM_LAST_DISTANCES_TO_CHECK
314
315#define NUM_LAST_DISTANCES_TO_CHECK 10
316#define HASHER() H41
317#include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */
318#undef HASHER
319#undef NUM_LAST_DISTANCES_TO_CHECK
320#undef NUM_BANKS
321#undef BANK_BITS
322
323#define NUM_LAST_DISTANCES_TO_CHECK 16
324#define NUM_BANKS 512
325#define BANK_BITS 9
326#define HASHER() H42
327#include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */
328#undef HASHER
329#undef NUM_LAST_DISTANCES_TO_CHECK
330#undef NUM_BANKS
331#undef BANK_BITS
332
333#undef BUCKET_BITS
334
335#define HASHER() H54
336#define BUCKET_BITS 20
337#define BUCKET_SWEEP 4
338#define HASH_LEN 7
339#define USE_DICTIONARY 0
340#include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */
341#undef USE_DICTIONARY
342#undef HASH_LEN
343#undef BUCKET_SWEEP
344#undef BUCKET_BITS
345#undef HASHER
346
347/* fast large window hashers */
348
349#define HASHER() HROLLING_FAST
350#define CHUNKLEN 32
351#define JUMP 4
352#define NUMBUCKETS 16777216
353#define MASK ((NUMBUCKETS * 64) - 1)
354#include "./hash_rolling_inc.h" /* NOLINT(build/include) */
355#undef JUMP
356#undef HASHER
357
358
359#define HASHER() HROLLING
360#define JUMP 1
361#include "./hash_rolling_inc.h" /* NOLINT(build/include) */
362#undef MASK
363#undef NUMBUCKETS
364#undef JUMP
365#undef CHUNKLEN
366#undef HASHER
367
368#define HASHER() H35
369#define HASHER_A H3
370#define HASHER_B HROLLING_FAST
371#include "./hash_composite_inc.h" /* NOLINT(build/include) */
372#undef HASHER_A
373#undef HASHER_B
374#undef HASHER
375
376#define HASHER() H55
377#define HASHER_A H54
378#define HASHER_B HROLLING_FAST
379#include "./hash_composite_inc.h" /* NOLINT(build/include) */
380#undef HASHER_A
381#undef HASHER_B
382#undef HASHER
383
384#define HASHER() H65
385#define HASHER_A H6
386#define HASHER_B HROLLING
387#include "./hash_composite_inc.h" /* NOLINT(build/include) */
388#undef HASHER_A
389#undef HASHER_B
390#undef HASHER
391
392#undef FN
393#undef CAT
394#undef EXPAND_CAT
395
396#define FOR_GENERIC_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(40) H(41) H(42) H(54)\
397 H(35) H(55) H(65)
398#define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10)
399
400static BROTLI_INLINE void DestroyHasher(
401 MemoryManager* m, HasherHandle* handle) {
402 if (*handle == NULL) return;
403 BROTLI_FREE(m, *handle);
404}
405
406static BROTLI_INLINE void HasherReset(HasherHandle handle) {
407 if (handle == NULL) return;
408 GetHasherCommon(handle)->is_prepared_ = BROTLI_FALSE;
409}
410
411static BROTLI_INLINE size_t HasherSize(const BrotliEncoderParams* params,
412 BROTLI_BOOL one_shot, const size_t input_size) {
413 size_t result = sizeof(HasherCommon);
414 switch (params->hasher.type) {
415#define SIZE_(N) \
416 case N: \
417 result += HashMemAllocInBytesH ## N(params, one_shot, input_size); \
418 break;
419 FOR_ALL_HASHERS(SIZE_)
420#undef SIZE_
421 default:
422 break;
423 }
424 return result;
425}
426
427static BROTLI_INLINE void HasherSetup(MemoryManager* m, HasherHandle* handle,
428 BrotliEncoderParams* params, const uint8_t* data, size_t position,
429 size_t input_size, BROTLI_BOOL is_last) {
430 HasherHandle self = NULL;
431 HasherCommon* common = NULL;
432 BROTLI_BOOL one_shot = (position == 0 && is_last);
433 if (*handle == NULL) {
434 size_t alloc_size;
435 ChooseHasher(params, &params->hasher);
436 alloc_size = HasherSize(params, one_shot, input_size);
437 self = BROTLI_ALLOC(m, uint8_t, alloc_size);
438 if (BROTLI_IS_OOM(m)) return;
439 *handle = self;
440 common = GetHasherCommon(self);
441 common->params = params->hasher;
442 switch (common->params.type) {
443#define INITIALIZE_(N) \
444 case N: \
445 InitializeH ## N(*handle, params); \
446 break;
447 FOR_ALL_HASHERS(INITIALIZE_);
448#undef INITIALIZE_
449 default:
450 break;
451 }
452 HasherReset(*handle);
453 }
454
455 self = *handle;
456 common = GetHasherCommon(self);
457 if (!common->is_prepared_) {
458 switch (common->params.type) {
459#define PREPARE_(N) \
460 case N: \
461 PrepareH ## N(self, one_shot, input_size, data); \
462 break;
463 FOR_ALL_HASHERS(PREPARE_)
464#undef PREPARE_
465 default: break;
466 }
467 if (position == 0) {
468 common->dict_num_lookups = 0;
469 common->dict_num_matches = 0;
470 }
471 common->is_prepared_ = BROTLI_TRUE;
472 }
473}
474
475static BROTLI_INLINE void InitOrStitchToPreviousBlock(
476 MemoryManager* m, HasherHandle* handle, const uint8_t* data, size_t mask,
477 BrotliEncoderParams* params, size_t position, size_t input_size,
478 BROTLI_BOOL is_last) {
479 HasherHandle self;
480 HasherSetup(m, handle, params, data, position, input_size, is_last);
481 if (BROTLI_IS_OOM(m)) return;
482 self = *handle;
483 switch (GetHasherCommon(self)->params.type) {
484#define INIT_(N) \
485 case N: \
486 StitchToPreviousBlockH ## N(self, input_size, position, data, mask); \
487 break;
488 FOR_ALL_HASHERS(INIT_)
489#undef INIT_
490 default: break;
491 }
492}
493
494#if defined(__cplusplus) || defined(c_plusplus)
495} /* extern "C" */
496#endif
497
498#endif /* BROTLI_ENC_HASH_H_ */
499