| 1 | /* Copyright 2010 Google Inc. All Rights Reserved. |
| 2 | |
| 3 | Distributed under MIT license. |
| 4 | See file LICENSE for detail or copy at https://opensource.org/licenses/MIT |
| 5 | */ |
| 6 | |
| 7 | /* A (forgetful) hash table to the data seen by the compressor, to |
| 8 | help create backward references to previous data. */ |
| 9 | |
| 10 | #ifndef BROTLI_ENC_HASH_H_ |
| 11 | #define BROTLI_ENC_HASH_H_ |
| 12 | |
| 13 | #include <string.h> /* memcmp, memset */ |
| 14 | |
| 15 | #include "../common/constants.h" |
| 16 | #include "../common/dictionary.h" |
| 17 | #include "../common/platform.h" |
| 18 | #include <brotli/types.h> |
| 19 | #include "./encoder_dict.h" |
| 20 | #include "./fast_log.h" |
| 21 | #include "./find_match_length.h" |
| 22 | #include "./memory.h" |
| 23 | #include "./quality.h" |
| 24 | #include "./static_dict.h" |
| 25 | |
| 26 | #if defined(__cplusplus) || defined(c_plusplus) |
| 27 | extern "C" { |
| 28 | #endif |
| 29 | |
| 30 | /* Pointer to hasher data. |
| 31 | * |
| 32 | * Excluding initialization and destruction, hasher can be passed as |
| 33 | * HasherHandle by value. |
| 34 | * |
| 35 | * Typically hasher data consists of 3 sections: |
| 36 | * * HasherCommon structure |
| 37 | * * private structured hasher data, depending on hasher type |
| 38 | * * private dynamic hasher data, depending on hasher type and parameters |
| 39 | * |
| 40 | * Using "define" instead of "typedef", because on MSVC __restrict does not work |
| 41 | * on typedef pointer types. */ |
| 42 | #define HasherHandle uint8_t* |
| 43 | |
| 44 | typedef struct { |
| 45 | BrotliHasherParams params; |
| 46 | |
| 47 | /* False if hasher needs to be "prepared" before use. */ |
| 48 | BROTLI_BOOL is_prepared_; |
| 49 | |
| 50 | size_t dict_num_lookups; |
| 51 | size_t dict_num_matches; |
| 52 | } HasherCommon; |
| 53 | |
| 54 | static BROTLI_INLINE HasherCommon* GetHasherCommon(HasherHandle handle) { |
| 55 | return (HasherCommon*)handle; |
| 56 | } |
| 57 | |
| 58 | #define score_t size_t |
| 59 | |
| 60 | static const uint32_t kCutoffTransformsCount = 10; |
| 61 | /* 0, 12, 27, 23, 42, 63, 56, 48, 59, 64 */ |
| 62 | /* 0+0, 4+8, 8+19, 12+11, 16+26, 20+43, 24+32, 28+20, 32+27, 36+28 */ |
| 63 | static const uint64_t kCutoffTransforms = |
| 64 | BROTLI_MAKE_UINT64_T(0x071B520A, 0xDA2D3200); |
| 65 | |
| 66 | typedef struct HasherSearchResult { |
| 67 | size_t len; |
| 68 | size_t distance; |
| 69 | score_t score; |
| 70 | int len_code_delta; /* == len_code - len */ |
| 71 | } HasherSearchResult; |
| 72 | |
| 73 | /* kHashMul32 multiplier has these properties: |
| 74 | * The multiplier must be odd. Otherwise we may lose the highest bit. |
| 75 | * No long streaks of ones or zeros. |
| 76 | * There is no effort to ensure that it is a prime, the oddity is enough |
| 77 | for this use. |
| 78 | * The number has been tuned heuristically against compression benchmarks. */ |
| 79 | static const uint32_t kHashMul32 = 0x1E35A7BD; |
| 80 | static const uint64_t kHashMul64 = BROTLI_MAKE_UINT64_T(0x1E35A7BD, 0x1E35A7BD); |
| 81 | static const uint64_t kHashMul64Long = |
| 82 | BROTLI_MAKE_UINT64_T(0x1FE35A7Bu, 0xD3579BD3u); |
| 83 | |
| 84 | static BROTLI_INLINE uint32_t Hash14(const uint8_t* data) { |
| 85 | uint32_t h = BROTLI_UNALIGNED_LOAD32LE(data) * kHashMul32; |
| 86 | /* The higher bits contain more mixture from the multiplication, |
| 87 | so we take our results from there. */ |
| 88 | return h >> (32 - 14); |
| 89 | } |
| 90 | |
| 91 | static BROTLI_INLINE void PrepareDistanceCache( |
| 92 | int* BROTLI_RESTRICT distance_cache, const int num_distances) { |
| 93 | if (num_distances > 4) { |
| 94 | int last_distance = distance_cache[0]; |
| 95 | distance_cache[4] = last_distance - 1; |
| 96 | distance_cache[5] = last_distance + 1; |
| 97 | distance_cache[6] = last_distance - 2; |
| 98 | distance_cache[7] = last_distance + 2; |
| 99 | distance_cache[8] = last_distance - 3; |
| 100 | distance_cache[9] = last_distance + 3; |
| 101 | if (num_distances > 10) { |
| 102 | int next_last_distance = distance_cache[1]; |
| 103 | distance_cache[10] = next_last_distance - 1; |
| 104 | distance_cache[11] = next_last_distance + 1; |
| 105 | distance_cache[12] = next_last_distance - 2; |
| 106 | distance_cache[13] = next_last_distance + 2; |
| 107 | distance_cache[14] = next_last_distance - 3; |
| 108 | distance_cache[15] = next_last_distance + 3; |
| 109 | } |
| 110 | } |
| 111 | } |
| 112 | |
| 113 | #define BROTLI_LITERAL_BYTE_SCORE 135 |
| 114 | #define BROTLI_DISTANCE_BIT_PENALTY 30 |
| 115 | /* Score must be positive after applying maximal penalty. */ |
| 116 | #define BROTLI_SCORE_BASE (BROTLI_DISTANCE_BIT_PENALTY * 8 * sizeof(size_t)) |
| 117 | |
| 118 | /* Usually, we always choose the longest backward reference. This function |
| 119 | allows for the exception of that rule. |
| 120 | |
| 121 | If we choose a backward reference that is further away, it will |
| 122 | usually be coded with more bits. We approximate this by assuming |
| 123 | log2(distance). If the distance can be expressed in terms of the |
| 124 | last four distances, we use some heuristic constants to estimate |
| 125 | the bits cost. For the first up to four literals we use the bit |
| 126 | cost of the literals from the literal cost model, after that we |
| 127 | use the average bit cost of the cost model. |
| 128 | |
| 129 | This function is used to sometimes discard a longer backward reference |
| 130 | when it is not much longer and the bit cost for encoding it is more |
| 131 | than the saved literals. |
| 132 | |
| 133 | backward_reference_offset MUST be positive. */ |
| 134 | static BROTLI_INLINE score_t BackwardReferenceScore( |
| 135 | size_t copy_length, size_t backward_reference_offset) { |
| 136 | return BROTLI_SCORE_BASE + BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length - |
| 137 | BROTLI_DISTANCE_BIT_PENALTY * Log2FloorNonZero(backward_reference_offset); |
| 138 | } |
| 139 | |
| 140 | static BROTLI_INLINE score_t BackwardReferenceScoreUsingLastDistance( |
| 141 | size_t copy_length) { |
| 142 | return BROTLI_LITERAL_BYTE_SCORE * (score_t)copy_length + |
| 143 | BROTLI_SCORE_BASE + 15; |
| 144 | } |
| 145 | |
| 146 | static BROTLI_INLINE score_t BackwardReferencePenaltyUsingLastDistance( |
| 147 | size_t distance_short_code) { |
| 148 | return (score_t)39 + ((0x1CA10 >> (distance_short_code & 0xE)) & 0xE); |
| 149 | } |
| 150 | |
| 151 | static BROTLI_INLINE BROTLI_BOOL TestStaticDictionaryItem( |
| 152 | const BrotliEncoderDictionary* dictionary, size_t item, |
| 153 | const uint8_t* data, size_t max_length, size_t max_backward, |
| 154 | size_t max_distance, HasherSearchResult* out) { |
| 155 | size_t len; |
| 156 | size_t word_idx; |
| 157 | size_t offset; |
| 158 | size_t matchlen; |
| 159 | size_t backward; |
| 160 | score_t score; |
| 161 | len = item & 0x1F; |
| 162 | word_idx = item >> 5; |
| 163 | offset = dictionary->words->offsets_by_length[len] + len * word_idx; |
| 164 | if (len > max_length) { |
| 165 | return BROTLI_FALSE; |
| 166 | } |
| 167 | |
| 168 | matchlen = |
| 169 | FindMatchLengthWithLimit(data, &dictionary->words->data[offset], len); |
| 170 | if (matchlen + dictionary->cutoffTransformsCount <= len || matchlen == 0) { |
| 171 | return BROTLI_FALSE; |
| 172 | } |
| 173 | { |
| 174 | size_t cut = len - matchlen; |
| 175 | size_t transform_id = (cut << 2) + |
| 176 | (size_t)((dictionary->cutoffTransforms >> (cut * 6)) & 0x3F); |
| 177 | backward = max_backward + 1 + word_idx + |
| 178 | (transform_id << dictionary->words->size_bits_by_length[len]); |
| 179 | } |
| 180 | if (backward > max_distance) { |
| 181 | return BROTLI_FALSE; |
| 182 | } |
| 183 | score = BackwardReferenceScore(matchlen, backward); |
| 184 | if (score < out->score) { |
| 185 | return BROTLI_FALSE; |
| 186 | } |
| 187 | out->len = matchlen; |
| 188 | out->len_code_delta = (int)len - (int)matchlen; |
| 189 | out->distance = backward; |
| 190 | out->score = score; |
| 191 | return BROTLI_TRUE; |
| 192 | } |
| 193 | |
| 194 | static BROTLI_INLINE void SearchInStaticDictionary( |
| 195 | const BrotliEncoderDictionary* dictionary, |
| 196 | HasherHandle handle, const uint8_t* data, size_t max_length, |
| 197 | size_t max_backward, size_t max_distance, |
| 198 | HasherSearchResult* out, BROTLI_BOOL shallow) { |
| 199 | size_t key; |
| 200 | size_t i; |
| 201 | HasherCommon* self = GetHasherCommon(handle); |
| 202 | if (self->dict_num_matches < (self->dict_num_lookups >> 7)) { |
| 203 | return; |
| 204 | } |
| 205 | key = Hash14(data) << 1; |
| 206 | for (i = 0; i < (shallow ? 1u : 2u); ++i, ++key) { |
| 207 | size_t item = dictionary->hash_table[key]; |
| 208 | self->dict_num_lookups++; |
| 209 | if (item != 0) { |
| 210 | BROTLI_BOOL item_matches = TestStaticDictionaryItem( |
| 211 | dictionary, item, data, |
| 212 | max_length, max_backward, max_distance, out); |
| 213 | if (item_matches) { |
| 214 | self->dict_num_matches++; |
| 215 | } |
| 216 | } |
| 217 | } |
| 218 | } |
| 219 | |
| 220 | typedef struct BackwardMatch { |
| 221 | uint32_t distance; |
| 222 | uint32_t length_and_code; |
| 223 | } BackwardMatch; |
| 224 | |
| 225 | static BROTLI_INLINE void InitBackwardMatch(BackwardMatch* self, |
| 226 | size_t dist, size_t len) { |
| 227 | self->distance = (uint32_t)dist; |
| 228 | self->length_and_code = (uint32_t)(len << 5); |
| 229 | } |
| 230 | |
| 231 | static BROTLI_INLINE void InitDictionaryBackwardMatch(BackwardMatch* self, |
| 232 | size_t dist, size_t len, size_t len_code) { |
| 233 | self->distance = (uint32_t)dist; |
| 234 | self->length_and_code = |
| 235 | (uint32_t)((len << 5) | (len == len_code ? 0 : len_code)); |
| 236 | } |
| 237 | |
| 238 | static BROTLI_INLINE size_t BackwardMatchLength(const BackwardMatch* self) { |
| 239 | return self->length_and_code >> 5; |
| 240 | } |
| 241 | |
| 242 | static BROTLI_INLINE size_t BackwardMatchLengthCode(const BackwardMatch* self) { |
| 243 | size_t code = self->length_and_code & 31; |
| 244 | return code ? code : BackwardMatchLength(self); |
| 245 | } |
| 246 | |
| 247 | #define EXPAND_CAT(a, b) CAT(a, b) |
| 248 | #define CAT(a, b) a ## b |
| 249 | #define FN(X) EXPAND_CAT(X, HASHER()) |
| 250 | |
| 251 | #define HASHER() H10 |
| 252 | #define BUCKET_BITS 17 |
| 253 | #define MAX_TREE_SEARCH_DEPTH 64 |
| 254 | #define MAX_TREE_COMP_LENGTH 128 |
| 255 | #include "./hash_to_binary_tree_inc.h" /* NOLINT(build/include) */ |
| 256 | #undef MAX_TREE_SEARCH_DEPTH |
| 257 | #undef MAX_TREE_COMP_LENGTH |
| 258 | #undef BUCKET_BITS |
| 259 | #undef HASHER |
| 260 | /* MAX_NUM_MATCHES == 64 + MAX_TREE_SEARCH_DEPTH */ |
| 261 | #define MAX_NUM_MATCHES_H10 128 |
| 262 | |
| 263 | /* For BUCKET_SWEEP == 1, enabling the dictionary lookup makes compression |
| 264 | a little faster (0.5% - 1%) and it compresses 0.15% better on small text |
| 265 | and HTML inputs. */ |
| 266 | |
| 267 | #define HASHER() H2 |
| 268 | #define BUCKET_BITS 16 |
| 269 | #define BUCKET_SWEEP 1 |
| 270 | #define HASH_LEN 5 |
| 271 | #define USE_DICTIONARY 1 |
| 272 | #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */ |
| 273 | #undef BUCKET_SWEEP |
| 274 | #undef USE_DICTIONARY |
| 275 | #undef HASHER |
| 276 | |
| 277 | #define HASHER() H3 |
| 278 | #define BUCKET_SWEEP 2 |
| 279 | #define USE_DICTIONARY 0 |
| 280 | #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */ |
| 281 | #undef USE_DICTIONARY |
| 282 | #undef BUCKET_SWEEP |
| 283 | #undef BUCKET_BITS |
| 284 | #undef HASHER |
| 285 | |
| 286 | #define HASHER() H4 |
| 287 | #define BUCKET_BITS 17 |
| 288 | #define BUCKET_SWEEP 4 |
| 289 | #define USE_DICTIONARY 1 |
| 290 | #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */ |
| 291 | #undef USE_DICTIONARY |
| 292 | #undef HASH_LEN |
| 293 | #undef BUCKET_SWEEP |
| 294 | #undef BUCKET_BITS |
| 295 | #undef HASHER |
| 296 | |
| 297 | #define HASHER() H5 |
| 298 | #include "./hash_longest_match_inc.h" /* NOLINT(build/include) */ |
| 299 | #undef HASHER |
| 300 | |
| 301 | #define HASHER() H6 |
| 302 | #include "./hash_longest_match64_inc.h" /* NOLINT(build/include) */ |
| 303 | #undef HASHER |
| 304 | |
| 305 | #define BUCKET_BITS 15 |
| 306 | |
| 307 | #define NUM_LAST_DISTANCES_TO_CHECK 4 |
| 308 | #define NUM_BANKS 1 |
| 309 | #define BANK_BITS 16 |
| 310 | #define HASHER() H40 |
| 311 | #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */ |
| 312 | #undef HASHER |
| 313 | #undef NUM_LAST_DISTANCES_TO_CHECK |
| 314 | |
| 315 | #define NUM_LAST_DISTANCES_TO_CHECK 10 |
| 316 | #define HASHER() H41 |
| 317 | #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */ |
| 318 | #undef HASHER |
| 319 | #undef NUM_LAST_DISTANCES_TO_CHECK |
| 320 | #undef NUM_BANKS |
| 321 | #undef BANK_BITS |
| 322 | |
| 323 | #define NUM_LAST_DISTANCES_TO_CHECK 16 |
| 324 | #define NUM_BANKS 512 |
| 325 | #define BANK_BITS 9 |
| 326 | #define HASHER() H42 |
| 327 | #include "./hash_forgetful_chain_inc.h" /* NOLINT(build/include) */ |
| 328 | #undef HASHER |
| 329 | #undef NUM_LAST_DISTANCES_TO_CHECK |
| 330 | #undef NUM_BANKS |
| 331 | #undef BANK_BITS |
| 332 | |
| 333 | #undef BUCKET_BITS |
| 334 | |
| 335 | #define HASHER() H54 |
| 336 | #define BUCKET_BITS 20 |
| 337 | #define BUCKET_SWEEP 4 |
| 338 | #define HASH_LEN 7 |
| 339 | #define USE_DICTIONARY 0 |
| 340 | #include "./hash_longest_match_quickly_inc.h" /* NOLINT(build/include) */ |
| 341 | #undef USE_DICTIONARY |
| 342 | #undef HASH_LEN |
| 343 | #undef BUCKET_SWEEP |
| 344 | #undef BUCKET_BITS |
| 345 | #undef HASHER |
| 346 | |
| 347 | /* fast large window hashers */ |
| 348 | |
| 349 | #define HASHER() HROLLING_FAST |
| 350 | #define CHUNKLEN 32 |
| 351 | #define JUMP 4 |
| 352 | #define NUMBUCKETS 16777216 |
| 353 | #define MASK ((NUMBUCKETS * 64) - 1) |
| 354 | #include "./hash_rolling_inc.h" /* NOLINT(build/include) */ |
| 355 | #undef JUMP |
| 356 | #undef HASHER |
| 357 | |
| 358 | |
| 359 | #define HASHER() HROLLING |
| 360 | #define JUMP 1 |
| 361 | #include "./hash_rolling_inc.h" /* NOLINT(build/include) */ |
| 362 | #undef MASK |
| 363 | #undef NUMBUCKETS |
| 364 | #undef JUMP |
| 365 | #undef CHUNKLEN |
| 366 | #undef HASHER |
| 367 | |
| 368 | #define HASHER() H35 |
| 369 | #define HASHER_A H3 |
| 370 | #define HASHER_B HROLLING_FAST |
| 371 | #include "./hash_composite_inc.h" /* NOLINT(build/include) */ |
| 372 | #undef HASHER_A |
| 373 | #undef HASHER_B |
| 374 | #undef HASHER |
| 375 | |
| 376 | #define HASHER() H55 |
| 377 | #define HASHER_A H54 |
| 378 | #define HASHER_B HROLLING_FAST |
| 379 | #include "./hash_composite_inc.h" /* NOLINT(build/include) */ |
| 380 | #undef HASHER_A |
| 381 | #undef HASHER_B |
| 382 | #undef HASHER |
| 383 | |
| 384 | #define HASHER() H65 |
| 385 | #define HASHER_A H6 |
| 386 | #define HASHER_B HROLLING |
| 387 | #include "./hash_composite_inc.h" /* NOLINT(build/include) */ |
| 388 | #undef HASHER_A |
| 389 | #undef HASHER_B |
| 390 | #undef HASHER |
| 391 | |
| 392 | #undef FN |
| 393 | #undef CAT |
| 394 | #undef EXPAND_CAT |
| 395 | |
| 396 | #define FOR_GENERIC_HASHERS(H) H(2) H(3) H(4) H(5) H(6) H(40) H(41) H(42) H(54)\ |
| 397 | H(35) H(55) H(65) |
| 398 | #define FOR_ALL_HASHERS(H) FOR_GENERIC_HASHERS(H) H(10) |
| 399 | |
| 400 | static BROTLI_INLINE void DestroyHasher( |
| 401 | MemoryManager* m, HasherHandle* handle) { |
| 402 | if (*handle == NULL) return; |
| 403 | BROTLI_FREE(m, *handle); |
| 404 | } |
| 405 | |
| 406 | static BROTLI_INLINE void HasherReset(HasherHandle handle) { |
| 407 | if (handle == NULL) return; |
| 408 | GetHasherCommon(handle)->is_prepared_ = BROTLI_FALSE; |
| 409 | } |
| 410 | |
| 411 | static BROTLI_INLINE size_t HasherSize(const BrotliEncoderParams* params, |
| 412 | BROTLI_BOOL one_shot, const size_t input_size) { |
| 413 | size_t result = sizeof(HasherCommon); |
| 414 | switch (params->hasher.type) { |
| 415 | #define SIZE_(N) \ |
| 416 | case N: \ |
| 417 | result += HashMemAllocInBytesH ## N(params, one_shot, input_size); \ |
| 418 | break; |
| 419 | FOR_ALL_HASHERS(SIZE_) |
| 420 | #undef SIZE_ |
| 421 | default: |
| 422 | break; |
| 423 | } |
| 424 | return result; |
| 425 | } |
| 426 | |
| 427 | static BROTLI_INLINE void HasherSetup(MemoryManager* m, HasherHandle* handle, |
| 428 | BrotliEncoderParams* params, const uint8_t* data, size_t position, |
| 429 | size_t input_size, BROTLI_BOOL is_last) { |
| 430 | HasherHandle self = NULL; |
| 431 | HasherCommon* common = NULL; |
| 432 | BROTLI_BOOL one_shot = (position == 0 && is_last); |
| 433 | if (*handle == NULL) { |
| 434 | size_t alloc_size; |
| 435 | ChooseHasher(params, ¶ms->hasher); |
| 436 | alloc_size = HasherSize(params, one_shot, input_size); |
| 437 | self = BROTLI_ALLOC(m, uint8_t, alloc_size); |
| 438 | if (BROTLI_IS_OOM(m)) return; |
| 439 | *handle = self; |
| 440 | common = GetHasherCommon(self); |
| 441 | common->params = params->hasher; |
| 442 | switch (common->params.type) { |
| 443 | #define INITIALIZE_(N) \ |
| 444 | case N: \ |
| 445 | InitializeH ## N(*handle, params); \ |
| 446 | break; |
| 447 | FOR_ALL_HASHERS(INITIALIZE_); |
| 448 | #undef INITIALIZE_ |
| 449 | default: |
| 450 | break; |
| 451 | } |
| 452 | HasherReset(*handle); |
| 453 | } |
| 454 | |
| 455 | self = *handle; |
| 456 | common = GetHasherCommon(self); |
| 457 | if (!common->is_prepared_) { |
| 458 | switch (common->params.type) { |
| 459 | #define PREPARE_(N) \ |
| 460 | case N: \ |
| 461 | PrepareH ## N(self, one_shot, input_size, data); \ |
| 462 | break; |
| 463 | FOR_ALL_HASHERS(PREPARE_) |
| 464 | #undef PREPARE_ |
| 465 | default: break; |
| 466 | } |
| 467 | if (position == 0) { |
| 468 | common->dict_num_lookups = 0; |
| 469 | common->dict_num_matches = 0; |
| 470 | } |
| 471 | common->is_prepared_ = BROTLI_TRUE; |
| 472 | } |
| 473 | } |
| 474 | |
| 475 | static BROTLI_INLINE void InitOrStitchToPreviousBlock( |
| 476 | MemoryManager* m, HasherHandle* handle, const uint8_t* data, size_t mask, |
| 477 | BrotliEncoderParams* params, size_t position, size_t input_size, |
| 478 | BROTLI_BOOL is_last) { |
| 479 | HasherHandle self; |
| 480 | HasherSetup(m, handle, params, data, position, input_size, is_last); |
| 481 | if (BROTLI_IS_OOM(m)) return; |
| 482 | self = *handle; |
| 483 | switch (GetHasherCommon(self)->params.type) { |
| 484 | #define INIT_(N) \ |
| 485 | case N: \ |
| 486 | StitchToPreviousBlockH ## N(self, input_size, position, data, mask); \ |
| 487 | break; |
| 488 | FOR_ALL_HASHERS(INIT_) |
| 489 | #undef INIT_ |
| 490 | default: break; |
| 491 | } |
| 492 | } |
| 493 | |
| 494 | #if defined(__cplusplus) || defined(c_plusplus) |
| 495 | } /* extern "C" */ |
| 496 | #endif |
| 497 | |
| 498 | #endif /* BROTLI_ENC_HASH_H_ */ |
| 499 | |