| 1 | /* |
| 2 | * Set match_start to the longest match starting at the given string and |
| 3 | * return its length. Matches shorter or equal to prev_length are discarded, |
| 4 | * in which case the result is equal to prev_length and match_start is garbage. |
| 5 | * |
| 6 | * IN assertions: cur_match is the head of the hash chain for the current |
| 7 | * string (strstart) and its distance is <= MAX_DIST, and prev_length >=1 |
| 8 | * OUT assertion: the match length is not greater than s->lookahead |
| 9 | */ |
| 10 | |
| 11 | #include "zbuild.h" |
| 12 | #include "deflate.h" |
| 13 | |
| 14 | #if (defined(UNALIGNED_OK) && MAX_MATCH == 258) |
| 15 | |
/* ARM 32-bit clang/gcc builds perform better, on average, with std2. Both gcc and clang define __GNUC__. */
| 17 | # if defined(__GNUC__) && defined(__arm__) && !defined(__aarch64__) |
| 18 | # define std2_longest_match |
| 19 | /* Only use std3_longest_match for little_endian systems, also avoid using it with |
| 20 | non-gcc compilers since the __builtin_ctzl() function might not be optimized. */ |
| 21 | # elif(defined(__GNUC__) && defined(HAVE_BUILTIN_CTZL) && ((__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ |
| 22 | || defined(__LITTLE_ENDIAN__))) |
| 23 | # define std3_longest_match |
| 24 | # elif(defined(_MSC_VER) && defined(_WIN32)) |
| 25 | # define std3_longest_match |
| 26 | # else |
| 27 | # define std2_longest_match |
| 28 | # endif |
| 29 | |
| 30 | #else |
| 31 | # define std1_longest_match |
| 32 | #endif |
| 33 | |
| 34 | |
| 35 | #if defined(_MSC_VER) && !defined(__clang__) |
| 36 | # if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM64) |
| 37 | # include "fallback_builtins.h" |
| 38 | # endif |
| 39 | #endif |
| 40 | |
| 41 | |
| 42 | |
| 43 | #ifdef std1_longest_match |
| 44 | |
/*
 * Standard (byte-at-a-time) longest_match.
 *
 * Walks the hash chain starting at cur_match and returns the length of
 * the longest match found for the string at s->strstart, capped at
 * s->lookahead.  When a match longer than s->prev_length is found,
 * s->match_start is set to its position; otherwise match_start is
 * garbage (see the file header comment for the full contract).
 */
static inline unsigned longest_match(deflate_state *const s, IPos cur_match) {
    const unsigned wmask = s->w_mask;   /* mask for wrapping chain indices */
    const Pos *prev = s->prev;          /* hash chain links */

    unsigned chain_length;              /* max remaining chain entries to examine */
    IPos limit;                         /* stop matching at positions <= limit */
    unsigned int len, best_len, nice_match;
    unsigned char *scan, *match, *strend, scan_end, scan_end1;

    /*
     * The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple
     * of 16. It is easy to get rid of this optimization if necessary.
     */
    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever" );

    /*
     * Do not waste too much time if we already have a good match:
     * search only a quarter of the chain.
     */
    best_len = s->prev_length ? s->prev_length : 1;
    chain_length = s->max_chain_length;
    if (best_len >= s->good_match)
        chain_length >>= 2;

    /*
     * Do not look for matches beyond the end of the input. This is
     * necessary to make deflate deterministic
     */
    nice_match = (unsigned int)s->nice_match > s->lookahead ? s->lookahead : (unsigned int)s->nice_match;

    /*
     * Stop when cur_match becomes <= limit. To simplify the code,
     * we prevent matches with the string of window index 0
     */
    limit = s->strstart > MAX_DIST(s) ? s->strstart - MAX_DIST(s) : 0;

    scan = s->window + s->strstart;
    strend = s->window + s->strstart + MAX_MATCH;
    /* Bytes used to quickly reject candidates that cannot beat best_len. */
    scan_end1 = scan[best_len-1];
    scan_end = scan[best_len];

    Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead" );
    do {
        /* A chain entry at or beyond strstart cannot be a valid match. */
        if (cur_match >= s->strstart) {
            break;
        }
        match = s->window + cur_match;

        /*
         * Skip to next match if the match length cannot increase
         * or if the match length is less than 2. Note that the checks
         * below for insufficient lookahead only occur occasionally
         * for performance reasons. Therefore uninitialized memory
         * will be accessed and conditional jumps will be made that
         * depend on those values. However the length of the match
         * is limited to the lookahead, so the output of deflate is not
         * affected by the uninitialized values.
         */
        if (match[best_len] != scan_end ||
            match[best_len-1] != scan_end1 ||
            *match != *scan ||
            *++match != scan[1])
            continue;

        /*
         * The check at best_len-1 can be removed because it will
         * be made again later. (This heuristic is not always a win.)
         * It is not necessary to compare scan[2] and match[2] since
         * they are always equal when the other bytes match, given
         * that the hash keys are equal and that HASH_BITS >= 8.
         */
        scan += 2;
        match++;
        Assert(*scan == *match, "match[2]?" );

        /*
         * We check for insufficient lookahead only every 8th
         * comparison; the 256th check will be made at strstart + 258.
         */
        do {
        } while (*++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 scan < strend);

        Assert(scan <= s->window+(unsigned int)(s->window_size-1), "wild scan" );

        len = MAX_MATCH - (int)(strend - scan);
        scan = strend - MAX_MATCH;  /* reset scan back to window + strstart */

        if (len > best_len) {
            s->match_start = cur_match;
            best_len = len;
            if (len >= nice_match)
                break;
            /* Refresh the fast-reject bytes for the new best_len. */
            scan_end1 = scan[best_len-1];
            scan_end = scan[best_len];
        } else {
            /*
             * The probability of finding a match later if we are here
             * is pretty low, so for performance it's best to
             * outright stop here for the lower compression levels
             */
            if (s->level < TRIGGER_LEVEL)
                break;
        }
    } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length);

    /* OUT assertion from the file header: never exceed the lookahead. */
    if ((unsigned int)best_len <= s->lookahead)
        return best_len;
    return s->lookahead;
}
| 161 | #endif |
| 162 | |
| 163 | #ifdef std2_longest_match |
/*
 * UNALIGNED_OK longest_match.
 *
 * Same contract as the standard version, but compares two bytes at a
 * time.  All 16-bit loads go through memcpy into a uint16_t, which is
 * safe at any alignment and avoids strict-aliasing issues.
 */
static inline unsigned longest_match(deflate_state *const s, IPos cur_match) {
    const unsigned wmask = s->w_mask;   /* mask for wrapping chain indices */
    const Pos *prev = s->prev;          /* hash chain links */

    uint16_t scan_start, scan_end;
    unsigned chain_length;              /* max remaining chain entries to examine */
    IPos limit;                         /* stop matching at positions <= limit */
    unsigned int len, best_len, nice_match;
    unsigned char *scan, *strend;

    /*
     * The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple
     * of 16. It is easy to get rid of this optimization if necessary.
     */
    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever" );

    /*
     * Do not waste too much time if we already have a good match:
     * search only a quarter of the chain.
     */
    best_len = s->prev_length ? s->prev_length : 1;
    chain_length = s->max_chain_length;
    if (best_len >= s->good_match)
        chain_length >>= 2;

    /*
     * Do not look for matches beyond the end of the input. This is
     * necessary to make deflate deterministic
     */
    nice_match = (unsigned int)s->nice_match > s->lookahead ? s->lookahead : (unsigned int)s->nice_match;

    /*
     * Stop when cur_match becomes <= limit. To simplify the code,
     * we prevent matches with the string of window index 0
     */
    limit = s->strstart > MAX_DIST(s) ? s->strstart - MAX_DIST(s) : 0;

    scan = s->window + s->strstart;
    strend = s->window + s->strstart + MAX_MATCH - 1;
    /* scan_start: first 2 bytes of the string being matched.
     * scan_end: the 2 bytes a candidate must match at best_len-1 to
     * possibly beat the current best_len. */
    memcpy(&scan_start, scan, sizeof(scan_start));
    memcpy(&scan_end, scan + best_len - 1, sizeof(scan_end));

    Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead" );
    do {
        unsigned char *match;
        /* A chain entry at or beyond strstart cannot be a valid match. */
        if (cur_match >= s->strstart) {
            break;
        }
        match = s->window + cur_match;

        /*
         * Skip to next match if the match length cannot increase
         * or if the match length is less than 2. Note that the checks
         * below for insufficient lookahead only occur occasionally
         * for performance reasons. Therefore uninitialized memory
         * will be accessed and conditional jumps will be made that
         * depend on those values. However the length of the match
         * is limited to the lookahead, so the output of deflate is not
         * affected by the uninitialized values.
         */
        uint16_t val;
        memcpy(&val, match + best_len - 1, sizeof(val));
        if (LIKELY(val != scan_end))
            continue;

        memcpy(&val, match, sizeof(val));
        if (val != scan_start)
            continue;

        /* It is not necessary to compare scan[2] and match[2] since
         * they are always equal when the other bytes match, given that
         * the hash keys are equal and that HASH_BITS >= 8. Compare 2
         * bytes at a time at strstart+3, +5, ... up to strstart+257.
         * We check for insufficient lookahead only every 4th
         * comparison; the 128th check will be made at strstart+257.
         * If MAX_MATCH-2 is not a multiple of 8, it is necessary to
         * put more guard bytes at the end of the window, or to check
         * more often for insufficient lookahead.
         */
        Assert(scan[2] == match[2], "scan[2]?" );
        scan++;
        match++;

        /* Unrolled x4: compare 2 bytes per step, 8 bytes per iteration. */
        do {
            uint16_t mval, sval;

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);
        } while (scan < strend);

        /*
         * Here, scan <= window + strstart + 257
         */
        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan" );
        /* The 2-byte loop may stop one byte short of the true match end;
         * pick up the odd trailing byte if it also matches. */
        if (*scan == *match)
            scan++;

        len = (MAX_MATCH -1) - (int)(strend-scan);
        scan = strend - (MAX_MATCH-1);  /* reset scan back to window + strstart */

        if (len > best_len) {
            s->match_start = cur_match;
            best_len = len;
            if (len >= nice_match)
                break;
            /* Refresh the fast-reject bytes for the new best_len. */
            memcpy(&scan_end, scan + best_len - 1, sizeof(scan_end));
        } else {
            /*
             * The probability of finding a match later if we are here
             * is pretty low, so for performance it's best to
             * outright stop here for the lower compression levels
             */
            if (s->level < TRIGGER_LEVEL)
                break;
        }
    } while (--chain_length && (cur_match = prev[cur_match & wmask]) > limit);

    /* OUT assertion from the file header: never exceed the lookahead. */
    if ((unsigned)best_len <= s->lookahead)
        return best_len;
    return s->lookahead;
}
| 313 | #endif |
| 314 | |
| 315 | #ifdef std3_longest_match |
| 316 | /* longest_match() with minor change to improve performance (in terms of |
| 317 | * execution time). |
| 318 | * |
| 319 | * The pristine longest_match() function is sketched below (strip the |
| 320 | * then-clause of the "#ifdef UNALIGNED_OK"-directive) |
| 321 | * |
| 322 | * ------------------------------------------------------------ |
| 323 | * unsigned int longest_match(...) { |
| 324 | * ... |
| 325 | * do { |
| 326 | * match = s->window + cur_match; //s0 |
| 327 | * if (*(ushf*)(match+best_len-1) != scan_end || //s1 |
| 328 | * *(ushf*)match != scan_start) continue; //s2 |
| 329 | * ... |
| 330 | * |
| 331 | * do { |
| 332 | * } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) && |
| 333 | * *(ushf*)(scan+=2) == *(ushf*)(match+=2) && |
| 334 | * *(ushf*)(scan+=2) == *(ushf*)(match+=2) && |
| 335 | * *(ushf*)(scan+=2) == *(ushf*)(match+=2) && |
| 336 | * scan < strend); //s3 |
| 337 | * |
| 338 | * ... |
| 339 | * } while(cond); //s4 |
| 340 | * |
| 341 | * ------------------------------------------------------------- |
| 342 | * |
 * The changes include:
| 344 | * |
 * 1) The hottest statements of the function are: s0, s1 and s4. Pull them
| 346 | * together to form a new loop. The benefit is two-fold: |
| 347 | * |
| 348 | * o. Ease the compiler to yield good code layout: the conditional-branch |
| 349 | * corresponding to s1 and its biased target s4 become very close (likely, |
| 350 | * fit in the same cache-line), hence improving instruction-fetching |
| 351 | * efficiency. |
| 352 | * |
| 353 | * o. Ease the compiler to promote "s->window" into register. "s->window" |
| 354 | * is loop-invariant; it is supposed to be promoted into register and keep |
| 355 | * the value throughout the entire loop. However, there are many such |
| 356 | * loop-invariant, and x86-family has small register file; "s->window" is |
| 357 | * likely to be chosen as register-allocation victim such that its value |
| 358 | * is reloaded from memory in every single iteration. By forming a new loop, |
| 359 | * "s->window" is loop-invariant of that newly created tight loop. It is |
| 360 | * lot easier for compiler to promote this quantity to register and keep |
| 361 | * its value throughout the entire small loop. |
| 362 | * |
 * 2) Transform s3 such that it examines sizeof(long)-byte-match at a time.
| 364 | * This is done by: |
| 365 | * ------------------------------------------------ |
| 366 | * v1 = load from "scan" by sizeof(long) bytes |
 *      v2 = load from "match" by sizeof(long) bytes
| 368 | * v3 = v1 xor v2 |
| 369 | * match-bit = little-endian-machine(yes-for-x86) ? |
| 370 | * count-trailing-zero(v3) : |
| 371 | * count-leading-zero(v3); |
| 372 | * |
| 373 | * match-byte = match-bit/8 |
| 374 | * |
| 375 | * "scan" and "match" advance if necessary |
| 376 | * ------------------------------------------------- |
| 377 | */ |
| 378 | |
/*
 * Word-at-a-time longest_match (see the large comment above for the
 * rationale behind the restructured loops).  Same contract as the
 * standard version.  Requires a working __builtin_ctzl (or the MSVC
 * fallback from fallback_builtins.h) and a little-endian target.
 */
static inline unsigned longest_match(deflate_state *const s, IPos cur_match) {
    unsigned int strstart = s->strstart;
    unsigned chain_length = s->max_chain_length;/* max hash chain length */
    unsigned char *window = s->window;
    register unsigned char *scan = window + strstart; /* current string */
    register unsigned char *match;                    /* matched string */
    register unsigned int len;                        /* length of current match */
    unsigned int best_len = s->prev_length ? s->prev_length : 1; /* best match length so far */
    unsigned int nice_match = s->nice_match;          /* stop if match long enough */
    IPos limit = strstart > (IPos)MAX_DIST(s) ?
        strstart - (IPos)MAX_DIST(s) : NIL;
    /* Stop when cur_match becomes <= limit. To simplify the code,
     * we prevent matches with the string of window index 0.
     */
    Pos *prev = s->prev;
    unsigned int wmask = s->w_mask;

    register unsigned char *strend = window + strstart + MAX_MATCH;

    /* scan_start: first 2 bytes of the string being matched.
     * scan_end: the 2 bytes a candidate must match at best_len-1 to
     * possibly beat the current best_len.  Loaded via memcpy so the
     * reads are alignment- and aliasing-safe. */
    uint16_t scan_start, scan_end;

    memcpy(&scan_start, scan, sizeof(scan_start));
    memcpy(&scan_end, scan+best_len-1, sizeof(scan_end));

    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
     * It is easy to get rid of this optimization if necessary.
     */
    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever" );

    /* Do not waste too much time if we already have a good match: */
    if (s->prev_length >= s->good_match) {
        chain_length >>= 2;
    }
    /* Do not look for matches beyond the end of the input. This is necessary
     * to make deflate deterministic.
     */
    if ((unsigned int)nice_match > s->lookahead) nice_match = s->lookahead;

    Assert((unsigned long)strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead" );

    do {
        /* A chain entry at or beyond strstart cannot be a valid match. */
        if (cur_match >= strstart) {
            break;
        }

        /* Skip to next match if the match length cannot increase
         * or if the match length is less than 2. Note that the checks below
         * for insufficient lookahead only occur occasionally for performance
         * reasons. Therefore uninitialized memory will be accessed, and
         * conditional jumps will be made that depend on those values.
         * However the length of the match is limited to the lookahead, so
         * the output of deflate is not affected by the uninitialized values.
         */
        int cont = 1;
        /* Tight reject loop: advance down the chain until a candidate
         * passes both quick 2-byte screens, the chain is exhausted, or
         * chain_length runs out (cont == 0 means stop searching). */
        do {
            match = window + cur_match;
            if (LIKELY(memcmp(match+best_len-1, &scan_end, sizeof(scan_end)) != 0
                || memcmp(match, &scan_start, sizeof(scan_start)) != 0)) {
                if ((cur_match = prev[cur_match & wmask]) > limit
                    && --chain_length != 0) {
                    continue;
                } else {
                    cont = 0;
                }
            }
            break;
        } while (1);

        if (!cont)
            break;

        /* It is not necessary to compare scan[2] and match[2] since they are
         * always equal when the other bytes match, given that the hash keys
         * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
         * strstart+3, +5, ... up to strstart+257. We check for insufficient
         * lookahead only every 4th comparison; the 128th check will be made
         * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
         * necessary to put more guard bytes at the end of the window, or
         * to check more often for insufficient lookahead.
         */
        scan += 2, match+=2;
        Assert(*scan == *match, "match[2]?" );
        /* Compare sizeof(long) bytes per iteration; on a mismatch, the
         * count of trailing zero bits in the XOR locates the first
         * differing byte (little-endian byte order assumed). */
        do {
            unsigned long sv, mv, xor;

            memcpy(&sv, scan, sizeof(sv));
            memcpy(&mv, match, sizeof(mv));

            xor = sv ^ mv;

            if (xor) {
                int match_byte = __builtin_ctzl(xor) / 8;
                scan += match_byte;
                break;
            } else {
                scan += sizeof(unsigned long);
                match += sizeof(unsigned long);
            }
        } while (scan < strend);

        /* The word loop can overshoot strend by up to a word; clamp. */
        if (scan > strend)
            scan = strend;

        Assert(scan <= window + (unsigned)(s->window_size-1), "wild scan" );

        len = MAX_MATCH - (int)(strend - scan);
        scan = strend - MAX_MATCH;  /* reset scan back to window + strstart */

        if (len > best_len) {
            s->match_start = cur_match;
            best_len = len;
            if (len >= nice_match)
                break;
            /* Refresh the fast-reject bytes for the new best_len. */
            memcpy(&scan_end, scan+best_len-1, sizeof(scan_end));
        } else {
            /*
             * The probability of finding a match later if we are here
             * is pretty low, so for performance it's best to
             * outright stop here for the lower compression levels
             */
            if (s->level < TRIGGER_LEVEL)
                break;
        }
    } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0);

    /* OUT assertion from the file header: never exceed the lookahead. */
    if ((unsigned int)best_len <= s->lookahead)
        return (unsigned int)best_len;
    return s->lookahead;
}
| 508 | #endif |
| 509 | |