1 | /* |
2 | * Set match_start to the longest match starting at the given string and |
 * return its length. Matches shorter than or equal to prev_length are discarded,
 * in which case the result is equal to prev_length and match_start is garbage.
 *
 * IN assertions: cur_match is the head of the hash chain for the current
 * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
8 | * OUT assertion: the match length is not greater than s->lookahead |
9 | */ |
10 | |
11 | #include "zbuild.h" |
12 | #include "deflate.h" |
13 | |
14 | #if (defined(UNALIGNED_OK) && MAX_MATCH == 258) |
15 | |
/* ARM 32-bit clang/gcc builds perform better, on average, with std2. Both gcc and clang define __GNUC__. */
17 | # if defined(__GNUC__) && defined(__arm__) && !defined(__aarch64__) |
18 | # define std2_longest_match |
19 | /* Only use std3_longest_match for little_endian systems, also avoid using it with |
20 | non-gcc compilers since the __builtin_ctzl() function might not be optimized. */ |
21 | # elif(defined(__GNUC__) && defined(HAVE_BUILTIN_CTZL) && ((__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ |
22 | || defined(__LITTLE_ENDIAN__))) |
23 | # define std3_longest_match |
24 | # elif(defined(_MSC_VER) && defined(_WIN32)) |
25 | # define std3_longest_match |
26 | # else |
27 | # define std2_longest_match |
28 | # endif |
29 | |
30 | #else |
31 | # define std1_longest_match |
32 | #endif |
33 | |
34 | |
35 | #if defined(_MSC_VER) && !defined(__clang__) |
36 | # if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM64) |
37 | # include "fallback_builtins.h" |
38 | # endif |
39 | #endif |
40 | |
41 | |
42 | |
43 | #ifdef std1_longest_match |
44 | |
45 | /* |
46 | * Standard longest_match |
47 | * |
48 | */ |
/*
 * Standard longest_match: byte-at-a-time comparison, used when unaligned
 * multi-byte loads are not known to be safe/fast (std1 variant).
 *
 * Returns the length of the longest match found, capped at s->lookahead.
 * Updates s->match_start only when a match longer than the incoming
 * s->prev_length is found.
 */
static inline unsigned longest_match(deflate_state *const s, IPos cur_match) {
    const unsigned wmask = s->w_mask;    /* mask to wrap hash-chain indices */
    const Pos *prev = s->prev;           /* chain of previous positions with the same hash */

    unsigned chain_length;               /* max remaining chain links to follow */
    IPos limit;                          /* stop when cur_match falls to or below this */
    unsigned int len, best_len, nice_match;
    unsigned char *scan, *match, *strend, scan_end, scan_end1;

    /*
     * The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple
     * of 16. It is easy to get rid of this optimization if necessary.
     */
    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever" );

    /*
     * Do not waste too much time if we already have a good match:
     * search a quarter as many chain links once past good_match.
     */
    best_len = s->prev_length ? s->prev_length : 1;
    chain_length = s->max_chain_length;
    if (best_len >= s->good_match)
        chain_length >>= 2;

    /*
     * Do not look for matches beyond the end of the input. This is
     * necessary to make deflate deterministic.
     */
    nice_match = (unsigned int)s->nice_match > s->lookahead ? s->lookahead : (unsigned int)s->nice_match;

    /*
     * Stop when cur_match becomes <= limit. To simplify the code,
     * we prevent matches with the string of window index 0.
     */
    limit = s->strstart > MAX_DIST(s) ? s->strstart - MAX_DIST(s) : 0;

    scan = s->window + s->strstart;
    strend = s->window + s->strstart + MAX_MATCH;
    /* Cache the two bytes a candidate must match at positions best_len-1
     * and best_len before it can possibly beat the current best match. */
    scan_end1 = scan[best_len-1];
    scan_end = scan[best_len];

    Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead" );
    do {
        /* Never match against the current string or anything after it. */
        if (cur_match >= s->strstart) {
            break;
        }
        match = s->window + cur_match;

        /*
         * Skip to next match if the match length cannot increase
         * or if the match length is less than 2. Note that the checks
         * below for insufficient lookahead only occur occasionally
         * for performance reasons. Therefore uninitialized memory
         * will be accessed and conditional jumps will be made that
         * depend on those values. However the length of the match
         * is limited to the lookahead, so the output of deflate is not
         * affected by the uninitialized values.
         */
        if (match[best_len] != scan_end ||
            match[best_len-1] != scan_end1 ||
            *match != *scan ||
            *++match != scan[1])
            continue;

        /*
         * The check at best_len-1 can be removed because it will
         * be made again later. (This heuristic is not always a win.)
         * It is not necessary to compare scan[2] and match[2] since
         * they are always equal when the other bytes match, given
         * that the hash keys are equal and that HASH_BITS >= 8.
         */
        scan += 2;
        match++;
        Assert(*scan == *match, "match[2]?" );

        /*
         * We check for insufficient lookahead only every 8th
         * comparison; the 256th check will be made at strstart + 258.
         * (The window guard bytes make the overrun reads safe.)
         */
        do {
        } while (*++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 scan < strend);

        Assert(scan <= s->window+(unsigned int)(s->window_size-1), "wild scan" );

        /* Match length = how far scan advanced; reset scan for next candidate. */
        len = MAX_MATCH - (int)(strend - scan);
        scan = strend - MAX_MATCH;

        if (len > best_len) {
            s->match_start = cur_match;
            best_len = len;
            if (len >= nice_match)
                break;
            /* Re-cache the end bytes for the new, longer best match. */
            scan_end1 = scan[best_len-1];
            scan_end = scan[best_len];
        } else {
            /*
             * The probability of finding a longer match if we continue
             * from here is pretty low, so for performance it's best to
             * outright stop here for the lower compression levels.
             */
            if (s->level < TRIGGER_LEVEL)
                break;
        }
    } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length);

    /* OUT assertion: never report a match longer than the lookahead. */
    if ((unsigned int)best_len <= s->lookahead)
        return best_len;
    return s->lookahead;
}
161 | #endif |
162 | |
163 | #ifdef std2_longest_match |
164 | /* |
165 | * UNALIGNED_OK longest_match |
166 | * |
167 | */ |
/*
 * UNALIGNED_OK longest_match (std2 variant): compares two bytes at a
 * time using memcpy loads, which compile to single unaligned loads on
 * targets that allow them while remaining safe on strict-alignment CPUs.
 *
 * Returns the length of the longest match found, capped at s->lookahead.
 * Updates s->match_start only when a match longer than the incoming
 * s->prev_length is found.
 */
static inline unsigned longest_match(deflate_state *const s, IPos cur_match) {
    const unsigned wmask = s->w_mask;    /* mask to wrap hash-chain indices */
    const Pos *prev = s->prev;           /* chain of previous positions with the same hash */

    uint16_t scan_start, scan_end;       /* 2-byte probes: match start and best_len tail */
    unsigned chain_length;               /* max remaining chain links to follow */
    IPos limit;                          /* stop when cur_match falls to or below this */
    unsigned int len, best_len, nice_match;
    unsigned char *scan, *strend;

    /*
     * The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple
     * of 16. It is easy to get rid of this optimization if necessary.
     */
    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever" );

    /*
     * Do not waste too much time if we already have a good match:
     * search a quarter as many chain links once past good_match.
     */
    best_len = s->prev_length ? s->prev_length : 1;
    chain_length = s->max_chain_length;
    if (best_len >= s->good_match)
        chain_length >>= 2;

    /*
     * Do not look for matches beyond the end of the input. This is
     * necessary to make deflate deterministic.
     */
    nice_match = (unsigned int)s->nice_match > s->lookahead ? s->lookahead : (unsigned int)s->nice_match;

    /*
     * Stop when cur_match becomes <= limit. To simplify the code,
     * we prevent matches with the string of window index 0.
     */
    limit = s->strstart > MAX_DIST(s) ? s->strstart - MAX_DIST(s) : 0;

    scan = s->window + s->strstart;
    /* strend is MAX_MATCH-1 past strstart because comparisons advance in
     * 2-byte steps; a final 1-byte check below handles odd lengths. */
    strend = s->window + s->strstart + MAX_MATCH - 1;
    memcpy(&scan_start, scan, sizeof(scan_start));
    memcpy(&scan_end, scan + best_len - 1, sizeof(scan_end));

    Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead" );
    do {
        unsigned char *match;
        /* Never match against the current string or anything after it. */
        if (cur_match >= s->strstart) {
            break;
        }
        match = s->window + cur_match;

        /*
         * Skip to next match if the match length cannot increase
         * or if the match length is less than 2. Note that the checks
         * below for insufficient lookahead only occur occasionally
         * for performance reasons. Therefore uninitialized memory
         * will be accessed and conditional jumps will be made that
         * depend on those values. However the length of the match
         * is limited to the lookahead, so the output of deflate is not
         * affected by the uninitialized values.
         */
        uint16_t val;
        /* Cheap rejection: candidate must match the 2 bytes straddling
         * best_len before it can possibly beat the current best. */
        memcpy(&val, match + best_len - 1, sizeof(val));
        if (LIKELY(val != scan_end))
            continue;

        memcpy(&val, match, sizeof(val));
        if (val != scan_start)
            continue;

        /* It is not necessary to compare scan[2] and match[2] since
         * they are always equal when the other bytes match, given that
         * the hash keys are equal and that HASH_BITS >= 8. Compare 2
         * bytes at a time at strstart+3, +5, ... up to strstart+257.
         * We check for insufficient lookahead only every 4th
         * comparison; the 128th check will be made at strstart+257.
         * If MAX_MATCH-2 is not a multiple of 8, it is necessary to
         * put more guard bytes at the end of the window, or to check
         * more often for insufficient lookahead.
         */
        Assert(scan[2] == match[2], "scan[2]?" );
        scan++;
        match++;

        /* Unrolled x4: each step compares one 16-bit chunk via memcpy. */
        do {
            uint16_t mval, sval;

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);
        } while (scan < strend);

        /*
         * Here, scan <= window + strstart + 257
         */
        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan" );
        /* 2-byte compare failed (or ran out); the first of the pair may
         * still match, accounting for odd-length matches. */
        if (*scan == *match)
            scan++;

        /* Match length = how far scan advanced; reset scan for next candidate. */
        len = (MAX_MATCH -1) - (int)(strend-scan);
        scan = strend - (MAX_MATCH-1);

        if (len > best_len) {
            s->match_start = cur_match;
            best_len = len;
            if (len >= nice_match)
                break;
            /* Re-cache the tail probe for the new, longer best match. */
            memcpy(&scan_end, scan + best_len - 1, sizeof(scan_end));
        } else {
            /*
             * The probability of finding a longer match if we continue
             * from here is pretty low, so for performance it's best to
             * outright stop here for the lower compression levels.
             */
            if (s->level < TRIGGER_LEVEL)
                break;
        }
    } while (--chain_length && (cur_match = prev[cur_match & wmask]) > limit);

    /* OUT assertion: never report a match longer than the lookahead. */
    if ((unsigned)best_len <= s->lookahead)
        return best_len;
    return s->lookahead;
}
313 | #endif |
314 | |
315 | #ifdef std3_longest_match |
316 | /* longest_match() with minor change to improve performance (in terms of |
317 | * execution time). |
318 | * |
319 | * The pristine longest_match() function is sketched below (strip the |
320 | * then-clause of the "#ifdef UNALIGNED_OK"-directive) |
321 | * |
322 | * ------------------------------------------------------------ |
323 | * unsigned int longest_match(...) { |
324 | * ... |
325 | * do { |
326 | * match = s->window + cur_match; //s0 |
327 | * if (*(ushf*)(match+best_len-1) != scan_end || //s1 |
328 | * *(ushf*)match != scan_start) continue; //s2 |
329 | * ... |
330 | * |
331 | * do { |
332 | * } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) && |
333 | * *(ushf*)(scan+=2) == *(ushf*)(match+=2) && |
334 | * *(ushf*)(scan+=2) == *(ushf*)(match+=2) && |
335 | * *(ushf*)(scan+=2) == *(ushf*)(match+=2) && |
336 | * scan < strend); //s3 |
337 | * |
338 | * ... |
339 | * } while(cond); //s4 |
340 | * |
341 | * ------------------------------------------------------------- |
342 | * |
343 | * The change include: |
344 | * |
345 | * 1) The hottest statements of the function is: s0, s1 and s4. Pull them |
346 | * together to form a new loop. The benefit is two-fold: |
347 | * |
348 | * o. Ease the compiler to yield good code layout: the conditional-branch |
349 | * corresponding to s1 and its biased target s4 become very close (likely, |
350 | * fit in the same cache-line), hence improving instruction-fetching |
351 | * efficiency. |
352 | * |
353 | * o. Ease the compiler to promote "s->window" into register. "s->window" |
354 | * is loop-invariant; it is supposed to be promoted into register and keep |
355 | * the value throughout the entire loop. However, there are many such |
356 | * loop-invariant, and x86-family has small register file; "s->window" is |
357 | * likely to be chosen as register-allocation victim such that its value |
358 | * is reloaded from memory in every single iteration. By forming a new loop, |
359 | * "s->window" is loop-invariant of that newly created tight loop. It is |
360 | * lot easier for compiler to promote this quantity to register and keep |
361 | * its value throughout the entire small loop. |
362 | * |
 * 2) Transform s3 such that it examines sizeof(long)-byte-match at a time.
 * This is done by:
 * ------------------------------------------------
 * v1 = load from "scan" by sizeof(long) bytes
 * v2 = load from "match" by sizeof(long) bytes
368 | * v3 = v1 xor v2 |
369 | * match-bit = little-endian-machine(yes-for-x86) ? |
370 | * count-trailing-zero(v3) : |
371 | * count-leading-zero(v3); |
372 | * |
373 | * match-byte = match-bit/8 |
374 | * |
375 | * "scan" and "match" advance if necessary |
376 | * ------------------------------------------------- |
377 | */ |
378 | |
/*
 * std3 longest_match: word-at-a-time comparison. Loads sizeof(long)
 * bytes from scan and match via memcpy, XORs them, and uses
 * __builtin_ctzl on the difference to find the first mismatching byte
 * (little-endian only; see the selection logic at the top of the file).
 *
 * Returns the length of the longest match found, capped at s->lookahead.
 * Updates s->match_start only when a match longer than the incoming
 * s->prev_length is found.
 */
static inline unsigned longest_match(deflate_state *const s, IPos cur_match) {
    unsigned int strstart = s->strstart;
    unsigned chain_length = s->max_chain_length;/* max hash chain length */
    unsigned char *window = s->window;
    register unsigned char *scan = window + strstart; /* current string */
    register unsigned char *match;                    /* matched string */
    register unsigned int len;                        /* length of current match */
    unsigned int best_len = s->prev_length ? s->prev_length : 1; /* best match length so far */
    unsigned int nice_match = s->nice_match;          /* stop if match long enough */
    IPos limit = strstart > (IPos)MAX_DIST(s) ?
        strstart - (IPos)MAX_DIST(s) : NIL;
    /* Stop when cur_match becomes <= limit. To simplify the code,
     * we prevent matches with the string of window index 0.
     */
    Pos *prev = s->prev;            /* chain of previous positions with the same hash */
    unsigned int wmask = s->w_mask; /* mask to wrap hash-chain indices */

    register unsigned char *strend = window + strstart + MAX_MATCH;

    /* 2-byte probes: match start and the bytes straddling best_len. */
    uint16_t scan_start, scan_end;

    memcpy(&scan_start, scan, sizeof(scan_start));
    memcpy(&scan_end, scan+best_len-1, sizeof(scan_end));

    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
     * It is easy to get rid of this optimization if necessary.
     */
    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever" );

    /* Do not waste too much time if we already have a good match:
     * search a quarter as many chain links once past good_match. */
    if (s->prev_length >= s->good_match) {
        chain_length >>= 2;
    }
    /* Do not look for matches beyond the end of the input. This is necessary
     * to make deflate deterministic.
     */
    if ((unsigned int)nice_match > s->lookahead) nice_match = s->lookahead;

    Assert((unsigned long)strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead" );

    do {
        /* Never match against the current string or anything after it. */
        if (cur_match >= strstart) {
            break;
        }

        /* Skip to next match if the match length cannot increase
         * or if the match length is less than 2. Note that the checks below
         * for insufficient lookahead only occur occasionally for performance
         * reasons. Therefore uninitialized memory will be accessed, and
         * conditional jumps will be made that depend on those values.
         * However the length of the match is limited to the lookahead, so
         * the output of deflate is not affected by the uninitialized values.
         */
        /* Tight inner skip-loop (statements s0/s1/s4 of the sketch above):
         * walk the chain here so the hot rejection test and the chain
         * advance stay in one small loop. cont==0 means the chain is
         * exhausted and the outer loop must stop. */
        int cont = 1;
        do {
            match = window + cur_match;
            if (LIKELY(memcmp(match+best_len-1, &scan_end, sizeof(scan_end)) != 0
                || memcmp(match, &scan_start, sizeof(scan_start)) != 0)) {
                if ((cur_match = prev[cur_match & wmask]) > limit
                    && --chain_length != 0) {
                    continue;
                } else {
                    cont = 0;
                }
            }
            break;
        } while (1);

        if (!cont)
            break;

        /* It is not necessary to compare scan[2] and match[2] since they are
         * always equal when the other bytes match, given that the hash keys
         * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
         * strstart+3, +5, ... up to strstart+257. We check for insufficient
         * lookahead only every 4th comparison; the 128th check will be made
         * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
         * necessary to put more guard bytes at the end of the window, or
         * to check more often for insufficient lookahead.
         */
        scan += 2, match+=2;
        Assert(*scan == *match, "match[2]?" );
        do {
            unsigned long sv, mv, xor;

            memcpy(&sv, scan, sizeof(sv));
            memcpy(&mv, match, sizeof(mv));

            xor = sv ^ mv;

            if (xor) {
                /* First set bit of the XOR marks the first differing byte
                 * (little-endian: lowest byte = lowest address). */
                int match_byte = __builtin_ctzl(xor) / 8;
                scan += match_byte;
                break;
            } else {
                scan += sizeof(unsigned long);
                match += sizeof(unsigned long);
            }
        } while (scan < strend);

        /* Word steps may overshoot strend; clamp so len <= MAX_MATCH. */
        if (scan > strend)
            scan = strend;

        Assert(scan <= window + (unsigned)(s->window_size-1), "wild scan" );

        /* Match length = how far scan advanced; reset scan for next candidate. */
        len = MAX_MATCH - (int)(strend - scan);
        scan = strend - MAX_MATCH;

        if (len > best_len) {
            s->match_start = cur_match;
            best_len = len;
            if (len >= nice_match)
                break;
            /* Re-cache the tail probe for the new, longer best match. */
            memcpy(&scan_end, scan+best_len-1, sizeof(scan_end));
        } else {
            /*
             * The probability of finding a longer match if we continue
             * from here is pretty low, so for performance it's best to
             * outright stop here for the lower compression levels.
             */
            if (s->level < TRIGGER_LEVEL)
                break;
        }
    } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0);

    /* OUT assertion: never report a match longer than the lookahead. */
    if ((unsigned int)best_len <= s->lookahead)
        return (unsigned int)best_len;
    return s->lookahead;
}
508 | #endif |
509 | |