1/*
2 * Set match_start to the longest match starting at the given string and
3 * return its length. Matches shorter or equal to prev_length are discarded,
4 * in which case the result is equal to prev_length and match_start is garbage.
5 *
6 * IN assertions: cur_match is the head of the hash chain for the current
7 * string (strstart) and its distance is <= MAX_DIST, and prev_length >=1
8 * OUT assertion: the match length is not greater than s->lookahead
9 */
10
11#include "zbuild.h"
12#include "deflate.h"
13
14#if (defined(UNALIGNED_OK) && MAX_MATCH == 258)
15
 /* ARM 32-bit clang/gcc builds perform better, on average, with std2. Both gcc and clang define __GNUC__. */
17# if defined(__GNUC__) && defined(__arm__) && !defined(__aarch64__)
18# define std2_longest_match
19 /* Only use std3_longest_match for little_endian systems, also avoid using it with
20 non-gcc compilers since the __builtin_ctzl() function might not be optimized. */
21# elif(defined(__GNUC__) && defined(HAVE_BUILTIN_CTZL) && ((__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \
22 || defined(__LITTLE_ENDIAN__)))
23# define std3_longest_match
24# elif(defined(_MSC_VER) && defined(_WIN32))
25# define std3_longest_match
26# else
27# define std2_longest_match
28# endif
29
30#else
31# define std1_longest_match
32#endif
33
34
35#if defined(_MSC_VER) && !defined(__clang__)
36# if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM64)
37# include "fallback_builtins.h"
38# endif
39#endif
40
41
42
43#ifdef std1_longest_match
44
45/*
46 * Standard longest_match
47 *
48 */
/*
 * std1 variant: portable byte-at-a-time longest_match.
 *
 * Walks the hash chain starting at cur_match looking for the longest
 * string in the window that matches the data at s->strstart.  When a
 * match longer than the previous best is found, s->match_start is set
 * to its window position.  The returned length is clamped to
 * s->lookahead so callers never consume bytes beyond the input.
 */
static inline unsigned longest_match(deflate_state *const s, IPos cur_match) {
    const unsigned wmask = s->w_mask;
    const Pos *prev = s->prev;

    unsigned chain_length;   /* max remaining hash-chain links to follow */
    IPos limit;              /* chain positions <= limit are too far back */
    unsigned int len, best_len, nice_match;
    unsigned char *scan, *match, *strend, scan_end, scan_end1;

    /*
     * The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple
     * of 16. It is easy to get rid of this optimization if necessary.
     */
    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");

    /*
     * Do not waste too much time if we already have a good match
     * (prev_length may be 0 on the first call; treat that as 1).
     */
    best_len = s->prev_length ? s->prev_length : 1;
    chain_length = s->max_chain_length;
    if (best_len >= s->good_match)
        chain_length >>= 2;

    /*
     * Do not look for matches beyond the end of the input. This is
     * necessary to make deflate deterministic
     */
    nice_match = (unsigned int)s->nice_match > s->lookahead ? s->lookahead : (unsigned int)s->nice_match;

    /*
     * Stop when cur_match becomes <= limit. To simplify the code,
     * we prevent matches with the string of window index 0
     */
    limit = s->strstart > MAX_DIST(s) ? s->strstart - MAX_DIST(s) : 0;

    scan = s->window + s->strstart;
    strend = s->window + s->strstart + MAX_MATCH;
    /* Cache the two bytes any match longer than best_len must share with
     * scan, so most chain entries are rejected with just two loads. */
    scan_end1 = scan[best_len-1];
    scan_end = scan[best_len];

    Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead");
    do {
        /* Never match against the current or a future position. */
        if (cur_match >= s->strstart) {
            break;
        }
        match = s->window + cur_match;

        /*
         * Skip to next match if the match length cannot increase
         * or if the match length is less than 2. Note that the checks
         * below for insufficient lookahead only occur occasionally
         * for performance reasons. Therefore uninitialized memory
         * will be accessed and conditional jumps will be made that
         * depend on those values. However the length of the match
         * is limited to the lookahead, so the output of deflate is not
         * affected by the uninitialized values.
         */
        if (match[best_len] != scan_end ||
            match[best_len-1] != scan_end1 ||
            *match != *scan ||
            *++match != scan[1])
            continue;

        /*
         * The check at best_len-1 can be removed because it will
         * be made again later. (This heuristic is not always a win.)
         * It is not necessary to compare scan[2] and match[2] since
         * they are always equal when the other bytes match, given
         * that the hash keys are equal and that HASH_BITS >= 8.
         */
        scan += 2;
        match++;
        Assert(*scan == *match, "match[2]?");

        /*
         * We check for insufficient lookahead only every 8th
         * comparison; the 256th check will be made at strstart + 258.
         */
        do {
        } while (*++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 scan < strend);

        Assert(scan <= s->window+(unsigned int)(s->window_size-1), "wild scan");

        len = MAX_MATCH - (int)(strend - scan);
        scan = strend - MAX_MATCH;  /* reset scan to strstart for the next candidate */

        if (len > best_len) {
            s->match_start = cur_match;
            best_len = len;
            if (len >= nice_match)
                break;
            scan_end1 = scan[best_len-1];
            scan_end = scan[best_len];
        } else {
            /*
             * The probability of finding a longer match later is
             * pretty low, so for performance it's best to
             * outright stop here for the lower compression levels
             */
            if (s->level < TRIGGER_LEVEL)
                break;
        }
    } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length);

    /* Clamp to the available lookahead (OUT assertion in file header). */
    if ((unsigned int)best_len <= s->lookahead)
        return best_len;
    return s->lookahead;
}
161#endif
162
163#ifdef std2_longest_match
164/*
165 * UNALIGNED_OK longest_match
166 *
167 */
/*
 * std2 variant: UNALIGNED_OK longest_match.
 *
 * Same contract as the std1 variant, but compares 16 bits at a time
 * using memcpy loads (safe on any alignment; compilers lower these to
 * plain unaligned loads on targets that support them).  Walks the hash
 * chain from cur_match, updates s->match_start on improvement, and
 * returns the best length clamped to s->lookahead.
 */
static inline unsigned longest_match(deflate_state *const s, IPos cur_match) {
    const unsigned wmask = s->w_mask;
    const Pos *prev = s->prev;

    uint16_t scan_start, scan_end;   /* 2-byte prefix / tail fingerprints of scan */
    unsigned chain_length;           /* max remaining hash-chain links to follow */
    IPos limit;                      /* chain positions <= limit are too far back */
    unsigned int len, best_len, nice_match;
    unsigned char *scan, *strend;

    /*
     * The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple
     * of 16. It is easy to get rid of this optimization if necessary.
     */
    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");

    /*
     * Do not waste too much time if we already have a good match
     * (prev_length may be 0 on the first call; treat that as 1).
     */
    best_len = s->prev_length ? s->prev_length : 1;
    chain_length = s->max_chain_length;
    if (best_len >= s->good_match)
        chain_length >>= 2;

    /*
     * Do not look for matches beyond the end of the input. This is
     * necessary to make deflate deterministic
     */
    nice_match = (unsigned int)s->nice_match > s->lookahead ? s->lookahead : (unsigned int)s->nice_match;

    /*
     * Stop when cur_match becomes <= limit. To simplify the code,
     * we prevent matches with the string of window index 0
     */
    limit = s->strstart > MAX_DIST(s) ? s->strstart - MAX_DIST(s) : 0;

    scan = s->window + s->strstart;
    /* strend is one byte short of MAX_MATCH: the loop compares in 2-byte
     * steps from an odd offset; a possible final odd byte is handled after
     * the loop. */
    strend = s->window + s->strstart + MAX_MATCH - 1;
    memcpy(&scan_start, scan, sizeof(scan_start));
    memcpy(&scan_end, scan + best_len - 1, sizeof(scan_end));

    Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead");
    do {
        unsigned char *match;
        /* Never match against the current or a future position. */
        if (cur_match >= s->strstart) {
            break;
        }
        match = s->window + cur_match;

        /*
         * Skip to next match if the match length cannot increase
         * or if the match length is less than 2. Note that the checks
         * below for insufficient lookahead only occur occasionally
         * for performance reasons. Therefore uninitialized memory
         * will be accessed and conditional jumps will be made that
         * depend on those values. However the length of the match
         * is limited to the lookahead, so the output of deflate is not
         * affected by the uninitialized values.
         */
        uint16_t val;
        /* A mismatch here is the common case for chain entries, hence LIKELY. */
        memcpy(&val, match + best_len - 1, sizeof(val));
        if (LIKELY(val != scan_end))
            continue;

        memcpy(&val, match, sizeof(val));
        if (val != scan_start)
            continue;

        /* It is not necessary to compare scan[2] and match[2] since
         * they are always equal when the other bytes match, given that
         * the hash keys are equal and that HASH_BITS >= 8. Compare 2
         * bytes at a time at strstart+3, +5, ... up to strstart+257.
         * We check for insufficient lookahead only every 4th
         * comparison; the 128th check will be made at strstart+257.
         * If MAX_MATCH-2 is not a multiple of 8, it is necessary to
         * put more guard bytes at the end of the window, or to check
         * more often for insufficient lookahead.
         */
        Assert(scan[2] == match[2], "scan[2]?");
        scan++;
        match++;

        /* 4 unrolled 16-bit comparisons per iteration (8 bytes). */
        do {
            uint16_t mval, sval;

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);
        } while (scan < strend);

        /*
         * Here, scan <= window + strstart + 257
         */
        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan");
        /* The 2-byte steps may leave one trailing byte unchecked. */
        if (*scan == *match)
            scan++;

        len = (MAX_MATCH -1) - (int)(strend-scan);
        scan = strend - (MAX_MATCH-1);  /* reset scan to strstart */

        if (len > best_len) {
            s->match_start = cur_match;
            best_len = len;
            if (len >= nice_match)
                break;
            memcpy(&scan_end, scan + best_len - 1, sizeof(scan_end));
        } else {
            /*
             * The probability of finding a longer match later is
             * pretty low, so for performance it's best to
             * outright stop here for the lower compression levels
             */
            if (s->level < TRIGGER_LEVEL)
                break;
        }
    } while (--chain_length && (cur_match = prev[cur_match & wmask]) > limit);

    /* Clamp to the available lookahead (OUT assertion in file header). */
    if ((unsigned)best_len <= s->lookahead)
        return best_len;
    return s->lookahead;
}
313#endif
314
315#ifdef std3_longest_match
316/* longest_match() with minor change to improve performance (in terms of
317 * execution time).
318 *
319 * The pristine longest_match() function is sketched below (strip the
320 * then-clause of the "#ifdef UNALIGNED_OK"-directive)
321 *
322 * ------------------------------------------------------------
323 * unsigned int longest_match(...) {
324 * ...
325 * do {
326 * match = s->window + cur_match; //s0
327 * if (*(ushf*)(match+best_len-1) != scan_end || //s1
328 * *(ushf*)match != scan_start) continue; //s2
329 * ...
330 *
331 * do {
332 * } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
333 * *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
334 * *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
335 * *(ushf*)(scan+=2) == *(ushf*)(match+=2) &&
336 * scan < strend); //s3
337 *
338 * ...
339 * } while(cond); //s4
340 *
341 * -------------------------------------------------------------
342 *
343 * The change include:
344 *
345 * 1) The hottest statements of the function is: s0, s1 and s4. Pull them
346 * together to form a new loop. The benefit is two-fold:
347 *
348 * o. Ease the compiler to yield good code layout: the conditional-branch
349 * corresponding to s1 and its biased target s4 become very close (likely,
350 * fit in the same cache-line), hence improving instruction-fetching
351 * efficiency.
352 *
353 * o. Ease the compiler to promote "s->window" into register. "s->window"
354 * is loop-invariant; it is supposed to be promoted into register and keep
355 * the value throughout the entire loop. However, there are many such
356 * loop-invariant, and x86-family has small register file; "s->window" is
357 * likely to be chosen as register-allocation victim such that its value
358 * is reloaded from memory in every single iteration. By forming a new loop,
359 * "s->window" is loop-invariant of that newly created tight loop. It is
360 * lot easier for compiler to promote this quantity to register and keep
361 * its value throughout the entire small loop.
362 *
 * 2) Transform s3 such that it examines sizeof(long)-byte-match at a time.
 * This is done by:
 * ------------------------------------------------
 * v1 = load from "scan" by sizeof(long) bytes
 * v2 = load from "match" by sizeof(long) bytes
368 * v3 = v1 xor v2
369 * match-bit = little-endian-machine(yes-for-x86) ?
370 * count-trailing-zero(v3) :
371 * count-leading-zero(v3);
372 *
373 * match-byte = match-bit/8
374 *
375 * "scan" and "match" advance if necessary
376 * -------------------------------------------------
377 */
378
/*
 * std3 variant: word-at-a-time longest_match using XOR + count-trailing-
 * zeros to locate the first differing byte (see the explanatory comment
 * above).  Same contract as the other variants: walks the hash chain from
 * cur_match, sets s->match_start on improvement, and returns the best
 * length clamped to s->lookahead.
 */
static inline unsigned longest_match(deflate_state *const s, IPos cur_match) {
    unsigned int strstart = s->strstart;
    unsigned chain_length = s->max_chain_length;/* max hash chain length */
    unsigned char *window = s->window;
    register unsigned char *scan = window + strstart; /* current string */
    register unsigned char *match;                    /* matched string */
    register unsigned int len;                        /* length of current match */
    unsigned int best_len = s->prev_length ? s->prev_length : 1; /* best match length so far */
    unsigned int nice_match = s->nice_match;          /* stop if match long enough */
    IPos limit = strstart > (IPos)MAX_DIST(s) ?
        strstart - (IPos)MAX_DIST(s) : NIL;
    /* Stop when cur_match becomes <= limit. To simplify the code,
     * we prevent matches with the string of window index 0.
     */
    Pos *prev = s->prev;
    unsigned int wmask = s->w_mask;

    register unsigned char *strend = window + strstart + MAX_MATCH;

    /* 2-byte prefix / tail fingerprints of scan, loaded via memcpy so
     * alignment does not matter. */
    uint16_t scan_start, scan_end;

    memcpy(&scan_start, scan, sizeof(scan_start));
    memcpy(&scan_end, scan+best_len-1, sizeof(scan_end));

    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
     * It is easy to get rid of this optimization if necessary.
     */
    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever");

    /* Do not waste too much time if we already have a good match: */
    if (s->prev_length >= s->good_match) {
        chain_length >>= 2;
    }
    /* Do not look for matches beyond the end of the input. This is necessary
     * to make deflate deterministic.
     */
    if ((unsigned int)nice_match > s->lookahead) nice_match = s->lookahead;

    Assert((unsigned long)strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead");

    do {
        /* Never match against the current or a future position. */
        if (cur_match >= strstart) {
            break;
        }

        /* Skip to next match if the match length cannot increase
         * or if the match length is less than 2. Note that the checks below
         * for insufficient lookahead only occur occasionally for performance
         * reasons. Therefore uninitialized memory will be accessed, and
         * conditional jumps will be made that depend on those values.
         * However the length of the match is limited to the lookahead, so
         * the output of deflate is not affected by the uninitialized values.
         */
        /* Tight candidate-rejection loop ("s0/s1/s4" fused together, see
         * comment above the function): advance down the chain until a
         * plausible candidate is found (cont stays 1, break) or the chain
         * is exhausted (cont becomes 0). */
        int cont = 1;
        do {
            match = window + cur_match;
            if (LIKELY(memcmp(match+best_len-1, &scan_end, sizeof(scan_end)) != 0
                || memcmp(match, &scan_start, sizeof(scan_start)) != 0)) {
                if ((cur_match = prev[cur_match & wmask]) > limit
                    && --chain_length != 0) {
                    continue;
                } else {
                    cont = 0;
                }
            }
            break;
        } while (1);

        if (!cont)
            break;

        /* It is not necessary to compare scan[2] and match[2] since they are
         * always equal when the other bytes match, given that the hash keys
         * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
         * strstart+3, +5, ... up to strstart+257. We check for insufficient
         * lookahead only every 4th comparison; the 128th check will be made
         * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
         * necessary to put more guard bytes at the end of the window, or
         * to check more often for insufficient lookahead.
         */
        scan += 2, match+=2;
        Assert(*scan == *match, "match[2]?");
        /* Compare sizeof(long) bytes per iteration; on a mismatch, ctzl on
         * the XOR locates the first differing byte (little-endian only —
         * guarded by the std3 preprocessor selection above). */
        do {
            unsigned long sv, mv, xor;

            memcpy(&sv, scan, sizeof(sv));
            memcpy(&mv, match, sizeof(mv));

            xor = sv ^ mv;

            if (xor) {
                int match_byte = __builtin_ctzl(xor) / 8;
                scan += match_byte;
                break;
            } else {
                scan += sizeof(unsigned long);
                match += sizeof(unsigned long);
            }
        } while (scan < strend);

        /* The word-sized steps can overshoot strend; clamp back. */
        if (scan > strend)
            scan = strend;

        Assert(scan <= window + (unsigned)(s->window_size-1), "wild scan");

        len = MAX_MATCH - (int)(strend - scan);
        scan = strend - MAX_MATCH;  /* reset scan to strstart for the next candidate */

        if (len > best_len) {
            s->match_start = cur_match;
            best_len = len;
            if (len >= nice_match)
                break;
            memcpy(&scan_end, scan+best_len-1, sizeof(scan_end));
        } else {
            /*
             * The probability of finding a longer match later is
             * pretty low, so for performance it's best to
             * outright stop here for the lower compression levels
             */
            if (s->level < TRIGGER_LEVEL)
                break;
        }
    } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0);

    /* Clamp to the available lookahead (OUT assertion in file header). */
    if ((unsigned int)best_len <= s->lookahead)
        return (unsigned int)best_len;
    return s->lookahead;
}
508#endif
509