1 | /* |
2 | * Set match_start to the longest match starting at the given string and |
 * return its length. Matches shorter than or equal to prev_length are discarded,
 * in which case the result is equal to prev_length and match_start is garbage.
 *
 * IN assertions: cur_match is the head of the hash chain for the current
 * string (strstart) and its distance is <= MAX_DIST, and prev_length >= 1
8 | * OUT assertion: the match length is not greater than s->lookahead |
9 | */ |
10 | |
11 | #include "zbuild.h" |
12 | #include "deflate.h" |
13 | |
14 | #if (defined(UNALIGNED_OK) && MAX_MATCH == 258) |
15 | |
/* ARM 32-bit clang/gcc builds perform better, on average, with std2. Both gcc and clang define __GNUC__. */
17 | # if defined(__GNUC__) && defined(__arm__) && !defined(__aarch64__) |
18 | # define std2_longest_match |
19 | /* Only use std3_longest_match for little_endian systems, also avoid using it with |
20 | non-gcc compilers since the __builtin_ctzl() function might not be optimized. */ |
21 | # elif(defined(__GNUC__) && defined(HAVE_BUILTIN_CTZL) && ((__BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__) \ |
22 | || defined(__LITTLE_ENDIAN__))) |
23 | # define std3_longest_match |
24 | # elif(defined(_MSC_VER) && defined(_WIN32)) |
25 | # define std3_longest_match |
26 | # else |
27 | # define std2_longest_match |
28 | # endif |
29 | |
30 | #else |
31 | # define std1_longest_match |
32 | #endif |
33 | |
34 | |
35 | #if defined(_MSC_VER) && !defined(__clang__) |
36 | # if defined(_M_IX86) || defined(_M_AMD64) || defined(_M_IA64) || defined(_M_ARM) || defined(_M_ARM64) |
37 | # include "fallback_builtins.h" |
38 | # endif |
39 | #endif |
40 | |
41 | |
42 | |
43 | #ifdef std1_longest_match |
44 | |
45 | /* |
46 | * Standard longest_match |
47 | * |
48 | */ |
/*
 * Standard longest_match: byte-at-a-time comparison, used when unaligned
 * multi-byte loads are not known to be safe/fast (std1 variant).
 *
 * Returns the length of the longest match found, capped at s->lookahead.
 * Updates s->match_start only when a match longer than the incoming
 * s->prev_length is found.
 */
static inline unsigned longest_match(deflate_state *const s, IPos cur_match) {
    const unsigned wmask = s->w_mask;    /* mask to wrap hash-chain indices */
    const Pos *prev = s->prev;           /* chain of previous positions with the same hash */

    unsigned chain_length;               /* max remaining chain links to follow */
    IPos limit;                          /* stop when cur_match falls to or below this */
    unsigned int len, best_len, nice_match;
    unsigned char *scan, *match, *strend, scan_end, scan_end1;

    /*
     * The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple
     * of 16. It is easy to get rid of this optimization if necessary.
     */
    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever" );

    /*
     * Do not waste too much time if we already have a good match:
     * search a quarter as many chain links once past good_match.
     */
    best_len = s->prev_length ? s->prev_length : 1;
    chain_length = s->max_chain_length;
    if (best_len >= s->good_match)
        chain_length >>= 2;

    /*
     * Do not look for matches beyond the end of the input. This is
     * necessary to make deflate deterministic.
     */
    nice_match = (unsigned int)s->nice_match > s->lookahead ? s->lookahead : (unsigned int)s->nice_match;

    /*
     * Stop when cur_match becomes <= limit. To simplify the code,
     * we prevent matches with the string of window index 0.
     */
    limit = s->strstart > MAX_DIST(s) ? s->strstart - MAX_DIST(s) : 0;

    scan = s->window + s->strstart;
    strend = s->window + s->strstart + MAX_MATCH;
    /* Cache the two bytes a candidate must match at positions best_len-1
     * and best_len before it can possibly beat the current best match. */
    scan_end1 = scan[best_len-1];
    scan_end = scan[best_len];

    Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead" );
    do {
        /* Never match against the current string or anything after it. */
        if (cur_match >= s->strstart) {
            break;
        }
        match = s->window + cur_match;

        /*
         * Skip to next match if the match length cannot increase
         * or if the match length is less than 2. Note that the checks
         * below for insufficient lookahead only occur occasionally
         * for performance reasons. Therefore uninitialized memory
         * will be accessed and conditional jumps will be made that
         * depend on those values. However the length of the match
         * is limited to the lookahead, so the output of deflate is not
         * affected by the uninitialized values.
         */
        if (match[best_len] != scan_end ||
            match[best_len-1] != scan_end1 ||
            *match != *scan ||
            *++match != scan[1])
            continue;

        /*
         * The check at best_len-1 can be removed because it will
         * be made again later. (This heuristic is not always a win.)
         * It is not necessary to compare scan[2] and match[2] since
         * they are always equal when the other bytes match, given
         * that the hash keys are equal and that HASH_BITS >= 8.
         */
        scan += 2;
        match++;
        Assert(*scan == *match, "match[2]?" );

        /*
         * We check for insufficient lookahead only every 8th
         * comparison; the 256th check will be made at strstart + 258.
         * (The window guard bytes make the overrun reads safe.)
         */
        do {
        } while (*++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 *++scan == *++match && *++scan == *++match &&
                 scan < strend);

        Assert(scan <= s->window+(unsigned int)(s->window_size-1), "wild scan" );

        /* Match length = how far scan advanced; reset scan for next candidate. */
        len = MAX_MATCH - (int)(strend - scan);
        scan = strend - MAX_MATCH;

        if (len > best_len) {
            s->match_start = cur_match;
            best_len = len;
            if (len >= nice_match)
                break;
            /* Re-cache the end bytes for the new, longer best match. */
            scan_end1 = scan[best_len-1];
            scan_end = scan[best_len];
        } else {
            /*
             * The probability of finding a longer match if we continue
             * from here is pretty low, so for performance it's best to
             * outright stop here for the lower compression levels.
             */
            if (s->level < TRIGGER_LEVEL)
                break;
        }
    } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length);

    /* OUT assertion: never report a match longer than the lookahead. */
    if ((unsigned int)best_len <= s->lookahead)
        return best_len;
    return s->lookahead;
}
161 | #endif |
162 | |
163 | #ifdef std2_longest_match |
164 | /* |
165 | * UNALIGNED_OK longest_match |
166 | * |
167 | */ |
/*
 * UNALIGNED_OK longest_match (std2 variant): compares two bytes at a
 * time using memcpy loads, which compile to single unaligned loads on
 * targets that allow them while remaining safe on strict-alignment CPUs.
 *
 * Returns the length of the longest match found, capped at s->lookahead.
 * Updates s->match_start only when a match longer than the incoming
 * s->prev_length is found.
 */
static inline unsigned longest_match(deflate_state *const s, IPos cur_match) {
    const unsigned wmask = s->w_mask;    /* mask to wrap hash-chain indices */
    const Pos *prev = s->prev;           /* chain of previous positions with the same hash */

    uint16_t scan_start, scan_end;       /* 2-byte probes: match start and best_len tail */
    unsigned chain_length;               /* max remaining chain links to follow */
    IPos limit;                          /* stop when cur_match falls to or below this */
    unsigned int len, best_len, nice_match;
    unsigned char *scan, *strend;

    /*
     * The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple
     * of 16. It is easy to get rid of this optimization if necessary.
     */
    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever" );

    /*
     * Do not waste too much time if we already have a good match:
     * search a quarter as many chain links once past good_match.
     */
    best_len = s->prev_length ? s->prev_length : 1;
    chain_length = s->max_chain_length;
    if (best_len >= s->good_match)
        chain_length >>= 2;

    /*
     * Do not look for matches beyond the end of the input. This is
     * necessary to make deflate deterministic.
     */
    nice_match = (unsigned int)s->nice_match > s->lookahead ? s->lookahead : (unsigned int)s->nice_match;

    /*
     * Stop when cur_match becomes <= limit. To simplify the code,
     * we prevent matches with the string of window index 0.
     */
    limit = s->strstart > MAX_DIST(s) ? s->strstart - MAX_DIST(s) : 0;

    scan = s->window + s->strstart;
    /* strend is MAX_MATCH-1 past strstart because comparisons advance in
     * 2-byte steps; a final 1-byte check below handles odd lengths. */
    strend = s->window + s->strstart + MAX_MATCH - 1;
    memcpy(&scan_start, scan, sizeof(scan_start));
    memcpy(&scan_end, scan + best_len - 1, sizeof(scan_end));

    Assert((unsigned long)s->strstart <= s->window_size - MIN_LOOKAHEAD, "need lookahead" );
    do {
        unsigned char *match;
        /* Never match against the current string or anything after it. */
        if (cur_match >= s->strstart) {
            break;
        }
        match = s->window + cur_match;

        /*
         * Skip to next match if the match length cannot increase
         * or if the match length is less than 2. Note that the checks
         * below for insufficient lookahead only occur occasionally
         * for performance reasons. Therefore uninitialized memory
         * will be accessed and conditional jumps will be made that
         * depend on those values. However the length of the match
         * is limited to the lookahead, so the output of deflate is not
         * affected by the uninitialized values.
         */
        uint16_t val;
        /* Cheap rejection: candidate must match the 2 bytes straddling
         * best_len before it can possibly beat the current best. */
        memcpy(&val, match + best_len - 1, sizeof(val));
        if (LIKELY(val != scan_end))
            continue;

        memcpy(&val, match, sizeof(val));
        if (val != scan_start)
            continue;

        /* It is not necessary to compare scan[2] and match[2] since
         * they are always equal when the other bytes match, given that
         * the hash keys are equal and that HASH_BITS >= 8. Compare 2
         * bytes at a time at strstart+3, +5, ... up to strstart+257.
         * We check for insufficient lookahead only every 4th
         * comparison; the 128th check will be made at strstart+257.
         * If MAX_MATCH-2 is not a multiple of 8, it is necessary to
         * put more guard bytes at the end of the window, or to check
         * more often for insufficient lookahead.
         */
        Assert(scan[2] == match[2], "scan[2]?" );
        scan++;
        match++;

        /* Unrolled x4: each step compares one 16-bit chunk via memcpy. */
        do {
            uint16_t mval, sval;

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);

            memcpy(&mval, match, sizeof(mval));
            memcpy(&sval, scan, sizeof(sval));
            if (mval != sval)
                break;
            match += sizeof(mval);
            scan += sizeof(sval);
        } while (scan < strend);

        /*
         * Here, scan <= window + strstart + 257
         */
        Assert(scan <= s->window+(unsigned)(s->window_size-1), "wild scan" );
        /* 2-byte compare failed (or ran out); the first of the pair may
         * still match, accounting for odd-length matches. */
        if (*scan == *match)
            scan++;

        /* Match length = how far scan advanced; reset scan for next candidate. */
        len = (MAX_MATCH -1) - (int)(strend-scan);
        scan = strend - (MAX_MATCH-1);

        if (len > best_len) {
            s->match_start = cur_match;
            best_len = len;
            if (len >= nice_match)
                break;
            /* Re-cache the tail probe for the new, longer best match. */
            memcpy(&scan_end, scan + best_len - 1, sizeof(scan_end));
        } else {
            /*
             * The probability of finding a longer match if we continue
             * from here is pretty low, so for performance it's best to
             * outright stop here for the lower compression levels.
             */
            if (s->level < TRIGGER_LEVEL)
                break;
        }
    } while (--chain_length && (cur_match = prev[cur_match & wmask]) > limit);

    /* OUT assertion: never report a match longer than the lookahead. */
    if ((unsigned)best_len <= s->lookahead)
        return best_len;
    return s->lookahead;
}
313 | #endif |
314 | |
315 | #ifdef std3_longest_match |
316 | /* longest_match() with minor change to improve performance (in terms of |
317 | * execution time). |
318 | * |
319 | * The pristine longest_match() function is sketched below (strip the |
320 | * then-clause of the "#ifdef UNALIGNED_OK"-directive) |
321 | * |
322 | * ------------------------------------------------------------ |
323 | * unsigned int longest_match(...) { |
324 | * ... |
325 | * do { |
326 | * match = s->window + cur_match; //s0 |
327 | * if (*(ushf*)(match+best_len-1) != scan_end || //s1 |
328 | * *(ushf*)match != scan_start) continue; //s2 |
329 | * ... |
330 | * |
331 | * do { |
332 | * } while (*(ushf*)(scan+=2) == *(ushf*)(match+=2) && |
333 | * *(ushf*)(scan+=2) == *(ushf*)(match+=2) && |
334 | * *(ushf*)(scan+=2) == *(ushf*)(match+=2) && |
335 | * *(ushf*)(scan+=2) == *(ushf*)(match+=2) && |
336 | * scan < strend); //s3 |
337 | * |
338 | * ... |
339 | * } while(cond); //s4 |
340 | * |
341 | * ------------------------------------------------------------- |
342 | * |
343 | * The change include: |
344 | * |
345 | * 1) The hottest statements of the function is: s0, s1 and s4. Pull them |
346 | * together to form a new loop. The benefit is two-fold: |
347 | * |
348 | * o. Ease the compiler to yield good code layout: the conditional-branch |
349 | * corresponding to s1 and its biased target s4 become very close (likely, |
350 | * fit in the same cache-line), hence improving instruction-fetching |
351 | * efficiency. |
352 | * |
353 | * o. Ease the compiler to promote "s->window" into register. "s->window" |
354 | * is loop-invariant; it is supposed to be promoted into register and keep |
355 | * the value throughout the entire loop. However, there are many such |
356 | * loop-invariant, and x86-family has small register file; "s->window" is |
357 | * likely to be chosen as register-allocation victim such that its value |
358 | * is reloaded from memory in every single iteration. By forming a new loop, |
359 | * "s->window" is loop-invariant of that newly created tight loop. It is |
360 | * lot easier for compiler to promote this quantity to register and keep |
361 | * its value throughout the entire small loop. |
362 | * |
 * 2) Transform s3 such that it examines sizeof(long)-byte-match at a time.
 * This is done by:
 * ------------------------------------------------
 * v1 = load from "scan" by sizeof(long) bytes
 * v2 = load from "match" by sizeof(long) bytes
368 | * v3 = v1 xor v2 |
369 | * match-bit = little-endian-machine(yes-for-x86) ? |
370 | * count-trailing-zero(v3) : |
371 | * count-leading-zero(v3); |
372 | * |
373 | * match-byte = match-bit/8 |
374 | * |
375 | * "scan" and "match" advance if necessary |
376 | * ------------------------------------------------- |
377 | */ |
378 | |
/*
 * std3 longest_match: word-at-a-time comparison. Loads sizeof(long)
 * bytes from scan and match via memcpy, XORs them, and uses
 * __builtin_ctzl on the difference to find the first mismatching byte
 * (little-endian only; see the selection logic at the top of the file).
 *
 * Returns the length of the longest match found, capped at s->lookahead.
 * Updates s->match_start only when a match longer than the incoming
 * s->prev_length is found.
 */
static inline unsigned longest_match(deflate_state *const s, IPos cur_match) {
    unsigned int strstart = s->strstart;
    unsigned chain_length = s->max_chain_length;/* max hash chain length */
    unsigned char *window = s->window;
    register unsigned char *scan = window + strstart; /* current string */
    register unsigned char *match;                    /* matched string */
    register unsigned int len;                        /* length of current match */
    unsigned int best_len = s->prev_length ? s->prev_length : 1; /* best match length so far */
    unsigned int nice_match = s->nice_match;          /* stop if match long enough */
    IPos limit = strstart > (IPos)MAX_DIST(s) ?
        strstart - (IPos)MAX_DIST(s) : NIL;
    /* Stop when cur_match becomes <= limit. To simplify the code,
     * we prevent matches with the string of window index 0.
     */
    Pos *prev = s->prev;            /* chain of previous positions with the same hash */
    unsigned int wmask = s->w_mask; /* mask to wrap hash-chain indices */

    register unsigned char *strend = window + strstart + MAX_MATCH;

    /* 2-byte probes: match start and the bytes straddling best_len. */
    uint16_t scan_start, scan_end;

    memcpy(&scan_start, scan, sizeof(scan_start));
    memcpy(&scan_end, scan+best_len-1, sizeof(scan_end));

    /* The code is optimized for HASH_BITS >= 8 and MAX_MATCH-2 multiple of 16.
     * It is easy to get rid of this optimization if necessary.
     */
    Assert(s->hash_bits >= 8 && MAX_MATCH == 258, "Code too clever" );

    /* Do not waste too much time if we already have a good match:
     * search a quarter as many chain links once past good_match. */
    if (s->prev_length >= s->good_match) {
        chain_length >>= 2;
    }
    /* Do not look for matches beyond the end of the input. This is necessary
     * to make deflate deterministic.
     */
    if ((unsigned int)nice_match > s->lookahead) nice_match = s->lookahead;

    Assert((unsigned long)strstart <= s->window_size-MIN_LOOKAHEAD, "need lookahead" );

    do {
        /* Never match against the current string or anything after it. */
        if (cur_match >= strstart) {
            break;
        }

        /* Skip to next match if the match length cannot increase
         * or if the match length is less than 2. Note that the checks below
         * for insufficient lookahead only occur occasionally for performance
         * reasons. Therefore uninitialized memory will be accessed, and
         * conditional jumps will be made that depend on those values.
         * However the length of the match is limited to the lookahead, so
         * the output of deflate is not affected by the uninitialized values.
         */
        /* Tight inner skip-loop (statements s0/s1/s4 of the sketch above):
         * walk the chain here so the hot rejection test and the chain
         * advance stay in one small loop. cont==0 means the chain is
         * exhausted and the outer loop must stop. */
        int cont = 1;
        do {
            match = window + cur_match;
            if (LIKELY(memcmp(match+best_len-1, &scan_end, sizeof(scan_end)) != 0
                || memcmp(match, &scan_start, sizeof(scan_start)) != 0)) {
                if ((cur_match = prev[cur_match & wmask]) > limit
                    && --chain_length != 0) {
                    continue;
                } else {
                    cont = 0;
                }
            }
            break;
        } while (1);

        if (!cont)
            break;

        /* It is not necessary to compare scan[2] and match[2] since they are
         * always equal when the other bytes match, given that the hash keys
         * are equal and that HASH_BITS >= 8. Compare 2 bytes at a time at
         * strstart+3, +5, ... up to strstart+257. We check for insufficient
         * lookahead only every 4th comparison; the 128th check will be made
         * at strstart+257. If MAX_MATCH-2 is not a multiple of 8, it is
         * necessary to put more guard bytes at the end of the window, or
         * to check more often for insufficient lookahead.
         */
        scan += 2, match+=2;
        Assert(*scan == *match, "match[2]?" );
        do {
            unsigned long sv, mv, xor;

            memcpy(&sv, scan, sizeof(sv));
            memcpy(&mv, match, sizeof(mv));

            xor = sv ^ mv;

            if (xor) {
                /* First set bit of the XOR marks the first differing byte
                 * (little-endian: lowest byte = lowest address). */
                int match_byte = __builtin_ctzl(xor) / 8;
                scan += match_byte;
                break;
            } else {
                scan += sizeof(unsigned long);
                match += sizeof(unsigned long);
            }
        } while (scan < strend);

        /* Word steps may overshoot strend; clamp so len <= MAX_MATCH. */
        if (scan > strend)
            scan = strend;

        Assert(scan <= window + (unsigned)(s->window_size-1), "wild scan" );

        /* Match length = how far scan advanced; reset scan for next candidate. */
        len = MAX_MATCH - (int)(strend - scan);
        scan = strend - MAX_MATCH;

        if (len > best_len) {
            s->match_start = cur_match;
            best_len = len;
            if (len >= nice_match)
                break;
            /* Re-cache the tail probe for the new, longer best match. */
            memcpy(&scan_end, scan+best_len-1, sizeof(scan_end));
        } else {
            /*
             * The probability of finding a longer match if we continue
             * from here is pretty low, so for performance it's best to
             * outright stop here for the lower compression levels.
             */
            if (s->level < TRIGGER_LEVEL)
                break;
        }
    } while ((cur_match = prev[cur_match & wmask]) > limit && --chain_length != 0);

    /* OUT assertion: never report a match longer than the lookahead. */
    if ((unsigned int)best_len <= s->lookahead)
        return (unsigned int)best_len;
    return s->lookahead;
}
508 | #endif |
509 | |