1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under both the BSD-style license (found in the |
6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
7 | * in the COPYING file in the root directory of this source tree). |
8 | * You may select, at your option, one of the above-listed licenses. |
9 | */ |
10 | |
11 | #include "zstd_compress_internal.h" |
12 | #include "zstd_lazy.h" |
13 | #include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ |
14 | |
15 | #define kLazySkippingStep 8 |
16 | |
17 | |
18 | /*-************************************* |
19 | * Binary Tree search |
20 | ***************************************/ |
21 | |
22 | static void |
23 | ZSTD_updateDUBT(ZSTD_matchState_t* ms, |
24 | const BYTE* ip, const BYTE* iend, |
25 | U32 mls) |
26 | { |
27 | const ZSTD_compressionParameters* const cParams = &ms->cParams; |
28 | U32* const hashTable = ms->hashTable; |
29 | U32 const hashLog = cParams->hashLog; |
30 | |
31 | U32* const bt = ms->chainTable; |
32 | U32 const btLog = cParams->chainLog - 1; |
33 | U32 const btMask = (1 << btLog) - 1; |
34 | |
35 | const BYTE* const base = ms->window.base; |
36 | U32 const target = (U32)(ip - base); |
37 | U32 idx = ms->nextToUpdate; |
38 | |
39 | if (idx != target) |
        DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)",
41 | idx, target, ms->window.dictLimit); |
42 | assert(ip + 8 <= iend); /* condition for ZSTD_hashPtr */ |
43 | (void)iend; |
44 | |
45 | assert(idx >= ms->window.dictLimit); /* condition for valid base+idx */ |
46 | for ( ; idx < target ; idx++) { |
47 | size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls); /* assumption : ip + 8 <= iend */ |
48 | U32 const matchIndex = hashTable[h]; |
49 | |
50 | U32* const nextCandidatePtr = bt + 2*(idx&btMask); |
51 | U32* const sortMarkPtr = nextCandidatePtr + 1; |
52 | |
        DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx);
54 | hashTable[h] = idx; /* Update Hash Table */ |
55 | *nextCandidatePtr = matchIndex; /* update BT like a chain */ |
56 | *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK; |
57 | } |
58 | ms->nextToUpdate = target; |
59 | } |
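/* Illustrative sketch, not part of the library : how the pair of U32 slots owned by
 * each position in the chainTable is interpreted. The struct and field names below
 * are descriptive assumptions only ; the real code addresses the pair directly. */
#if 0
typedef struct {
    U32 smallerOrNext;  /* sorted : root of "smaller" subtree ; unsorted : next candidate in hash chain */
    U32 largerOrMark;   /* sorted : root of "larger" subtree ; unsorted : ZSTD_DUBT_UNSORTED_MARK */
} DUBT_NodeSketch;      /* the pair for position idx lives at bt + 2*(idx & btMask) */
#endif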
60 | |
61 | |
62 | /** ZSTD_insertDUBT1() : |
63 | * sort one already inserted but unsorted position |
64 | * assumption : curr >= btlow == (curr - btmask) |
65 | * doesn't fail */ |
66 | static void |
67 | ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, |
68 | U32 curr, const BYTE* inputEnd, |
69 | U32 nbCompares, U32 btLow, |
70 | const ZSTD_dictMode_e dictMode) |
71 | { |
72 | const ZSTD_compressionParameters* const cParams = &ms->cParams; |
73 | U32* const bt = ms->chainTable; |
74 | U32 const btLog = cParams->chainLog - 1; |
75 | U32 const btMask = (1 << btLog) - 1; |
76 | size_t commonLengthSmaller=0, commonLengthLarger=0; |
77 | const BYTE* const base = ms->window.base; |
78 | const BYTE* const dictBase = ms->window.dictBase; |
79 | const U32 dictLimit = ms->window.dictLimit; |
80 | const BYTE* const ip = (curr>=dictLimit) ? base + curr : dictBase + curr; |
81 | const BYTE* const iend = (curr>=dictLimit) ? inputEnd : dictBase + dictLimit; |
82 | const BYTE* const dictEnd = dictBase + dictLimit; |
83 | const BYTE* const prefixStart = base + dictLimit; |
84 | const BYTE* match; |
85 | U32* smallerPtr = bt + 2*(curr&btMask); |
86 | U32* largerPtr = smallerPtr + 1; |
87 | U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */ |
88 | U32 dummy32; /* to be nullified at the end */ |
89 | U32 const windowValid = ms->window.lowLimit; |
90 | U32 const maxDistance = 1U << cParams->windowLog; |
91 | U32 const windowLow = (curr - windowValid > maxDistance) ? curr - maxDistance : windowValid; |
92 | |
93 | |
    DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)",
95 | curr, dictLimit, windowLow); |
96 | assert(curr >= btLow); |
97 | assert(ip < iend); /* condition for ZSTD_count */ |
98 | |
99 | for (; nbCompares && (matchIndex > windowLow); --nbCompares) { |
100 | U32* const nextPtr = bt + 2*(matchIndex & btMask); |
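        /* Binary-tree invariant : every candidate visited from here on sorts between the
         * best "smaller" and best "larger" strings seen so far, so it necessarily shares
         * at least MIN(commonLengthSmaller, commonLengthLarger) leading bytes with ip. */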
101 | size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ |
102 | assert(matchIndex < curr); |
103 | /* note : all candidates are now supposed sorted, |
104 | * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK |
105 | * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */ |
106 | |
107 | if ( (dictMode != ZSTD_extDict) |
108 | || (matchIndex+matchLength >= dictLimit) /* both in current segment*/ |
109 | || (curr < dictLimit) /* both in extDict */) { |
110 | const BYTE* const mBase = ( (dictMode != ZSTD_extDict) |
111 | || (matchIndex+matchLength >= dictLimit)) ? |
112 | base : dictBase; |
113 | assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */ |
114 | || (curr < dictLimit) ); |
115 | match = mBase + matchIndex; |
116 | matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); |
117 | } else { |
118 | match = dictBase + matchIndex; |
119 | matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); |
120 | if (matchIndex+matchLength >= dictLimit) |
121 | match = base + matchIndex; /* preparation for next read of match[matchLength] */ |
122 | } |
123 | |
        DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ",
125 | curr, matchIndex, (U32)matchLength); |
126 | |
127 | if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ |
128 | break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */ |
129 | } |
130 | |
131 | if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */ |
132 | /* match is smaller than current */ |
133 | *smallerPtr = matchIndex; /* update smaller idx */ |
134 | commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ |
135 | if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */ |
            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u",
137 | matchIndex, btLow, nextPtr[1]); |
138 | smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */ |
139 | matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */ |
140 | } else { |
141 | /* match is larger than current */ |
142 | *largerPtr = matchIndex; |
143 | commonLengthLarger = matchLength; |
144 | if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */ |
            DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u",
146 | matchIndex, btLow, nextPtr[0]); |
147 | largerPtr = nextPtr; |
148 | matchIndex = nextPtr[0]; |
149 | } } |
150 | |
151 | *smallerPtr = *largerPtr = 0; |
152 | } |
153 | |
154 | |
155 | static size_t |
156 | ZSTD_DUBT_findBetterDictMatch ( |
157 | const ZSTD_matchState_t* ms, |
158 | const BYTE* const ip, const BYTE* const iend, |
159 | size_t* offsetPtr, |
160 | size_t bestLength, |
161 | U32 nbCompares, |
162 | U32 const mls, |
163 | const ZSTD_dictMode_e dictMode) |
164 | { |
165 | const ZSTD_matchState_t * const dms = ms->dictMatchState; |
166 | const ZSTD_compressionParameters* const dmsCParams = &dms->cParams; |
167 | const U32 * const dictHashTable = dms->hashTable; |
168 | U32 const hashLog = dmsCParams->hashLog; |
169 | size_t const h = ZSTD_hashPtr(ip, hashLog, mls); |
170 | U32 dictMatchIndex = dictHashTable[h]; |
171 | |
172 | const BYTE* const base = ms->window.base; |
173 | const BYTE* const prefixStart = base + ms->window.dictLimit; |
174 | U32 const curr = (U32)(ip-base); |
175 | const BYTE* const dictBase = dms->window.base; |
176 | const BYTE* const dictEnd = dms->window.nextSrc; |
177 | U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base); |
178 | U32 const dictLowLimit = dms->window.lowLimit; |
179 | U32 const dictIndexDelta = ms->window.lowLimit - dictHighLimit; |
180 | |
181 | U32* const dictBt = dms->chainTable; |
182 | U32 const btLog = dmsCParams->chainLog - 1; |
183 | U32 const btMask = (1 << btLog) - 1; |
184 | U32 const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask; |
185 | |
186 | size_t commonLengthSmaller=0, commonLengthLarger=0; |
187 | |
188 | (void)dictMode; |
189 | assert(dictMode == ZSTD_dictMatchState); |
190 | |
191 | for (; nbCompares && (dictMatchIndex > dictLowLimit); --nbCompares) { |
192 | U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask); |
193 | size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ |
194 | const BYTE* match = dictBase + dictMatchIndex; |
195 | matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); |
196 | if (dictMatchIndex+matchLength >= dictHighLimit) |
197 | match = base + dictMatchIndex + dictIndexDelta; /* to prepare for next usage of match[matchLength] */ |
198 | |
199 | if (matchLength > bestLength) { |
200 | U32 matchIndex = dictMatchIndex + dictIndexDelta; |
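            /* Length-vs-offset trade-off : only prefer the longer match if each extra byte
             * of length (valued at ~4 bits) outweighs the additional bits required to
             * encode its (possibly larger) offset. */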
201 | if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { |
                DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)",
203 | curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); |
204 | bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); |
205 | } |
206 | if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ |
207 | break; /* drop, to guarantee consistency (miss a little bit of compression) */ |
208 | } |
209 | } |
210 | |
211 | if (match[matchLength] < ip[matchLength]) { |
212 | if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */ |
213 | commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ |
214 | dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ |
215 | } else { |
216 | /* match is larger than current */ |
217 | if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */ |
218 | commonLengthLarger = matchLength; |
219 | dictMatchIndex = nextPtr[0]; |
220 | } |
221 | } |
222 | |
223 | if (bestLength >= MINMATCH) { |
224 | U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; |
        DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
226 | curr, (U32)bestLength, (U32)*offsetPtr, mIndex); |
227 | } |
228 | return bestLength; |
229 | |
230 | } |
231 | |
232 | |
233 | static size_t |
234 | ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, |
235 | const BYTE* const ip, const BYTE* const iend, |
236 | size_t* offBasePtr, |
237 | U32 const mls, |
238 | const ZSTD_dictMode_e dictMode) |
239 | { |
240 | const ZSTD_compressionParameters* const cParams = &ms->cParams; |
241 | U32* const hashTable = ms->hashTable; |
242 | U32 const hashLog = cParams->hashLog; |
243 | size_t const h = ZSTD_hashPtr(ip, hashLog, mls); |
244 | U32 matchIndex = hashTable[h]; |
245 | |
246 | const BYTE* const base = ms->window.base; |
247 | U32 const curr = (U32)(ip-base); |
248 | U32 const windowLow = ZSTD_getLowestMatchIndex(ms, curr, cParams->windowLog); |
249 | |
250 | U32* const bt = ms->chainTable; |
251 | U32 const btLog = cParams->chainLog - 1; |
252 | U32 const btMask = (1 << btLog) - 1; |
253 | U32 const btLow = (btMask >= curr) ? 0 : curr - btMask; |
254 | U32 const unsortLimit = MAX(btLow, windowLow); |
255 | |
256 | U32* nextCandidate = bt + 2*(matchIndex&btMask); |
257 | U32* unsortedMark = bt + 2*(matchIndex&btMask) + 1; |
258 | U32 nbCompares = 1U << cParams->searchLog; |
259 | U32 nbCandidates = nbCompares; |
260 | U32 previousCandidate = 0; |
261 | |
    DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", curr);
263 | assert(ip <= iend-8); /* required for h calculation */ |
264 | assert(dictMode != ZSTD_dedicatedDictSearch); |
265 | |
266 | /* reach end of unsorted candidates list */ |
267 | while ( (matchIndex > unsortLimit) |
268 | && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK) |
269 | && (nbCandidates > 1) ) { |
        DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted",
271 | matchIndex); |
272 | *unsortedMark = previousCandidate; /* the unsortedMark becomes a reversed chain, to move up back to original position */ |
273 | previousCandidate = matchIndex; |
274 | matchIndex = *nextCandidate; |
275 | nextCandidate = bt + 2*(matchIndex&btMask); |
276 | unsortedMark = bt + 2*(matchIndex&btMask) + 1; |
277 | nbCandidates --; |
278 | } |
279 | |
280 | /* nullify last candidate if it's still unsorted |
281 | * simplification, detrimental to compression ratio, beneficial for speed */ |
282 | if ( (matchIndex > unsortLimit) |
283 | && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) { |
        DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u",
285 | matchIndex); |
286 | *nextCandidate = *unsortedMark = 0; |
287 | } |
288 | |
289 | /* batch sort stacked candidates */ |
290 | matchIndex = previousCandidate; |
291 | while (matchIndex) { /* will end on matchIndex == 0 */ |
292 | U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1; |
293 | U32 const nextCandidateIdx = *nextCandidateIdxPtr; |
294 | ZSTD_insertDUBT1(ms, matchIndex, iend, |
295 | nbCandidates, unsortLimit, dictMode); |
296 | matchIndex = nextCandidateIdx; |
297 | nbCandidates++; |
298 | } |
299 | |
300 | /* find longest match */ |
301 | { size_t commonLengthSmaller = 0, commonLengthLarger = 0; |
302 | const BYTE* const dictBase = ms->window.dictBase; |
303 | const U32 dictLimit = ms->window.dictLimit; |
304 | const BYTE* const dictEnd = dictBase + dictLimit; |
305 | const BYTE* const prefixStart = base + dictLimit; |
306 | U32* smallerPtr = bt + 2*(curr&btMask); |
307 | U32* largerPtr = bt + 2*(curr&btMask) + 1; |
308 | U32 matchEndIdx = curr + 8 + 1; |
309 | U32 dummy32; /* to be nullified at the end */ |
310 | size_t bestLength = 0; |
311 | |
312 | matchIndex = hashTable[h]; |
313 | hashTable[h] = curr; /* Update Hash Table */ |
314 | |
315 | for (; nbCompares && (matchIndex > windowLow); --nbCompares) { |
316 | U32* const nextPtr = bt + 2*(matchIndex & btMask); |
317 | size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ |
318 | const BYTE* match; |
319 | |
320 | if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) { |
321 | match = base + matchIndex; |
322 | matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); |
323 | } else { |
324 | match = dictBase + matchIndex; |
325 | matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); |
326 | if (matchIndex+matchLength >= dictLimit) |
327 | match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ |
328 | } |
329 | |
330 | if (matchLength > bestLength) { |
331 | if (matchLength > matchEndIdx - matchIndex) |
332 | matchEndIdx = matchIndex + (U32)matchLength; |
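                /* same length-vs-offset-cost heuristic as in ZSTD_DUBT_findBetterDictMatch() */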
333 | if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) |
334 | bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); |
335 | if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ |
336 | if (dictMode == ZSTD_dictMatchState) { |
337 | nbCompares = 0; /* in addition to avoiding checking any |
338 | * further in this loop, make sure we |
339 | * skip checking in the dictionary. */ |
340 | } |
341 | break; /* drop, to guarantee consistency (miss a little bit of compression) */ |
342 | } |
343 | } |
344 | |
345 | if (match[matchLength] < ip[matchLength]) { |
346 | /* match is smaller than current */ |
347 | *smallerPtr = matchIndex; /* update smaller idx */ |
348 | commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ |
349 | if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ |
350 | smallerPtr = nextPtr+1; /* new "smaller" => larger of match */ |
351 | matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ |
352 | } else { |
353 | /* match is larger than current */ |
354 | *largerPtr = matchIndex; |
355 | commonLengthLarger = matchLength; |
356 | if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ |
357 | largerPtr = nextPtr; |
358 | matchIndex = nextPtr[0]; |
359 | } } |
360 | |
361 | *smallerPtr = *largerPtr = 0; |
362 | |
363 | assert(nbCompares <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ |
364 | if (dictMode == ZSTD_dictMatchState && nbCompares) { |
365 | bestLength = ZSTD_DUBT_findBetterDictMatch( |
366 | ms, ip, iend, |
367 | offBasePtr, bestLength, nbCompares, |
368 | mls, dictMode); |
369 | } |
370 | |
371 | assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ |
372 | ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ |
373 | if (bestLength >= MINMATCH) { |
374 | U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; |
            DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)",
376 | curr, (U32)bestLength, (U32)*offBasePtr, mIndex); |
377 | } |
378 | return bestLength; |
379 | } |
380 | } |
381 | |
382 | |
383 | /** ZSTD_BtFindBestMatch() : Tree updater, providing best match */ |
384 | FORCE_INLINE_TEMPLATE size_t |
385 | ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, |
386 | const BYTE* const ip, const BYTE* const iLimit, |
387 | size_t* offBasePtr, |
388 | const U32 mls /* template */, |
389 | const ZSTD_dictMode_e dictMode) |
390 | { |
    DEBUGLOG(7, "ZSTD_BtFindBestMatch");
392 | if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ |
393 | ZSTD_updateDUBT(ms, ip, iLimit, mls); |
394 | return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); |
395 | } |
396 | |
397 | /*********************************** |
398 | * Dedicated dict search |
399 | ***********************************/ |
400 | |
401 | void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip) |
402 | { |
403 | const BYTE* const base = ms->window.base; |
404 | U32 const target = (U32)(ip - base); |
405 | U32* const hashTable = ms->hashTable; |
406 | U32* const chainTable = ms->chainTable; |
407 | U32 const chainSize = 1 << ms->cParams.chainLog; |
408 | U32 idx = ms->nextToUpdate; |
    U32 const minChain = chainSize < target ? target - chainSize : idx;
410 | U32 const bucketSize = 1 << ZSTD_LAZY_DDSS_BUCKET_LOG; |
411 | U32 const cacheSize = bucketSize - 1; |
412 | U32 const chainAttempts = (1 << ms->cParams.searchLog) - cacheSize; |
413 | U32 const chainLimit = chainAttempts > 255 ? 255 : chainAttempts; |
414 | |
415 | /* We know the hashtable is oversized by a factor of `bucketSize`. |
416 | * We are going to temporarily pretend `bucketSize == 1`, keeping only a |
417 | * single entry. We will use the rest of the space to construct a temporary |
418 | * chaintable. |
419 | */ |
420 | U32 const hashLog = ms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; |
421 | U32* const tmpHashTable = hashTable; |
422 | U32* const tmpChainTable = hashTable + ((size_t)1 << hashLog); |
423 | U32 const tmpChainSize = (U32)((1 << ZSTD_LAZY_DDSS_BUCKET_LOG) - 1) << hashLog; |
424 | U32 const tmpMinChain = tmpChainSize < target ? target - tmpChainSize : idx; |
425 | U32 hashIdx; |
426 | |
427 | assert(ms->cParams.chainLog <= 24); |
428 | assert(ms->cParams.hashLog > ms->cParams.chainLog); |
429 | assert(idx != 0); |
430 | assert(tmpMinChain <= minChain); |
431 | |
432 | /* fill conventional hash table and conventional chain table */ |
433 | for ( ; idx < target; idx++) { |
434 | U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch); |
435 | if (idx >= tmpMinChain) { |
436 | tmpChainTable[idx - tmpMinChain] = hashTable[h]; |
437 | } |
438 | tmpHashTable[h] = idx; |
439 | } |
440 | |
441 | /* sort chains into ddss chain table */ |
442 | { |
443 | U32 chainPos = 0; |
444 | for (hashIdx = 0; hashIdx < (1U << hashLog); hashIdx++) { |
445 | U32 count; |
446 | U32 countBeyondMinChain = 0; |
447 | U32 i = tmpHashTable[hashIdx]; |
448 | for (count = 0; i >= tmpMinChain && count < cacheSize; count++) { |
449 | /* skip through the chain to the first position that won't be |
450 | * in the hash cache bucket */ |
451 | if (i < minChain) { |
452 | countBeyondMinChain++; |
453 | } |
454 | i = tmpChainTable[i - tmpMinChain]; |
455 | } |
456 | if (count == cacheSize) { |
457 | for (count = 0; count < chainLimit;) { |
458 | if (i < minChain) { |
459 | if (!i || ++countBeyondMinChain > cacheSize) { |
460 | /* only allow pulling `cacheSize` number of entries |
461 | * into the cache or chainTable beyond `minChain`, |
462 | * to replace the entries pulled out of the |
463 | * chainTable into the cache. This lets us reach |
464 | * back further without increasing the total number |
465 | * of entries in the chainTable, guaranteeing the |
466 | * DDSS chain table will fit into the space |
467 | * allocated for the regular one. */ |
468 | break; |
469 | } |
470 | } |
471 | chainTable[chainPos++] = i; |
472 | count++; |
473 | if (i < tmpMinChain) { |
474 | break; |
475 | } |
476 | i = tmpChainTable[i - tmpMinChain]; |
477 | } |
478 | } else { |
479 | count = 0; |
480 | } |
481 | if (count) { |
482 | tmpHashTable[hashIdx] = ((chainPos - count) << 8) + count; |
483 | } else { |
484 | tmpHashTable[hashIdx] = 0; |
485 | } |
486 | } |
487 | assert(chainPos <= chainSize); /* I believe this is guaranteed... */ |
488 | } |
489 | |
490 | /* move chain pointers into the last entry of each hash bucket */ |
491 | for (hashIdx = (1 << hashLog); hashIdx; ) { |
492 | U32 const bucketIdx = --hashIdx << ZSTD_LAZY_DDSS_BUCKET_LOG; |
493 | U32 const chainPackedPointer = tmpHashTable[hashIdx]; |
494 | U32 i; |
495 | for (i = 0; i < cacheSize; i++) { |
496 | hashTable[bucketIdx + i] = 0; |
497 | } |
498 | hashTable[bucketIdx + bucketSize - 1] = chainPackedPointer; |
499 | } |
500 | |
501 | /* fill the buckets of the hash table */ |
502 | for (idx = ms->nextToUpdate; idx < target; idx++) { |
503 | U32 const h = (U32)ZSTD_hashPtr(base + idx, hashLog, ms->cParams.minMatch) |
504 | << ZSTD_LAZY_DDSS_BUCKET_LOG; |
505 | U32 i; |
506 | /* Shift hash cache down 1. */ |
507 | for (i = cacheSize - 1; i; i--) |
508 | hashTable[h + i] = hashTable[h + i - 1]; |
509 | hashTable[h] = idx; |
510 | } |
511 | |
512 | ms->nextToUpdate = target; |
513 | } |
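/* Illustrative sketch, not part of the library : the packed chain pointer written into
 * the last slot of each DDSS hash bucket above, and read back in
 * ZSTD_dedicatedDictSearch_lazy_search(). The helper name below is hypothetical. */
#if 0
static void DDSS_unpackChainPointer_sketch(U32 chainPackedPointer,
                                           U32* chainIndex, U32* chainLength)
{
    *chainIndex  = chainPackedPointer >> 8;   /* first chainTable slot of this bucket's chain */
    *chainLength = chainPackedPointer & 0xFF; /* nb of chain entries, capped at 255 (chainLimit) */
}
#endif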
514 | |
515 | /* Returns the longest match length found in the dedicated dict search structure. |
516 | * If none are longer than the argument ml, then ml will be returned. |
517 | */ |
518 | FORCE_INLINE_TEMPLATE |
519 | size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts, |
520 | const ZSTD_matchState_t* const dms, |
521 | const BYTE* const ip, const BYTE* const iLimit, |
522 | const BYTE* const prefixStart, const U32 curr, |
523 | const U32 dictLimit, const size_t ddsIdx) { |
524 | const U32 ddsLowestIndex = dms->window.dictLimit; |
525 | const BYTE* const ddsBase = dms->window.base; |
526 | const BYTE* const ddsEnd = dms->window.nextSrc; |
527 | const U32 ddsSize = (U32)(ddsEnd - ddsBase); |
528 | const U32 ddsIndexDelta = dictLimit - ddsSize; |
529 | const U32 bucketSize = (1 << ZSTD_LAZY_DDSS_BUCKET_LOG); |
530 | const U32 bucketLimit = nbAttempts < bucketSize - 1 ? nbAttempts : bucketSize - 1; |
531 | U32 ddsAttempt; |
532 | U32 matchIndex; |
533 | |
534 | for (ddsAttempt = 0; ddsAttempt < bucketSize - 1; ddsAttempt++) { |
535 | PREFETCH_L1(ddsBase + dms->hashTable[ddsIdx + ddsAttempt]); |
536 | } |
537 | |
538 | { |
539 | U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; |
540 | U32 const chainIndex = chainPackedPointer >> 8; |
541 | |
542 | PREFETCH_L1(&dms->chainTable[chainIndex]); |
543 | } |
544 | |
545 | for (ddsAttempt = 0; ddsAttempt < bucketLimit; ddsAttempt++) { |
546 | size_t currentMl=0; |
547 | const BYTE* match; |
548 | matchIndex = dms->hashTable[ddsIdx + ddsAttempt]; |
549 | match = ddsBase + matchIndex; |
550 | |
551 | if (!matchIndex) { |
552 | return ml; |
553 | } |
554 | |
555 | /* guaranteed by table construction */ |
556 | (void)ddsLowestIndex; |
557 | assert(matchIndex >= ddsLowestIndex); |
558 | assert(match+4 <= ddsEnd); |
559 | if (MEM_read32(match) == MEM_read32(ip)) { |
560 | /* assumption : matchIndex <= dictLimit-4 (by table construction) */ |
561 | currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; |
562 | } |
563 | |
564 | /* save best solution */ |
565 | if (currentMl > ml) { |
566 | ml = currentMl; |
567 | *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); |
568 | if (ip+currentMl == iLimit) { |
569 | /* best possible, avoids read overflow on next attempt */ |
570 | return ml; |
571 | } |
572 | } |
573 | } |
574 | |
575 | { |
576 | U32 const chainPackedPointer = dms->hashTable[ddsIdx + bucketSize - 1]; |
577 | U32 chainIndex = chainPackedPointer >> 8; |
578 | U32 const chainLength = chainPackedPointer & 0xFF; |
579 | U32 const chainAttempts = nbAttempts - ddsAttempt; |
580 | U32 const chainLimit = chainAttempts > chainLength ? chainLength : chainAttempts; |
581 | U32 chainAttempt; |
582 | |
583 | for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++) { |
584 | PREFETCH_L1(ddsBase + dms->chainTable[chainIndex + chainAttempt]); |
585 | } |
586 | |
587 | for (chainAttempt = 0 ; chainAttempt < chainLimit; chainAttempt++, chainIndex++) { |
588 | size_t currentMl=0; |
589 | const BYTE* match; |
590 | matchIndex = dms->chainTable[chainIndex]; |
591 | match = ddsBase + matchIndex; |
592 | |
593 | /* guaranteed by table construction */ |
594 | assert(matchIndex >= ddsLowestIndex); |
595 | assert(match+4 <= ddsEnd); |
596 | if (MEM_read32(match) == MEM_read32(ip)) { |
597 | /* assumption : matchIndex <= dictLimit-4 (by table construction) */ |
598 | currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, ddsEnd, prefixStart) + 4; |
599 | } |
600 | |
601 | /* save best solution */ |
602 | if (currentMl > ml) { |
603 | ml = currentMl; |
604 | *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); |
605 | if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ |
606 | } |
607 | } |
608 | } |
609 | return ml; |
610 | } |
611 | |
612 | |
613 | /* ********************************* |
614 | * Hash Chain |
615 | ***********************************/ |
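/* The chain table acts as a rolling singly-linked list : entry (d & mask) stores the
 * previous position whose hash fell into the same bucket, so repeatedly following
 * NEXT_IN_CHAIN() walks candidates from nearest to farthest. */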
616 | #define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)] |
617 | |
618 | /* Update chains up to ip (excluded) |
619 | Assumption : always within prefix (i.e. not within extDict) */ |
620 | FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( |
621 | ZSTD_matchState_t* ms, |
622 | const ZSTD_compressionParameters* const cParams, |
623 | const BYTE* ip, U32 const mls, U32 const lazySkipping) |
624 | { |
625 | U32* const hashTable = ms->hashTable; |
626 | const U32 hashLog = cParams->hashLog; |
627 | U32* const chainTable = ms->chainTable; |
628 | const U32 chainMask = (1 << cParams->chainLog) - 1; |
629 | const BYTE* const base = ms->window.base; |
630 | const U32 target = (U32)(ip - base); |
631 | U32 idx = ms->nextToUpdate; |
632 | |
633 | while(idx < target) { /* catch up */ |
634 | size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls); |
635 | NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; |
636 | hashTable[h] = idx; |
637 | idx++; |
638 | /* Stop inserting every position when in the lazy skipping mode. */ |
639 | if (lazySkipping) |
640 | break; |
641 | } |
642 | |
643 | ms->nextToUpdate = target; |
644 | return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; |
645 | } |
646 | |
647 | U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { |
648 | const ZSTD_compressionParameters* const cParams = &ms->cParams; |
649 | return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); |
650 | } |
651 | |
652 | /* inlining is important to hardwire a hot branch (template emulation) */ |
653 | FORCE_INLINE_TEMPLATE |
654 | size_t ZSTD_HcFindBestMatch( |
655 | ZSTD_matchState_t* ms, |
656 | const BYTE* const ip, const BYTE* const iLimit, |
657 | size_t* offsetPtr, |
658 | const U32 mls, const ZSTD_dictMode_e dictMode) |
659 | { |
660 | const ZSTD_compressionParameters* const cParams = &ms->cParams; |
661 | U32* const chainTable = ms->chainTable; |
662 | const U32 chainSize = (1 << cParams->chainLog); |
663 | const U32 chainMask = chainSize-1; |
664 | const BYTE* const base = ms->window.base; |
665 | const BYTE* const dictBase = ms->window.dictBase; |
666 | const U32 dictLimit = ms->window.dictLimit; |
667 | const BYTE* const prefixStart = base + dictLimit; |
668 | const BYTE* const dictEnd = dictBase + dictLimit; |
669 | const U32 curr = (U32)(ip-base); |
670 | const U32 maxDistance = 1U << cParams->windowLog; |
671 | const U32 lowestValid = ms->window.lowLimit; |
672 | const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; |
673 | const U32 isDictionary = (ms->loadedDictEnd != 0); |
674 | const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance; |
675 | const U32 minChain = curr > chainSize ? curr - chainSize : 0; |
676 | U32 nbAttempts = 1U << cParams->searchLog; |
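    /* ml starts at 3 (= 4-1), so the first "potentially better" check below reads
     * 4 bytes at offset ml - 3 == 0 : any match of length >= 4 improves on it */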
677 | size_t ml=4-1; |
678 | |
679 | const ZSTD_matchState_t* const dms = ms->dictMatchState; |
680 | const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch |
681 | ? dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0; |
682 | const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch |
683 | ? ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG : 0; |
684 | |
685 | U32 matchIndex; |
686 | |
687 | if (dictMode == ZSTD_dedicatedDictSearch) { |
688 | const U32* entry = &dms->hashTable[ddsIdx]; |
689 | PREFETCH_L1(entry); |
690 | } |
691 | |
692 | /* HC4 match finder */ |
693 | matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); |
694 | |
695 | for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { |
696 | size_t currentMl=0; |
697 | if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { |
698 | const BYTE* const match = base + matchIndex; |
699 | assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ |
700 | /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ |
701 | if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ |
702 | currentMl = ZSTD_count(ip, match, iLimit); |
703 | } else { |
704 | const BYTE* const match = dictBase + matchIndex; |
705 | assert(match+4 <= dictEnd); |
706 | if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ |
707 | currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4; |
708 | } |
709 | |
710 | /* save best solution */ |
711 | if (currentMl > ml) { |
712 | ml = currentMl; |
713 | *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); |
714 | if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ |
715 | } |
716 | |
717 | if (matchIndex <= minChain) break; |
718 | matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask); |
719 | } |
720 | |
721 | assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ |
722 | if (dictMode == ZSTD_dedicatedDictSearch) { |
723 | ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts, dms, |
724 | ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); |
725 | } else if (dictMode == ZSTD_dictMatchState) { |
726 | const U32* const dmsChainTable = dms->chainTable; |
727 | const U32 dmsChainSize = (1 << dms->cParams.chainLog); |
728 | const U32 dmsChainMask = dmsChainSize - 1; |
729 | const U32 dmsLowestIndex = dms->window.dictLimit; |
730 | const BYTE* const dmsBase = dms->window.base; |
731 | const BYTE* const dmsEnd = dms->window.nextSrc; |
732 | const U32 dmsSize = (U32)(dmsEnd - dmsBase); |
733 | const U32 dmsIndexDelta = dictLimit - dmsSize; |
734 | const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0; |
735 | |
736 | matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)]; |
737 | |
738 | for ( ; (matchIndex>=dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) { |
739 | size_t currentMl=0; |
740 | const BYTE* const match = dmsBase + matchIndex; |
741 | assert(match+4 <= dmsEnd); |
742 | if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ |
743 | currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4; |
744 | |
745 | /* save best solution */ |
746 | if (currentMl > ml) { |
747 | ml = currentMl; |
748 | assert(curr > matchIndex + dmsIndexDelta); |
749 | *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); |
750 | if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ |
751 | } |
752 | |
753 | if (matchIndex <= dmsMinChain) break; |
754 | |
755 | matchIndex = dmsChainTable[matchIndex & dmsChainMask]; |
756 | } |
757 | } |
758 | |
759 | return ml; |
760 | } |
761 | |
762 | /* ********************************* |
763 | * (SIMD) Row-based matchfinder |
764 | ***********************************/ |
765 | /* Constants for row-based hash */ |
766 | #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) |
767 | #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ |
768 | |
769 | #define ZSTD_ROW_HASH_CACHE_MASK (ZSTD_ROW_HASH_CACHE_SIZE - 1) |
770 | |
771 | typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 representing a mask of matches */ |
772 | |
773 | /* ZSTD_VecMask_next(): |
774 | * Starting from the LSB, returns the idx of the next non-zero bit. |
775 | * Basically counting the nb of trailing zeroes. |
776 | */ |
777 | MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { |
778 | return ZSTD_countTrailingZeros64(val); |
779 | } |
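/* Illustrative sketch, not part of the library : the typical way such a mask is
 * consumed (see ZSTD_RowFindBestMatch() below), clearing the lowest set bit each step. */
#if 0
static void ZSTD_VecMask_forEach_sketch(ZSTD_VecMask mask)
{
    for (; mask > 0; mask &= (mask - 1)) {  /* clear lowest set bit */
        U32 const bitIdx = ZSTD_VecMask_next(mask);
        (void)bitIdx;  /* bitIdx / groupWidth recovers the entry's position within the row */
    }
}
#endif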
780 | |
781 | /* ZSTD_row_nextIndex(): |
782 | * Returns the next index to insert at within a tagTable row, and updates the "head" |
783 | * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) |
784 | */ |
785 | FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { |
786 | U32 next = (*tagRow-1) & rowMask; |
787 | next += (next == 0) ? rowMask : 0; /* skip first position */ |
788 | *tagRow = (BYTE)next; |
789 | return next; |
790 | } |
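/* Example, assuming rowMask == 15 : the head value cycles 15 -> 14 -> ... -> 1 -> 15,
 * never landing on 0, since byte 0 of the tag row stores the head itself. */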
791 | |
792 | /* ZSTD_isAligned(): |
793 | * Checks that a pointer is aligned to "align" bytes which must be a power of 2. |
794 | */ |
795 | MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { |
796 | assert((align & (align - 1)) == 0); |
797 | return (((size_t)ptr) & (align - 1)) == 0; |
798 | } |
799 | |
800 | /* ZSTD_row_prefetch(): |
801 | * Performs prefetching for the hashTable and tagTable at a given row. |
802 | */ |
803 | FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { |
804 | PREFETCH_L1(hashTable + relRow); |
805 | if (rowLog >= 5) { |
806 | PREFETCH_L1(hashTable + relRow + 16); |
807 | /* Note: prefetching more of the hash table does not appear to be beneficial for 128-entry rows */ |
808 | } |
809 | PREFETCH_L1(tagTable + relRow); |
810 | if (rowLog == 6) { |
811 | PREFETCH_L1(tagTable + relRow + 32); |
812 | } |
813 | assert(rowLog == 4 || rowLog == 5 || rowLog == 6); |
814 | assert(ZSTD_isAligned(hashTable + relRow, 64)); /* prefetched hash row always 64-byte aligned */ |
    assert(ZSTD_isAligned(tagTable + relRow, (size_t)1 << rowLog)); /* prefetched tagRow sits on correct multiple of bytes (16,32,64) */
816 | } |
817 | |
818 | /* ZSTD_row_fillHashCache(): |
819 | * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, |
820 | * but not beyond iLimit. |
821 | */ |
822 | FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, |
823 | U32 const rowLog, U32 const mls, |
824 | U32 idx, const BYTE* const iLimit) |
825 | { |
826 | U32 const* const hashTable = ms->hashTable; |
827 | BYTE const* const tagTable = ms->tagTable; |
828 | U32 const hashLog = ms->rowHashLog; |
829 | U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); |
830 | U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); |
831 | |
832 | for (; idx < lim; ++idx) { |
833 | U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); |
834 | U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; |
835 | ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); |
836 | ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; |
837 | } |
838 | |
    DEBUGLOG(6, "ZSTD_row_fillHashCache(): [%u %u %u %u %u %u %u %u]", ms->hashCache[0], ms->hashCache[1],
840 | ms->hashCache[2], ms->hashCache[3], ms->hashCache[4], |
841 | ms->hashCache[5], ms->hashCache[6], ms->hashCache[7]); |
842 | } |
843 | |
844 | /* ZSTD_row_nextCachedHash(): |
 * Returns the hash of base + idx, and replaces the hash in the hash cache with the hash
 * of the bytes at base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable.
847 | */ |
848 | FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, |
849 | BYTE const* tagTable, BYTE const* base, |
850 | U32 idx, U32 const hashLog, |
851 | U32 const rowLog, U32 const mls, |
852 | U64 const hashSalt) |
853 | { |
854 | U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); |
855 | U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; |
856 | ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); |
857 | { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; |
858 | cache[idx & ZSTD_ROW_HASH_CACHE_MASK] = newHash; |
859 | return hash; |
860 | } |
861 | } |
862 | |
863 | /* ZSTD_row_update_internalImpl(): |
864 | * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. |
865 | */ |
866 | FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, |
867 | U32 updateStartIdx, U32 const updateEndIdx, |
868 | U32 const mls, U32 const rowLog, |
869 | U32 const rowMask, U32 const useCache) |
870 | { |
871 | U32* const hashTable = ms->hashTable; |
872 | BYTE* const tagTable = ms->tagTable; |
873 | U32 const hashLog = ms->rowHashLog; |
874 | const BYTE* const base = ms->window.base; |
875 | |
    DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx);
877 | for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { |
878 | U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) |
879 | : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); |
880 | U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; |
881 | U32* const row = hashTable + relRow; |
882 | BYTE* tagRow = tagTable + relRow; |
883 | U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); |
884 | |
885 | assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); |
886 | tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; |
887 | row[pos] = updateStartIdx; |
888 | } |
889 | } |
890 | |
891 | /* ZSTD_row_update_internal(): |
892 | * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. |
 * Skips over sections of long matches when necessary.
894 | */ |
895 | FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, |
896 | U32 const mls, U32 const rowLog, |
897 | U32 const rowMask, U32 const useCache) |
898 | { |
899 | U32 idx = ms->nextToUpdate; |
900 | const BYTE* const base = ms->window.base; |
901 | const U32 target = (U32)(ip - base); |
902 | const U32 kSkipThreshold = 384; |
903 | const U32 kMaxMatchStartPositionsToUpdate = 96; |
904 | const U32 kMaxMatchEndPositionsToUpdate = 32; |
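    /* e.g. with a pending gap of 1000 positions, only the first 96 and the last 32
     * positions of the gap are inserted ; everything in between is skipped */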
905 | |
906 | if (useCache) { |
907 | /* Only skip positions when using hash cache, i.e. |
908 | * if we are loading a dict, don't skip anything. |
909 | * If we decide to skip, then we only update a set number |
910 | * of positions at the beginning and end of the match. |
911 | */ |
912 | if (UNLIKELY(target - idx > kSkipThreshold)) { |
913 | U32 const bound = idx + kMaxMatchStartPositionsToUpdate; |
914 | ZSTD_row_update_internalImpl(ms, idx, bound, mls, rowLog, rowMask, useCache); |
915 | idx = target - kMaxMatchEndPositionsToUpdate; |
916 | ZSTD_row_fillHashCache(ms, base, rowLog, mls, idx, ip+1); |
917 | } |
918 | } |
919 | assert(target >= idx); |
920 | ZSTD_row_update_internalImpl(ms, idx, target, mls, rowLog, rowMask, useCache); |
921 | ms->nextToUpdate = target; |
922 | } |
923 | |
924 | /* ZSTD_row_update(): |
925 | * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary |
926 | * processing. |
927 | */ |
928 | void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { |
929 | const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); |
930 | const U32 rowMask = (1u << rowLog) - 1; |
931 | const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); |
932 | |
    DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog);
934 | ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); |
935 | } |
936 | |
/* Returns the width of the group of bits that is set to 1 for each matching entry.
 * Since not all architectures have an easy movemask instruction, grouping the match
 * bits makes them easier and faster to iterate over.
 */
941 | FORCE_INLINE_TEMPLATE U32 |
942 | ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) |
943 | { |
944 | assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); |
945 | assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); |
946 | (void)rowEntries; |
947 | #if defined(ZSTD_ARCH_ARM_NEON) |
948 | /* NEON path only works for little endian */ |
949 | if (!MEM_isLittleEndian()) { |
950 | return 1; |
951 | } |
952 | if (rowEntries == 16) { |
953 | return 4; |
954 | } |
955 | if (rowEntries == 32) { |
956 | return 2; |
957 | } |
958 | if (rowEntries == 64) { |
959 | return 1; |
960 | } |
961 | #endif |
962 | return 1; |
963 | } |
964 | |
965 | #if defined(ZSTD_ARCH_X86_SSE2) |
966 | FORCE_INLINE_TEMPLATE ZSTD_VecMask |
967 | ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U32 head) |
968 | { |
969 | const __m128i comparisonMask = _mm_set1_epi8((char)tag); |
970 | int matches[4] = {0}; |
971 | int i; |
972 | assert(nbChunks == 1 || nbChunks == 2 || nbChunks == 4); |
973 | for (i=0; i<nbChunks; i++) { |
974 | const __m128i chunk = _mm_loadu_si128((const __m128i*)(const void*)(src + 16*i)); |
975 | const __m128i equalMask = _mm_cmpeq_epi8(chunk, comparisonMask); |
976 | matches[i] = _mm_movemask_epi8(equalMask); |
977 | } |
978 | if (nbChunks == 1) return ZSTD_rotateRight_U16((U16)matches[0], head); |
979 | if (nbChunks == 2) return ZSTD_rotateRight_U32((U32)matches[1] << 16 | (U32)matches[0], head); |
980 | assert(nbChunks == 4); |
981 | return ZSTD_rotateRight_U64((U64)matches[3] << 48 | (U64)matches[2] << 32 | (U64)matches[1] << 16 | (U64)matches[0], head); |
982 | } |
983 | #endif |
984 | |
985 | #if defined(ZSTD_ARCH_ARM_NEON) |
986 | FORCE_INLINE_TEMPLATE ZSTD_VecMask |
987 | ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) |
988 | { |
989 | assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); |
990 | if (rowEntries == 16) { |
991 | /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. |
992 | * After that groups of 4 bits represent the equalMask. We lower |
993 | * all bits except the highest in these groups by doing AND with |
994 | * 0x88 = 0b10001000. |
995 | */ |
996 | const uint8x16_t chunk = vld1q_u8(src); |
997 | const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); |
998 | const uint8x8_t res = vshrn_n_u16(equalMask, 4); |
999 | const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); |
1000 | return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; |
1001 | } else if (rowEntries == 32) { |
1002 | /* Same idea as with rowEntries == 16 but doing AND with |
1003 | * 0x55 = 0b01010101. |
1004 | */ |
1005 | const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); |
1006 | const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); |
1007 | const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); |
1008 | const uint8x16_t dup = vdupq_n_u8(tag); |
1009 | const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); |
1010 | const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); |
1011 | const uint8x8_t res = vsli_n_u8(t0, t1, 4); |
        const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0);
1013 | return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; |
1014 | } else { /* rowEntries == 64 */ |
1015 | const uint8x16x4_t chunk = vld4q_u8(src); |
1016 | const uint8x16_t dup = vdupq_n_u8(tag); |
1017 | const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); |
1018 | const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); |
1019 | const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); |
1020 | const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); |
1021 | |
1022 | const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); |
1023 | const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); |
1024 | const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); |
1025 | const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); |
1026 | const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); |
1027 | const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); |
1028 | return ZSTD_rotateRight_U64(matches, headGrouped); |
1029 | } |
1030 | } |
1031 | #endif |
1032 | |
1033 | /* Returns a ZSTD_VecMask (U64) that has the nth group (determined by |
1034 | * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" |
1035 | * matches the hash at the nth position in a row of the tagTable. |
1036 | * Each row is a circular buffer beginning at the value of "headGrouped". So we |
1037 | * must rotate the "matches" bitfield to match up with the actual layout of the |
1038 | * entries within the hashTable */ |
1039 | FORCE_INLINE_TEMPLATE ZSTD_VecMask |
1040 | ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) |
1041 | { |
1042 | const BYTE* const src = tagRow; |
1043 | assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); |
1044 | assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); |
1045 | assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); |
1046 | |
1047 | #if defined(ZSTD_ARCH_X86_SSE2) |
1048 | |
1049 | return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); |
1050 | |
1051 | #else /* SW or NEON-LE */ |
1052 | |
1053 | # if defined(ZSTD_ARCH_ARM_NEON) |
1054 | /* This NEON path only works for little endian - otherwise use SWAR below */ |
1055 | if (MEM_isLittleEndian()) { |
1056 | return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); |
1057 | } |
1058 | # endif /* ZSTD_ARCH_ARM_NEON */ |
1059 | /* SWAR */ |
1060 | { const int chunkSize = sizeof(size_t); |
1061 | const size_t shiftAmount = ((chunkSize * 8) - chunkSize); |
1062 | const size_t xFF = ~((size_t)0); |
1063 | const size_t x01 = xFF / 0xFF; |
1064 | const size_t x80 = x01 << 7; |
1065 | const size_t splatChar = tag * x01; |
1066 | ZSTD_VecMask matches = 0; |
1067 | int i = rowEntries - chunkSize; |
1068 | assert((sizeof(size_t) == 4) || (sizeof(size_t) == 8)); |
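        /* Zero-byte SWAR trick, used by both loops below : after `chunk ^= splatChar`,
         * bytes equal to `tag` become zero. (((chunk | x80) - x01) | chunk) & x80 then
         * leaves 0x80 in every nonzero byte and 0x00 in every zero byte.
         * Per-byte example : b == 0x00 -> ((0x80 - 0x01) | 0x00) & 0x80 == 0x00 ;
         *                    b == 0x01 -> ((0x81 - 0x01) | 0x01) & 0x80 == 0x80.
         * The multiply by extractMagic gathers one bit per byte into the top byte, and
         * the final `matches = ~matches` flips them so set bits mark tag matches. */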
1069 | if (MEM_isLittleEndian()) { /* runtime check so have two loops */ |
1070 | const size_t extractMagic = (xFF / 0x7F) >> chunkSize; |
1071 | do { |
1072 | size_t chunk = MEM_readST(&src[i]); |
1073 | chunk ^= splatChar; |
1074 | chunk = (((chunk | x80) - x01) | chunk) & x80; |
1075 | matches <<= chunkSize; |
1076 | matches |= (chunk * extractMagic) >> shiftAmount; |
1077 | i -= chunkSize; |
1078 | } while (i >= 0); |
1079 | } else { /* big endian: reverse bits during extraction */ |
1080 | const size_t msb = xFF ^ (xFF >> 1); |
1081 | const size_t extractMagic = (msb / 0x1FF) | msb; |
1082 | do { |
1083 | size_t chunk = MEM_readST(&src[i]); |
1084 | chunk ^= splatChar; |
1085 | chunk = (((chunk | x80) - x01) | chunk) & x80; |
1086 | matches <<= chunkSize; |
1087 | matches |= ((chunk >> 7) * extractMagic) >> shiftAmount; |
1088 | i -= chunkSize; |
1089 | } while (i >= 0); |
1090 | } |
1091 | matches = ~matches; |
1092 | if (rowEntries == 16) { |
1093 | return ZSTD_rotateRight_U16((U16)matches, headGrouped); |
1094 | } else if (rowEntries == 32) { |
1095 | return ZSTD_rotateRight_U32((U32)matches, headGrouped); |
1096 | } else { |
1097 | return ZSTD_rotateRight_U64((U64)matches, headGrouped); |
1098 | } |
1099 | } |
1100 | #endif |
1101 | } |
1102 | |
1103 | /* The high-level approach of the SIMD row based match finder is as follows: |
1104 | * - Figure out where to insert the new entry: |
1105 | * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" |
 *      - The hashTable is effectively split into groups or "rows" of 16, 32 or 64 entries of U32, and the hash determines
 *        which row to insert into.
 *      - Determine the correct position within the row to insert the entry into. Each row of 16, 32 or 64 can
 *        be considered as a circular buffer with a "head" index that resides in the tagTable.
 *      - Also insert the "tag" into the equivalent row and position in the tagTable.
 *          - Note: byte 0 of each tagTable row stores the "head" index, and the tags fill the
 *            remaining bytes, so position 0 of a row never holds a tag and is skipped
 *            during both insertion and matching.
1114 | * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and |
1115 | * generate a bitfield that we can cycle through to check the collisions in the hash table. |
1116 | * - Pick the longest match. |
1117 | */ |
1118 | FORCE_INLINE_TEMPLATE |
1119 | size_t ZSTD_RowFindBestMatch( |
1120 | ZSTD_matchState_t* ms, |
1121 | const BYTE* const ip, const BYTE* const iLimit, |
1122 | size_t* offsetPtr, |
1123 | const U32 mls, const ZSTD_dictMode_e dictMode, |
1124 | const U32 rowLog) |
1125 | { |
1126 | U32* const hashTable = ms->hashTable; |
1127 | BYTE* const tagTable = ms->tagTable; |
1128 | U32* const hashCache = ms->hashCache; |
1129 | const U32 hashLog = ms->rowHashLog; |
1130 | const ZSTD_compressionParameters* const cParams = &ms->cParams; |
1131 | const BYTE* const base = ms->window.base; |
1132 | const BYTE* const dictBase = ms->window.dictBase; |
1133 | const U32 dictLimit = ms->window.dictLimit; |
1134 | const BYTE* const prefixStart = base + dictLimit; |
1135 | const BYTE* const dictEnd = dictBase + dictLimit; |
1136 | const U32 curr = (U32)(ip-base); |
1137 | const U32 maxDistance = 1U << cParams->windowLog; |
1138 | const U32 lowestValid = ms->window.lowLimit; |
1139 | const U32 withinMaxDistance = (curr - lowestValid > maxDistance) ? curr - maxDistance : lowestValid; |
1140 | const U32 isDictionary = (ms->loadedDictEnd != 0); |
1141 | const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance; |
1142 | const U32 rowEntries = (1U << rowLog); |
1143 | const U32 rowMask = rowEntries - 1; |
1144 | const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ |
1145 | const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); |
1146 | const U64 hashSalt = ms->hashSalt; |
1147 | U32 nbAttempts = 1U << cappedSearchLog; |
1148 | size_t ml=4-1; |
1149 | U32 hash; |
1150 | |
    /* DMS/DDS variables that may be referenced later */
1152 | const ZSTD_matchState_t* const dms = ms->dictMatchState; |
1153 | |
1154 | /* Initialize the following variables to satisfy static analyzer */ |
1155 | size_t ddsIdx = 0; |
    U32 ddsExtraAttempts = 0; /* cctx hash tables are limited in searches, but allow extra searches into DDS */
1157 | U32 dmsTag = 0; |
1158 | U32* dmsRow = NULL; |
1159 | BYTE* dmsTagRow = NULL; |
1160 | |
1161 | if (dictMode == ZSTD_dedicatedDictSearch) { |
1162 | const U32 ddsHashLog = dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG; |
1163 | { /* Prefetch DDS hashtable entry */ |
1164 | ddsIdx = ZSTD_hashPtr(ip, ddsHashLog, mls) << ZSTD_LAZY_DDSS_BUCKET_LOG; |
1165 | PREFETCH_L1(&dms->hashTable[ddsIdx]); |
1166 | } |
1167 | ddsExtraAttempts = cParams->searchLog > rowLog ? 1U << (cParams->searchLog - rowLog) : 0; |
1168 | } |
1169 | |
1170 | if (dictMode == ZSTD_dictMatchState) { |
1171 | /* Prefetch DMS rows */ |
1172 | U32* const dmsHashTable = dms->hashTable; |
1173 | BYTE* const dmsTagTable = dms->tagTable; |
1174 | U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); |
1175 | U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; |
1176 | dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; |
1177 | dmsTagRow = (BYTE*)(dmsTagTable + dmsRelRow); |
1178 | dmsRow = dmsHashTable + dmsRelRow; |
1179 | ZSTD_row_prefetch(dmsHashTable, dmsTagTable, dmsRelRow, rowLog); |
1180 | } |
1181 | |
1182 | /* Update the hashTable and tagTable up to (but not including) ip */ |
1183 | if (!ms->lazySkipping) { |
1184 | ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); |
1185 | hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); |
1186 | } else { |
1187 | /* Stop inserting every position when in the lazy skipping mode. |
1188 | * The hash cache is also not kept up to date in this mode. |
1189 | */ |
1190 | hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); |
1191 | ms->nextToUpdate = curr; |
1192 | } |
1193 | ms->hashSaltEntropy += hash; /* collect salt entropy */ |
1194 | |
1195 | { /* Get the hash for ip, compute the appropriate row */ |
1196 | U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; |
1197 | U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; |
1198 | U32* const row = hashTable + relRow; |
1199 | BYTE* tagRow = (BYTE*)(tagTable + relRow); |
1200 | U32 const headGrouped = (*tagRow & rowMask) * groupWidth; |
1201 | U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; |
1202 | size_t numMatches = 0; |
1203 | size_t currMatch = 0; |
1204 | ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); |
1205 | |
1206 | /* Cycle through the matches and prefetch */ |
1207 | for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { |
1208 | U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; |
1209 | U32 const matchIndex = row[matchPos]; |
1210 | if(matchPos == 0) continue; |
1211 | assert(numMatches < rowEntries); |
1212 | if (matchIndex < lowLimit) |
1213 | break; |
1214 | if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { |
1215 | PREFETCH_L1(base + matchIndex); |
1216 | } else { |
1217 | PREFETCH_L1(dictBase + matchIndex); |
1218 | } |
1219 | matchBuffer[numMatches++] = matchIndex; |
1220 | --nbAttempts; |
1221 | } |
1222 | |
1223 | /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop |
1224 | in ZSTD_row_update_internal() at the next search. */ |
1225 | { |
1226 | U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); |
1227 | tagRow[pos] = (BYTE)tag; |
1228 | row[pos] = ms->nextToUpdate++; |
1229 | } |
1230 | |
1231 | /* Return the longest match */ |
1232 | for (; currMatch < numMatches; ++currMatch) { |
1233 | U32 const matchIndex = matchBuffer[currMatch]; |
1234 | size_t currentMl=0; |
1235 | assert(matchIndex < curr); |
1236 | assert(matchIndex >= lowLimit); |
1237 | |
1238 | if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { |
1239 | const BYTE* const match = base + matchIndex; |
1240 | assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ |
1241 | /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ |
1242 | if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ |
1243 | currentMl = ZSTD_count(ip, match, iLimit); |
1244 | } else { |
1245 | const BYTE* const match = dictBase + matchIndex; |
1246 | assert(match+4 <= dictEnd); |
1247 | if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ |
1248 | currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4; |
1249 | } |
1250 | |
1251 | /* Save best solution */ |
1252 | if (currentMl > ml) { |
1253 | ml = currentMl; |
1254 | *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); |
1255 | if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ |
1256 | } |
1257 | } |
1258 | } |
1259 | |
1260 | assert(nbAttempts <= (1U << ZSTD_SEARCHLOG_MAX)); /* Check we haven't underflowed. */ |
1261 | if (dictMode == ZSTD_dedicatedDictSearch) { |
1262 | ml = ZSTD_dedicatedDictSearch_lazy_search(offsetPtr, ml, nbAttempts + ddsExtraAttempts, dms, |
1263 | ip, iLimit, prefixStart, curr, dictLimit, ddsIdx); |
1264 | } else if (dictMode == ZSTD_dictMatchState) { |
1265 | /* TODO: Measure and potentially add prefetching to DMS */ |
1266 | const U32 dmsLowestIndex = dms->window.dictLimit; |
1267 | const BYTE* const dmsBase = dms->window.base; |
1268 | const BYTE* const dmsEnd = dms->window.nextSrc; |
1269 | const U32 dmsSize = (U32)(dmsEnd - dmsBase); |
1270 | const U32 dmsIndexDelta = dictLimit - dmsSize; |
1271 | |
1272 | { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; |
1273 | U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; |
1274 | size_t numMatches = 0; |
1275 | size_t currMatch = 0; |
1276 | ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); |
1277 | |
1278 | for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { |
1279 | U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; |
1280 | U32 const matchIndex = dmsRow[matchPos]; |
                if (matchPos == 0) continue;
1282 | if (matchIndex < dmsLowestIndex) |
1283 | break; |
1284 | PREFETCH_L1(dmsBase + matchIndex); |
1285 | matchBuffer[numMatches++] = matchIndex; |
1286 | --nbAttempts; |
1287 | } |
1288 | |
1289 | /* Return the longest match */ |
1290 | for (; currMatch < numMatches; ++currMatch) { |
1291 | U32 const matchIndex = matchBuffer[currMatch]; |
1292 | size_t currentMl=0; |
1293 | assert(matchIndex >= dmsLowestIndex); |
1294 | assert(matchIndex < curr); |
1295 | |
1296 | { const BYTE* const match = dmsBase + matchIndex; |
1297 | assert(match+4 <= dmsEnd); |
1298 | if (MEM_read32(match) == MEM_read32(ip)) |
1299 | currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4; |
1300 | } |
1301 | |
1302 | if (currentMl > ml) { |
1303 | ml = currentMl; |
1304 | assert(curr > matchIndex + dmsIndexDelta); |
1305 | *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); |
1306 | if (ip+currentMl == iLimit) break; |
1307 | } |
1308 | } |
1309 | } |
1310 | } |
1311 | return ml; |
1312 | } |
1313 | |
1314 | |
1315 | /** |
1316 | * Generate search functions templated on (dictMode, mls, rowLog). |
1317 | * These functions are outlined for code size & compilation time. |
1318 | * ZSTD_searchMax() dispatches to the correct implementation function. |
1319 | * |
1320 | * TODO: The start of the search function involves loading and calculating a |
1321 | * bunch of constants from the ZSTD_matchState_t. These computations could be |
1322 | * done in an initialization function, and saved somewhere in the match state. |
1323 | * Then we could pass a pointer to the saved state instead of the match state, |
1324 | * and avoid duplicate computations. |
1325 | * |
1326 | * TODO: Move the match re-winding into searchMax. This improves compression |
1327 | * ratio, and unlocks further simplifications with the next TODO. |
1328 | * |
1329 | * TODO: Try moving the repcode search into searchMax. After the re-winding |
1330 | * and repcode search are in searchMax, there is no more logic in the match |
1331 | * finder loop that requires knowledge about the dictMode. So we should be |
1332 | * able to avoid force inlining it, and we can join the extDict loop with |
1333 | * the single segment loop. It should go in searchMax instead of its own |
1334 | * function to avoid having multiple virtual function calls per search. |
1335 | */ |
1336 | |
1337 | #define ZSTD_BT_SEARCH_FN(dictMode, mls) ZSTD_BtFindBestMatch_##dictMode##_##mls |
1338 | #define ZSTD_HC_SEARCH_FN(dictMode, mls) ZSTD_HcFindBestMatch_##dictMode##_##mls |
1339 | #define ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) ZSTD_RowFindBestMatch_##dictMode##_##mls##_##rowLog |
1340 | |
1341 | #define ZSTD_SEARCH_FN_ATTRS FORCE_NOINLINE |
1342 | |
1343 | #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \ |
1344 | ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \ |
1345 | ZSTD_matchState_t* ms, \ |
1346 | const BYTE* ip, const BYTE* const iLimit, \ |
1347 | size_t* offBasePtr) \ |
1348 | { \ |
1349 | assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ |
1350 | return ZSTD_BtFindBestMatch(ms, ip, iLimit, offBasePtr, mls, ZSTD_##dictMode); \ |
1351 | } \ |
1352 | |
1353 | #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \ |
1354 | ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \ |
1355 | ZSTD_matchState_t* ms, \ |
1356 | const BYTE* ip, const BYTE* const iLimit, \ |
1357 | size_t* offsetPtr) \ |
1358 | { \ |
1359 | assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ |
1360 | return ZSTD_HcFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode); \ |
1361 | } \ |
1362 | |
1363 | #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \ |
1364 | ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \ |
1365 | ZSTD_matchState_t* ms, \ |
1366 | const BYTE* ip, const BYTE* const iLimit, \ |
1367 | size_t* offsetPtr) \ |
1368 | { \ |
1369 | assert(MAX(4, MIN(6, ms->cParams.minMatch)) == mls); \ |
1370 | assert(MAX(4, MIN(6, ms->cParams.searchLog)) == rowLog); \ |
1371 | return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, mls, ZSTD_##dictMode, rowLog); \ |
1372 | } \ |
1373 | |
1374 | #define ZSTD_FOR_EACH_ROWLOG(X, dictMode, mls) \ |
1375 | X(dictMode, mls, 4) \ |
1376 | X(dictMode, mls, 5) \ |
1377 | X(dictMode, mls, 6) |
1378 | |
1379 | #define ZSTD_FOR_EACH_MLS_ROWLOG(X, dictMode) \ |
1380 | ZSTD_FOR_EACH_ROWLOG(X, dictMode, 4) \ |
1381 | ZSTD_FOR_EACH_ROWLOG(X, dictMode, 5) \ |
1382 | ZSTD_FOR_EACH_ROWLOG(X, dictMode, 6) |
1383 | |
1384 | #define ZSTD_FOR_EACH_MLS(X, dictMode) \ |
1385 | X(dictMode, 4) \ |
1386 | X(dictMode, 5) \ |
1387 | X(dictMode, 6) |
1388 | |
1389 | #define ZSTD_FOR_EACH_DICT_MODE(X, ...) \ |
1390 | X(__VA_ARGS__, noDict) \ |
1391 | X(__VA_ARGS__, extDict) \ |
1392 | X(__VA_ARGS__, dictMatchState) \ |
1393 | X(__VA_ARGS__, dedicatedDictSearch) |
1394 | |
1395 | /* Generate row search fns for each combination of (dictMode, mls, rowLog) */ |
1396 | ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS_ROWLOG, GEN_ZSTD_ROW_SEARCH_FN) |
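/* For reference, a hand-expanded sketch (modulo whitespace) of one generated
 * instance, GEN_ZSTD_ROW_SEARCH_FN(noDict, 4, 4):
 *
 *   FORCE_NOINLINE size_t ZSTD_RowFindBestMatch_noDict_4_4(
 *               ZSTD_matchState_t* ms,
 *               const BYTE* ip, const BYTE* const iLimit,
 *               size_t* offsetPtr)
 *   {
 *       assert(MAX(4, MIN(6, ms->cParams.minMatch)) == 4);
 *       assert(MAX(4, MIN(6, ms->cParams.searchLog)) == 4);
 *       return ZSTD_RowFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict, 4);
 *   }
 */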
1397 | /* Generate binary Tree search fns for each combination of (dictMode, mls) */ |
1398 | ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_BT_SEARCH_FN) |
1399 | /* Generate hash chain search fns for each combination of (dictMode, mls) */ |
1400 | ZSTD_FOR_EACH_DICT_MODE(ZSTD_FOR_EACH_MLS, GEN_ZSTD_HC_SEARCH_FN) |
1401 | |
1402 | typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searchMethod_e; |
1403 | |
1404 | #define GEN_ZSTD_CALL_BT_SEARCH_FN(dictMode, mls) \ |
1405 | case mls: \ |
1406 | return ZSTD_BT_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); |
1407 | #define GEN_ZSTD_CALL_HC_SEARCH_FN(dictMode, mls) \ |
1408 | case mls: \ |
1409 | return ZSTD_HC_SEARCH_FN(dictMode, mls)(ms, ip, iend, offsetPtr); |
1410 | #define GEN_ZSTD_CALL_ROW_SEARCH_FN(dictMode, mls, rowLog) \ |
1411 | case rowLog: \ |
1412 | return ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)(ms, ip, iend, offsetPtr); |
1413 | |
1414 | #define ZSTD_SWITCH_MLS(X, dictMode) \ |
1415 | switch (mls) { \ |
1416 | ZSTD_FOR_EACH_MLS(X, dictMode) \ |
1417 | } |
1418 | |
1419 | #define ZSTD_SWITCH_ROWLOG(dictMode, mls) \ |
1420 | case mls: \ |
1421 | switch (rowLog) { \ |
1422 | ZSTD_FOR_EACH_ROWLOG(GEN_ZSTD_CALL_ROW_SEARCH_FN, dictMode, mls) \ |
1423 | } \ |
1424 | ZSTD_UNREACHABLE; \ |
1425 | break; |
1426 | |
1427 | #define ZSTD_SWITCH_SEARCH_METHOD(dictMode) \ |
1428 | switch (searchMethod) { \ |
1429 | case search_hashChain: \ |
1430 | ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, dictMode) \ |
1431 | break; \ |
1432 | case search_binaryTree: \ |
1433 | ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_BT_SEARCH_FN, dictMode) \ |
1434 | break; \ |
1435 | case search_rowHash: \ |
1436 | ZSTD_SWITCH_MLS(ZSTD_SWITCH_ROWLOG, dictMode) \ |
1437 | break; \ |
1438 | } \ |
1439 | ZSTD_UNREACHABLE; |
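
/* Hand-expanded sketch of ZSTD_SWITCH_MLS(GEN_ZSTD_CALL_HC_SEARCH_FN, noDict):
 *
 *   switch (mls) {
 *   case 4: return ZSTD_HcFindBestMatch_noDict_4(ms, ip, iend, offsetPtr);
 *   case 5: return ZSTD_HcFindBestMatch_noDict_5(ms, ip, iend, offsetPtr);
 *   case 6: return ZSTD_HcFindBestMatch_noDict_6(ms, ip, iend, offsetPtr);
 *   }
 *
 * searchMethod and dictMode are compile-time constants at every call site, so
 * the outer switches fold away and only the mls / rowLog dispatch remains. */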
1440 | |
1441 | /** |
1442 | * Searches for the longest match at @p ip. |
1443 | * Dispatches to the correct implementation function based on the |
1444 | * (searchMethod, dictMode, mls, rowLog). We use switch statements |
1445 | * here instead of using an indirect function call through a function |
1446 | * pointer because after Spectre and Meltdown mitigations, indirect |
1447 | * function calls can be very costly, especially in the kernel. |
1448 | * |
1449 | * NOTE: dictMode and searchMethod should be templated, so those switch |
1450 | * statements should be optimized out. Only the mls & rowLog switches |
1451 | * should be left. |
1452 | * |
1453 | * @param ms The match state. |
1454 | * @param ip The position to search at. |
1455 | * @param iend The end of the input data. |
1456 | * @param[out] offsetPtr Stores the match offset into this pointer. |
1457 | * @param mls The minimum search length, in the range [4, 6]. |
1458 | * @param rowLog The row log (if applicable), in the range [4, 6]. |
1459 | * @param searchMethod The search method to use (templated). |
1460 | * @param dictMode The dictMode (templated). |
1461 | * |
1462 | * @returns The length of the longest match found, or < mls if no match is found. |
1463 | * If a match is found its offset is stored in @p offsetPtr. |
1464 | */ |
1465 | FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( |
1466 | ZSTD_matchState_t* ms, |
1467 | const BYTE* ip, |
1468 | const BYTE* iend, |
1469 | size_t* offsetPtr, |
1470 | U32 const mls, |
1471 | U32 const rowLog, |
1472 | searchMethod_e const searchMethod, |
1473 | ZSTD_dictMode_e const dictMode) |
1474 | { |
1475 | if (dictMode == ZSTD_noDict) { |
1476 | ZSTD_SWITCH_SEARCH_METHOD(noDict) |
1477 | } else if (dictMode == ZSTD_extDict) { |
1478 | ZSTD_SWITCH_SEARCH_METHOD(extDict) |
1479 | } else if (dictMode == ZSTD_dictMatchState) { |
1480 | ZSTD_SWITCH_SEARCH_METHOD(dictMatchState) |
1481 | } else if (dictMode == ZSTD_dedicatedDictSearch) { |
1482 | ZSTD_SWITCH_SEARCH_METHOD(dedicatedDictSearch) |
1483 | } |
1484 | ZSTD_UNREACHABLE; |
1485 | return 0; |
1486 | } |
1487 | |
1488 | /* ******************************* |
1489 | * Common parser - lazy strategy |
1490 | *********************************/ |
1491 | |
1492 | FORCE_INLINE_TEMPLATE size_t |
1493 | ZSTD_compressBlock_lazy_generic( |
1494 | ZSTD_matchState_t* ms, seqStore_t* seqStore, |
1495 | U32 rep[ZSTD_REP_NUM], |
1496 | const void* src, size_t srcSize, |
1497 | const searchMethod_e searchMethod, const U32 depth, |
1498 | ZSTD_dictMode_e const dictMode) |
1499 | { |
1500 | const BYTE* const istart = (const BYTE*)src; |
1501 | const BYTE* ip = istart; |
1502 | const BYTE* anchor = istart; |
1503 | const BYTE* const iend = istart + srcSize; |
1504 | const BYTE* const ilimit = (searchMethod == search_rowHash) ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; |
1505 | const BYTE* const base = ms->window.base; |
1506 | const U32 prefixLowestIndex = ms->window.dictLimit; |
1507 | const BYTE* const prefixLowest = base + prefixLowestIndex; |
1508 | const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); |
1509 | const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); |
1510 | |
1511 | U32 offset_1 = rep[0], offset_2 = rep[1]; |
1512 | U32 offsetSaved1 = 0, offsetSaved2 = 0; |
1513 | |
1514 | const int isDMS = dictMode == ZSTD_dictMatchState; |
1515 | const int isDDS = dictMode == ZSTD_dedicatedDictSearch; |
1516 | const int isDxS = isDMS || isDDS; |
1517 | const ZSTD_matchState_t* const dms = ms->dictMatchState; |
1518 | const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0; |
1519 | const BYTE* const dictBase = isDxS ? dms->window.base : NULL; |
1520 | const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL; |
1521 | const BYTE* const dictEnd = isDxS ? dms->window.nextSrc : NULL; |
1522 | const U32 dictIndexDelta = isDxS ? |
1523 | prefixLowestIndex - (U32)(dictEnd - dictBase) : |
1524 | 0; |
1525 | const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest)); |
1526 | |
    DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u) (searchFunc=%u)", (U32)dictMode, (U32)searchMethod);
1528 | ip += (dictAndPrefixLength == 0); |
1529 | if (dictMode == ZSTD_noDict) { |
1530 | U32 const curr = (U32)(ip - base); |
1531 | U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); |
1532 | U32 const maxRep = curr - windowLow; |
1533 | if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; |
1534 | if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; |
1535 | } |
1536 | if (isDxS) { |
1537 | /* dictMatchState repCode checks don't currently handle repCode == 0 |
1538 | * disabling. */ |
1539 | assert(offset_1 <= dictAndPrefixLength); |
1540 | assert(offset_2 <= dictAndPrefixLength); |
1541 | } |
1542 | |
1543 | /* Reset the lazy skipping state */ |
1544 | ms->lazySkipping = 0; |
1545 | |
1546 | if (searchMethod == search_rowHash) { |
1547 | ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); |
1548 | } |
1549 | |
1550 | /* Match Loop */ |
1551 | #if defined(__GNUC__) && defined(__x86_64__) |
    /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
     * code alignment is perturbed. To fix the instability, align the loop on
     * a 32-byte boundary.
     */
    __asm__(".p2align 5");
1556 | #endif |
1557 | while (ip < ilimit) { |
1558 | size_t matchLength=0; |
1559 | size_t offBase = REPCODE1_TO_OFFBASE; |
1560 | const BYTE* start=ip+1; |
        DEBUGLOG(7, "search baseline (depth 0)");
1562 | |
1563 | /* check repCode */ |
1564 | if (isDxS) { |
1565 | const U32 repIndex = (U32)(ip - base) + 1 - offset_1; |
1566 | const BYTE* repMatch = ((dictMode == ZSTD_dictMatchState || dictMode == ZSTD_dedicatedDictSearch) |
1567 | && repIndex < prefixLowestIndex) ? |
1568 | dictBase + (repIndex - dictIndexDelta) : |
1569 | base + repIndex; |
1570 | if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) |
1571 | && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { |
1572 | const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; |
1573 | matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; |
1574 | if (depth==0) goto _storeSequence; |
1575 | } |
1576 | } |
1577 | if ( dictMode == ZSTD_noDict |
1578 | && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) { |
1579 | matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; |
1580 | if (depth==0) goto _storeSequence; |
1581 | } |
1582 | |
1583 | /* first search (depth 0) */ |
1584 | { size_t offbaseFound = 999999999; |
1585 | size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); |
1586 | if (ml2 > matchLength) |
1587 | matchLength = ml2, start = ip, offBase = offbaseFound; |
1588 | } |
1589 | |
1590 | if (matchLength < 4) { |
            size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
            ip += step;
            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
             * In this mode we stop inserting every position into our tables, and only insert
             * positions that we search, i.e. one in every `step` positions.
             * The exact cutoff is flexible; I've simply chosen a number that is reasonably
             * high, so that we minimize the compression ratio loss in "normal" scenarios.
             * This mode gets triggered once we've gone 2KB without finding any matches.
             */
1600 | ms->lazySkipping = step > kLazySkippingStep; |
1601 | continue; |
1602 | } |
1603 | |
1604 | /* let's try to find a better solution */ |
1605 | if (depth>=1) |
1606 | while (ip<ilimit) { |
            DEBUGLOG(7, "search depth 1");
1608 | ip ++; |
1609 | if ( (dictMode == ZSTD_noDict) |
1610 | && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { |
1611 | size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; |
1612 | int const gain2 = (int)(mlRep * 3); |
1613 | int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); |
1614 | if ((mlRep >= 4) && (gain2 > gain1)) |
1615 | matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; |
1616 | } |
1617 | if (isDxS) { |
1618 | const U32 repIndex = (U32)(ip - base) - offset_1; |
1619 | const BYTE* repMatch = repIndex < prefixLowestIndex ? |
1620 | dictBase + (repIndex - dictIndexDelta) : |
1621 | base + repIndex; |
1622 | if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) |
1623 | && (MEM_read32(repMatch) == MEM_read32(ip)) ) { |
1624 | const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; |
1625 | size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; |
1626 | int const gain2 = (int)(mlRep * 3); |
1627 | int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); |
1628 | if ((mlRep >= 4) && (gain2 > gain1)) |
1629 | matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; |
1630 | } |
1631 | } |
1632 | { size_t ofbCandidate=999999999; |
1633 | size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); |
1634 | int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ |
1635 | int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); |
1636 | if ((ml2 >= 4) && (gain2 > gain1)) { |
1637 | matchLength = ml2, offBase = ofbCandidate, start = ip; |
1638 | continue; /* search a better one */ |
1639 | } } |
1640 | |
1641 | /* let's find an even better one */ |
1642 | if ((depth==2) && (ip<ilimit)) { |
                DEBUGLOG(7, "search depth 2");
1644 | ip ++; |
1645 | if ( (dictMode == ZSTD_noDict) |
1646 | && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { |
1647 | size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; |
1648 | int const gain2 = (int)(mlRep * 4); |
1649 | int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); |
1650 | if ((mlRep >= 4) && (gain2 > gain1)) |
1651 | matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; |
1652 | } |
1653 | if (isDxS) { |
1654 | const U32 repIndex = (U32)(ip - base) - offset_1; |
1655 | const BYTE* repMatch = repIndex < prefixLowestIndex ? |
1656 | dictBase + (repIndex - dictIndexDelta) : |
1657 | base + repIndex; |
1658 | if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) |
1659 | && (MEM_read32(repMatch) == MEM_read32(ip)) ) { |
1660 | const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; |
1661 | size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; |
1662 | int const gain2 = (int)(mlRep * 4); |
1663 | int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); |
1664 | if ((mlRep >= 4) && (gain2 > gain1)) |
1665 | matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; |
1666 | } |
1667 | } |
1668 | { size_t ofbCandidate=999999999; |
1669 | size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); |
1670 | int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ |
1671 | int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); |
1672 | if ((ml2 >= 4) && (gain2 > gain1)) { |
1673 | matchLength = ml2, offBase = ofbCandidate, start = ip; |
1674 | continue; |
1675 | } } } |
1676 | break; /* nothing found : store previous solution */ |
1677 | } |
1678 | |
        /* NOTE:
         * Beware that `start[-value]` can trigger undefined behavior, notably when
         * `value` is unsigned: `-value` then wraps to a very large positive offset.
         */
1683 | /* catch up */ |
1684 | if (OFFBASE_IS_OFFSET(offBase)) { |
1685 | if (dictMode == ZSTD_noDict) { |
1686 | while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) |
1687 | && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ |
1688 | { start--; matchLength++; } |
1689 | } |
1690 | if (isDxS) { |
1691 | U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); |
1692 | const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; |
1693 | const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; |
1694 | while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ |
1695 | } |
1696 | offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); |
1697 | } |
1698 | /* store sequence */ |
1699 | _storeSequence: |
1700 | { size_t const litLength = (size_t)(start - anchor); |
1701 | ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); |
1702 | anchor = ip = start + matchLength; |
1703 | } |
1704 | if (ms->lazySkipping) { |
1705 | /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ |
1706 | if (searchMethod == search_rowHash) { |
1707 | ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); |
1708 | } |
1709 | ms->lazySkipping = 0; |
1710 | } |
1711 | |
1712 | /* check immediate repcode */ |
1713 | if (isDxS) { |
1714 | while (ip <= ilimit) { |
1715 | U32 const current2 = (U32)(ip-base); |
1716 | U32 const repIndex = current2 - offset_2; |
1717 | const BYTE* repMatch = repIndex < prefixLowestIndex ? |
1718 | dictBase - dictIndexDelta + repIndex : |
1719 | base + repIndex; |
                if ( ((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
1721 | && (MEM_read32(repMatch) == MEM_read32(ip)) ) { |
1722 | const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; |
1723 | matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; |
1724 | offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ |
1725 | ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); |
1726 | ip += matchLength; |
1727 | anchor = ip; |
1728 | continue; |
1729 | } |
1730 | break; |
1731 | } |
1732 | } |
1733 | |
1734 | if (dictMode == ZSTD_noDict) { |
1735 | while ( ((ip <= ilimit) & (offset_2>0)) |
1736 | && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { |
1737 | /* store sequence */ |
1738 | matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; |
1739 | offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ |
1740 | ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); |
1741 | ip += matchLength; |
1742 | anchor = ip; |
1743 | continue; /* faster when present ... (?) */ |
1744 | } } } |
1745 | |
1746 | /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), |
1747 | * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ |
1748 | offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; |
1749 | |
1750 | /* save reps for next block */ |
1751 | rep[0] = offset_1 ? offset_1 : offsetSaved1; |
1752 | rep[1] = offset_2 ? offset_2 : offsetSaved2; |
1753 | |
1754 | /* Return the last literals size */ |
1755 | return (size_t)(iend - anchor); |
1756 | } |
1757 | |
1758 | |
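/* The exported entry points below only pin the template parameters:
 * greedy == depth 0, lazy == depth 1, lazy2 == depth 2; a "bt" prefix selects
 * the binary-tree matchfinder and a "_row" suffix the row-hash one (hash
 * chain otherwise); the remaining suffix selects the dictMode. */
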
1759 | size_t ZSTD_compressBlock_btlazy2( |
1760 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1761 | void const* src, size_t srcSize) |
1762 | { |
1763 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); |
1764 | } |
1765 | |
1766 | size_t ZSTD_compressBlock_lazy2( |
1767 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1768 | void const* src, size_t srcSize) |
1769 | { |
1770 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); |
1771 | } |
1772 | |
1773 | size_t ZSTD_compressBlock_lazy( |
1774 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1775 | void const* src, size_t srcSize) |
1776 | { |
1777 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); |
1778 | } |
1779 | |
1780 | size_t ZSTD_compressBlock_greedy( |
1781 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1782 | void const* src, size_t srcSize) |
1783 | { |
1784 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); |
1785 | } |
1786 | |
1787 | size_t ZSTD_compressBlock_btlazy2_dictMatchState( |
1788 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1789 | void const* src, size_t srcSize) |
1790 | { |
1791 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); |
1792 | } |
1793 | |
1794 | size_t ZSTD_compressBlock_lazy2_dictMatchState( |
1795 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1796 | void const* src, size_t srcSize) |
1797 | { |
1798 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); |
1799 | } |
1800 | |
1801 | size_t ZSTD_compressBlock_lazy_dictMatchState( |
1802 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1803 | void const* src, size_t srcSize) |
1804 | { |
1805 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); |
1806 | } |
1807 | |
1808 | size_t ZSTD_compressBlock_greedy_dictMatchState( |
1809 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1810 | void const* src, size_t srcSize) |
1811 | { |
1812 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); |
1813 | } |
1814 | |
1815 | |
1816 | size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( |
1817 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1818 | void const* src, size_t srcSize) |
1819 | { |
1820 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); |
1821 | } |
1822 | |
1823 | size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( |
1824 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1825 | void const* src, size_t srcSize) |
1826 | { |
1827 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); |
1828 | } |
1829 | |
1830 | size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( |
1831 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1832 | void const* src, size_t srcSize) |
1833 | { |
1834 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); |
1835 | } |
1836 | |
1837 | /* Row-based matchfinder */ |
1838 | size_t ZSTD_compressBlock_lazy2_row( |
1839 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1840 | void const* src, size_t srcSize) |
1841 | { |
1842 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); |
1843 | } |
1844 | |
1845 | size_t ZSTD_compressBlock_lazy_row( |
1846 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1847 | void const* src, size_t srcSize) |
1848 | { |
1849 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); |
1850 | } |
1851 | |
1852 | size_t ZSTD_compressBlock_greedy_row( |
1853 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1854 | void const* src, size_t srcSize) |
1855 | { |
1856 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); |
1857 | } |
1858 | |
1859 | size_t ZSTD_compressBlock_lazy2_dictMatchState_row( |
1860 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1861 | void const* src, size_t srcSize) |
1862 | { |
1863 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); |
1864 | } |
1865 | |
1866 | size_t ZSTD_compressBlock_lazy_dictMatchState_row( |
1867 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1868 | void const* src, size_t srcSize) |
1869 | { |
1870 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); |
1871 | } |
1872 | |
1873 | size_t ZSTD_compressBlock_greedy_dictMatchState_row( |
1874 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1875 | void const* src, size_t srcSize) |
1876 | { |
1877 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); |
1878 | } |
1879 | |
1880 | |
1881 | size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( |
1882 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1883 | void const* src, size_t srcSize) |
1884 | { |
1885 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); |
1886 | } |
1887 | |
1888 | size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( |
1889 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1890 | void const* src, size_t srcSize) |
1891 | { |
1892 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); |
1893 | } |
1894 | |
1895 | size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( |
1896 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
1897 | void const* src, size_t srcSize) |
1898 | { |
1899 | return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); |
1900 | } |
1901 | |
1902 | FORCE_INLINE_TEMPLATE |
1903 | size_t ZSTD_compressBlock_lazy_extDict_generic( |
1904 | ZSTD_matchState_t* ms, seqStore_t* seqStore, |
1905 | U32 rep[ZSTD_REP_NUM], |
1906 | const void* src, size_t srcSize, |
1907 | const searchMethod_e searchMethod, const U32 depth) |
1908 | { |
1909 | const BYTE* const istart = (const BYTE*)src; |
1910 | const BYTE* ip = istart; |
1911 | const BYTE* anchor = istart; |
1912 | const BYTE* const iend = istart + srcSize; |
1913 | const BYTE* const ilimit = searchMethod == search_rowHash ? iend - 8 - ZSTD_ROW_HASH_CACHE_SIZE : iend - 8; |
1914 | const BYTE* const base = ms->window.base; |
1915 | const U32 dictLimit = ms->window.dictLimit; |
1916 | const BYTE* const prefixStart = base + dictLimit; |
1917 | const BYTE* const dictBase = ms->window.dictBase; |
1918 | const BYTE* const dictEnd = dictBase + dictLimit; |
1919 | const BYTE* const dictStart = dictBase + ms->window.lowLimit; |
1920 | const U32 windowLog = ms->cParams.windowLog; |
1921 | const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); |
1922 | const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); |
1923 | |
1924 | U32 offset_1 = rep[0], offset_2 = rep[1]; |
1925 | |
    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod);
1927 | |
1928 | /* Reset the lazy skipping state */ |
1929 | ms->lazySkipping = 0; |
1930 | |
1931 | /* init */ |
1932 | ip += (ip == prefixStart); |
1933 | if (searchMethod == search_rowHash) { |
1934 | ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); |
1935 | } |
1936 | |
1937 | /* Match Loop */ |
1938 | #if defined(__GNUC__) && defined(__x86_64__) |
    /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
     * code alignment is perturbed. To fix the instability, align the loop on
     * a 32-byte boundary.
     */
    __asm__(".p2align 5");
1943 | #endif |
1944 | while (ip < ilimit) { |
1945 | size_t matchLength=0; |
1946 | size_t offBase = REPCODE1_TO_OFFBASE; |
1947 | const BYTE* start=ip+1; |
1948 | U32 curr = (U32)(ip-base); |
1949 | |
1950 | /* check repCode */ |
1951 | { const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr+1, windowLog); |
1952 | const U32 repIndex = (U32)(curr+1 - offset_1); |
1953 | const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; |
1954 | const BYTE* const repMatch = repBase + repIndex; |
1955 | if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */ |
1956 | & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */ |
1957 | if (MEM_read32(ip+1) == MEM_read32(repMatch)) { |
1958 | /* repcode detected we should take it */ |
1959 | const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; |
1960 | matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4; |
1961 | if (depth==0) goto _storeSequence; |
1962 | } } |
1963 | |
1964 | /* first search (depth 0) */ |
1965 | { size_t ofbCandidate = 999999999; |
1966 | size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); |
1967 | if (ml2 > matchLength) |
1968 | matchLength = ml2, start = ip, offBase = ofbCandidate; |
1969 | } |
1970 | |
1971 | if (matchLength < 4) { |
1972 | size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); |
1973 | ip += step + 1; /* jump faster over incompressible sections */ |
            /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time.
             * In this mode we stop inserting every position into our tables, and only insert
             * positions that we search, i.e. one in every `step` positions.
             * The exact cutoff is flexible; I've simply chosen a number that is reasonably
             * high, so that we minimize the compression ratio loss in "normal" scenarios.
             * This mode gets triggered once we've gone 2KB without finding any matches.
             */
1981 | ms->lazySkipping = step > kLazySkippingStep; |
1982 | continue; |
1983 | } |
1984 | |
1985 | /* let's try to find a better solution */ |
1986 | if (depth>=1) |
1987 | while (ip<ilimit) { |
1988 | ip ++; |
1989 | curr++; |
1990 | /* check repCode */ |
1991 | if (offBase) { |
1992 | const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); |
1993 | const U32 repIndex = (U32)(curr - offset_1); |
1994 | const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; |
1995 | const BYTE* const repMatch = repBase + repIndex; |
1996 | if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ |
1997 | & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ |
1998 | if (MEM_read32(ip) == MEM_read32(repMatch)) { |
1999 | /* repcode detected */ |
2000 | const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; |
2001 | size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; |
2002 | int const gain2 = (int)(repLength * 3); |
2003 | int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); |
2004 | if ((repLength >= 4) && (gain2 > gain1)) |
2005 | matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; |
2006 | } } |
2007 | |
2008 | /* search match, depth 1 */ |
2009 | { size_t ofbCandidate = 999999999; |
2010 | size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); |
2011 | int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ |
2012 | int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); |
2013 | if ((ml2 >= 4) && (gain2 > gain1)) { |
2014 | matchLength = ml2, offBase = ofbCandidate, start = ip; |
2015 | continue; /* search a better one */ |
2016 | } } |
2017 | |
2018 | /* let's find an even better one */ |
2019 | if ((depth==2) && (ip<ilimit)) { |
2020 | ip ++; |
2021 | curr++; |
2022 | /* check repCode */ |
2023 | if (offBase) { |
2024 | const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); |
2025 | const U32 repIndex = (U32)(curr - offset_1); |
2026 | const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; |
2027 | const BYTE* const repMatch = repBase + repIndex; |
2028 | if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ |
2029 | & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ |
2030 | if (MEM_read32(ip) == MEM_read32(repMatch)) { |
2031 | /* repcode detected */ |
2032 | const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; |
2033 | size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; |
2034 | int const gain2 = (int)(repLength * 4); |
2035 | int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); |
2036 | if ((repLength >= 4) && (gain2 > gain1)) |
2037 | matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; |
2038 | } } |
2039 | |
2040 | /* search match, depth 2 */ |
2041 | { size_t ofbCandidate = 999999999; |
2042 | size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); |
2043 | int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ |
2044 | int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); |
2045 | if ((ml2 >= 4) && (gain2 > gain1)) { |
2046 | matchLength = ml2, offBase = ofbCandidate, start = ip; |
2047 | continue; |
2048 | } } } |
2049 | break; /* nothing found : store previous solution */ |
2050 | } |
2051 | |
2052 | /* catch up */ |
2053 | if (OFFBASE_IS_OFFSET(offBase)) { |
2054 | U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); |
2055 | const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; |
2056 | const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; |
2057 | while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ |
2058 | offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); |
2059 | } |
2060 | |
2061 | /* store sequence */ |
2062 | _storeSequence: |
2063 | { size_t const litLength = (size_t)(start - anchor); |
2064 | ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); |
2065 | anchor = ip = start + matchLength; |
2066 | } |
2067 | if (ms->lazySkipping) { |
2068 | /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ |
2069 | if (searchMethod == search_rowHash) { |
2070 | ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); |
2071 | } |
2072 | ms->lazySkipping = 0; |
2073 | } |
2074 | |
2075 | /* check immediate repcode */ |
2076 | while (ip <= ilimit) { |
2077 | const U32 repCurrent = (U32)(ip-base); |
2078 | const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog); |
2079 | const U32 repIndex = repCurrent - offset_2; |
2080 | const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; |
2081 | const BYTE* const repMatch = repBase + repIndex; |
2082 | if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ |
2083 | & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ |
2084 | if (MEM_read32(ip) == MEM_read32(repMatch)) { |
2085 | /* repcode detected we should take it */ |
2086 | const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; |
2087 | matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; |
2088 | offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ |
2089 | ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); |
2090 | ip += matchLength; |
2091 | anchor = ip; |
2092 | continue; /* faster when present ... (?) */ |
2093 | } |
2094 | break; |
2095 | } } |
2096 | |
2097 | /* Save reps for next block */ |
2098 | rep[0] = offset_1; |
2099 | rep[1] = offset_2; |
2100 | |
2101 | /* Return the last literals size */ |
2102 | return (size_t)(iend - anchor); |
2103 | } |
2104 | |
2105 | |
2106 | size_t ZSTD_compressBlock_greedy_extDict( |
2107 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
2108 | void const* src, size_t srcSize) |
2109 | { |
2110 | return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); |
2111 | } |
2112 | |
2113 | size_t ZSTD_compressBlock_lazy_extDict( |
2114 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
        void const* src, size_t srcSize)
{
2118 | return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); |
2119 | } |
2120 | |
2121 | size_t ZSTD_compressBlock_lazy2_extDict( |
2122 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
        void const* src, size_t srcSize)
{
2126 | return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); |
2127 | } |
2128 | |
2129 | size_t ZSTD_compressBlock_btlazy2_extDict( |
2130 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
        void const* src, size_t srcSize)
{
2134 | return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); |
2135 | } |
2136 | |
2137 | size_t ZSTD_compressBlock_greedy_extDict_row( |
2138 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
2139 | void const* src, size_t srcSize) |
2140 | { |
2141 | return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); |
2142 | } |
2143 | |
2144 | size_t ZSTD_compressBlock_lazy_extDict_row( |
2145 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
        void const* src, size_t srcSize)
{
2149 | return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); |
2150 | } |
2151 | |
2152 | size_t ZSTD_compressBlock_lazy2_extDict_row( |
2153 | ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], |
2154 | void const* src, size_t srcSize) |
2155 | { |
2156 | return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); |
2157 | } |
2158 | |