zstd_fast.c source code [Godot/thirdparty/zstd/compress/zstd_fast.c]

1	/*
2	* Copyright (c) Meta Platforms, Inc. and affiliates.
3	* All rights reserved.
4	*
5	* This source code is licensed under both the BSD-style license (found in the
6	* LICENSE file in the root directory of this source tree) and the GPLv2 (found
7	* in the COPYING file in the root directory of this source tree).
8	* You may select, at your option, one of the above-listed licenses.
9	*/
10
11	#include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */
12	#include "zstd_fast.h"
13
14	static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms,
15	const void* const end,
16	ZSTD_dictTableLoadMethod_e dtlm)
17	{
18	const ZSTD_compressionParameters* const cParams = &ms->cParams;
19	U32* const hashTable = ms->hashTable;
20	U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
21	U32 const mls = cParams->minMatch;
22	const BYTE* const base = ms->window.base;
23	const BYTE* ip = base + ms->nextToUpdate;
24	const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
25	const U32 fastHashFillStep = `3`;
26
27	/ Currently, we always use ZSTD_dtlm_full for filling CDict tables.*
28	* Feel free to remove this assert if there's a good reason! */
29	assert(dtlm == ZSTD_dtlm_full);
30
31	/ Always insert every fastHashFillStep position into the hash table.*
32	* Insert the other positions if their hash entry is empty.
33	*/
34	for ( ; ip + fastHashFillStep < iend + `2`; ip += fastHashFillStep) {
35	U32 const curr = (U32)(ip - base);
36	{ size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls);
37	ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); }
38
39	if (dtlm == ZSTD_dtlm_fast) continue;
40	/ Only load extra positions for ZSTD_dtlm_full /
41	{ U32 p;
42	for (p = `1`; p < fastHashFillStep; ++p) {
43	size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls);
44	if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == `0`) { / not yet filled /
45	ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p);
46	} } } }
47	}
48
49	static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms,
50	const void* const end,
51	ZSTD_dictTableLoadMethod_e dtlm)
52	{
53	const ZSTD_compressionParameters* const cParams = &ms->cParams;
54	U32* const hashTable = ms->hashTable;
55	U32 const hBits = cParams->hashLog;
56	U32 const mls = cParams->minMatch;
57	const BYTE* const base = ms->window.base;
58	const BYTE* ip = base + ms->nextToUpdate;
59	const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE;
60	const U32 fastHashFillStep = `3`;
61
62	/ Currently, we always use ZSTD_dtlm_fast for filling CCtx tables.*
63	* Feel free to remove this assert if there's a good reason! */
64	assert(dtlm == ZSTD_dtlm_fast);
65
66	/ Always insert every fastHashFillStep position into the hash table.*
67	* Insert the other positions if their hash entry is empty.
68	*/
69	for ( ; ip + fastHashFillStep < iend + `2`; ip += fastHashFillStep) {
70	U32 const curr = (U32)(ip - base);
71	size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls);
72	hashTable[hash0] = curr;
73	if (dtlm == ZSTD_dtlm_fast) continue;
74	/ Only load extra positions for ZSTD_dtlm_full /
75	{ U32 p;
76	for (p = `1`; p < fastHashFillStep; ++p) {
77	size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls);
78	if (hashTable[hash] == `0`) { / not yet filled /
79	hashTable[hash] = curr + p;
80	} } } }
81	}
82
83	void ZSTD_fillHashTable(ZSTD_matchState_t* ms,
84	const void* const end,
85	ZSTD_dictTableLoadMethod_e dtlm,
86	ZSTD_tableFillPurpose_e tfp)
87	{
88	if (tfp == ZSTD_tfp_forCDict) {
89	ZSTD_fillHashTableForCDict(ms, end, dtlm);
90	} else {
91	ZSTD_fillHashTableForCCtx(ms, end, dtlm);
92	}
93	}
94
95
96	/**
97	* If you squint hard enough (and ignore repcodes), the search operation at any
98	* given position is broken into 4 stages:
99	*
100	* 1. Hash (map position to hash value via input read)
101	* 2. Lookup (map hash val to index via hashtable read)
102	* 3. Load (map index to value at that position via input read)
103	* 4. Compare
104	*
105	* Each of these steps involves a memory read at an address which is computed
106	* from the previous step. This means these steps must be sequenced and their
107	* latencies are cumulative.
108	*
109	* Rather than do 1->2->3->4 sequentially for a single position before moving
110	* onto the next, this implementation interleaves these operations across the
111	* next few positions:
112	*
113	* R = Repcode Read & Compare
114	* H = Hash
115	* T = Table Lookup
116	* M = Match Read & Compare
117	*
118	* Pos \| Time -->
119	* ----+-------------------
120	* N \| ... M
121	* N+1 \| ... TM
122	* N+2 \| R H T M
123	* N+3 \| H TM
124	* N+4 \| R H T M
125	* N+5 \| H ...
126	* N+6 \| R ...
127	*
128	* This is very much analogous to the pipelining of execution in a CPU. And just
129	* like a CPU, we have to dump the pipeline when we find a match (i.e., take a
130	* branch).
131	*
132	* When this happens, we throw away our current state, and do the following prep
133	* to re-enter the loop:
134	*
135	* Pos \| Time -->
136	* ----+-------------------
137	* N \| H T
138	* N+1 \| H
139	*
140	* This is also the work we do at the beginning to enter the loop initially.
141	*/
142	FORCE_INLINE_TEMPLATE size_t
143	ZSTD_compressBlock_fast_noDict_generic(
144	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
145	void const* src, size_t srcSize,
146	U32 const mls, U32 const hasStep)
147	{
148	const ZSTD_compressionParameters* const cParams = &ms->cParams;
149	U32* const hashTable = ms->hashTable;
150	U32 const hlog = cParams->hashLog;
151	/ support stepSize of 0 /
152	size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + `1`) : `2`;
153	const BYTE* const base = ms->window.base;
154	const BYTE* const istart = (const BYTE*)src;
155	const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
156	const U32 prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog);
157	const BYTE* const prefixStart = base + prefixStartIndex;
158	const BYTE* const iend = istart + srcSize;
159	const BYTE* const ilimit = iend - HASH_READ_SIZE;
160
161	const BYTE* anchor = istart;
162	const BYTE* ip0 = istart;
163	const BYTE* ip1;
164	const BYTE* ip2;
165	const BYTE* ip3;
166	U32 current0;
167
168	U32 rep_offset1 = rep[`0`];
169	U32 rep_offset2 = rep[`1`];
170	U32 offsetSaved1 = `0`, offsetSaved2 = `0`;
171
172	size_t hash0; / hash for ip0 /
173	size_t hash1; / hash for ip1 /
174	U32 idx; / match idx for ip0 /
175	U32 mval; / src value at match idx /
176
177	U32 offcode;
178	const BYTE* match0;
179	size_t mLength;
180
181	/ ip0 and ip1 are always adjacent. The targetLength skipping and*
182	* uncompressibility acceleration is applied to every other position,
183	* matching the behavior of #1562. step therefore represents the gap
184	* between pairs of positions, from ip0 to ip2 or ip1 to ip3. */
185	size_t step;
186	const BYTE* nextStep;
187	const size_t kStepIncr = (`1` << (kSearchStrength - `1`));
188
189	DEBUGLOG(`5`, "ZSTD_compressBlock_fast_generic");
190	ip0 += (ip0 == prefixStart);
191	{ U32 const curr = (U32)(ip0 - base);
192	U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog);
193	U32 const maxRep = curr - windowLow;
194	if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = `0`;
195	if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = `0`;
196	}
197
198	/ start each op /
199	_start: / Requires: ip0 /
200
201	step = stepSize;
202	nextStep = ip0 + kStepIncr;
203
204	/ calculate positions, ip0 - anchor == 0, so we skip step calc /
205	ip1 = ip0 + `1`;
206	ip2 = ip0 + step;
207	ip3 = ip2 + `1`;
208
209	if (ip3 >= ilimit) {
210	goto _cleanup;
211	}
212
213	hash0 = ZSTD_hashPtr(ip0, hlog, mls);
214	hash1 = ZSTD_hashPtr(ip1, hlog, mls);
215
216	idx = hashTable[hash0];
217
218	do {
219	/ load repcode match for ip[2]/
220	const U32 rval = MEM_read32(ip2 - rep_offset1);
221
222	/ write back hash table entry /
223	current0 = (U32)(ip0 - base);
224	hashTable[hash0] = current0;
225
226	/ check repcode at ip[2] /
227	if ((MEM_read32(ip2) == rval) & (rep_offset1 > `0`)) {
228	ip0 = ip2;
229	match0 = ip0 - rep_offset1;
230	mLength = ip0[-`1`] == match0[-`1`];
231	ip0 -= mLength;
232	match0 -= mLength;
233	offcode = REPCODE1_TO_OFFBASE;
234	mLength += `4`;
235
236	/ First write next hash table entry; we've already calculated it.*
237	* This write is known to be safe because the ip1 is before the
238	* repcode (ip2). */
239	hashTable[hash1] = (U32)(ip1 - base);
240
241	goto _match;
242	}
243
244	/ load match for ip[0] /
245	if (idx >= prefixStartIndex) {
246	mval = MEM_read32(base + idx);
247	} else {
248	mval = MEM_read32(ip0) ^ `1`; / guaranteed to not match. /
249	}
250
251	/ check match at ip[0] /
252	if (MEM_read32(ip0) == mval) {
253	/ found a match! /
254
255	/ First write next hash table entry; we've already calculated it.*
256	* This write is known to be safe because the ip1 == ip0 + 1, so
257	* we know we will resume searching after ip1 */
258	hashTable[hash1] = (U32)(ip1 - base);
259
260	goto _offset;
261	}
262
263	/ lookup ip[1] /
264	idx = hashTable[hash1];
265
266	/ hash ip[2] /
267	hash0 = hash1;
268	hash1 = ZSTD_hashPtr(ip2, hlog, mls);
269
270	/ advance to next positions /
271	ip0 = ip1;
272	ip1 = ip2;
273	ip2 = ip3;
274
275	/ write back hash table entry /
276	current0 = (U32)(ip0 - base);
277	hashTable[hash0] = current0;
278
279	/ load match for ip[0] /
280	if (idx >= prefixStartIndex) {
281	mval = MEM_read32(base + idx);
282	} else {
283	mval = MEM_read32(ip0) ^ `1`; / guaranteed to not match. /
284	}
285
286	/ check match at ip[0] /
287	if (MEM_read32(ip0) == mval) {
288	/ found a match! /
289
290	/ first write next hash table entry; we've already calculated it /
291	if (step <= `4`) {
292	/ We need to avoid writing an index into the hash table >= the*
293	* position at which we will pick up our searching after we've
294	* taken this match.
295	*
296	* The minimum possible match has length 4, so the earliest ip0
297	* can be after we take this match will be the current ip0 + 4.
298	* ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely
299	* write this position.
300	*/
301	hashTable[hash1] = (U32)(ip1 - base);
302	}
303
304	goto _offset;
305	}
306
307	/ lookup ip[1] /
308	idx = hashTable[hash1];
309
310	/ hash ip[2] /
311	hash0 = hash1;
312	hash1 = ZSTD_hashPtr(ip2, hlog, mls);
313
314	/ advance to next positions /
315	ip0 = ip1;
316	ip1 = ip2;
317	ip2 = ip0 + step;
318	ip3 = ip1 + step;
319
320	/ calculate step /
321	if (ip2 >= nextStep) {
322	step++;
323	PREFETCH_L1(ip1 + `64`);
324	PREFETCH_L1(ip1 + `128`);
325	nextStep += kStepIncr;
326	}
327	} while (ip3 < ilimit);
328
329	_cleanup:
330	/ Note that there are probably still a couple positions we could search.*
331	* However, it seems to be a meaningful performance hit to try to search
332	* them. So let's not. */
333
334	/ When the repcodes are outside of the prefix, we set them to zero before the loop.*
335	* When the offsets are still zero, we need to restore them after the block to have a correct
336	* repcode history. If only one offset was invalid, it is easy. The tricky case is when both
337	* offsets were invalid. We need to figure out which offset to refill with.
338	* - If both offsets are zero they are in the same order.
339	* - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`.
340	* - If only one is zero, we need to decide which offset to restore.
341	* - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1.
342	* - It is impossible for rep_offset2 to be non-zero.
343	*
344	* So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then
345	* set rep[0] = rep_offset1 and rep[1] = offsetSaved1.
346	*/
347	offsetSaved2 = ((offsetSaved1 != `0`) && (rep_offset1 != `0`)) ? offsetSaved1 : offsetSaved2;
348
349	/ save reps for next block /
350	rep[`0`] = rep_offset1 ? rep_offset1 : offsetSaved1;
351	rep[`1`] = rep_offset2 ? rep_offset2 : offsetSaved2;
352
353	/ Return the last literals size /
354	return (size_t)(iend - anchor);
355
356	_offset: / Requires: ip0, idx /
357
358	/ Compute the offset code. /
359	match0 = base + idx;
360	rep_offset2 = rep_offset1;
361	rep_offset1 = (U32)(ip0-match0);
362	offcode = OFFSET_TO_OFFBASE(rep_offset1);
363	mLength = `4`;
364
365	/ Count the backwards match length. /
366	while (((ip0>anchor) & (match0>prefixStart)) && (ip0[-`1`] == match0[-`1`])) {
367	ip0--;
368	match0--;
369	mLength++;
370	}
371
372	_match: / Requires: ip0, match0, offcode /
373
374	/ Count the forward length. /
375	mLength += ZSTD_count(ip0 + mLength, match0 + mLength, iend);
376
377	ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
378
379	ip0 += mLength;
380	anchor = ip0;
381
382	/ Fill table and check for immediate repcode. /
383	if (ip0 <= ilimit) {
384	/ Fill Table /
385	assert(base+current0+`2` > istart); / check base overflow /
386	hashTable[ZSTD_hashPtr(base+current0+`2`, hlog, mls)] = current0+`2`; / here because current+2 could be > iend-8 /
387	hashTable[ZSTD_hashPtr(ip0-`2`, hlog, mls)] = (U32)(ip0-`2`-base);
388
389	if (rep_offset2 > `0`) { / rep_offset2==0 means rep_offset2 is invalidated /
390	while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - rep_offset2)) ) {
391	/ store sequence /
392	size_t const rLength = ZSTD_count(ip0+`4`, ip0+`4`-rep_offset2, iend) + `4`;
393	{ U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } / swap rep_offset2 <=> rep_offset1 /
394	hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
395	ip0 += rLength;
396	ZSTD_storeSeq(seqStore, `0` /litLen/, anchor, iend, REPCODE1_TO_OFFBASE, rLength);
397	anchor = ip0;
398	continue; / faster when present (confirmed on gcc-8) ... (?) /
399	} } }
400
401	goto _start;
402	}
403
404	#define ZSTD_GEN_FAST_FN(dictMode, mls, step) \
405	static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( \
406	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \
407	void const* src, size_t srcSize) \
408	{ \
409	return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \
410	}
411
412	ZSTD_GEN_FAST_FN(noDict, `4`, `1`)
413	ZSTD_GEN_FAST_FN(noDict, `5`, `1`)
414	ZSTD_GEN_FAST_FN(noDict, `6`, `1`)
415	ZSTD_GEN_FAST_FN(noDict, `7`, `1`)
416
417	ZSTD_GEN_FAST_FN(noDict, `4`, `0`)
418	ZSTD_GEN_FAST_FN(noDict, `5`, `0`)
419	ZSTD_GEN_FAST_FN(noDict, `6`, `0`)
420	ZSTD_GEN_FAST_FN(noDict, `7`, `0`)
421
422	size_t ZSTD_compressBlock_fast(
423	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
424	void const* src, size_t srcSize)
425	{
426	U32 const mls = ms->cParams.minMatch;
427	assert(ms->dictMatchState == NULL);
428	if (ms->cParams.targetLength > `1`) {
429	switch(mls)
430	{
431	default: / includes case 3 /
432	case `4` :
433	return ZSTD_compressBlock_fast_noDict_4_1(ms, seqStore, rep, src, srcSize);
434	case `5` :
435	return ZSTD_compressBlock_fast_noDict_5_1(ms, seqStore, rep, src, srcSize);
436	case `6` :
437	return ZSTD_compressBlock_fast_noDict_6_1(ms, seqStore, rep, src, srcSize);
438	case `7` :
439	return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize);
440	}
441	} else {
442	switch(mls)
443	{
444	default: / includes case 3 /
445	case `4` :
446	return ZSTD_compressBlock_fast_noDict_4_0(ms, seqStore, rep, src, srcSize);
447	case `5` :
448	return ZSTD_compressBlock_fast_noDict_5_0(ms, seqStore, rep, src, srcSize);
449	case `6` :
450	return ZSTD_compressBlock_fast_noDict_6_0(ms, seqStore, rep, src, srcSize);
451	case `7` :
452	return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize);
453	}
454
455	}
456	}
457
458	FORCE_INLINE_TEMPLATE
459	size_t ZSTD_compressBlock_fast_dictMatchState_generic(
460	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
461	void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
462	{
463	const ZSTD_compressionParameters* const cParams = &ms->cParams;
464	U32* const hashTable = ms->hashTable;
465	U32 const hlog = cParams->hashLog;
466	/ support stepSize of 0 /
467	U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
468	const BYTE* const base = ms->window.base;
469	const BYTE* const istart = (const BYTE*)src;
470	const BYTE* ip0 = istart;
471	const BYTE* ip1 = ip0 + stepSize; / we assert below that stepSize >= 1 /
472	const BYTE* anchor = istart;
473	const U32 prefixStartIndex = ms->window.dictLimit;
474	const BYTE* const prefixStart = base + prefixStartIndex;
475	const BYTE* const iend = istart + srcSize;
476	const BYTE* const ilimit = iend - HASH_READ_SIZE;
477	U32 offset_1=rep[`0`], offset_2=rep[`1`];
478
479	const ZSTD_matchState_t* const dms = ms->dictMatchState;
480	const ZSTD_compressionParameters* const dictCParams = &dms->cParams ;
481	const U32* const dictHashTable = dms->hashTable;
482	const U32 dictStartIndex = dms->window.dictLimit;
483	const BYTE* const dictBase = dms->window.base;
484	const BYTE* const dictStart = dictBase + dictStartIndex;
485	const BYTE* const dictEnd = dms->window.nextSrc;
486	const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase);
487	const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart);
488	const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS;
489
490	/ if a dictionary is still attached, it necessarily means that*
491	* it is within window size. So we just check it. */
492	const U32 maxDistance = `1U` << cParams->windowLog;
493	const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
494	assert(endIndex - prefixStartIndex <= maxDistance);
495	(void)maxDistance; (void)endIndex; / these variables are not used when assert() is disabled /
496
497	(void)hasStep; / not currently specialized on whether it's accelerated /
498
499	/ ensure there will be no underflow*
500	* when translating a dict index into a local index */
501	assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
502
503	if (ms->prefetchCDictTables) {
504	size_t const hashTableBytes = (((size_t)`1`) << dictCParams->hashLog) * sizeof(U32);
505	PREFETCH_AREA(dictHashTable, hashTableBytes)
506	}
507
508	/ init /
509	DEBUGLOG(`5`, "ZSTD_compressBlock_fast_dictMatchState_generic");
510	ip0 += (dictAndPrefixLength == `0`);
511	/ dictMatchState repCode checks don't currently handle repCode == 0*
512	* disabling. */
513	assert(offset_1 <= dictAndPrefixLength);
514	assert(offset_2 <= dictAndPrefixLength);
515
516	/ Outer search loop /
517	assert(stepSize >= `1`);
518	while (ip1 <= ilimit) { / repcode check at (ip0 + 1) is safe because ip0 < ip1 /
519	size_t mLength;
520	size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls);
521
522	size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls);
523	U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS];
524	int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0);
525
526	U32 matchIndex = hashTable[hash0];
527	U32 curr = (U32)(ip0 - base);
528	size_t step = stepSize;
529	const size_t kStepIncr = `1` << kSearchStrength;
530	const BYTE* nextStep = ip0 + kStepIncr;
531
532	/ Inner search loop /
533	while (`1`) {
534	const BYTE* match = base + matchIndex;
535	const U32 repIndex = curr + `1` - offset_1;
536	const BYTE* repMatch = (repIndex < prefixStartIndex) ?
537	dictBase + (repIndex - dictIndexDelta) :
538	base + repIndex;
539	const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls);
540	size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls);
541	hashTable[hash0] = curr; / update hash table /
542
543	if (((U32) ((prefixStartIndex - `1`) - repIndex) >=
544	`3`) / intentional underflow : ensure repIndex isn't overlapping dict + prefix /
545	&& (MEM_read32(repMatch) == MEM_read32(ip0 + `1`))) {
546	const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
547	mLength = ZSTD_count_2segments(ip0 + `1` + `4`, repMatch + `4`, iend, repMatchEnd, prefixStart) + `4`;
548	ip0++;
549	ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength);
550	break;
551	}
552
553	if (dictTagsMatch) {
554	/ Found a possible dict match /
555	const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS;
556	const BYTE* dictMatch = dictBase + dictMatchIndex;
557	if (dictMatchIndex > dictStartIndex &&
558	MEM_read32(dictMatch) == MEM_read32(ip0)) {
559	/ To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid /
560	if (matchIndex <= prefixStartIndex) {
561	U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta);
562	mLength = ZSTD_count_2segments(ip0 + `4`, dictMatch + `4`, iend, dictEnd, prefixStart) + `4`;
563	while (((ip0 > anchor) & (dictMatch > dictStart))
564	&& (ip0[-`1`] == dictMatch[-`1`])) {
565	ip0--;
566	dictMatch--;
567	mLength++;
568	} / catch up /
569	offset_2 = offset_1;
570	offset_1 = offset;
571	ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
572	break;
573	}
574	}
575	}
576
577	if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) {
578	/ found a regular match /
579	U32 const offset = (U32) (ip0 - match);
580	mLength = ZSTD_count(ip0 + `4`, match + `4`, iend) + `4`;
581	while (((ip0 > anchor) & (match > prefixStart))
582	&& (ip0[-`1`] == match[-`1`])) {
583	ip0--;
584	match--;
585	mLength++;
586	} / catch up /
587	offset_2 = offset_1;
588	offset_1 = offset;
589	ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength);
590	break;
591	}
592
593	/ Prepare for next iteration /
594	dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS];
595	dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1);
596	matchIndex = hashTable[hash1];
597
598	if (ip1 >= nextStep) {
599	step++;
600	nextStep += kStepIncr;
601	}
602	ip0 = ip1;
603	ip1 = ip1 + step;
604	if (ip1 > ilimit) goto _cleanup;
605
606	curr = (U32)(ip0 - base);
607	hash0 = hash1;
608	} / end inner search loop /
609
610	/ match found /
611	assert(mLength);
612	ip0 += mLength;
613	anchor = ip0;
614
615	if (ip0 <= ilimit) {
616	/ Fill Table /
617	assert(base+curr+`2` > istart); / check base overflow /
618	hashTable[ZSTD_hashPtr(base+curr+`2`, hlog, mls)] = curr+`2`; / here because curr+2 could be > iend-8 /
619	hashTable[ZSTD_hashPtr(ip0-`2`, hlog, mls)] = (U32)(ip0-`2`-base);
620
621	/ check immediate repcode /
622	while (ip0 <= ilimit) {
623	U32 const current2 = (U32)(ip0-base);
624	U32 const repIndex2 = current2 - offset_2;
625	const BYTE* repMatch2 = repIndex2 < prefixStartIndex ?
626	dictBase - dictIndexDelta + repIndex2 :
627	base + repIndex2;
628	if ( ((U32)((prefixStartIndex-`1`) - (U32)repIndex2) >= `3` / intentional overflow /)
629	&& (MEM_read32(repMatch2) == MEM_read32(ip0))) {
630	const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
631	size_t const repLength2 = ZSTD_count_2segments(ip0+`4`, repMatch2+`4`, iend, repEnd2, prefixStart) + `4`;
632	U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; / swap offset_2 <=> offset_1 /
633	ZSTD_storeSeq(seqStore, `0`, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
634	hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2;
635	ip0 += repLength2;
636	anchor = ip0;
637	continue;
638	}
639	break;
640	}
641	}
642
643	/ Prepare for next iteration /
644	assert(ip0 == anchor);
645	ip1 = ip0 + stepSize;
646	}
647
648	_cleanup:
649	/ save reps for next block /
650	rep[`0`] = offset_1;
651	rep[`1`] = offset_2;
652
653	/ Return the last literals size /
654	return (size_t)(iend - anchor);
655	}
656
657
658	ZSTD_GEN_FAST_FN(dictMatchState, `4`, `0`)
659	ZSTD_GEN_FAST_FN(dictMatchState, `5`, `0`)
660	ZSTD_GEN_FAST_FN(dictMatchState, `6`, `0`)
661	ZSTD_GEN_FAST_FN(dictMatchState, `7`, `0`)
662
663	size_t ZSTD_compressBlock_fast_dictMatchState(
664	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
665	void const* src, size_t srcSize)
666	{
667	U32 const mls = ms->cParams.minMatch;
668	assert(ms->dictMatchState != NULL);
669	switch(mls)
670	{
671	default: / includes case 3 /
672	case `4` :
673	return ZSTD_compressBlock_fast_dictMatchState_4_0(ms, seqStore, rep, src, srcSize);
674	case `5` :
675	return ZSTD_compressBlock_fast_dictMatchState_5_0(ms, seqStore, rep, src, srcSize);
676	case `6` :
677	return ZSTD_compressBlock_fast_dictMatchState_6_0(ms, seqStore, rep, src, srcSize);
678	case `7` :
679	return ZSTD_compressBlock_fast_dictMatchState_7_0(ms, seqStore, rep, src, srcSize);
680	}
681	}
682
683
684	static size_t ZSTD_compressBlock_fast_extDict_generic(
685	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
686	void const* src, size_t srcSize, U32 const mls, U32 const hasStep)
687	{
688	const ZSTD_compressionParameters* const cParams = &ms->cParams;
689	U32* const hashTable = ms->hashTable;
690	U32 const hlog = cParams->hashLog;
691	/ support stepSize of 0 /
692	size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + `1`;
693	const BYTE* const base = ms->window.base;
694	const BYTE* const dictBase = ms->window.dictBase;
695	const BYTE* const istart = (const BYTE*)src;
696	const BYTE* anchor = istart;
697	const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
698	const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
699	const U32 dictStartIndex = lowLimit;
700	const BYTE* const dictStart = dictBase + dictStartIndex;
701	const U32 dictLimit = ms->window.dictLimit;
702	const U32 prefixStartIndex = dictLimit < lowLimit ? lowLimit : dictLimit;
703	const BYTE* const prefixStart = base + prefixStartIndex;
704	const BYTE* const dictEnd = dictBase + prefixStartIndex;
705	const BYTE* const iend = istart + srcSize;
706	const BYTE* const ilimit = iend - `8`;
707	U32 offset_1=rep[`0`], offset_2=rep[`1`];
708	U32 offsetSaved1 = `0`, offsetSaved2 = `0`;
709
710	const BYTE* ip0 = istart;
711	const BYTE* ip1;
712	const BYTE* ip2;
713	const BYTE* ip3;
714	U32 current0;
715
716
717	size_t hash0; / hash for ip0 /
718	size_t hash1; / hash for ip1 /
719	U32 idx; / match idx for ip0 /
720	const BYTE* idxBase; / base pointer for idx /
721
722	U32 offcode;
723	const BYTE* match0;
724	size_t mLength;
725	const BYTE* matchEnd = `0`; / initialize to avoid warning, assert != 0 later /
726
727	size_t step;
728	const BYTE* nextStep;
729	const size_t kStepIncr = (`1` << (kSearchStrength - `1`));
730
731	(void)hasStep; / not currently specialized on whether it's accelerated /
732
733	DEBUGLOG(`5`, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);
734
735	/ switch to "regular" variant if extDict is invalidated due to maxDistance /
736	if (prefixStartIndex == dictStartIndex)
737	return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize);
738
739	{ U32 const curr = (U32)(ip0 - base);
740	U32 const maxRep = curr - dictStartIndex;
741	if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = `0`;
742	if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = `0`;
743	}
744
745	/ start each op /
746	_start: / Requires: ip0 /
747
748	step = stepSize;
749	nextStep = ip0 + kStepIncr;
750
751	/ calculate positions, ip0 - anchor == 0, so we skip step calc /
752	ip1 = ip0 + `1`;
753	ip2 = ip0 + step;
754	ip3 = ip2 + `1`;
755
756	if (ip3 >= ilimit) {
757	goto _cleanup;
758	}
759
760	hash0 = ZSTD_hashPtr(ip0, hlog, mls);
761	hash1 = ZSTD_hashPtr(ip1, hlog, mls);
762
763	idx = hashTable[hash0];
764	idxBase = idx < prefixStartIndex ? dictBase : base;
765
766	do {
767	{ / load repcode match for ip[2] /
768	U32 const current2 = (U32)(ip2 - base);
769	U32 const repIndex = current2 - offset_1;
770	const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
771	U32 rval;
772	if ( ((U32)(prefixStartIndex - repIndex) >= `4`) / intentional underflow /
773	& (offset_1 > `0`) ) {
774	rval = MEM_read32(repBase + repIndex);
775	} else {
776	rval = MEM_read32(ip2) ^ `1`; / guaranteed to not match. /
777	}
778
779	/ write back hash table entry /
780	current0 = (U32)(ip0 - base);
781	hashTable[hash0] = current0;
782
783	/ check repcode at ip[2] /
784	if (MEM_read32(ip2) == rval) {
785	ip0 = ip2;
786	match0 = repBase + repIndex;
787	matchEnd = repIndex < prefixStartIndex ? dictEnd : iend;
788	assert((match0 != prefixStart) & (match0 != dictStart));
789	mLength = ip0[-`1`] == match0[-`1`];
790	ip0 -= mLength;
791	match0 -= mLength;
792	offcode = REPCODE1_TO_OFFBASE;
793	mLength += `4`;
794	goto _match;
795	} }
796
797	{ / load match for ip[0] /
798	U32 const mval = idx >= dictStartIndex ?
799	MEM_read32(idxBase + idx) :
800	MEM_read32(ip0) ^ `1`; / guaranteed not to match /
801
802	/ check match at ip[0] /
803	if (MEM_read32(ip0) == mval) {
804	/ found a match! /
805	goto _offset;
806	} }
807
808	/ lookup ip[1] /
809	idx = hashTable[hash1];
810	idxBase = idx < prefixStartIndex ? dictBase : base;
811
812	/ hash ip[2] /
813	hash0 = hash1;
814	hash1 = ZSTD_hashPtr(ip2, hlog, mls);
815
816	/ advance to next positions /
817	ip0 = ip1;
818	ip1 = ip2;
819	ip2 = ip3;
820
821	/ write back hash table entry /
822	current0 = (U32)(ip0 - base);
823	hashTable[hash0] = current0;
824
825	{ / load match for ip[0] /
826	U32 const mval = idx >= dictStartIndex ?
827	MEM_read32(idxBase + idx) :
828	MEM_read32(ip0) ^ `1`; / guaranteed not to match /
829
830	/ check match at ip[0] /
831	if (MEM_read32(ip0) == mval) {
832	/ found a match! /
833	goto _offset;
834	} }
835
836	/ lookup ip[1] /
837	idx = hashTable[hash1];
838	idxBase = idx < prefixStartIndex ? dictBase : base;
839
840	/ hash ip[2] /
841	hash0 = hash1;
842	hash1 = ZSTD_hashPtr(ip2, hlog, mls);
843
844	/ advance to next positions /
845	ip0 = ip1;
846	ip1 = ip2;
847	ip2 = ip0 + step;
848	ip3 = ip1 + step;
849
850	/ calculate step /
851	if (ip2 >= nextStep) {
852	step++;
853	PREFETCH_L1(ip1 + `64`);
854	PREFETCH_L1(ip1 + `128`);
855	nextStep += kStepIncr;
856	}
857	} while (ip3 < ilimit);
858
859	_cleanup:
860	/ Note that there are probably still a couple positions we could search.*
861	* However, it seems to be a meaningful performance hit to try to search
862	* them. So let's not. */
863
864	/ If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0),*
865	* rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */
866	offsetSaved2 = ((offsetSaved1 != `0`) && (offset_1 != `0`)) ? offsetSaved1 : offsetSaved2;
867
868	/ save reps for next block /
869	rep[`0`] = offset_1 ? offset_1 : offsetSaved1;
870	rep[`1`] = offset_2 ? offset_2 : offsetSaved2;
871
872	/ Return the last literals size /
873	return (size_t)(iend - anchor);
874
875	_offset: / Requires: ip0, idx, idxBase /
876
877	/ Compute the offset code. /
878	{ U32 const offset = current0 - idx;
879	const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart;
880	matchEnd = idx < prefixStartIndex ? dictEnd : iend;
881	match0 = idxBase + idx;
882	offset_2 = offset_1;
883	offset_1 = offset;
884	offcode = OFFSET_TO_OFFBASE(offset);
885	mLength = `4`;
886
887	/ Count the backwards match length. /
888	while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-`1`] == match0[-`1`])) {
889	ip0--;
890	match0--;
891	mLength++;
892	} }
893
894	_match: / Requires: ip0, match0, offcode, matchEnd /
895
896	/ Count the forward length. /
897	assert(matchEnd != `0`);
898	mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart);
899
900	ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength);
901
902	ip0 += mLength;
903	anchor = ip0;
904
905	/ write next hash table entry /
906	if (ip1 < ip0) {
907	hashTable[hash1] = (U32)(ip1 - base);
908	}
909
910	/ Fill table and check for immediate repcode. /
911	if (ip0 <= ilimit) {
912	/ Fill Table /
913	assert(base+current0+`2` > istart); / check base overflow /
914	hashTable[ZSTD_hashPtr(base+current0+`2`, hlog, mls)] = current0+`2`; / here because current+2 could be > iend-8 /
915	hashTable[ZSTD_hashPtr(ip0-`2`, hlog, mls)] = (U32)(ip0-`2`-base);
916
917	while (ip0 <= ilimit) {
918	U32 const repIndex2 = (U32)(ip0-base) - offset_2;
919	const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2;
920	if ( (((U32)((prefixStartIndex-`1`) - repIndex2) >= `3`) & (offset_2 > `0`)) / intentional underflow /
921	&& (MEM_read32(repMatch2) == MEM_read32(ip0)) ) {
922	const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend;
923	size_t const repLength2 = ZSTD_count_2segments(ip0+`4`, repMatch2+`4`, iend, repEnd2, prefixStart) + `4`;
924	{ U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } / swap offset_2 <=> offset_1 /
925	ZSTD_storeSeq(seqStore, `0` /litlen/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2);
926	hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base);
927	ip0 += repLength2;
928	anchor = ip0;
929	continue;
930	}
931	break;
932	} }
933
934	goto _start;
935	}
936
937	ZSTD_GEN_FAST_FN(extDict, `4`, `0`)
938	ZSTD_GEN_FAST_FN(extDict, `5`, `0`)
939	ZSTD_GEN_FAST_FN(extDict, `6`, `0`)
940	ZSTD_GEN_FAST_FN(extDict, `7`, `0`)
941
942	size_t ZSTD_compressBlock_fast_extDict(
943	ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
944	void const* src, size_t srcSize)
945	{
946	U32 const mls = ms->cParams.minMatch;
947	assert(ms->dictMatchState == NULL);
948	switch(mls)
949	{
950	default: / includes case 3 /
951	case `4` :
952	return ZSTD_compressBlock_fast_extDict_4_0(ms, seqStore, rep, src, srcSize);
953	case `5` :
954	return ZSTD_compressBlock_fast_extDict_5_0(ms, seqStore, rep, src, srcSize);
955	case `6` :
956	return ZSTD_compressBlock_fast_extDict_6_0(ms, seqStore, rep, src, srcSize);
957	case `7` :
958	return ZSTD_compressBlock_fast_extDict_7_0(ms, seqStore, rep, src, srcSize);
959	}
960	}
961

Browse the source code of Godot/thirdparty/zstd/compress/zstd_fast.c