/*
 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 */

#include "zstd_ldm.h"

#include "zstd_fast.h"          /* ZSTD_fillHashTable() */
#include "zstd_double_fast.h"   /* ZSTD_fillDoubleHashTable() */

#define LDM_BUCKET_SIZE_LOG 3
#define LDM_MIN_MATCH_LENGTH 64
#define LDM_HASH_RLOG 7
#define LDM_HASH_CHAR_OFFSET 10
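
/* Defaults applied by ZSTD_ldm_adjustParameters() when a field is left at 0:
 * - LDM_BUCKET_SIZE_LOG :  each hash bucket holds 2^3 = 8 entries.
 * - LDM_MIN_MATCH_LENGTH : only matches of at least 64 bytes are searched.
 * - LDM_HASH_RLOG :        default hashLog is windowLog - 7.
 * - LDM_HASH_CHAR_OFFSET : constant added to every byte before hashing,
 *                          presumably so that runs of zero bytes still
 *                          contribute to the rolling hash. */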

void ZSTD_ldm_adjustParameters(ldmParams_t* params,
                               ZSTD_compressionParameters const* cParams)
{
    U32 const windowLog = cParams->windowLog;
    ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX);
    DEBUGLOG(4, "ZSTD_ldm_adjustParameters");
    if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG;
    if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH;
    if (cParams->strategy >= ZSTD_btopt) {
        /* Get out of the way of the optimal parser */
        U32 const minMatch = MAX(cParams->targetLength, params->minMatchLength);
        assert(minMatch >= ZSTD_LDM_MINMATCH_MIN);
        assert(minMatch <= ZSTD_LDM_MINMATCH_MAX);
        params->minMatchLength = minMatch;
    }
    if (params->hashLog == 0) {
        params->hashLog = MAX(ZSTD_HASHLOG_MIN, windowLog - LDM_HASH_RLOG);
        assert(params->hashLog <= ZSTD_HASHLOG_MAX);
    }
    if (params->hashEveryLog == 0) {
        params->hashEveryLog =
            windowLog < params->hashLog ? 0 : windowLog - params->hashLog;
    }
    params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog);
}

size_t ZSTD_ldm_getTableSize(ldmParams_t params)
{
    size_t const ldmHSize = ((size_t)1) << params.hashLog;
    size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog);
    size_t const ldmBucketSize =
        ((size_t)1) << (params.hashLog - ldmBucketSizeLog);
    size_t const totalSize = ldmBucketSize + ldmHSize * sizeof(ldmEntry_t);
    return params.enableLdm ? totalSize : 0;
}

size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize)
{
    return params.enableLdm ? (maxChunkSize / params.minMatchLength) : 0;
}

/** ZSTD_ldm_getSmallHash() :
 *  numBits should be <= 32
 *  If numBits==0, returns 0.
 *  @return : the most significant numBits of value. */
static U32 ZSTD_ldm_getSmallHash(U64 value, U32 numBits)
{
    assert(numBits <= 32);
    return numBits == 0 ? 0 : (U32)(value >> (64 - numBits));
}

/** ZSTD_ldm_getChecksum() :
 *  numBitsToDiscard should be <= 32
 *  @return : the next most significant 32 bits after numBitsToDiscard */
static U32 ZSTD_ldm_getChecksum(U64 hash, U32 numBitsToDiscard)
{
    assert(numBitsToDiscard <= 32);
    return (hash >> (64 - 32 - numBitsToDiscard)) & 0xFFFFFFFF;
}

/** ZSTD_ldm_getTag() :
 *  Given the hash, returns the most significant numTagBits bits
 *  after (32 + hbits) bits.
 *
 *  If there are not enough bits remaining, return the last
 *  numTagBits bits. */
static U32 ZSTD_ldm_getTag(U64 hash, U32 hbits, U32 numTagBits)
{
    assert(numTagBits < 32 && hbits <= 32);
    if (32 - hbits < numTagBits) {
        return hash & (((U32)1 << numTagBits) - 1);
    } else {
        return (hash >> (32 - hbits - numTagBits)) & (((U32)1 << numTagBits) - 1);
    }
}

/** ZSTD_ldm_getBucket() :
 *  Returns a pointer to the start of the bucket associated with hash. */
static ldmEntry_t* ZSTD_ldm_getBucket(
        ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams)
{
    return ldmState->hashTable + (hash << ldmParams.bucketSizeLog);
}

/** ZSTD_ldm_insertEntry() :
 *  Insert the entry with corresponding hash into the hash table */
static void ZSTD_ldm_insertEntry(ldmState_t* ldmState,
                                 size_t const hash, const ldmEntry_t entry,
                                 ldmParams_t const ldmParams)
{
    BYTE* const bucketOffsets = ldmState->bucketOffsets;
    *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + bucketOffsets[hash]) = entry;
    bucketOffsets[hash]++;
    bucketOffsets[hash] &= ((U32)1 << ldmParams.bucketSizeLog) - 1;
}

/** ZSTD_ldm_makeEntryAndInsertByTag() :
 *
 *  Gets the small hash, checksum, and tag from the rollingHash.
 *
 *  If the tag matches (1 << ldmParams.hashEveryLog)-1, then
 *  creates an ldmEntry from the offset, and inserts it into the hash table.
 *
 *  hBits is the length of the small hash, which is the most significant hBits
 *  of rollingHash. The checksum is the next 32 most significant bits, followed
 *  by ldmParams.hashEveryLog bits that make up the tag. */
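/* Layout of the 64-bit rolling hash, most significant bits first:
 *
 *   | <-- hBits --> | <------- 32 -------> | <- hashEveryLog -> | ...
 *   |  small hash   |       checksum       |        tag         | unused
 *
 * Only positions whose tag bits are all ones are inserted, so on average
 * one position in 2^hashEveryLog enters the table. */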
static void ZSTD_ldm_makeEntryAndInsertByTag(ldmState_t* ldmState,
                                             U64 const rollingHash,
                                             U32 const hBits,
                                             U32 const offset,
                                             ldmParams_t const ldmParams)
{
    U32 const tag = ZSTD_ldm_getTag(rollingHash, hBits, ldmParams.hashEveryLog);
    U32 const tagMask = ((U32)1 << ldmParams.hashEveryLog) - 1;
    if (tag == tagMask) {
        U32 const hash = ZSTD_ldm_getSmallHash(rollingHash, hBits);
        U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits);
        ldmEntry_t entry;
        entry.offset = offset;
        entry.checksum = checksum;
        ZSTD_ldm_insertEntry(ldmState, hash, entry, ldmParams);
    }
}

/** ZSTD_ldm_getRollingHash() :
 *  Get a 64-bit hash using the first len bytes from buf.
 *
 *  Given bytes s = s_1, s_2, ... s_k, the hash is defined to be
 *  H(s) = s_1*(a^(k-1)) + s_2*(a^(k-2)) + ... + s_k*(a^0)
 *
 *  where the constant a is defined to be prime8bytes.
 *
 *  The implementation adds an offset to each byte, so
 *  H(s) = (s_1 + LDM_HASH_CHAR_OFFSET)*(a^(k-1)) + ... */
static U64 ZSTD_ldm_getRollingHash(const BYTE* buf, U32 len)
{
    U64 ret = 0;
    U32 i;
    for (i = 0; i < len; i++) {
        ret *= prime8bytes;
        ret += buf[i] + LDM_HASH_CHAR_OFFSET;
    }
    return ret;
}
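
/* Note on the rolling update (a sketch of the algebra, not new behavior):
 * with c = LDM_HASH_CHAR_OFFSET and a = prime8bytes,
 *   H(s_1..s_k) = (s_1+c)*a^(k-1) + ... + (s_k+c)
 * so sliding the window one byte forward gives
 *   H(s_2..s_{k+1}) = (H(s_1..s_k) - (s_1+c)*a^(k-1)) * a + (s_{k+1}+c),
 * which is exactly what ZSTD_ldm_updateHash() computes below; a^(k-1) is
 * precomputed once as hashPower by ZSTD_ldm_getHashPower(). */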

/** ZSTD_ldm_ipow() :
 *  Return base^exp. */
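/* Square-and-multiply exponentiation; U64 multiplication wraps mod 2^64,
 * consistent with the rest of the hash arithmetic. */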
static U64 ZSTD_ldm_ipow(U64 base, U64 exp)
{
    U64 ret = 1;
    while (exp) {
        if (exp & 1) { ret *= base; }
        exp >>= 1;
        base *= base;
    }
    return ret;
}

U64 ZSTD_ldm_getHashPower(U32 minMatchLength) {
    DEBUGLOG(4, "ZSTD_ldm_getHashPower: mml=%u", minMatchLength);
    assert(minMatchLength >= ZSTD_LDM_MINMATCH_MIN);
    return ZSTD_ldm_ipow(prime8bytes, minMatchLength - 1);
}

/** ZSTD_ldm_updateHash() :
 *  Updates hash by removing toRemove and adding toAdd. */
static U64 ZSTD_ldm_updateHash(U64 hash, BYTE toRemove, BYTE toAdd, U64 hashPower)
{
    hash -= ((toRemove + LDM_HASH_CHAR_OFFSET) * hashPower);
    hash *= prime8bytes;
    hash += toAdd + LDM_HASH_CHAR_OFFSET;
    return hash;
}

/** ZSTD_ldm_countBackwardsMatch() :
 *  Returns the number of bytes that match backwards before pIn and pMatch.
 *
 *  We count only bytes where pMatch >= pBase and pIn >= pAnchor. */
static size_t ZSTD_ldm_countBackwardsMatch(
            const BYTE* pIn, const BYTE* pAnchor,
            const BYTE* pMatch, const BYTE* pBase)
{
    size_t matchLength = 0;
    while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) {
        pIn--;
        pMatch--;
        matchLength++;
    }
    return matchLength;
}

/** ZSTD_ldm_fillFastTables() :
 *
 *  Fills the relevant tables for the ZSTD_fast and ZSTD_dfast strategies.
 *  This is similar to ZSTD_loadDictionaryContent.
 *
 *  The tables for the other strategies are filled within their
 *  block compressors. */
static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
                                      ZSTD_compressionParameters const* cParams,
                                      void const* end)
{
    const BYTE* const iend = (const BYTE*)end;

    switch(cParams->strategy)
    {
    case ZSTD_fast:
        ZSTD_fillHashTable(ms, cParams, iend);
        ms->nextToUpdate = (U32)(iend - ms->window.base);
        break;

    case ZSTD_dfast:
        ZSTD_fillDoubleHashTable(ms, cParams, iend);
        ms->nextToUpdate = (U32)(iend - ms->window.base);
        break;

    case ZSTD_greedy:
    case ZSTD_lazy:
    case ZSTD_lazy2:
    case ZSTD_btlazy2:
    case ZSTD_btopt:
    case ZSTD_btultra:
        break;
    default:
        assert(0);  /* not possible : not a valid strategy id */
    }

    return 0;
}

/** ZSTD_ldm_fillLdmHashTable() :
 *
 *  Fills hashTable from (lastHashed + 1) to iend (non-inclusive).
 *  lastHash is the rolling hash that corresponds to lastHashed.
 *
 *  Returns the rolling hash corresponding to position iend-1. */
static U64 ZSTD_ldm_fillLdmHashTable(ldmState_t* state,
                                     U64 lastHash, const BYTE* lastHashed,
                                     const BYTE* iend, const BYTE* base,
                                     U32 hBits, ldmParams_t const ldmParams)
{
    U64 rollingHash = lastHash;
    const BYTE* cur = lastHashed + 1;

    while (cur < iend) {
        rollingHash = ZSTD_ldm_updateHash(rollingHash, cur[-1],
                                          cur[ldmParams.minMatchLength-1],
                                          state->hashPower);
        ZSTD_ldm_makeEntryAndInsertByTag(state,
                                         rollingHash, hBits,
                                         (U32)(cur - base), ldmParams);
        ++cur;
    }
    return rollingHash;
}

/** ZSTD_ldm_limitTableUpdate() :
 *
 *  Moves ms->nextToUpdate closer to anchor if it has fallen far behind
 *  (after a long match, only update the tables a limited amount). */
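/* Illustrative example: if anchor sits 2048 bytes past nextToUpdate, the gap
 * exceeds 1024, so nextToUpdate jumps to current - MIN(512, 2048 - 1024) =
 * current - 512; at most 512 positions are then (re)indexed. */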
static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor)
{
    U32 const current = (U32)(anchor - ms->window.base);
    if (current > ms->nextToUpdate + 1024) {
        ms->nextToUpdate =
            current - MIN(512, current - ms->nextToUpdate - 1024);
    }
}

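/* ZSTD_ldm_generateSequences_internal() :
 * Core LDM search over one chunk. A rolling hash of the next minMatchLength
 * bytes is maintained at every position; positions whose hash tag is not
 * all-ones are skipped, so roughly 1 in 2^hashEveryLog positions participate.
 * Candidates are screened by a 32-bit checksum, extended forwards and
 * backwards, and the longest total match in the bucket wins; found sequences
 * are appended to rawSeqStore.
 * @return the number of trailing bytes (iend - anchor) left unmatched,
 * or an error code if the sequence store runs out of room. */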
static size_t ZSTD_ldm_generateSequences_internal(
        ldmState_t* ldmState, rawSeqStore_t* rawSeqStore,
        ldmParams_t const* params, void const* src, size_t srcSize)
{
    /* LDM parameters */
    int const extDict = ZSTD_window_hasExtDict(ldmState->window);
    U32 const minMatchLength = params->minMatchLength;
    U64 const hashPower = ldmState->hashPower;
    U32 const hBits = params->hashLog - params->bucketSizeLog;
    U32 const ldmBucketSize = 1U << params->bucketSizeLog;
    U32 const hashEveryLog = params->hashEveryLog;
    U32 const ldmTagMask = (1U << params->hashEveryLog) - 1;
    /* Prefix and extDict parameters */
    U32 const dictLimit = ldmState->window.dictLimit;
    U32 const lowestIndex = extDict ? ldmState->window.lowLimit : dictLimit;
    BYTE const* const base = ldmState->window.base;
    BYTE const* const dictBase = extDict ? ldmState->window.dictBase : NULL;
    BYTE const* const dictStart = extDict ? dictBase + lowestIndex : NULL;
    BYTE const* const dictEnd = extDict ? dictBase + dictLimit : NULL;
    BYTE const* const lowPrefixPtr = base + dictLimit;
    /* Input bounds */
    BYTE const* const istart = (BYTE const*)src;
    BYTE const* const iend = istart + srcSize;
    BYTE const* const ilimit = iend - MAX(minMatchLength, HASH_READ_SIZE);
    /* Input positions */
    BYTE const* anchor = istart;
    BYTE const* ip = istart;
    /* Rolling hash */
    BYTE const* lastHashed = NULL;
    U64 rollingHash = 0;

    while (ip <= ilimit) {
        size_t mLength;
        U32 const current = (U32)(ip - base);
        size_t forwardMatchLength = 0, backwardMatchLength = 0;
        ldmEntry_t* bestEntry = NULL;
        if (ip != istart) {
            rollingHash = ZSTD_ldm_updateHash(rollingHash, lastHashed[0],
                                              lastHashed[minMatchLength],
                                              hashPower);
        } else {
            rollingHash = ZSTD_ldm_getRollingHash(ip, minMatchLength);
        }
        lastHashed = ip;

        /* Do not insert and do not look for a match */
        if (ZSTD_ldm_getTag(rollingHash, hBits, hashEveryLog) != ldmTagMask) {
            ip++;
            continue;
        }

        /* Get the best entry and compute the match lengths */
        {
            ldmEntry_t* const bucket =
                ZSTD_ldm_getBucket(ldmState,
                                   ZSTD_ldm_getSmallHash(rollingHash, hBits),
                                   *params);
            ldmEntry_t* cur;
            size_t bestMatchLength = 0;
            U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits);

            for (cur = bucket; cur < bucket + ldmBucketSize; ++cur) {
                size_t curForwardMatchLength, curBackwardMatchLength,
                       curTotalMatchLength;
                if (cur->checksum != checksum || cur->offset <= lowestIndex) {
                    continue;
                }
                if (extDict) {
                    BYTE const* const curMatchBase =
                        cur->offset < dictLimit ? dictBase : base;
                    BYTE const* const pMatch = curMatchBase + cur->offset;
                    BYTE const* const matchEnd =
                        cur->offset < dictLimit ? dictEnd : iend;
                    BYTE const* const lowMatchPtr =
                        cur->offset < dictLimit ? dictStart : lowPrefixPtr;

                    curForwardMatchLength = ZSTD_count_2segments(
                                                ip, pMatch, iend,
                                                matchEnd, lowPrefixPtr);
                    if (curForwardMatchLength < minMatchLength) {
                        continue;
                    }
                    curBackwardMatchLength =
                        ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch,
                                                     lowMatchPtr);
                    curTotalMatchLength = curForwardMatchLength +
                                          curBackwardMatchLength;
                } else { /* !extDict */
                    BYTE const* const pMatch = base + cur->offset;
                    curForwardMatchLength = ZSTD_count(ip, pMatch, iend);
                    if (curForwardMatchLength < minMatchLength) {
                        continue;
                    }
                    curBackwardMatchLength =
                        ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch,
                                                     lowPrefixPtr);
                    curTotalMatchLength = curForwardMatchLength +
                                          curBackwardMatchLength;
                }

                if (curTotalMatchLength > bestMatchLength) {
                    bestMatchLength = curTotalMatchLength;
                    forwardMatchLength = curForwardMatchLength;
                    backwardMatchLength = curBackwardMatchLength;
                    bestEntry = cur;
                }
            }
        }

        /* No match found -- continue searching */
        if (bestEntry == NULL) {
            ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash,
                                             hBits, current,
                                             *params);
            ip++;
            continue;
        }

        /* Match found */
        mLength = forwardMatchLength + backwardMatchLength;
        ip -= backwardMatchLength;

        {
            /* Store the sequence:
             * ip = current - backwardMatchLength
             * The match is at (bestEntry->offset - backwardMatchLength)
             */
            U32 const matchIndex = bestEntry->offset;
            U32 const offset = current - matchIndex;
            rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size;

            /* Out of sequence storage */
            if (rawSeqStore->size == rawSeqStore->capacity)
                return ERROR(dstSize_tooSmall);
            seq->litLength = (U32)(ip - anchor);
            seq->matchLength = (U32)mLength;
            seq->offset = offset;
            rawSeqStore->size++;
        }

        /* Insert the current entry into the hash table */
        ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, hBits,
                                         (U32)(lastHashed - base),
                                         *params);

        assert(ip + backwardMatchLength == lastHashed);

        /* Fill the hash table from lastHashed+1 to ip+mLength */
        /* Heuristic: don't need to fill the entire table at end of block */
        if (ip + mLength <= ilimit) {
            rollingHash = ZSTD_ldm_fillLdmHashTable(
                              ldmState, rollingHash, lastHashed,
                              ip + mLength, base, hBits, *params);
            lastHashed = ip + mLength - 1;
        }
        ip += mLength;
        anchor = ip;
    }
    return iend - anchor;
}

/*! ZSTD_ldm_reduceTable() :
 *  reduce table indexes by `reducerValue` */
static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size,
                                 U32 const reducerValue)
{
    U32 u;
    for (u = 0; u < size; u++) {
        if (table[u].offset < reducerValue) table[u].offset = 0;
        else table[u].offset -= reducerValue;
    }
}

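/* ZSTD_ldm_generateSequences() :
 * Drives the search over a whole (possibly very large) input: the source is
 * split into 1 MB chunks (kMaxChunkSize) so that overflow correction and the
 * maximum-distance window can be enforced per chunk, and the leftover
 * literals of each chunk are carried into the first sequence of the next. */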
size_t ZSTD_ldm_generateSequences(
        ldmState_t* ldmState, rawSeqStore_t* sequences,
        ldmParams_t const* params, void const* src, size_t srcSize)
{
    U32 const maxDist = 1U << params->windowLog;
    BYTE const* const istart = (BYTE const*)src;
    BYTE const* const iend = istart + srcSize;
    size_t const kMaxChunkSize = 1 << 20;
    size_t const nbChunks = (srcSize / kMaxChunkSize) + ((srcSize % kMaxChunkSize) != 0);
    size_t chunk;
    size_t leftoverSize = 0;

    assert(ZSTD_CHUNKSIZE_MAX >= kMaxChunkSize);
    /* Check that ZSTD_window_update() has been called for this chunk prior
     * to passing it to this function.
     */
    assert(ldmState->window.nextSrc >= (BYTE const*)src + srcSize);
    /* The input could be very large (in zstdmt), so it must be broken up into
     * chunks to enforce the maximum distance and handle overflow correction.
     */
    assert(sequences->pos <= sequences->size);
    assert(sequences->size <= sequences->capacity);
    for (chunk = 0; chunk < nbChunks && sequences->size < sequences->capacity; ++chunk) {
        BYTE const* const chunkStart = istart + chunk * kMaxChunkSize;
        size_t const remaining = (size_t)(iend - chunkStart);
        BYTE const *const chunkEnd =
            (remaining < kMaxChunkSize) ? iend : chunkStart + kMaxChunkSize;
        size_t const chunkSize = chunkEnd - chunkStart;
        size_t newLeftoverSize;
        size_t const prevSize = sequences->size;

        assert(chunkStart < iend);
        /* 1. Perform overflow correction if necessary. */
        if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)) {
            U32 const ldmHSize = 1U << params->hashLog;
            U32 const correction = ZSTD_window_correctOverflow(
                &ldmState->window, /* cycleLog */ 0, maxDist, src);
            ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction);
        }
        /* 2. We enforce the maximum offset allowed.
         *
         * kMaxChunkSize should be small enough that we don't lose too much of
         * the window through early invalidation.
         * TODO: * Test the chunk size.
         *       * Try invalidation after the sequence generation and test the
         *         offset against maxDist directly.
         */
        ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, NULL);
        /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */
        newLeftoverSize = ZSTD_ldm_generateSequences_internal(
            ldmState, sequences, params, chunkStart, chunkSize);
        if (ZSTD_isError(newLeftoverSize))
            return newLeftoverSize;
        /* 4. We add the leftover literals from previous iterations to the first
         * newly generated sequence, or add the `newLeftoverSize` if none are
         * generated.
         */
        /* Prepend the leftover literals from the last call */
        if (prevSize < sequences->size) {
            sequences->seq[prevSize].litLength += (U32)leftoverSize;
            leftoverSize = newLeftoverSize;
        } else {
            assert(newLeftoverSize == chunkSize);
            leftoverSize += chunkSize;
        }
    }
    return 0;
}

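/* ZSTD_ldm_skipSequences() :
 * Consumes srcSize bytes from the front of the raw sequence store, trimming
 * literal and match lengths as needed. If trimming leaves a match shorter
 * than minMatch, that match is dropped and its remaining length is folded
 * into the literals of the following sequence. */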
void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) {
    while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) {
        rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos;
        if (srcSize <= seq->litLength) {
            /* Skip past srcSize literals */
            seq->litLength -= (U32)srcSize;
            return;
        }
        srcSize -= seq->litLength;
        seq->litLength = 0;
        if (srcSize < seq->matchLength) {
            /* Skip past the first srcSize of the match */
            seq->matchLength -= (U32)srcSize;
            if (seq->matchLength < minMatch) {
                /* The match is too short, omit it */
                if (rawSeqStore->pos + 1 < rawSeqStore->size) {
                    seq[1].litLength += seq[0].matchLength;
                }
                rawSeqStore->pos++;
            }
            return;
        }
        srcSize -= seq->matchLength;
        seq->matchLength = 0;
        rawSeqStore->pos++;
    }
}

/**
 * If the sequence length is longer than remaining, then the sequence is split
 * between this block and the next.
 *
 * Returns the current sequence to handle, or, if the rest of the block should
 * be literals, a sequence with offset == 0.
 */
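/* Illustrative example (hypothetical numbers): with remaining = 100,
 * litLength = 60 and matchLength = 80, the match is trimmed to
 * 100 - 60 = 40 bytes; if 40 < minMatch, offset is cleared to 0 and the
 * rest of the block is emitted as literals instead. */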
static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore,
                                 U32 const remaining, U32 const minMatch)
{
    rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos];
    assert(sequence.offset > 0);
    /* Likely: No partial sequence */
    if (remaining >= sequence.litLength + sequence.matchLength) {
        rawSeqStore->pos++;
        return sequence;
    }
    /* Cut the sequence short (offset == 0 ==> rest is literals). */
    if (remaining <= sequence.litLength) {
        sequence.offset = 0;
    } else if (remaining < sequence.litLength + sequence.matchLength) {
        sequence.matchLength = remaining - sequence.litLength;
        if (sequence.matchLength < minMatch) {
            sequence.offset = 0;
        }
    }
    /* Skip past `remaining` bytes for the future sequences. */
    ZSTD_ldm_skipSequences(rawSeqStore, remaining, minMatch);
    return sequence;
}

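/* ZSTD_ldm_blockCompress() :
 * Applies the pre-generated LDM sequences to one block: the literal stretch
 * before each long match is handed to the regular block compressor, then the
 * long match itself is stored with the repcodes shifted accordingly.
 * @return the number of bytes of last literals, as reported by the final
 * block compressor call. */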
size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore,
    ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
    ZSTD_compressionParameters const* cParams, void const* src, size_t srcSize,
    int const extDict)
{
    unsigned const minMatch = cParams->searchLength;
    ZSTD_blockCompressor const blockCompressor =
        ZSTD_selectBlockCompressor(cParams->strategy, extDict);
    BYTE const* const base = ms->window.base;
    /* Input bounds */
    BYTE const* const istart = (BYTE const*)src;
    BYTE const* const iend = istart + srcSize;
    /* Input positions */
    BYTE const* ip = istart;

    assert(rawSeqStore->pos <= rawSeqStore->size);
    assert(rawSeqStore->size <= rawSeqStore->capacity);
    /* Loop through each sequence and apply the block compressor to the lits */
    while (rawSeqStore->pos < rawSeqStore->size && ip < iend) {
        /* maybeSplitSequence updates rawSeqStore->pos */
        rawSeq const sequence = maybeSplitSequence(rawSeqStore,
                                                   (U32)(iend - ip), minMatch);
        int i;
        /* End signal */
        if (sequence.offset == 0)
            break;

        assert(sequence.offset <= (1U << cParams->windowLog));
        assert(ip + sequence.litLength + sequence.matchLength <= iend);

        /* Fill tables for block compressor */
        ZSTD_ldm_limitTableUpdate(ms, ip);
        ZSTD_ldm_fillFastTables(ms, cParams, ip);
        /* Run the block compressor */
        {
            size_t const newLitLength =
                blockCompressor(ms, seqStore, rep, cParams, ip,
                                sequence.litLength);
            ip += sequence.litLength;
            ms->nextToUpdate = (U32)(ip - base);
            /* Update the repcodes */
            for (i = ZSTD_REP_NUM - 1; i > 0; i--)
                rep[i] = rep[i-1];
            rep[0] = sequence.offset;
            /* Store the sequence */
            ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength,
                          sequence.offset + ZSTD_REP_MOVE,
                          sequence.matchLength - MINMATCH);
            ip += sequence.matchLength;
        }
    }
    /* Fill the tables for the block compressor */
    ZSTD_ldm_limitTableUpdate(ms, ip);
    ZSTD_ldm_fillFastTables(ms, cParams, ip);
    /* Compress the last literals */
    {
        size_t const lastLiterals = blockCompressor(ms, seqStore, rep, cParams,
                                                    ip, iend - ip);
        ms->nextToUpdate = (U32)(iend - base);
        return lastLiterals;
    }
}