zdict.c source code [ClickHouse/contrib/zstd/lib/dictBuilder/zdict.c]

/*
 * Copyright (c) 2016-present, Yann Collet, Facebook, Inc.
 * All rights reserved.
 *
 * This source code is licensed under both the BSD-style license (found in the
 * LICENSE file in the root directory of this source tree) and the GPLv2 (found
 * in the COPYING file in the root directory of this source tree).
 * You may select, at your option, one of the above-listed licenses.
 */
10
11
/*-**************************************
*  Tuning parameters
****************************************/
#define MINRATIO 4   /* minimum nb of apparition to be selected in dictionary */
#define ZDICT_MAX_SAMPLES_SIZE (2000U << 20)   /* cap on total sample size handed to the suffix sort (divsufsort limitation) */
#define ZDICT_MIN_SAMPLES_SIZE (ZDICT_CONTENTSIZE_MIN * MINRATIO)
18
19
/*-**************************************
*  Compiler Options
****************************************/
/* Unix Large Files support (>4GB) */
#define _FILE_OFFSET_BITS 64
#if (defined(__sun__) && (!defined(__LP64__)))   /* Sun Solaris 32-bits requires specific definitions */
#  define _LARGEFILE_SOURCE
#elif ! defined(__LP64__)                        /* No point defining Large file for 64 bit */
#  define _LARGEFILE64_SOURCE
#endif
30
31
32	/-************************************
33	* Dependencies
34	***************************************/
35	#include <stdlib.h> /* malloc, free */
36	#include <string.h> /* memset */
37	#include <stdio.h> /* fprintf, fopen, ftello64 */
38	#include <time.h> /* clock */
39
40	#include "mem.h" /* read */
41	#include "fse.h" /* FSE_normalizeCount, FSE_writeNCount */
42	#define HUF_STATIC_LINKING_ONLY
43	#include "huf.h" /* HUF_buildCTable, HUF_writeCTable */
44	#include "zstd_internal.h" /* includes zstd.h */
45	#include "xxhash.h" /* XXH64 */
46	#include "divsufsort.h"
47	#ifndef ZDICT_STATIC_LINKING_ONLY
48	# define ZDICT_STATIC_LINKING_ONLY
49	#endif
50	#include "zdict.h"
51
52
/*-*************************************
*  Constants
***************************************/
/* Postfix unit multipliers : written after a number, e.g. `128 KB`. */
#define KB *(1 <<10)
#define MB *(1 <<20)
#define GB *(1U<<30)

#define DICTLISTSIZE_DEFAULT 10000

#define NOISELENGTH 32
63
64	static const int g_compressionLevel_default = `3`;
65	static const U32 g_selectivity_default = `9`;
66
67
/*-*************************************
*  Console display
***************************************/
/* DISPLAY() : print to stderr and flush immediately. */
#define DISPLAY(...)         { fprintf(stderr, __VA_ARGS__); fflush( stderr ); }
/* DISPLAYLEVEL() : print only when the `notificationLevel` variable in scope is >= l.
 * Note : expands to a bare `if (...) { ... }`, so it must not be followed by an `else`. */
#define DISPLAYLEVEL(l, ...) if (notificationLevel>=l) { DISPLAY(__VA_ARGS__); }    /* 0 : no display;   1: errors;   2: default;  3: details;  4: debug */
73
/* ZDICT_clockSpan() :
 * @return : processor time elapsed since `nPrevious`, in clock() ticks. */
static clock_t ZDICT_clockSpan(clock_t nPrevious) { return clock() - nPrevious; }
75
76	static void ZDICT_printHex(const void* ptr, size_t length)
77	{
78	const BYTE* const b = (const BYTE*)ptr;
79	size_t u;
80	for (u=`0`; u<length; u++) {
81	BYTE c = b[u];
82	if (c<`32` \|\| c>`126`) c = `'.'`; / non-printable char /
83	DISPLAY("%c", c);
84	}
85	}
86
87
/*-********************************************************
*  Helper functions
**********************************************************/
/* ZDICT_isError() :
 * @return : non-zero when `errorCode` is an error code, as decided by ERR_isError(). */
unsigned ZDICT_isError(size_t errorCode) { return ERR_isError(errorCode); }
92
/* ZDICT_getErrorName() :
 * @return : the string supplied by ERR_getErrorName() for `errorCode`. */
const char* ZDICT_getErrorName(size_t errorCode) { return ERR_getErrorName(errorCode); }
94
95	unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize)
96	{
97	if (dictSize < `8`) return `0`;
98	if (MEM_readLE32(dictBuffer) != ZSTD_MAGIC_DICTIONARY) return `0`;
99	return MEM_readLE32((const char*)dictBuffer + `4`);
100	}
101
102
/*-********************************************************
*  Dictionary training functions
**********************************************************/
106	static unsigned ZDICT_NbCommonBytes (size_t val)
107	{
108	if (MEM_isLittleEndian()) {
109	if (MEM_64bits()) {
110	# if defined(_MSC_VER) && defined(_WIN64)
111	unsigned long r = `0`;
112	_BitScanForward64( &r, (U64)val );
113	return (unsigned)(r>>`3`);
114	# elif defined(__GNUC__) && (__GNUC__ >= 3)
115	return (__builtin_ctzll((U64)val) >> `3`);
116	# else
117	static const int DeBruijnBytePos[`64`] = { `0`, `0`, `0`, `0`, `0`, `1`, `1`, `2`, `0`, `3`, `1`, `3`, `1`, `4`, `2`, `7`, `0`, `2`, `3`, `6`, `1`, `5`, `3`, `5`, `1`, `3`, `4`, `4`, `2`, `5`, `6`, `7`, `7`, `0`, `1`, `2`, `3`, `3`, `4`, `6`, `2`, `6`, `5`, `5`, `3`, `4`, `5`, `6`, `7`, `1`, `2`, `4`, `6`, `4`, `4`, `5`, `7`, `2`, `6`, `5`, `7`, `6`, `7`, `7` };
118	return DeBruijnBytePos[((U64)((val & -(long long)val) * `0x0218A392CDABBD3FULL`)) >> `58`];
119	# endif
120	} else { / 32 bits /
121	# if defined(_MSC_VER)
122	unsigned long r=`0`;
123	_BitScanForward( &r, (U32)val );
124	return (unsigned)(r>>`3`);
125	# elif defined(__GNUC__) && (__GNUC__ >= 3)
126	return (__builtin_ctz((U32)val) >> `3`);
127	# else
128	static const int DeBruijnBytePos[`32`] = { `0`, `0`, `3`, `0`, `3`, `1`, `3`, `0`, `3`, `2`, `2`, `1`, `3`, `2`, `0`, `1`, `3`, `3`, `1`, `2`, `2`, `2`, `2`, `0`, `3`, `1`, `2`, `0`, `1`, `0`, `1`, `1` };
129	return DeBruijnBytePos[((U32)((val & -(S32)val) * `0x077CB531U`)) >> `27`];
130	# endif
131	}
132	} else { / Big Endian CPU /
133	if (MEM_64bits()) {
134	# if defined(_MSC_VER) && defined(_WIN64)
135	unsigned long r = `0`;
136	_BitScanReverse64( &r, val );
137	return (unsigned)(r>>`3`);
138	# elif defined(__GNUC__) && (__GNUC__ >= 3)
139	return (__builtin_clzll(val) >> `3`);
140	# else
141	unsigned r;
142	const unsigned n32 = sizeof(size_t)`4`; /* calculate this way due to compiler complaining in 32-bits mode /
143	if (!(val>>n32)) { r=`4`; } else { r=`0`; val>>=n32; }
144	if (!(val>>`16`)) { r+=`2`; val>>=`8`; } else { val>>=`24`; }
145	r += (!val);
146	return r;
147	# endif
148	} else { / 32 bits /
149	# if defined(_MSC_VER)
150	unsigned long r = `0`;
151	_BitScanReverse( &r, (unsigned long)val );
152	return (unsigned)(r>>`3`);
153	# elif defined(__GNUC__) && (__GNUC__ >= 3)
154	return (__builtin_clz((U32)val) >> `3`);
155	# else
156	unsigned r;
157	if (!(val>>`16`)) { r=`2`; val>>=`8`; } else { r=`0`; val>>=`24`; }
158	r += (!val);
159	return r;
160	# endif
161	} }
162	}
163
164
/*! ZDICT_count() :
    Count the nb of common bytes between 2 pointers.
    Note : this function presumes end of buffer followed by noisy guard band.
           There is no explicit bound : the loop only stops at the first differing word.
*/
static size_t ZDICT_count(const void* pIn, const void* pMatch)
{
    const char* const pStart = (const char*)pIn;
    for (;;) {
        /* compare one machine word at a time */
        size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
        if (!diff) {
            pIn = (const char*)pIn+sizeof(size_t);
            pMatch = (const char*)pMatch+sizeof(size_t);
            continue;
        }
        /* words differ : add the nb of identical bytes inside this word */
        pIn = (const char*)pIn+ZDICT_NbCommonBytes(diff);
        return (size_t)((const char*)pIn - pStart);
    }
}
183
184
185	typedef struct {
186	U32 pos;
187	U32 length;
188	U32 savings;
189	} dictItem;
190
191	static void ZDICT_initDictItem(dictItem* d)
192	{
193	d->pos = `1`;
194	d->length = `0`;
195	d->savings = (U32)(-`1`);
196	}
197
198
199	#define LLIMIT 64 /* heuristic determined experimentally */
200	#define MINMATCHLENGTH 7 /* heuristic determined experimentally */
201	static dictItem ZDICT_analyzePos(
202	BYTE* doneMarks,
203	const int* suffix, U32 start,
204	const void* buffer, U32 minRatio, U32 notificationLevel)
205	{
206	U32 lengthList[LLIMIT] = {`0`};
207	U32 cumulLength[LLIMIT] = {`0`};
208	U32 savings[LLIMIT] = {`0`};
209	const BYTE* b = (const BYTE*)buffer;
210	size_t maxLength = LLIMIT;
211	size_t pos = suffix[start];
212	U32 end = start;
213	dictItem solution;
214
215	/ init /
216	memset(&solution, `0`, sizeof(solution));
217	doneMarks[pos] = `1`;
218
219	/ trivial repetition cases /
220	if ( (MEM_read16(b+pos+`0`) == MEM_read16(b+pos+`2`))
221	\|\|(MEM_read16(b+pos+`1`) == MEM_read16(b+pos+`3`))
222	\|\|(MEM_read16(b+pos+`2`) == MEM_read16(b+pos+`4`)) ) {
223	/ skip and mark segment /
224	U16 const pattern16 = MEM_read16(b+pos+`4`);
225	U32 u, patternEnd = `6`;
226	while (MEM_read16(b+pos+patternEnd) == pattern16) patternEnd+=`2` ;
227	if (b[pos+patternEnd] == b[pos+patternEnd-`1`]) patternEnd++;
228	for (u=`1`; u<patternEnd; u++)
229	doneMarks[pos+u] = `1`;
230	return solution;
231	}
232
233	/ look forward /
234	{ size_t length;
235	do {
236	end++;
237	length = ZDICT_count(b + pos, b + suffix[end]);
238	} while (length >= MINMATCHLENGTH);
239	}
240
241	/ look backward /
242	{ size_t length;
243	do {
244	length = ZDICT_count(b + pos, b + *(suffix+start-`1`));
245	if (length >=MINMATCHLENGTH) start--;
246	} while(length >= MINMATCHLENGTH);
247	}
248
249	/ exit if not found a minimum nb of repetitions /
250	if (end-start < minRatio) {
251	U32 idx;
252	for(idx=start; idx<end; idx++)
253	doneMarks[suffix[idx]] = `1`;
254	return solution;
255	}
256
257	{ int i;
258	U32 searchLength;
259	U32 refinedStart = start;
260	U32 refinedEnd = end;
261
262	DISPLAYLEVEL(`4`, "\n");
263	DISPLAYLEVEL(`4`, "found %3u matches of length >= %i at pos %7u ", (U32)(end-start), MINMATCHLENGTH, (U32)pos);
264	DISPLAYLEVEL(`4`, "\n");
265
266	for (searchLength = MINMATCHLENGTH ; ; searchLength++) {
267	BYTE currentChar = `0`;
268	U32 currentCount = `0`;
269	U32 currentID = refinedStart;
270	U32 id;
271	U32 selectedCount = `0`;
272	U32 selectedID = currentID;
273	for (id =refinedStart; id < refinedEnd; id++) {
274	if (b[suffix[id] + searchLength] != currentChar) {
275	if (currentCount > selectedCount) {
276	selectedCount = currentCount;
277	selectedID = currentID;
278	}
279	currentID = id;
280	currentChar = b[ suffix[id] + searchLength];
281	currentCount = `0`;
282	}
283	currentCount ++;
284	}
285	if (currentCount > selectedCount) { / for last /
286	selectedCount = currentCount;
287	selectedID = currentID;
288	}
289
290	if (selectedCount < minRatio)
291	break;
292	refinedStart = selectedID;
293	refinedEnd = refinedStart + selectedCount;
294	}
295
296	/ evaluate gain based on new ref /
297	start = refinedStart;
298	pos = suffix[refinedStart];
299	end = start;
300	memset(lengthList, `0`, sizeof(lengthList));
301
302	/ look forward /
303	{ size_t length;
304	do {
305	end++;
306	length = ZDICT_count(b + pos, b + suffix[end]);
307	if (length >= LLIMIT) length = LLIMIT-`1`;
308	lengthList[length]++;
309	} while (length >=MINMATCHLENGTH);
310	}
311
312	/ look backward /
313	{ size_t length = MINMATCHLENGTH;
314	while ((length >= MINMATCHLENGTH) & (start > `0`)) {
315	length = ZDICT_count(b + pos, b + suffix[start - `1`]);
316	if (length >= LLIMIT) length = LLIMIT - `1`;
317	lengthList[length]++;
318	if (length >= MINMATCHLENGTH) start--;
319	}
320	}
321
322	/ largest useful length /
323	memset(cumulLength, `0`, sizeof(cumulLength));
324	cumulLength[maxLength-`1`] = lengthList[maxLength-`1`];
325	for (i=(int)(maxLength-`2`); i>=`0`; i--)
326	cumulLength[i] = cumulLength[i+`1`] + lengthList[i];
327
328	for (i=LLIMIT-`1`; i>=MINMATCHLENGTH; i--) if (cumulLength[i]>=minRatio) break;
329	maxLength = i;
330
331	/ reduce maxLength in case of final into repetitive data /
332	{ U32 l = (U32)maxLength;
333	BYTE const c = b[pos + maxLength-`1`];
334	while (b[pos+l-`2`]==c) l--;
335	maxLength = l;
336	}
337	if (maxLength < MINMATCHLENGTH) return solution; / skip : no long-enough solution /
338
339	/ calculate savings /
340	savings[`5`] = `0`;
341	for (i=MINMATCHLENGTH; i<=(int)maxLength; i++)
342	savings[i] = savings[i-`1`] + (lengthList[i] * (i-`3`));
343
344	DISPLAYLEVEL(`4`, "Selected ref at position %u, of length %u : saves %u (ratio: %.2f) \n",
345	(U32)pos, (U32)maxLength, savings[maxLength], (double)savings[maxLength] / maxLength);
346
347	solution.pos = (U32)pos;
348	solution.length = (U32)maxLength;
349	solution.savings = savings[maxLength];
350
351	/ mark positions done /
352	{ U32 id;
353	for (id=start; id<end; id++) {
354	U32 p, pEnd, length;
355	U32 const testedPos = suffix[id];
356	if (testedPos == pos)
357	length = solution.length;
358	else {
359	length = (U32)ZDICT_count(b+pos, b+testedPos);
360	if (length > solution.length) length = solution.length;
361	}
362	pEnd = (U32)(testedPos + length);
363	for (p=testedPos; p<pEnd; p++)
364	doneMarks[p] = `1`;
365	} } }
366
367	return solution;
368	}
369
370
/* isIncluded() :
 * @return : 1 if the first `length` bytes of `in` and `container` are identical, 0 otherwise.
 *           A `length` of 0 always yields 1. */
static int isIncluded(const void* in, const void* container, size_t length)
{
    const char* const ip = (const char*) in;
    const char* const into = (const char*) container;
    size_t u;

    for (u=0; u<length; u++) {  /* works because end of buffer is a noisy guard band */
        if (ip[u] != into[u]) break;
    }

    return u==length;
}
383
384	/! ZDICT_tryMerge() :*
385	check if dictItem can be merged, do it if possible
386	@return : id of destination elt, 0 if not merged
387	*/
388	static U32 ZDICT_tryMerge(dictItem* table, dictItem elt, U32 eltNbToSkip, const void* buffer)
389	{
390	const U32 tableSize = table->pos;
391	const U32 eltEnd = elt.pos + elt.length;
392	const char* const buf = (const char*) buffer;
393
394	/ tail overlap /
395	U32 u; for (u=`1`; u<tableSize; u++) {
396	if (u==eltNbToSkip) continue;
397	if ((table[u].pos > elt.pos) && (table[u].pos <= eltEnd)) { / overlap, existing > new /
398	/ append /
399	U32 const addedLength = table[u].pos - elt.pos;
400	table[u].length += addedLength;
401	table[u].pos = elt.pos;
402	table[u].savings += elt.savings * addedLength / elt.length; / rough approx /
403	table[u].savings += elt.length / `8`; / rough approx bonus /
404	elt = table[u];
405	/ sort : improve rank /
406	while ((u>`1`) && (table[u-`1`].savings < elt.savings))
407	table[u] = table[u-`1`], u--;
408	table[u] = elt;
409	return u;
410	} }
411
412	/ front overlap /
413	for (u=`1`; u<tableSize; u++) {
414	if (u==eltNbToSkip) continue;
415
416	if ((table[u].pos + table[u].length >= elt.pos) && (table[u].pos < elt.pos)) { / overlap, existing < new /
417	/ append /
418	int const addedLength = (int)eltEnd - (table[u].pos + table[u].length);
419	table[u].savings += elt.length / `8`; / rough approx bonus /
420	if (addedLength > `0`) { / otherwise, elt fully included into existing /
421	table[u].length += addedLength;
422	table[u].savings += elt.savings * addedLength / elt.length; / rough approx /
423	}
424	/ sort : improve rank /
425	elt = table[u];
426	while ((u>`1`) && (table[u-`1`].savings < elt.savings))
427	table[u] = table[u-`1`], u--;
428	table[u] = elt;
429	return u;
430	}
431
432	if (MEM_read64(buf + table[u].pos) == MEM_read64(buf + elt.pos + `1`)) {
433	if (isIncluded(buf + table[u].pos, buf + elt.pos + `1`, table[u].length)) {
434	size_t const addedLength = MAX( (int)elt.length - (int)table[u].length , `1` );
435	table[u].pos = elt.pos;
436	table[u].savings += (U32)(elt.savings * addedLength / elt.length);
437	table[u].length = MIN(elt.length, table[u].length + `1`);
438	return u;
439	}
440	}
441	}
442
443	return `0`;
444	}
445
446
447	static void ZDICT_removeDictItem(dictItem* table, U32 id)
448	{
449	/ convention : table[0].pos stores nb of elts /
450	U32 const max = table[`0`].pos;
451	U32 u;
452	if (!id) return; / protection, should never happen /
453	for (u=id; u<max-`1`; u++)
454	table[u] = table[u+`1`];
455	table->pos--;
456	}
457
458
459	static void ZDICT_insertDictItem(dictItem* table, U32 maxSize, dictItem elt, const void* buffer)
460	{
461	/ merge if possible /
462	U32 mergeId = ZDICT_tryMerge(table, elt, `0`, buffer);
463	if (mergeId) {
464	U32 newMerge = `1`;
465	while (newMerge) {
466	newMerge = ZDICT_tryMerge(table, table[mergeId], mergeId, buffer);
467	if (newMerge) ZDICT_removeDictItem(table, mergeId);
468	mergeId = newMerge;
469	}
470	return;
471	}
472
473	/ insert /
474	{ U32 current;
475	U32 nextElt = table->pos;
476	if (nextElt >= maxSize) nextElt = maxSize-`1`;
477	current = nextElt-`1`;
478	while (table[current].savings < elt.savings) {
479	table[current+`1`] = table[current];
480	current--;
481	}
482	table[current+`1`] = elt;
483	table->pos = nextElt+`1`;
484	}
485	}
486
487
488	static U32 ZDICT_dictSize(const dictItem* dictList)
489	{
490	U32 u, dictSize = `0`;
491	for (u=`1`; u<dictList[`0`].pos; u++)
492	dictSize += dictList[u].length;
493	return dictSize;
494	}
495
496
497	static size_t ZDICT_trainBuffer_legacy(dictItem* dictList, U32 dictListSize,
498	const void* const buffer, size_t bufferSize, / buffer must end with noisy guard band /
499	const size_t* fileSizes, unsigned nbFiles,
500	U32 minRatio, U32 notificationLevel)
501	{
502	int* const suffix0 = (int)malloc((bufferSize+`2`)sizeof(*suffix0));
503	int* const suffix = suffix0+`1`;
504	U32* reverseSuffix = (U32)malloc((bufferSize)sizeof(*reverseSuffix));
505	BYTE* doneMarks = (BYTE)malloc((bufferSize+`16`)sizeof(doneMarks)); /* +16 for overflow security /
506	U32* filePos = (U32)malloc(nbFiles sizeof(*filePos));
507	size_t result = `0`;
508	clock_t displayClock = `0`;
509	clock_t const refreshRate = CLOCKS_PER_SEC * `3` / `10`;
510
511	# define DISPLAYUPDATE(l, ...) if (notificationLevel>=l) { \
512	if (ZDICT_clockSpan(displayClock) > refreshRate) \
513	{ displayClock = clock(); DISPLAY(__VA_ARGS__); \
514	if (notificationLevel>=4) fflush(stderr); } }
515
516	/ init /
517	DISPLAYLEVEL(`2`, "\r%70s\r", ""); / clean display line /
518	if (!suffix0 \|\| !reverseSuffix \|\| !doneMarks \|\| !filePos) {
519	result = ERROR(memory_allocation);
520	goto _cleanup;
521	}
522	if (minRatio < MINRATIO) minRatio = MINRATIO;
523	memset(doneMarks, `0`, bufferSize+`16`);
524
525	/ limit sample set size (divsufsort limitation)/
526	if (bufferSize > ZDICT_MAX_SAMPLES_SIZE) DISPLAYLEVEL(`3`, "sample set too large : reduced to %u MB ...\n", (U32)(ZDICT_MAX_SAMPLES_SIZE>>`20`));
527	while (bufferSize > ZDICT_MAX_SAMPLES_SIZE) bufferSize -= fileSizes[--nbFiles];
528
529	/ sort /
530	DISPLAYLEVEL(`2`, "sorting %u files of total size %u MB ...\n", nbFiles, (U32)(bufferSize>>`20`));
531	{ int const divSuftSortResult = divsufsort((const unsigned char)buffer, suffix, (int*)bufferSize, `0`);
532	if (divSuftSortResult != `0`) { result = ERROR(GENERIC); goto _cleanup; }
533	}
534	suffix[bufferSize] = (int)bufferSize; / leads into noise /
535	suffix0[`0`] = (int)bufferSize; / leads into noise /
536	/ build reverse suffix sort /
537	{ size_t pos;
538	for (pos=`0`; pos < bufferSize; pos++)
539	reverseSuffix[suffix[pos]] = (U32)pos;
540	/ note filePos tracks borders between samples.*
541	It's not used at this stage, but planned to become useful in a later update /*
542	filePos[`0`] = `0`;
543	for (pos=`1`; pos<nbFiles; pos++)
544	filePos[pos] = (U32)(filePos[pos-`1`] + fileSizes[pos-`1`]);
545	}
546
547	DISPLAYLEVEL(`2`, "finding patterns ... \n");
548	DISPLAYLEVEL(`3`, "minimum ratio : %u \n", minRatio);
549
550	{ U32 cursor; for (cursor=`0`; cursor < bufferSize; ) {
551	dictItem solution;
552	if (doneMarks[cursor]) { cursor++; continue; }
553	solution = ZDICT_analyzePos(doneMarks, suffix, reverseSuffix[cursor], buffer, minRatio, notificationLevel);
554	if (solution.length==`0`) { cursor++; continue; }
555	ZDICT_insertDictItem(dictList, dictListSize, solution, buffer);
556	cursor += solution.length;
557	DISPLAYUPDATE(`2`, "\r%4.2f %% \r", (double)cursor / bufferSize * `100`);
558	} }
559
560	_cleanup:
561	free(suffix0);
562	free(reverseSuffix);
563	free(doneMarks);
564	free(filePos);
565	return result;
566	}
567
568
/* ZDICT_fillNoise() :
 * Fill `length` bytes of `buffer` with a fixed pseudo-random sequence.
 * The sequence depends only on the byte index, so every call writes the same bytes. */
static void ZDICT_fillNoise(void* buffer, size_t length)
{
    unsigned const prime1 = 2654435761U;
    unsigned const prime2 = 2246822519U;
    unsigned acc = prime1;
    size_t p=0;
    for (p=0; p<length; p++) {
        acc *= prime2;
        ((unsigned char*)buffer)[p] = (unsigned char)(acc >> 21);
    }
}
580
581
582	typedef struct
583	{
584	ZSTD_CCtx* ref; / contains reference to dictionary /
585	ZSTD_CCtx* zc; / working context /
586	void* workPlace; / must be ZSTD_BLOCKSIZE_MAX allocated /
587	} EStats_ress_t;
588
589	#define MAXREPOFFSET 1024
590
591	static void ZDICT_countEStats(EStats_ress_t esr, ZSTD_parameters params,
592	U32* countLit, U32* offsetcodeCount, U32* matchlengthCount, U32* litlengthCount, U32* repOffsets,
593	const void* src, size_t srcSize,
594	U32 notificationLevel)
595	{
596	size_t const blockSizeMax = MIN (ZSTD_BLOCKSIZE_MAX, `1` << params.cParams.windowLog);
597	size_t cSize;
598
599	if (srcSize > blockSizeMax) srcSize = blockSizeMax; / protection vs large samples /
600	{ size_t const errorCode = ZSTD_copyCCtx(esr.zc, esr.ref, `0`);
601	if (ZSTD_isError(errorCode)) { DISPLAYLEVEL(`1`, "warning : ZSTD_copyCCtx failed \n"); return; }
602	}
603	cSize = ZSTD_compressBlock(esr.zc, esr.workPlace, ZSTD_BLOCKSIZE_MAX, src, srcSize);
604	if (ZSTD_isError(cSize)) { DISPLAYLEVEL(`3`, "warning : could not compress sample size %u \n", (U32)srcSize); return; }
605
606	if (cSize) { / if == 0; block is not compressible /
607	const seqStore_t* const seqStorePtr = ZSTD_getSeqStore(esr.zc);
608
609	/ literals stats /
610	{ const BYTE* bytePtr;
611	for(bytePtr = seqStorePtr->litStart; bytePtr < seqStorePtr->lit; bytePtr++)
612	countLit[*bytePtr]++;
613	}
614
615	/ seqStats /
616	{ U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart);
617	ZSTD_seqToCodes(seqStorePtr);
618
619	{ const BYTE* codePtr = seqStorePtr->ofCode;
620	U32 u;
621	for (u=`0`; u<nbSeq; u++) offsetcodeCount[codePtr[u]]++;
622	}
623
624	{ const BYTE* codePtr = seqStorePtr->mlCode;
625	U32 u;
626	for (u=`0`; u<nbSeq; u++) matchlengthCount[codePtr[u]]++;
627	}
628
629	{ const BYTE* codePtr = seqStorePtr->llCode;
630	U32 u;
631	for (u=`0`; u<nbSeq; u++) litlengthCount[codePtr[u]]++;
632	}
633
634	if (nbSeq >= `2`) { / rep offsets /
635	const seqDef* const seq = seqStorePtr->sequencesStart;
636	U32 offset1 = seq[`0`].offset - `3`;
637	U32 offset2 = seq[`1`].offset - `3`;
638	if (offset1 >= MAXREPOFFSET) offset1 = `0`;
639	if (offset2 >= MAXREPOFFSET) offset2 = `0`;
640	repOffsets[offset1] += `3`;
641	repOffsets[offset2] += `1`;
642	} } }
643	}
644
/* ZDICT_totalSampleSize() :
 * @return : sum of the first `nbFiles` entries of `fileSizes` (0 when nbFiles==0). */
static size_t ZDICT_totalSampleSize(const size_t* fileSizes, unsigned nbFiles)
{
    size_t total=0;
    unsigned u;
    for (u=0; u<nbFiles; u++) total += fileSizes[u];
    return total;
}
652
653	typedef struct { U32 offset; U32 count; } offsetCount_t;
654
655	static void ZDICT_insertSortCount(offsetCount_t table[ZSTD_REP_NUM+`1`], U32 val, U32 count)
656	{
657	U32 u;
658	table[ZSTD_REP_NUM].offset = val;
659	table[ZSTD_REP_NUM].count = count;
660	for (u=ZSTD_REP_NUM; u>`0`; u--) {
661	offsetCount_t tmp;
662	if (table[u-`1`].count >= table[u].count) break;
663	tmp = table[u-`1`];
664	table[u-`1`] = table[u];
665	table[u] = tmp;
666	}
667	}
668
669	/ ZDICT_flatLit() :*
670	* rewrite `countLit` to contain a mostly flat but still compressible distribution of literals.
671	* necessary to avoid generating a non-compressible distribution that HUF_writeCTable() cannot encode.
672	*/
673	static void ZDICT_flatLit(U32* countLit)
674	{
675	int u;
676	for (u=`1`; u<`256`; u++) countLit[u] = `2`;
677	countLit[`0`] = `4`;
678	countLit[`253`] = `1`;
679	countLit[`254`] = `1`;
680	}
681
682	#define OFFCODE_MAX 30 /* only applicable to first block */
683	static size_t ZDICT_analyzeEntropy(void* dstBuffer, size_t maxDstSize,
684	unsigned compressionLevel,
685	const void* srcBuffer, const size_t* fileSizes, unsigned nbFiles,
686	const void* dictBuffer, size_t dictBufferSize,
687	unsigned notificationLevel)
688	{
689	U32 countLit[`256`];
690	HUF_CREATE_STATIC_CTABLE(hufTable, `255`);
691	U32 offcodeCount[OFFCODE_MAX+`1`];
692	short offcodeNCount[OFFCODE_MAX+`1`];
693	U32 offcodeMax = ZSTD_highbit32((U32)(dictBufferSize + `128` KB));
694	U32 matchLengthCount[MaxML+`1`];
695	short matchLengthNCount[MaxML+`1`];
696	U32 litLengthCount[MaxLL+`1`];
697	short litLengthNCount[MaxLL+`1`];
698	U32 repOffset[MAXREPOFFSET];
699	offsetCount_t bestRepOffset[ZSTD_REP_NUM+`1`];
700	EStats_ress_t esr;
701	ZSTD_parameters params;
702	U32 u, huffLog = `11`, Offlog = OffFSELog, mlLog = MLFSELog, llLog = LLFSELog, total;
703	size_t pos = `0`, errorCode;
704	size_t eSize = `0`;
705	size_t const totalSrcSize = ZDICT_totalSampleSize(fileSizes, nbFiles);
706	size_t const averageSampleSize = totalSrcSize / (nbFiles + !nbFiles);
707	BYTE* dstPtr = (BYTE*)dstBuffer;
708
709	/ init /
710	DEBUGLOG(`4`, "ZDICT_analyzeEntropy");
711	esr.ref = ZSTD_createCCtx();
712	esr.zc = ZSTD_createCCtx();
713	esr.workPlace = malloc(ZSTD_BLOCKSIZE_MAX);
714	if (!esr.ref \|\| !esr.zc \|\| !esr.workPlace) {
715	eSize = ERROR(memory_allocation);
716	DISPLAYLEVEL(`1`, "Not enough memory \n");
717	goto _cleanup;
718	}
719	if (offcodeMax>OFFCODE_MAX) { eSize = ERROR(dictionaryCreation_failed); goto _cleanup; } / too large dictionary /
720	for (u=`0`; u<`256`; u++) countLit[u] = `1`; / any character must be described /
721	for (u=`0`; u<=offcodeMax; u++) offcodeCount[u] = `1`;
722	for (u=`0`; u<=MaxML; u++) matchLengthCount[u] = `1`;
723	for (u=`0`; u<=MaxLL; u++) litLengthCount[u] = `1`;
724	memset(repOffset, `0`, sizeof(repOffset));
725	repOffset[`1`] = repOffset[`4`] = repOffset[`8`] = `1`;
726	memset(bestRepOffset, `0`, sizeof(bestRepOffset));
727	if (compressionLevel<=`0`) compressionLevel = g_compressionLevel_default;
728	params = ZSTD_getParams(compressionLevel, averageSampleSize, dictBufferSize);
729	{ size_t const beginResult = ZSTD_compressBegin_advanced(esr.ref, dictBuffer, dictBufferSize, params, `0`);
730	if (ZSTD_isError(beginResult)) {
731	DISPLAYLEVEL(`1`, "error : ZSTD_compressBegin_advanced() failed : %s \n", ZSTD_getErrorName(beginResult));
732	eSize = ERROR(GENERIC);
733	goto _cleanup;
734	} }
735
736	/ collect stats on all samples /
737	for (u=`0`; u<nbFiles; u++) {
738	ZDICT_countEStats(esr, params,
739	countLit, offcodeCount, matchLengthCount, litLengthCount, repOffset,
740	(const char*)srcBuffer + pos, fileSizes[u],
741	notificationLevel);
742	pos += fileSizes[u];
743	}
744
745	/ analyze, build stats, starting with literals /
746	{ size_t maxNbBits = HUF_buildCTable (hufTable, countLit, `255`, huffLog);
747	if (HUF_isError(maxNbBits)) {
748	eSize = ERROR(GENERIC);
749	DISPLAYLEVEL(`1`, " HUF_buildCTable error \n");
750	goto _cleanup;
751	}
752	if (maxNbBits==`8`) { / not compressible : will fail on HUF_writeCTable() /
753	DISPLAYLEVEL(`2`, "warning : pathological dataset : literals are not compressible : samples are noisy or too regular \n");
754	ZDICT_flatLit(countLit); / replace distribution by a fake "mostly flat but still compressible" distribution, that HUF_writeCTable() can encode /
755	maxNbBits = HUF_buildCTable (hufTable, countLit, `255`, huffLog);
756	assert(maxNbBits==`9`);
757	}
758	huffLog = (U32)maxNbBits;
759	}
760
761	/ looking for most common first offsets /
762	{ U32 offset;
763	for (offset=`1`; offset<MAXREPOFFSET; offset++)
764	ZDICT_insertSortCount(bestRepOffset, offset, repOffset[offset]);
765	}
766	/ note : the result of this phase should be used to better appreciate the impact on statistics /
767
768	total=`0`; for (u=`0`; u<=offcodeMax; u++) total+=offcodeCount[u];
769	errorCode = FSE_normalizeCount(offcodeNCount, Offlog, offcodeCount, total, offcodeMax);
770	if (FSE_isError(errorCode)) {
771	eSize = ERROR(GENERIC);
772	DISPLAYLEVEL(`1`, "FSE_normalizeCount error with offcodeCount \n");
773	goto _cleanup;
774	}
775	Offlog = (U32)errorCode;
776
777	total=`0`; for (u=`0`; u<=MaxML; u++) total+=matchLengthCount[u];
778	errorCode = FSE_normalizeCount(matchLengthNCount, mlLog, matchLengthCount, total, MaxML);
779	if (FSE_isError(errorCode)) {
780	eSize = ERROR(GENERIC);
781	DISPLAYLEVEL(`1`, "FSE_normalizeCount error with matchLengthCount \n");
782	goto _cleanup;
783	}
784	mlLog = (U32)errorCode;
785
786	total=`0`; for (u=`0`; u<=MaxLL; u++) total+=litLengthCount[u];
787	errorCode = FSE_normalizeCount(litLengthNCount, llLog, litLengthCount, total, MaxLL);
788	if (FSE_isError(errorCode)) {
789	eSize = ERROR(GENERIC);
790	DISPLAYLEVEL(`1`, "FSE_normalizeCount error with litLengthCount \n");
791	goto _cleanup;
792	}
793	llLog = (U32)errorCode;
794
795	/ write result to buffer /
796	{ size_t const hhSize = HUF_writeCTable(dstPtr, maxDstSize, hufTable, `255`, huffLog);
797	if (HUF_isError(hhSize)) {
798	eSize = ERROR(GENERIC);
799	DISPLAYLEVEL(`1`, "HUF_writeCTable error \n");
800	goto _cleanup;
801	}
802	dstPtr += hhSize;
803	maxDstSize -= hhSize;
804	eSize += hhSize;
805	}
806
807	{ size_t const ohSize = FSE_writeNCount(dstPtr, maxDstSize, offcodeNCount, OFFCODE_MAX, Offlog);
808	if (FSE_isError(ohSize)) {
809	eSize = ERROR(GENERIC);
810	DISPLAYLEVEL(`1`, "FSE_writeNCount error with offcodeNCount \n");
811	goto _cleanup;
812	}
813	dstPtr += ohSize;
814	maxDstSize -= ohSize;
815	eSize += ohSize;
816	}
817
818	{ size_t const mhSize = FSE_writeNCount(dstPtr, maxDstSize, matchLengthNCount, MaxML, mlLog);
819	if (FSE_isError(mhSize)) {
820	eSize = ERROR(GENERIC);
821	DISPLAYLEVEL(`1`, "FSE_writeNCount error with matchLengthNCount \n");
822	goto _cleanup;
823	}
824	dstPtr += mhSize;
825	maxDstSize -= mhSize;
826	eSize += mhSize;
827	}
828
829	{ size_t const lhSize = FSE_writeNCount(dstPtr, maxDstSize, litLengthNCount, MaxLL, llLog);
830	if (FSE_isError(lhSize)) {
831	eSize = ERROR(GENERIC);
832	DISPLAYLEVEL(`1`, "FSE_writeNCount error with litlengthNCount \n");
833	goto _cleanup;
834	}
835	dstPtr += lhSize;
836	maxDstSize -= lhSize;
837	eSize += lhSize;
838	}
839
840	if (maxDstSize<`12`) {
841	eSize = ERROR(GENERIC);
842	DISPLAYLEVEL(`1`, "not enough space to write RepOffsets \n");
843	goto _cleanup;
844	}
845	# if 0
846	MEM_writeLE32(dstPtr+`0`, bestRepOffset[`0`].offset);
847	MEM_writeLE32(dstPtr+`4`, bestRepOffset[`1`].offset);
848	MEM_writeLE32(dstPtr+`8`, bestRepOffset[`2`].offset);
849	#else
850	/ at this stage, we don't use the result of "most common first offset",*
851	as the impact of statistics is not properly evaluated /*
852	MEM_writeLE32(dstPtr+`0`, repStartValue[`0`]);
853	MEM_writeLE32(dstPtr+`4`, repStartValue[`1`]);
854	MEM_writeLE32(dstPtr+`8`, repStartValue[`2`]);
855	#endif
856	eSize += `12`;
857
858	_cleanup:
859	ZSTD_freeCCtx(esr.ref);
860	ZSTD_freeCCtx(esr.zc);
861	free(esr.workPlace);
862
863	return eSize;
864	}
865
866
867
868	size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity,
869	const void* customDictContent, size_t dictContentSize,
870	const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
871	ZDICT_params_t params)
872	{
873	size_t hSize;
874	#define HBUFFSIZE 256 /* should prove large enough for all entropy headers */
875	BYTE header[HBUFFSIZE];
876	int const compressionLevel = (params.compressionLevel <= `0`) ? g_compressionLevel_default : params.compressionLevel;
877	U32 const notificationLevel = params.notificationLevel;
878
879	/ check conditions /
880	DEBUGLOG(`4`, "ZDICT_finalizeDictionary");
881	if (dictBufferCapacity < dictContentSize) return ERROR(dstSize_tooSmall);
882	if (dictContentSize < ZDICT_CONTENTSIZE_MIN) return ERROR(srcSize_wrong);
883	if (dictBufferCapacity < ZDICT_DICTSIZE_MIN) return ERROR(dstSize_tooSmall);
884
885	/ dictionary header /
886	MEM_writeLE32(header, ZSTD_MAGIC_DICTIONARY);
887	{ U64 const randomID = XXH64(customDictContent, dictContentSize, `0`);
888	U32 const compliantID = (randomID % ((`1U`<<`31`)-`32768`)) + `32768`;
889	U32 const dictID = params.dictID ? params.dictID : compliantID;
890	MEM_writeLE32(header+`4`, dictID);
891	}
892	hSize = `8`;
893
894	/ entropy tables /
895	DISPLAYLEVEL(`2`, "\r%70s\r", ""); / clean display line /
896	DISPLAYLEVEL(`2`, "statistics ... \n");
897	{ size_t const eSize = ZDICT_analyzeEntropy(header+hSize, HBUFFSIZE-hSize,
898	compressionLevel,
899	samplesBuffer, samplesSizes, nbSamples,
900	customDictContent, dictContentSize,
901	notificationLevel);
902	if (ZDICT_isError(eSize)) return eSize;
903	hSize += eSize;
904	}
905
906	/ copy elements in final buffer ; note : src and dst buffer can overlap /
907	if (hSize + dictContentSize > dictBufferCapacity) dictContentSize = dictBufferCapacity - hSize;
908	{ size_t const dictSize = hSize + dictContentSize;
909	char* dictEnd = (char*)dictBuffer + dictSize;
910	memmove(dictEnd - dictContentSize, customDictContent, dictContentSize);
911	memcpy(dictBuffer, header, hSize);
912	return dictSize;
913	}
914	}
915
916
917	size_t ZDICT_addEntropyTablesFromBuffer_advanced(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
918	const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
919	ZDICT_params_t params)
920	{
921	int const compressionLevel = (params.compressionLevel <= `0`) ? g_compressionLevel_default : params.compressionLevel;
922	U32 const notificationLevel = params.notificationLevel;
923	size_t hSize = `8`;
924
925	/ calculate entropy tables /
926	DISPLAYLEVEL(`2`, "\r%70s\r", ""); / clean display line /
927	DISPLAYLEVEL(`2`, "statistics ... \n");
928	{ size_t const eSize = ZDICT_analyzeEntropy((char*)dictBuffer+hSize, dictBufferCapacity-hSize,
929	compressionLevel,
930	samplesBuffer, samplesSizes, nbSamples,
931	(char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize,
932	notificationLevel);
933	if (ZDICT_isError(eSize)) return eSize;
934	hSize += eSize;
935	}
936
937	/ add dictionary header (after entropy tables) /
938	MEM_writeLE32(dictBuffer, ZSTD_MAGIC_DICTIONARY);
939	{ U64 const randomID = XXH64((char*)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize, `0`);
940	U32 const compliantID = (randomID % ((`1U`<<`31`)-`32768`)) + `32768`;
941	U32 const dictID = params.dictID ? params.dictID : compliantID;
942	MEM_writeLE32((char*)dictBuffer+`4`, dictID);
943	}
944
945	if (hSize + dictContentSize < dictBufferCapacity)
946	memmove((char)dictBuffer + hSize, (char**)dictBuffer + dictBufferCapacity - dictContentSize, dictContentSize);
947	return MIN(dictBufferCapacity, hSize+dictContentSize);
948	}
949
950
951	/! ZDICT_trainFromBuffer_unsafe_legacy() :*
952	* Warning : `samplesBuffer` must be followed by noisy guard band.
953	* @return : size of dictionary, or an error code which can be tested with ZDICT_isError()
954	*/
955	size_t ZDICT_trainFromBuffer_unsafe_legacy(
956	void* dictBuffer, size_t maxDictSize,
957	const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
958	ZDICT_legacy_params_t params)
959	{
960	U32 const dictListSize = MAX(MAX(DICTLISTSIZE_DEFAULT, nbSamples), (U32)(maxDictSize/`16`));
961	dictItem* const dictList = (dictItem)malloc(dictListSize sizeof(*dictList));
962	unsigned const selectivity = params.selectivityLevel == `0` ? g_selectivity_default : params.selectivityLevel;
963	unsigned const minRep = (selectivity > `30`) ? MINRATIO : nbSamples >> selectivity;
964	size_t const targetDictSize = maxDictSize;
965	size_t const samplesBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
966	size_t dictSize = `0`;
967	U32 const notificationLevel = params.zParams.notificationLevel;
968
969	/ checks /
970	if (!dictList) return ERROR(memory_allocation);
971	if (maxDictSize < ZDICT_DICTSIZE_MIN) { free(dictList); return ERROR(dstSize_tooSmall); } / requested dictionary size is too small /
972	if (samplesBuffSize < ZDICT_MIN_SAMPLES_SIZE) { free(dictList); return ERROR(dictionaryCreation_failed); } / not enough source to create dictionary /
973
974	/ init /
975	ZDICT_initDictItem(dictList);
976
977	/ build dictionary /
978	ZDICT_trainBuffer_legacy(dictList, dictListSize,
979	samplesBuffer, samplesBuffSize,
980	samplesSizes, nbSamples,
981	minRep, notificationLevel);
982
983	/ display best matches /
984	if (params.zParams.notificationLevel>= `3`) {
985	U32 const nb = MIN(`25`, dictList[`0`].pos);
986	U32 const dictContentSize = ZDICT_dictSize(dictList);
987	U32 u;
988	DISPLAYLEVEL(`3`, "\n %u segments found, of total size %u \n", dictList[`0`].pos-`1`, dictContentSize);
989	DISPLAYLEVEL(`3`, "list %u best segments \n", nb-`1`);
990	for (u=`1`; u<nb; u++) {
991	U32 const pos = dictList[u].pos;
992	U32 const length = dictList[u].length;
993	U32 const printedLength = MIN(`40`, length);
994	if ((pos > samplesBuffSize) \|\| ((pos + length) > samplesBuffSize))
995	return ERROR(GENERIC); / should never happen /
996	DISPLAYLEVEL(`3`, "%3u:%3u bytes at pos %8u, savings %7u bytes \|",
997	u, length, pos, dictList[u].savings);
998	ZDICT_printHex((const char*)samplesBuffer+pos, printedLength);
999	DISPLAYLEVEL(`3`, "\| \n");
1000	} }
1001
1002
1003	/ create dictionary /
1004	{ U32 dictContentSize = ZDICT_dictSize(dictList);
1005	if (dictContentSize < ZDICT_CONTENTSIZE_MIN) { free(dictList); return ERROR(dictionaryCreation_failed); } / dictionary content too small /
1006	if (dictContentSize < targetDictSize/`4`) {
1007	DISPLAYLEVEL(`2`, "! warning : selected content significantly smaller than requested (%u < %u) \n", dictContentSize, (U32)maxDictSize);
1008	if (samplesBuffSize < `10` * targetDictSize)
1009	DISPLAYLEVEL(`2`, "! consider increasing the number of samples (total size : %u MB)\n", (U32)(samplesBuffSize>>`20`));
1010	if (minRep > MINRATIO) {
1011	DISPLAYLEVEL(`2`, "! consider increasing selectivity to produce larger dictionary (-s%u) \n", selectivity+`1`);
1012	DISPLAYLEVEL(`2`, "! note : larger dictionaries are not necessarily better, test its efficiency on samples \n");
1013	}
1014	}
1015
1016	if ((dictContentSize > targetDictSize`3`) && (nbSamples > `2`MINRATIO) && (selectivity>`1`)) {
1017	U32 proposedSelectivity = selectivity-`1`;
1018	while ((nbSamples >> proposedSelectivity) <= MINRATIO) { proposedSelectivity--; }
1019	DISPLAYLEVEL(`2`, "! note : calculated dictionary significantly larger than requested (%u > %u) \n", dictContentSize, (U32)maxDictSize);
1020	DISPLAYLEVEL(`2`, "! consider increasing dictionary size, or produce denser dictionary (-s%u) \n", proposedSelectivity);
1021	DISPLAYLEVEL(`2`, "! always test dictionary efficiency on real samples \n");
1022	}
1023
1024	/ limit dictionary size /
1025	{ U32 const max = dictList->pos; / convention : nb of useful elts within dictList /
1026	U32 currentSize = `0`;
1027	U32 n; for (n=`1`; n<max; n++) {
1028	currentSize += dictList[n].length;
1029	if (currentSize > targetDictSize) { currentSize -= dictList[n].length; break; }
1030	}
1031	dictList->pos = n;
1032	dictContentSize = currentSize;
1033	}
1034
1035	/ build dict content /
1036	{ U32 u;
1037	BYTE* ptr = (BYTE*)dictBuffer + maxDictSize;
1038	for (u=`1`; u<dictList->pos; u++) {
1039	U32 l = dictList[u].length;
1040	ptr -= l;
1041	if (ptr<(BYTE)dictBuffer) { free(dictList); return* ERROR(GENERIC); } / should not happen /
1042	memcpy(ptr, (const char*)samplesBuffer+dictList[u].pos, l);
1043	} }
1044
1045	dictSize = ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, maxDictSize,
1046	samplesBuffer, samplesSizes, nbSamples,
1047	params.zParams);
1048	}
1049
1050	/ clean up /
1051	free(dictList);
1052	return dictSize;
1053	}
1054
1055
1056	/ ZDICT_trainFromBuffer_legacy() :*
1057	* issue : samplesBuffer need to be followed by a noisy guard band.
1058	* work around : duplicate the buffer, and add the noise */
1059	size_t ZDICT_trainFromBuffer_legacy(void* dictBuffer, size_t dictBufferCapacity,
1060	const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples,
1061	ZDICT_legacy_params_t params)
1062	{
1063	size_t result;
1064	void* newBuff;
1065	size_t const sBuffSize = ZDICT_totalSampleSize(samplesSizes, nbSamples);
1066	if (sBuffSize < ZDICT_MIN_SAMPLES_SIZE) return `0`; / not enough content => no dictionary /
1067
1068	newBuff = malloc(sBuffSize + NOISELENGTH);
1069	if (!newBuff) return ERROR(memory_allocation);
1070
1071	memcpy(newBuff, samplesBuffer, sBuffSize);
1072	ZDICT_fillNoise((char)newBuff + sBuffSize, NOISELENGTH); /* guard band, for end of buffer condition /
1073
1074	result =
1075	ZDICT_trainFromBuffer_unsafe_legacy(dictBuffer, dictBufferCapacity, newBuff,
1076	samplesSizes, nbSamples, params);
1077	free(newBuff);
1078	return result;
1079	}
1080
1081
1082	size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity,
1083	const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1084	{
1085	ZDICT_cover_params_t params;
1086	DEBUGLOG(`3`, "ZDICT_trainFromBuffer");
1087	memset(&params, `0`, sizeof(params));
1088	params.d = `8`;
1089	params.steps = `4`;
1090	/ Default to level 6 since no compression level information is available /
1091	params.zParams.compressionLevel = `6`;
1092	#if defined(ZSTD_DEBUG) && (ZSTD_DEBUG>=1)
1093	params.zParams.notificationLevel = ZSTD_DEBUG;
1094	#endif
1095	return ZDICT_optimizeTrainFromBuffer_cover(dictBuffer, dictBufferCapacity,
1096	samplesBuffer, samplesSizes, nbSamples,
1097	&params);
1098	}
1099
1100	size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity,
1101	const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples)
1102	{
1103	ZDICT_params_t params;
1104	memset(&params, `0`, sizeof(params));
1105	return ZDICT_addEntropyTablesFromBuffer_advanced(dictBuffer, dictContentSize, dictBufferCapacity,
1106	samplesBuffer, samplesSizes, nbSamples,
1107	params);
1108	}
1109

/* Source: ClickHouse/contrib/zstd/lib/dictBuilder/zdict.c */