1 | /* |
2 | * Copyright (c) 2016-present, Yann Collet, Facebook, Inc. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under both the BSD-style license (found in the |
6 | * LICENSE file in the root directory of this source tree) and the GPLv2 (found |
7 | * in the COPYING file in the root directory of this source tree). |
8 | * You may select, at your option, one of the above-listed licenses. |
9 | */ |
10 | |
11 | #ifndef DICTBUILDER_H_001 |
12 | #define DICTBUILDER_H_001 |
13 | |
14 | #if defined (__cplusplus) |
15 | extern "C" { |
16 | #endif |
17 | |
18 | |
19 | /*====== Dependencies ======*/ |
20 | #include <stddef.h> /* size_t */ |
21 | |
22 | |
23 | /* ===== ZDICTLIB_API : controls visibility of library symbols. ZDICTLIB_VISIBILITY may be pre-defined by the build to override the default chosen below; ZSTD_DLL_EXPORT or ZSTD_DLL_IMPORT set to 1 select __declspec(dllexport) or __declspec(dllimport). ===== */ |
24 | #ifndef ZDICTLIB_VISIBILITY |
25 | # if defined(__GNUC__) && (__GNUC__ >= 4) |
26 | # define ZDICTLIB_VISIBILITY __attribute__ ((visibility ("default"))) |
27 | # else |
28 | # define ZDICTLIB_VISIBILITY /* expands to nothing when the GCC >= 4 check above fails */ |
29 | # endif |
30 | #endif |
31 | #if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) |
32 | # define ZDICTLIB_API __declspec(dllexport) ZDICTLIB_VISIBILITY |
33 | #elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) |
34 | # define ZDICTLIB_API __declspec(dllimport) ZDICTLIB_VISIBILITY /* dllimport is not required, but it lets the compiler generate better code, saving a function pointer load from the IAT and an indirect jump. */ |
35 | #else |
36 | # define ZDICTLIB_API ZDICTLIB_VISIBILITY |
37 | #endif |
38 | |
39 | |
40 | /*! ZDICT_trainFromBuffer(): |
41 | * Train a dictionary from an array of samples. |
42 | * Redirects to ZDICT_optimizeTrainFromBuffer_cover(), single-threaded, with d=8 and steps=4. |
43 | * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, |
44 | * supplied with an array of sizes `samplesSizes` (presumably `nbSamples` entries), providing the size of each sample, in order. |
45 | * The resulting dictionary will be saved into `dictBuffer`. |
46 | * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
47 | * or an error code, which can be tested with ZDICT_isError(). |
48 | * Note: ZDICT_trainFromBuffer() requires about 9 bytes of memory for each input byte. |
49 | * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. |
50 | * It's possible to select a smaller or larger size, just by specifying `dictBufferCapacity`. |
51 | * In general, it's recommended to provide a few thousand samples, though this can vary a lot. |
52 | * It's recommended that the total size of all samples be about 100 times the target size of the dictionary. |
53 | */ |
54 | ZDICTLIB_API size_t ZDICT_trainFromBuffer(void* dictBuffer, size_t dictBufferCapacity, |
55 | const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); |
56 | |
57 | |
58 | /*====== Helper functions ======*/ |
59 | ZDICTLIB_API unsigned ZDICT_getDictID(const void* dictBuffer, size_t dictSize); /**< extracts dictID from the dictionary in `dictBuffer`; @return zero if error (not a valid dictionary) */ |
60 | ZDICTLIB_API unsigned ZDICT_isError(size_t errorCode); /**< tests whether a size_t result returned by a function of this header is an error code; presumably non-zero means error (confirm in implementation) */ |
61 | ZDICTLIB_API const char* ZDICT_getErrorName(size_t errorCode); /**< presumably returns a readable name for `errorCode` (inferred from the function name; confirm in implementation) */ |
62 | |
63 | |
64 | |
65 | #ifdef ZDICT_STATIC_LINKING_ONLY |
66 | |
67 | /* ==================================================================================== |
68 | * The definitions in this section are considered experimental. |
69 | * They should never be used with a dynamic library, as they may change in the future. |
70 | * They are provided for advanced usage. |
71 | * Use them only in association with static linking. |
72 | * ==================================================================================== */ |
73 | |
74 | typedef struct { /* Common parameters: passed directly to ZDICT_finalizeDictionary(), and embedded as `zParams` in ZDICT_cover_params_t and ZDICT_legacy_params_t */ |
75 | int compressionLevel; /* optimize for a specific zstd compression level; 0 means default */ |
76 | unsigned notificationLevel; /* Write log to stderr; 0 = none (default); 1 = errors; 2 = progression; 3 = details; 4 = debug; */ |
77 | unsigned dictID; /* force dictID value; 0 means auto mode (32-bit random value) */ |
78 | } ZDICT_params_t; |
79 | |
80 | /*! ZDICT_cover_params_t: |
81 | * k and d are the only required parameters for ZDICT_trainFromBuffer_cover(); ZDICT_optimizeTrainFromBuffer_cover() documents them as optional (0 makes it search over several values). |
82 | * For others, value 0 means default. |
83 | */ |
84 | typedef struct { |
85 | unsigned k; /* Segment size : constraint: 0 < k : Reasonable range [16, 2048+] */ |
86 | unsigned d; /* dmer size : constraint: 0 < d <= k : Reasonable range [6, 16] */ |
87 | unsigned steps; /* Number of steps : Only used for optimization : 0 means default (32) : Higher means more parameters checked */ |
88 | unsigned nbThreads; /* Number of threads : constraint: 0 < nbThreads : 1 means single-threaded : Only used for optimization : Ignored if ZSTD_MULTITHREAD is not defined : NOTE(review): this constraint conflicts with the "value 0 means default" rule stated for this struct - confirm how 0 is handled */ |
89 | ZDICT_params_t zParams; /* common parameters (compression level, notification level, dictID) */ |
90 | } ZDICT_cover_params_t; |
91 | |
92 | |
93 | /*! ZDICT_trainFromBuffer_cover(): |
94 | * Train a dictionary from an array of samples using the COVER algorithm. |
95 | * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, |
96 | * supplied with an array of sizes `samplesSizes` (presumably `nbSamples` entries), providing the size of each sample, in order. |
97 | * The resulting dictionary will be saved into `dictBuffer`. `parameters.k` and `parameters.d` are required (see ZDICT_cover_params_t). |
98 | * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
99 | * or an error code, which can be tested with ZDICT_isError(). |
100 | * Note: ZDICT_trainFromBuffer_cover() requires about 9 bytes of memory for each input byte. |
101 | * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. |
102 | * It's possible to select a smaller or larger size, just by specifying `dictBufferCapacity`. |
103 | * In general, it's recommended to provide a few thousand samples, though this can vary a lot. |
104 | * It's recommended that the total size of all samples be about 100 times the target size of the dictionary. |
105 | */ |
106 | ZDICTLIB_API size_t ZDICT_trainFromBuffer_cover( |
107 | void *dictBuffer, size_t dictBufferCapacity, |
108 | const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, |
109 | ZDICT_cover_params_t parameters); |
110 | |
111 | /*! ZDICT_optimizeTrainFromBuffer_cover(): |
112 | * The same requirements as for ZDICT_trainFromBuffer_cover() hold for all the parameters except `parameters`. |
113 | * This function tries many parameter combinations and picks the best parameters. |
114 | * `*parameters` is filled with the best parameters found, |
115 | * and the dictionary constructed with those parameters is stored in `dictBuffer`. |
116 | * |
117 | * All of the parameters d, k, steps are optional. |
118 | * If d is non-zero then we don't check multiple values of d, otherwise we check d = {6, 8, 10, 12, 14, 16}. |
119 | * If steps is zero, its default value is used (32, as documented in ZDICT_cover_params_t). |
120 | * If k is non-zero then we don't check multiple values of k, otherwise we check `steps` values of k in [16, 2048]. |
121 | * |
122 | * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
123 | * or an error code, which can be tested with ZDICT_isError(). |
124 | * On success `*parameters` contains the parameters selected. |
125 | * Note: ZDICT_optimizeTrainFromBuffer_cover() requires about 8 bytes of memory for each input byte, plus another 5 bytes of memory per byte for each thread (presumably per input byte - TODO confirm). |
126 | */ |
127 | ZDICTLIB_API size_t ZDICT_optimizeTrainFromBuffer_cover( |
128 | void* dictBuffer, size_t dictBufferCapacity, |
129 | const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
130 | ZDICT_cover_params_t* parameters); |
131 | |
132 | /*! ZDICT_finalizeDictionary(): |
133 | * Given custom content as a basis for a dictionary (`dictContent`, `dictContentSize` bytes), and a set of samples, |
134 | * finalize the dictionary by adding headers and statistics. |
135 | * |
136 | * Samples must be stored concatenated in a flat buffer `samplesBuffer`, |
137 | * supplied with an array of sizes `samplesSizes`, providing the size of each sample in order. |
138 | * |
139 | * dictContentSize must be >= ZDICT_CONTENTSIZE_MIN bytes. |
140 | * dictBufferCapacity must be >= dictContentSize, and must be >= ZDICT_DICTSIZE_MIN bytes. (NOTE(review): this sentence used to name `maxDictSize`, which is not a parameter of this function; assumed to mean `dictBufferCapacity` - confirm.) |
141 | * |
142 | * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`), |
143 | * or an error code, which can be tested by ZDICT_isError(). |
144 | * Note: ZDICT_finalizeDictionary() will push notifications into stderr if instructed to, using notificationLevel>0. |
145 | * Note 2: dictBuffer and dictContent can overlap. |
146 | */ |
147 | #define ZDICT_CONTENTSIZE_MIN 128 /* lower bound for dictContentSize, in bytes */ |
148 | #define ZDICT_DICTSIZE_MIN 256 /* lower bound for the dictionary buffer capacity, in bytes */ |
149 | ZDICTLIB_API size_t ZDICT_finalizeDictionary(void* dictBuffer, size_t dictBufferCapacity, |
150 | const void* dictContent, size_t dictContentSize, |
151 | const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples, |
152 | ZDICT_params_t parameters); |
153 | |
154 | typedef struct { /* parameters accepted by ZDICT_trainFromBuffer_legacy() */ |
155 | unsigned selectivityLevel; /* 0 means default; larger => select more => larger dictionary */ |
156 | ZDICT_params_t zParams; /* common parameters (compression level, notification level, dictID) */ |
157 | } ZDICT_legacy_params_t; |
158 | |
159 | /*! ZDICT_trainFromBuffer_legacy(): |
160 | * Train a dictionary from an array of samples. |
161 | * Samples must be stored concatenated in a single flat buffer `samplesBuffer`, |
162 | * supplied with an array of sizes `samplesSizes`, providing the size of each sample, in order. |
163 | * The resulting dictionary will be saved into `dictBuffer`. |
164 | * `parameters` is passed by value, so it must always be supplied; its fields can be set to 0 to mean "default". |
165 | * @return: size of dictionary stored into `dictBuffer` (<= `dictBufferCapacity`) |
166 | * or an error code, which can be tested with ZDICT_isError(). |
167 | * Tips: In general, a reasonable dictionary has a size of ~ 100 KB. |
168 | * It's possible to select a smaller or larger size, just by specifying `dictBufferCapacity`. |
169 | * In general, it's recommended to provide a few thousand samples, though this can vary a lot. |
170 | * It's recommended that the total size of all samples be about 100 times the target size of the dictionary. |
171 | * Note: ZDICT_trainFromBuffer_legacy() will send notifications into stderr if instructed to, using notificationLevel>0. |
172 | */ |
173 | ZDICTLIB_API size_t ZDICT_trainFromBuffer_legacy( |
174 | void *dictBuffer, size_t dictBufferCapacity, |
175 | const void *samplesBuffer, const size_t *samplesSizes, unsigned nbSamples, |
176 | ZDICT_legacy_params_t parameters); |
177 | |
178 | /* Deprecation warnings : ZDICT_DEPRECATED(message) is used in place of ZDICTLIB_API on declarations of deprecated functions; every variant below still expands to ZDICTLIB_API, plus a compiler-specific deprecation marker when one is available. */ |
179 | /* It is generally possible to disable deprecation warnings from the compiler, |
180 | for example with -Wno-deprecated-declarations for gcc |
181 | or _CRT_SECURE_NO_WARNINGS in Visual Studio. |
182 | Otherwise, it's also possible to manually define ZDICT_DISABLE_DEPRECATE_WARNINGS */ |
183 | #ifdef ZDICT_DISABLE_DEPRECATE_WARNINGS |
184 | # define ZDICT_DEPRECATED(message) ZDICTLIB_API /* disable deprecation warnings: `message` is discarded */ |
185 | #else |
186 | # define ZDICT_GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__) /* major*100 + minor, e.g. 405 for gcc 4.5 */ |
187 | # if defined (__cplusplus) && (__cplusplus >= 201402) /* C++14 or greater: standard attribute syntax */ |
188 | # define ZDICT_DEPRECATED(message) [[deprecated(message)]] ZDICTLIB_API |
189 | # elif (ZDICT_GCC_VERSION >= 405) || defined(__clang__) /* attribute form that carries the message */ |
190 | # define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated(message))) |
191 | # elif (ZDICT_GCC_VERSION >= 301) /* lower gcc versions: attribute without argument, so `message` is dropped */ |
192 | # define ZDICT_DEPRECATED(message) ZDICTLIB_API __attribute__((deprecated)) |
193 | # elif defined(_MSC_VER) |
194 | # define ZDICT_DEPRECATED(message) ZDICTLIB_API __declspec(deprecated(message)) |
195 | # else /* no known deprecation marker: emit a build-time message and fall back to plain ZDICTLIB_API */ |
196 | # pragma message("WARNING: You need to implement ZDICT_DEPRECATED for this compiler") |
197 | # define ZDICT_DEPRECATED(message) ZDICTLIB_API |
198 | # endif |
199 | #endif /* ZDICT_DISABLE_DEPRECATE_WARNINGS */ |
200 | |
201 | ZDICT_DEPRECATED("use ZDICT_finalizeDictionary() instead" ) |
202 | size_t ZDICT_addEntropyTablesFromBuffer(void* dictBuffer, size_t dictContentSize, size_t dictBufferCapacity, |
203 | const void* samplesBuffer, const size_t* samplesSizes, unsigned nbSamples); /**< Deprecated: new code should call ZDICT_finalizeDictionary() instead. Returns a size_t; presumably a dictionary size or an error code testable with ZDICT_isError(), like the other functions here (confirm in implementation). */ |
204 | |
205 | |
206 | #endif /* ZDICT_STATIC_LINKING_ONLY */ |
207 | |
208 | #if defined (__cplusplus) |
209 | } |
210 | #endif |
211 | |
212 | #endif /* DICTBUILDER_H_001 */ |
213 | |