1 | /* -*- mode: C; c-basic-offset: 4; indent-tabs-mode: nil -*- */ |
2 | // vim: expandtab:ts=8:sw=4:softtabstop=4: |
3 | /** |
4 | * \file lzma/lzma.h |
5 | * \brief LZMA1 and LZMA2 filters |
6 | */ |
7 | |
8 | /* |
9 | * Author: Lasse Collin |
10 | * |
11 | * This file has been put into the public domain. |
12 | * You can do whatever you want with this file. |
13 | * |
14 | * See ../lzma.h for information about liblzma as a whole. |
15 | */ |
16 | |
17 | #ifndef LZMA_H_INTERNAL |
18 | # error Never include this file directly. Use <lzma.h> instead. |
19 | #endif |
20 | |
21 | |
22 | /** |
23 | * \brief LZMA1 Filter ID |
24 | * |
25 | * LZMA1 is exactly the same thing that was called simply LZMA in LZMA Utils, |
26 | * 7-Zip, and the LZMA SDK. It's called LZMA1 here to prevent developers from |
27 | * accidentally using LZMA when they actually want LZMA2. |
28 | * |
29 | * LZMA1 shouldn't be used for new applications unless you _really_ know |
30 | * what you are doing. LZMA2 (LZMA_FILTER_LZMA2) is almost always a better choice. |
31 | */ |
32 | #define LZMA_FILTER_LZMA1 LZMA_VLI_C(0x4000000000000001) |
33 | |
34 | /** |
35 | * \brief LZMA2 Filter ID |
36 | * |
37 | * Usually you want this instead of LZMA1. Compared to LZMA1, LZMA2 adds |
38 | * support for LZMA_SYNC_FLUSH, uncompressed chunks (smaller expansion |
39 | * when trying to compress incompressible data), the possibility to change |
40 | * lc/lp/pb in the middle of encoding, and some other internal improvements. |
41 | */ |
42 | #define LZMA_FILTER_LZMA2 LZMA_VLI_C(0x21) |
43 | |
44 | |
45 | /** |
46 | * \brief Match finders |
47 | * |
48 | * The match finder has a major effect on both speed and compression ratio. |
49 | * Usually hash chains are faster than binary trees. Use lzma_mf_is_supported() to check whether a match finder is available in this build. |
50 | * |
51 | * The memory usage formulas are only rough estimates, which are closest to |
52 | * reality when dict_size is a power of two. The formulas are more complex |
53 | * in reality, and can also change a little between liblzma versions. Use |
54 | * lzma_memusage_encoder() to get a more accurate estimate of memory usage. |
55 | */ |
56 | typedef enum { |
57 | LZMA_MF_HC3 = 0x03, |
58 | /**< |
59 | * \brief Hash Chain with 2- and 3-byte hashing |
60 | * |
61 | * Minimum nice_len: 3 |
62 | * |
63 | * Memory usage: |
64 | * - dict_size <= 16 MiB: dict_size * 7.5 |
65 | * - dict_size > 16 MiB: dict_size * 5.5 + 64 MiB |
66 | */ |
67 | |
68 | LZMA_MF_HC4 = 0x04, |
69 | /**< |
70 | * \brief Hash Chain with 2-, 3-, and 4-byte hashing |
71 | * |
72 | * Minimum nice_len: 4 |
73 | * |
74 | * Memory usage: dict_size * 7.5 |
75 | */ |
76 | |
77 | LZMA_MF_BT2 = 0x12, |
78 | /**< |
79 | * \brief Binary Tree with 2-byte hashing |
80 | * |
81 | * Minimum nice_len: 2 |
82 | * |
83 | * Memory usage: dict_size * 9.5 |
84 | */ |
85 | |
86 | LZMA_MF_BT3 = 0x13, |
87 | /**< |
88 | * \brief Binary Tree with 2- and 3-byte hashing |
89 | * |
90 | * Minimum nice_len: 3 |
91 | * |
92 | * Memory usage: |
93 | * - dict_size <= 16 MiB: dict_size * 11.5 |
94 | * - dict_size > 16 MiB: dict_size * 9.5 + 64 MiB |
95 | */ |
96 | |
97 | LZMA_MF_BT4 = 0x14 |
98 | /**< |
99 | * \brief Binary Tree with 2-, 3-, and 4-byte hashing |
100 | * |
101 | * Minimum nice_len: 4 |
102 | * |
103 | * Memory usage: dict_size * 11.5 |
104 | */ |
105 | } lzma_match_finder; |
106 | |
107 | |
108 | /** |
109 | * \brief Test if given match finder is supported |
110 | * |
111 | * Return true if the given match finder is supported by this liblzma build. |
112 | * Otherwise false is returned. It is safe to call this with a value that |
113 | * isn't listed in the lzma_match_finder enumeration; the return value will be |
114 | * false. |
115 | * |
116 | * There is no way to list which match finders are available in this |
117 | * particular liblzma version and build. It would be useless, because |
118 | * a new match finder, which the application developer wasn't aware of, |
119 | * could require giving additional options to the encoder that the older |
120 | * match finders don't need. |
121 | */ |
122 | extern LZMA_API(lzma_bool) lzma_mf_is_supported(lzma_match_finder match_finder) |
123 | lzma_nothrow lzma_attr_const; |
124 | |
125 | |
126 | /** |
127 | * \brief Compression modes |
128 | * |
129 | * This selects the function used to analyze the data produced by the match |
130 | * finder. Use lzma_mode_is_supported() to check whether a mode is available in this build. |
131 | */ |
132 | typedef enum { |
133 | LZMA_MODE_FAST = 1, |
134 | /**< |
135 | * \brief Fast compression |
136 | * |
137 | * Fast mode is usually at its best when combined with |
138 | * a hash chain match finder. |
139 | */ |
140 | |
141 | LZMA_MODE_NORMAL = 2 |
142 | /**< |
143 | * \brief Normal compression |
144 | * |
145 | * This is usually notably slower than fast mode. Use this |
146 | * together with binary tree match finders to expose the |
147 | * full potential of the LZMA1 or LZMA2 encoder. |
148 | */ |
149 | } lzma_mode; |
150 | |
151 | |
152 | /** |
153 | * \brief Test if given compression mode is supported |
154 | * |
155 | * Return true if the given compression mode is supported by this liblzma |
156 | * build. Otherwise false is returned. It is safe to call this with a value |
157 | * that isn't listed in the lzma_mode enumeration; the return value will be false. |
158 | * |
159 | * There is no way to list which modes are available in this particular |
160 | * liblzma version and build. It would be useless, because a new compression |
161 | * mode, which the application developer wasn't aware of, could require giving |
162 | * additional options to the encoder that the older modes don't need. |
163 | */ |
164 | extern LZMA_API(lzma_bool) lzma_mode_is_supported(lzma_mode mode) |
165 | lzma_nothrow lzma_attr_const; |
166 | |
167 | |
168 | /** |
169 | * \brief Options specific to the LZMA1 and LZMA2 filters |
170 | * |
171 | * Since LZMA1 and LZMA2 share most of the code, it's simplest to share |
172 | * the options structure too. For encoding, all but the reserved variables |
173 | * need to be initialized unless specifically mentioned otherwise. |
174 | * |
175 | * For raw decoding, both LZMA1 and LZMA2 need dict_size, preset_dict, and |
176 | * preset_dict_size (if preset_dict != NULL). LZMA1 also needs lc, lp, and pb. |
177 | */ |
178 | typedef struct { |
179 | /** |
180 | * \brief Dictionary size in bytes |
181 | * |
182 | * Dictionary size indicates how many bytes of the recently processed |
183 | * uncompressed data are kept in memory. One method to reduce the size of |
184 | * the uncompressed data is to store distance-length pairs, which |
185 | * indicate what data to repeat from the dictionary buffer. Thus, |
186 | * the bigger the dictionary, the better the compression ratio |
187 | * usually is. |
188 | * |
189 | * Maximum size of the dictionary depends on multiple things: |
190 | * - Memory usage limit |
191 | * - Available address space (not a problem on 64-bit systems) |
192 | * - Selected match finder (encoder only) |
193 | * |
194 | * Currently the maximum dictionary size for encoding is 1.5 GiB |
195 | * (i.e. (UINT32_C(1) << 30) + (UINT32_C(1) << 29)) even on 64-bit |
196 | * systems for certain match finder implementation reasons. In the |
197 | * future, there may be match finders that support bigger |
198 | * dictionaries. |
199 | * |
200 | * Decoder already supports dictionaries up to 4 GiB - 1 B (i.e. |
201 | * UINT32_MAX), so increasing the maximum dictionary size of the |
202 | * encoder won't cause problems for old decoders. |
203 | * |
204 | * Because extremely small dictionary sizes would have unneeded |
205 | * overhead in the decoder, the minimum dictionary size is 4096 bytes (LZMA_DICT_SIZE_MIN). |
206 | * |
207 | * \note When decoding, a too big dictionary does no other harm |
208 | * than wasting memory. |
209 | */ |
210 | uint32_t dict_size; |
211 | # define LZMA_DICT_SIZE_MIN UINT32_C(4096) |
212 | # define LZMA_DICT_SIZE_DEFAULT (UINT32_C(1) << 23) |
213 | |
214 | /** |
215 | * \brief Pointer to an initial dictionary |
216 | * |
217 | * It is possible to initialize the LZ77 history window using |
218 | * a preset dictionary. It is useful when compressing many |
219 | * similar, relatively small chunks of data independently from |
220 | * each other. The preset dictionary should contain typical |
221 | * strings that occur in the files being compressed. The most |
222 | * probable strings should be near the end of the preset dictionary. |
223 | * |
224 | * This feature should be used only in special situations. For |
225 | * now, it works correctly only with raw encoding and decoding. |
226 | * Currently none of the container formats supported by |
227 | * liblzma allow a preset dictionary when decoding, thus if |
228 | * you create a .xz or .lzma file with a preset dictionary, it |
229 | * cannot be decoded with the regular decoder functions. In the |
230 | * future, the .xz format will likely get support for preset |
231 | * dictionaries though. |
232 | */ |
233 | const uint8_t *preset_dict; |
234 | |
235 | /** |
236 | * \brief Size of the preset dictionary |
237 | * |
238 | * Specifies the size of the preset dictionary. If the size is |
239 | * bigger than dict_size, only the last dict_size bytes are |
240 | * processed. |
241 | * |
242 | * This variable is read only when preset_dict is not NULL. |
243 | * If preset_dict is not NULL but preset_dict_size is zero, |
244 | * no preset dictionary is used (identical to only setting |
245 | * preset_dict to NULL). |
246 | */ |
247 | uint32_t preset_dict_size; |
248 | |
249 | /** |
250 | * \brief Number of literal context bits |
251 | * |
252 | * How many of the highest bits of the previous uncompressed |
253 | * eight-bit byte (also known as `literal') are taken into |
254 | * account when predicting the bits of the next literal. |
255 | * |
256 | * \todo Example |
257 | * |
258 | * There is a limit that applies to literal context bits and literal |
259 | * position bits together: lc + lp <= 4. Without this limit the |
260 | * decoding could become very slow, which could have security related |
261 | * results in some cases like email servers doing virus scanning. |
262 | * This limit also simplifies the internal implementation in liblzma. |
263 | * |
264 | * There may be LZMA1 streams that have lc + lp > 4 (maximum possible |
265 | * lc would be 8). It is not possible to decode such streams with |
266 | * liblzma. |
267 | */ |
268 | uint32_t lc; |
269 | # define LZMA_LCLP_MIN 0 |
270 | # define LZMA_LCLP_MAX 4 |
271 | # define LZMA_LC_DEFAULT 3 |
272 | |
273 | /** |
274 | * \brief Number of literal position bits |
275 | * |
276 | * How many of the lowest bits of the current position (number |
277 | * of bytes from the beginning of the uncompressed data) in the |
278 | * uncompressed data are taken into account when predicting the |
279 | * bits of the next literal (a single eight-bit byte). |
280 | * |
281 | * \todo Example |
282 | */ |
283 | uint32_t lp; |
284 | # define LZMA_LP_DEFAULT 0 |
285 | |
286 | /** |
287 | * \brief Number of position bits |
288 | * |
289 | * How many of the lowest bits of the current position in the |
290 | * uncompressed data are taken into account when estimating |
291 | * probabilities of matches. A match is a sequence of bytes for |
292 | * which a matching sequence is found from the dictionary and |
293 | * thus can be stored as a distance-length pair. |
294 | * |
295 | * Example: If most of the matches occur at byte positions of |
296 | * 8 * n + 3, that is, 3, 11, 19, ... set pb to 3, because 2**3 == 8. |
297 | */ |
298 | uint32_t pb; |
299 | # define LZMA_PB_MIN 0 |
300 | # define LZMA_PB_MAX 4 |
301 | # define LZMA_PB_DEFAULT 2 |
302 | |
303 | /** |
304 | * \brief Indicate if the options structure is persistent |
305 | * |
306 | * If this is true, the application must keep this options structure |
307 | * available after the LZMA2 encoder has been initialized. With |
308 | * a persistent structure it is possible to change some encoder options |
309 | * in the middle of the encoding process without resetting the encoder. |
310 | * |
311 | * This option is used only by LZMA2. LZMA1 ignores this and it is |
312 | * safe to not initialize this when encoding with LZMA1. |
313 | */ |
314 | lzma_bool persistent; |
315 | |
316 | /** Compression mode (one of the lzma_mode values) */ |
317 | lzma_mode mode; |
318 | |
319 | /** |
320 | * \brief Nice length of a match |
321 | * |
322 | * This determines how many bytes the encoder compares from the match |
323 | * candidates when looking for the best match. Once a match of at |
324 | * least nice_len bytes long is found, the encoder stops looking for |
325 | * better candidates and encodes the match. (Naturally, if the found |
326 | * match is actually longer than nice_len, the actual length is |
327 | * encoded; it's not truncated to nice_len.) |
328 | * |
329 | * Bigger values usually increase the compression ratio and |
330 | * compression time. For most files, 32 to 128 is a good value, |
331 | * which gives very good compression ratio at good speed. |
332 | * |
333 | * The exact minimum value depends on the match finder. The maximum |
334 | * is 273, which is the maximum length of a match that LZMA1 and |
335 | * LZMA2 can encode. |
336 | */ |
337 | uint32_t nice_len; |
338 | |
339 | /** Match finder ID (one of the lzma_match_finder values) */ |
340 | lzma_match_finder mf; |
341 | |
342 | /** |
343 | * \brief Maximum search depth in the match finder |
344 | * |
345 | * For every input byte, the match finder searches through the hash chain |
346 | * or binary tree in a loop, each iteration going one step deeper in |
347 | * the chain or tree. The searching stops if |
348 | * - a match of at least nice_len bytes long is found; |
349 | * - all match candidates from the hash chain or binary tree have |
350 | * been checked; or |
351 | * - maximum search depth is reached. |
352 | * |
353 | * Maximum search depth is needed to prevent the match finder from |
354 | * wasting too much time in case there are lots of short match |
355 | * candidates. On the other hand, stopping the search before all |
356 | * candidates have been checked can reduce compression ratio. |
357 | * |
358 | * Setting depth to zero tells liblzma to use an automatic default |
359 | * value, that depends on the selected match finder and nice_len. |
360 | * The default is in the range [10, 200] or so (it may vary between |
361 | * liblzma versions). |
362 | * |
363 | * Using a bigger depth value than the default can increase |
364 | * compression ratio in some cases. There is no strict maximum value, |
365 | * but high values (thousands or millions) should be used with care: |
366 | * the encoder could remain fast enough with typical input, but |
367 | * malicious input could cause the match finder to slow down |
368 | * dramatically, possibly creating a denial of service attack. |
369 | */ |
370 | uint32_t depth; |
371 | |
372 | /* |
373 | * Reserved space to allow possible future extensions without |
374 | * breaking the ABI. You should not touch these, because the names |
375 | * of these variables may change. These are not and will never be used |
376 | * with the currently supported options, so it is safe to leave these |
377 | * uninitialized. |
378 | */ |
379 | void *reserved_ptr1; |
380 | void *reserved_ptr2; |
381 | uint32_t reserved_int1; |
382 | uint32_t reserved_int2; |
383 | uint32_t reserved_int3; |
384 | uint32_t reserved_int4; |
385 | uint32_t reserved_int5; |
386 | uint32_t reserved_int6; |
387 | uint32_t reserved_int7; |
388 | uint32_t reserved_int8; |
389 | lzma_reserved_enum reserved_enum1; |
390 | lzma_reserved_enum reserved_enum2; |
391 | lzma_reserved_enum reserved_enum3; |
392 | lzma_reserved_enum reserved_enum4; |
393 | |
394 | } lzma_options_lzma; |
395 | |
396 | |
397 | /** |
398 | * \brief Set a compression preset to lzma_options_lzma structure |
399 | * |
400 | * 0 is the fastest and 9 is the slowest. These match the switches -0 .. -9 |
401 | * of the xz command line tool. In addition, it is possible to bitwise-or |
402 | * flags to the preset. Currently only LZMA_PRESET_EXTREME is supported. |
403 | * The flags are defined in container.h, because the flags are also used |
404 | * with lzma_easy_encoder(). |
405 | * |
406 | * The preset values are subject to changes between liblzma versions. |
407 | * |
408 | * This function is available only if LZMA1 or LZMA2 encoder has been enabled |
409 | * when building liblzma. NOTE(review): the meaning of the returned lzma_bool is not documented here; confirm it against the implementation. |
410 | */ |
411 | extern LZMA_API(lzma_bool) lzma_lzma_preset( |
412 | lzma_options_lzma *options, uint32_t preset) lzma_nothrow; |
413 | |