1 | /* |
2 | * Copyright 2013-present Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #pragma once |
18 | |
19 | #include <cstdint> |
20 | #include <limits> |
21 | #include <memory> |
22 | #include <string> |
23 | #include <vector> |
24 | |
25 | #include <folly/Optional.h> |
26 | #include <folly/Range.h> |
27 | #include <folly/compression/Counters.h> |
28 | #include <folly/io/IOBuf.h> |
29 | |
30 | /** |
31 | * Compression / decompression over IOBufs |
32 | */ |
33 | |
34 | namespace folly { |
35 | namespace io { |
36 | |
/**
 * Identifies a compression format. Every enumerator carries an explicit
 * numeric value, so its value does not depend on declaration order.
 */
enum class CodecType {
  /**
   * Not a codec known to getCodec(), which throws when handed this type.
   * Intended for classes that derive from Codec directly instead of being
   * obtained through getCodec().
   */
  USER_DEFINED = 0,

  /**
   * Pass-through: data is not compressed.
   * Levels supported: 0
   */
  NO_COMPRESSION = 1,

  /**
   * LZ4 compression.
   * Levels supported: 1 = fast, 2 = best; default = 1
   */
  LZ4 = 2,

  /**
   * Snappy compression.
   * Levels supported: 1
   */
  SNAPPY = 3,

  /**
   * zlib compression.
   * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
   * Streaming compression is supported.
   */
  ZLIB = 4,

  /**
   * LZ4 compression with the size stored in front of the data as a Varint.
   */
  LZ4_VARINT_SIZE = 5,

  /**
   * LZMA2 compression.
   * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
   * Streaming compression is supported.
   */
  LZMA2 = 6,

  /**
   * NOTE(review): undocumented in the original; presumably LZMA2 with the
   * size stored in front as a Varint, by analogy with LZ4_VARINT_SIZE --
   * confirm against the implementation.
   */
  LZMA2_VARINT_SIZE = 7,

  /**
   * ZSTD compression.
   * Levels supported: 1 = fast, ..., 19 = best; default = 3
   * For the fastest zstd compression (negative levels) use ZSTD_FAST.
   * Streaming compression is supported.
   */
  ZSTD = 8,

  /**
   * gzip compression. The algorithm is the same as ZLIB's, but
   * gzip-compressed files tend to be easier to handle from the command line.
   * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
   * Streaming compression is supported.
   */
  GZIP = 9,

  /**
   * LZ4 frame compression.
   * Levels supported: 0 = fast, 16 = best; default = 0
   */
  LZ4_FRAME = 10,

  /**
   * bzip2 compression.
   * Levels supported: 1 = fast, 9 = best; default = 9
   * Streaming compression is supported, BUT because of a bug in the bzip2
   * library FlushOp::FLUSH does NOT guarantee that the decompressor can read
   * all the data up to that point.
   */
  BZIP2 = 11,

  /**
   * ZSTD compression using negative compression levels: level N of this
   * codec maps to zstd level -N (1=-1, 2=-2, ...).
   * A higher level means faster compression.
   * Level 1 is around the same speed as Snappy with better compression.
   * Level 5 is around the same speed as LZ4 with slightly worse compression.
   * Each level gains about 6-15% speed and loses 3-7% compression.
   * Decompression speed improves for each level, and level 1 decompression
   * speed is around 25% faster than ZSTD.
   * This codec is fully compatible with ZSTD.
   * Levels supported: 1 = best, ..., 5 = fast; default = 1
   * Streaming compression is supported.
   */
  ZSTD_FAST = 12,

  /**
   * Number of enumerators above; not a codec itself.
   */
  NUM_CODEC_TYPES = 13,
};
130 | |
131 | class Codec { |
132 | public: |
133 | virtual ~Codec() {} |
134 | |
135 | static constexpr uint64_t UNLIMITED_UNCOMPRESSED_LENGTH = uint64_t(-1); |
136 | /** |
137 | * Return the maximum length of data that may be compressed with this codec. |
138 | * NO_COMPRESSION and ZLIB support arbitrary lengths; |
139 | * LZ4 supports up to 1.9GiB; SNAPPY supports up to 4GiB. |
140 | * May return UNLIMITED_UNCOMPRESSED_LENGTH if unlimited. |
141 | */ |
142 | uint64_t maxUncompressedLength() const; |
143 | |
144 | /** |
145 | * Return the codec's type. |
146 | */ |
147 | CodecType type() const { |
148 | return type_; |
149 | } |
150 | |
151 | /** |
152 | * Does this codec need the exact uncompressed length on decompression? |
153 | */ |
154 | bool needsUncompressedLength() const; |
155 | |
156 | /** |
157 | * Compress data, returning an IOBuf (which may share storage with data). |
158 | * Throws std::invalid_argument if data is larger than |
159 | * maxUncompressedLength(). |
160 | */ |
161 | std::unique_ptr<IOBuf> compress(const folly::IOBuf* data); |
162 | |
163 | /** |
164 | * Compresses data. May involve additional copies compared to the overload |
165 | * that takes and returns IOBufs. Has the same error semantics as the IOBuf |
166 | * version. |
167 | */ |
168 | std::string compress(StringPiece data); |
169 | |
170 | /** |
171 | * Uncompress data. Throws std::runtime_error on decompression error. |
172 | * |
173 | * Some codecs (LZ4) require the exact uncompressed length; this is indicated |
174 | * by needsUncompressedLength(). |
175 | * |
176 | * For other codes (zlib), knowing the exact uncompressed length ahead of |
177 | * time might be faster. |
178 | * |
179 | * Regardless of the behavior of the underlying compressor, uncompressing |
180 | * an empty IOBuf chain will return an empty IOBuf chain. |
181 | */ |
182 | std::unique_ptr<IOBuf> uncompress( |
183 | const IOBuf* data, |
184 | folly::Optional<uint64_t> uncompressedLength = folly::none); |
185 | |
186 | /** |
187 | * Uncompresses data. May involve additional copies compared to the overload |
188 | * that takes and returns IOBufs. Has the same error semantics as the IOBuf |
189 | * version. |
190 | */ |
191 | std::string uncompress( |
192 | StringPiece data, |
193 | folly::Optional<uint64_t> uncompressedLength = folly::none); |
194 | |
195 | /** |
196 | * Returns a bound on the maximum compressed length when compressing data with |
197 | * the given uncompressed length. |
198 | */ |
199 | uint64_t maxCompressedLength(uint64_t uncompressedLength) const; |
200 | |
201 | /** |
202 | * Extracts the uncompressed length from the compressed data if possible. |
203 | * If the codec doesn't store the uncompressed length, or the data is |
204 | * corrupted it returns the given uncompressedLength. |
205 | * If the uncompressed length is stored in the compressed data and |
206 | * uncompressedLength is not none and they do not match a std::runtime_error |
207 | * is thrown. |
208 | */ |
209 | folly::Optional<uint64_t> getUncompressedLength( |
210 | const folly::IOBuf* data, |
211 | folly::Optional<uint64_t> uncompressedLength = folly::none) const; |
212 | |
213 | protected: |
214 | Codec( |
215 | CodecType type, |
216 | folly::Optional<int> level = folly::none, |
217 | folly::StringPiece name = {}, |
218 | bool counters = true); |
219 | |
220 | public: |
221 | /** |
222 | * Returns a superset of the set of prefixes for which canUncompress() will |
223 | * return true. A superset is allowed for optimizations in canUncompress() |
224 | * based on other knowledge such as length. None of the prefixes may be empty. |
225 | * default: No prefixes. |
226 | */ |
227 | virtual std::vector<std::string> validPrefixes() const; |
228 | |
229 | /** |
230 | * Returns true if the codec thinks it can uncompress the data. |
231 | * If a codec doesn't have magic bytes at the beginning, like LZ4 and Snappy, |
232 | * it can always return false. |
233 | * default: Returns false. |
234 | */ |
235 | virtual bool canUncompress( |
236 | const folly::IOBuf* data, |
237 | folly::Optional<uint64_t> uncompressedLength = folly::none) const; |
238 | |
239 | private: |
240 | // default: no limits (save for special value UNKNOWN_UNCOMPRESSED_LENGTH) |
241 | virtual uint64_t doMaxUncompressedLength() const; |
242 | // default: doesn't need uncompressed length |
243 | virtual bool doNeedsUncompressedLength() const; |
244 | virtual std::unique_ptr<IOBuf> doCompress(const folly::IOBuf* data) = 0; |
245 | virtual std::unique_ptr<IOBuf> doUncompress( |
246 | const folly::IOBuf* data, |
247 | folly::Optional<uint64_t> uncompressedLength) = 0; |
248 | // default: an implementation is provided by default to wrap the strings into |
249 | // IOBufs and delegate to the IOBuf methods. This incurs a copy of the output |
250 | // from IOBuf to string. Implementers, at their discretion, can override |
251 | // these methods to avoid the copy. |
252 | virtual std::string doCompressString(StringPiece data); |
253 | virtual std::string doUncompressString( |
254 | StringPiece data, |
255 | folly::Optional<uint64_t> uncompressedLength); |
256 | |
257 | virtual uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const = 0; |
258 | // default: returns the passed uncompressedLength. |
259 | virtual folly::Optional<uint64_t> doGetUncompressedLength( |
260 | const folly::IOBuf* data, |
261 | folly::Optional<uint64_t> uncompressedLength) const; |
262 | |
263 | CodecType type_; |
264 | folly::detail::CompressionCounter bytesBeforeCompression_; |
265 | folly::detail::CompressionCounter bytesAfterCompression_; |
266 | folly::detail::CompressionCounter bytesBeforeDecompression_; |
267 | folly::detail::CompressionCounter bytesAfterDecompression_; |
268 | folly::detail::CompressionCounter compressions_; |
269 | folly::detail::CompressionCounter decompressions_; |
270 | folly::detail::CompressionCounter compressionMilliseconds_; |
271 | folly::detail::CompressionCounter decompressionMilliseconds_; |
272 | }; |
273 | |
274 | class StreamCodec : public Codec { |
275 | public: |
276 | ~StreamCodec() override {} |
277 | |
278 | /** |
279 | * Does the codec need the data length before compression streaming? |
280 | */ |
281 | bool needsDataLength() const; |
282 | |
283 | /***************************************************************************** |
284 | * Streaming API |
285 | ***************************************************************************** |
286 | * A low-level stateful streaming API. |
287 | * Streaming operations can be started in two ways: |
288 | * 1. From a clean Codec on which no non-const methods have been called. |
289 | * 2. A call to resetStream(), which will reset any codec to a clean state. |
290 | * After a streaming operation has begun, either compressStream() or |
291 | * uncompressStream() must be called until the streaming operation ends. |
292 | * compressStream() ends when it returns true with flushOp END. |
293 | * uncompressStream() ends when it returns true. At this point the codec |
294 | * may be reused by calling resetStream(). |
295 | * |
296 | * compress() and uncompress() can be called at any time, but they interrupt |
297 | * any ongoing streaming operations (state is lost and resetStream() must be |
298 | * called before another streaming operation). |
299 | */ |
300 | |
301 | /** |
302 | * Reset the state of the codec, and set the uncompressed length for the next |
303 | * streaming operation. If uncompressedLength is not none it must be exactly |
304 | * the uncompressed length. compressStream() must be passed exactly |
305 | * uncompressedLength input bytes before the stream is ended. |
306 | * uncompressStream() must be passed a compressed frame that uncompresses to |
307 | * uncompressedLength. |
308 | */ |
309 | void resetStream(folly::Optional<uint64_t> uncompressedLength = folly::none); |
310 | |
311 | enum class FlushOp { NONE, FLUSH, END }; |
312 | |
313 | /** |
314 | * Compresses some data from the input buffer and writes the compressed data |
315 | * into the output buffer. It may read input without producing any output, |
316 | * except when forced to flush. |
317 | * |
318 | * The input buffer is advanced to point to the range of data that hasn't yet |
319 | * been read. Compression will resume at this point for the next call to |
320 | * compressStream(). The output buffer is advanced one byte past the last byte |
321 | * written. |
322 | * |
323 | * The default flushOp is NONE, which allows compressStream() complete |
324 | * discretion in how much data to gather before writing any output. |
325 | * |
326 | * If flushOp is END, all pending and input data is flushed to the output |
327 | * buffer, and the frame is ended. compressStream() must be called with the |
328 | * same input and flushOp END until it returns true. At this point the caller |
329 | * must call resetStream() to use the codec again. |
330 | * |
331 | * If flushOp is FLUSH, all pending and input data is flushed to the output |
332 | * buffer, but the frame is not ended. compressStream() must be called with |
333 | * the same input and flushOp END until it returns true. At this point the |
334 | * caller can continue to compressStream() with any input data and flushOp. |
335 | * The uncompressor, if passed all the produced output data, will be able to |
336 | * uncompress all the input data passed to compressStream() so far. Excessive |
337 | * use of flushOp FLUSH will deteriorate compression ratio. This is useful for |
338 | * stateful streaming across a network. Most users don't need to use this |
339 | * flushOp. |
340 | * |
341 | * A std::logic_error is thrown on incorrect usage of the API. |
342 | * A std::runtime_error is thrown upon error conditions or if no forward |
343 | * progress could be made twice in a row. |
344 | */ |
345 | bool compressStream( |
346 | folly::ByteRange& input, |
347 | folly::MutableByteRange& output, |
348 | FlushOp flushOp = StreamCodec::FlushOp::NONE); |
349 | |
350 | /** |
351 | * Uncompresses some data from the input buffer and writes the uncompressed |
352 | * data into the output buffer. It may read input without producing any |
353 | * output. |
354 | * |
355 | * The input buffer is advanced to point to the range of data that hasn't yet |
356 | * been read. Uncompression will resume at this point for the next call to |
357 | * uncompressStream(). The output buffer is advanced one byte past the last |
358 | * byte written. |
359 | * |
360 | * The default flushOp is NONE, which allows uncompressStream() complete |
361 | * discretion in how much output data to flush. The uncompressor may not make |
362 | * maximum forward progress, but will make some forward progress when |
363 | * possible. |
364 | * |
365 | * If flushOp is END, the caller guarantees that no more input will be |
366 | * presented to uncompressStream(). uncompressStream() must be called with the |
367 | * same input and flushOp END until it returns true. This is not mandatory, |
368 | * but if the input is all available in one buffer, and there is enough output |
369 | * space to write the entire frame, codecs can uncompress faster. |
370 | * |
371 | * If flushOp is FLUSH, uncompressStream() is guaranteed to make the maximum |
372 | * amount of forward progress possible. When using this flushOp and |
373 | * uncompressStream() returns with `!output.empty()` the caller knows that all |
374 | * pending output has been flushed. This is useful for stateful streaming |
375 | * across a network, and it should be used in conjunction with |
376 | * compressStream() with flushOp FLUSH. Most users don't need to use this |
377 | * flushOp. |
378 | * |
379 | * A std::runtime_error is thrown upon error conditions or if no forward |
380 | * progress could be made upon two consecutive calls to the function (only the |
381 | * second call will throw an exception). |
382 | * |
383 | * Returns true at the end of a frame. At this point resetStream() must be |
384 | * called to reuse the codec. |
385 | */ |
386 | bool uncompressStream( |
387 | folly::ByteRange& input, |
388 | folly::MutableByteRange& output, |
389 | FlushOp flushOp = StreamCodec::FlushOp::NONE); |
390 | |
391 | protected: |
392 | StreamCodec( |
393 | CodecType type, |
394 | folly::Optional<int> level = folly::none, |
395 | folly::StringPiece name = {}, |
396 | bool counters = true) |
397 | : Codec(type, std::move(level), name, counters) {} |
398 | |
399 | // Returns the uncompressed length last passed to resetStream() or none if it |
400 | // hasn't been called yet. |
401 | folly::Optional<uint64_t> uncompressedLength() const { |
402 | return uncompressedLength_; |
403 | } |
404 | |
405 | private: |
406 | // default: Implemented using the streaming API. |
407 | std::unique_ptr<IOBuf> doCompress(const folly::IOBuf* data) override; |
408 | std::unique_ptr<IOBuf> doUncompress( |
409 | const folly::IOBuf* data, |
410 | folly::Optional<uint64_t> uncompressedLength) override; |
411 | |
412 | // default: Returns false |
413 | virtual bool doNeedsDataLength() const; |
414 | virtual void doResetStream() = 0; |
415 | virtual bool doCompressStream( |
416 | folly::ByteRange& input, |
417 | folly::MutableByteRange& output, |
418 | FlushOp flushOp) = 0; |
419 | virtual bool doUncompressStream( |
420 | folly::ByteRange& input, |
421 | folly::MutableByteRange& output, |
422 | FlushOp flushOp) = 0; |
423 | |
424 | enum class State { |
425 | RESET, |
426 | COMPRESS, |
427 | COMPRESS_FLUSH, |
428 | COMPRESS_END, |
429 | UNCOMPRESS, |
430 | END, |
431 | }; |
432 | void assertStateIs(State expected) const; |
433 | |
434 | State state_{State::RESET}; |
435 | ByteRange previousInput_{}; |
436 | folly::Optional<uint64_t> uncompressedLength_{}; |
437 | bool progressMade_{true}; |
438 | }; |
439 | |
// Symbolic compression levels. They are negative so they cannot collide with
// the non-negative, codec-dependent levels accepted by getCodec() and
// getStreamCodec().
constexpr int COMPRESSION_LEVEL_FASTEST{-1};
constexpr int COMPRESSION_LEVEL_DEFAULT{-2};
constexpr int COMPRESSION_LEVEL_BEST{-3};
443 | |
/**
 * Return a codec for the given type. Throws on error. The level
 * is a non-negative codec-dependent integer indicating the level of
 * compression desired, or one of the following constants:
 *
 * COMPRESSION_LEVEL_FASTEST is fastest (uses least CPU / memory,
 *   worst compression)
 * COMPRESSION_LEVEL_DEFAULT is the default (likely a tradeoff between
 *   FASTEST and BEST)
 * COMPRESSION_LEVEL_BEST is the best compression (uses most CPU / memory,
 *   best compression)
 *
 * If level is omitted, COMPRESSION_LEVEL_DEFAULT is used.
 *
 * When decompressing, the compression level is ignored. All codecs will
 * decompress all data compressed with a codec of the same type, regardless
 * of compression level.
 */
std::unique_ptr<Codec> getCodec(
    CodecType type,
    int level = COMPRESSION_LEVEL_DEFAULT);
463 | |
/**
 * Return a stream codec (StreamCodec) for the given type. Throws on error.
 * hasStreamCodec() can be used to check beforehand whether a type is supported
 * and supports streaming. The level is a non-negative codec-dependent integer
 * indicating the level of compression desired, or one of the following
 * constants:
 *
 * COMPRESSION_LEVEL_FASTEST is fastest (uses least CPU / memory,
 *   worst compression)
 * COMPRESSION_LEVEL_DEFAULT is the default (likely a tradeoff between
 *   FASTEST and BEST)
 * COMPRESSION_LEVEL_BEST is the best compression (uses most CPU / memory,
 *   best compression)
 *
 * If level is omitted, COMPRESSION_LEVEL_DEFAULT is used.
 *
 * When decompressing, the compression level is ignored. All codecs will
 * decompress all data compressed with a codec of the same type, regardless
 * of compression level.
 */
std::unique_ptr<StreamCodec> getStreamCodec(
    CodecType type,
    int level = COMPRESSION_LEVEL_DEFAULT);
483 | |
/**
 * Returns a codec that can uncompress any of the given codec types as well as
 * {LZ4_FRAME, ZSTD, ZLIB, GZIP, LZMA2, BZIP2}. Appends each default codec to
 * customCodecs in order, so long as a codec with the same type() isn't already
 * present in customCodecs or as the terminalCodec. When uncompress() is called,
 * each codec's canUncompress() is called in the order that they are given.
 * Appended default codecs are checked last. uncompress() is called on the
 * first codec whose canUncompress() returns true.
 *
 * Both parameters default to empty: with no arguments only the default codecs
 * listed above are used and there is no terminal codec.
 *
 * In addition, an optional `terminalCodec` can be provided. This codec's
 * uncompress() will be called either when no other codec canUncompress() the
 * data or the chosen codec throws an exception on the data. The terminalCodec
 * is intended for ambiguous headers, when canUncompress() is false for some
 * data it can actually uncompress. The terminalCodec does not need to override
 * validPrefixes() or canUncompress() and overriding these functions will have
 * no effect on the returned codec's validPrefixes() or canUncompress()
 * functions. The terminalCodec's needsUncompressedLength() and
 * maxUncompressedLength() will affect the returned codec's respective
 * functions. The terminalCodec must not be duplicated in customCodecs.
 *
 * An exception is thrown if no codec canUncompress() the data and either no
 * terminal codec was provided or a terminal codec was provided and it throws on
 * the data.
 * An exception is thrown if the chosen codec's uncompress() throws on the data
 * and either no terminal codec was provided or a terminal codec was provided
 * and it also throws on the data.
 * An exception is thrown if compress() is called on the returned codec.
 *
 * Requirements are checked in debug mode and are as follows:
 * Let headers be the concatenation of every codec's validPrefixes().
 *  1. Each codec must override validPrefixes() and canUncompress().
 *  2. No codec's validPrefixes() may be empty.
 *  3. No header in headers may be empty.
 *  4. headers must not contain any duplicate elements.
 *  5. No strict non-empty prefix of any header in headers may be in headers.
 *  6. The terminalCodec's type must not be the same as any other codec's type
 *     (with USER_DEFINED being the exception).
 */
std::unique_ptr<Codec> getAutoUncompressionCodec(
    std::vector<std::unique_ptr<Codec>> customCodecs = {},
    std::unique_ptr<Codec> terminalCodec = {});
525 | |
/**
 * Check if a specified codec is supported.
 *
 * Returns true if the codec type is supported, false otherwise.
 * NOTE(review): "supported" presumably means getCodec(type) would succeed in
 * this build -- confirm against the implementation.
 */
bool hasCodec(CodecType type);
530 | |
/**
 * Check if a specified codec is supported and supports streaming.
 *
 * Returns true only if both conditions hold, false otherwise.
 * NOTE(review): presumably this mirrors whether getStreamCodec(type) would
 * succeed -- confirm against the implementation.
 */
bool hasStreamCodec(CodecType type);
535 | } // namespace io |
536 | } // namespace folly |
537 | |