1/*
2 * Copyright 2013-present Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16
17#pragma once
18
19#include <cstdint>
20#include <limits>
21#include <memory>
22#include <string>
23#include <vector>
24
25#include <folly/Optional.h>
26#include <folly/Range.h>
27#include <folly/compression/Counters.h>
28#include <folly/io/IOBuf.h>
29
30/**
31 * Compression / decompression over IOBufs
32 */
33
34namespace folly {
35namespace io {
36
/**
 * Identifies a compression codec. The numeric values are spelled out
 * explicitly for every enumerator.
 */
enum class CodecType {
  /**
   * This codec type is not defined; getCodec() will throw an exception
   * if used. Useful if deriving your own classes from Codec without
   * going through the getCodec() interface.
   */
  USER_DEFINED = 0,

  /**
   * Use no compression.
   * Levels supported: 0
   */
  NO_COMPRESSION = 1,

  /**
   * Use LZ4 compression.
   * Levels supported: 1 = fast, 2 = best; default = 1
   */
  LZ4 = 2,

  /**
   * Use Snappy compression.
   * Levels supported: 1
   */
  SNAPPY = 3,

  /**
   * Use zlib compression.
   * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
   * Streaming compression is supported.
   */
  ZLIB = 4,

  /**
   * Use LZ4 compression, prefixed with size (as Varint).
   */
  LZ4_VARINT_SIZE = 5,

  /**
   * Use LZMA2 compression.
   * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
   * Streaming compression is supported.
   */
  LZMA2 = 6,

  /**
   * NOTE(review): this enumerator had no documentation. By analogy with
   * LZ4_VARINT_SIZE it is presumably LZMA2 compression prefixed with the
   * size (as Varint) -- confirm against the implementation.
   */
  LZMA2_VARINT_SIZE = 7,

  /**
   * Use ZSTD compression.
   * Levels supported: 1 = fast, ..., 19 = best; default = 3
   * Use ZSTD_FAST for the fastest zstd compression (negative levels).
   * Streaming compression is supported.
   */
  ZSTD = 8,

  /**
   * Use gzip compression. This is the same compression algorithm as ZLIB but
   * gzip-compressed files tend to be easier to work with from the command line.
   * Levels supported: 0 = no compression, 1 = fast, ..., 9 = best; default = 6
   * Streaming compression is supported.
   */
  GZIP = 9,

  /**
   * Use LZ4 frame compression.
   * Levels supported: 0 = fast, 16 = best; default = 0
   */
  LZ4_FRAME = 10,

  /**
   * Use bzip2 compression.
   * Levels supported: 1 = fast, 9 = best; default = 9
   * Streaming compression is supported BUT FlushOp::FLUSH does NOT ensure that
   * the decompressor can read all the data up to that point, due to a bug in
   * the bzip2 library.
   */
  BZIP2 = 11,

  /**
   * Use ZSTD compression with a negative compression level (1=-1, 2=-2, ...).
   * Higher compression levels mean faster.
   * Level 1 is around the same speed as Snappy with better compression.
   * Level 5 is around the same speed as LZ4 with slightly worse compression.
   * Each level gains about 6-15% speed and loses 3-7% compression.
   * Decompression speed improves for each level, and level 1 decompression
   * speed is around 25% faster than ZSTD.
   * This codec is fully compatible with ZSTD.
   * Levels supported: 1 = best, ..., 5 = fast; default = 1
   * Streaming compression is supported.
   */
  ZSTD_FAST = 12,

  /**
   * Count of the codec types defined above (values 0 through 12). This is a
   * count, not a codec.
   */
  NUM_CODEC_TYPES = 13,
};
130
131class Codec {
132 public:
133 virtual ~Codec() {}
134
135 static constexpr uint64_t UNLIMITED_UNCOMPRESSED_LENGTH = uint64_t(-1);
136 /**
137 * Return the maximum length of data that may be compressed with this codec.
138 * NO_COMPRESSION and ZLIB support arbitrary lengths;
139 * LZ4 supports up to 1.9GiB; SNAPPY supports up to 4GiB.
140 * May return UNLIMITED_UNCOMPRESSED_LENGTH if unlimited.
141 */
142 uint64_t maxUncompressedLength() const;
143
144 /**
145 * Return the codec's type.
146 */
147 CodecType type() const {
148 return type_;
149 }
150
151 /**
152 * Does this codec need the exact uncompressed length on decompression?
153 */
154 bool needsUncompressedLength() const;
155
156 /**
157 * Compress data, returning an IOBuf (which may share storage with data).
158 * Throws std::invalid_argument if data is larger than
159 * maxUncompressedLength().
160 */
161 std::unique_ptr<IOBuf> compress(const folly::IOBuf* data);
162
163 /**
164 * Compresses data. May involve additional copies compared to the overload
165 * that takes and returns IOBufs. Has the same error semantics as the IOBuf
166 * version.
167 */
168 std::string compress(StringPiece data);
169
170 /**
171 * Uncompress data. Throws std::runtime_error on decompression error.
172 *
173 * Some codecs (LZ4) require the exact uncompressed length; this is indicated
174 * by needsUncompressedLength().
175 *
176 * For other codes (zlib), knowing the exact uncompressed length ahead of
177 * time might be faster.
178 *
179 * Regardless of the behavior of the underlying compressor, uncompressing
180 * an empty IOBuf chain will return an empty IOBuf chain.
181 */
182 std::unique_ptr<IOBuf> uncompress(
183 const IOBuf* data,
184 folly::Optional<uint64_t> uncompressedLength = folly::none);
185
186 /**
187 * Uncompresses data. May involve additional copies compared to the overload
188 * that takes and returns IOBufs. Has the same error semantics as the IOBuf
189 * version.
190 */
191 std::string uncompress(
192 StringPiece data,
193 folly::Optional<uint64_t> uncompressedLength = folly::none);
194
195 /**
196 * Returns a bound on the maximum compressed length when compressing data with
197 * the given uncompressed length.
198 */
199 uint64_t maxCompressedLength(uint64_t uncompressedLength) const;
200
201 /**
202 * Extracts the uncompressed length from the compressed data if possible.
203 * If the codec doesn't store the uncompressed length, or the data is
204 * corrupted it returns the given uncompressedLength.
205 * If the uncompressed length is stored in the compressed data and
206 * uncompressedLength is not none and they do not match a std::runtime_error
207 * is thrown.
208 */
209 folly::Optional<uint64_t> getUncompressedLength(
210 const folly::IOBuf* data,
211 folly::Optional<uint64_t> uncompressedLength = folly::none) const;
212
213 protected:
214 Codec(
215 CodecType type,
216 folly::Optional<int> level = folly::none,
217 folly::StringPiece name = {},
218 bool counters = true);
219
220 public:
221 /**
222 * Returns a superset of the set of prefixes for which canUncompress() will
223 * return true. A superset is allowed for optimizations in canUncompress()
224 * based on other knowledge such as length. None of the prefixes may be empty.
225 * default: No prefixes.
226 */
227 virtual std::vector<std::string> validPrefixes() const;
228
229 /**
230 * Returns true if the codec thinks it can uncompress the data.
231 * If a codec doesn't have magic bytes at the beginning, like LZ4 and Snappy,
232 * it can always return false.
233 * default: Returns false.
234 */
235 virtual bool canUncompress(
236 const folly::IOBuf* data,
237 folly::Optional<uint64_t> uncompressedLength = folly::none) const;
238
239 private:
240 // default: no limits (save for special value UNKNOWN_UNCOMPRESSED_LENGTH)
241 virtual uint64_t doMaxUncompressedLength() const;
242 // default: doesn't need uncompressed length
243 virtual bool doNeedsUncompressedLength() const;
244 virtual std::unique_ptr<IOBuf> doCompress(const folly::IOBuf* data) = 0;
245 virtual std::unique_ptr<IOBuf> doUncompress(
246 const folly::IOBuf* data,
247 folly::Optional<uint64_t> uncompressedLength) = 0;
248 // default: an implementation is provided by default to wrap the strings into
249 // IOBufs and delegate to the IOBuf methods. This incurs a copy of the output
250 // from IOBuf to string. Implementers, at their discretion, can override
251 // these methods to avoid the copy.
252 virtual std::string doCompressString(StringPiece data);
253 virtual std::string doUncompressString(
254 StringPiece data,
255 folly::Optional<uint64_t> uncompressedLength);
256
257 virtual uint64_t doMaxCompressedLength(uint64_t uncompressedLength) const = 0;
258 // default: returns the passed uncompressedLength.
259 virtual folly::Optional<uint64_t> doGetUncompressedLength(
260 const folly::IOBuf* data,
261 folly::Optional<uint64_t> uncompressedLength) const;
262
263 CodecType type_;
264 folly::detail::CompressionCounter bytesBeforeCompression_;
265 folly::detail::CompressionCounter bytesAfterCompression_;
266 folly::detail::CompressionCounter bytesBeforeDecompression_;
267 folly::detail::CompressionCounter bytesAfterDecompression_;
268 folly::detail::CompressionCounter compressions_;
269 folly::detail::CompressionCounter decompressions_;
270 folly::detail::CompressionCounter compressionMilliseconds_;
271 folly::detail::CompressionCounter decompressionMilliseconds_;
272};
273
274class StreamCodec : public Codec {
275 public:
276 ~StreamCodec() override {}
277
278 /**
279 * Does the codec need the data length before compression streaming?
280 */
281 bool needsDataLength() const;
282
283 /*****************************************************************************
284 * Streaming API
285 *****************************************************************************
286 * A low-level stateful streaming API.
287 * Streaming operations can be started in two ways:
288 * 1. From a clean Codec on which no non-const methods have been called.
289 * 2. A call to resetStream(), which will reset any codec to a clean state.
290 * After a streaming operation has begun, either compressStream() or
291 * uncompressStream() must be called until the streaming operation ends.
292 * compressStream() ends when it returns true with flushOp END.
293 * uncompressStream() ends when it returns true. At this point the codec
294 * may be reused by calling resetStream().
295 *
296 * compress() and uncompress() can be called at any time, but they interrupt
297 * any ongoing streaming operations (state is lost and resetStream() must be
298 * called before another streaming operation).
299 */
300
301 /**
302 * Reset the state of the codec, and set the uncompressed length for the next
303 * streaming operation. If uncompressedLength is not none it must be exactly
304 * the uncompressed length. compressStream() must be passed exactly
305 * uncompressedLength input bytes before the stream is ended.
306 * uncompressStream() must be passed a compressed frame that uncompresses to
307 * uncompressedLength.
308 */
309 void resetStream(folly::Optional<uint64_t> uncompressedLength = folly::none);
310
311 enum class FlushOp { NONE, FLUSH, END };
312
313 /**
314 * Compresses some data from the input buffer and writes the compressed data
315 * into the output buffer. It may read input without producing any output,
316 * except when forced to flush.
317 *
318 * The input buffer is advanced to point to the range of data that hasn't yet
319 * been read. Compression will resume at this point for the next call to
320 * compressStream(). The output buffer is advanced one byte past the last byte
321 * written.
322 *
323 * The default flushOp is NONE, which allows compressStream() complete
324 * discretion in how much data to gather before writing any output.
325 *
326 * If flushOp is END, all pending and input data is flushed to the output
327 * buffer, and the frame is ended. compressStream() must be called with the
328 * same input and flushOp END until it returns true. At this point the caller
329 * must call resetStream() to use the codec again.
330 *
331 * If flushOp is FLUSH, all pending and input data is flushed to the output
332 * buffer, but the frame is not ended. compressStream() must be called with
333 * the same input and flushOp END until it returns true. At this point the
334 * caller can continue to compressStream() with any input data and flushOp.
335 * The uncompressor, if passed all the produced output data, will be able to
336 * uncompress all the input data passed to compressStream() so far. Excessive
337 * use of flushOp FLUSH will deteriorate compression ratio. This is useful for
338 * stateful streaming across a network. Most users don't need to use this
339 * flushOp.
340 *
341 * A std::logic_error is thrown on incorrect usage of the API.
342 * A std::runtime_error is thrown upon error conditions or if no forward
343 * progress could be made twice in a row.
344 */
345 bool compressStream(
346 folly::ByteRange& input,
347 folly::MutableByteRange& output,
348 FlushOp flushOp = StreamCodec::FlushOp::NONE);
349
350 /**
351 * Uncompresses some data from the input buffer and writes the uncompressed
352 * data into the output buffer. It may read input without producing any
353 * output.
354 *
355 * The input buffer is advanced to point to the range of data that hasn't yet
356 * been read. Uncompression will resume at this point for the next call to
357 * uncompressStream(). The output buffer is advanced one byte past the last
358 * byte written.
359 *
360 * The default flushOp is NONE, which allows uncompressStream() complete
361 * discretion in how much output data to flush. The uncompressor may not make
362 * maximum forward progress, but will make some forward progress when
363 * possible.
364 *
365 * If flushOp is END, the caller guarantees that no more input will be
366 * presented to uncompressStream(). uncompressStream() must be called with the
367 * same input and flushOp END until it returns true. This is not mandatory,
368 * but if the input is all available in one buffer, and there is enough output
369 * space to write the entire frame, codecs can uncompress faster.
370 *
371 * If flushOp is FLUSH, uncompressStream() is guaranteed to make the maximum
372 * amount of forward progress possible. When using this flushOp and
373 * uncompressStream() returns with `!output.empty()` the caller knows that all
374 * pending output has been flushed. This is useful for stateful streaming
375 * across a network, and it should be used in conjunction with
376 * compressStream() with flushOp FLUSH. Most users don't need to use this
377 * flushOp.
378 *
379 * A std::runtime_error is thrown upon error conditions or if no forward
380 * progress could be made upon two consecutive calls to the function (only the
381 * second call will throw an exception).
382 *
383 * Returns true at the end of a frame. At this point resetStream() must be
384 * called to reuse the codec.
385 */
386 bool uncompressStream(
387 folly::ByteRange& input,
388 folly::MutableByteRange& output,
389 FlushOp flushOp = StreamCodec::FlushOp::NONE);
390
391 protected:
392 StreamCodec(
393 CodecType type,
394 folly::Optional<int> level = folly::none,
395 folly::StringPiece name = {},
396 bool counters = true)
397 : Codec(type, std::move(level), name, counters) {}
398
399 // Returns the uncompressed length last passed to resetStream() or none if it
400 // hasn't been called yet.
401 folly::Optional<uint64_t> uncompressedLength() const {
402 return uncompressedLength_;
403 }
404
405 private:
406 // default: Implemented using the streaming API.
407 std::unique_ptr<IOBuf> doCompress(const folly::IOBuf* data) override;
408 std::unique_ptr<IOBuf> doUncompress(
409 const folly::IOBuf* data,
410 folly::Optional<uint64_t> uncompressedLength) override;
411
412 // default: Returns false
413 virtual bool doNeedsDataLength() const;
414 virtual void doResetStream() = 0;
415 virtual bool doCompressStream(
416 folly::ByteRange& input,
417 folly::MutableByteRange& output,
418 FlushOp flushOp) = 0;
419 virtual bool doUncompressStream(
420 folly::ByteRange& input,
421 folly::MutableByteRange& output,
422 FlushOp flushOp) = 0;
423
424 enum class State {
425 RESET,
426 COMPRESS,
427 COMPRESS_FLUSH,
428 COMPRESS_END,
429 UNCOMPRESS,
430 END,
431 };
432 void assertStateIs(State expected) const;
433
434 State state_{State::RESET};
435 ByteRange previousInput_{};
436 folly::Optional<uint64_t> uncompressedLength_{};
437 bool progressMade_{true};
438};
439
// Symbolic compression levels accepted by getCodec() and getStreamCodec() in
// place of a codec-specific, non-negative level. They are negative so that
// they cannot collide with a non-negative codec-specific level.
constexpr int COMPRESSION_LEVEL_FASTEST = -1;
constexpr int COMPRESSION_LEVEL_DEFAULT = -2;
constexpr int COMPRESSION_LEVEL_BEST = -3;
443
444/**
445 * Return a codec for the given type. Throws on error. The level
446 * is a non-negative codec-dependent integer indicating the level of
447 * compression desired, or one of the following constants:
448 *
449 * COMPRESSION_LEVEL_FASTEST is fastest (uses least CPU / memory,
450 * worst compression)
451 * COMPRESSION_LEVEL_DEFAULT is the default (likely a tradeoff between
452 * FASTEST and BEST)
453 * COMPRESSION_LEVEL_BEST is the best compression (uses most CPU / memory,
454 * best compression)
455 *
456 * When decompressing, the compression level is ignored. All codecs will
457 * decompress all data compressed with the a codec of the same type, regardless
458 * of compression level.
459 */
460std::unique_ptr<Codec> getCodec(
461 CodecType type,
462 int level = COMPRESSION_LEVEL_DEFAULT);
463
464/**
465 * Return a codec for the given type. Throws on error. The level
466 * is a non-negative codec-dependent integer indicating the level of
467 * compression desired, or one of the following constants:
468 *
469 * COMPRESSION_LEVEL_FASTEST is fastest (uses least CPU / memory,
470 * worst compression)
471 * COMPRESSION_LEVEL_DEFAULT is the default (likely a tradeoff between
472 * FASTEST and BEST)
473 * COMPRESSION_LEVEL_BEST is the best compression (uses most CPU / memory,
474 * best compression)
475 *
476 * When decompressing, the compression level is ignored. All codecs will
477 * decompress all data compressed with the a codec of the same type, regardless
478 * of compression level.
479 */
480std::unique_ptr<StreamCodec> getStreamCodec(
481 CodecType type,
482 int level = COMPRESSION_LEVEL_DEFAULT);
483
484/**
485 * Returns a codec that can uncompress any of the given codec types as well as
486 * {LZ4_FRAME, ZSTD, ZLIB, GZIP, LZMA2, BZIP2}. Appends each default codec to
487 * customCodecs in order, so long as a codec with the same type() isn't already
488 * present in customCodecs or as the terminalCodec. When uncompress() is called,
489 * each codec's canUncompress() is called in the order that they are given.
490 * Appended default codecs are checked last. uncompress() is called on the
491 * first codec whose canUncompress() returns true.
492 *
493 * In addition, an optional `terminalCodec` can be provided. This codec's
494 * uncompress() will be called either when no other codec canUncompress() the
495 * data or the chosen codec throws an exception on the data. The terminalCodec
496 * is intended for ambiguous headers, when canUncompress() is false for some
497 * data it can actually uncompress. The terminalCodec does not need to override
498 * validPrefixes() or canUncompress() and overriding these functions will have
499 * no effect on the returned codec's validPrefixes() or canUncompress()
500 * functions. The terminalCodec's needsUncompressedLength() and
501 * maxUncompressedLength() will affect the returned codec's respective
502 * functions. The terminalCodec must not be duplicated in customCodecs.
503 *
504 * An exception is thrown if no codec canUncompress() the data and either no
505 * terminal codec was provided or a terminal codec was provided and it throws on
506 * the data.
507 * An exception is thrown if the chosen codec's uncompress() throws on the data
508 * and either no terminal codec was provided or a terminal codec was provided
509 * and it also throws on the data.
510 * An exception is thrown if compress() is called on the returned codec.
511 *
512 * Requirements are checked in debug mode and are as follows:
513 * Let headers be the concatenation of every codec's validPrefixes().
514 * 1. Each codec must override validPrefixes() and canUncompress().
515 * 2. No codec's validPrefixes() may be empty.
516 * 3. No header in headers may be empty.
517 * 4. headers must not contain any duplicate elements.
518 * 5. No strict non-empty prefix of any header in headers may be in headers.
519 * 6. The terminalCodec's type must not be the same as any other codec's type
520 * (with USER_DEFINED being the exception).
521 */
522std::unique_ptr<Codec> getAutoUncompressionCodec(
523 std::vector<std::unique_ptr<Codec>> customCodecs = {},
524 std::unique_ptr<Codec> terminalCodec = {});
525
526/**
527 * Check if a specified codec is supported.
528 */
529bool hasCodec(CodecType type);
530
531/**
532 * Check if a specified codec is supported and supports streaming.
533 */
534bool hasStreamCodec(CodecType type);
535} // namespace io
536} // namespace folly
537