1 | /** |
2 | * Licensed to the Apache Software Foundation (ASF) under one |
3 | * or more contributor license agreements. See the NOTICE file |
4 | * distributed with this work for additional information |
5 | * regarding copyright ownership. The ASF licenses this file |
6 | * to you under the Apache License, Version 2.0 (the |
7 | * "License"); you may not use this file except in compliance |
8 | * with the License. You may obtain a copy of the License at |
9 | * |
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
11 | * |
12 | * Unless required by applicable law or agreed to in writing, software |
13 | * distributed under the License is distributed on an "AS IS" BASIS, |
14 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
15 | * See the License for the specific language governing permissions and |
16 | * limitations under the License. |
17 | */ |
18 | |
19 | #include "Adaptor.hh" |
20 | #include "Compression.hh" |
21 | #include "orc/Exceptions.hh" |
22 | #include "LzoDecompressor.hh" |
23 | #include "lz4.h" |
24 | |
25 | #include <algorithm> |
26 | #include <iomanip> |
27 | #include <iostream> |
28 | #include <sstream> |
29 | |
30 | #include "zlib.h" |
31 | |
32 | #include "wrap/snappy-wrapper.h" |
33 | |
34 | namespace orc { |
35 | |
36 | class CompressionStreamBase: public BufferedOutputStream { |
37 | public: |
38 | CompressionStreamBase(OutputStream * outStream, |
39 | int compressionLevel, |
40 | uint64_t capacity, |
41 | uint64_t blockSize, |
42 | MemoryPool& pool); |
43 | |
44 | virtual bool Next(void** data, int*size) override = 0; |
45 | virtual void BackUp(int count) override; |
46 | |
47 | virtual std::string getName() const override = 0; |
48 | virtual uint64_t flush() override; |
49 | |
50 | virtual bool isCompressed() const override { return true; } |
51 | virtual uint64_t getSize() const override; |
52 | |
53 | protected: |
54 | void (char * buffer, size_t compressedSize, bool original) { |
55 | buffer[0] = static_cast<char>((compressedSize << 1) + (original ? 1 : 0)); |
56 | buffer[1] = static_cast<char>(compressedSize >> 7); |
57 | buffer[2] = static_cast<char>(compressedSize >> 15); |
58 | } |
59 | |
60 | // Buffer to hold uncompressed data until user calls Next() |
61 | DataBuffer<unsigned char> rawInputBuffer; |
62 | |
63 | // Compress level |
64 | int level; |
65 | |
66 | // Compressed data output buffer |
67 | char * outputBuffer; |
68 | |
69 | // Size for compressionBuffer |
70 | int bufferSize; |
71 | |
72 | // Compress output position |
73 | int outputPosition; |
74 | |
75 | // Compress output buffer size |
76 | int outputSize; |
77 | }; |
78 | |
79 | CompressionStreamBase::CompressionStreamBase(OutputStream * outStream, |
80 | int compressionLevel, |
81 | uint64_t capacity, |
82 | uint64_t blockSize, |
83 | MemoryPool& pool) : |
84 | BufferedOutputStream(pool, |
85 | outStream, |
86 | capacity, |
87 | blockSize), |
88 | rawInputBuffer(pool, blockSize), |
89 | level(compressionLevel), |
90 | outputBuffer(nullptr), |
91 | bufferSize(0), |
92 | outputPosition(0), |
93 | outputSize(0) { |
94 | // PASS |
95 | } |
96 | |
97 | void CompressionStreamBase::BackUp(int count) { |
98 | if (count > bufferSize) { |
99 | throw std::logic_error("Can't backup that much!" ); |
100 | } |
101 | bufferSize -= count; |
102 | } |
103 | |
104 | uint64_t CompressionStreamBase::flush() { |
105 | void * data; |
106 | int size; |
107 | if (!Next(&data, &size)) { |
108 | throw std::runtime_error("Failed to flush compression buffer." ); |
109 | } |
110 | BufferedOutputStream::BackUp(outputSize - outputPosition); |
111 | bufferSize = outputSize = outputPosition = 0; |
112 | return BufferedOutputStream::flush(); |
113 | } |
114 | |
115 | uint64_t CompressionStreamBase::getSize() const { |
116 | return BufferedOutputStream::getSize() - |
117 | static_cast<uint64_t>(outputSize - outputPosition); |
118 | } |
119 | |
120 | /** |
121 | * Streaming compression base class |
122 | */ |
123 | class CompressionStream: public CompressionStreamBase { |
124 | public: |
125 | CompressionStream(OutputStream * outStream, |
126 | int compressionLevel, |
127 | uint64_t capacity, |
128 | uint64_t blockSize, |
129 | MemoryPool& pool); |
130 | |
131 | virtual bool Next(void** data, int*size) override; |
132 | virtual std::string getName() const override = 0; |
133 | |
134 | protected: |
135 | // return total compressed size |
136 | virtual uint64_t doStreamingCompression() = 0; |
137 | }; |
138 | |
139 | CompressionStream::CompressionStream(OutputStream * outStream, |
140 | int compressionLevel, |
141 | uint64_t capacity, |
142 | uint64_t blockSize, |
143 | MemoryPool& pool) : |
144 | CompressionStreamBase(outStream, |
145 | compressionLevel, |
146 | capacity, |
147 | blockSize, |
148 | pool) { |
149 | // PASS |
150 | } |
151 | |
152 | bool CompressionStream::Next(void** data, int*size) { |
153 | if (bufferSize != 0) { |
154 | // adjust 3 bytes for the compression header |
155 | if (outputPosition + 3 >= outputSize) { |
156 | int newPosition = outputPosition + 3 - outputSize; |
157 | if (!BufferedOutputStream::Next( |
158 | reinterpret_cast<void **>(&outputBuffer), |
159 | &outputSize)) { |
160 | throw std::runtime_error( |
161 | "Failed to get next output buffer from output stream." ); |
162 | } |
163 | outputPosition = newPosition; |
164 | } else { |
165 | outputPosition += 3; |
166 | } |
167 | |
168 | uint64_t totalCompressedSize = doStreamingCompression(); |
169 | |
170 | char * = outputBuffer + outputPosition - totalCompressedSize - 3; |
171 | if (totalCompressedSize >= static_cast<unsigned long>(bufferSize)) { |
172 | writeHeader(header, static_cast<size_t>(bufferSize), true); |
173 | memcpy( |
174 | header + 3, |
175 | rawInputBuffer.data(), |
176 | static_cast<size_t>(bufferSize)); |
177 | |
178 | int backup = static_cast<int>(totalCompressedSize) - bufferSize; |
179 | BufferedOutputStream::BackUp(backup); |
180 | outputPosition -= backup; |
181 | outputSize -= backup; |
182 | } else { |
183 | writeHeader(header, totalCompressedSize, false); |
184 | } |
185 | } |
186 | |
187 | *data = rawInputBuffer.data(); |
188 | *size = static_cast<int>(rawInputBuffer.size()); |
189 | bufferSize = *size; |
190 | |
191 | return true; |
192 | } |
193 | |
194 | class ZlibCompressionStream: public CompressionStream { |
195 | public: |
196 | ZlibCompressionStream(OutputStream * outStream, |
197 | int compressionLevel, |
198 | uint64_t capacity, |
199 | uint64_t blockSize, |
200 | MemoryPool& pool); |
201 | |
202 | virtual std::string getName() const override; |
203 | |
204 | protected: |
205 | virtual uint64_t doStreamingCompression() override; |
206 | |
207 | private: |
208 | void init(); |
209 | z_stream strm; |
210 | }; |
211 | |
212 | ZlibCompressionStream::ZlibCompressionStream( |
213 | OutputStream * outStream, |
214 | int compressionLevel, |
215 | uint64_t capacity, |
216 | uint64_t blockSize, |
217 | MemoryPool& pool) |
218 | : CompressionStream(outStream, |
219 | compressionLevel, |
220 | capacity, |
221 | blockSize, |
222 | pool) { |
223 | init(); |
224 | } |
225 | |
226 | uint64_t ZlibCompressionStream::doStreamingCompression() { |
227 | if (deflateReset(&strm) != Z_OK) { |
228 | throw std::runtime_error("Failed to reset inflate." ); |
229 | } |
230 | |
231 | strm.avail_in = static_cast<unsigned int>(bufferSize); |
232 | strm.next_in = rawInputBuffer.data(); |
233 | |
234 | do { |
235 | if (outputPosition >= outputSize) { |
236 | if (!BufferedOutputStream::Next( |
237 | reinterpret_cast<void **>(&outputBuffer), |
238 | &outputSize)) { |
239 | throw std::runtime_error( |
240 | "Failed to get next output buffer from output stream." ); |
241 | } |
242 | outputPosition = 0; |
243 | } |
244 | strm.next_out = reinterpret_cast<unsigned char *> |
245 | (outputBuffer + outputPosition); |
246 | strm.avail_out = static_cast<unsigned int> |
247 | (outputSize - outputPosition); |
248 | |
249 | int ret = deflate(&strm, Z_FINISH); |
250 | outputPosition = outputSize - static_cast<int>(strm.avail_out); |
251 | |
252 | if (ret == Z_STREAM_END) { |
253 | break; |
254 | } else if (ret == Z_OK) { |
255 | // needs more buffer so will continue the loop |
256 | } else { |
257 | throw std::runtime_error("Failed to deflate input data." ); |
258 | } |
259 | } while (strm.avail_out == 0); |
260 | |
261 | return strm.total_out; |
262 | } |
263 | |
264 | std::string ZlibCompressionStream::getName() const { |
265 | return "ZlibCompressionStream" ; |
266 | } |
267 | |
268 | DIAGNOSTIC_PUSH |
269 | |
270 | #if defined(__GNUC__) || defined(__clang__) |
271 | DIAGNOSTIC_IGNORE("-Wold-style-cast" ) |
272 | #endif |
273 | |
274 | void ZlibCompressionStream::init() { |
275 | strm.zalloc = nullptr; |
276 | strm.zfree = nullptr; |
277 | strm.opaque = nullptr; |
278 | |
279 | if (deflateInit2(&strm, level, Z_DEFLATED, -15, 8, Z_DEFAULT_STRATEGY) |
280 | != Z_OK) { |
281 | throw std::runtime_error("Error while calling deflateInit2() for zlib." ); |
282 | } |
283 | } |
284 | |
285 | DIAGNOSTIC_PUSH |
286 | |
// States of the chunk-oriented decompression streams in this file.
enum DecompressState {
  DECOMPRESS_HEADER,    // a chunk header has to be read next
  DECOMPRESS_START,     // header read, chunk payload is compressed
  DECOMPRESS_CONTINUE,  // not referenced by the code visible in this file
  DECOMPRESS_ORIGINAL,  // header read, chunk payload is stored uncompressed
  DECOMPRESS_EOF        // the underlying stream is exhausted
};
292 | |
293 | class ZlibDecompressionStream: public SeekableInputStream { |
294 | public: |
295 | ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, |
296 | size_t blockSize, |
297 | MemoryPool& pool); |
298 | virtual ~ZlibDecompressionStream() override; |
299 | virtual bool Next(const void** data, int*size) override; |
300 | virtual void BackUp(int count) override; |
301 | virtual bool Skip(int count) override; |
302 | virtual int64_t ByteCount() const override; |
303 | virtual void seek(PositionProvider& position) override; |
304 | virtual std::string getName() const override; |
305 | |
306 | private: |
307 | void readBuffer(bool failOnEof) { |
308 | int length; |
309 | if (!input->Next(reinterpret_cast<const void**>(&inputBuffer), |
310 | &length)) { |
311 | if (failOnEof) { |
312 | throw ParseError("Read past EOF in " |
313 | "ZlibDecompressionStream::readBuffer" ); |
314 | } |
315 | state = DECOMPRESS_EOF; |
316 | inputBuffer = nullptr; |
317 | inputBufferEnd = nullptr; |
318 | } else { |
319 | inputBufferEnd = inputBuffer + length; |
320 | } |
321 | } |
322 | |
323 | uint32_t readByte(bool failOnEof) { |
324 | if (inputBuffer == inputBufferEnd) { |
325 | readBuffer(failOnEof); |
326 | if (state == DECOMPRESS_EOF) { |
327 | return 0; |
328 | } |
329 | } |
330 | return static_cast<unsigned char>(*(inputBuffer++)); |
331 | } |
332 | |
333 | void () { |
334 | uint32_t = readByte(false); |
335 | if (state != DECOMPRESS_EOF) { |
336 | header |= readByte(true) << 8; |
337 | header |= readByte(true) << 16; |
338 | if (header & 1) { |
339 | state = DECOMPRESS_ORIGINAL; |
340 | } else { |
341 | state = DECOMPRESS_START; |
342 | } |
343 | remainingLength = header >> 1; |
344 | } else { |
345 | remainingLength = 0; |
346 | } |
347 | } |
348 | |
349 | MemoryPool& pool; |
350 | const size_t blockSize; |
351 | std::unique_ptr<SeekableInputStream> input; |
352 | z_stream zstream; |
353 | DataBuffer<char> buffer; |
354 | |
355 | // the current state |
356 | DecompressState state; |
357 | |
358 | // the start of the current buffer |
359 | // This pointer is not owned by us. It is either owned by zstream or |
360 | // the underlying stream. |
361 | const char* outputBuffer; |
362 | // the size of the current buffer |
363 | size_t outputBufferLength; |
364 | // the size of the current chunk |
365 | size_t remainingLength; |
366 | |
367 | // the last buffer returned from the input |
368 | const char *inputBuffer; |
369 | const char *inputBufferEnd; |
370 | |
371 | // roughly the number of bytes returned |
372 | off_t bytesReturned; |
373 | }; |
374 | |
375 | DIAGNOSTIC_PUSH |
376 | |
377 | #if defined(__GNUC__) || defined(__clang__) |
378 | DIAGNOSTIC_IGNORE("-Wold-style-cast" ) |
379 | #endif |
380 | |
381 | ZlibDecompressionStream::ZlibDecompressionStream |
382 | (std::unique_ptr<SeekableInputStream> inStream, |
383 | size_t _blockSize, |
384 | MemoryPool& _pool |
385 | ): pool(_pool), |
386 | blockSize(_blockSize), |
387 | buffer(pool, _blockSize) { |
388 | input.reset(inStream.release()); |
389 | zstream.next_in = nullptr; |
390 | zstream.avail_in = 0; |
391 | zstream.zalloc = nullptr; |
392 | zstream.zfree = nullptr; |
393 | zstream.opaque = nullptr; |
394 | zstream.next_out = reinterpret_cast<Bytef*>(buffer.data()); |
395 | zstream.avail_out = static_cast<uInt>(blockSize); |
396 | int64_t result = inflateInit2(&zstream, -15); |
397 | switch (result) { |
398 | case Z_OK: |
399 | break; |
400 | case Z_MEM_ERROR: |
401 | throw std::logic_error("Memory error from inflateInit2" ); |
402 | case Z_VERSION_ERROR: |
403 | throw std::logic_error("Version error from inflateInit2" ); |
404 | case Z_STREAM_ERROR: |
405 | throw std::logic_error("Stream error from inflateInit2" ); |
406 | default: |
407 | throw std::logic_error("Unknown error from inflateInit2" ); |
408 | } |
409 | outputBuffer = nullptr; |
410 | outputBufferLength = 0; |
411 | remainingLength = 0; |
412 | state = DECOMPRESS_HEADER; |
413 | inputBuffer = nullptr; |
414 | inputBufferEnd = nullptr; |
415 | bytesReturned = 0; |
416 | } |
417 | |
418 | DIAGNOSTIC_POP |
419 | |
420 | ZlibDecompressionStream::~ZlibDecompressionStream() { |
421 | int64_t result = inflateEnd(&zstream); |
422 | if (result != Z_OK) { |
423 | // really can't throw in destructors |
424 | std::cout << "Error in ~ZlibDecompressionStream() " << result << "\n" ; |
425 | } |
426 | } |
427 | |
428 | bool ZlibDecompressionStream::Next(const void** data, int*size) { |
429 | // if the user pushed back, return them the partial buffer |
430 | if (outputBufferLength) { |
431 | *data = outputBuffer; |
432 | *size = static_cast<int>(outputBufferLength); |
433 | outputBuffer += outputBufferLength; |
434 | outputBufferLength = 0; |
435 | return true; |
436 | } |
437 | if (state == DECOMPRESS_HEADER || remainingLength == 0) { |
438 | readHeader(); |
439 | } |
440 | if (state == DECOMPRESS_EOF) { |
441 | return false; |
442 | } |
443 | if (inputBuffer == inputBufferEnd) { |
444 | readBuffer(true); |
445 | } |
446 | size_t availSize = |
447 | std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), |
448 | remainingLength); |
449 | if (state == DECOMPRESS_ORIGINAL) { |
450 | *data = inputBuffer; |
451 | *size = static_cast<int>(availSize); |
452 | outputBuffer = inputBuffer + availSize; |
453 | outputBufferLength = 0; |
454 | } else if (state == DECOMPRESS_START) { |
455 | zstream.next_in = |
456 | reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); |
457 | zstream.avail_in = static_cast<uInt>(availSize); |
458 | outputBuffer = buffer.data(); |
459 | zstream.next_out = |
460 | reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer)); |
461 | zstream.avail_out = static_cast<uInt>(blockSize); |
462 | if (inflateReset(&zstream) != Z_OK) { |
463 | throw std::logic_error("Bad inflateReset in " |
464 | "ZlibDecompressionStream::Next" ); |
465 | } |
466 | int64_t result; |
467 | do { |
468 | result = inflate(&zstream, availSize == remainingLength ? Z_FINISH : |
469 | Z_SYNC_FLUSH); |
470 | switch (result) { |
471 | case Z_OK: |
472 | remainingLength -= availSize; |
473 | inputBuffer += availSize; |
474 | readBuffer(true); |
475 | availSize = |
476 | std::min(static_cast<size_t>(inputBufferEnd - inputBuffer), |
477 | remainingLength); |
478 | zstream.next_in = |
479 | reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer)); |
480 | zstream.avail_in = static_cast<uInt>(availSize); |
481 | break; |
482 | case Z_STREAM_END: |
483 | break; |
484 | case Z_BUF_ERROR: |
485 | throw std::logic_error("Buffer error in " |
486 | "ZlibDecompressionStream::Next" ); |
487 | case Z_DATA_ERROR: |
488 | throw std::logic_error("Data error in " |
489 | "ZlibDecompressionStream::Next" ); |
490 | case Z_STREAM_ERROR: |
491 | throw std::logic_error("Stream error in " |
492 | "ZlibDecompressionStream::Next" ); |
493 | default: |
494 | throw std::logic_error("Unknown error in " |
495 | "ZlibDecompressionStream::Next" ); |
496 | } |
497 | } while (result != Z_STREAM_END); |
498 | *size = static_cast<int>(blockSize - zstream.avail_out); |
499 | *data = outputBuffer; |
500 | outputBufferLength = 0; |
501 | outputBuffer += *size; |
502 | } else { |
503 | throw std::logic_error("Unknown compression state in " |
504 | "ZlibDecompressionStream::Next" ); |
505 | } |
506 | inputBuffer += availSize; |
507 | remainingLength -= availSize; |
508 | bytesReturned += *size; |
509 | return true; |
510 | } |
511 | |
512 | void ZlibDecompressionStream::BackUp(int count) { |
513 | if (outputBuffer == nullptr || outputBufferLength != 0) { |
514 | throw std::logic_error("Backup without previous Next in " |
515 | "ZlibDecompressionStream" ); |
516 | } |
517 | outputBuffer -= static_cast<size_t>(count); |
518 | outputBufferLength = static_cast<size_t>(count); |
519 | bytesReturned -= count; |
520 | } |
521 | |
522 | bool ZlibDecompressionStream::Skip(int count) { |
523 | bytesReturned += count; |
524 | // this is a stupid implementation for now. |
525 | // should skip entire blocks without decompressing |
526 | while (count > 0) { |
527 | const void *ptr; |
528 | int len; |
529 | if (!Next(&ptr, &len)) { |
530 | return false; |
531 | } |
532 | if (len > count) { |
533 | BackUp(len - count); |
534 | count = 0; |
535 | } else { |
536 | count -= len; |
537 | } |
538 | } |
539 | return true; |
540 | } |
541 | |
542 | int64_t ZlibDecompressionStream::ByteCount() const { |
543 | return bytesReturned; |
544 | } |
545 | |
546 | void ZlibDecompressionStream::seek(PositionProvider& position) { |
547 | input->seek(position); |
548 | bytesReturned = static_cast<off_t>(input->ByteCount()); |
549 | if (!Skip(static_cast<int>(position.next()))) { |
550 | throw ParseError("Bad skip in ZlibDecompressionStream::seek" ); |
551 | } |
552 | } |
553 | |
554 | std::string ZlibDecompressionStream::getName() const { |
555 | std::ostringstream result; |
556 | result << "zlib(" << input->getName() << ")" ; |
557 | return result.str(); |
558 | } |
559 | |
560 | class BlockDecompressionStream: public SeekableInputStream { |
561 | public: |
562 | BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, |
563 | size_t blockSize, |
564 | MemoryPool& pool); |
565 | |
566 | virtual ~BlockDecompressionStream() override {} |
567 | virtual bool Next(const void** data, int*size) override; |
568 | virtual void BackUp(int count) override; |
569 | virtual bool Skip(int count) override; |
570 | virtual int64_t ByteCount() const override; |
571 | virtual void seek(PositionProvider& position) override; |
572 | virtual std::string getName() const override = 0; |
573 | |
574 | protected: |
575 | virtual uint64_t decompress(const char *input, uint64_t length, |
576 | char *output, size_t maxOutputLength) = 0; |
577 | |
578 | std::string getStreamName() const { |
579 | return input->getName(); |
580 | } |
581 | |
582 | private: |
583 | void readBuffer(bool failOnEof) { |
584 | int length; |
585 | if (!input->Next(reinterpret_cast<const void**>(&inputBufferPtr), |
586 | &length)) { |
587 | if (failOnEof) { |
588 | throw ParseError(getName() + "read past EOF" ); |
589 | } |
590 | state = DECOMPRESS_EOF; |
591 | inputBufferPtr = nullptr; |
592 | inputBufferPtrEnd = nullptr; |
593 | } else { |
594 | inputBufferPtrEnd = inputBufferPtr + length; |
595 | } |
596 | } |
597 | |
598 | uint32_t readByte(bool failOnEof) { |
599 | if (inputBufferPtr == inputBufferPtrEnd) { |
600 | readBuffer(failOnEof); |
601 | if (state == DECOMPRESS_EOF) { |
602 | return 0; |
603 | } |
604 | } |
605 | return static_cast<unsigned char>(*(inputBufferPtr++)); |
606 | } |
607 | |
608 | void () { |
609 | uint32_t = readByte(false); |
610 | if (state != DECOMPRESS_EOF) { |
611 | header |= readByte(true) << 8; |
612 | header |= readByte(true) << 16; |
613 | if (header & 1) { |
614 | state = DECOMPRESS_ORIGINAL; |
615 | } else { |
616 | state = DECOMPRESS_START; |
617 | } |
618 | remainingLength = header >> 1; |
619 | } else { |
620 | remainingLength = 0; |
621 | } |
622 | } |
623 | |
624 | std::unique_ptr<SeekableInputStream> input; |
625 | MemoryPool& pool; |
626 | |
627 | // may need to stitch together multiple input buffers; |
628 | // to give snappy a contiguous block |
629 | DataBuffer<char> inputBuffer; |
630 | |
631 | // uncompressed output |
632 | DataBuffer<char> outputBuffer; |
633 | |
634 | // the current state |
635 | DecompressState state; |
636 | |
637 | // the start of the current output buffer |
638 | const char* outputBufferPtr; |
639 | // the size of the current output buffer |
640 | size_t outputBufferLength; |
641 | |
642 | // the size of the current chunk |
643 | size_t remainingLength; |
644 | |
645 | // the last buffer returned from the input |
646 | const char *inputBufferPtr; |
647 | const char *inputBufferPtrEnd; |
648 | |
649 | // bytes returned by this stream |
650 | off_t bytesReturned; |
651 | }; |
652 | |
653 | BlockDecompressionStream::BlockDecompressionStream |
654 | (std::unique_ptr<SeekableInputStream> inStream, |
655 | size_t bufferSize, |
656 | MemoryPool& _pool |
657 | ) : pool(_pool), |
658 | inputBuffer(pool, bufferSize), |
659 | outputBuffer(pool, bufferSize), |
660 | state(DECOMPRESS_HEADER), |
661 | outputBufferPtr(nullptr), |
662 | outputBufferLength(0), |
663 | remainingLength(0), |
664 | inputBufferPtr(nullptr), |
665 | inputBufferPtrEnd(nullptr), |
666 | bytesReturned(0) { |
667 | input.reset(inStream.release()); |
668 | } |
669 | |
/**
 * Return the next run of decompressed bytes.
 *
 * Pushed-back bytes are replayed first. Otherwise a chunk header is read if
 * needed; a stored chunk is served directly from the input buffer, while a
 * compressed chunk is gathered into contiguous memory, decompressed as a
 * whole and returned in one piece.
 *
 * @param data receives a pointer to the returned bytes (not owned by caller)
 * @param size receives the number of returned bytes
 * @return false at end of input, true otherwise
 */
bool BlockDecompressionStream::Next(const void** data, int*size) {
  // if the user pushed back, return them the partial buffer
  if (outputBufferLength) {
    *data = outputBufferPtr;
    *size = static_cast<int>(outputBufferLength);
    outputBufferPtr += outputBufferLength;
    bytesReturned += static_cast<off_t>(outputBufferLength);
    outputBufferLength = 0;
    return true;
  }
  if (state == DECOMPRESS_HEADER || remainingLength == 0) {
    readHeader();
  }
  if (state == DECOMPRESS_EOF) {
    return false;
  }
  if (inputBufferPtr == inputBufferPtrEnd) {
    readBuffer(true);
  }

  // bytes of the current chunk that are available in the input buffer
  size_t availSize =
      std::min(static_cast<size_t>(inputBufferPtrEnd - inputBufferPtr),
               remainingLength);
  if (state == DECOMPRESS_ORIGINAL) {
    // stored chunk: hand out the input bytes directly, possibly in pieces
    *data = inputBufferPtr;
    *size = static_cast<int>(availSize);
    outputBufferPtr = inputBufferPtr + availSize;
    outputBufferLength = 0;
    inputBufferPtr += availSize;
    remainingLength -= availSize;
  } else if (state == DECOMPRESS_START) {
    // Get contiguous bytes of compressed block.
    const char *compressed = inputBufferPtr;
    if (remainingLength == availSize) {
      // the whole chunk is already contiguous in the input buffer
      inputBufferPtr += availSize;
    } else {
      // Did not read enough from input.
      if (inputBuffer.capacity() < remainingLength) {
        inputBuffer.resize(remainingLength);
      }
      ::memcpy(inputBuffer.data(), inputBufferPtr, availSize);
      inputBufferPtr += availSize;
      compressed = inputBuffer.data();

      // append further input buffers until the chunk is complete
      for (size_t pos = availSize; pos < remainingLength; ) {
        readBuffer(true);
        size_t avail =
            std::min(static_cast<size_t>(inputBufferPtrEnd -
                                         inputBufferPtr),
                     remainingLength - pos);
        ::memcpy(inputBuffer.data() + pos, inputBufferPtr, avail);
        pos += avail;
        inputBufferPtr += avail;
      }
    }

    outputBufferLength = decompress(compressed, remainingLength,
                                    outputBuffer.data(),
                                    outputBuffer.capacity());

    // the chunk is fully consumed; the next call starts with a new header
    remainingLength = 0;
    state = DECOMPRESS_HEADER;
    *data = outputBuffer.data();
    *size = static_cast<int>(outputBufferLength);
    outputBufferPtr = outputBuffer.data() + outputBufferLength;
    outputBufferLength = 0;
  }

  bytesReturned += *size;
  return true;
}
741 | |
742 | void BlockDecompressionStream::BackUp(int count) { |
743 | if (outputBufferPtr == nullptr || outputBufferLength != 0) { |
744 | throw std::logic_error("Backup without previous Next in " +getName()); |
745 | } |
746 | outputBufferPtr -= static_cast<size_t>(count); |
747 | outputBufferLength = static_cast<size_t>(count); |
748 | bytesReturned -= count; |
749 | } |
750 | |
751 | bool BlockDecompressionStream::Skip(int count) { |
752 | bytesReturned += count; |
753 | // this is a stupid implementation for now. |
754 | // should skip entire blocks without decompressing |
755 | while (count > 0) { |
756 | const void *ptr; |
757 | int len; |
758 | if (!Next(&ptr, &len)) { |
759 | return false; |
760 | } |
761 | if (len > count) { |
762 | BackUp(len - count); |
763 | count = 0; |
764 | } else { |
765 | count -= len; |
766 | } |
767 | } |
768 | return true; |
769 | } |
770 | |
771 | int64_t BlockDecompressionStream::ByteCount() const { |
772 | return bytesReturned; |
773 | } |
774 | |
775 | void BlockDecompressionStream::seek(PositionProvider& position) { |
776 | input->seek(position); |
777 | if (!Skip(static_cast<int>(position.next()))) { |
778 | throw ParseError("Bad skip in " + getName()); |
779 | } |
780 | } |
781 | |
782 | class SnappyDecompressionStream: public BlockDecompressionStream { |
783 | public: |
784 | SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, |
785 | size_t blockSize, |
786 | MemoryPool& pool |
787 | ): BlockDecompressionStream |
788 | (std::move(inStream), |
789 | blockSize, |
790 | pool) { |
791 | // PASS |
792 | } |
793 | |
794 | std::string getName() const override { |
795 | std::ostringstream result; |
796 | result << "snappy(" << getStreamName() << ")" ; |
797 | return result.str(); |
798 | } |
799 | |
800 | protected: |
801 | virtual uint64_t decompress(const char *input, uint64_t length, |
802 | char *output, size_t maxOutputLength |
803 | ) override; |
804 | }; |
805 | |
806 | uint64_t SnappyDecompressionStream::decompress(const char *input, |
807 | uint64_t length, |
808 | char *output, |
809 | size_t maxOutputLength) { |
810 | size_t outLength; |
811 | if (!snappy::GetUncompressedLength(input, length, &outLength)) { |
812 | throw ParseError("SnappyDecompressionStream choked on corrupt input" ); |
813 | } |
814 | |
815 | if (outLength > maxOutputLength) { |
816 | throw std::logic_error("Snappy length exceeds block size" ); |
817 | } |
818 | |
819 | if (!snappy::RawUncompress(input, length, output)) { |
820 | throw ParseError("SnappyDecompressionStream choked on corrupt input" ); |
821 | } |
822 | return outLength; |
823 | } |
824 | |
825 | class LzoDecompressionStream: public BlockDecompressionStream { |
826 | public: |
827 | LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream, |
828 | size_t blockSize, |
829 | MemoryPool& pool |
830 | ): BlockDecompressionStream |
831 | (std::move(inStream), |
832 | blockSize, |
833 | pool) { |
834 | // PASS |
835 | } |
836 | |
837 | std::string getName() const override { |
838 | std::ostringstream result; |
839 | result << "lzo(" << getStreamName() << ")" ; |
840 | return result.str(); |
841 | } |
842 | |
843 | protected: |
844 | virtual uint64_t decompress(const char *input, uint64_t length, |
845 | char *output, size_t maxOutputLength |
846 | ) override; |
847 | }; |
848 | |
849 | uint64_t LzoDecompressionStream::decompress(const char *input, |
850 | uint64_t length, |
851 | char *output, |
852 | size_t maxOutputLength) { |
853 | return lzoDecompress(input, input + length, output, |
854 | output + maxOutputLength); |
855 | } |
856 | |
857 | class Lz4DecompressionStream: public BlockDecompressionStream { |
858 | public: |
859 | Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream, |
860 | size_t blockSize, |
861 | MemoryPool& pool |
862 | ): BlockDecompressionStream |
863 | (std::move(inStream), |
864 | blockSize, |
865 | pool) { |
866 | // PASS |
867 | } |
868 | |
869 | std::string getName() const override { |
870 | std::ostringstream result; |
871 | result << "lz4(" << getStreamName() << ")" ; |
872 | return result.str(); |
873 | } |
874 | |
875 | protected: |
876 | virtual uint64_t decompress(const char *input, uint64_t length, |
877 | char *output, size_t maxOutputLength |
878 | ) override; |
879 | }; |
880 | |
881 | uint64_t Lz4DecompressionStream::decompress(const char *input, |
882 | uint64_t length, |
883 | char *output, |
884 | size_t maxOutputLength) { |
885 | int result = LZ4_decompress_safe(input, output, static_cast<int>(length), |
886 | static_cast<int>(maxOutputLength)); |
887 | if (result < 0) { |
888 | throw ParseError(getName() + " - failed to decompress" ); |
889 | } |
890 | return static_cast<uint64_t>(result); |
891 | } |
892 | |
893 | std::unique_ptr<BufferedOutputStream> |
894 | createCompressor( |
895 | CompressionKind kind, |
896 | OutputStream * outStream, |
897 | CompressionStrategy strategy, |
898 | uint64_t bufferCapacity, |
899 | uint64_t compressionBlockSize, |
900 | MemoryPool& pool) { |
901 | switch (static_cast<int64_t>(kind)) { |
902 | case CompressionKind_NONE: { |
903 | return std::unique_ptr<BufferedOutputStream> |
904 | (new BufferedOutputStream( |
905 | pool, outStream, bufferCapacity, compressionBlockSize)); |
906 | } |
907 | case CompressionKind_ZLIB: { |
908 | int level = (strategy == CompressionStrategy_SPEED) ? |
909 | Z_BEST_SPEED + 1 : Z_DEFAULT_COMPRESSION; |
910 | return std::unique_ptr<BufferedOutputStream> |
911 | (new ZlibCompressionStream( |
912 | outStream, level, bufferCapacity, compressionBlockSize, pool)); |
913 | } |
914 | case CompressionKind_SNAPPY: |
915 | case CompressionKind_LZO: |
916 | case CompressionKind_LZ4: |
917 | default: |
918 | throw NotImplementedYet("compression codec" ); |
919 | } |
920 | } |
921 | |
922 | std::unique_ptr<SeekableInputStream> |
923 | createDecompressor(CompressionKind kind, |
924 | std::unique_ptr<SeekableInputStream> input, |
925 | uint64_t blockSize, |
926 | MemoryPool& pool) { |
927 | switch (static_cast<int64_t>(kind)) { |
928 | case CompressionKind_NONE: |
929 | return REDUNDANT_MOVE(input); |
930 | case CompressionKind_ZLIB: |
931 | return std::unique_ptr<SeekableInputStream> |
932 | (new ZlibDecompressionStream(std::move(input), blockSize, pool)); |
933 | case CompressionKind_SNAPPY: |
934 | return std::unique_ptr<SeekableInputStream> |
935 | (new SnappyDecompressionStream(std::move(input), blockSize, pool)); |
936 | case CompressionKind_LZO: |
937 | return std::unique_ptr<SeekableInputStream> |
938 | (new LzoDecompressionStream(std::move(input), blockSize, pool)); |
939 | case CompressionKind_LZ4: |
940 | return std::unique_ptr<SeekableInputStream> |
941 | (new Lz4DecompressionStream(std::move(input), blockSize, pool)); |
942 | default: { |
943 | std::ostringstream buffer; |
944 | buffer << "Unknown compression codec " << kind; |
945 | throw NotImplementedYet(buffer.str()); |
946 | } |
947 | } |
948 | } |
949 | |
950 | } |
951 | |