| 1 | #include "duckdb/common/gzip_stream.hpp" |
| 2 | |
| 3 | #include "duckdb/common/exception.hpp" |
| 4 | #include "duckdb/common/file_system.hpp" |
| 5 | #include "duckdb/common/fstream_util.hpp" |
| 6 | |
| 7 | #include "miniz.hpp" |
| 8 | |
| 9 | #include <limits> |
| 10 | |
| 11 | using namespace std; |
| 12 | using namespace duckdb; |
| 13 | |
/*

  GZIP file layout (offset, size, description):

  0      2 bytes  magic header  0x1f, 0x8b (\037 \213)
  2      1 byte   compression method
                     0: store (copied)
                     1: compress
                     2: pack
                     3: lzh
                     4..7: reserved
                     8: deflate
  3      1 byte   flags
                     bit 0 set: file probably ascii text
                     bit 1 set: continuation of multi-part gzip file, part number present
                     bit 2 set: extra field present
                     bit 3 set: original file name present
                     bit 4 set: file comment present
                     bit 5 set: file is encrypted, encryption header present
                     bit 6,7:   reserved
  4      4 bytes  file modification time in Unix format
  8      1 byte   extra flags (depend on compression method)
  9      1 byte   OS type
[
         2 bytes  optional part number (second part=1)
]?
[
         2 bytes  optional extra field length (e)
        (e)bytes  optional extra field
]?
[
           bytes  optional original file name, zero terminated
]?
[
           bytes  optional file comment, zero terminated
]?
[
        12 bytes  optional encryption header
]?
           bytes  compressed data
         4 bytes  crc32
         4 bytes  uncompressed input size modulo 2^32

*/
| 56 | |
| 57 | static idx_t consume_string(fstream &input) { |
| 58 | idx_t size = 1; // terminator |
| 59 | while (input.get() != '\0') { |
| 60 | size++; |
| 61 | } |
| 62 | return size; |
| 63 | } |
| 64 | |
// compression method identifier stored in header byte 2; deflate is the only one accepted here
static const uint8_t GZIP_COMPRESSION_DEFLATE = 0x08;

// bit masks for the flags byte (header byte 3), see the layout description above
static const uint8_t GZIP_FLAG_ASCII = 0x1;
static const uint8_t GZIP_FLAG_MULTIPART = 0x2;
static const uint8_t GZIP_FLAG_EXTRA = 0x4;
static const uint8_t GZIP_FLAG_NAME = 0x8;
static const uint8_t GZIP_FLAG_COMMENT = 0x10;
static const uint8_t GZIP_FLAG_ENCRYPT = 0x20;

// size in bytes of the fixed part of the header (offsets 0..9 in the layout description)
static const uint8_t GZIP_HEADER_MINSIZE = 10;

// every flag except GZIP_FLAG_NAME: archives with any of these bits set are rejected
static const unsigned char GZIP_FLAG_UNSUPPORTED =
    GZIP_FLAG_ASCII | GZIP_FLAG_MULTIPART | GZIP_FLAG_EXTRA | GZIP_FLAG_COMMENT | GZIP_FLAG_ENCRYPT;
| 78 | |
| 79 | void GzipStreamBuf::initialize() { |
| 80 | if (is_initialized) { |
| 81 | return; |
| 82 | } |
| 83 | assert(BUFFER_SIZE >= 3); // found to work fine with 3 |
| 84 | uint8_t gzip_hdr[10]; |
| 85 | data_start = GZIP_HEADER_MINSIZE; |
| 86 | |
| 87 | in_buff = new data_t[BUFFER_SIZE]; |
| 88 | in_buff_start = in_buff; |
| 89 | in_buff_end = in_buff; |
| 90 | out_buff = new data_t[BUFFER_SIZE]; |
| 91 | |
| 92 | mz_stream_ptr = new mz_stream(); |
| 93 | // TODO use custom alloc/free methods in miniz to throw exceptions on OOM |
| 94 | |
| 95 | FstreamUtil::OpenFile(filename, input, ios::in | ios::binary); |
| 96 | |
| 97 | input.read((char *)gzip_hdr, GZIP_HEADER_MINSIZE); |
| 98 | if (!input) { |
| 99 | throw Exception("Input is not a GZIP stream" ); |
| 100 | } |
| 101 | if (gzip_hdr[0] != 0x1F || gzip_hdr[1] != 0x8B) { // magic header |
| 102 | throw Exception("Input is not a GZIP stream" ); |
| 103 | } |
| 104 | if (gzip_hdr[2] != GZIP_COMPRESSION_DEFLATE) { // compression method |
| 105 | throw Exception("Unsupported GZIP compression method" ); |
| 106 | } |
| 107 | if (gzip_hdr[3] & GZIP_FLAG_UNSUPPORTED) { |
| 108 | throw Exception("Unsupported GZIP archive" ); |
| 109 | } |
| 110 | |
| 111 | if (gzip_hdr[3] & GZIP_FLAG_NAME) { |
| 112 | input.seekg(data_start, input.beg); |
| 113 | data_start += consume_string(input); |
| 114 | } |
| 115 | input.seekg(data_start, input.beg); |
| 116 | // stream is now set to beginning of payload data |
| 117 | |
| 118 | auto ret = mz_inflateInit2((mz_streamp)mz_stream_ptr, -MZ_DEFAULT_WINDOW_BITS); |
| 119 | if (ret != MZ_OK) { |
| 120 | throw Exception("Failed to initialize miniz" ); |
| 121 | } |
| 122 | // initialize eback, gptr, egptr |
| 123 | setg((char *)out_buff, (char *)out_buff, (char *)out_buff); |
| 124 | is_initialized = true; |
| 125 | } |
| 126 | |
| 127 | streambuf::int_type GzipStreamBuf::underflow() { |
| 128 | if (!is_initialized) { |
| 129 | initialize(); |
| 130 | } |
| 131 | |
| 132 | // adapted from https://github.com/mateidavid/zstr |
| 133 | auto zstrm_p = (mz_streamp)mz_stream_ptr; |
| 134 | if (!zstrm_p) { |
| 135 | return traits_type::eof(); |
| 136 | } |
| 137 | |
| 138 | if (gptr() == egptr()) { |
| 139 | // pointers for free region in output buffer |
| 140 | auto out_buff_free_start = out_buff; |
| 141 | do { |
| 142 | assert(in_buff_start <= in_buff_end); |
| 143 | assert(in_buff_end <= in_buff_start + BUFFER_SIZE); |
| 144 | |
| 145 | // read more input if none available |
| 146 | if (in_buff_start == in_buff_end) { |
| 147 | // empty input buffer: refill from the start |
| 148 | in_buff_start = in_buff; |
| 149 | std::streamsize sz = input.rdbuf()->sgetn((char *)in_buff, BUFFER_SIZE); |
| 150 | if (sz == 0) { |
| 151 | break; // end of input |
| 152 | } |
| 153 | in_buff_end = in_buff + sz; |
| 154 | } |
| 155 | |
| 156 | // actually decompress |
| 157 | assert(zstrm_p); |
| 158 | zstrm_p->next_in = (data_ptr_t)in_buff_start; |
| 159 | assert(in_buff_end - in_buff_start < numeric_limits<int32_t>::max()); |
| 160 | zstrm_p->avail_in = (uint32_t)(in_buff_end - in_buff_start); |
| 161 | zstrm_p->next_out = (data_ptr_t)out_buff_free_start; |
| 162 | assert((out_buff + BUFFER_SIZE) - out_buff_free_start < numeric_limits<int32_t>::max()); |
| 163 | zstrm_p->avail_out = (uint32_t)((out_buff + BUFFER_SIZE) - out_buff_free_start); |
| 164 | auto ret = mz_inflate(zstrm_p, MZ_NO_FLUSH); |
| 165 | if (ret != MZ_OK && ret != MZ_STREAM_END) { |
| 166 | throw Exception(mz_error(ret)); |
| 167 | } |
| 168 | // update pointers following inflate() |
| 169 | in_buff_start = (data_ptr_t)zstrm_p->next_in; |
| 170 | in_buff_end = in_buff_start + zstrm_p->avail_in; |
| 171 | out_buff_free_start = (data_ptr_t)zstrm_p->next_out; |
| 172 | assert(out_buff_free_start + zstrm_p->avail_out == out_buff + BUFFER_SIZE); |
| 173 | // if stream ended, deallocate inflator |
| 174 | if (ret == MZ_STREAM_END) { |
| 175 | mz_inflateEnd(zstrm_p); |
| 176 | delete zstrm_p; |
| 177 | mz_stream_ptr = nullptr; |
| 178 | break; |
| 179 | } |
| 180 | |
| 181 | } while (out_buff_free_start == out_buff); |
| 182 | // 2 exit conditions: |
| 183 | // - end of input: there might or might not be output available |
| 184 | // - out_buff_free_start != out_buff: output available |
| 185 | setg((char *)out_buff, (char *)out_buff, (char *)out_buff_free_start); |
| 186 | } |
| 187 | |
| 188 | // ensure all those pointers point at something sane |
| 189 | assert(out_buff); |
| 190 | assert(gptr() <= egptr()); |
| 191 | assert(eback() == (char *)out_buff); |
| 192 | assert(gptr() >= (char *)out_buff); |
| 193 | assert(gptr() <= (char *)out_buff + BUFFER_SIZE); |
| 194 | assert(egptr() >= (char *)out_buff); |
| 195 | assert(egptr() <= (char *)out_buff + BUFFER_SIZE); |
| 196 | assert(gptr() <= egptr()); |
| 197 | |
| 198 | return this->gptr() == this->egptr() ? traits_type::eof() : traits_type::to_int_type(*this->gptr()); |
| 199 | } |
| 200 | |
| 201 | GzipStreamBuf::~GzipStreamBuf() { |
| 202 | delete[] in_buff; |
| 203 | delete[] out_buff; |
| 204 | auto zstrm_p = (mz_streamp)mz_stream_ptr; |
| 205 | if (zstrm_p) { |
| 206 | mz_inflateEnd(zstrm_p); |
| 207 | } |
| 208 | delete zstrm_p; |
| 209 | } |
| 210 | |