1#include "duckdb/common/gzip_file_system.hpp"
2#include "duckdb/common/exception.hpp"
3#include "duckdb/common/file_system.hpp"
4
5#include "miniz.hpp"
6#include "miniz_wrapper.hpp"
7
8#include "duckdb/common/limits.hpp"
9
10namespace duckdb {
11
/*

  GZIP member layout

  offset  size      content
  0       2 bytes   magic header 0x1f, 0x8b (\037 \213)
  2       1 byte    compression method
                      0: store (copied)
                      1: compress
                      2: pack
                      3: lzh
                      4..7: reserved
                      8: deflate
  3       1 byte    flags
                      bit 0 set: file probably ascii text
                      bit 1 set: continuation of multi-part gzip file, part number present
                      bit 2 set: extra field present
                      bit 3 set: original file name present
                      bit 4 set: file comment present
                      bit 5 set: file is encrypted, encryption header present
                      bit 6,7:   reserved
  4       4 bytes   file modification time in Unix format
  8       1 byte    extra flags (depend on compression method)
  9       1 byte    OS type
  [
          2 bytes   optional part number (second part=1)
  ]?
  [
          2 bytes   optional extra field length (e)
          (e)bytes  optional extra field
  ]?
  [
            bytes   optional original file name, zero terminated
  ]?
  [
            bytes   optional file comment, zero terminated
  ]?
  [
         12 bytes   optional encryption header
  ]?
            bytes   compressed data
          4 bytes   crc32
          4 bytes   uncompressed input size modulo 2^32

 */
54
55static idx_t GZipConsumeString(FileHandle &input) {
56 idx_t size = 1; // terminator
57 char buffer[1];
58 while (input.Read(buffer, nr_bytes: 1) == 1) {
59 if (buffer[0] == '\0') {
60 break;
61 }
62 size++;
63 }
64 return size;
65}
66
67struct MiniZStreamWrapper : public StreamWrapper {
68 ~MiniZStreamWrapper() override;
69
70 CompressedFile *file = nullptr;
71 duckdb_miniz::mz_stream *mz_stream_ptr = nullptr;
72 bool writing = false;
73 duckdb_miniz::mz_ulong crc;
74 idx_t total_size;
75
76public:
77 void Initialize(CompressedFile &file, bool write) override;
78
79 bool Read(StreamData &stream_data) override;
80 void Write(CompressedFile &file, StreamData &stream_data, data_ptr_t buffer, int64_t nr_bytes) override;
81
82 void Close() override;
83
84 void FlushStream();
85};
86
87MiniZStreamWrapper::~MiniZStreamWrapper() {
88 // avoid closing if destroyed during stack unwinding
89 if (Exception::UncaughtException()) {
90 return;
91 }
92 try {
93 MiniZStreamWrapper::Close();
94 } catch (...) {
95 }
96}
97
98void MiniZStreamWrapper::Initialize(CompressedFile &file, bool write) {
99 Close();
100 this->file = &file;
101 mz_stream_ptr = new duckdb_miniz::mz_stream();
102 memset(s: mz_stream_ptr, c: 0, n: sizeof(duckdb_miniz::mz_stream));
103 this->writing = write;
104
105 // TODO use custom alloc/free methods in miniz to throw exceptions on OOM
106 uint8_t gzip_hdr[GZIP_HEADER_MINSIZE];
107 if (write) {
108 crc = MZ_CRC32_INIT;
109 total_size = 0;
110
111 MiniZStream::InitializeGZIPHeader(gzip_header: gzip_hdr);
112 file.child_handle->Write(buffer: gzip_hdr, nr_bytes: GZIP_HEADER_MINSIZE);
113
114 auto ret = mz_deflateInit2(pStream: (duckdb_miniz::mz_streamp)mz_stream_ptr, level: duckdb_miniz::MZ_DEFAULT_LEVEL, MZ_DEFLATED,
115 window_bits: -MZ_DEFAULT_WINDOW_BITS, mem_level: 1, strategy: 0);
116 if (ret != duckdb_miniz::MZ_OK) {
117 throw InternalException("Failed to initialize miniz");
118 }
119 } else {
120 idx_t data_start = GZIP_HEADER_MINSIZE;
121 auto read_count = file.child_handle->Read(buffer: gzip_hdr, nr_bytes: GZIP_HEADER_MINSIZE);
122 GZipFileSystem::VerifyGZIPHeader(gzip_hdr, read_count);
123 // Skip over the extra field if necessary
124 if (gzip_hdr[3] & GZIP_FLAG_EXTRA) {
125 uint8_t gzip_xlen[2];
126 file.child_handle->Seek(location: data_start);
127 file.child_handle->Read(buffer: gzip_xlen, nr_bytes: 2);
128 idx_t xlen = (uint8_t)gzip_xlen[0] | (uint8_t)gzip_xlen[1] << 8;
129 data_start += xlen + 2;
130 }
131 // Skip over the file name if necessary
132 if (gzip_hdr[3] & GZIP_FLAG_NAME) {
133 file.child_handle->Seek(location: data_start);
134 data_start += GZipConsumeString(input&: *file.child_handle);
135 }
136 file.child_handle->Seek(location: data_start);
137 // stream is now set to beginning of payload data
138 auto ret = duckdb_miniz::mz_inflateInit2(pStream: (duckdb_miniz::mz_streamp)mz_stream_ptr, window_bits: -MZ_DEFAULT_WINDOW_BITS);
139 if (ret != duckdb_miniz::MZ_OK) {
140 throw InternalException("Failed to initialize miniz");
141 }
142 }
143}
144
145bool MiniZStreamWrapper::Read(StreamData &sd) {
146 // Handling for the concatenated files
147 if (sd.refresh) {
148 sd.refresh = false;
149 auto body_ptr = sd.in_buff_start + GZIP_FOOTER_SIZE;
150 uint8_t gzip_hdr[GZIP_HEADER_MINSIZE];
151 memcpy(dest: gzip_hdr, src: body_ptr, n: GZIP_HEADER_MINSIZE);
152 GZipFileSystem::VerifyGZIPHeader(gzip_hdr, read_count: GZIP_HEADER_MINSIZE);
153 body_ptr += GZIP_HEADER_MINSIZE;
154 if (gzip_hdr[3] & GZIP_FLAG_EXTRA) {
155 idx_t xlen = (uint8_t)*body_ptr | (uint8_t) * (body_ptr + 1) << 8;
156 body_ptr += xlen + 2;
157 if (GZIP_FOOTER_SIZE + GZIP_HEADER_MINSIZE + 2 + xlen >= GZIP_HEADER_MAXSIZE) {
158 throw InternalException("Extra field resulting in GZIP header larger than defined maximum (%d)",
159 GZIP_HEADER_MAXSIZE);
160 }
161 }
162 if (gzip_hdr[3] & GZIP_FLAG_NAME) {
163 char c;
164 do {
165 c = *body_ptr;
166 body_ptr++;
167 } while (c != '\0' && body_ptr < sd.in_buff_end);
168 if ((idx_t)(body_ptr - sd.in_buff_start) >= GZIP_HEADER_MAXSIZE) {
169 throw InternalException("Filename resulting in GZIP header larger than defined maximum (%d)",
170 GZIP_HEADER_MAXSIZE);
171 }
172 }
173 sd.in_buff_start = body_ptr;
174 if (sd.in_buff_end - sd.in_buff_start < 1) {
175 Close();
176 return true;
177 }
178 duckdb_miniz::mz_inflateEnd(pStream: mz_stream_ptr);
179 auto sta = duckdb_miniz::mz_inflateInit2(pStream: (duckdb_miniz::mz_streamp)mz_stream_ptr, window_bits: -MZ_DEFAULT_WINDOW_BITS);
180 if (sta != duckdb_miniz::MZ_OK) {
181 throw InternalException("Failed to initialize miniz");
182 }
183 }
184
185 // actually decompress
186 mz_stream_ptr->next_in = sd.in_buff_start;
187 D_ASSERT(sd.in_buff_end - sd.in_buff_start < NumericLimits<int32_t>::Maximum());
188 mz_stream_ptr->avail_in = (uint32_t)(sd.in_buff_end - sd.in_buff_start);
189 mz_stream_ptr->next_out = data_ptr_cast(src: sd.out_buff_end);
190 mz_stream_ptr->avail_out = (uint32_t)((sd.out_buff.get() + sd.out_buf_size) - sd.out_buff_end);
191 auto ret = duckdb_miniz::mz_inflate(pStream: mz_stream_ptr, flush: duckdb_miniz::MZ_NO_FLUSH);
192 if (ret != duckdb_miniz::MZ_OK && ret != duckdb_miniz::MZ_STREAM_END) {
193 throw IOException("Failed to decode gzip stream: %s", duckdb_miniz::mz_error(err: ret));
194 }
195 // update pointers following inflate()
196 sd.in_buff_start = (data_ptr_t)mz_stream_ptr->next_in; // NOLINT
197 sd.in_buff_end = sd.in_buff_start + mz_stream_ptr->avail_in;
198 sd.out_buff_end = data_ptr_cast(src: mz_stream_ptr->next_out);
199 D_ASSERT(sd.out_buff_end + mz_stream_ptr->avail_out == sd.out_buff.get() + sd.out_buf_size);
200
201 // if stream ended, deallocate inflator
202 if (ret == duckdb_miniz::MZ_STREAM_END) {
203 // Last read from file done and remaining bytes only for footer or less
204 if ((sd.in_buff_end < sd.in_buff.get() + sd.in_buf_size) && mz_stream_ptr->avail_in <= GZIP_FOOTER_SIZE) {
205 Close();
206 return true;
207 }
208 if (mz_stream_ptr->avail_in > GZIP_FOOTER_SIZE) {
209 // Definitely not concatenated gzip
210 if (*(sd.in_buff_start + GZIP_FOOTER_SIZE) != 0x1F) {
211 Close();
212 return true;
213 }
214 }
215 // Concatenated GZIP potentially coming up - refresh input buffer
216 sd.refresh = true;
217 }
218 return false;
219}
220
221void MiniZStreamWrapper::Write(CompressedFile &file, StreamData &sd, data_ptr_t uncompressed_data,
222 int64_t uncompressed_size) {
223 // update the src and the total size
224 crc = duckdb_miniz::mz_crc32(crc, ptr: reinterpret_cast<const unsigned char *>(uncompressed_data), buf_len: uncompressed_size);
225 total_size += uncompressed_size;
226
227 auto remaining = uncompressed_size;
228 while (remaining > 0) {
229 idx_t output_remaining = (sd.out_buff.get() + sd.out_buf_size) - sd.out_buff_start;
230
231 mz_stream_ptr->next_in = reinterpret_cast<const unsigned char *>(uncompressed_data);
232 mz_stream_ptr->avail_in = remaining;
233 mz_stream_ptr->next_out = sd.out_buff_start;
234 mz_stream_ptr->avail_out = output_remaining;
235
236 auto res = mz_deflate(pStream: mz_stream_ptr, flush: duckdb_miniz::MZ_NO_FLUSH);
237 if (res != duckdb_miniz::MZ_OK) {
238 D_ASSERT(res != duckdb_miniz::MZ_STREAM_END);
239 throw InternalException("Failed to compress GZIP block");
240 }
241 sd.out_buff_start += output_remaining - mz_stream_ptr->avail_out;
242 if (mz_stream_ptr->avail_out == 0) {
243 // no more output buffer available: flush
244 file.child_handle->Write(buffer: sd.out_buff.get(), nr_bytes: sd.out_buff_start - sd.out_buff.get());
245 sd.out_buff_start = sd.out_buff.get();
246 }
247 idx_t written = remaining - mz_stream_ptr->avail_in;
248 uncompressed_data += written;
249 remaining = mz_stream_ptr->avail_in;
250 }
251}
252
253void MiniZStreamWrapper::FlushStream() {
254 auto &sd = file->stream_data;
255 mz_stream_ptr->next_in = nullptr;
256 mz_stream_ptr->avail_in = 0;
257 while (true) {
258 auto output_remaining = (sd.out_buff.get() + sd.out_buf_size) - sd.out_buff_start;
259 mz_stream_ptr->next_out = sd.out_buff_start;
260 mz_stream_ptr->avail_out = output_remaining;
261
262 auto res = mz_deflate(pStream: mz_stream_ptr, flush: duckdb_miniz::MZ_FINISH);
263 sd.out_buff_start += (output_remaining - mz_stream_ptr->avail_out);
264 if (sd.out_buff_start > sd.out_buff.get()) {
265 file->child_handle->Write(buffer: sd.out_buff.get(), nr_bytes: sd.out_buff_start - sd.out_buff.get());
266 sd.out_buff_start = sd.out_buff.get();
267 }
268 if (res == duckdb_miniz::MZ_STREAM_END) {
269 break;
270 }
271 if (res != duckdb_miniz::MZ_OK) {
272 throw InternalException("Failed to compress GZIP block");
273 }
274 }
275}
276
277void MiniZStreamWrapper::Close() {
278 if (!mz_stream_ptr) {
279 return;
280 }
281 if (writing) {
282 // flush anything remaining in the stream
283 FlushStream();
284
285 // write the footer
286 unsigned char gzip_footer[MiniZStream::GZIP_FOOTER_SIZE];
287 MiniZStream::InitializeGZIPFooter(gzip_footer, crc, uncompressed_size: total_size);
288 file->child_handle->Write(buffer: gzip_footer, nr_bytes: MiniZStream::GZIP_FOOTER_SIZE);
289
290 duckdb_miniz::mz_deflateEnd(pStream: mz_stream_ptr);
291 } else {
292 duckdb_miniz::mz_inflateEnd(pStream: mz_stream_ptr);
293 }
294 delete mz_stream_ptr;
295 mz_stream_ptr = nullptr;
296 file = nullptr;
297}
298
299class GZipFile : public CompressedFile {
300public:
301 GZipFile(unique_ptr<FileHandle> child_handle_p, const string &path, bool write)
302 : CompressedFile(gzip_fs, std::move(child_handle_p), path) {
303 Initialize(write);
304 }
305
306 GZipFileSystem gzip_fs;
307};
308
309void GZipFileSystem::VerifyGZIPHeader(uint8_t gzip_hdr[], idx_t read_count) {
310 // check for incorrectly formatted files
311 if (read_count != GZIP_HEADER_MINSIZE) {
312 throw IOException("Input is not a GZIP stream");
313 }
314 if (gzip_hdr[0] != 0x1F || gzip_hdr[1] != 0x8B) { // magic header
315 throw IOException("Input is not a GZIP stream");
316 }
317 if (gzip_hdr[2] != GZIP_COMPRESSION_DEFLATE) { // compression method
318 throw IOException("Unsupported GZIP compression method");
319 }
320 if (gzip_hdr[3] & GZIP_FLAG_UNSUPPORTED) {
321 throw IOException("Unsupported GZIP archive");
322 }
323}
324
325string GZipFileSystem::UncompressGZIPString(const string &in) {
326 // decompress file
327 auto body_ptr = in.data();
328
329 auto mz_stream_ptr = new duckdb_miniz::mz_stream();
330 memset(s: mz_stream_ptr, c: 0, n: sizeof(duckdb_miniz::mz_stream));
331
332 uint8_t gzip_hdr[GZIP_HEADER_MINSIZE];
333
334 // check for incorrectly formatted files
335
336 // TODO this is mostly the same as gzip_file_system.cpp
337 if (in.size() < GZIP_HEADER_MINSIZE) {
338 throw IOException("Input is not a GZIP stream");
339 }
340 memcpy(dest: gzip_hdr, src: body_ptr, n: GZIP_HEADER_MINSIZE);
341 body_ptr += GZIP_HEADER_MINSIZE;
342 GZipFileSystem::VerifyGZIPHeader(gzip_hdr, read_count: GZIP_HEADER_MINSIZE);
343
344 if (gzip_hdr[3] & GZIP_FLAG_EXTRA) {
345 throw IOException("Extra field in a GZIP stream unsupported");
346 }
347
348 if (gzip_hdr[3] & GZIP_FLAG_NAME) {
349 char c;
350 do {
351 c = *body_ptr;
352 body_ptr++;
353 } while (c != '\0' && (idx_t)(body_ptr - in.data()) < in.size());
354 }
355
356 // stream is now set to beginning of payload data
357 auto status = duckdb_miniz::mz_inflateInit2(pStream: mz_stream_ptr, window_bits: -MZ_DEFAULT_WINDOW_BITS);
358 if (status != duckdb_miniz::MZ_OK) {
359 throw InternalException("Failed to initialize miniz");
360 }
361
362 auto bytes_remaining = in.size() - (body_ptr - in.data());
363 mz_stream_ptr->next_in = const_uchar_ptr_cast(src: body_ptr);
364 mz_stream_ptr->avail_in = bytes_remaining;
365
366 unsigned char decompress_buffer[BUFSIZ];
367 string decompressed;
368
369 while (status == duckdb_miniz::MZ_OK) {
370 mz_stream_ptr->next_out = decompress_buffer;
371 mz_stream_ptr->avail_out = sizeof(decompress_buffer);
372 status = mz_inflate(pStream: mz_stream_ptr, flush: duckdb_miniz::MZ_NO_FLUSH);
373 if (status != duckdb_miniz::MZ_STREAM_END && status != duckdb_miniz::MZ_OK) {
374 throw IOException("Failed to uncompress");
375 }
376 decompressed.append(s: char_ptr_cast(src: decompress_buffer), n: mz_stream_ptr->total_out - decompressed.size());
377 }
378 duckdb_miniz::mz_inflateEnd(pStream: mz_stream_ptr);
379 if (decompressed.empty()) {
380 throw IOException("Failed to uncompress");
381 }
382 return decompressed;
383}
384
385unique_ptr<FileHandle> GZipFileSystem::OpenCompressedFile(unique_ptr<FileHandle> handle, bool write) {
386 auto path = handle->path;
387 return make_uniq<GZipFile>(args: std::move(handle), args&: path, args&: write);
388}
389
390unique_ptr<StreamWrapper> GZipFileSystem::CreateStream() {
391 return make_uniq<MiniZStreamWrapper>();
392}
393
394idx_t GZipFileSystem::InBufferSize() {
395 return BUFFER_SIZE;
396}
397
398idx_t GZipFileSystem::OutBufferSize() {
399 return BUFFER_SIZE;
400}
401
402} // namespace duckdb
403