1#include "duckdb/common/gzip_stream.hpp"
2
3#include "duckdb/common/exception.hpp"
4#include "duckdb/common/file_system.hpp"
5#include "duckdb/common/fstream_util.hpp"
6
7#include "miniz.hpp"
8
9#include <limits>
10
11using namespace std;
12using namespace duckdb;
13
14/*
15
16 0 2 bytes magic header 0x1f, 0x8b (\037 \213)
17 2 1 byte compression method
18 0: store (copied)
19 1: compress
20 2: pack
21 3: lzh
22 4..7: reserved
23 8: deflate
24 3 1 byte flags
25 bit 0 set: file probably ascii text
26 bit 1 set: continuation of multi-part gzip file, part number present
27 bit 2 set: extra field present
28 bit 3 set: original file name present
29 bit 4 set: file comment present
30 bit 5 set: file is encrypted, encryption header present
31 bit 6,7: reserved
32 4 4 bytes file modification time in Unix format
33 8 1 byte extra flags (depend on compression method)
34 9 1 byte OS type
35[
36 2 bytes optional part number (second part=1)
37]?
38[
39 2 bytes optional extra field length (e)
40 (e)bytes optional extra field
41]?
42[
43 bytes optional original file name, zero terminated
44]?
45[
46 bytes optional file comment, zero terminated
47]?
48[
49 12 bytes optional encryption header
50]?
51 bytes compressed data
52 4 bytes crc32
53 4 bytes uncompressed input size modulo 2^32
54
55 */
56
57static idx_t consume_string(fstream &input) {
58 idx_t size = 1; // terminator
59 while (input.get() != '\0') {
60 size++;
61 }
62 return size;
63}
64
// Value of the "compression method" header byte for deflate, the only method handled here.
static constexpr uint8_t GZIP_COMPRESSION_DEFLATE = 8;

// Bits of the "flags" header byte, as laid out in the format description above.
static constexpr uint8_t GZIP_FLAG_ASCII = 1 << 0;     // file probably ascii text
static constexpr uint8_t GZIP_FLAG_MULTIPART = 1 << 1; // continuation of a multi-part file
static constexpr uint8_t GZIP_FLAG_EXTRA = 1 << 2;     // extra field present
static constexpr uint8_t GZIP_FLAG_NAME = 1 << 3;      // original file name present
static constexpr uint8_t GZIP_FLAG_COMMENT = 1 << 4;   // file comment present
static constexpr uint8_t GZIP_FLAG_ENCRYPT = 1 << 5;   // encryption header present

// Size in bytes of the fixed part of the header (everything before the optional fields).
static constexpr uint8_t GZIP_HEADER_MINSIZE = 10;

// Every flag the reader rejects; GZIP_FLAG_NAME is the only optional field it can skip.
static constexpr unsigned char GZIP_FLAG_UNSUPPORTED =
    GZIP_FLAG_ENCRYPT | GZIP_FLAG_COMMENT | GZIP_FLAG_EXTRA | GZIP_FLAG_MULTIPART | GZIP_FLAG_ASCII;
78
79void GzipStreamBuf::initialize() {
80 if (is_initialized) {
81 return;
82 }
83 assert(BUFFER_SIZE >= 3); // found to work fine with 3
84 uint8_t gzip_hdr[10];
85 data_start = GZIP_HEADER_MINSIZE;
86
87 in_buff = new data_t[BUFFER_SIZE];
88 in_buff_start = in_buff;
89 in_buff_end = in_buff;
90 out_buff = new data_t[BUFFER_SIZE];
91
92 mz_stream_ptr = new mz_stream();
93 // TODO use custom alloc/free methods in miniz to throw exceptions on OOM
94
95 FstreamUtil::OpenFile(filename, input, ios::in | ios::binary);
96
97 input.read((char *)gzip_hdr, GZIP_HEADER_MINSIZE);
98 if (!input) {
99 throw Exception("Input is not a GZIP stream");
100 }
101 if (gzip_hdr[0] != 0x1F || gzip_hdr[1] != 0x8B) { // magic header
102 throw Exception("Input is not a GZIP stream");
103 }
104 if (gzip_hdr[2] != GZIP_COMPRESSION_DEFLATE) { // compression method
105 throw Exception("Unsupported GZIP compression method");
106 }
107 if (gzip_hdr[3] & GZIP_FLAG_UNSUPPORTED) {
108 throw Exception("Unsupported GZIP archive");
109 }
110
111 if (gzip_hdr[3] & GZIP_FLAG_NAME) {
112 input.seekg(data_start, input.beg);
113 data_start += consume_string(input);
114 }
115 input.seekg(data_start, input.beg);
116 // stream is now set to beginning of payload data
117
118 auto ret = mz_inflateInit2((mz_streamp)mz_stream_ptr, -MZ_DEFAULT_WINDOW_BITS);
119 if (ret != MZ_OK) {
120 throw Exception("Failed to initialize miniz");
121 }
122 // initialize eback, gptr, egptr
123 setg((char *)out_buff, (char *)out_buff, (char *)out_buff);
124 is_initialized = true;
125}
126
127streambuf::int_type GzipStreamBuf::underflow() {
128 if (!is_initialized) {
129 initialize();
130 }
131
132 // adapted from https://github.com/mateidavid/zstr
133 auto zstrm_p = (mz_streamp)mz_stream_ptr;
134 if (!zstrm_p) {
135 return traits_type::eof();
136 }
137
138 if (gptr() == egptr()) {
139 // pointers for free region in output buffer
140 auto out_buff_free_start = out_buff;
141 do {
142 assert(in_buff_start <= in_buff_end);
143 assert(in_buff_end <= in_buff_start + BUFFER_SIZE);
144
145 // read more input if none available
146 if (in_buff_start == in_buff_end) {
147 // empty input buffer: refill from the start
148 in_buff_start = in_buff;
149 std::streamsize sz = input.rdbuf()->sgetn((char *)in_buff, BUFFER_SIZE);
150 if (sz == 0) {
151 break; // end of input
152 }
153 in_buff_end = in_buff + sz;
154 }
155
156 // actually decompress
157 assert(zstrm_p);
158 zstrm_p->next_in = (data_ptr_t)in_buff_start;
159 assert(in_buff_end - in_buff_start < numeric_limits<int32_t>::max());
160 zstrm_p->avail_in = (uint32_t)(in_buff_end - in_buff_start);
161 zstrm_p->next_out = (data_ptr_t)out_buff_free_start;
162 assert((out_buff + BUFFER_SIZE) - out_buff_free_start < numeric_limits<int32_t>::max());
163 zstrm_p->avail_out = (uint32_t)((out_buff + BUFFER_SIZE) - out_buff_free_start);
164 auto ret = mz_inflate(zstrm_p, MZ_NO_FLUSH);
165 if (ret != MZ_OK && ret != MZ_STREAM_END) {
166 throw Exception(mz_error(ret));
167 }
168 // update pointers following inflate()
169 in_buff_start = (data_ptr_t)zstrm_p->next_in;
170 in_buff_end = in_buff_start + zstrm_p->avail_in;
171 out_buff_free_start = (data_ptr_t)zstrm_p->next_out;
172 assert(out_buff_free_start + zstrm_p->avail_out == out_buff + BUFFER_SIZE);
173 // if stream ended, deallocate inflator
174 if (ret == MZ_STREAM_END) {
175 mz_inflateEnd(zstrm_p);
176 delete zstrm_p;
177 mz_stream_ptr = nullptr;
178 break;
179 }
180
181 } while (out_buff_free_start == out_buff);
182 // 2 exit conditions:
183 // - end of input: there might or might not be output available
184 // - out_buff_free_start != out_buff: output available
185 setg((char *)out_buff, (char *)out_buff, (char *)out_buff_free_start);
186 }
187
188 // ensure all those pointers point at something sane
189 assert(out_buff);
190 assert(gptr() <= egptr());
191 assert(eback() == (char *)out_buff);
192 assert(gptr() >= (char *)out_buff);
193 assert(gptr() <= (char *)out_buff + BUFFER_SIZE);
194 assert(egptr() >= (char *)out_buff);
195 assert(egptr() <= (char *)out_buff + BUFFER_SIZE);
196 assert(gptr() <= egptr());
197
198 return this->gptr() == this->egptr() ? traits_type::eof() : traits_type::to_int_type(*this->gptr());
199}
200
201GzipStreamBuf::~GzipStreamBuf() {
202 delete[] in_buff;
203 delete[] out_buff;
204 auto zstrm_p = (mz_streamp)mz_stream_ptr;
205 if (zstrm_p) {
206 mz_inflateEnd(zstrm_p);
207 }
208 delete zstrm_p;
209}
210