1 | #include "duckdb/common/gzip_stream.hpp" |
2 | |
3 | #include "duckdb/common/exception.hpp" |
4 | #include "duckdb/common/file_system.hpp" |
5 | #include "duckdb/common/fstream_util.hpp" |
6 | |
7 | #include "miniz.hpp" |
8 | |
9 | #include <limits> |
10 | |
11 | using namespace std; |
12 | using namespace duckdb; |
13 | |
14 | /* |
15 | |
16 | 0 2 bytes magic header 0x1f, 0x8b (\037 \213) |
17 | 2 1 byte compression method |
18 | 0: store (copied) |
19 | 1: compress |
20 | 2: pack |
21 | 3: lzh |
22 | 4..7: reserved |
23 | 8: deflate |
24 | 3 1 byte flags |
25 | bit 0 set: file probably ascii text |
26 | bit 1 set: continuation of multi-part gzip file, part number present |
27 | bit 2 set: extra field present |
28 | bit 3 set: original file name present |
29 | bit 4 set: file comment present |
30 | bit 5 set: file is encrypted, encryption header present |
31 | bit 6,7: reserved |
32 | 4 4 bytes file modification time in Unix format |
33 | 8 1 byte extra flags (depend on compression method) |
34 | 9 1 byte OS type |
35 | [ |
36 | 2 bytes optional part number (second part=1) |
37 | ]? |
38 | [ |
39 | 2 bytes optional extra field length (e) |
40 | (e)bytes optional extra field |
41 | ]? |
42 | [ |
43 | bytes optional original file name, zero terminated |
44 | ]? |
45 | [ |
46 | bytes optional file comment, zero terminated |
47 | ]? |
48 | [ |
49 | 12 bytes optional encryption header |
50 | ]? |
51 | bytes compressed data |
52 | 4 bytes crc32 |
53 | 4 bytes uncompressed input size modulo 2^32 |
54 | |
55 | */ |
56 | |
57 | static idx_t consume_string(fstream &input) { |
58 | idx_t size = 1; // terminator |
59 | while (input.get() != '\0') { |
60 | size++; |
61 | } |
62 | return size; |
63 | } |
64 | |
// Value of the "compression method" header byte that denotes deflate.
static const uint8_t GZIP_COMPRESSION_DEFLATE = 0x08;

// Bits of the "flags" header byte, in the order listed in the format description above.
static const uint8_t GZIP_FLAG_ASCII = 0x1;     // bit 0: file probably ascii text
static const uint8_t GZIP_FLAG_MULTIPART = 0x2; // bit 1: continuation of multi-part file
static const uint8_t GZIP_FLAG_EXTRA = 0x4;     // bit 2: extra field present
static const uint8_t GZIP_FLAG_NAME = 0x8;      // bit 3: original file name present
static const uint8_t GZIP_FLAG_COMMENT = 0x10;  // bit 4: file comment present
static const uint8_t GZIP_FLAG_ENCRYPT = 0x20;  // bit 5: file is encrypted

// Size in bytes of the fixed part of the header (magic through OS type).
static const uint8_t GZIP_HEADER_MINSIZE = 10;

// Flags whose presence makes the reader reject the file. GZIP_FLAG_NAME is deliberately
// not part of this mask: the file name is the only optional field the reader skips over.
static const unsigned char GZIP_FLAG_UNSUPPORTED =
    GZIP_FLAG_ASCII | GZIP_FLAG_MULTIPART | GZIP_FLAG_EXTRA | GZIP_FLAG_COMMENT | GZIP_FLAG_ENCRYPT;
78 | |
79 | void GzipStreamBuf::initialize() { |
80 | if (is_initialized) { |
81 | return; |
82 | } |
83 | assert(BUFFER_SIZE >= 3); // found to work fine with 3 |
84 | uint8_t gzip_hdr[10]; |
85 | data_start = GZIP_HEADER_MINSIZE; |
86 | |
87 | in_buff = new data_t[BUFFER_SIZE]; |
88 | in_buff_start = in_buff; |
89 | in_buff_end = in_buff; |
90 | out_buff = new data_t[BUFFER_SIZE]; |
91 | |
92 | mz_stream_ptr = new mz_stream(); |
93 | // TODO use custom alloc/free methods in miniz to throw exceptions on OOM |
94 | |
95 | FstreamUtil::OpenFile(filename, input, ios::in | ios::binary); |
96 | |
97 | input.read((char *)gzip_hdr, GZIP_HEADER_MINSIZE); |
98 | if (!input) { |
99 | throw Exception("Input is not a GZIP stream" ); |
100 | } |
101 | if (gzip_hdr[0] != 0x1F || gzip_hdr[1] != 0x8B) { // magic header |
102 | throw Exception("Input is not a GZIP stream" ); |
103 | } |
104 | if (gzip_hdr[2] != GZIP_COMPRESSION_DEFLATE) { // compression method |
105 | throw Exception("Unsupported GZIP compression method" ); |
106 | } |
107 | if (gzip_hdr[3] & GZIP_FLAG_UNSUPPORTED) { |
108 | throw Exception("Unsupported GZIP archive" ); |
109 | } |
110 | |
111 | if (gzip_hdr[3] & GZIP_FLAG_NAME) { |
112 | input.seekg(data_start, input.beg); |
113 | data_start += consume_string(input); |
114 | } |
115 | input.seekg(data_start, input.beg); |
116 | // stream is now set to beginning of payload data |
117 | |
118 | auto ret = mz_inflateInit2((mz_streamp)mz_stream_ptr, -MZ_DEFAULT_WINDOW_BITS); |
119 | if (ret != MZ_OK) { |
120 | throw Exception("Failed to initialize miniz" ); |
121 | } |
122 | // initialize eback, gptr, egptr |
123 | setg((char *)out_buff, (char *)out_buff, (char *)out_buff); |
124 | is_initialized = true; |
125 | } |
126 | |
127 | streambuf::int_type GzipStreamBuf::underflow() { |
128 | if (!is_initialized) { |
129 | initialize(); |
130 | } |
131 | |
132 | // adapted from https://github.com/mateidavid/zstr |
133 | auto zstrm_p = (mz_streamp)mz_stream_ptr; |
134 | if (!zstrm_p) { |
135 | return traits_type::eof(); |
136 | } |
137 | |
138 | if (gptr() == egptr()) { |
139 | // pointers for free region in output buffer |
140 | auto out_buff_free_start = out_buff; |
141 | do { |
142 | assert(in_buff_start <= in_buff_end); |
143 | assert(in_buff_end <= in_buff_start + BUFFER_SIZE); |
144 | |
145 | // read more input if none available |
146 | if (in_buff_start == in_buff_end) { |
147 | // empty input buffer: refill from the start |
148 | in_buff_start = in_buff; |
149 | std::streamsize sz = input.rdbuf()->sgetn((char *)in_buff, BUFFER_SIZE); |
150 | if (sz == 0) { |
151 | break; // end of input |
152 | } |
153 | in_buff_end = in_buff + sz; |
154 | } |
155 | |
156 | // actually decompress |
157 | assert(zstrm_p); |
158 | zstrm_p->next_in = (data_ptr_t)in_buff_start; |
159 | assert(in_buff_end - in_buff_start < numeric_limits<int32_t>::max()); |
160 | zstrm_p->avail_in = (uint32_t)(in_buff_end - in_buff_start); |
161 | zstrm_p->next_out = (data_ptr_t)out_buff_free_start; |
162 | assert((out_buff + BUFFER_SIZE) - out_buff_free_start < numeric_limits<int32_t>::max()); |
163 | zstrm_p->avail_out = (uint32_t)((out_buff + BUFFER_SIZE) - out_buff_free_start); |
164 | auto ret = mz_inflate(zstrm_p, MZ_NO_FLUSH); |
165 | if (ret != MZ_OK && ret != MZ_STREAM_END) { |
166 | throw Exception(mz_error(ret)); |
167 | } |
168 | // update pointers following inflate() |
169 | in_buff_start = (data_ptr_t)zstrm_p->next_in; |
170 | in_buff_end = in_buff_start + zstrm_p->avail_in; |
171 | out_buff_free_start = (data_ptr_t)zstrm_p->next_out; |
172 | assert(out_buff_free_start + zstrm_p->avail_out == out_buff + BUFFER_SIZE); |
173 | // if stream ended, deallocate inflator |
174 | if (ret == MZ_STREAM_END) { |
175 | mz_inflateEnd(zstrm_p); |
176 | delete zstrm_p; |
177 | mz_stream_ptr = nullptr; |
178 | break; |
179 | } |
180 | |
181 | } while (out_buff_free_start == out_buff); |
182 | // 2 exit conditions: |
183 | // - end of input: there might or might not be output available |
184 | // - out_buff_free_start != out_buff: output available |
185 | setg((char *)out_buff, (char *)out_buff, (char *)out_buff_free_start); |
186 | } |
187 | |
188 | // ensure all those pointers point at something sane |
189 | assert(out_buff); |
190 | assert(gptr() <= egptr()); |
191 | assert(eback() == (char *)out_buff); |
192 | assert(gptr() >= (char *)out_buff); |
193 | assert(gptr() <= (char *)out_buff + BUFFER_SIZE); |
194 | assert(egptr() >= (char *)out_buff); |
195 | assert(egptr() <= (char *)out_buff + BUFFER_SIZE); |
196 | assert(gptr() <= egptr()); |
197 | |
198 | return this->gptr() == this->egptr() ? traits_type::eof() : traits_type::to_int_type(*this->gptr()); |
199 | } |
200 | |
201 | GzipStreamBuf::~GzipStreamBuf() { |
202 | delete[] in_buff; |
203 | delete[] out_buff; |
204 | auto zstrm_p = (mz_streamp)mz_stream_ptr; |
205 | if (zstrm_p) { |
206 | mz_inflateEnd(zstrm_p); |
207 | } |
208 | delete zstrm_p; |
209 | } |
210 | |