1 | /* |
2 | * Licensed under the Apache License, Version 2.0 (the "License"); |
3 | * you may not use this file except in compliance with the License. |
4 | * You may obtain a copy of the License at |
5 | * |
6 | * http://www.apache.org/licenses/LICENSE-2.0 |
7 | * |
8 | * Unless required by applicable law or agreed to in writing, software |
9 | * distributed under the License is distributed on an "AS IS" BASIS, |
10 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
11 | * See the License for the specific language governing permissions and |
12 | * limitations under the License. |
13 | */ |
14 | |
15 | #include "Adaptor.hh" |
16 | #include "Compression.hh" |
17 | #include "orc/Exceptions.hh" |
18 | |
19 | #include <string> |
20 | |
21 | namespace orc { |
22 | |
23 | static const int32_t DEC_32_TABLE[] = {4, 1, 2, 1, 4, 4, 4, 4}; |
24 | static const int32_t DEC_64_TABLE[] = {0, 0, 0, -1, 0, 1, 2, 3}; |
25 | |
26 | static const int32_t SIZE_OF_SHORT = 2; |
27 | static const int32_t SIZE_OF_INT = 4; |
28 | static const int32_t SIZE_OF_LONG = 8; |
29 | |
30 | static std::string toHex(uint64_t val) { |
31 | std::ostringstream out; |
32 | out << "0x" << std::hex << val; |
33 | return out.str(); |
34 | } |
35 | |
36 | static std::string toString(int64_t val) { |
37 | std::ostringstream out; |
38 | out << val; |
39 | return out.str(); |
40 | } |
41 | |
42 | class MalformedInputException: public ParseError { |
43 | public: |
44 | MalformedInputException(int64_t off |
45 | ) :ParseError("MalformedInputException at " + |
46 | toString(off)) { |
47 | } |
48 | |
49 | MalformedInputException(int64_t off, const std::string& msg |
50 | ): ParseError("MalformedInputException " + msg + |
51 | " at " + toString(off)) { |
52 | } |
53 | |
54 | MalformedInputException(const MalformedInputException& other |
55 | ): ParseError(other.what()) { |
56 | } |
57 | |
58 | virtual ~MalformedInputException() noexcept; |
59 | }; |
60 | |
61 | MalformedInputException::~MalformedInputException() noexcept { |
62 | // PASS |
63 | } |
64 | |
65 | uint64_t lzoDecompress(const char *inputAddress, |
66 | const char *inputLimit, |
67 | char *outputAddress, |
68 | char *outputLimit) { |
69 | // nothing compresses to nothing |
70 | if (inputAddress == inputLimit) { |
71 | return 0; |
72 | } |
73 | |
74 | // maximum offset in buffers to which it's safe to write long-at-a-time |
75 | char * const fastOutputLimit = outputLimit - SIZE_OF_LONG; |
76 | |
77 | // LZO can concat two blocks together so, decode until the input data is |
78 | // consumed |
79 | const char *input = inputAddress; |
80 | char *output = outputAddress; |
81 | while (input < inputLimit) { |
82 | // |
83 | // Note: For safety some of the code below may stop decoding early or |
84 | // skip decoding, because input is not available. This makes the code |
85 | // safe, and since LZO requires an explicit "stop" command, the decoder |
86 | // will still throw a exception. |
87 | // |
88 | |
89 | bool firstCommand = true; |
90 | uint32_t lastLiteralLength = 0; |
91 | while (true) { |
92 | if (input >= inputLimit) { |
93 | throw MalformedInputException(input - inputAddress); |
94 | } |
95 | uint32_t command = *(input++) & 0xFF; |
96 | if (command == 0x11) { |
97 | break; |
98 | } |
99 | |
100 | // Commands are described using a bit pattern notation: |
101 | // 0: bit is not set |
102 | // 1: bit is set |
103 | // L: part of literal length |
104 | // P: part of match offset position |
105 | // M: part of match length |
106 | // ?: see documentation in command decoder |
107 | |
108 | int32_t matchLength; |
109 | int32_t matchOffset; |
110 | uint32_t literalLength; |
111 | if ((command & 0xf0) == 0) { |
112 | if (lastLiteralLength == 0) { |
113 | // 0b0000_LLLL (0bLLLL_LLLL)* |
114 | |
115 | // copy length :: fixed |
116 | // 0 |
117 | matchOffset = 0; |
118 | |
119 | // copy offset :: fixed |
120 | // 0 |
121 | matchLength = 0; |
122 | |
123 | // literal length - 3 :: variable bits :: valid range [4..] |
124 | // 3 + variableLength(command bits [0..3], 4) |
125 | literalLength = command & 0xf; |
126 | if (literalLength == 0) { |
127 | literalLength = 0xf; |
128 | |
129 | uint32_t nextByte = 0; |
130 | while (input < inputLimit && |
131 | (nextByte = *(input++) & 0xFF) == 0) { |
132 | literalLength += 0xff; |
133 | } |
134 | literalLength += nextByte; |
135 | } |
136 | literalLength += 3; |
137 | } else if (lastLiteralLength <= 3) { |
138 | // 0b0000_PPLL 0bPPPP_PPPP |
139 | |
140 | // copy length: fixed |
141 | // 3 |
142 | matchLength = 3; |
143 | |
144 | // copy offset :: 12 bits :: valid range [2048..3071] |
145 | // [0..1] from command [2..3] |
146 | // [2..9] from trailer [0..7] |
147 | // [10] unset |
148 | // [11] set |
149 | if (input >= inputLimit) { |
150 | throw MalformedInputException(input - inputAddress); |
151 | } |
152 | matchOffset = (command & 0xc) >> 2; |
153 | matchOffset |= (*(input++) & 0xFF) << 2; |
154 | matchOffset |= 0x800; |
155 | |
156 | // literal length :: 2 bits :: valid range [0..3] |
157 | // [0..1] from command [0..1] |
158 | literalLength = (command & 0x3); |
159 | } else { |
160 | // 0b0000_PPLL 0bPPPP_PPPP |
161 | |
162 | // copy length :: fixed |
163 | // 2 |
164 | matchLength = 2; |
165 | |
166 | // copy offset :: 10 bits :: valid range [0..1023] |
167 | // [0..1] from command [2..3] |
168 | // [2..9] from trailer [0..7] |
169 | if (input >= inputLimit) { |
170 | throw MalformedInputException(input - inputAddress); |
171 | } |
172 | matchOffset = (command & 0xc) >> 2; |
173 | matchOffset |= (*(input++) & 0xFF) << 2; |
174 | |
175 | // literal length :: 2 bits :: valid range [0..3] |
176 | // [0..1] from command [0..1] |
177 | literalLength = (command & 0x3); |
178 | } |
179 | } else if (firstCommand) { |
180 | // first command has special handling when high nibble is set |
181 | matchLength = 0; |
182 | matchOffset = 0; |
183 | literalLength = command - 17; |
184 | } else if ((command & 0xf0) == 0x10) { |
185 | // 0b0001_?MMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL |
186 | |
187 | // copy length - 2 :: variable bits :: valid range [3..] |
188 | // 2 + variableLength(command bits [0..2], 3) |
189 | matchLength = command & 0x7; |
190 | if (matchLength == 0) { |
191 | matchLength = 0x7; |
192 | |
193 | int32_t nextByte = 0; |
194 | while (input < inputLimit && |
195 | (nextByte = *(input++) & 0xFF) == 0) { |
196 | matchLength += 0xff; |
197 | } |
198 | matchLength += nextByte; |
199 | } |
200 | matchLength += 2; |
201 | |
202 | // read trailer |
203 | if (input + SIZE_OF_SHORT > inputLimit) { |
204 | throw MalformedInputException(input - inputAddress); |
205 | } |
206 | uint32_t trailer = *reinterpret_cast<const uint16_t*>(input) & 0xFFFF; |
207 | input += SIZE_OF_SHORT; |
208 | |
209 | // copy offset :: 16 bits :: valid range [32767..49151] |
210 | // [0..13] from trailer [2..15] |
211 | // [14] if command bit [3] unset |
212 | // [15] if command bit [3] set |
213 | matchOffset = trailer >> 2; |
214 | if ((command & 0x8) == 0) { |
215 | matchOffset |= 0x4000; |
216 | } else { |
217 | matchOffset |= 0x8000; |
218 | } |
219 | matchOffset--; |
220 | |
221 | // literal length :: 2 bits :: valid range [0..3] |
222 | // [0..1] from trailer [0..1] |
223 | literalLength = trailer & 0x3; |
224 | } else if ((command & 0xe0) == 0x20) { |
225 | // 0b001M_MMMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL |
226 | |
227 | // copy length - 2 :: variable bits :: valid range [3..] |
228 | // 2 + variableLength(command bits [0..4], 5) |
229 | matchLength = command & 0x1f; |
230 | if (matchLength == 0) { |
231 | matchLength = 0x1f; |
232 | |
233 | int nextByte = 0; |
234 | while (input < inputLimit && |
235 | (nextByte = *(input++) & 0xFF) == 0) { |
236 | matchLength += 0xff; |
237 | } |
238 | matchLength += nextByte; |
239 | } |
240 | matchLength += 2; |
241 | |
242 | // read trailer |
243 | if (input + SIZE_OF_SHORT > inputLimit) { |
244 | throw MalformedInputException(input - inputAddress); |
245 | } |
246 | int32_t trailer = *reinterpret_cast<const int16_t*>(input) & 0xFFFF; |
247 | input += SIZE_OF_SHORT; |
248 | |
249 | // copy offset :: 14 bits :: valid range [0..16383] |
250 | // [0..13] from trailer [2..15] |
251 | matchOffset = trailer >> 2; |
252 | |
253 | // literal length :: 2 bits :: valid range [0..3] |
254 | // [0..1] from trailer [0..1] |
255 | literalLength = trailer & 0x3; |
256 | } else if ((command & 0xc0) != 0) { |
257 | // 0bMMMP_PPLL 0bPPPP_PPPP |
258 | |
259 | // copy length - 1 :: 3 bits :: valid range [1..8] |
260 | // [0..2] from command [5..7] |
261 | // add 1 |
262 | matchLength = (command & 0xe0) >> 5; |
263 | matchLength += 1; |
264 | |
265 | // copy offset :: 11 bits :: valid range [0..4095] |
266 | // [0..2] from command [2..4] |
267 | // [3..10] from trailer [0..7] |
268 | if (input >= inputLimit) { |
269 | throw MalformedInputException(input - inputAddress); |
270 | } |
271 | matchOffset = (command & 0x1c) >> 2; |
272 | matchOffset |= (*(input++) & 0xFF) << 3; |
273 | |
274 | // literal length :: 2 bits :: valid range [0..3] |
275 | // [0..1] from command [0..1] |
276 | literalLength = (command & 0x3); |
277 | } else { |
278 | throw MalformedInputException(input - inputAddress - 1, |
279 | "Invalid LZO command " + |
280 | toHex(command)); |
281 | } |
282 | firstCommand = false; |
283 | |
284 | // copy match |
285 | if (matchLength != 0) { |
286 | // lzo encodes match offset minus one |
287 | matchOffset++; |
288 | |
289 | char *matchAddress = output - matchOffset; |
290 | if (matchAddress < outputAddress || |
291 | output + matchLength > outputLimit) { |
292 | throw MalformedInputException(input - inputAddress); |
293 | } |
294 | char *matchOutputLimit = output + matchLength; |
295 | |
296 | if (output > fastOutputLimit) { |
297 | // slow match copy |
298 | while (output < matchOutputLimit) { |
299 | *(output++) = *(matchAddress++); |
300 | } |
301 | } else { |
302 | // copy repeated sequence |
303 | if (matchOffset < SIZE_OF_LONG) { |
304 | // 8 bytes apart so that we can copy long-at-a-time below |
305 | int32_t increment32 = DEC_32_TABLE[matchOffset]; |
306 | int32_t decrement64 = DEC_64_TABLE[matchOffset]; |
307 | |
308 | output[0] = *matchAddress; |
309 | output[1] = *(matchAddress + 1); |
310 | output[2] = *(matchAddress + 2); |
311 | output[3] = *(matchAddress + 3); |
312 | output += SIZE_OF_INT; |
313 | matchAddress += increment32; |
314 | |
315 | *reinterpret_cast<int32_t*>(output) = |
316 | *reinterpret_cast<int32_t*>(matchAddress); |
317 | output += SIZE_OF_INT; |
318 | matchAddress -= decrement64; |
319 | } else { |
320 | *reinterpret_cast<int64_t*>(output) = |
321 | *reinterpret_cast<int64_t*>(matchAddress); |
322 | matchAddress += SIZE_OF_LONG; |
323 | output += SIZE_OF_LONG; |
324 | } |
325 | |
326 | if (matchOutputLimit >= fastOutputLimit) { |
327 | if (matchOutputLimit > outputLimit) { |
328 | throw MalformedInputException(input - inputAddress); |
329 | } |
330 | |
331 | while (output < fastOutputLimit) { |
332 | *reinterpret_cast<int64_t*>(output) = |
333 | *reinterpret_cast<int64_t*>(matchAddress); |
334 | matchAddress += SIZE_OF_LONG; |
335 | output += SIZE_OF_LONG; |
336 | } |
337 | |
338 | while (output < matchOutputLimit) { |
339 | *(output++) = *(matchAddress++); |
340 | } |
341 | } else { |
342 | while (output < matchOutputLimit) { |
343 | *reinterpret_cast<int64_t*>(output) = |
344 | *reinterpret_cast<int64_t*>(matchAddress); |
345 | matchAddress += SIZE_OF_LONG; |
346 | output += SIZE_OF_LONG; |
347 | } |
348 | } |
349 | } |
350 | output = matchOutputLimit; // correction in case we over-copied |
351 | } |
352 | |
353 | // copy literal |
354 | char *literalOutputLimit = output + literalLength; |
355 | if (literalOutputLimit > fastOutputLimit || |
356 | input + literalLength > inputLimit - SIZE_OF_LONG) { |
357 | if (literalOutputLimit > outputLimit) { |
358 | throw MalformedInputException(input - inputAddress); |
359 | } |
360 | |
361 | // slow, precise copy |
362 | memcpy(output, input, literalLength); |
363 | input += literalLength; |
364 | output += literalLength; |
365 | } else { |
366 | // fast copy. We may over-copy but there's enough room in input |
367 | // and output to not overrun them |
368 | do { |
369 | *reinterpret_cast<int64_t*>(output) = |
370 | *reinterpret_cast<const int64_t*>(input); |
371 | input += SIZE_OF_LONG; |
372 | output += SIZE_OF_LONG; |
373 | } while (output < literalOutputLimit); |
374 | // adjust index if we over-copied |
375 | input -= (output - literalOutputLimit); |
376 | output = literalOutputLimit; |
377 | } |
378 | lastLiteralLength = literalLength; |
379 | } |
380 | |
381 | if (input + SIZE_OF_SHORT > inputLimit && |
382 | *reinterpret_cast<const int16_t*>(input) != 0) { |
383 | throw MalformedInputException(input - inputAddress); |
384 | } |
385 | input += SIZE_OF_SHORT; |
386 | } |
387 | |
388 | return static_cast<uint64_t>(output - outputAddress); |
389 | } |
390 | |
391 | } |
392 | |