1/*
2 * Licensed under the Apache License, Version 2.0 (the "License");
3 * you may not use this file except in compliance with the License.
4 * You may obtain a copy of the License at
5 *
6 * http://www.apache.org/licenses/LICENSE-2.0
7 *
8 * Unless required by applicable law or agreed to in writing, software
9 * distributed under the License is distributed on an "AS IS" BASIS,
10 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 * See the License for the specific language governing permissions and
12 * limitations under the License.
13 */
14
15#include "Adaptor.hh"
16#include "Compression.hh"
17#include "orc/Exceptions.hh"
18
19#include <string>
20
21namespace orc {
22
23 static const int32_t DEC_32_TABLE[] = {4, 1, 2, 1, 4, 4, 4, 4};
24 static const int32_t DEC_64_TABLE[] = {0, 0, 0, -1, 0, 1, 2, 3};
25
26 static const int32_t SIZE_OF_SHORT = 2;
27 static const int32_t SIZE_OF_INT = 4;
28 static const int32_t SIZE_OF_LONG = 8;
29
30 static std::string toHex(uint64_t val) {
31 std::ostringstream out;
32 out << "0x" << std::hex << val;
33 return out.str();
34 }
35
36 static std::string toString(int64_t val) {
37 std::ostringstream out;
38 out << val;
39 return out.str();
40 }
41
42 class MalformedInputException: public ParseError {
43 public:
44 MalformedInputException(int64_t off
45 ) :ParseError("MalformedInputException at " +
46 toString(off)) {
47 }
48
49 MalformedInputException(int64_t off, const std::string& msg
50 ): ParseError("MalformedInputException " + msg +
51 " at " + toString(off)) {
52 }
53
54 MalformedInputException(const MalformedInputException& other
55 ): ParseError(other.what()) {
56 }
57
58 virtual ~MalformedInputException() noexcept;
59 };
60
61 MalformedInputException::~MalformedInputException() noexcept {
62 // PASS
63 }
64
65 uint64_t lzoDecompress(const char *inputAddress,
66 const char *inputLimit,
67 char *outputAddress,
68 char *outputLimit) {
69 // nothing compresses to nothing
70 if (inputAddress == inputLimit) {
71 return 0;
72 }
73
74 // maximum offset in buffers to which it's safe to write long-at-a-time
75 char * const fastOutputLimit = outputLimit - SIZE_OF_LONG;
76
77 // LZO can concat two blocks together so, decode until the input data is
78 // consumed
79 const char *input = inputAddress;
80 char *output = outputAddress;
81 while (input < inputLimit) {
82 //
83 // Note: For safety some of the code below may stop decoding early or
84 // skip decoding, because input is not available. This makes the code
85 // safe, and since LZO requires an explicit "stop" command, the decoder
86 // will still throw a exception.
87 //
88
89 bool firstCommand = true;
90 uint32_t lastLiteralLength = 0;
91 while (true) {
92 if (input >= inputLimit) {
93 throw MalformedInputException(input - inputAddress);
94 }
95 uint32_t command = *(input++) & 0xFF;
96 if (command == 0x11) {
97 break;
98 }
99
100 // Commands are described using a bit pattern notation:
101 // 0: bit is not set
102 // 1: bit is set
103 // L: part of literal length
104 // P: part of match offset position
105 // M: part of match length
106 // ?: see documentation in command decoder
107
108 int32_t matchLength;
109 int32_t matchOffset;
110 uint32_t literalLength;
111 if ((command & 0xf0) == 0) {
112 if (lastLiteralLength == 0) {
113 // 0b0000_LLLL (0bLLLL_LLLL)*
114
115 // copy length :: fixed
116 // 0
117 matchOffset = 0;
118
119 // copy offset :: fixed
120 // 0
121 matchLength = 0;
122
123 // literal length - 3 :: variable bits :: valid range [4..]
124 // 3 + variableLength(command bits [0..3], 4)
125 literalLength = command & 0xf;
126 if (literalLength == 0) {
127 literalLength = 0xf;
128
129 uint32_t nextByte = 0;
130 while (input < inputLimit &&
131 (nextByte = *(input++) & 0xFF) == 0) {
132 literalLength += 0xff;
133 }
134 literalLength += nextByte;
135 }
136 literalLength += 3;
137 } else if (lastLiteralLength <= 3) {
138 // 0b0000_PPLL 0bPPPP_PPPP
139
140 // copy length: fixed
141 // 3
142 matchLength = 3;
143
144 // copy offset :: 12 bits :: valid range [2048..3071]
145 // [0..1] from command [2..3]
146 // [2..9] from trailer [0..7]
147 // [10] unset
148 // [11] set
149 if (input >= inputLimit) {
150 throw MalformedInputException(input - inputAddress);
151 }
152 matchOffset = (command & 0xc) >> 2;
153 matchOffset |= (*(input++) & 0xFF) << 2;
154 matchOffset |= 0x800;
155
156 // literal length :: 2 bits :: valid range [0..3]
157 // [0..1] from command [0..1]
158 literalLength = (command & 0x3);
159 } else {
160 // 0b0000_PPLL 0bPPPP_PPPP
161
162 // copy length :: fixed
163 // 2
164 matchLength = 2;
165
166 // copy offset :: 10 bits :: valid range [0..1023]
167 // [0..1] from command [2..3]
168 // [2..9] from trailer [0..7]
169 if (input >= inputLimit) {
170 throw MalformedInputException(input - inputAddress);
171 }
172 matchOffset = (command & 0xc) >> 2;
173 matchOffset |= (*(input++) & 0xFF) << 2;
174
175 // literal length :: 2 bits :: valid range [0..3]
176 // [0..1] from command [0..1]
177 literalLength = (command & 0x3);
178 }
179 } else if (firstCommand) {
180 // first command has special handling when high nibble is set
181 matchLength = 0;
182 matchOffset = 0;
183 literalLength = command - 17;
184 } else if ((command & 0xf0) == 0x10) {
185 // 0b0001_?MMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL
186
187 // copy length - 2 :: variable bits :: valid range [3..]
188 // 2 + variableLength(command bits [0..2], 3)
189 matchLength = command & 0x7;
190 if (matchLength == 0) {
191 matchLength = 0x7;
192
193 int32_t nextByte = 0;
194 while (input < inputLimit &&
195 (nextByte = *(input++) & 0xFF) == 0) {
196 matchLength += 0xff;
197 }
198 matchLength += nextByte;
199 }
200 matchLength += 2;
201
202 // read trailer
203 if (input + SIZE_OF_SHORT > inputLimit) {
204 throw MalformedInputException(input - inputAddress);
205 }
206 uint32_t trailer = *reinterpret_cast<const uint16_t*>(input) & 0xFFFF;
207 input += SIZE_OF_SHORT;
208
209 // copy offset :: 16 bits :: valid range [32767..49151]
210 // [0..13] from trailer [2..15]
211 // [14] if command bit [3] unset
212 // [15] if command bit [3] set
213 matchOffset = trailer >> 2;
214 if ((command & 0x8) == 0) {
215 matchOffset |= 0x4000;
216 } else {
217 matchOffset |= 0x8000;
218 }
219 matchOffset--;
220
221 // literal length :: 2 bits :: valid range [0..3]
222 // [0..1] from trailer [0..1]
223 literalLength = trailer & 0x3;
224 } else if ((command & 0xe0) == 0x20) {
225 // 0b001M_MMMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL
226
227 // copy length - 2 :: variable bits :: valid range [3..]
228 // 2 + variableLength(command bits [0..4], 5)
229 matchLength = command & 0x1f;
230 if (matchLength == 0) {
231 matchLength = 0x1f;
232
233 int nextByte = 0;
234 while (input < inputLimit &&
235 (nextByte = *(input++) & 0xFF) == 0) {
236 matchLength += 0xff;
237 }
238 matchLength += nextByte;
239 }
240 matchLength += 2;
241
242 // read trailer
243 if (input + SIZE_OF_SHORT > inputLimit) {
244 throw MalformedInputException(input - inputAddress);
245 }
246 int32_t trailer = *reinterpret_cast<const int16_t*>(input) & 0xFFFF;
247 input += SIZE_OF_SHORT;
248
249 // copy offset :: 14 bits :: valid range [0..16383]
250 // [0..13] from trailer [2..15]
251 matchOffset = trailer >> 2;
252
253 // literal length :: 2 bits :: valid range [0..3]
254 // [0..1] from trailer [0..1]
255 literalLength = trailer & 0x3;
256 } else if ((command & 0xc0) != 0) {
257 // 0bMMMP_PPLL 0bPPPP_PPPP
258
259 // copy length - 1 :: 3 bits :: valid range [1..8]
260 // [0..2] from command [5..7]
261 // add 1
262 matchLength = (command & 0xe0) >> 5;
263 matchLength += 1;
264
265 // copy offset :: 11 bits :: valid range [0..4095]
266 // [0..2] from command [2..4]
267 // [3..10] from trailer [0..7]
268 if (input >= inputLimit) {
269 throw MalformedInputException(input - inputAddress);
270 }
271 matchOffset = (command & 0x1c) >> 2;
272 matchOffset |= (*(input++) & 0xFF) << 3;
273
274 // literal length :: 2 bits :: valid range [0..3]
275 // [0..1] from command [0..1]
276 literalLength = (command & 0x3);
277 } else {
278 throw MalformedInputException(input - inputAddress - 1,
279 "Invalid LZO command " +
280 toHex(command));
281 }
282 firstCommand = false;
283
284 // copy match
285 if (matchLength != 0) {
286 // lzo encodes match offset minus one
287 matchOffset++;
288
289 char *matchAddress = output - matchOffset;
290 if (matchAddress < outputAddress ||
291 output + matchLength > outputLimit) {
292 throw MalformedInputException(input - inputAddress);
293 }
294 char *matchOutputLimit = output + matchLength;
295
296 if (output > fastOutputLimit) {
297 // slow match copy
298 while (output < matchOutputLimit) {
299 *(output++) = *(matchAddress++);
300 }
301 } else {
302 // copy repeated sequence
303 if (matchOffset < SIZE_OF_LONG) {
304 // 8 bytes apart so that we can copy long-at-a-time below
305 int32_t increment32 = DEC_32_TABLE[matchOffset];
306 int32_t decrement64 = DEC_64_TABLE[matchOffset];
307
308 output[0] = *matchAddress;
309 output[1] = *(matchAddress + 1);
310 output[2] = *(matchAddress + 2);
311 output[3] = *(matchAddress + 3);
312 output += SIZE_OF_INT;
313 matchAddress += increment32;
314
315 *reinterpret_cast<int32_t*>(output) =
316 *reinterpret_cast<int32_t*>(matchAddress);
317 output += SIZE_OF_INT;
318 matchAddress -= decrement64;
319 } else {
320 *reinterpret_cast<int64_t*>(output) =
321 *reinterpret_cast<int64_t*>(matchAddress);
322 matchAddress += SIZE_OF_LONG;
323 output += SIZE_OF_LONG;
324 }
325
326 if (matchOutputLimit >= fastOutputLimit) {
327 if (matchOutputLimit > outputLimit) {
328 throw MalformedInputException(input - inputAddress);
329 }
330
331 while (output < fastOutputLimit) {
332 *reinterpret_cast<int64_t*>(output) =
333 *reinterpret_cast<int64_t*>(matchAddress);
334 matchAddress += SIZE_OF_LONG;
335 output += SIZE_OF_LONG;
336 }
337
338 while (output < matchOutputLimit) {
339 *(output++) = *(matchAddress++);
340 }
341 } else {
342 while (output < matchOutputLimit) {
343 *reinterpret_cast<int64_t*>(output) =
344 *reinterpret_cast<int64_t*>(matchAddress);
345 matchAddress += SIZE_OF_LONG;
346 output += SIZE_OF_LONG;
347 }
348 }
349 }
350 output = matchOutputLimit; // correction in case we over-copied
351 }
352
353 // copy literal
354 char *literalOutputLimit = output + literalLength;
355 if (literalOutputLimit > fastOutputLimit ||
356 input + literalLength > inputLimit - SIZE_OF_LONG) {
357 if (literalOutputLimit > outputLimit) {
358 throw MalformedInputException(input - inputAddress);
359 }
360
361 // slow, precise copy
362 memcpy(output, input, literalLength);
363 input += literalLength;
364 output += literalLength;
365 } else {
366 // fast copy. We may over-copy but there's enough room in input
367 // and output to not overrun them
368 do {
369 *reinterpret_cast<int64_t*>(output) =
370 *reinterpret_cast<const int64_t*>(input);
371 input += SIZE_OF_LONG;
372 output += SIZE_OF_LONG;
373 } while (output < literalOutputLimit);
374 // adjust index if we over-copied
375 input -= (output - literalOutputLimit);
376 output = literalOutputLimit;
377 }
378 lastLiteralLength = literalLength;
379 }
380
381 if (input + SIZE_OF_SHORT > inputLimit &&
382 *reinterpret_cast<const int16_t*>(input) != 0) {
383 throw MalformedInputException(input - inputAddress);
384 }
385 input += SIZE_OF_SHORT;
386 }
387
388 return static_cast<uint64_t>(output - outputAddress);
389 }
390
391}
392