#include "LZ4_decompress_faster.h"

#include <string.h>
#include <iostream>
#include <random>
#include <algorithm>
#include <Core/Defines.h>
#include <Common/Stopwatch.h>
#include <common/likely.h>
#include <common/Types.h>
#include <common/unaligned.h>

#ifdef __SSE2__
#include <emmintrin.h>
#endif

#ifdef __SSSE3__
#include <tmmintrin.h>
#endif

#ifdef __aarch64__
#include <arm_neon.h>
#endif

namespace LZ4
{

namespace
{

template <size_t N> [[maybe_unused]] void copy(UInt8 * dst, const UInt8 * src);
template <size_t N> [[maybe_unused]] void wildCopy(UInt8 * dst, const UInt8 * src, UInt8 * dst_end);
template <size_t N, bool USE_SHUFFLE> [[maybe_unused]] void copyOverlap(UInt8 * op, const UInt8 *& match, const size_t offset);


inline void copy8(UInt8 * dst, const UInt8 * src)
{
    memcpy(dst, src, 8);
}

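/// Descriptive note: copies in fixed 8-byte chunks until dst_end is reached, so it may write up to
/// 7 bytes past dst_end; callers rely on the output buffer having that much slack (see decompressImpl below).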
inline void wildCopy8(UInt8 * dst, const UInt8 * src, UInt8 * dst_end)
{
    /// Unrolling with clang causes a >10% performance degradation.
#if defined(__clang__)
    #pragma nounroll
#endif
    do
    {
        copy8(dst, src);
        dst += 8;
        src += 8;
    } while (dst < dst_end);
}

inline void copyOverlap8(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    /// 4 % n.
    /// Or, if 4 % n is zero, we use n.
    /// It gives an equivalent result, but is more CPU friendly for an unknown reason.
    static constexpr int shift1[] = { 0, 1, 2, 1, 4, 4, 4, 4 };

    /// 8 % n - 4 % n
    static constexpr int shift2[] = { 0, 0, 0, 1, 0, -1, -2, -3 };

    op[0] = match[0];
    op[1] = match[1];
    op[2] = match[2];
    op[3] = match[3];

    match += shift1[offset];
    memcpy(op + 4, match, 4);
    match += shift2[offset];
}
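
/** A worked example of the shift tables above (illustrative only; the byte names are made up).
  * Let offset = 3 and match = op - 3, so the last three output bytes are m0 m1 m2.
  * The byte-wise stores produce op[0..3] = m0 m1 m2 m0 (op[3] rereads the freshly written op[0]).
  * Then match += shift1[3] = 1, and memcpy(op + 4, match, 4) produces op[4..7] = m1 m2 m0 m1.
  * Finally match += shift2[3] = 1: match has advanced by 8 % 3 = 2 in total and now points at the
  * byte that continues the period-3 pattern at op + 8, which is what decompressImpl relies on.
  */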


#if defined(__x86_64__) || defined(__PPC__)

/** We use 'xmm' (128-bit SSE) registers here to shuffle 16 bytes.
  *
  * It is possible to use 'mm' (64-bit MMX) registers to shuffle just the 8 bytes that we need.
  *
  * There is a corresponding version of the 'pshufb' instruction that operates on 'mm' registers
  * (it operates on MMX registers although it is available in SSSE3),
  * and the compiler library has the corresponding intrinsic: '_mm_shuffle_pi8'.
  *
  * It can be done like this:
  *
  *  unalignedStore(op, _mm_shuffle_pi8(
  *      unalignedLoad<__m64>(match),
  *      unalignedLoad<__m64>(masks + 8 * offset)));
  *
  * This is perfectly correct and this code has the same or even better performance.
  *
  * But if we write code this way, it will lead to
  * extremely weird and non-obvious
  * effects in completely unrelated parts of the code.
  *
  * This is because using MMX registers alters the operating mode of the x87 FPU,
  * and operations with the FPU then become broken.
  *
  * Example 1.
  * Compile this code without optimizations:

    #include <vector>
    #include <unordered_set>
    #include <iostream>
    #include <tmmintrin.h>

    int main(int, char **)
    {
        [[maybe_unused]] __m64 shuffled = _mm_shuffle_pi8(__m64{}, __m64{});

        std::vector<int> vec;
        std::unordered_set<int> set(vec.begin(), vec.end());

        std::cerr << set.size() << "\n";
        return 0;
    }

    $ g++ -g -O0 -mssse3 -std=c++17 mmx_bug1.cpp && ./a.out
    terminate called after throwing an instance of 'std::bad_alloc'
      what():  std::bad_alloc

    Also reproduced with clang. But only with libstdc++, not with libc++.

  * Example 2.

    #include <math.h>
    #include <iostream>
    #include <tmmintrin.h>

    int main(int, char **)
    {
        double max_fill = 1;

        std::cerr << (long double)max_fill << "\n";
        [[maybe_unused]] __m64 shuffled = _mm_shuffle_pi8(__m64{}, __m64{});
        std::cerr << (long double)max_fill << "\n";

        return 0;
    }

    $ g++ -g -O0 -mssse3 -std=c++17 mmx_bug2.cpp && ./a.out
    1
    -nan

  * Explanation:
  *
  * https://stackoverflow.com/questions/33692969/assembler-mmx-errors
  * https://software.intel.com/en-us/node/524274
  *
  * Actually it is possible to execute the 'emms' instruction after the decompression routine.
  * But it is easier to just use 'xmm' registers and avoid using 'mm' registers.
  */
inline void copyOverlap8Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)

    static constexpr UInt8 __attribute__((__aligned__(8))) masks[] =
    {
        0, 1, 2, 2, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */
        0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
        0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 2, 0, 1, 2, 0, 1,
        0, 1, 2, 3, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 0, 1, 2,
        0, 1, 2, 3, 4, 5, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 0,
        0, 0, 0, 0, 0, 0, 0, 0, /* this row is not used: padding to allow reading 16 bytes starting at the previous row */
    };

    _mm_storeu_si128(reinterpret_cast<__m128i *>(op),
        _mm_shuffle_epi8(
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(match)),
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(masks + 8 * offset))));

    match += masks[offset];

#else
    copyOverlap8(op, match, offset);
#endif
}
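
/** How the shuffle variant above works (a descriptive note, not additional logic):
  * the mask row for offset n is the byte pattern i % n for i = 0..7, so 'pshufb' replicates
  * the first n bytes of the match across the whole register in a single instruction.
  * The 16-byte mask load starts at an 8-byte row, which is why the table ends with a padding row,
  * and the 16-byte store at 'op' writes 8 extra bytes that are overwritten by the following copy.
  * The first row of the table is never used as a mask: masks[offset] takes the advance amount
  * for 'match' from it (8 % offset, or offset itself when 8 % offset is zero).
  */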

#endif


#ifdef __aarch64__

inline void copyOverlap8Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    static constexpr UInt8 __attribute__((__aligned__(8))) masks[] =
    {
        0, 1, 2, 2, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */
        0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
        0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 2, 0, 1, 2, 0, 1,
        0, 1, 2, 3, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 0, 1, 2,
        0, 1, 2, 3, 4, 5, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 0,
    };

    unalignedStore<uint8x8_t>(op, vtbl1_u8(unalignedLoad<uint8x8_t>(match), unalignedLoad<uint8x8_t>(masks + 8 * offset)));
    match += masks[offset];
}

#endif


template <> void inline copy<8>(UInt8 * dst, const UInt8 * src) { copy8(dst, src); }
template <> void inline wildCopy<8>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) { wildCopy8(dst, src, dst_end); }
template <> void inline copyOverlap<8, false>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap8(op, match, offset); }
template <> void inline copyOverlap<8, true>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap8Shuffle(op, match, offset); }


inline void copy16(UInt8 * dst, const UInt8 * src)
{
#ifdef __SSE2__
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst),
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
#else
    memcpy(dst, src, 16);
#endif
}

inline void wildCopy16(UInt8 * dst, const UInt8 * src, UInt8 * dst_end)
{
    /// Unrolling with clang causes a >10% performance degradation.
#if defined(__clang__)
    #pragma nounroll
#endif
    do
    {
        copy16(dst, src);
        dst += 16;
        src += 16;
    } while (dst < dst_end);
}

inline void copyOverlap16(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    /// 4 % n.
    static constexpr int shift1[]
        = { 0, 1, 2, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };

    /// 8 % n - 4 % n
    static constexpr int shift2[]
        = { 0, 0, 0, 1, 0, -1, -2, -3, -4, 4, 4, 4, 4, 4, 4, 4 };

    /// 16 % n - 8 % n
    static constexpr int shift3[]
        = { 0, 0, 0, -1, 0, -2, 2, 1, 8, -1, -2, -3, -4, -5, -6, -7 };

    op[0] = match[0];
    op[1] = match[1];
    op[2] = match[2];
    op[3] = match[3];

    match += shift1[offset];
    memcpy(op + 4, match, 4);
    match += shift2[offset];
    memcpy(op + 8, match, 8);
    match += shift3[offset];
}
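
/** As with copyOverlap8 above, the shifts sum to 16 % offset (or to offset itself when that is zero),
  * so after the call 'match' points at the byte that continues the periodic pattern at op + 16.
  * The same observation holds for copyOverlap32 below, with 32 % offset in place of 16 % offset.
  * (This is a descriptive note about the tables, not additional logic.)
  */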


#if defined(__x86_64__) || defined(__PPC__)

inline void copyOverlap16Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)

    static constexpr UInt8 __attribute__((__aligned__(16))) masks[] =
    {
        0, 1, 2, 1, 4, 1, 4, 2, 8, 7, 6, 5, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
        0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0,
        0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0,
        0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
    };

    _mm_storeu_si128(reinterpret_cast<__m128i *>(op),
        _mm_shuffle_epi8(
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(match)),
            _mm_load_si128(reinterpret_cast<const __m128i *>(masks) + offset)));

    match += masks[offset];

#else
    copyOverlap16(op, match, offset);
#endif
}

#endif

#ifdef __aarch64__

inline void copyOverlap16Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    static constexpr UInt8 __attribute__((__aligned__(16))) masks[] =
    {
        0, 1, 2, 1, 4, 1, 4, 2, 8, 7, 6, 5, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
        0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0,
        0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0,
        0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
    };

    unalignedStore<uint8x8_t>(op,
        vtbl2_u8(unalignedLoad<uint8x8x2_t>(match), unalignedLoad<uint8x8_t>(masks + 16 * offset)));

    unalignedStore<uint8x8_t>(op + 8,
        vtbl2_u8(unalignedLoad<uint8x8x2_t>(match), unalignedLoad<uint8x8_t>(masks + 16 * offset + 8)));

    match += masks[offset];
}
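
/** A note on the NEON variant above (descriptive only): unlike 'pshufb', a single 'vtbl' lookup
  * produces at most 8 output bytes, so the 16-byte result is assembled from two vtbl2_u8 calls
  * over the same 16-byte table (uint8x8x2_t), using the low and high halves of the mask row.
  * As in the SSSE3 version, the first row of the table supplies the advance amount for 'match'.
  */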

#endif


template <> void inline copy<16>(UInt8 * dst, const UInt8 * src) { copy16(dst, src); }
template <> void inline wildCopy<16>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) { wildCopy16(dst, src, dst_end); }
template <> void inline copyOverlap<16, false>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap16(op, match, offset); }
template <> void inline copyOverlap<16, true>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap16Shuffle(op, match, offset); }


inline void copy32(UInt8 * dst, const UInt8 * src)
{
    /// There used to be an AVX version here, but mixing AVX with SSE instructions caused a big slowdown.
#if defined(__SSE2__)
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst),
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + 16),
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16)));
#else
    memcpy(dst, src, 16);
    memcpy(dst + 16, src + 16, 16);
#endif
}

inline void wildCopy32(UInt8 * dst, const UInt8 * src, UInt8 * dst_end)
{
    /// Unrolling with clang causes a >10% performance degradation.
#if defined(__clang__)
    #pragma nounroll
#endif
    do
    {
        copy32(dst, src);
        dst += 32;
        src += 32;
    } while (dst < dst_end);
}

inline void copyOverlap32(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    /// 4 % n.
    static constexpr int shift1[]
        = { 0, 1, 2, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };

    /// 8 % n - 4 % n
    static constexpr int shift2[]
        = { 0, 0, 0, 1, 0, -1, -2, -3, -4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };

    /// 16 % n - 8 % n
    static constexpr int shift3[]
        = { 0, 0, 0, -1, 0, -2, 2, 1, 8, -1, -2, -3, -4, -5, -6, -7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 };

    /// 32 % n - 16 % n
    static constexpr int shift4[]
        = { 0, 0, 0, 1, 0, 1, -2, 2, 0, -2, -4, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15 };

    op[0] = match[0];
    op[1] = match[1];
    op[2] = match[2];
    op[3] = match[3];

    match += shift1[offset];
    memcpy(op + 4, match, 4);
    match += shift2[offset];
    memcpy(op + 8, match, 8);
    match += shift3[offset];
    memcpy(op + 16, match, 16);
    match += shift4[offset];
}


template <> void inline copy<32>(UInt8 * dst, const UInt8 * src) { copy32(dst, src); }
template <> void inline wildCopy<32>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) { wildCopy32(dst, src, dst_end); }
template <> void inline copyOverlap<32, false>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap32(op, match, offset); }


/// See also https://stackoverflow.com/a/30669632

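/** Each iteration of the loop below parses one LZ4 sequence (a sketch for orientation, not a
  * substitute for the format specification):
  *
  *   [token] [literal length extension] [literals] [offset, 2 bytes little-endian] [match length extension]
  *
  * The high nibble of the token is the literal length and the low nibble is the match length minus 4;
  * a nibble of 15 means the length continues in subsequent bytes, where 255 means "keep adding".
  * For example, a hypothetical token 0x52 means: copy 5 literal bytes from the input, then copy a
  * match of 2 + 4 = 6 bytes from 'offset' bytes back in the output.
  */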
template <size_t copy_amount, bool use_shuffle>
void NO_INLINE decompressImpl(
    const char * const source,
    char * const dest,
    size_t dest_size)
{
    const UInt8 * ip = reinterpret_cast<const UInt8 *>(source);
    UInt8 * op = reinterpret_cast<UInt8 *>(dest);
    UInt8 * const output_end = op + dest_size;

    /// Unrolling with clang causes a >10% performance degradation.
#if defined(__clang__)
    #pragma nounroll
#endif
    while (1)
    {
        size_t length;

        auto continue_read_length = [&]
        {
            unsigned s;
            do
            {
                s = *ip++;
                length += s;
            } while (unlikely(s == 255));
        };

        /// Get literal length.

        const unsigned token = *ip++;
        length = token >> 4;
        if (length == 0x0F)
            continue_read_length();

        /// Copy literals.

        UInt8 * copy_end = op + length;

        /// input: Hello, world
        ///         ^-ip
        /// output: xyz
        ///             ^-op   ^-copy_end
        /// output: xyzHello, w
        ///                    ^- excessive copied bytes due to "wildCopy"
        /// input: Hello, world
        ///                ^-ip
        /// output: xyzHello, w
        ///                    ^-op (we will overwrite the excessive bytes on the next iteration)

        wildCopy<copy_amount>(op, ip, copy_end); /// Here we can write up to copy_amount - 1 bytes after the buffer.

        ip += length;
        op = copy_end;

        if (copy_end >= output_end)
            return;

        /// Get match offset.

        size_t offset = unalignedLoad<UInt16>(ip);
        ip += 2;
        const UInt8 * match = op - offset;

        /// Get match length.

        length = token & 0x0F;
        if (length == 0x0F)
            continue_read_length();
        length += 4;

        /// Copy the match within the block: this produces an overlapping pattern. The match may replicate itself.

        copy_end = op + length;

        /** Here we can write up to copy_amount - 1 - 4 * 2 bytes after the buffer.
          * The worst case is when offset = 1 and length = 4.
          */

        if (unlikely(offset < copy_amount))
        {
            /// output: Hello
            ///              ^-op
            ///         ^-match; offset = 5
            ///
            /// output: Hello
            ///         [------] - copy_amount bytes
            ///              [------] - copy them here
            ///
            /// output: HelloHelloHel
            ///         ^-match      ^-op

            copyOverlap<copy_amount, use_shuffle>(op, match, offset);
        }
        else
        {
            copy<copy_amount>(op, match);
            match += copy_amount;
        }

        op += copy_amount;

        copy<copy_amount>(op, match); /// copy_amount + copy_amount - 1 - 4 * 2 bytes after the buffer.
        if (length > copy_amount * 2)
            wildCopy<copy_amount>(op + copy_amount, match + copy_amount, copy_end);

        op = copy_end;
    }
}

}


void decompress(
    const char * const source,
    char * const dest,
    size_t source_size,
    size_t dest_size,
    PerformanceStatistics & statistics [[maybe_unused]])
{
    if (source_size == 0 || dest_size == 0)
        return;

    /// Don't run the timer if the block is too small.
    if (dest_size >= 32768)
    {
        size_t best_variant = statistics.select();

        /// Run the selected method and measure the time.

        Stopwatch watch;
        if (best_variant == 0)
            decompressImpl<16, true>(source, dest, dest_size);
        if (best_variant == 1)
            decompressImpl<16, false>(source, dest, dest_size);
        if (best_variant == 2)
            decompressImpl<8, true>(source, dest, dest_size);
        if (best_variant == 3)
            decompressImpl<32, false>(source, dest, dest_size);

        watch.stop();

        /// Update performance statistics.

        statistics.data[best_variant].update(watch.elapsedSeconds(), dest_size);
    }
    else
    {
        decompressImpl<8, false>(source, dest, dest_size);
    }
}
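
/** A hypothetical usage sketch (buffer names are made up; the real callers are the compression codecs):
  *
  *     LZ4::PerformanceStatistics stats;
  *     /// 'dest' must be allocated with some slack beyond 'uncompressed_size',
  *     /// because the copy routines above intentionally write a few bytes past the end.
  *     LZ4::decompress(compressed_data, dest, compressed_size, uncompressed_size, stats);
  *
  * Reusing the same statistics object across many blocks lets select() converge on the fastest
  * variant for the current hardware.
  */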


void StreamStatistics::literal(size_t length)
{
    ++num_tokens;
    sum_literal_lengths += length;
}

void StreamStatistics::match(size_t length, size_t offset)
{
    ++num_tokens;
    sum_match_lengths += length;
    sum_match_offsets += offset;
    count_match_offset_less_8 += offset < 8;
    count_match_offset_less_16 += offset < 16;
    count_match_replicate_itself += offset < length;
}

void StreamStatistics::print() const
{
    std::cerr
        << "Num tokens: " << num_tokens
        << ", Avg literal length: " << double(sum_literal_lengths) / num_tokens
        << ", Avg match length: " << double(sum_match_lengths) / num_tokens
        << ", Avg match offset: " << double(sum_match_offsets) / num_tokens
        << ", Offset < 8 ratio: " << double(count_match_offset_less_8) / num_tokens
        << ", Offset < 16 ratio: " << double(count_match_offset_less_16) / num_tokens
        << ", Match replicate itself: " << double(count_match_replicate_itself) / num_tokens
        << "\n";
}

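/// Walks over an LZ4 block in the same way as decompressImpl, but only collects statistics;
/// it never writes to 'dest', which is used only to compute where the output would end.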
void statistics(
    const char * const source,
    char * const dest,
    size_t dest_size,
    StreamStatistics & stat)
{
    const UInt8 * ip = reinterpret_cast<const UInt8 *>(source);
    UInt8 * op = reinterpret_cast<UInt8 *>(dest);
    UInt8 * const output_end = op + dest_size;
    while (1)
    {
        size_t length;

        auto continue_read_length = [&]
        {
            unsigned s;
            do
            {
                s = *ip++;
                length += s;
            } while (unlikely(s == 255));
        };

        auto token = *ip++;
        length = token >> 4;
        if (length == 0x0F)
            continue_read_length();

        stat.literal(length);

        ip += length;
        op += length;

        if (op > output_end)
            return;

        size_t offset = unalignedLoad<UInt16>(ip);
        ip += 2;

        length = token & 0x0F;
        if (length == 0x0F)
            continue_read_length();
        length += 4;

        stat.match(length, offset);

        op += length;
    }
}

}
