1 | #include "LZ4_decompress_faster.h" |
2 | |
3 | #include <string.h> |
4 | #include <iostream> |
5 | #include <random> |
6 | #include <algorithm> |
7 | #include <Core/Defines.h> |
8 | #include <Common/Stopwatch.h> |
9 | #include <common/likely.h> |
10 | #include <common/Types.h> |
11 | #include <common/unaligned.h> |
12 | |
13 | #ifdef __SSE2__ |
14 | #include <emmintrin.h> |
15 | #endif |
16 | |
17 | #ifdef __SSSE3__ |
18 | #include <tmmintrin.h> |
19 | #endif |
20 | |
21 | #ifdef __aarch64__ |
22 | #include <arm_neon.h> |
23 | #endif |
24 | |
25 | namespace LZ4 |
26 | { |
27 | |
28 | namespace |
29 | { |
30 | |
31 | template <size_t N> [[maybe_unused]] void copy(UInt8 * dst, const UInt8 * src); |
32 | template <size_t N> [[maybe_unused]] void wildCopy(UInt8 * dst, const UInt8 * src, UInt8 * dst_end); |
33 | template <size_t N, bool USE_SHUFFLE> [[maybe_unused]] void copyOverlap(UInt8 * op, const UInt8 *& match, const size_t offset); |
34 | |
35 | |
36 | inline void copy8(UInt8 * dst, const UInt8 * src) |
37 | { |
38 | memcpy(dst, src, 8); |
39 | } |
40 | |
41 | inline void wildCopy8(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) |
42 | { |
    /// Unrolling with clang causes a >10% performance degradation.
44 | #if defined(__clang__) |
45 | #pragma nounroll |
46 | #endif |
47 | do |
48 | { |
49 | copy8(dst, src); |
50 | dst += 8; |
51 | src += 8; |
52 | } while (dst < dst_end); |
53 | } |
54 | |
55 | inline void copyOverlap8(UInt8 * op, const UInt8 *& match, const size_t offset) |
56 | { |
    /// 4 % n.
    /// Or if 4 % n is zero, we use n.
    /// It gives an equivalent result, but is more CPU-friendly for an unknown reason.
60 | static constexpr int shift1[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; |
61 | |
62 | /// 8 % n - 4 % n |
63 | static constexpr int shift2[] = { 0, 0, 0, 1, 0, -1, -2, -3 }; |
64 | |
65 | op[0] = match[0]; |
66 | op[1] = match[1]; |
67 | op[2] = match[2]; |
68 | op[3] = match[3]; |
69 | |
70 | match += shift1[offset]; |
71 | memcpy(op + 4, match, 4); |
72 | match += shift2[offset]; |
73 | } |
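
/** For illustration: for offsets in [1, 8), the net effect of copyOverlap8 is the same as the
  * following naive byte-by-byte sketch (a hypothetical reference only, not used here; the
  * table-based version above avoids a data-dependent loop). The final advance of 'match',
  * shift1[offset] + shift2[offset], is congruent to 8 modulo offset, so the caller's next copy
  * continues the repeating pattern correctly.
  *
  * inline void copyOverlap8Naive(UInt8 * op, const UInt8 *& match, const size_t offset)
  * {
  *     for (size_t i = 0; i < 8; ++i)
  *         op[i] = match[i % offset];
  *     match += 8 % offset ? 8 % offset : offset;   /// same final position as copyOverlap8
  * }
  */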
74 | |
75 | |
76 | #if defined(__x86_64__) || defined(__PPC__) |
77 | |
78 | /** We use 'xmm' (128bit SSE) registers here to shuffle 16 bytes. |
79 | * |
  * It is possible to use 'mm' (64bit MMX) registers to shuffle just the 8 bytes we need.
  *
  * There is a corresponding version of the 'pshufb' instruction that operates on 'mm' registers
  * (it operates on MMX registers although it was introduced in SSSE3),
  * and the compiler library provides the corresponding intrinsic: '_mm_shuffle_pi8'.
85 | * |
86 | * It can be done like this: |
87 | * |
88 | * unalignedStore(op, _mm_shuffle_pi8( |
89 | * unalignedLoad<__m64>(match), |
90 | * unalignedLoad<__m64>(masks + 8 * offset))); |
91 | * |
  * This is perfectly correct and this code has the same or even better performance.
  *
  * But if we write the code this way, it will lead to
  * extremely weird and non-obvious effects
  * in completely unrelated parts of the code.
  *
  * This is because using MMX registers alters the mode of operation of the x87 FPU,
  * and subsequent FPU operations become broken.
100 | * |
101 | * Example 1. |
102 | * Compile this code without optimizations: |
103 | * |
104 | #include <vector> |
105 | #include <unordered_set> |
106 | #include <iostream> |
107 | #include <tmmintrin.h> |
108 | |
109 | int main(int, char **) |
110 | { |
111 | [[maybe_unused]] __m64 shuffled = _mm_shuffle_pi8(__m64{}, __m64{}); |
112 | |
113 | std::vector<int> vec; |
114 | std::unordered_set<int> set(vec.begin(), vec.end()); |
115 | |
116 | std::cerr << set.size() << "\n"; |
117 | return 0; |
118 | } |
119 | |
120 | $ g++ -g -O0 -mssse3 -std=c++17 mmx_bug1.cpp && ./a.out |
121 | terminate called after throwing an instance of 'std::bad_alloc' |
122 | what(): std::bad_alloc |
123 | |
124 | Also reproduced with clang. But only with libstdc++, not with libc++. |
125 | |
126 | * Example 2. |
127 | |
128 | #include <math.h> |
129 | #include <iostream> |
130 | #include <tmmintrin.h> |
131 | |
132 | int main(int, char **) |
133 | { |
134 | double max_fill = 1; |
135 | |
136 | std::cerr << (long double)max_fill << "\n"; |
137 | [[maybe_unused]] __m64 shuffled = _mm_shuffle_pi8(__m64{}, __m64{}); |
138 | std::cerr << (long double)max_fill << "\n"; |
139 | |
140 | return 0; |
141 | } |
142 | |
143 | $ g++ -g -O0 -mssse3 -std=c++17 mmx_bug2.cpp && ./a.out |
144 | 1 |
145 | -nan |
146 | |
147 | * Explanation: |
148 | * |
149 | * https://stackoverflow.com/questions/33692969/assembler-mmx-errors |
150 | * https://software.intel.com/en-us/node/524274 |
151 | * |
  * Actually it's possible to execute the 'emms' instruction after the decompression routine.
  * But it's easier to just use 'xmm' registers and avoid 'mm' registers entirely.
154 | */ |
155 | inline void copyOverlap8Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset) |
156 | { |
157 | #if defined(__SSSE3__) && !defined(MEMORY_SANITIZER) |
158 | |
159 | static constexpr UInt8 __attribute__((__aligned__(8))) masks[] = |
160 | { |
161 | 0, 1, 2, 2, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */ |
162 | 0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */ |
163 | 0, 1, 0, 1, 0, 1, 0, 1, |
164 | 0, 1, 2, 0, 1, 2, 0, 1, |
165 | 0, 1, 2, 3, 0, 1, 2, 3, |
166 | 0, 1, 2, 3, 4, 0, 1, 2, |
167 | 0, 1, 2, 3, 4, 5, 0, 1, |
168 | 0, 1, 2, 3, 4, 5, 6, 0, |
        0, 0, 0, 0, 0, 0, 0, 0, /* this row is not used: padding to allow reading 16 bytes starting at the previous row */
170 | }; |
171 | |
172 | _mm_storeu_si128(reinterpret_cast<__m128i *>(op), |
173 | _mm_shuffle_epi8( |
174 | _mm_loadu_si128(reinterpret_cast<const __m128i *>(match)), |
175 | _mm_loadu_si128(reinterpret_cast<const __m128i *>(masks + 8 * offset)))); |
176 | |
177 | match += masks[offset]; |
178 | |
179 | #else |
180 | copyOverlap8(op, match, offset); |
181 | #endif |
182 | } |
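
/** Example: for offset = 3 the mask row is {0, 1, 2, 0, 1, 2, 0, 1}, so a single 'pshufb' replicates
  * the 3-byte pattern across the 8 output bytes. Row 0 of 'masks' is not a shuffle mask: masks[offset]
  * stores the amount by which to advance 'match' (congruent to 8 modulo offset), so one table serves
  * both purposes. A minimal standalone sketch of the shuffle itself (illustration only, assumes SSSE3,
  * not part of this translation unit):

#include <cstdio>
#include <tmmintrin.h>

int main()
{
    alignas(16) unsigned char src[16] = { 'a', 'b', 'c' };
    alignas(16) unsigned char mask[16] = { 0, 1, 2, 0, 1, 2, 0, 1 };
    alignas(16) unsigned char dst[16];

    _mm_store_si128(reinterpret_cast<__m128i *>(dst),
        _mm_shuffle_epi8(
            _mm_load_si128(reinterpret_cast<const __m128i *>(src)),
            _mm_load_si128(reinterpret_cast<const __m128i *>(mask))));

    std::printf("%.8s\n", reinterpret_cast<const char *>(dst));   /// prints "abcabcab"
    return 0;
}

  */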
183 | |
184 | #endif |
185 | |
186 | |
187 | #ifdef __aarch64__ |
188 | |
189 | inline void copyOverlap8Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset) |
190 | { |
191 | static constexpr UInt8 __attribute__((__aligned__(8))) masks[] = |
192 | { |
193 | 0, 1, 2, 2, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */ |
194 | 0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */ |
195 | 0, 1, 0, 1, 0, 1, 0, 1, |
196 | 0, 1, 2, 0, 1, 2, 0, 1, |
197 | 0, 1, 2, 3, 0, 1, 2, 3, |
198 | 0, 1, 2, 3, 4, 0, 1, 2, |
199 | 0, 1, 2, 3, 4, 5, 0, 1, |
200 | 0, 1, 2, 3, 4, 5, 6, 0, |
201 | }; |
202 | |
203 | unalignedStore<uint8x8_t>(op, vtbl1_u8(unalignedLoad<uint8x8_t>(match), unalignedLoad<uint8x8_t>(masks + 8 * offset))); |
204 | match += masks[offset]; |
205 | } |
206 | |
207 | #endif |
208 | |
209 | |
210 | template <> void inline copy<8>(UInt8 * dst, const UInt8 * src) { copy8(dst, src); } |
211 | template <> void inline wildCopy<8>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) { wildCopy8(dst, src, dst_end); } |
212 | template <> void inline copyOverlap<8, false>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap8(op, match, offset); } |
213 | template <> void inline copyOverlap<8, true>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap8Shuffle(op, match, offset); } |
214 | |
215 | |
216 | inline void copy16(UInt8 * dst, const UInt8 * src) |
217 | { |
218 | #ifdef __SSE2__ |
219 | _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), |
220 | _mm_loadu_si128(reinterpret_cast<const __m128i *>(src))); |
221 | #else |
222 | memcpy(dst, src, 16); |
223 | #endif |
224 | } |
225 | |
226 | inline void wildCopy16(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) |
227 | { |
    /// Unrolling with clang causes a >10% performance degradation.
229 | #if defined(__clang__) |
230 | #pragma nounroll |
231 | #endif |
232 | do |
233 | { |
234 | copy16(dst, src); |
235 | dst += 16; |
236 | src += 16; |
237 | } while (dst < dst_end); |
238 | } |
239 | |
240 | inline void copyOverlap16(UInt8 * op, const UInt8 *& match, const size_t offset) |
241 | { |
242 | /// 4 % n. |
243 | static constexpr int shift1[] |
244 | = { 0, 1, 2, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; |
245 | |
246 | /// 8 % n - 4 % n |
247 | static constexpr int shift2[] |
248 | = { 0, 0, 0, 1, 0, -1, -2, -3, -4, 4, 4, 4, 4, 4, 4, 4 }; |
249 | |
250 | /// 16 % n - 8 % n |
251 | static constexpr int shift3[] |
252 | = { 0, 0, 0, -1, 0, -2, 2, 1, 8, -1, -2, -3, -4, -5, -6, -7 }; |
253 | |
254 | op[0] = match[0]; |
255 | op[1] = match[1]; |
256 | op[2] = match[2]; |
257 | op[3] = match[3]; |
258 | |
259 | match += shift1[offset]; |
260 | memcpy(op + 4, match, 4); |
261 | match += shift2[offset]; |
262 | memcpy(op + 8, match, 8); |
263 | match += shift3[offset]; |
264 | } |
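
/** The tables above generalize the 8-byte variant: after copyOverlap16, 'match' has advanced by
  * shift1[n] + shift2[n] + shift3[n], which is congruent to 16 modulo n for every n in [1, 16).
  * A throwaway sanity check of that invariant (a sketch only; assumes the tables are in scope
  * and <cassert> is included):
  *
  * for (size_t n = 1; n < 16; ++n)
  *     assert((shift1[n] + shift2[n] + shift3[n]) % n == 16 % n);
  */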
265 | |
266 | |
267 | #if defined(__x86_64__) || defined(__PPC__) |
268 | |
269 | inline void copyOverlap16Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset) |
270 | { |
271 | #if defined(__SSSE3__) && !defined(MEMORY_SANITIZER) |
272 | |
273 | static constexpr UInt8 __attribute__((__aligned__(16))) masks[] = |
274 | { |
275 | 0, 1, 2, 1, 4, 1, 4, 2, 8, 7, 6, 5, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */ |
276 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */ |
277 | 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, |
278 | 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, |
279 | 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, |
280 | 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, |
281 | 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, |
282 | 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, |
283 | 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, |
284 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, |
285 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, |
286 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, |
287 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, |
288 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, |
289 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, |
290 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, |
291 | }; |
292 | |
293 | _mm_storeu_si128(reinterpret_cast<__m128i *>(op), |
294 | _mm_shuffle_epi8( |
295 | _mm_loadu_si128(reinterpret_cast<const __m128i *>(match)), |
296 | _mm_load_si128(reinterpret_cast<const __m128i *>(masks) + offset))); |
297 | |
298 | match += masks[offset]; |
299 | |
300 | #else |
301 | copyOverlap16(op, match, offset); |
302 | #endif |
303 | } |
304 | |
305 | #endif |
306 | |
307 | #ifdef __aarch64__ |
308 | |
309 | inline void copyOverlap16Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset) |
310 | { |
311 | static constexpr UInt8 __attribute__((__aligned__(16))) masks[] = |
312 | { |
313 | 0, 1, 2, 1, 4, 1, 4, 2, 8, 7, 6, 5, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */ |
314 | 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */ |
315 | 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, |
316 | 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, |
317 | 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, |
318 | 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, |
319 | 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, |
320 | 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, |
321 | 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, |
322 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, |
323 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, |
324 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4, |
325 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3, |
326 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2, |
327 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1, |
328 | 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0, |
329 | }; |
330 | |
331 | unalignedStore<uint8x8_t>(op, |
332 | vtbl2_u8(unalignedLoad<uint8x8x2_t>(match), unalignedLoad<uint8x8_t>(masks + 16 * offset))); |
333 | |
334 | unalignedStore<uint8x8_t>(op + 8, |
335 | vtbl2_u8(unalignedLoad<uint8x8x2_t>(match), unalignedLoad<uint8x8_t>(masks + 16 * offset + 8))); |
336 | |
337 | match += masks[offset]; |
338 | } |
339 | |
340 | #endif |
341 | |
342 | |
343 | template <> void inline copy<16>(UInt8 * dst, const UInt8 * src) { copy16(dst, src); } |
344 | template <> void inline wildCopy<16>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) { wildCopy16(dst, src, dst_end); } |
345 | template <> void inline copyOverlap<16, false>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap16(op, match, offset); } |
346 | template <> void inline copyOverlap<16, true>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap16Shuffle(op, match, offset); } |
347 | |
348 | |
349 | inline void copy32(UInt8 * dst, const UInt8 * src) |
350 | { |
    /// There was an AVX variant here, but mixing it with SSE instructions caused a big slowdown.
352 | #if defined(__SSE2__) |
353 | _mm_storeu_si128(reinterpret_cast<__m128i *>(dst), |
354 | _mm_loadu_si128(reinterpret_cast<const __m128i *>(src))); |
355 | _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + 16), |
356 | _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16))); |
357 | #else |
358 | memcpy(dst, src, 16); |
359 | memcpy(dst + 16, src + 16, 16); |
360 | #endif |
361 | } |
362 | |
363 | inline void wildCopy32(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) |
364 | { |
    /// Unrolling with clang causes a >10% performance degradation.
366 | #if defined(__clang__) |
367 | #pragma nounroll |
368 | #endif |
369 | do |
370 | { |
371 | copy32(dst, src); |
372 | dst += 32; |
373 | src += 32; |
374 | } while (dst < dst_end); |
375 | } |
376 | |
377 | inline void copyOverlap32(UInt8 * op, const UInt8 *& match, const size_t offset) |
378 | { |
379 | /// 4 % n. |
380 | static constexpr int shift1[] |
381 | = { 0, 1, 2, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; |
382 | |
383 | /// 8 % n - 4 % n |
384 | static constexpr int shift2[] |
385 | = { 0, 0, 0, 1, 0, -1, -2, -3, -4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 }; |
386 | |
387 | /// 16 % n - 8 % n |
388 | static constexpr int shift3[] |
389 | = { 0, 0, 0, -1, 0, -2, 2, 1, 8, -1, -2, -3, -4, -5, -6, -7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 }; |
390 | |
391 | /// 32 % n - 16 % n |
392 | static constexpr int shift4[] |
393 | = { 0, 0, 0, 1, 0, 1, -2, 2, 0, -2, -4, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15 }; |
394 | |
395 | op[0] = match[0]; |
396 | op[1] = match[1]; |
397 | op[2] = match[2]; |
398 | op[3] = match[3]; |
399 | |
400 | match += shift1[offset]; |
401 | memcpy(op + 4, match, 4); |
402 | match += shift2[offset]; |
403 | memcpy(op + 8, match, 8); |
404 | match += shift3[offset]; |
405 | memcpy(op + 16, match, 16); |
406 | match += shift4[offset]; |
407 | } |
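
/// As with the 8- and 16-byte variants, the total advance of 'match'
/// (shift1[n] + shift2[n] + shift3[n] + shift4[n]) is congruent to 32 modulo n,
/// so the repeating pattern stays aligned for the caller's subsequent copies.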
408 | |
409 | |
410 | template <> void inline copy<32>(UInt8 * dst, const UInt8 * src) { copy32(dst, src); } |
411 | template <> void inline wildCopy<32>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) { wildCopy32(dst, src, dst_end); } |
412 | template <> void inline copyOverlap<32, false>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap32(op, match, offset); } |
413 | |
414 | |
415 | /// See also https://stackoverflow.com/a/30669632 |
416 | |
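/** For reference, each LZ4 sequence in the compressed block has the layout:
  *   token (1 byte: upper nibble = literal length, lower nibble = match length - 4),
  *   optional extra literal-length bytes when the nibble is 15 (each byte is added to the length;
  *     a value of 0xFF means another byte follows),
  *   the literals themselves,
  *   2-byte little-endian match offset,
  *   optional extra match-length bytes (same scheme as for literals).
  *
  * A hand-made example (not taken from a real stream): the bytes
  *   0x52 'H' 'e' 'l' 'l' 'o' 0x05 0x00
  * decode as 5 literal bytes "Hello" followed by a match of length 2 + 4 = 6 at offset 5,
  * producing "HelloHelloH".
  */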
417 | template <size_t copy_amount, bool use_shuffle> |
418 | void NO_INLINE decompressImpl( |
419 | const char * const source, |
420 | char * const dest, |
421 | size_t dest_size) |
422 | { |
423 | const UInt8 * ip = reinterpret_cast<const UInt8 *>(source); |
424 | UInt8 * op = reinterpret_cast<UInt8 *>(dest); |
425 | UInt8 * const output_end = op + dest_size; |
426 | |
    /// Unrolling with clang causes a >10% performance degradation.
428 | #if defined(__clang__) |
429 | #pragma nounroll |
430 | #endif |
431 | while (1) |
432 | { |
433 | size_t length; |
434 | |
435 | auto continue_read_length = [&] |
436 | { |
437 | unsigned s; |
438 | do |
439 | { |
440 | s = *ip++; |
441 | length += s; |
442 | } while (unlikely(s == 255)); |
443 | }; |
444 | |
445 | /// Get literal length. |
446 | |
447 | const unsigned token = *ip++; |
448 | length = token >> 4; |
449 | if (length == 0x0F) |
450 | continue_read_length(); |
451 | |
452 | /// Copy literals. |
453 | |
454 | UInt8 * copy_end = op + length; |
455 | |
456 | /// input: Hello, world |
457 | /// ^-ip |
458 | /// output: xyz |
459 | /// ^-op ^-copy_end |
460 | /// output: xyzHello, w |
461 | /// ^- excessive copied bytes due to "wildCopy" |
462 | /// input: Hello, world |
463 | /// ^-ip |
464 | /// output: xyzHello, w |
465 | /// ^-op (we will overwrite excessive bytes on next iteration) |
466 | |
        wildCopy<copy_amount>(op, ip, copy_end); /// Here we can write up to copy_amount - 1 bytes after the end of the buffer.
468 | |
469 | ip += length; |
470 | op = copy_end; |
471 | |
472 | if (copy_end >= output_end) |
473 | return; |
474 | |
475 | /// Get match offset. |
476 | |
477 | size_t offset = unalignedLoad<UInt16>(ip); |
478 | ip += 2; |
479 | const UInt8 * match = op - offset; |
480 | |
481 | /// Get match length. |
482 | |
483 | length = token & 0x0F; |
484 | if (length == 0x0F) |
485 | continue_read_length(); |
486 | length += 4; |
487 | |
        /// Copy the match within the block; this produces an overlapping pattern. The match may replicate itself.
489 | |
490 | copy_end = op + length; |
491 | |
        /** Here we can write up to copy_amount - 1 - 4 * 2 bytes after the buffer.
          * The worst case is when offset = 1 and length = 4.
          */
495 | |
496 | if (unlikely(offset < copy_amount)) |
497 | { |
498 | /// output: Hello |
499 | /// ^-op |
500 | /// ^-match; offset = 5 |
501 | /// |
502 | /// output: Hello |
503 | /// [------] - copy_amount bytes |
504 | /// [------] - copy them here |
505 | /// |
506 | /// output: HelloHelloHel |
507 | /// ^-match ^-op |
508 | |
509 | copyOverlap<copy_amount, use_shuffle>(op, match, offset); |
510 | } |
511 | else |
512 | { |
513 | copy<copy_amount>(op, match); |
514 | match += copy_amount; |
515 | } |
516 | |
517 | op += copy_amount; |
518 | |
        copy<copy_amount>(op, match); /// copy_amount + copy_amount - 1 - 4 * 2 bytes after the buffer.
520 | if (length > copy_amount * 2) |
521 | wildCopy<copy_amount>(op + copy_amount, match + copy_amount, copy_end); |
522 | |
523 | op = copy_end; |
524 | } |
525 | } |
526 | |
527 | } |
528 | |
529 | |
530 | void decompress( |
531 | const char * const source, |
532 | char * const dest, |
533 | size_t source_size, |
534 | size_t dest_size, |
535 | PerformanceStatistics & statistics [[maybe_unused]]) |
536 | { |
537 | if (source_size == 0 || dest_size == 0) |
538 | return; |
539 | |
540 | /// Don't run timer if the block is too small. |
541 | if (dest_size >= 32768) |
542 | { |
543 | size_t best_variant = statistics.select(); |
544 | |
545 | /// Run the selected method and measure time. |
546 | |
547 | Stopwatch watch; |
548 | if (best_variant == 0) |
549 | decompressImpl<16, true>(source, dest, dest_size); |
550 | if (best_variant == 1) |
551 | decompressImpl<16, false>(source, dest, dest_size); |
552 | if (best_variant == 2) |
553 | decompressImpl<8, true>(source, dest, dest_size); |
554 | if (best_variant == 3) |
555 | decompressImpl<32, false>(source, dest, dest_size); |
556 | |
557 | watch.stop(); |
558 | |
559 | /// Update performance statistics. |
560 | |
561 | statistics.data[best_variant].update(watch.elapsedSeconds(), dest_size); |
562 | } |
563 | else |
564 | { |
565 | decompressImpl<8, false>(source, dest, dest_size); |
566 | } |
567 | } |
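
/** A minimal usage sketch (illustration only; buffer names are hypothetical, and it assumes the
  * caller already knows the decompressed size, as ClickHouse's compressed block format provides,
  * and that PerformanceStatistics is default-constructible as declared in the header):
  *
  * LZ4::PerformanceStatistics perf_stat;
  * LZ4::decompress(compressed.data(), decompressed.data(), compressed.size(), decompressed.size(), perf_stat);
  */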
568 | |
569 | |
570 | void StreamStatistics::literal(size_t length) |
571 | { |
572 | ++num_tokens; |
573 | sum_literal_lengths += length; |
574 | } |
575 | |
576 | void StreamStatistics::match(size_t length, size_t offset) |
577 | { |
578 | ++num_tokens; |
579 | sum_match_lengths += length; |
580 | sum_match_offsets += offset; |
581 | count_match_offset_less_8 += offset < 8; |
582 | count_match_offset_less_16 += offset < 16; |
583 | count_match_replicate_itself += offset < length; |
584 | } |
585 | |
586 | void StreamStatistics::print() const |
587 | { |
588 | std::cerr |
589 | << "Num tokens: " << num_tokens |
590 | << ", Avg literal length: " << double(sum_literal_lengths) / num_tokens |
591 | << ", Avg match length: " << double(sum_match_lengths) / num_tokens |
592 | << ", Avg match offset: " << double(sum_match_offsets) / num_tokens |
593 | << ", Offset < 8 ratio: " << double(count_match_offset_less_8) / num_tokens |
594 | << ", Offset < 16 ratio: " << double(count_match_offset_less_16) / num_tokens |
595 | << ", Match replicate itself: " << double(count_match_replicate_itself) / num_tokens |
        << "\n";
597 | } |
598 | |
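/// Walk the compressed stream and collect token statistics without producing output:
/// 'op' advances exactly as it would during decompression, but nothing is written to 'dest';
/// the destination is only used to know where the block ends.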
599 | void statistics( |
600 | const char * const source, |
601 | char * const dest, |
602 | size_t dest_size, |
603 | StreamStatistics & stat) |
604 | { |
605 | const UInt8 * ip = reinterpret_cast<const UInt8 *>(source); |
606 | UInt8 * op = reinterpret_cast<UInt8 *>(dest); |
607 | UInt8 * const output_end = op + dest_size; |
608 | while (1) |
609 | { |
610 | size_t length; |
611 | |
612 | auto continue_read_length = [&] |
613 | { |
614 | unsigned s; |
615 | do |
616 | { |
617 | s = *ip++; |
618 | length += s; |
619 | } while (unlikely(s == 255)); |
620 | }; |
621 | |
622 | auto token = *ip++; |
623 | length = token >> 4; |
624 | if (length == 0x0F) |
625 | continue_read_length(); |
626 | |
627 | stat.literal(length); |
628 | |
629 | ip += length; |
630 | op += length; |
631 | |
632 | if (op > output_end) |
633 | return; |
634 | |
635 | size_t offset = unalignedLoad<UInt16>(ip); |
636 | ip += 2; |
637 | |
638 | length = token & 0x0F; |
639 | if (length == 0x0F) |
640 | continue_read_length(); |
641 | length += 4; |
642 | |
643 | stat.match(length, offset); |
644 | |
645 | op += length; |
646 | } |
647 | } |
648 | |
649 | } |
650 | |