#include "LZ4_decompress_faster.h"

#include <string.h>
#include <iostream>
#include <random>
#include <algorithm>
#include <Core/Defines.h>
#include <Common/Stopwatch.h>
#include <common/likely.h>
#include <common/Types.h>
#include <common/unaligned.h>

#ifdef __SSE2__
#include <emmintrin.h>
#endif

#ifdef __SSSE3__
#include <tmmintrin.h>
#endif

#ifdef __aarch64__
#include <arm_neon.h>
#endif

namespace LZ4
{

namespace
{

template <size_t N> [[maybe_unused]] void copy(UInt8 * dst, const UInt8 * src);
template <size_t N> [[maybe_unused]] void wildCopy(UInt8 * dst, const UInt8 * src, UInt8 * dst_end);
template <size_t N, bool USE_SHUFFLE> [[maybe_unused]] void copyOverlap(UInt8 * op, const UInt8 *& match, const size_t offset);


inline void copy8(UInt8 * dst, const UInt8 * src)
{
    memcpy(dst, src, 8);
}

/// Copy in chunks of 8 bytes until 'dst' reaches 'dst_end'. Note that it may write
/// up to 7 bytes past 'dst_end', so the caller must leave that much slack after the destination.
inline void wildCopy8(UInt8 * dst, const UInt8 * src, UInt8 * dst_end)
{
    /// Unrolling with clang causes a >10% performance degradation.
#if defined(__clang__)
    #pragma nounroll
#endif
    do
    {
        copy8(dst, src);
        dst += 8;
        src += 8;
    } while (dst < dst_end);
}

inline void copyOverlap8(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    /// 4 % n.
    /// Or if 4 % n is zero, we use n.
    /// It gives an equivalent result, but is more CPU friendly for an unknown reason.
    static constexpr int shift1[] = { 0, 1, 2, 1, 4, 4, 4, 4 };

    /// 8 % n - 4 % n
    static constexpr int shift2[] = { 0, 0, 0, 1, 0, -1, -2, -3 };

    op[0] = match[0];
    op[1] = match[1];
    op[2] = match[2];
    op[3] = match[3];

    match += shift1[offset];
    memcpy(op + 4, match, 4);
    match += shift2[offset];
}
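
/** A worked example of the shift tables above (illustrative snippet, not part of the build):
  *
  *   UInt8 buf[16] = { 'X', 'Y', 'Z' };  /// a pattern with period 3 at the start of the buffer
  *   UInt8 * op = buf + 3;
  *   const UInt8 * match = buf;          /// offset = 3
  *   copyOverlap8(op, match, 3);         /// buf now starts with "XYZXYZXYZXY"
  *
  * The first four byte-wise copies write X Y Z X; shift1[3] == 1 (4 % 3) moves 'match' to Y,
  * so the memcpy appends Y Z X Y; shift2[3] == 1 then leaves 'match' advanced by 2 == 8 % 3 in total,
  * which is exactly where the replicated pattern continues for the bytes at op + 8.
  */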


#if defined(__x86_64__) || defined(__PPC__)

/** We use 'xmm' (128bit SSE) registers here to shuffle 16 bytes.
  *
  * It is possible to use 'mm' (64bit MMX) registers to shuffle just the 8 bytes we need.
  *
  * There is a corresponding version of the 'pshufb' instruction that operates on 'mm' registers
  * (it operates on MMX registers although it is available in SSSE3),
  * and the compiler library has the corresponding intrinsic: '_mm_shuffle_pi8'.
  *
  * It can be done like this:
  *
  *   unalignedStore(op, _mm_shuffle_pi8(
  *       unalignedLoad<__m64>(match),
  *       unalignedLoad<__m64>(masks + 8 * offset)));
  *
  * This is perfectly correct and this code has the same or even better performance.
  *
  * But if we write the code this way, it will lead to
  * extremely weird and extremely non-obvious
  * effects in completely unrelated parts of the code.
  *
  * This is because using MMX registers alters the mode of operation of the x87 FPU,
  * and FPU operations then become broken.
  *
  * Example 1.
  * Compile this code without optimizations:
  *
    #include <vector>
    #include <unordered_set>
    #include <iostream>
    #include <tmmintrin.h>

    int main(int, char **)
    {
        [[maybe_unused]] __m64 shuffled = _mm_shuffle_pi8(__m64{}, __m64{});

        std::vector<int> vec;
        std::unordered_set<int> set(vec.begin(), vec.end());

        std::cerr << set.size() << "\n";
        return 0;
    }

    $ g++ -g -O0 -mssse3 -std=c++17 mmx_bug1.cpp && ./a.out
    terminate called after throwing an instance of 'std::bad_alloc'
      what():  std::bad_alloc

    Also reproduced with clang, but only with libstdc++, not with libc++.

  * Example 2.

    #include <math.h>
    #include <iostream>
    #include <tmmintrin.h>

    int main(int, char **)
    {
        double max_fill = 1;

        std::cerr << (long double)max_fill << "\n";
        [[maybe_unused]] __m64 shuffled = _mm_shuffle_pi8(__m64{}, __m64{});
        std::cerr << (long double)max_fill << "\n";

        return 0;
    }

    $ g++ -g -O0 -mssse3 -std=c++17 mmx_bug2.cpp && ./a.out
    1
    -nan

  * Explanation:
  *
  * https://stackoverflow.com/questions/33692969/assembler-mmx-errors
  * https://software.intel.com/en-us/node/524274
  *
  * Actually, it is possible to issue the 'emms' instruction after the decompression routine.
  * But it is easier to just use 'xmm' registers and avoid 'mm' registers altogether.
  */
inline void copyOverlap8Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)

    static constexpr UInt8 __attribute__((__aligned__(8))) masks[] =
    {
        0, 1, 2, 2, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */
        0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
        0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 2, 0, 1, 2, 0, 1,
        0, 1, 2, 3, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 0, 1, 2,
        0, 1, 2, 3, 4, 5, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 0,
        0, 0, 0, 0, 0, 0, 0, 0, /* this row is not used: padding to allow reading 16 bytes starting at the previous row */
    };

    _mm_storeu_si128(reinterpret_cast<__m128i *>(op),
        _mm_shuffle_epi8(
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(match)),
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(masks + 8 * offset))));

    match += masks[offset];

#else
    copyOverlap8(op, match, offset);
#endif
}
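
/** How the shuffle variant handles the same offset = 3 example (derived from the table above, for illustration):
  * the mask row for offset = 3 is { 0, 1, 2, 0, 1, 2, 0, 1 }, so a single 'pshufb' gathers
  * match[0], match[1], match[2], match[0], ... and produces the replicated pattern X Y Z X Y Z X Y at once.
  * The store writes 16 bytes, but only the first 8 are meaningful; the rest is scratch that later copies
  * overwrite (the zero row at the end exists only so that the 16-byte mask load stays inside the array).
  * The first table row holds the amount by which to advance 'match': masks[3] == 2 == 8 % 3.
  */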

#endif


#ifdef __aarch64__

inline void copyOverlap8Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    static constexpr UInt8 __attribute__((__aligned__(8))) masks[] =
    {
        0, 1, 2, 2, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */
        0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
        0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 2, 0, 1, 2, 0, 1,
        0, 1, 2, 3, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 0, 1, 2,
        0, 1, 2, 3, 4, 5, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 0,
    };

    unalignedStore<uint8x8_t>(op, vtbl1_u8(unalignedLoad<uint8x8_t>(match), unalignedLoad<uint8x8_t>(masks + 8 * offset)));
    match += masks[offset];
}

#endif


template <> void inline copy<8>(UInt8 * dst, const UInt8 * src) { copy8(dst, src); }
template <> void inline wildCopy<8>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) { wildCopy8(dst, src, dst_end); }
template <> void inline copyOverlap<8, false>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap8(op, match, offset); }
template <> void inline copyOverlap<8, true>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap8Shuffle(op, match, offset); }


inline void copy16(UInt8 * dst, const UInt8 * src)
{
#ifdef __SSE2__
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst),
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
#else
    memcpy(dst, src, 16);
#endif
}

inline void wildCopy16(UInt8 * dst, const UInt8 * src, UInt8 * dst_end)
{
    /// Unrolling with clang causes a >10% performance degradation.
#if defined(__clang__)
    #pragma nounroll
#endif
    do
    {
        copy16(dst, src);
        dst += 16;
        src += 16;
    } while (dst < dst_end);
}

inline void copyOverlap16(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    /// 4 % n.
    static constexpr int shift1[]
        = { 0, 1, 2, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };

    /// 8 % n - 4 % n
    static constexpr int shift2[]
        = { 0, 0, 0, 1, 0, -1, -2, -3, -4, 4, 4, 4, 4, 4, 4, 4 };

    /// 16 % n - 8 % n
    static constexpr int shift3[]
        = { 0, 0, 0, -1, 0, -2, 2, 1, 8, -1, -2, -3, -4, -5, -6, -7 };

    op[0] = match[0];
    op[1] = match[1];
    op[2] = match[2];
    op[3] = match[3];

    match += shift1[offset];
    memcpy(op + 4, match, 4);
    match += shift2[offset];
    memcpy(op + 8, match, 8);
    match += shift3[offset];
}


#if defined(__x86_64__) || defined(__PPC__)

inline void copyOverlap16Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)

    static constexpr UInt8 __attribute__((__aligned__(16))) masks[] =
    {
        0, 1, 2, 1, 4, 1, 4, 2, 8, 7, 6, 5, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
        0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0,
        0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0,
        0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
    };

    _mm_storeu_si128(reinterpret_cast<__m128i *>(op),
        _mm_shuffle_epi8(
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(match)),
            _mm_load_si128(reinterpret_cast<const __m128i *>(masks) + offset)));

    match += masks[offset];

#else
    copyOverlap16(op, match, offset);
#endif
}

#endif

#ifdef __aarch64__

inline void copyOverlap16Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    static constexpr UInt8 __attribute__((__aligned__(16))) masks[] =
    {
        0, 1, 2, 1, 4, 1, 4, 2, 8, 7, 6, 5, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
        0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0,
        0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0,
        0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
    };

    unalignedStore<uint8x8_t>(op,
        vtbl2_u8(unalignedLoad<uint8x8x2_t>(match), unalignedLoad<uint8x8_t>(masks + 16 * offset)));

    unalignedStore<uint8x8_t>(op + 8,
        vtbl2_u8(unalignedLoad<uint8x8x2_t>(match), unalignedLoad<uint8x8_t>(masks + 16 * offset + 8)));

    match += masks[offset];
}

#endif


template <> void inline copy<16>(UInt8 * dst, const UInt8 * src) { copy16(dst, src); }
template <> void inline wildCopy<16>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) { wildCopy16(dst, src, dst_end); }
template <> void inline copyOverlap<16, false>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap16(op, match, offset); }
template <> void inline copyOverlap<16, true>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap16Shuffle(op, match, offset); }


inline void copy32(UInt8 * dst, const UInt8 * src)
{
    /// There was an AVX version here, but mixing AVX with SSE instructions caused a big slowdown.
#if defined(__SSE2__)
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst),
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + 16),
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16)));
#else
    memcpy(dst, src, 16);
    memcpy(dst + 16, src + 16, 16);
#endif
}

inline void wildCopy32(UInt8 * dst, const UInt8 * src, UInt8 * dst_end)
{
    /// Unrolling with clang causes a >10% performance degradation.
#if defined(__clang__)
    #pragma nounroll
#endif
    do
    {
        copy32(dst, src);
        dst += 32;
        src += 32;
    } while (dst < dst_end);
}

inline void copyOverlap32(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    /// 4 % n.
    static constexpr int shift1[]
        = { 0, 1, 2, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };

    /// 8 % n - 4 % n
    static constexpr int shift2[]
        = { 0, 0, 0, 1, 0, -1, -2, -3, -4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };

    /// 16 % n - 8 % n
    static constexpr int shift3[]
        = { 0, 0, 0, -1, 0, -2, 2, 1, 8, -1, -2, -3, -4, -5, -6, -7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 };

    /// 32 % n - 16 % n
    static constexpr int shift4[]
        = { 0, 0, 0, 1, 0, 1, -2, 2, 0, -2, -4, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15 };

    op[0] = match[0];
    op[1] = match[1];
    op[2] = match[2];
    op[3] = match[3];

    match += shift1[offset];
    memcpy(op + 4, match, 4);
    match += shift2[offset];
    memcpy(op + 8, match, 8);
    match += shift3[offset];
    memcpy(op + 16, match, 16);
    match += shift4[offset];
}


template <> void inline copy<32>(UInt8 * dst, const UInt8 * src) { copy32(dst, src); }
template <> void inline wildCopy<32>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) { wildCopy32(dst, src, dst_end); }
template <> void inline copyOverlap<32, false>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap32(op, match, offset); }


/// See also https://stackoverflow.com/a/30669632
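
/** A brief summary of the LZ4 sequence format parsed below (see the official LZ4 block format
  * description for the authoritative details):
  *
  * Each sequence starts with a token byte: the high nibble is the literal length and the low nibble
  * is the match length minus 4. A nibble value of 15 means that additional length bytes follow
  * (right after the token for literals, and after the offset for matches); each is added to the length,
  * and reading continues while a byte equals 255. For example, a literal length of 280 is encoded as
  * nibble 15 followed by bytes 255 and 10 (15 + 255 + 10 = 280).
  * After the literals come a 2-byte little-endian match offset and the optional extended match length
  * bytes. The last sequence of a block contains only literals.
  */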

template <size_t copy_amount, bool use_shuffle>
void NO_INLINE decompressImpl(
    const char * const source,
    char * const dest,
    size_t dest_size)
{
    const UInt8 * ip = reinterpret_cast<const UInt8 *>(source);
    UInt8 * op = reinterpret_cast<UInt8 *>(dest);
    UInt8 * const output_end = op + dest_size;

    /// Unrolling with clang causes a >10% performance degradation.
#if defined(__clang__)
    #pragma nounroll
#endif
    while (1)
    {
        size_t length;

        auto continue_read_length = [&]
        {
            unsigned s;
            do
            {
                s = *ip++;
                length += s;
            } while (unlikely(s == 255));
        };

        /// Get the literal length.

        const unsigned token = *ip++;
        length = token >> 4;
        if (length == 0x0F)
            continue_read_length();

        /// Copy literals.

        UInt8 * copy_end = op + length;

        /// input: Hello, world
        ///        ^-ip
        /// output: xyz
        ///            ^-op ^-copy_end
        /// output: xyzHello, w
        ///                 ^- excess bytes copied due to "wildCopy"
        /// input: Hello, world
        ///             ^-ip
        /// output: xyzHello, w
        ///                 ^-op (we will overwrite the excess bytes on the next iteration)

        wildCopy<copy_amount>(op, ip, copy_end); /// Here we can write up to copy_amount - 1 bytes after the end of the buffer.

        ip += length;
        op = copy_end;

        if (copy_end >= output_end)
            return;

        /// Get the match offset.

        size_t offset = unalignedLoad<UInt16>(ip);
        ip += 2;
        const UInt8 * match = op - offset;

        /// Get the match length.

        length = token & 0x0F;
        if (length == 0x0F)
            continue_read_length();
        length += 4;

        /// Copy the match within the block; this produces an overlapping pattern. The match may replicate itself.

        copy_end = op + length;

        /** Here we can write up to copy_amount - 1 - 4 * 2 bytes after the end of the buffer.
          * The worst case is when offset = 1 and length = 4.
          */

        if (unlikely(offset < copy_amount))
        {
            /// output: Hello
            ///              ^-op
            ///         ^-match; offset = 5
            ///
            /// output: Hello
            ///         [------] - copy_amount bytes
            ///              [------] - copy them here
            ///
            /// output: HelloHelloHel
            ///            ^-match   ^-op

            copyOverlap<copy_amount, use_shuffle>(op, match, offset);
        }
        else
        {
            copy<copy_amount>(op, match);
            match += copy_amount;
        }

        op += copy_amount;

        copy<copy_amount>(op, match); /// copy_amount + copy_amount - 1 - 4 * 2 bytes after the end of the buffer.
        if (length > copy_amount * 2)
            wildCopy<copy_amount>(op + copy_amount, match + copy_amount, copy_end);

        op = copy_end;
    }
}

}


void decompress(
    const char * const source,
    char * const dest,
    size_t source_size,
    size_t dest_size,
    PerformanceStatistics & statistics [[maybe_unused]])
{
    if (source_size == 0 || dest_size == 0)
        return;

    /// Don't run the timer if the block is too small.
    if (dest_size >= 32768)
    {
        size_t best_variant = statistics.select();

        /// Run the selected method and measure the time.

        Stopwatch watch;
        if (best_variant == 0)
            decompressImpl<16, true>(source, dest, dest_size);
        if (best_variant == 1)
            decompressImpl<16, false>(source, dest, dest_size);
        if (best_variant == 2)
            decompressImpl<8, true>(source, dest, dest_size);
        if (best_variant == 3)
            decompressImpl<32, false>(source, dest, dest_size);

        watch.stop();

        /// Update performance statistics.

        statistics.data[best_variant].update(watch.elapsedSeconds(), dest_size);
    }
    else
    {
        decompressImpl<8, false>(source, dest, dest_size);
    }
}
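
/** Usage sketch (illustrative only; the exact types are declared in LZ4_decompress_faster.h):
  *
  *   LZ4::PerformanceStatistics stats;
  *   /// 'compressed' holds one LZ4 block; 'decompressed' is sized to the known uncompressed size
  *   /// (plus the slack that the copy routines above may overwrite).
  *   LZ4::decompress(compressed.data(), decompressed.data(), compressed.size(), decompressed.size(), stats);
  *
  * Reusing the same statistics object across blocks lets the adaptive selection above converge
  * on the fastest decompressImpl variant for the current data and hardware.
  */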


void StreamStatistics::literal(size_t length)
{
    ++num_tokens;
    sum_literal_lengths += length;
}

void StreamStatistics::match(size_t length, size_t offset)
{
    ++num_tokens;
    sum_match_lengths += length;
    sum_match_offsets += offset;
    count_match_offset_less_8 += offset < 8;
    count_match_offset_less_16 += offset < 16;
    count_match_replicate_itself += offset < length;
}

void StreamStatistics::print() const
{
    std::cerr
        << "Num tokens: " << num_tokens
        << ", Avg literal length: " << double(sum_literal_lengths) / num_tokens
        << ", Avg match length: " << double(sum_match_lengths) / num_tokens
        << ", Avg match offset: " << double(sum_match_offsets) / num_tokens
        << ", Offset < 8 ratio: " << double(count_match_offset_less_8) / num_tokens
        << ", Offset < 16 ratio: " << double(count_match_offset_less_16) / num_tokens
        << ", Match replicate itself: " << double(count_match_replicate_itself) / num_tokens
        << "\n";
}

void statistics(
    const char * const source,
    char * const dest,
    size_t dest_size,
    StreamStatistics & stat)
{
    const UInt8 * ip = reinterpret_cast<const UInt8 *>(source);
    UInt8 * op = reinterpret_cast<UInt8 *>(dest);
    UInt8 * const output_end = op + dest_size;
    while (1)
    {
        size_t length;

        auto continue_read_length = [&]
        {
            unsigned s;
            do
            {
                s = *ip++;
                length += s;
            } while (unlikely(s == 255));
        };

        auto token = *ip++;
        length = token >> 4;
        if (length == 0x0F)
            continue_read_length();

        stat.literal(length);

        ip += length;
        op += length;

        if (op > output_end)
            return;

        size_t offset = unalignedLoad<UInt16>(ip);
        ip += 2;

        length = token & 0x0F;
        if (length == 0x0F)
            continue_read_length();
        length += 4;

        stat.match(length, offset);

        op += length;
    }
}

}