#include "LZ4_decompress_faster.h"

#include <string.h>
#include <iostream>
#include <random>
#include <algorithm>
#include <Core/Defines.h>
#include <Common/Stopwatch.h>
#include <common/likely.h>
#include <common/Types.h>
#include <common/unaligned.h>

#ifdef __SSE2__
#include <emmintrin.h>
#endif

#ifdef __SSSE3__
#include <tmmintrin.h>
#endif

#ifdef __aarch64__
#include <arm_neon.h>
#endif

namespace LZ4
{

namespace
{

template <size_t N> [[maybe_unused]] void copy(UInt8 * dst, const UInt8 * src);
template <size_t N> [[maybe_unused]] void wildCopy(UInt8 * dst, const UInt8 * src, UInt8 * dst_end);
template <size_t N, bool USE_SHUFFLE> [[maybe_unused]] void copyOverlap(UInt8 * op, const UInt8 *& match, const size_t offset);


inline void copy8(UInt8 * dst, const UInt8 * src)
{
    memcpy(dst, src, 8);
}

/// Copy in chunks of 8 bytes until 'dst' reaches 'dst_end'. Note that it may write
/// up to 7 bytes past 'dst_end', so the caller must leave that much slack after the destination.
inline void wildCopy8(UInt8 * dst, const UInt8 * src, UInt8 * dst_end)
{
    /// Unrolling with clang causes a >10% performance degradation.
#if defined(__clang__)
    #pragma nounroll
#endif
    do
    {
        copy8(dst, src);
        dst += 8;
        src += 8;
    } while (dst < dst_end);
}

inline void copyOverlap8(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    /// 4 % n.
    /// Or if 4 % n is zero, we use n.
    /// It gives an equivalent result, but is more CPU friendly for an unknown reason.
    static constexpr int shift1[] = { 0, 1, 2, 1, 4, 4, 4, 4 };

    /// 8 % n - 4 % n
    static constexpr int shift2[] = { 0, 0, 0, 1, 0, -1, -2, -3 };

    op[0] = match[0];
    op[1] = match[1];
    op[2] = match[2];
    op[3] = match[3];

    match += shift1[offset];
    memcpy(op + 4, match, 4);
    match += shift2[offset];
}
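
/** A worked example of the shift tables above (illustrative snippet, not part of the build):
  *
  *   UInt8 buf[16] = { 'X', 'Y', 'Z' };  /// a pattern with period 3 at the start of the buffer
  *   UInt8 * op = buf + 3;
  *   const UInt8 * match = buf;          /// offset = 3
  *   copyOverlap8(op, match, 3);         /// buf now starts with "XYZXYZXYZXY"
  *
  * The first four byte-wise copies write X Y Z X; shift1[3] == 1 (4 % 3) moves 'match' to Y,
  * so the memcpy appends Y Z X Y; shift2[3] == 1 then leaves 'match' advanced by 2 == 8 % 3 in total,
  * which is exactly where the replicated pattern continues for the bytes at op + 8.
  */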


#if defined(__x86_64__) || defined(__PPC__)

/** We use 'xmm' (128bit SSE) registers here to shuffle 16 bytes.
  *
  * It is possible to use 'mm' (64bit MMX) registers to shuffle just the 8 bytes we need.
  *
  * There is a corresponding version of the 'pshufb' instruction that operates on 'mm' registers
  * (it operates on MMX registers although it is available in SSSE3),
  * and the compiler library has the corresponding intrinsic: '_mm_shuffle_pi8'.
  *
  * It can be done like this:
  *
  *   unalignedStore(op, _mm_shuffle_pi8(
  *       unalignedLoad<__m64>(match),
  *       unalignedLoad<__m64>(masks + 8 * offset)));
  *
  * This is perfectly correct and this code has the same or even better performance.
  *
  * But if we write the code this way, it will lead to
  * extremely weird and extremely non-obvious
  * effects in completely unrelated parts of the code.
  *
  * This is because using MMX registers alters the mode of operation of the x87 FPU,
  * and FPU operations then become broken.
  *
  * Example 1.
  * Compile this code without optimizations:
  *
    #include <vector>
    #include <unordered_set>
    #include <iostream>
    #include <tmmintrin.h>

    int main(int, char **)
    {
        [[maybe_unused]] __m64 shuffled = _mm_shuffle_pi8(__m64{}, __m64{});

        std::vector<int> vec;
        std::unordered_set<int> set(vec.begin(), vec.end());

        std::cerr << set.size() << "\n";
        return 0;
    }

    $ g++ -g -O0 -mssse3 -std=c++17 mmx_bug1.cpp && ./a.out
    terminate called after throwing an instance of 'std::bad_alloc'
      what():  std::bad_alloc

    Also reproduced with clang, but only with libstdc++, not with libc++.

  * Example 2.

    #include <math.h>
    #include <iostream>
    #include <tmmintrin.h>

    int main(int, char **)
    {
        double max_fill = 1;

        std::cerr << (long double)max_fill << "\n";
        [[maybe_unused]] __m64 shuffled = _mm_shuffle_pi8(__m64{}, __m64{});
        std::cerr << (long double)max_fill << "\n";

        return 0;
    }

    $ g++ -g -O0 -mssse3 -std=c++17 mmx_bug2.cpp && ./a.out
    1
    -nan

  * Explanation:
  *
  * https://stackoverflow.com/questions/33692969/assembler-mmx-errors
  * https://software.intel.com/en-us/node/524274
  *
  * Actually, it is possible to issue the 'emms' instruction after the decompression routine.
  * But it is easier to just use 'xmm' registers and avoid 'mm' registers altogether.
  */
inline void copyOverlap8Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)

    static constexpr UInt8 __attribute__((__aligned__(8))) masks[] =
    {
        0, 1, 2, 2, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */
        0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
        0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 2, 0, 1, 2, 0, 1,
        0, 1, 2, 3, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 0, 1, 2,
        0, 1, 2, 3, 4, 5, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 0,
        0, 0, 0, 0, 0, 0, 0, 0, /* this row is not used: padding to allow reading 16 bytes starting at the previous row */
    };

    _mm_storeu_si128(reinterpret_cast<__m128i *>(op),
        _mm_shuffle_epi8(
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(match)),
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(masks + 8 * offset))));

    match += masks[offset];

#else
    copyOverlap8(op, match, offset);
#endif
}
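
/** How the shuffle variant handles the same offset = 3 example (derived from the table above, for illustration):
  * the mask row for offset = 3 is { 0, 1, 2, 0, 1, 2, 0, 1 }, so a single 'pshufb' gathers
  * match[0], match[1], match[2], match[0], ... and produces the replicated pattern X Y Z X Y Z X Y at once.
  * The store writes 16 bytes, but only the first 8 are meaningful; the rest is scratch that later copies
  * overwrite (the zero row at the end exists only so that the 16-byte mask load stays inside the array).
  * The first table row holds the amount by which to advance 'match': masks[3] == 2 == 8 % 3.
  */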

#endif


#ifdef __aarch64__

inline void copyOverlap8Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    static constexpr UInt8 __attribute__((__aligned__(8))) masks[] =
    {
        0, 1, 2, 2, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */
        0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
        0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 2, 0, 1, 2, 0, 1,
        0, 1, 2, 3, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 0, 1, 2,
        0, 1, 2, 3, 4, 5, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 0,
    };

    unalignedStore<uint8x8_t>(op, vtbl1_u8(unalignedLoad<uint8x8_t>(match), unalignedLoad<uint8x8_t>(masks + 8 * offset)));
    match += masks[offset];
}

#endif


template <> void inline copy<8>(UInt8 * dst, const UInt8 * src) { copy8(dst, src); }
template <> void inline wildCopy<8>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) { wildCopy8(dst, src, dst_end); }
template <> void inline copyOverlap<8, false>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap8(op, match, offset); }
template <> void inline copyOverlap<8, true>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap8Shuffle(op, match, offset); }


inline void copy16(UInt8 * dst, const UInt8 * src)
{
#ifdef __SSE2__
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst),
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
#else
    memcpy(dst, src, 16);
#endif
}

inline void wildCopy16(UInt8 * dst, const UInt8 * src, UInt8 * dst_end)
{
    /// Unrolling with clang causes a >10% performance degradation.
#if defined(__clang__)
    #pragma nounroll
#endif
    do
    {
        copy16(dst, src);
        dst += 16;
        src += 16;
    } while (dst < dst_end);
}

inline void copyOverlap16(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    /// 4 % n.
    static constexpr int shift1[]
        = { 0, 1, 2, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };

    /// 8 % n - 4 % n
    static constexpr int shift2[]
        = { 0, 0, 0, 1, 0, -1, -2, -3, -4, 4, 4, 4, 4, 4, 4, 4 };

    /// 16 % n - 8 % n
    static constexpr int shift3[]
        = { 0, 0, 0, -1, 0, -2, 2, 1, 8, -1, -2, -3, -4, -5, -6, -7 };

    op[0] = match[0];
    op[1] = match[1];
    op[2] = match[2];
    op[3] = match[3];

    match += shift1[offset];
    memcpy(op + 4, match, 4);
    match += shift2[offset];
    memcpy(op + 8, match, 8);
    match += shift3[offset];
}


#if defined(__x86_64__) || defined(__PPC__)

inline void copyOverlap16Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
#if defined(__SSSE3__) && !defined(MEMORY_SANITIZER)

    static constexpr UInt8 __attribute__((__aligned__(16))) masks[] =
    {
        0, 1, 2, 1, 4, 1, 4, 2, 8, 7, 6, 5, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
        0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0,
        0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0,
        0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
    };

    _mm_storeu_si128(reinterpret_cast<__m128i *>(op),
        _mm_shuffle_epi8(
            _mm_loadu_si128(reinterpret_cast<const __m128i *>(match)),
            _mm_load_si128(reinterpret_cast<const __m128i *>(masks) + offset)));

    match += masks[offset];

#else
    copyOverlap16(op, match, offset);
#endif
}

#endif

#ifdef __aarch64__

inline void copyOverlap16Shuffle(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    static constexpr UInt8 __attribute__((__aligned__(16))) masks[] =
    {
        0, 1, 2, 1, 4, 1, 4, 2, 8, 7, 6, 5, 4, 3, 2, 1, /* offset = 0, not used as mask, but for shift amount instead */
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* offset = 1 */
        0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
        0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0,
        0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0,
        0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 0, 1, 2, 3, 4,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 1, 2, 3,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 0, 1, 2,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 1,
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 0,
    };

    unalignedStore<uint8x8_t>(op,
        vtbl2_u8(unalignedLoad<uint8x8x2_t>(match), unalignedLoad<uint8x8_t>(masks + 16 * offset)));

    unalignedStore<uint8x8_t>(op + 8,
        vtbl2_u8(unalignedLoad<uint8x8x2_t>(match), unalignedLoad<uint8x8_t>(masks + 16 * offset + 8)));

    match += masks[offset];
}

#endif


template <> void inline copy<16>(UInt8 * dst, const UInt8 * src) { copy16(dst, src); }
template <> void inline wildCopy<16>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) { wildCopy16(dst, src, dst_end); }
template <> void inline copyOverlap<16, false>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap16(op, match, offset); }
template <> void inline copyOverlap<16, true>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap16Shuffle(op, match, offset); }


inline void copy32(UInt8 * dst, const UInt8 * src)
{
    /// There was an AVX version here, but mixing AVX with SSE instructions caused a big slowdown.
#if defined(__SSE2__)
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst),
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src)));
    _mm_storeu_si128(reinterpret_cast<__m128i *>(dst + 16),
        _mm_loadu_si128(reinterpret_cast<const __m128i *>(src + 16)));
#else
    memcpy(dst, src, 16);
    memcpy(dst + 16, src + 16, 16);
#endif
}

inline void wildCopy32(UInt8 * dst, const UInt8 * src, UInt8 * dst_end)
{
    /// Unrolling with clang causes a >10% performance degradation.
#if defined(__clang__)
    #pragma nounroll
#endif
    do
    {
        copy32(dst, src);
        dst += 32;
        src += 32;
    } while (dst < dst_end);
}

inline void copyOverlap32(UInt8 * op, const UInt8 *& match, const size_t offset)
{
    /// 4 % n.
    static constexpr int shift1[]
        = { 0, 1, 2, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };

    /// 8 % n - 4 % n
    static constexpr int shift2[]
        = { 0, 0, 0, 1, 0, -1, -2, -3, -4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4 };

    /// 16 % n - 8 % n
    static constexpr int shift3[]
        = { 0, 0, 0, -1, 0, -2, 2, 1, 8, -1, -2, -3, -4, -5, -6, -7, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8 };

    /// 32 % n - 16 % n
    static constexpr int shift4[]
        = { 0, 0, 0, 1, 0, 1, -2, 2, 0, -2, -4, 5, 4, 3, 2, 1, 0, -1, -2, -3, -4, -5, -6, -7, -8, -9,-10,-11,-12,-13,-14,-15 };

    op[0] = match[0];
    op[1] = match[1];
    op[2] = match[2];
    op[3] = match[3];

    match += shift1[offset];
    memcpy(op + 4, match, 4);
    match += shift2[offset];
    memcpy(op + 8, match, 8);
    match += shift3[offset];
    memcpy(op + 16, match, 16);
    match += shift4[offset];
}


template <> void inline copy<32>(UInt8 * dst, const UInt8 * src) { copy32(dst, src); }
template <> void inline wildCopy<32>(UInt8 * dst, const UInt8 * src, UInt8 * dst_end) { wildCopy32(dst, src, dst_end); }
template <> void inline copyOverlap<32, false>(UInt8 * op, const UInt8 *& match, const size_t offset) { copyOverlap32(op, match, offset); }


/// See also https://stackoverflow.com/a/30669632
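
/** A brief summary of the LZ4 sequence format parsed below (see the official LZ4 block format
  * description for the authoritative details):
  *
  * Each sequence starts with a token byte: the high nibble is the literal length and the low nibble
  * is the match length minus 4. A nibble value of 15 means that additional length bytes follow
  * (right after the token for literals, and after the offset for matches); each is added to the length,
  * and reading continues while a byte equals 255. For example, a literal length of 280 is encoded as
  * nibble 15 followed by bytes 255 and 10 (15 + 255 + 10 = 280).
  * After the literals come a 2-byte little-endian match offset and the optional extended match length
  * bytes. The last sequence of a block contains only literals.
  */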

template <size_t copy_amount, bool use_shuffle>
void NO_INLINE decompressImpl(
    const char * const source,
    char * const dest,
    size_t dest_size)
{
    const UInt8 * ip = reinterpret_cast<const UInt8 *>(source);
    UInt8 * op = reinterpret_cast<UInt8 *>(dest);
    UInt8 * const output_end = op + dest_size;

    /// Unrolling with clang causes a >10% performance degradation.
#if defined(__clang__)
    #pragma nounroll
#endif
    while (1)
    {
        size_t length;

        auto continue_read_length = [&]
        {
            unsigned s;
            do
            {
                s = *ip++;
                length += s;
            } while (unlikely(s == 255));
        };

        /// Get the literal length.

        const unsigned token = *ip++;
        length = token >> 4;
        if (length == 0x0F)
            continue_read_length();

        /// Copy literals.

        UInt8 * copy_end = op + length;

        /// input: Hello, world
        ///        ^-ip
        /// output: xyz
        ///            ^-op ^-copy_end
        /// output: xyzHello, w
        ///                 ^- excess bytes copied due to "wildCopy"
        /// input: Hello, world
        ///             ^-ip
        /// output: xyzHello, w
        ///                 ^-op (we will overwrite the excess bytes on the next iteration)

        wildCopy<copy_amount>(op, ip, copy_end); /// Here we can write up to copy_amount - 1 bytes after the end of the buffer.

        ip += length;
        op = copy_end;

        if (copy_end >= output_end)
            return;

        /// Get the match offset.

        size_t offset = unalignedLoad<UInt16>(ip);
        ip += 2;
        const UInt8 * match = op - offset;

        /// Get the match length.

        length = token & 0x0F;
        if (length == 0x0F)
            continue_read_length();
        length += 4;

        /// Copy the match within the block; this produces an overlapping pattern. The match may replicate itself.

        copy_end = op + length;

        /** Here we can write up to copy_amount - 1 - 4 * 2 bytes after the end of the buffer.
          * The worst case is when offset = 1 and length = 4.
          */

        if (unlikely(offset < copy_amount))
        {
            /// output: Hello
            ///              ^-op
            ///         ^-match; offset = 5
            ///
            /// output: Hello
            ///         [------] - copy_amount bytes
            ///              [------] - copy them here
            ///
            /// output: HelloHelloHel
            ///            ^-match   ^-op

            copyOverlap<copy_amount, use_shuffle>(op, match, offset);
        }
        else
        {
            copy<copy_amount>(op, match);
            match += copy_amount;
        }

        op += copy_amount;

        copy<copy_amount>(op, match); /// copy_amount + copy_amount - 1 - 4 * 2 bytes after the end of the buffer.
        if (length > copy_amount * 2)
            wildCopy<copy_amount>(op + copy_amount, match + copy_amount, copy_end);

        op = copy_end;
    }
}

}


void decompress(
    const char * const source,
    char * const dest,
    size_t source_size,
    size_t dest_size,
    PerformanceStatistics & statistics [[maybe_unused]])
{
    if (source_size == 0 || dest_size == 0)
        return;

    /// Don't run the timer if the block is too small.
    if (dest_size >= 32768)
    {
        size_t best_variant = statistics.select();

        /// Run the selected method and measure the time.

        Stopwatch watch;
        if (best_variant == 0)
            decompressImpl<16, true>(source, dest, dest_size);
        if (best_variant == 1)
            decompressImpl<16, false>(source, dest, dest_size);
        if (best_variant == 2)
            decompressImpl<8, true>(source, dest, dest_size);
        if (best_variant == 3)
            decompressImpl<32, false>(source, dest, dest_size);

        watch.stop();

        /// Update performance statistics.

        statistics.data[best_variant].update(watch.elapsedSeconds(), dest_size);
    }
    else
    {
        decompressImpl<8, false>(source, dest, dest_size);
    }
}
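
/** Usage sketch (illustrative only; the exact types are declared in LZ4_decompress_faster.h):
  *
  *   LZ4::PerformanceStatistics stats;
  *   /// 'compressed' holds one LZ4 block; 'decompressed' is sized to the known uncompressed size
  *   /// (plus the slack that the copy routines above may overwrite).
  *   LZ4::decompress(compressed.data(), decompressed.data(), compressed.size(), decompressed.size(), stats);
  *
  * Reusing the same statistics object across blocks lets the adaptive selection above converge
  * on the fastest decompressImpl variant for the current data and hardware.
  */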


void StreamStatistics::literal(size_t length)
{
    ++num_tokens;
    sum_literal_lengths += length;
}

void StreamStatistics::match(size_t length, size_t offset)
{
    ++num_tokens;
    sum_match_lengths += length;
    sum_match_offsets += offset;
    count_match_offset_less_8 += offset < 8;
    count_match_offset_less_16 += offset < 16;
    count_match_replicate_itself += offset < length;
}

void StreamStatistics::print() const
{
    std::cerr
        << "Num tokens: " << num_tokens
        << ", Avg literal length: " << double(sum_literal_lengths) / num_tokens
        << ", Avg match length: " << double(sum_match_lengths) / num_tokens
        << ", Avg match offset: " << double(sum_match_offsets) / num_tokens
        << ", Offset < 8 ratio: " << double(count_match_offset_less_8) / num_tokens
        << ", Offset < 16 ratio: " << double(count_match_offset_less_16) / num_tokens
        << ", Match replicate itself: " << double(count_match_replicate_itself) / num_tokens
        << "\n";
}

void statistics(
    const char * const source,
    char * const dest,
    size_t dest_size,
    StreamStatistics & stat)
{
    const UInt8 * ip = reinterpret_cast<const UInt8 *>(source);
    UInt8 * op = reinterpret_cast<UInt8 *>(dest);
    UInt8 * const output_end = op + dest_size;
    while (1)
    {
        size_t length;

        auto continue_read_length = [&]
        {
            unsigned s;
            do
            {
                s = *ip++;
                length += s;
            } while (unlikely(s == 255));
        };

        auto token = *ip++;
        length = token >> 4;
        if (length == 0x0F)
            continue_read_length();

        stat.literal(length);

        ip += length;
        op += length;

        if (op > output_end)
            return;

        size_t offset = unalignedLoad<UInt16>(ip);
        ip += 2;

        length = token & 0x0F;
        if (length == 0x0F)
            continue_read_length();
        length += 4;

        stat.match(length, offset);

        op += length;
    }
}

}