| 1 | #pragma once |
| 2 | |
#include <sys/types.h>

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <random>

#include <pcg_random.hpp>
| 6 | |
| 7 | |
| 8 | namespace LZ4 |
| 9 | { |
| 10 | |
| 11 | /** There are many implementation details of LZ4 decompression loop, that affect performance. |
| 12 | * For example: copy by 8 or by 16 (SSE2) bytes at once; use shuffle (SSSE3) instruction to replicate match or not. |
| 13 | * |
| 14 | * The optimal algorithm is dependent on: |
| 15 | * |
| 16 | * 1. CPU architecture. |
| 17 | * (example: on Skylake it's almost always better to copy by 16 bytes and use shuffle, |
| 18 | * but on Westmere using shuffle is worse and copy by 16 bytes is better only for high compression ratios) |
| 19 | * |
| 20 | * 2. Data distribution. |
| 21 | * (example: when compression ratio is higher than 10.20, |
| 22 | * it's usually better to copy by 16 bytes rather than 8). |
| 23 | * |
  * It's very difficult to test all combinations on different CPUs and to choose a correct rule to select the best variant.
  * (Even if you do this, you have a high chance of over-optimizing for a specific CPU while downgrading performance on another.)
| 26 | * |
| 27 | * Instead of this, we choose best algorithm by using performance statistics |
| 28 | * with something like "Bayesian Bandits" method. |
| 29 | */ |
| 30 | |
| 31 | |
/** Both buffers passed to the 'decompress' function must have
  * at least this amount of excessive bytes after the end of the data
  * that is allowed to be read/written.
  * (The decompression loop copies in wide chunks — 8 or 16 bytes at once, see above —
  * and may therefore overshoot the logical end of data.)
  * This value is a little overestimation.
  */
static constexpr size_t ADDITIONAL_BYTES_AT_END_OF_BUFFER = 64;
| 38 | |
| 39 | |
/** When decompressing a uniform sequence of blocks (for example, blocks from one file),
  * you can pass a single PerformanceStatistics object to subsequent invocations of the 'decompress' method.
  * It will accumulate statistics and use them as feedback to choose the best specialization of the algorithm at runtime.
  * One PerformanceStatistics object cannot be used concurrently from different threads.
  */
| 45 | struct PerformanceStatistics |
| 46 | { |
| 47 | struct Element |
| 48 | { |
| 49 | double count = 0; |
| 50 | double sum = 0; |
| 51 | |
| 52 | double adjustedCount() const |
| 53 | { |
| 54 | return count - NUM_INVOCATIONS_TO_THROW_OFF; |
| 55 | } |
| 56 | |
| 57 | double mean() const |
| 58 | { |
| 59 | return sum / adjustedCount(); |
| 60 | } |
| 61 | |
| 62 | /// For better convergence, we don't use proper estimate of stddev. |
| 63 | /// We want to eventually separate between two algorithms even in case |
| 64 | /// when there is no statistical significant difference between them. |
| 65 | double sigma() const |
| 66 | { |
| 67 | return mean() / sqrt(adjustedCount()); |
| 68 | } |
| 69 | |
| 70 | void update(double seconds, double bytes) |
| 71 | { |
| 72 | ++count; |
| 73 | |
| 74 | if (count > NUM_INVOCATIONS_TO_THROW_OFF) |
| 75 | sum += seconds / bytes; |
| 76 | } |
| 77 | |
| 78 | double sample(pcg64 & stat_rng) const |
| 79 | { |
| 80 | /// If there is a variant with not enough statistics, always choose it. |
| 81 | /// And in that case prefer variant with less number of invocations. |
| 82 | |
| 83 | if (adjustedCount() < 2) |
| 84 | return adjustedCount() - 1; |
| 85 | else |
| 86 | return std::normal_distribution<>(mean(), sigma())(stat_rng); |
| 87 | } |
| 88 | }; |
| 89 | |
| 90 | /// Number of different algorithms to select from. |
| 91 | static constexpr size_t NUM_ELEMENTS = 4; |
| 92 | |
| 93 | /// Cold invocations may be affected by additional memory latencies. Don't take first invocations into account. |
| 94 | static constexpr double NUM_INVOCATIONS_TO_THROW_OFF = 2; |
| 95 | |
| 96 | /// How to select method to run. |
| 97 | /// -1 - automatically, based on statistics (default); |
| 98 | /// 0..3 - always choose specified method (for performance testing); |
| 99 | /// -2 - choose methods in round robin fashion (for performance testing). |
| 100 | ssize_t choose_method = -1; |
| 101 | |
| 102 | Element data[NUM_ELEMENTS]; |
| 103 | |
| 104 | /// It's Ok that generator is not seeded. |
| 105 | pcg64 rng; |
| 106 | |
| 107 | /// To select from different algorithms we use a kind of "bandits" algorithm. |
| 108 | /// Sample random values from estimated normal distributions and choose the minimal. |
| 109 | size_t select() |
| 110 | { |
| 111 | if (choose_method < 0) |
| 112 | { |
| 113 | double samples[NUM_ELEMENTS]; |
| 114 | for (size_t i = 0; i < NUM_ELEMENTS; ++i) |
| 115 | samples[i] = choose_method == -1 |
| 116 | ? data[i].sample(rng) |
| 117 | : data[i].adjustedCount(); |
| 118 | |
| 119 | return std::min_element(samples, samples + NUM_ELEMENTS) - samples; |
| 120 | } |
| 121 | else |
| 122 | return choose_method; |
| 123 | } |
| 124 | |
| 125 | PerformanceStatistics() {} |
| 126 | PerformanceStatistics(ssize_t choose_method_) : choose_method(choose_method_) {} |
| 127 | }; |
| 128 | |
| 129 | |
/** This method dispatches to one of several specialized implementations,
  * chosen at runtime from the accumulated performance statistics
  * (see PerformanceStatistics::select above).
  *
  * Both 'source' and 'dest' must be followed by at least
  * ADDITIONAL_BYTES_AT_END_OF_BUFFER bytes that are safe to read/write.
  * 'statistics' accumulates feedback across calls and must not be used
  * concurrently from different threads.
  */
void decompress(
    const char * const source,
    char * const dest,
    size_t source_size,
    size_t dest_size,
    PerformanceStatistics & statistics);
| 138 | |
| 139 | |
/** Obtain statistics about an LZ4 block, useful for development.
  * Counters are filled by the literal()/match() callbacks (implemented in the .cpp file).
  */
struct StreamStatistics
{
    size_t num_tokens = 0;                      /// Total number of LZ4 tokens seen.
    size_t sum_literal_lengths = 0;             /// Accumulated by literal().
    size_t sum_match_lengths = 0;               /// Accumulated by match().
    size_t sum_match_offsets = 0;
    size_t count_match_offset_less_8 = 0;       /// Presumably matches with offset < 8 (affects 8-byte-wide copies) — verify in the .cpp.
    size_t count_match_offset_less_16 = 0;      /// Presumably matches with offset < 16 (affects 16-byte-wide copies) — verify in the .cpp.
    size_t count_match_replicate_itself = 0;    /// Presumably matches whose source overlaps the destination — verify in the .cpp.

    /// Record one literal run of the given length.
    void literal(size_t length);
    /// Record one match of the given length and backward offset.
    void match(size_t length, size_t offset);

    /// Print the collected counters.
    void print() const;
};
| 157 | |
/** Decode an LZ4 block while collecting StreamStatistics about it.
  * NOTE(review): unlike 'decompress', there is no source_size parameter,
  * so the scan is presumably bounded by dest_size only — verify against the implementation.
  */
void statistics(
    const char * const source,
    char * const dest,
    size_t dest_size,
    StreamStatistics & stat);
| 163 | |
| 164 | } |
| 165 | |