1#pragma once
2
3#include <cmath>
4#include <random>
5#include <pcg_random.hpp>
6
7
8namespace LZ4
9{
10
11/** There are many implementation details of LZ4 decompression loop, that affect performance.
12 * For example: copy by 8 or by 16 (SSE2) bytes at once; use shuffle (SSSE3) instruction to replicate match or not.
13 *
14 * The optimal algorithm is dependent on:
15 *
16 * 1. CPU architecture.
17 * (example: on Skylake it's almost always better to copy by 16 bytes and use shuffle,
18 * but on Westmere using shuffle is worse and copy by 16 bytes is better only for high compression ratios)
19 *
20 * 2. Data distribution.
21 * (example: when compression ratio is higher than 10.20,
22 * it's usually better to copy by 16 bytes rather than 8).
23 *
24 * It's very difficult to test all combinations on different CPUs and to choose correct rule to select best variant.
25 * (Even if you do this, you have high chance to over-optimize for specific CPU while downgrading performance on another.)
26 *
27 * Instead of this, we choose best algorithm by using performance statistics
28 * with something like "Bayesian Bandits" method.
29 */
30
31
32/** Both buffers passed to 'decompress' function must have
33 * at least this amount of excessive bytes after end of data
34 * that is allowed to read/write.
35 * This value is a little overestimation.
36 */
37static constexpr size_t ADDITIONAL_BYTES_AT_END_OF_BUFFER = 64;
38
39
40/** When decompressing uniform sequence of blocks (for example, blocks from one file),
41 * you can pass single PerformanceStatistics object to subsequent invocations of 'decompress' method.
42 * It will accumulate statistics and use it as a feedback to choose best specialization of algorithm at runtime.
43 * One PerformanceStatistics object cannot be used concurrently from different threads.
44 */
45struct PerformanceStatistics
46{
47 struct Element
48 {
49 double count = 0;
50 double sum = 0;
51
52 double adjustedCount() const
53 {
54 return count - NUM_INVOCATIONS_TO_THROW_OFF;
55 }
56
57 double mean() const
58 {
59 return sum / adjustedCount();
60 }
61
62 /// For better convergence, we don't use proper estimate of stddev.
63 /// We want to eventually separate between two algorithms even in case
64 /// when there is no statistical significant difference between them.
65 double sigma() const
66 {
67 return mean() / sqrt(adjustedCount());
68 }
69
70 void update(double seconds, double bytes)
71 {
72 ++count;
73
74 if (count > NUM_INVOCATIONS_TO_THROW_OFF)
75 sum += seconds / bytes;
76 }
77
78 double sample(pcg64 & stat_rng) const
79 {
80 /// If there is a variant with not enough statistics, always choose it.
81 /// And in that case prefer variant with less number of invocations.
82
83 if (adjustedCount() < 2)
84 return adjustedCount() - 1;
85 else
86 return std::normal_distribution<>(mean(), sigma())(stat_rng);
87 }
88 };
89
90 /// Number of different algorithms to select from.
91 static constexpr size_t NUM_ELEMENTS = 4;
92
93 /// Cold invocations may be affected by additional memory latencies. Don't take first invocations into account.
94 static constexpr double NUM_INVOCATIONS_TO_THROW_OFF = 2;
95
96 /// How to select method to run.
97 /// -1 - automatically, based on statistics (default);
98 /// 0..3 - always choose specified method (for performance testing);
99 /// -2 - choose methods in round robin fashion (for performance testing).
100 ssize_t choose_method = -1;
101
102 Element data[NUM_ELEMENTS];
103
104 /// It's Ok that generator is not seeded.
105 pcg64 rng;
106
107 /// To select from different algorithms we use a kind of "bandits" algorithm.
108 /// Sample random values from estimated normal distributions and choose the minimal.
109 size_t select()
110 {
111 if (choose_method < 0)
112 {
113 double samples[NUM_ELEMENTS];
114 for (size_t i = 0; i < NUM_ELEMENTS; ++i)
115 samples[i] = choose_method == -1
116 ? data[i].sample(rng)
117 : data[i].adjustedCount();
118
119 return std::min_element(samples, samples + NUM_ELEMENTS) - samples;
120 }
121 else
122 return choose_method;
123 }
124
125 PerformanceStatistics() {}
126 PerformanceStatistics(ssize_t choose_method_) : choose_method(choose_method_) {}
127};
128
129
130/** This method dispatch to one of different implementations depending on performance statistics.
131 */
132void decompress(
133 const char * const source,
134 char * const dest,
135 size_t source_size,
136 size_t dest_size,
137 PerformanceStatistics & statistics);
138
139
140/** Obtain statistics about LZ4 block useful for development.
141 */
142struct StreamStatistics
143{
144 size_t num_tokens = 0;
145 size_t sum_literal_lengths = 0;
146 size_t sum_match_lengths = 0;
147 size_t sum_match_offsets = 0;
148 size_t count_match_offset_less_8 = 0;
149 size_t count_match_offset_less_16 = 0;
150 size_t count_match_replicate_itself = 0;
151
152 void literal(size_t length);
153 void match(size_t length, size_t offset);
154
155 void print() const;
156};
157
158void statistics(
159 const char * const source,
160 char * const dest,
161 size_t dest_size,
162 StreamStatistics & stat);
163
164}
165