| 1 | #pragma once |
| 2 | |
#include <sys/types.h>

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <random>

#include <pcg_random.hpp>
| 6 | |
| 7 | |
| 8 | namespace LZ4 |
| 9 | { |
| 10 | |
| 11 | /** There are many implementation details of LZ4 decompression loop, that affect performance. |
| 12 | * For example: copy by 8 or by 16 (SSE2) bytes at once; use shuffle (SSSE3) instruction to replicate match or not. |
| 13 | * |
| 14 | * The optimal algorithm is dependent on: |
| 15 | * |
| 16 | * 1. CPU architecture. |
| 17 | * (example: on Skylake it's almost always better to copy by 16 bytes and use shuffle, |
| 18 | * but on Westmere using shuffle is worse and copy by 16 bytes is better only for high compression ratios) |
| 19 | * |
| 20 | * 2. Data distribution. |
| 21 | * (example: when compression ratio is higher than 10.20, |
| 22 | * it's usually better to copy by 16 bytes rather than 8). |
| 23 | * |
  * It's very difficult to test all combinations on different CPUs and to choose a correct rule to select the best variant.
  * (Even if you do this, you have a high chance of over-optimizing for a specific CPU while downgrading performance on another.)
| 26 | * |
| 27 | * Instead of this, we choose best algorithm by using performance statistics |
| 28 | * with something like "Bayesian Bandits" method. |
| 29 | */ |
| 30 | |
| 31 | |
/** Both buffers passed to the 'decompress' function must have
  * at least this amount of excessive bytes after the end of the data
  * that is allowed to be read/written.
  * (The decompression loop copies in wide chunks — 8 or 16 bytes at once, see above —
  * and may therefore overshoot the logical end of data.)
  * This value is a little overestimation.
  */
static constexpr size_t ADDITIONAL_BYTES_AT_END_OF_BUFFER = 64;
| 38 | |
| 39 | |
/** When decompressing a uniform sequence of blocks (for example, blocks from one file),
  * you can pass a single PerformanceStatistics object to subsequent invocations of the 'decompress' method.
  * It will accumulate statistics and use them as feedback to choose the best specialization of the algorithm at runtime.
  * One PerformanceStatistics object cannot be used concurrently from different threads.
  */
| 45 | struct PerformanceStatistics |
| 46 | { |
| 47 | struct Element |
| 48 | { |
| 49 | double count = 0; |
| 50 | double sum = 0; |
| 51 | |
| 52 | double adjustedCount() const |
| 53 | { |
| 54 | return count - NUM_INVOCATIONS_TO_THROW_OFF; |
| 55 | } |
| 56 | |
| 57 | double mean() const |
| 58 | { |
| 59 | return sum / adjustedCount(); |
| 60 | } |
| 61 | |
| 62 | /// For better convergence, we don't use proper estimate of stddev. |
| 63 | /// We want to eventually separate between two algorithms even in case |
| 64 | /// when there is no statistical significant difference between them. |
| 65 | double sigma() const |
| 66 | { |
| 67 | return mean() / sqrt(adjustedCount()); |
| 68 | } |
| 69 | |
| 70 | void update(double seconds, double bytes) |
| 71 | { |
| 72 | ++count; |
| 73 | |
| 74 | if (count > NUM_INVOCATIONS_TO_THROW_OFF) |
| 75 | sum += seconds / bytes; |
| 76 | } |
| 77 | |
| 78 | double sample(pcg64 & stat_rng) const |
| 79 | { |
| 80 | /// If there is a variant with not enough statistics, always choose it. |
| 81 | /// And in that case prefer variant with less number of invocations. |
| 82 | |
| 83 | if (adjustedCount() < 2) |
| 84 | return adjustedCount() - 1; |
| 85 | else |
| 86 | return std::normal_distribution<>(mean(), sigma())(stat_rng); |
| 87 | } |
| 88 | }; |
| 89 | |
| 90 | /// Number of different algorithms to select from. |
| 91 | static constexpr size_t NUM_ELEMENTS = 4; |
| 92 | |
| 93 | /// Cold invocations may be affected by additional memory latencies. Don't take first invocations into account. |
| 94 | static constexpr double NUM_INVOCATIONS_TO_THROW_OFF = 2; |
| 95 | |
| 96 | /// How to select method to run. |
| 97 | /// -1 - automatically, based on statistics (default); |
| 98 | /// 0..3 - always choose specified method (for performance testing); |
| 99 | /// -2 - choose methods in round robin fashion (for performance testing). |
| 100 | ssize_t choose_method = -1; |
| 101 | |
| 102 | Element data[NUM_ELEMENTS]; |
| 103 | |
| 104 | /// It's Ok that generator is not seeded. |
| 105 | pcg64 rng; |
| 106 | |
| 107 | /// To select from different algorithms we use a kind of "bandits" algorithm. |
| 108 | /// Sample random values from estimated normal distributions and choose the minimal. |
| 109 | size_t select() |
| 110 | { |
| 111 | if (choose_method < 0) |
| 112 | { |
| 113 | double samples[NUM_ELEMENTS]; |
| 114 | for (size_t i = 0; i < NUM_ELEMENTS; ++i) |
| 115 | samples[i] = choose_method == -1 |
| 116 | ? data[i].sample(rng) |
| 117 | : data[i].adjustedCount(); |
| 118 | |
| 119 | return std::min_element(samples, samples + NUM_ELEMENTS) - samples; |
| 120 | } |
| 121 | else |
| 122 | return choose_method; |
| 123 | } |
| 124 | |
| 125 | PerformanceStatistics() {} |
| 126 | PerformanceStatistics(ssize_t choose_method_) : choose_method(choose_method_) {} |
| 127 | }; |
| 128 | |
| 129 | |
/** This method dispatches to one of several specialized implementations,
  * chosen at runtime from the accumulated performance statistics
  * (see PerformanceStatistics::select above).
  *
  * Both 'source' and 'dest' must be followed by at least
  * ADDITIONAL_BYTES_AT_END_OF_BUFFER bytes that are safe to read/write.
  * 'statistics' accumulates feedback across calls and must not be used
  * concurrently from different threads.
  */
void decompress(
    const char * const source,
    char * const dest,
    size_t source_size,
    size_t dest_size,
    PerformanceStatistics & statistics);
| 138 | |
| 139 | |
/** Obtain statistics about an LZ4 block, useful for development.
  * Counters are filled by the literal()/match() callbacks (implemented in the .cpp file).
  */
struct StreamStatistics
{
    size_t num_tokens = 0;                      /// Total number of LZ4 tokens seen.
    size_t sum_literal_lengths = 0;             /// Accumulated by literal().
    size_t sum_match_lengths = 0;               /// Accumulated by match().
    size_t sum_match_offsets = 0;
    size_t count_match_offset_less_8 = 0;       /// Presumably matches with offset < 8 (affects 8-byte-wide copies) — verify in the .cpp.
    size_t count_match_offset_less_16 = 0;      /// Presumably matches with offset < 16 (affects 16-byte-wide copies) — verify in the .cpp.
    size_t count_match_replicate_itself = 0;    /// Presumably matches whose source overlaps the destination — verify in the .cpp.

    /// Record one literal run of the given length.
    void literal(size_t length);
    /// Record one match of the given length and backward offset.
    void match(size_t length, size_t offset);

    /// Print the collected counters.
    void print() const;
};
| 157 | |
/** Decode an LZ4 block while collecting StreamStatistics about it.
  * NOTE(review): unlike 'decompress', there is no source_size parameter,
  * so the scan is presumably bounded by dest_size only — verify against the implementation.
  */
void statistics(
    const char * const source,
    char * const dest,
    size_t dest_size,
    StreamStatistics & stat);
| 163 | |
| 164 | } |
| 165 | |