| 1 | #pragma once |
| 2 | |
| 3 | #include <array> |
| 4 | |
| 5 | namespace DB |
| 6 | { |
| 7 | |
| 8 | /** Data for HyperLogLogBiasEstimator in the uniqCombined function. |
| 9 | * The development plan is as follows: |
| 10 | * 1. Assemble ClickHouse. |
| 11 | * 2. Run the script src/dbms/scripts/gen-bias-data.py, which returns one array for getRawEstimates() |
| 12 | * and another array for getBiases(). |
| 13 | * 3. Update `raw_estimates` and `biases` arrays. Also update the size of arrays in InterpolatedData. |
| 14 | * 4. Assemble ClickHouse. |
| 15 | * 5. Run the script src/dbms/scripts/linear-counting-threshold.py, which creates 3 files: |
| 16 | * - raw_graph.txt (1st column: the present number of unique values; |
| 17 | * 2nd column: relative error in the case of HyperLogLog without applying any corrections) |
| 18 | * - linear_counting_graph.txt (1st column: the present number of unique values; |
| 19 | * 2nd column: relative error in the case of HyperLogLog using LinearCounting) |
| 20 | * - bias_corrected_graph.txt (1st column: the present number of unique values; |
| 21 | * 2nd column: relative error in the case of HyperLogLog with the use of corrections from the algorithm HyperLogLog++) |
| 22 | * 6. Generate a graph with gnuplot based on this data. |
| 23 | * 7. Determine the minimum number of unique values at which it is better to correct the error |
| 24 | * using its evaluation (i.e., using the HyperLogLog++ algorithm) than applying the LinearCounting algorithm. |
| 25 | * 8. Accordingly, update the constant in the function getThreshold(). |
| 26 | * 9. Assemble ClickHouse. |
| 27 | */ |
struct UniqCombinedBiasData
{
    /// Fixed-size table type returned by both array accessors below.
    /// The element count has to be kept in sync with the generated
    /// `raw_estimates` and `biases` arrays whenever they are regenerated.
    using InterpolatedData = std::array<double, 200>;

    /// Raw HyperLogLog estimates of the number of unique values,
    /// i.e. with no corrections applied.
    static const InterpolatedData & getRawEstimates();

    /// Error estimates corresponding to the raw estimates.
    static const InterpolatedData & getBiases();

    /// Threshold constant; per the description at the top of this file it is
    /// chosen as the minimum number of unique values at which the HyperLogLog++
    /// correction is preferable to the LinearCounting algorithm.
    static double getThreshold();
};
| 38 | |
| 39 | } |
| 40 | |