1 | #pragma once |
2 | |
3 | #include <array> |
4 | |
5 | namespace DB |
6 | { |
7 | |
8 | /** Data for HyperLogLogBiasEstimator in the uniqCombined function. |
9 | * The development plan is as follows: |
10 | * 1. Assemble ClickHouse. |
11 | * 2. Run the script src/dbms/scripts/gen-bias-data.py, which returns one array for getRawEstimates() |
12 | * and another array for getBiases(). |
13 | * 3. Update `raw_estimates` and `biases` arrays. Also update the size of arrays in InterpolatedData. |
14 | * 4. Assemble ClickHouse. |
15 | * 5. Run the script src/dbms/scripts/linear-counting-threshold.py, which creates 3 files: |
16 | * - raw_graph.txt (1st column: the present number of unique values; |
17 | * 2nd column: relative error in the case of HyperLogLog without applying any corrections) |
18 | * - linear_counting_graph.txt (1st column: the present number of unique values; |
19 | * 2nd column: relative error in the case of HyperLogLog using LinearCounting) |
20 | * - bias_corrected_graph.txt (1st column: the present number of unique values; |
21 | * 2nd column: relative error in the case of HyperLogLog with the use of corrections from the algorithm HyperLogLog++) |
22 | * 6. Generate a graph with gnuplot based on this data. |
  * 7. Determine the minimum number of unique values at which correcting the error
  *     using its estimate (i.e., using the HyperLogLog++ algorithm) is better than applying the LinearCounting algorithm.
  * 8. Accordingly, update the constant in the function getThreshold()
  * 9. Assemble ClickHouse.
27 | */ |
struct UniqCombinedBiasData
{
    /// Fixed-size table of 200 doubles, shared by both table accessors below.
    /// The size has to be updated together with the `raw_estimates` and `biases` arrays
    /// whenever they are regenerated.
    using InterpolatedData = std::array<double, 200>;

    /// Table of raw HyperLogLog estimates of the number of unique values,
    /// i.e. estimates obtained without applying any corrections.
    static const InterpolatedData & getRawEstimates();

    /// Table of error (bias) estimates corresponding to the raw estimates.
    static const InterpolatedData & getBiases();

    /// Tuned constant: per the regeneration procedure, the minimum number of unique values
    /// at which the HyperLogLog++ bias correction is preferable to the LinearCounting algorithm.
    static double getThreshold();
};
38 | |
39 | } |
40 | |