1 | #pragma once |
2 | |
3 | #include <Common/PODArray.h> |
4 | #include <Common/NaNUtils.h> |
5 | #include <Core/Types.h> |
6 | #include <IO/WriteBuffer.h> |
7 | #include <IO/ReadBuffer.h> |
8 | #include <IO/VarInt.h> |
9 | |
10 | |
11 | namespace DB |
12 | { |
13 | |
14 | namespace ErrorCodes |
15 | { |
16 | extern const int NOT_IMPLEMENTED; |
17 | extern const int BAD_ARGUMENTS; |
18 | } |
19 | |
20 | /** Calculates quantile by collecting all values into array |
21 | * and applying n-th element (introselect) algorithm for the resulting array. |
22 | * |
23 | * It uses O(N) memory and it is very inefficient in case of high amount of identical values. |
24 | * But it is very CPU efficient for not large datasets. |
25 | */ |
26 | template <typename Value> |
27 | struct QuantileExact |
28 | { |
29 | /// The memory will be allocated to several elements at once, so that the state occupies 64 bytes. |
30 | static constexpr size_t bytes_in_arena = 64 - sizeof(PODArray<Value>); |
31 | using Array = PODArrayWithStackMemory<Value, bytes_in_arena>; |
32 | Array array; |
33 | |
34 | void add(const Value & x) |
35 | { |
36 | /// We must skip NaNs as they are not compatible with comparison sorting. |
37 | if (!isNaN(x)) |
38 | array.push_back(x); |
39 | } |
40 | |
41 | template <typename Weight> |
42 | void add(const Value &, const Weight &) |
43 | { |
44 | throw Exception("Method add with weight is not implemented for QuantileExact" , ErrorCodes::NOT_IMPLEMENTED); |
45 | } |
46 | |
47 | void merge(const QuantileExact & rhs) |
48 | { |
49 | array.insert(rhs.array.begin(), rhs.array.end()); |
50 | } |
51 | |
52 | void serialize(WriteBuffer & buf) const |
53 | { |
54 | size_t size = array.size(); |
55 | writeVarUInt(size, buf); |
56 | buf.write(reinterpret_cast<const char *>(array.data()), size * sizeof(array[0])); |
57 | } |
58 | |
59 | void deserialize(ReadBuffer & buf) |
60 | { |
61 | size_t size = 0; |
62 | readVarUInt(size, buf); |
63 | array.resize(size); |
64 | buf.read(reinterpret_cast<char *>(array.data()), size * sizeof(array[0])); |
65 | } |
66 | |
67 | /// Get the value of the `level` quantile. The level must be between 0 and 1. |
68 | Value get(Float64 level) |
69 | { |
70 | if (!array.empty()) |
71 | { |
72 | size_t n = level < 1 |
73 | ? level * array.size() |
74 | : (array.size() - 1); |
75 | |
76 | std::nth_element(array.begin(), array.begin() + n, array.end()); /// NOTE You can think of the radix-select algorithm. |
77 | return array[n]; |
78 | } |
79 | |
80 | return std::numeric_limits<Value>::quiet_NaN(); |
81 | } |
82 | |
83 | /// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address. |
84 | /// indices - an array of index levels such that the corresponding elements will go in ascending order. |
85 | void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result) |
86 | { |
87 | if (!array.empty()) |
88 | { |
89 | size_t prev_n = 0; |
90 | for (size_t i = 0; i < size; ++i) |
91 | { |
92 | auto level = levels[indices[i]]; |
93 | |
94 | size_t n = level < 1 |
95 | ? level * array.size() |
96 | : (array.size() - 1); |
97 | |
98 | std::nth_element(array.begin() + prev_n, array.begin() + n, array.end()); |
99 | |
100 | result[indices[i]] = array[n]; |
101 | prev_n = n; |
102 | } |
103 | } |
104 | else |
105 | { |
106 | for (size_t i = 0; i < size; ++i) |
107 | result[i] = Value(); |
108 | } |
109 | } |
110 | }; |
111 | |
112 | /// QuantileExactExclusive is equivalent to Excel PERCENTILE.EXC, R-6, SAS-4, SciPy-(0,0) |
113 | template <typename Value> |
114 | struct QuantileExactExclusive : public QuantileExact<Value> |
115 | { |
116 | using QuantileExact<Value>::array; |
117 | |
118 | /// Get the value of the `level` quantile. The level must be between 0 and 1 excluding bounds. |
119 | Float64 getFloat(Float64 level) |
120 | { |
121 | if (!array.empty()) |
122 | { |
123 | if (level == 0. || level == 1.) |
124 | throw Exception("QuantileExactExclusive cannot interpolate for the percentiles 1 and 0" , ErrorCodes::BAD_ARGUMENTS); |
125 | |
126 | Float64 h = level * (array.size() + 1); |
127 | auto n = static_cast<size_t>(h); |
128 | |
129 | if (n >= array.size()) |
130 | return array[array.size() - 1]; |
131 | else if (n < 1) |
132 | return array[0]; |
133 | |
134 | std::nth_element(array.begin(), array.begin() + n - 1, array.end()); |
135 | auto nth_element = std::min_element(array.begin() + n, array.end()); |
136 | |
137 | return array[n - 1] + (h - n) * (*nth_element - array[n - 1]); |
138 | } |
139 | |
140 | return std::numeric_limits<Float64>::quiet_NaN(); |
141 | } |
142 | |
143 | void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result) |
144 | { |
145 | if (!array.empty()) |
146 | { |
147 | size_t prev_n = 0; |
148 | for (size_t i = 0; i < size; ++i) |
149 | { |
150 | auto level = levels[indices[i]]; |
151 | if (level == 0. || level == 1.) |
152 | throw Exception("QuantileExactExclusive cannot interpolate for the percentiles 1 and 0" , ErrorCodes::BAD_ARGUMENTS); |
153 | |
154 | Float64 h = level * (array.size() + 1); |
155 | auto n = static_cast<size_t>(h); |
156 | |
157 | if (n >= array.size()) |
158 | result[indices[i]] = array[array.size() - 1]; |
159 | else if (n < 1) |
160 | result[indices[i]] = array[0]; |
161 | else |
162 | { |
163 | std::nth_element(array.begin() + prev_n, array.begin() + n - 1, array.end()); |
164 | auto nth_element = std::min_element(array.begin() + n, array.end()); |
165 | |
166 | result[indices[i]] = array[n - 1] + (h - n) * (*nth_element - array[n - 1]); |
167 | prev_n = n - 1; |
168 | } |
169 | } |
170 | } |
171 | else |
172 | { |
173 | for (size_t i = 0; i < size; ++i) |
174 | result[i] = std::numeric_limits<Float64>::quiet_NaN(); |
175 | } |
176 | } |
177 | }; |
178 | |
179 | /// QuantileExactInclusive is equivalent to Excel PERCENTILE and PERCENTILE.INC, R-7, SciPy-(1,1) |
180 | template <typename Value> |
181 | struct QuantileExactInclusive : public QuantileExact<Value> |
182 | { |
183 | using QuantileExact<Value>::array; |
184 | |
185 | /// Get the value of the `level` quantile. The level must be between 0 and 1 including bounds. |
186 | Float64 getFloat(Float64 level) |
187 | { |
188 | if (!array.empty()) |
189 | { |
190 | Float64 h = level * (array.size() - 1) + 1; |
191 | auto n = static_cast<size_t>(h); |
192 | |
193 | if (n >= array.size()) |
194 | return array[array.size() - 1]; |
195 | else if (n < 1) |
196 | return array[0]; |
197 | |
198 | std::nth_element(array.begin(), array.begin() + n - 1, array.end()); |
199 | auto nth_element = std::min_element(array.begin() + n, array.end()); |
200 | |
201 | return array[n - 1] + (h - n) * (*nth_element - array[n - 1]); |
202 | } |
203 | |
204 | return std::numeric_limits<Float64>::quiet_NaN(); |
205 | } |
206 | |
207 | void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result) |
208 | { |
209 | if (!array.empty()) |
210 | { |
211 | size_t prev_n = 0; |
212 | for (size_t i = 0; i < size; ++i) |
213 | { |
214 | auto level = levels[indices[i]]; |
215 | |
216 | Float64 h = level * (array.size() - 1) + 1; |
217 | auto n = static_cast<size_t>(h); |
218 | |
219 | if (n >= array.size()) |
220 | result[indices[i]] = array[array.size() - 1]; |
221 | else if (n < 1) |
222 | result[indices[i]] = array[0]; |
223 | else |
224 | { |
225 | std::nth_element(array.begin() + prev_n, array.begin() + n - 1, array.end()); |
226 | auto nth_element = std::min_element(array.begin() + n, array.end()); |
227 | |
228 | result[indices[i]] = array[n - 1] + (h - n) * (*nth_element - array[n - 1]); |
229 | prev_n = n - 1; |
230 | } |
231 | } |
232 | } |
233 | else |
234 | { |
235 | for (size_t i = 0; i < size; ++i) |
236 | result[i] = std::numeric_limits<Float64>::quiet_NaN(); |
237 | } |
238 | } |
239 | }; |
240 | |
241 | } |
242 | |