1#pragma once
2
3#include <Common/PODArray.h>
4#include <Common/NaNUtils.h>
5#include <Core/Types.h>
6#include <IO/WriteBuffer.h>
7#include <IO/ReadBuffer.h>
8#include <IO/VarInt.h>
9
10
11namespace DB
12{
13
14namespace ErrorCodes
15{
16 extern const int NOT_IMPLEMENTED;
17 extern const int BAD_ARGUMENTS;
18}
19
20/** Calculates quantile by collecting all values into array
21 * and applying n-th element (introselect) algorithm for the resulting array.
22 *
23 * It uses O(N) memory and it is very inefficient in case of high amount of identical values.
24 * But it is very CPU efficient for not large datasets.
25 */
26template <typename Value>
27struct QuantileExact
28{
29 /// The memory will be allocated to several elements at once, so that the state occupies 64 bytes.
30 static constexpr size_t bytes_in_arena = 64 - sizeof(PODArray<Value>);
31 using Array = PODArrayWithStackMemory<Value, bytes_in_arena>;
32 Array array;
33
34 void add(const Value & x)
35 {
36 /// We must skip NaNs as they are not compatible with comparison sorting.
37 if (!isNaN(x))
38 array.push_back(x);
39 }
40
41 template <typename Weight>
42 void add(const Value &, const Weight &)
43 {
44 throw Exception("Method add with weight is not implemented for QuantileExact", ErrorCodes::NOT_IMPLEMENTED);
45 }
46
47 void merge(const QuantileExact & rhs)
48 {
49 array.insert(rhs.array.begin(), rhs.array.end());
50 }
51
52 void serialize(WriteBuffer & buf) const
53 {
54 size_t size = array.size();
55 writeVarUInt(size, buf);
56 buf.write(reinterpret_cast<const char *>(array.data()), size * sizeof(array[0]));
57 }
58
59 void deserialize(ReadBuffer & buf)
60 {
61 size_t size = 0;
62 readVarUInt(size, buf);
63 array.resize(size);
64 buf.read(reinterpret_cast<char *>(array.data()), size * sizeof(array[0]));
65 }
66
67 /// Get the value of the `level` quantile. The level must be between 0 and 1.
68 Value get(Float64 level)
69 {
70 if (!array.empty())
71 {
72 size_t n = level < 1
73 ? level * array.size()
74 : (array.size() - 1);
75
76 std::nth_element(array.begin(), array.begin() + n, array.end()); /// NOTE You can think of the radix-select algorithm.
77 return array[n];
78 }
79
80 return std::numeric_limits<Value>::quiet_NaN();
81 }
82
83 /// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
84 /// indices - an array of index levels such that the corresponding elements will go in ascending order.
85 void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result)
86 {
87 if (!array.empty())
88 {
89 size_t prev_n = 0;
90 for (size_t i = 0; i < size; ++i)
91 {
92 auto level = levels[indices[i]];
93
94 size_t n = level < 1
95 ? level * array.size()
96 : (array.size() - 1);
97
98 std::nth_element(array.begin() + prev_n, array.begin() + n, array.end());
99
100 result[indices[i]] = array[n];
101 prev_n = n;
102 }
103 }
104 else
105 {
106 for (size_t i = 0; i < size; ++i)
107 result[i] = Value();
108 }
109 }
110};
111
112/// QuantileExactExclusive is equivalent to Excel PERCENTILE.EXC, R-6, SAS-4, SciPy-(0,0)
113template <typename Value>
114struct QuantileExactExclusive : public QuantileExact<Value>
115{
116 using QuantileExact<Value>::array;
117
118 /// Get the value of the `level` quantile. The level must be between 0 and 1 excluding bounds.
119 Float64 getFloat(Float64 level)
120 {
121 if (!array.empty())
122 {
123 if (level == 0. || level == 1.)
124 throw Exception("QuantileExactExclusive cannot interpolate for the percentiles 1 and 0", ErrorCodes::BAD_ARGUMENTS);
125
126 Float64 h = level * (array.size() + 1);
127 auto n = static_cast<size_t>(h);
128
129 if (n >= array.size())
130 return array[array.size() - 1];
131 else if (n < 1)
132 return array[0];
133
134 std::nth_element(array.begin(), array.begin() + n - 1, array.end());
135 auto nth_element = std::min_element(array.begin() + n, array.end());
136
137 return array[n - 1] + (h - n) * (*nth_element - array[n - 1]);
138 }
139
140 return std::numeric_limits<Float64>::quiet_NaN();
141 }
142
143 void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result)
144 {
145 if (!array.empty())
146 {
147 size_t prev_n = 0;
148 for (size_t i = 0; i < size; ++i)
149 {
150 auto level = levels[indices[i]];
151 if (level == 0. || level == 1.)
152 throw Exception("QuantileExactExclusive cannot interpolate for the percentiles 1 and 0", ErrorCodes::BAD_ARGUMENTS);
153
154 Float64 h = level * (array.size() + 1);
155 auto n = static_cast<size_t>(h);
156
157 if (n >= array.size())
158 result[indices[i]] = array[array.size() - 1];
159 else if (n < 1)
160 result[indices[i]] = array[0];
161 else
162 {
163 std::nth_element(array.begin() + prev_n, array.begin() + n - 1, array.end());
164 auto nth_element = std::min_element(array.begin() + n, array.end());
165
166 result[indices[i]] = array[n - 1] + (h - n) * (*nth_element - array[n - 1]);
167 prev_n = n - 1;
168 }
169 }
170 }
171 else
172 {
173 for (size_t i = 0; i < size; ++i)
174 result[i] = std::numeric_limits<Float64>::quiet_NaN();
175 }
176 }
177};
178
179/// QuantileExactInclusive is equivalent to Excel PERCENTILE and PERCENTILE.INC, R-7, SciPy-(1,1)
180template <typename Value>
181struct QuantileExactInclusive : public QuantileExact<Value>
182{
183 using QuantileExact<Value>::array;
184
185 /// Get the value of the `level` quantile. The level must be between 0 and 1 including bounds.
186 Float64 getFloat(Float64 level)
187 {
188 if (!array.empty())
189 {
190 Float64 h = level * (array.size() - 1) + 1;
191 auto n = static_cast<size_t>(h);
192
193 if (n >= array.size())
194 return array[array.size() - 1];
195 else if (n < 1)
196 return array[0];
197
198 std::nth_element(array.begin(), array.begin() + n - 1, array.end());
199 auto nth_element = std::min_element(array.begin() + n, array.end());
200
201 return array[n - 1] + (h - n) * (*nth_element - array[n - 1]);
202 }
203
204 return std::numeric_limits<Float64>::quiet_NaN();
205 }
206
207 void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result)
208 {
209 if (!array.empty())
210 {
211 size_t prev_n = 0;
212 for (size_t i = 0; i < size; ++i)
213 {
214 auto level = levels[indices[i]];
215
216 Float64 h = level * (array.size() - 1) + 1;
217 auto n = static_cast<size_t>(h);
218
219 if (n >= array.size())
220 result[indices[i]] = array[array.size() - 1];
221 else if (n < 1)
222 result[indices[i]] = array[0];
223 else
224 {
225 std::nth_element(array.begin() + prev_n, array.begin() + n - 1, array.end());
226 auto nth_element = std::min_element(array.begin() + n, array.end());
227
228 result[indices[i]] = array[n - 1] + (h - n) * (*nth_element - array[n - 1]);
229 prev_n = n - 1;
230 }
231 }
232 }
233 else
234 {
235 for (size_t i = 0; i < size; ++i)
236 result[i] = std::numeric_limits<Float64>::quiet_NaN();
237 }
238 }
239};
240
241}
242