QuantileExact.h source code [ClickHouse/dbms/src/AggregateFunctions/QuantileExact.h]

1	#pragma once
2
3	#include <Common/PODArray.h>
4	#include <Common/NaNUtils.h>
5	#include <Core/Types.h>
6	#include <IO/WriteBuffer.h>
7	#include <IO/ReadBuffer.h>
8	#include <IO/VarInt.h>
9
10
11	namespace DB
12	{
13
14	namespace ErrorCodes
15	{
16	extern const int NOT_IMPLEMENTED;
17	extern const int BAD_ARGUMENTS;
18	}
19
20	/* Calculates quantile by collecting all values into array*
21	* and applying n-th element (introselect) algorithm for the resulting array.
22	*
23	* It uses O(N) memory and it is very inefficient in case of high amount of identical values.
24	* But it is very CPU efficient for not large datasets.
25	*/
26	template <typename Value>
27	struct QuantileExact
28	{
29	/// The memory will be allocated to several elements at once, so that the state occupies 64 bytes.
30	static constexpr size_t bytes_in_arena = `64` - sizeof(PODArray<Value>);
31	using Array = PODArrayWithStackMemory<Value, bytes_in_arena>;
32	Array array;
33
34	void add(const Value & x)
35	{
36	/// We must skip NaNs as they are not compatible with comparison sorting.
37	if (!isNaN(x))
38	array.push_back(x);
39	}
40
41	template <typename Weight>
42	void add(const Value &, const Weight &)
43	{
44	throw Exception ("Method add with weight is not implemented for QuantileExact", ErrorCodes::NOT_IMPLEMENTED);
45	}
46
47	void merge(const QuantileExact & rhs)
48	{
49	array.insert(rhs.array.begin(), rhs.array.end());
50	}
51
52	void serialize(WriteBuffer & buf) const
53	{
54	size_t size = array.size();
55	writeVarUInt(size, buf);
56	buf.write(reinterpret_cast<const char >(array.data()), size sizeof(array[`0`]));
57	}
58
59	void deserialize(ReadBuffer & buf)
60	{
61	size_t size = `0`;
62	readVarUInt(size, buf);
63	array.resize(size);
64	buf.read(reinterpret_cast<char >(array.data()), size sizeof(array[`0`]));
65	}
66
67	/// Get the value of the `level` quantile. The level must be between 0 and 1.
68	Value get(Float64 level)
69	{
70	if (!array.empty())
71	{
72	size_t n = level < `1`
73	? level * array.size()
74	: (array.size() - `1`);
75
76	std::nth_element(array.begin(), array.begin() + n, array.end()); /// NOTE You can think of the radix-select algorithm.
77	return array[n];
78	}
79
80	return std::numeric_limits<Value>::quiet_NaN();
81	}
82
83	/// Get the `size` values of `levels` quantiles. Write `size` results starting with `result` address.
84	/// indices - an array of index levels such that the corresponding elements will go in ascending order.
85	void getMany(const Float64 * levels, const size_t * indices, size_t size, Value * result)
86	{
87	if (!array.empty())
88	{
89	size_t prev_n = `0`;
90	for (size_t i = `0`; i < size; ++i)
91	{
92	auto level = levels[indices[i]];
93
94	size_t n = level < `1`
95	? level * array.size()
96	: (array.size() - `1`);
97
98	std::nth_element(array.begin() + prev_n, array.begin() + n, array.end());
99
100	result[indices[i]] = array[n];
101	prev_n = n;
102	}
103	}
104	else
105	{
106	for (size_t i = `0`; i < size; ++i)
107	result[i] = Value();
108	}
109	}
110	};
111
112	/// QuantileExactExclusive is equivalent to Excel PERCENTILE.EXC, R-6, SAS-4, SciPy-(0,0)
113	template <typename Value>
114	struct QuantileExactExclusive : public QuantileExact<Value>
115	{
116	using QuantileExact<Value>::array;
117
118	/// Get the value of the `level` quantile. The level must be between 0 and 1 excluding bounds.
119	Float64 getFloat(Float64 level)
120	{
121	if (!array.empty())
122	{
123	if (level == `0.` \|\| level == `1.`)
124	throw Exception ("QuantileExactExclusive cannot interpolate for the percentiles 1 and 0", ErrorCodes::BAD_ARGUMENTS);
125
126	Float64 h = level * (array.size() + `1`);
127	auto n = static_cast<size_t>(h);
128
129	if (n >= array.size())
130	return array[array.size() - `1`];
131	else if (n < `1`)
132	return array[`0`];
133
134	std::nth_element(array.begin(), array.begin() + n - `1`, array.end());
135	auto nth_element = std::min_element(array.begin() + n, array.end());
136
137	return array[n - `1`] + (h - n) * (*nth_element - array[n - `1`]);
138	}
139
140	return std::numeric_limits<Float64>::quiet_NaN();
141	}
142
143	void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result)
144	{
145	if (!array.empty())
146	{
147	size_t prev_n = `0`;
148	for (size_t i = `0`; i < size; ++i)
149	{
150	auto level = levels[indices[i]];
151	if (level == `0.` \|\| level == `1.`)
152	throw Exception ("QuantileExactExclusive cannot interpolate for the percentiles 1 and 0", ErrorCodes::BAD_ARGUMENTS);
153
154	Float64 h = level * (array.size() + `1`);
155	auto n = static_cast<size_t>(h);
156
157	if (n >= array.size())
158	result[indices[i]] = array[array.size() - `1`];
159	else if (n < `1`)
160	result[indices[i]] = array[`0`];
161	else
162	{
163	std::nth_element(array.begin() + prev_n, array.begin() + n - `1`, array.end());
164	auto nth_element = std::min_element(array.begin() + n, array.end());
165
166	result[indices[i]] = array[n - `1`] + (h - n) * (*nth_element - array[n - `1`]);
167	prev_n = n - `1`;
168	}
169	}
170	}
171	else
172	{
173	for (size_t i = `0`; i < size; ++i)
174	result[i] = std::numeric_limits<Float64>::quiet_NaN();
175	}
176	}
177	};
178
179	/// QuantileExactInclusive is equivalent to Excel PERCENTILE and PERCENTILE.INC, R-7, SciPy-(1,1)
180	template <typename Value>
181	struct QuantileExactInclusive : public QuantileExact<Value>
182	{
183	using QuantileExact<Value>::array;
184
185	/// Get the value of the `level` quantile. The level must be between 0 and 1 including bounds.
186	Float64 getFloat(Float64 level)
187	{
188	if (!array.empty())
189	{
190	Float64 h = level * (array.size() - `1`) + `1`;
191	auto n = static_cast<size_t>(h);
192
193	if (n >= array.size())
194	return array[array.size() - `1`];
195	else if (n < `1`)
196	return array[`0`];
197
198	std::nth_element(array.begin(), array.begin() + n - `1`, array.end());
199	auto nth_element = std::min_element(array.begin() + n, array.end());
200
201	return array[n - `1`] + (h - n) * (*nth_element - array[n - `1`]);
202	}
203
204	return std::numeric_limits<Float64>::quiet_NaN();
205	}
206
207	void getManyFloat(const Float64 * levels, const size_t * indices, size_t size, Float64 * result)
208	{
209	if (!array.empty())
210	{
211	size_t prev_n = `0`;
212	for (size_t i = `0`; i < size; ++i)
213	{
214	auto level = levels[indices[i]];
215
216	Float64 h = level * (array.size() - `1`) + `1`;
217	auto n = static_cast<size_t>(h);
218
219	if (n >= array.size())
220	result[indices[i]] = array[array.size() - `1`];
221	else if (n < `1`)
222	result[indices[i]] = array[`0`];
223	else
224	{
225	std::nth_element(array.begin() + prev_n, array.begin() + n - `1`, array.end());
226	auto nth_element = std::min_element(array.begin() + n, array.end());
227
228	result[indices[i]] = array[n - `1`] + (h - n) * (*nth_element - array[n - `1`]);
229	prev_n = n - `1`;
230	}
231	}
232	}
233	else
234	{
235	for (size_t i = `0`; i < size; ++i)
236	result[i] = std::numeric_limits<Float64>::quiet_NaN();
237	}
238	}
239	};
240
241	}
242

Browse the source code of ClickHouse/dbms/src/AggregateFunctions/QuantileExact.h