ColumnVector.cpp source code [ClickHouse/dbms/src/Columns/ColumnVector.cpp]

1	#include "ColumnVector.h"
2
3	#include <cstring>
4	#include <cmath>
5	#include <common/unaligned.h>
6	#include <Common/Exception.h>
7	#include <Common/Arena.h>
8	#include <Common/SipHash.h>
9	#include <Common/NaNUtils.h>
10	#include <Common/RadixSort.h>
11	#include <Common/assert_cast.h>
12	#include <IO/WriteBuffer.h>
13	#include <IO/WriteHelpers.h>
14	#include <Columns/ColumnsCommon.h>
15	#include <DataStreams/ColumnGathererStream.h>
16	#include <ext/bit_cast.h>
17	#include <pdqsort.h>
18
19	#ifdef __SSE2__
20	#include <emmintrin.h>
21	#endif
22
23	namespace DB
24	{
25
/// Error codes thrown by the methods in this file; the constants themselves are defined elsewhere.
namespace ErrorCodes
{
    /// Thrown when the number of rows in a filter / permutation / offsets array does not fit the column.
    extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
    /// Thrown by insertRangeFrom when the requested range lies outside the source column.
    extern const int PARAMETER_OUT_OF_BOUND;
}
31
32
33	template <typename T>
34	StringRef ColumnVector<T>::serializeValueIntoArena(size_t n, Arena & arena, char const & begin) const*
35	{
36	auto pos = arena.allocContinue(sizeof(T), begin);
37	unalignedStore<T>(pos, data[n]);
38	return StringRef (pos, sizeof(T));
39	}
40
41	template <typename T>
42	const char * ColumnVector<T>::deserializeAndInsertFromArena(const char * pos)
43	{
44	data.push_back(unalignedLoad<T>(pos));
45	return pos + sizeof(T);
46	}
47
48	template <typename T>
49	void ColumnVector<T>::updateHashWithValue(size_t n, SipHash & hash) const
50	{
51	hash.update(data[n]);
52	}
53
54	template <typename T>
55	struct ColumnVector<T>::less
56	{
57	const Self & parent;
58	int nan_direction_hint;
59	less(const Self & parent_, int nan_direction_hint_) : parent(parent_), nan_direction_hint(nan_direction_hint_) {}
60	bool operator()(size_t lhs, size_t rhs) const { return CompareHelper<T>::less(parent.data[lhs], parent.data[rhs], nan_direction_hint); }
61	};
62
63	template <typename T>
64	struct ColumnVector<T>::greater
65	{
66	const Self & parent;
67	int nan_direction_hint;
68	greater(const Self & parent_, int nan_direction_hint_) : parent(parent_), nan_direction_hint(nan_direction_hint_) {}
69	bool operator()(size_t lhs, size_t rhs) const { return CompareHelper<T>::greater(parent.data[lhs], parent.data[rhs], nan_direction_hint); }
70	};
71
72
73	namespace
74	{
75	template <typename T>
76	struct ValueWithIndex
77	{
78	T value;
79	UInt32 index;
80	};
81
82	template <typename T>
83	struct RadixSortTraits : RadixSortNumTraits<T>
84	{
85	using Element = ValueWithIndex<T>;
86	static T & extractKey(Element & elem) { return elem.value; }
87	};
88	}
89
90	template <typename T>
91	void ColumnVector<T>::getPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res) const
92	{
93	size_t s = data.size();
94	res.resize(s);
95
96	if (s == `0`)
97	return;
98
99	if (limit >= s)
100	limit = `0`;
101
102	if (limit)
103	{
104	for (size_t i = `0`; i < s; ++i)
105	res [i] = i;
106
107	if (reverse)
108	std::partial_sort(res.begin(), res.begin() + limit, res.end(), greater(*this, nan_direction_hint));
109	else
110	std::partial_sort(res.begin(), res.begin() + limit, res.end(), less(*this, nan_direction_hint));
111	}
112	else
113	{
114	/// A case for radix sort
115	if constexpr (is_arithmetic_v<T> && !std::is_same_v<T, UInt128>)
116	{
117	/// Thresholds on size. Lower threshold is arbitrary. Upper threshold is chosen by the type for histogram counters.
118	if (s >= `256` && s <= std::numeric_limits<UInt32>::max())
119	{
120	PaddedPODArray<ValueWithIndex<T>> pairs(s);
121	for (UInt32 i = `0`; i < s; ++i)
122	pairs[i] = {data[i], i};
123
124	RadixSort<RadixSortTraits<T>>::executeLSD(pairs.data(), s);
125
126	/// Radix sort treats all NaNs to be greater than all numbers.
127	/// If the user needs the opposite, we must move them accordingly.
128	size_t nans_to_move = `0`;
129	if (std::is_floating_point_v<T> && nan_direction_hint < `0`)
130	{
131	for (ssize_t i = s - `1`; i >= `0`; --i)
132	{
133	if (isNaN(pairs[i].value))
134	++nans_to_move;
135	else
136	break;
137	}
138	}
139
140	if (reverse)
141	{
142	if (nans_to_move)
143	{
144	for (size_t i = `0`; i < s - nans_to_move; ++i)
145	res [i] = pairs[s - nans_to_move - `1` - i].index;
146	for (size_t i = s - nans_to_move; i < s; ++i)
147	res [i] = pairs[s - `1` - (i - (s - nans_to_move))].index;
148	}
149	else
150	{
151	for (size_t i = `0`; i < s; ++i)
152	res [s - `1` - i] = pairs[i].index;
153	}
154	}
155	else
156	{
157	if (nans_to_move)
158	{
159	for (size_t i = `0`; i < nans_to_move; ++i)
160	res [i] = pairs[i + s - nans_to_move].index;
161	for (size_t i = nans_to_move; i < s; ++i)
162	res [i] = pairs[i - nans_to_move].index;
163	}
164	else
165	{
166	for (size_t i = `0`; i < s; ++i)
167	res [i] = pairs[i].index;
168	}
169	}
170
171	return;
172	}
173	}
174
175	/// Default sorting algorithm.
176	for (size_t i = `0`; i < s; ++i)
177	res [i] = i;
178
179	if (reverse)
180	pdqsort(res.begin(), res.end(), greater(*this, nan_direction_hint));
181	else
182	pdqsort(res.begin(), res.end(), less(*this, nan_direction_hint));
183	}
184	}
185
186
187	template <typename T>
188	const char * ColumnVector<T>::getFamilyName() const
189	{
190	return TypeName<T>::get();
191	}
192
193	template <typename T>
194	MutableColumnPtr ColumnVector<T>::cloneResized(size_t size) const
195	{
196	auto res = this->create();
197
198	if (size > `0`)
199	{
200	auto & new_col = static_cast<Self &>(*res);
201	new_col.data.resize(size);
202
203	size_t count = std::min(this->size(), size);
204	memcpy(new_col.data.data(), data.data(), count * sizeof(data[`0`]));
205
206	if (size > count)
207	memset(static_cast<void >(&new_col.data[count]), static_cast<int>(ValueType()), (size - count) sizeof(ValueType));
208	}
209
210	return res;
211	}
212
213	template <typename T>
214	UInt64 ColumnVector<T>::get64(size_t n) const
215	{
216	return ext::bit_cast<UInt64>(data[n]);
217	}
218
219	template <typename T>
220	Float64 ColumnVector<T>::getFloat64(size_t n) const
221	{
222	return static_cast<Float64>(data[n]);
223	}
224
225	template <typename T>
226	Float32 ColumnVector<T>::getFloat32(size_t n) const
227	{
228	return static_cast<Float32>(data[n]);
229	}
230
231	template <typename T>
232	void ColumnVector<T>::insertRangeFrom(const IColumn & src, size_t start, size_t length)
233	{
234	const ColumnVector & src_vec = assert_cast<const ColumnVector &>(src);
235
236	if (start + length > src_vec.data.size())
237	throw Exception("Parameters start = "
238	+ toString(start) + ", length = "
239	+ toString(length) + " are out of bound in ColumnVector<T>::insertRangeFrom method"
240	" (data.size() = " + toString(src_vec.data.size()) + ").",
241	ErrorCodes::PARAMETER_OUT_OF_BOUND);
242
243	size_t old_size = data.size();
244	data.resize(old_size + length);
245	memcpy(data.data() + old_size, &src_vec.data[start], length * sizeof(data[`0`]));
246	}
247
248	template <typename T>
249	ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_size_hint) const
250	{
251	size_t size = data.size();
252	if (size != filt.size())
253	throw Exception ("Size of filter doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
254
255	auto res = this->create();
256	Container & res_data = res->getData();
257
258	if (result_size_hint)
259	res_data.reserve(result_size_hint > `0` ? result_size_hint : size);
260
261	const UInt8 * filt_pos = filt.data();
262	const UInt8 * filt_end = filt_pos + size;
263	const T * data_pos = data.data();
264
265	#ifdef __SSE2__
266	/* A slightly more optimized version.*
267	* Based on the assumption that often pieces of consecutive values
268	* completely pass or do not pass the filter.
269	* Therefore, we will optimistically check the parts of `SIMD_BYTES` values.
270	*/
271
272	static constexpr size_t SIMD_BYTES = `16`;
273	const __m128i zero16 = _mm_setzero_si128();
274	const UInt8 * filt_end_sse = filt_pos + size / SIMD_BYTES * SIMD_BYTES;
275
276	while (filt_pos < filt_end_sse)
277	{
278	int mask = _mm_movemask_epi8(_mm_cmpgt_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i *>(filt_pos)), zero16));
279
280	if (`0` == mask)
281	{
282	/// Nothing is inserted.
283	}
284	else if (`0xFFFF` == mask)
285	{
286	res_data.insert(data_pos, data_pos + SIMD_BYTES);
287	}
288	else
289	{
290	for (size_t i = `0`; i < SIMD_BYTES; ++i)
291	if (filt_pos[i])
292	res_data.push_back(data_pos[i]);
293	}
294
295	filt_pos += SIMD_BYTES;
296	data_pos += SIMD_BYTES;
297	}
298	#endif
299
300	while (filt_pos < filt_end)
301	{
302	if (*filt_pos)
303	res_data.push_back(*data_pos);
304
305	++filt_pos;
306	++data_pos;
307	}
308
309	return res;
310	}
311
312	template <typename T>
313	ColumnPtr ColumnVector<T>::permute(const IColumn::Permutation & perm, size_t limit) const
314	{
315	size_t size = data.size();
316
317	if (limit == `0`)
318	limit = size;
319	else
320	limit = std::min(size, limit);
321
322	if (perm.size() < limit)
323	throw Exception ("Size of permutation is less than required.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
324
325	auto res = this->create(limit);
326	typename Self::Container & res_data = res->getData();
327	for (size_t i = `0`; i < limit; ++i)
328	res_data[i] = data[perm [i]];
329
330	return res;
331	}
332
333	template <typename T>
334	ColumnPtr ColumnVector<T>::index(const IColumn & indexes, size_t limit) const
335	{
336	return selectIndexImpl(*this, indexes, limit);
337	}
338
339	template <typename T>
340	ColumnPtr ColumnVector<T>::replicate(const IColumn::Offsets & offsets) const
341	{
342	size_t size = data.size();
343	if (size != offsets.size())
344	throw Exception ("Size of offsets doesn't match size of column.", ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);
345
346	if (`0` == size)
347	return this->create();
348
349	auto res = this->create();
350	typename Self::Container & res_data = res->getData();
351	res_data.reserve(offsets.back());
352
353	IColumn::Offset prev_offset = `0`;
354	for (size_t i = `0`; i < size; ++i)
355	{
356	size_t size_to_replicate = offsets [i] - prev_offset;
357	prev_offset = offsets [i];
358
359	for (size_t j = `0`; j < size_to_replicate; ++j)
360	res_data.push_back(data[i]);
361	}
362
363	return res;
364	}
365
366	template <typename T>
367	void ColumnVector<T>::gather(ColumnGathererStream & gatherer)
368	{
369	gatherer.gather(*this);
370	}
371
372	template <typename T>
373	void ColumnVector<T>::getExtremes(Field & min, Field & max) const
374	{
375	size_t size = data.size();
376
377	if (size == `0`)
378	{
379	min = T(`0`);
380	max = T(`0`);
381	return;
382	}
383
384	bool has_value = false;
385
386	/* Skip all NaNs in extremes calculation.*
387	* If all values are NaNs, then return NaN.
388	* NOTE: There exist many different NaNs.
389	* Different NaN could be returned: not bit-exact value as one of NaNs from column.
390	*/
391
392	T cur_min = NaNOrZero<T>();
393	T cur_max = NaNOrZero<T>();
394
395	for (const T x : data)
396	{
397	if (isNaN(x))
398	continue;
399
400	if (!has_value)
401	{
402	cur_min = x;
403	cur_max = x;
404	has_value = true;
405	continue;
406	}
407
408	if (x < cur_min)
409	cur_min = x;
410	else if (x > cur_max)
411	cur_max = x;
412	}
413
414	min = NearestFieldType<T>(cur_min);
415	max = NearestFieldType<T>(cur_max);
416	}
417
418	/// Explicit template instantiations - to avoid code bloat in headers.
419	template class ColumnVector<UInt8>;
420	template class ColumnVector<UInt16>;
421	template class ColumnVector<UInt32>;
422	template class ColumnVector<UInt64>;
423	template class ColumnVector<UInt128>;
424	template class ColumnVector<Int8>;
425	template class ColumnVector<Int16>;
426	template class ColumnVector<Int32>;
427	template class ColumnVector<Int64>;
428	template class ColumnVector<Int128>;
429	template class ColumnVector<Float32>;
430	template class ColumnVector<Float64>;
431	}
432

Browse the source code of ClickHouse/dbms/src/Columns/ColumnVector.cpp