1#include <Columns/ColumnLowCardinality.h>
2#include <Columns/ColumnsNumber.h>
3#include <DataStreams/ColumnGathererStream.h>
4#include <DataTypes/NumberTraits.h>
5#include <Common/HashTable/HashMap.h>
6#include <Common/assert_cast.h>
7
8
9namespace DB
10{
11
12namespace
13{
14 template <typename T>
15 PaddedPODArray<T> * getIndexesData(IColumn & indexes)
16 {
17 auto * column = typeid_cast<ColumnVector<T> *>(&indexes);
18 if (column)
19 return &column->getData();
20
21 return nullptr;
22 }
23
24 template <typename T>
25 MutableColumnPtr mapUniqueIndexImplRef(PaddedPODArray<T> & index)
26 {
27 PaddedPODArray<T> copy(index.cbegin(), index.cend());
28
29 HashMap<T, T> hash_map;
30 for (auto val : index)
31 hash_map.insert({val, hash_map.size()});
32
33 auto res_col = ColumnVector<T>::create();
34 auto & data = res_col->getData();
35
36 data.resize(hash_map.size());
37 for (const auto & val : hash_map)
38 data[val.getMapped()] = val.getKey();
39
40 for (auto & ind : index)
41 ind = hash_map[ind];
42
43 for (size_t i = 0; i < index.size(); ++i)
44 if (data[index[i]] != copy[i])
45 throw Exception("Expected " + toString(data[index[i]]) + ", but got " + toString(copy[i]), ErrorCodes::LOGICAL_ERROR);
46
47 return res_col;
48 }
49
50 template <typename T>
51 MutableColumnPtr mapUniqueIndexImpl(PaddedPODArray<T> & index)
52 {
53 if (index.empty())
54 return ColumnVector<T>::create();
55
56 auto size = index.size();
57
58 T max_val = index[0];
59 for (size_t i = 1; i < size; ++i)
60 max_val = std::max(max_val, index[i]);
61
62 /// May happen when dictionary is shared.
63 if (max_val > size)
64 return mapUniqueIndexImplRef(index);
65
66 auto map_size = UInt64(max_val) + 1;
67 PaddedPODArray<T> map(map_size, 0);
68 T zero_pos_value = index[0];
69 index[0] = 0;
70 T cur_pos = 0;
71 for (size_t i = 1; i < size; ++i)
72 {
73 T val = index[i];
74 if (val != zero_pos_value && map[val] == 0)
75 {
76 ++cur_pos;
77 map[val] = cur_pos;
78 }
79
80 index[i] = map[val];
81 }
82
83 auto res_col = ColumnVector<T>::create(UInt64(cur_pos) + 1);
84 auto & data = res_col->getData();
85 data[0] = zero_pos_value;
86 for (size_t i = 0; i < map_size; ++i)
87 {
88 auto val = map[i];
89 if (val)
90 data[val] = static_cast<T>(i);
91 }
92
93 return res_col;
94 }
95
96 /// Returns unique values of column. Write new index to column.
97 MutableColumnPtr mapUniqueIndex(IColumn & column)
98 {
99 if (auto * data_uint8 = getIndexesData<UInt8>(column))
100 return mapUniqueIndexImpl(*data_uint8);
101 else if (auto * data_uint16 = getIndexesData<UInt16>(column))
102 return mapUniqueIndexImpl(*data_uint16);
103 else if (auto * data_uint32 = getIndexesData<UInt32>(column))
104 return mapUniqueIndexImpl(*data_uint32);
105 else if (auto * data_uint64 = getIndexesData<UInt64>(column))
106 return mapUniqueIndexImpl(*data_uint64);
107 else
108 throw Exception("Indexes column for getUniqueIndex must be ColumnUInt, got" + column.getName(),
109 ErrorCodes::LOGICAL_ERROR);
110 }
111}
112
113
114ColumnLowCardinality::ColumnLowCardinality(MutableColumnPtr && column_unique_, MutableColumnPtr && indexes_, bool is_shared)
115 : dictionary(std::move(column_unique_), is_shared), idx(std::move(indexes_))
116{
117 idx.check(getDictionary().size());
118}
119
120void ColumnLowCardinality::insert(const Field & x)
121{
122 compactIfSharedDictionary();
123 idx.insertPosition(dictionary.getColumnUnique().uniqueInsert(x));
124 idx.check(getDictionary().size());
125}
126
127void ColumnLowCardinality::insertDefault()
128{
129 idx.insertPosition(getDictionary().getDefaultValueIndex());
130}
131
132void ColumnLowCardinality::insertFrom(const IColumn & src, size_t n)
133{
134 auto * low_cardinality_src = typeid_cast<const ColumnLowCardinality *>(&src);
135
136 if (!low_cardinality_src)
137 throw Exception("Expected ColumnLowCardinality, got" + src.getName(), ErrorCodes::ILLEGAL_COLUMN);
138
139 size_t position = low_cardinality_src->getIndexes().getUInt(n);
140
141 if (&low_cardinality_src->getDictionary() == &getDictionary())
142 {
143 /// Dictionary is shared with src column. Insert only index.
144 idx.insertPosition(position);
145 }
146 else
147 {
148 compactIfSharedDictionary();
149 const auto & nested = *low_cardinality_src->getDictionary().getNestedColumn();
150 idx.insertPosition(dictionary.getColumnUnique().uniqueInsertFrom(nested, position));
151 }
152
153 idx.check(getDictionary().size());
154}
155
156void ColumnLowCardinality::insertFromFullColumn(const IColumn & src, size_t n)
157{
158 compactIfSharedDictionary();
159 idx.insertPosition(dictionary.getColumnUnique().uniqueInsertFrom(src, n));
160 idx.check(getDictionary().size());
161}
162
163void ColumnLowCardinality::insertRangeFrom(const IColumn & src, size_t start, size_t length)
164{
165 auto * low_cardinality_src = typeid_cast<const ColumnLowCardinality *>(&src);
166
167 if (!low_cardinality_src)
168 throw Exception("Expected ColumnLowCardinality, got " + src.getName(), ErrorCodes::ILLEGAL_COLUMN);
169
170 if (&low_cardinality_src->getDictionary() == &getDictionary())
171 {
172 /// Dictionary is shared with src column. Insert only indexes.
173 idx.insertPositionsRange(low_cardinality_src->getIndexes(), start, length);
174 }
175 else
176 {
177 compactIfSharedDictionary();
178
179 /// TODO: Support native insertion from other unique column. It will help to avoid null map creation.
180
181 auto sub_idx = (*low_cardinality_src->getIndexes().cut(start, length)).mutate();
182 auto idx_map = mapUniqueIndex(*sub_idx);
183
184 auto src_nested = low_cardinality_src->getDictionary().getNestedColumn();
185 auto used_keys = src_nested->index(*idx_map, 0);
186
187 auto inserted_indexes = dictionary.getColumnUnique().uniqueInsertRangeFrom(*used_keys, 0, used_keys->size());
188 idx.insertPositionsRange(*inserted_indexes->index(*sub_idx, 0), 0, length);
189 }
190 idx.check(getDictionary().size());
191}
192
193void ColumnLowCardinality::insertRangeFromFullColumn(const IColumn & src, size_t start, size_t length)
194{
195 compactIfSharedDictionary();
196 auto inserted_indexes = dictionary.getColumnUnique().uniqueInsertRangeFrom(src, start, length);
197 idx.insertPositionsRange(*inserted_indexes, 0, length);
198 idx.check(getDictionary().size());
199}
200
201void ColumnLowCardinality::insertRangeFromDictionaryEncodedColumn(const IColumn & keys, const IColumn & positions)
202{
203 Index(positions.getPtr()).check(keys.size());
204 compactIfSharedDictionary();
205 auto inserted_indexes = dictionary.getColumnUnique().uniqueInsertRangeFrom(keys, 0, keys.size());
206 idx.insertPositionsRange(*inserted_indexes->index(positions, 0), 0, positions.size());
207 idx.check(getDictionary().size());
208}
209
210void ColumnLowCardinality::insertData(const char * pos, size_t length)
211{
212 compactIfSharedDictionary();
213 idx.insertPosition(dictionary.getColumnUnique().uniqueInsertData(pos, length));
214 idx.check(getDictionary().size());
215}
216
217StringRef ColumnLowCardinality::serializeValueIntoArena(size_t n, Arena & arena, char const *& begin) const
218{
219 return getDictionary().serializeValueIntoArena(getIndexes().getUInt(n), arena, begin);
220}
221
222const char * ColumnLowCardinality::deserializeAndInsertFromArena(const char * pos)
223{
224 compactIfSharedDictionary();
225
226 const char * new_pos;
227 idx.insertPosition(dictionary.getColumnUnique().uniqueDeserializeAndInsertFromArena(pos, new_pos));
228
229 idx.check(getDictionary().size());
230 return new_pos;
231}
232
233void ColumnLowCardinality::gather(ColumnGathererStream & gatherer)
234{
235 gatherer.gather(*this);
236}
237
238MutableColumnPtr ColumnLowCardinality::cloneResized(size_t size) const
239{
240 auto unique_ptr = dictionary.getColumnUniquePtr();
241 if (size == 0)
242 unique_ptr = unique_ptr->cloneEmpty();
243
244 return ColumnLowCardinality::create((*std::move(unique_ptr)).mutate(), getIndexes().cloneResized(size));
245}
246
247int ColumnLowCardinality::compareAt(size_t n, size_t m, const IColumn & rhs, int nan_direction_hint) const
248{
249 const auto & low_cardinality_column = assert_cast<const ColumnLowCardinality &>(rhs);
250 size_t n_index = getIndexes().getUInt(n);
251 size_t m_index = low_cardinality_column.getIndexes().getUInt(m);
252 return getDictionary().compareAt(n_index, m_index, low_cardinality_column.getDictionary(), nan_direction_hint);
253}
254
255void ColumnLowCardinality::getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const
256{
257 if (limit == 0)
258 limit = size();
259
260 size_t unique_limit = getDictionary().size();
261 Permutation unique_perm;
262 getDictionary().getNestedColumn()->getPermutation(reverse, unique_limit, nan_direction_hint, unique_perm);
263
264 /// TODO: optimize with sse.
265
266 /// Get indexes per row in column_unique.
267 std::vector<std::vector<size_t>> indexes_per_row(getDictionary().size());
268 size_t indexes_size = getIndexes().size();
269 for (size_t row = 0; row < indexes_size; ++row)
270 indexes_per_row[getIndexes().getUInt(row)].push_back(row);
271
272 /// Replicate permutation.
273 size_t perm_size = std::min(indexes_size, limit);
274 res.resize(perm_size);
275 size_t perm_index = 0;
276 for (size_t row = 0; row < unique_perm.size() && perm_index < perm_size; ++row)
277 {
278 const auto & row_indexes = indexes_per_row[unique_perm[row]];
279 for (auto row_index : row_indexes)
280 {
281 res[perm_index] = row_index;
282 ++perm_index;
283
284 if (perm_index == perm_size)
285 break;
286 }
287 }
288}
289
290std::vector<MutableColumnPtr> ColumnLowCardinality::scatter(ColumnIndex num_columns, const Selector & selector) const
291{
292 auto columns = getIndexes().scatter(num_columns, selector);
293 for (auto & column : columns)
294 {
295 auto unique_ptr = dictionary.getColumnUniquePtr();
296 column = ColumnLowCardinality::create((*std::move(unique_ptr)).mutate(), std::move(column));
297 }
298
299 return columns;
300}
301
302void ColumnLowCardinality::setSharedDictionary(const ColumnPtr & column_unique)
303{
304 if (!empty())
305 throw Exception("Can't set ColumnUnique for ColumnLowCardinality because is't not empty.",
306 ErrorCodes::LOGICAL_ERROR);
307
308 dictionary.setShared(column_unique);
309}
310
311ColumnLowCardinality::MutablePtr ColumnLowCardinality::cutAndCompact(size_t start, size_t length) const
312{
313 auto sub_positions = (*idx.getPositions()->cut(start, length)).mutate();
314 /// Create column with new indexes and old dictionary.
315 /// Dictionary is shared, but will be recreated after compactInplace call.
316 auto column = ColumnLowCardinality::create(getDictionary().assumeMutable(), std::move(sub_positions));
317 /// Will create new dictionary.
318 column->compactInplace();
319
320 return column;
321}
322
323void ColumnLowCardinality::compactInplace()
324{
325 auto positions = idx.detachPositions();
326 dictionary.compact(positions);
327 idx.attachPositions(std::move(positions));
328}
329
330void ColumnLowCardinality::compactIfSharedDictionary()
331{
332 if (dictionary.isShared())
333 compactInplace();
334}
335
336
337ColumnLowCardinality::DictionaryEncodedColumn
338ColumnLowCardinality::getMinimalDictionaryEncodedColumn(UInt64 offset, UInt64 limit) const
339{
340 MutableColumnPtr sub_indexes = (*std::move(idx.getPositions()->cut(offset, limit))).mutate();
341 auto indexes_map = mapUniqueIndex(*sub_indexes);
342 auto sub_keys = getDictionary().getNestedColumn()->index(*indexes_map, 0);
343
344 return {std::move(sub_keys), std::move(sub_indexes)};
345}
346
347ColumnPtr ColumnLowCardinality::countKeys() const
348{
349 const auto & nested_column = getDictionary().getNestedColumn();
350 size_t dict_size = nested_column->size();
351
352 auto counter = ColumnUInt64::create(dict_size, 0);
353 idx.countKeys(counter->getData());
354 return counter;
355}
356
357bool ColumnLowCardinality::containsNull() const
358{
359 return getDictionary().nestedColumnIsNullable() && idx.containsDefault();
360}
361
362
363ColumnLowCardinality::Index::Index() : positions(ColumnUInt8::create()), size_of_type(sizeof(UInt8)) {}
364
365ColumnLowCardinality::Index::Index(MutableColumnPtr && positions_) : positions(std::move(positions_))
366{
367 updateSizeOfType();
368}
369
370ColumnLowCardinality::Index::Index(ColumnPtr positions_) : positions(std::move(positions_))
371{
372 updateSizeOfType();
373}
374
375template <typename Callback>
376void ColumnLowCardinality::Index::callForType(Callback && callback, size_t size_of_type)
377{
378 switch (size_of_type)
379 {
380 case sizeof(UInt8): { callback(UInt8()); break; }
381 case sizeof(UInt16): { callback(UInt16()); break; }
382 case sizeof(UInt32): { callback(UInt32()); break; }
383 case sizeof(UInt64): { callback(UInt64()); break; }
384 default: {
385 throw Exception("Unexpected size of index type for ColumnLowCardinality: " + toString(size_of_type),
386 ErrorCodes::LOGICAL_ERROR);
387 }
388 }
389}
390
391size_t ColumnLowCardinality::Index::getSizeOfIndexType(const IColumn & column, size_t hint)
392{
393 auto checkFor = [&](auto type) { return typeid_cast<const ColumnVector<decltype(type)> *>(&column) != nullptr; };
394 auto tryGetSizeFor = [&](auto type) -> size_t { return checkFor(type) ? sizeof(decltype(type)) : 0; };
395
396 if (hint)
397 {
398 size_t size = 0;
399 callForType([&](auto type) { size = tryGetSizeFor(type); }, hint);
400
401 if (size)
402 return size;
403 }
404
405 if (auto size = tryGetSizeFor(UInt8()))
406 return size;
407 if (auto size = tryGetSizeFor(UInt16()))
408 return size;
409 if (auto size = tryGetSizeFor(UInt32()))
410 return size;
411 if (auto size = tryGetSizeFor(UInt64()))
412 return size;
413
414 throw Exception("Unexpected indexes type for ColumnLowCardinality. Expected UInt, got " + column.getName(),
415 ErrorCodes::ILLEGAL_COLUMN);
416}
417
418void ColumnLowCardinality::Index::attachPositions(ColumnPtr positions_)
419{
420 positions = std::move(positions_);
421 updateSizeOfType();
422}
423
424template <typename IndexType>
425typename ColumnVector<IndexType>::Container & ColumnLowCardinality::Index::getPositionsData()
426{
427 auto * positions_ptr = typeid_cast<ColumnVector<IndexType> *>(positions->assumeMutable().get());
428 if (!positions_ptr)
429 throw Exception("Invalid indexes type for ColumnLowCardinality."
430 " Expected UInt" + toString(8 * sizeof(IndexType)) + ", got " + positions->getName(),
431 ErrorCodes::LOGICAL_ERROR);
432
433 return positions_ptr->getData();
434}
435
436template <typename IndexType>
437const typename ColumnVector<IndexType>::Container & ColumnLowCardinality::Index::getPositionsData() const
438{
439 const auto * positions_ptr = typeid_cast<const ColumnVector<IndexType> *>(positions.get());
440 if (!positions_ptr)
441 throw Exception("Invalid indexes type for ColumnLowCardinality."
442 " Expected UInt" + toString(8 * sizeof(IndexType)) + ", got " + positions->getName(),
443 ErrorCodes::LOGICAL_ERROR);
444
445 return positions_ptr->getData();
446}
447
448template <typename IndexType>
449void ColumnLowCardinality::Index::convertPositions()
450{
451 auto convert = [&](auto x)
452 {
453 using CurIndexType = decltype(x);
454 auto & data = getPositionsData<CurIndexType>();
455
456 if (sizeof(CurIndexType) > sizeof(IndexType))
457 throw Exception("Converting indexes to smaller type: from " + toString(sizeof(CurIndexType)) +
458 " to " + toString(sizeof(IndexType)), ErrorCodes::LOGICAL_ERROR);
459
460 if (sizeof(CurIndexType) != sizeof(IndexType))
461 {
462 size_t size = data.size();
463 auto new_positions = ColumnVector<IndexType>::create(size);
464 auto & new_data = new_positions->getData();
465
466 /// TODO: Optimize with SSE?
467 for (size_t i = 0; i < size; ++i)
468 new_data[i] = data[i];
469
470 positions = std::move(new_positions);
471 size_of_type = sizeof(IndexType);
472 }
473 };
474
475 callForType(std::move(convert), size_of_type);
476
477 checkSizeOfType();
478}
479
480void ColumnLowCardinality::Index::expandType()
481{
482 auto expand = [&](auto type)
483 {
484 using CurIndexType = decltype(type);
485 constexpr auto next_size = NumberTraits::nextSize(sizeof(CurIndexType));
486 if (next_size == sizeof(CurIndexType))
487 throw Exception("Can't expand indexes type for ColumnLowCardinality from type: "
488 + demangle(typeid(CurIndexType).name()), ErrorCodes::LOGICAL_ERROR);
489
490 using NewIndexType = typename NumberTraits::Construct<false, false, next_size>::Type;
491 convertPositions<NewIndexType>();
492 };
493
494 callForType(std::move(expand), size_of_type);
495}
496
497UInt64 ColumnLowCardinality::Index::getMaxPositionForCurrentType() const
498{
499 UInt64 value = 0;
500 callForType([&](auto type) { value = std::numeric_limits<decltype(type)>::max(); }, size_of_type);
501 return value;
502}
503
504size_t ColumnLowCardinality::Index::getPositionAt(size_t row) const
505{
506 size_t pos;
507 auto getPosition = [&](auto type)
508 {
509 using CurIndexType = decltype(type);
510 pos = getPositionsData<CurIndexType>()[row];
511 };
512
513 callForType(std::move(getPosition), size_of_type);
514 return pos;
515}
516
517void ColumnLowCardinality::Index::insertPosition(UInt64 position)
518{
519 while (position > getMaxPositionForCurrentType())
520 expandType();
521
522 positions->insert(position);
523 checkSizeOfType();
524}
525
526void ColumnLowCardinality::Index::insertPositionsRange(const IColumn & column, UInt64 offset, UInt64 limit)
527{
528 auto insertForType = [&](auto type)
529 {
530 using ColumnType = decltype(type);
531 const auto * column_ptr = typeid_cast<const ColumnVector<ColumnType> *>(&column);
532
533 if (!column_ptr)
534 return false;
535
536 if (size_of_type < sizeof(ColumnType))
537 convertPositions<ColumnType>();
538
539 if (size_of_type == sizeof(ColumnType))
540 positions->insertRangeFrom(column, offset, limit);
541 else
542 {
543 auto copy = [&](auto cur_type)
544 {
545 using CurIndexType = decltype(cur_type);
546 auto & positions_data = getPositionsData<CurIndexType>();
547 const auto & column_data = column_ptr->getData();
548
549 UInt64 size = positions_data.size();
550 positions_data.resize(size + limit);
551
552 for (UInt64 i = 0; i < limit; ++i)
553 positions_data[size + i] = column_data[offset + i];
554 };
555
556 callForType(std::move(copy), size_of_type);
557 }
558
559 return true;
560 };
561
562 if (!insertForType(UInt8()) &&
563 !insertForType(UInt16()) &&
564 !insertForType(UInt32()) &&
565 !insertForType(UInt64()))
566 throw Exception("Invalid column for ColumnLowCardinality index. Expected UInt, got " + column.getName(),
567 ErrorCodes::ILLEGAL_COLUMN);
568
569 checkSizeOfType();
570}
571
572void ColumnLowCardinality::Index::check(size_t /*max_dictionary_size*/)
573{
574 /// TODO: remove
575 /*
576 auto check = [&](auto cur_type)
577 {
578 using CurIndexType = decltype(cur_type);
579 auto & positions_data = getPositionsData<CurIndexType>();
580
581 for (size_t i = 0; i < positions_data.size(); ++i)
582 {
583 if (positions_data[i] >= max_dictionary_size)
584 {
585 throw Exception("Found index " + toString(positions_data[i]) + " at position " + toString(i)
586 + " which is grated or equal than dictionary size " + toString(max_dictionary_size),
587 ErrorCodes::LOGICAL_ERROR);
588 }
589 }
590 };
591
592 callForType(std::move(check), size_of_type);
593 */
594}
595
596void ColumnLowCardinality::Index::checkSizeOfType()
597{
598 if (size_of_type != getSizeOfIndexType(*positions, size_of_type))
599 throw Exception("Invalid size of type. Expected " + toString(8 * size_of_type) +
600 ", but positions are " + positions->getName(), ErrorCodes::LOGICAL_ERROR);
601}
602
603void ColumnLowCardinality::Index::countKeys(ColumnUInt64::Container & counts) const
604{
605 auto counter = [&](auto x)
606 {
607 using CurIndexType = decltype(x);
608 auto & data = getPositionsData<CurIndexType>();
609 for (auto pos : data)
610 ++counts[pos];
611 };
612 callForType(std::move(counter), size_of_type);
613}
614
615bool ColumnLowCardinality::Index::containsDefault() const
616{
617 bool contains = false;
618
619 auto check_contains_default = [&](auto x)
620 {
621 using CurIndexType = decltype(x);
622 auto & data = getPositionsData<CurIndexType>();
623 for (auto pos : data)
624 {
625 if (pos == 0)
626 {
627 contains = true;
628 break;
629 }
630 }
631 };
632
633 callForType(std::move(check_contains_default), size_of_type);
634 return contains;
635}
636
637
638ColumnLowCardinality::Dictionary::Dictionary(MutableColumnPtr && column_unique_, bool is_shared)
639 : column_unique(std::move(column_unique_)), shared(is_shared)
640{
641 checkColumn(*column_unique);
642}
643ColumnLowCardinality::Dictionary::Dictionary(ColumnPtr column_unique_, bool is_shared)
644 : column_unique(std::move(column_unique_)), shared(is_shared)
645{
646 checkColumn(*column_unique);
647}
648
649void ColumnLowCardinality::Dictionary::checkColumn(const IColumn & column)
650{
651
652 if (!dynamic_cast<const IColumnUnique *>(&column))
653 throw Exception("ColumnUnique expected as an argument of ColumnLowCardinality.", ErrorCodes::ILLEGAL_COLUMN);
654}
655
656void ColumnLowCardinality::Dictionary::setShared(const ColumnPtr & column_unique_)
657{
658 checkColumn(*column_unique_);
659
660 column_unique = column_unique_;
661 shared = true;
662}
663
664void ColumnLowCardinality::Dictionary::compact(ColumnPtr & positions)
665{
666 auto new_column_unique = column_unique->cloneEmpty();
667
668 auto & unique = getColumnUnique();
669 auto & new_unique = static_cast<IColumnUnique &>(*new_column_unique);
670
671 auto indexes = mapUniqueIndex(positions->assumeMutableRef());
672 auto sub_keys = unique.getNestedColumn()->index(*indexes, 0);
673 auto new_indexes = new_unique.uniqueInsertRangeFrom(*sub_keys, 0, sub_keys->size());
674
675 positions = (*new_indexes->index(*positions, 0)).mutate();
676 column_unique = std::move(new_column_unique);
677
678 shared = false;
679}
680
681}
682