1#include <Columns/ColumnUnique.h>
2#include <Columns/ColumnString.h>
3#include <Columns/ColumnsNumber.h>
4#include <Columns/ColumnNullable.h>
5
6#include <DataTypes/DataTypeString.h>
7#include <DataTypes/DataTypesNumber.h>
8#include <DataTypes/DataTypeNullable.h>
9
10#include <gtest/gtest.h>
11
12#include <unordered_map>
13#include <vector>
14using namespace DB;
15
16TEST(column_unique, column_unique_unique_insert_range_Test)
17{
18 std::unordered_map<String, size_t> ref_map;
19 auto data_type = std::make_shared<DataTypeString>();
20 auto column_unique = ColumnUnique<ColumnString>::create(*data_type);
21 auto column_string = ColumnString::create();
22
23 size_t num_values = 1000000;
24 size_t mod_to = 1000;
25
26 std::vector<size_t> indexes(num_values);
27 for (size_t i = 0; i < num_values; ++i)
28 {
29 String str = toString(i % mod_to);
30 column_string->insertData(str.data(), str.size());
31
32 if (ref_map.count(str) == 0)
33 ref_map[str] = ref_map.size();
34
35 indexes[i]= ref_map[str];
36 }
37
38 auto idx = column_unique->uniqueInsertRangeFrom(*column_string, 0, num_values);
39 ASSERT_EQ(idx->size(), num_values);
40
41 for (size_t i = 0; i < num_values; ++i)
42 {
43 ASSERT_EQ(indexes[i] + 1, idx->getUInt(i)) << "Different indexes at position " << i;
44 }
45
46 auto & nested = column_unique->getNestedColumn();
47 ASSERT_EQ(nested->size(), mod_to + 1);
48
49 for (size_t i = 0; i < mod_to; ++i)
50 {
51 ASSERT_EQ(std::to_string(i), nested->getDataAt(i + 1).toString());
52 }
53}
54
55TEST(column_unique, column_unique_unique_insert_range_with_overflow_Test)
56{
57 std::unordered_map<String, size_t> ref_map;
58 auto data_type = std::make_shared<DataTypeString>();
59 auto column_unique = ColumnUnique<ColumnString>::create(*data_type);
60 auto column_string = ColumnString::create();
61
62 size_t num_values = 1000000;
63 size_t mod_to = 1000;
64
65 std::vector<size_t> indexes(num_values);
66 for (size_t i = 0; i < num_values; ++i)
67 {
68 String str = toString(i % mod_to);
69 column_string->insertData(str.data(), str.size());
70
71 if (ref_map.count(str) == 0)
72 ref_map[str] = ref_map.size();
73
74 indexes[i]= ref_map[str];
75 }
76
77 size_t max_val = mod_to / 2;
78 size_t max_dict_size = max_val + 1;
79 auto idx_with_overflow = column_unique->uniqueInsertRangeWithOverflow(*column_string, 0, num_values, max_dict_size);
80 auto & idx = idx_with_overflow.indexes;
81 auto & add_keys = idx_with_overflow.overflowed_keys;
82
83 ASSERT_EQ(idx->size(), num_values);
84
85 for (size_t i = 0; i < num_values; ++i)
86 {
87 ASSERT_EQ(indexes[i] + 1, idx->getUInt(i)) << "Different indexes at position " << i;
88 }
89
90 auto & nested = column_unique->getNestedColumn();
91 ASSERT_EQ(nested->size(), max_dict_size);
92 ASSERT_EQ(add_keys->size(), mod_to - max_val);
93
94 for (size_t i = 0; i < max_val; ++i)
95 {
96 ASSERT_EQ(std::to_string(i), nested->getDataAt(i + 1).toString());
97 }
98
99 for (size_t i = 0; i < mod_to - max_val; ++i)
100 {
101 ASSERT_EQ(std::to_string(max_val + i), add_keys->getDataAt(i).toString());
102 }
103}
104
105template <typename ColumnType>
106void column_unique_unique_deserialize_from_arena_impl(ColumnType & column, const IDataType & data_type)
107{
108 size_t num_values = column.size();
109
110 {
111 /// Check serialization is reversible.
112 Arena arena;
113 auto column_unique_pattern = ColumnUnique<ColumnString>::create(data_type);
114 auto column_unique = ColumnUnique<ColumnString>::create(data_type);
115 auto idx = column_unique_pattern->uniqueInsertRangeFrom(column, 0, num_values);
116
117 const char * pos = nullptr;
118 for (size_t i = 0; i < num_values; ++i)
119 {
120 auto ref = column_unique_pattern->serializeValueIntoArena(idx->getUInt(i), arena, pos);
121 const char * new_pos;
122 column_unique->uniqueDeserializeAndInsertFromArena(ref.data, new_pos);
123 ASSERT_EQ(new_pos - ref.data, ref.size) << "Deserialized data has different sizes at position " << i;
124
125 ASSERT_EQ(column_unique_pattern->getNestedNotNullableColumn()->getDataAt(idx->getUInt(i)),
126 column_unique->getNestedNotNullableColumn()->getDataAt(idx->getUInt(i)))
127 << "Deserialized data is different from pattern at position " << i;
128
129 }
130 }
131
132 {
133 /// Check serialization the same with ordinary column.
134 Arena arena_string;
135 Arena arena_lc;
136 auto column_unique = ColumnUnique<ColumnString>::create(data_type);
137 auto idx = column_unique->uniqueInsertRangeFrom(column, 0, num_values);
138
139 const char * pos_string = nullptr;
140 const char * pos_lc = nullptr;
141 for (size_t i = 0; i < num_values; ++i)
142 {
143 auto ref_string = column.serializeValueIntoArena(i, arena_string, pos_string);
144 auto ref_lc = column_unique->serializeValueIntoArena(idx->getUInt(i), arena_lc, pos_lc);
145 ASSERT_EQ(ref_string, ref_lc) << "Serialized data is different from pattern at position " << i;
146 }
147 }
148}
149
150TEST(column_unique, column_unique_unique_deserialize_from_arena_String_Test)
151{
152 auto data_type = std::make_shared<DataTypeString>();
153 auto column_string = ColumnString::create();
154
155 size_t num_values = 1000000;
156 size_t mod_to = 1000;
157
158 std::vector<size_t> indexes(num_values);
159 for (size_t i = 0; i < num_values; ++i)
160 {
161 String str = toString(i % mod_to);
162 column_string->insertData(str.data(), str.size());
163 }
164
165 column_unique_unique_deserialize_from_arena_impl(*column_string, *data_type);
166}
167
168TEST(column_unique, column_unique_unique_deserialize_from_arena_Nullable_String_Test)
169{
170 auto data_type = std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>());
171 auto column_string = ColumnString::create();
172 auto null_mask = ColumnUInt8::create();
173
174 size_t num_values = 1000000;
175 size_t mod_to = 1000;
176
177 std::vector<size_t> indexes(num_values);
178 for (size_t i = 0; i < num_values; ++i)
179 {
180 String str = toString(i % mod_to);
181 column_string->insertData(str.data(), str.size());
182
183 null_mask->insertValue(i % 3 ? 1 : 0);
184 }
185
186 auto column = ColumnNullable::create(std::move(column_string), std::move(null_mask));
187 column_unique_unique_deserialize_from_arena_impl(*column, *data_type);
188}
189