1 | #include <Columns/ColumnUnique.h> |
2 | #include <Columns/ColumnString.h> |
3 | #include <Columns/ColumnsNumber.h> |
4 | #include <Columns/ColumnNullable.h> |
5 | |
6 | #include <DataTypes/DataTypeString.h> |
7 | #include <DataTypes/DataTypesNumber.h> |
8 | #include <DataTypes/DataTypeNullable.h> |
9 | |
10 | #include <gtest/gtest.h> |
11 | |
12 | #include <unordered_map> |
13 | #include <vector> |
14 | using namespace DB; |
15 | |
16 | TEST(column_unique, column_unique_unique_insert_range_Test) |
17 | { |
18 | std::unordered_map<String, size_t> ref_map; |
19 | auto data_type = std::make_shared<DataTypeString>(); |
20 | auto column_unique = ColumnUnique<ColumnString>::create(*data_type); |
21 | auto column_string = ColumnString::create(); |
22 | |
23 | size_t num_values = 1000000; |
24 | size_t mod_to = 1000; |
25 | |
26 | std::vector<size_t> indexes(num_values); |
27 | for (size_t i = 0; i < num_values; ++i) |
28 | { |
29 | String str = toString(i % mod_to); |
30 | column_string->insertData(str.data(), str.size()); |
31 | |
32 | if (ref_map.count(str) == 0) |
33 | ref_map[str] = ref_map.size(); |
34 | |
35 | indexes[i]= ref_map[str]; |
36 | } |
37 | |
38 | auto idx = column_unique->uniqueInsertRangeFrom(*column_string, 0, num_values); |
39 | ASSERT_EQ(idx->size(), num_values); |
40 | |
41 | for (size_t i = 0; i < num_values; ++i) |
42 | { |
43 | ASSERT_EQ(indexes[i] + 1, idx->getUInt(i)) << "Different indexes at position " << i; |
44 | } |
45 | |
46 | auto & nested = column_unique->getNestedColumn(); |
47 | ASSERT_EQ(nested->size(), mod_to + 1); |
48 | |
49 | for (size_t i = 0; i < mod_to; ++i) |
50 | { |
51 | ASSERT_EQ(std::to_string(i), nested->getDataAt(i + 1).toString()); |
52 | } |
53 | } |
54 | |
55 | TEST(column_unique, column_unique_unique_insert_range_with_overflow_Test) |
56 | { |
57 | std::unordered_map<String, size_t> ref_map; |
58 | auto data_type = std::make_shared<DataTypeString>(); |
59 | auto column_unique = ColumnUnique<ColumnString>::create(*data_type); |
60 | auto column_string = ColumnString::create(); |
61 | |
62 | size_t num_values = 1000000; |
63 | size_t mod_to = 1000; |
64 | |
65 | std::vector<size_t> indexes(num_values); |
66 | for (size_t i = 0; i < num_values; ++i) |
67 | { |
68 | String str = toString(i % mod_to); |
69 | column_string->insertData(str.data(), str.size()); |
70 | |
71 | if (ref_map.count(str) == 0) |
72 | ref_map[str] = ref_map.size(); |
73 | |
74 | indexes[i]= ref_map[str]; |
75 | } |
76 | |
77 | size_t max_val = mod_to / 2; |
78 | size_t max_dict_size = max_val + 1; |
79 | auto idx_with_overflow = column_unique->uniqueInsertRangeWithOverflow(*column_string, 0, num_values, max_dict_size); |
80 | auto & idx = idx_with_overflow.indexes; |
81 | auto & add_keys = idx_with_overflow.overflowed_keys; |
82 | |
83 | ASSERT_EQ(idx->size(), num_values); |
84 | |
85 | for (size_t i = 0; i < num_values; ++i) |
86 | { |
87 | ASSERT_EQ(indexes[i] + 1, idx->getUInt(i)) << "Different indexes at position " << i; |
88 | } |
89 | |
90 | auto & nested = column_unique->getNestedColumn(); |
91 | ASSERT_EQ(nested->size(), max_dict_size); |
92 | ASSERT_EQ(add_keys->size(), mod_to - max_val); |
93 | |
94 | for (size_t i = 0; i < max_val; ++i) |
95 | { |
96 | ASSERT_EQ(std::to_string(i), nested->getDataAt(i + 1).toString()); |
97 | } |
98 | |
99 | for (size_t i = 0; i < mod_to - max_val; ++i) |
100 | { |
101 | ASSERT_EQ(std::to_string(max_val + i), add_keys->getDataAt(i).toString()); |
102 | } |
103 | } |
104 | |
105 | template <typename ColumnType> |
106 | void column_unique_unique_deserialize_from_arena_impl(ColumnType & column, const IDataType & data_type) |
107 | { |
108 | size_t num_values = column.size(); |
109 | |
110 | { |
111 | /// Check serialization is reversible. |
112 | Arena arena; |
113 | auto column_unique_pattern = ColumnUnique<ColumnString>::create(data_type); |
114 | auto column_unique = ColumnUnique<ColumnString>::create(data_type); |
115 | auto idx = column_unique_pattern->uniqueInsertRangeFrom(column, 0, num_values); |
116 | |
117 | const char * pos = nullptr; |
118 | for (size_t i = 0; i < num_values; ++i) |
119 | { |
120 | auto ref = column_unique_pattern->serializeValueIntoArena(idx->getUInt(i), arena, pos); |
121 | const char * new_pos; |
122 | column_unique->uniqueDeserializeAndInsertFromArena(ref.data, new_pos); |
123 | ASSERT_EQ(new_pos - ref.data, ref.size) << "Deserialized data has different sizes at position " << i; |
124 | |
125 | ASSERT_EQ(column_unique_pattern->getNestedNotNullableColumn()->getDataAt(idx->getUInt(i)), |
126 | column_unique->getNestedNotNullableColumn()->getDataAt(idx->getUInt(i))) |
127 | << "Deserialized data is different from pattern at position " << i; |
128 | |
129 | } |
130 | } |
131 | |
132 | { |
133 | /// Check serialization the same with ordinary column. |
134 | Arena arena_string; |
135 | Arena arena_lc; |
136 | auto column_unique = ColumnUnique<ColumnString>::create(data_type); |
137 | auto idx = column_unique->uniqueInsertRangeFrom(column, 0, num_values); |
138 | |
139 | const char * pos_string = nullptr; |
140 | const char * pos_lc = nullptr; |
141 | for (size_t i = 0; i < num_values; ++i) |
142 | { |
143 | auto ref_string = column.serializeValueIntoArena(i, arena_string, pos_string); |
144 | auto ref_lc = column_unique->serializeValueIntoArena(idx->getUInt(i), arena_lc, pos_lc); |
145 | ASSERT_EQ(ref_string, ref_lc) << "Serialized data is different from pattern at position " << i; |
146 | } |
147 | } |
148 | } |
149 | |
150 | TEST(column_unique, column_unique_unique_deserialize_from_arena_String_Test) |
151 | { |
152 | auto data_type = std::make_shared<DataTypeString>(); |
153 | auto column_string = ColumnString::create(); |
154 | |
155 | size_t num_values = 1000000; |
156 | size_t mod_to = 1000; |
157 | |
158 | std::vector<size_t> indexes(num_values); |
159 | for (size_t i = 0; i < num_values; ++i) |
160 | { |
161 | String str = toString(i % mod_to); |
162 | column_string->insertData(str.data(), str.size()); |
163 | } |
164 | |
165 | column_unique_unique_deserialize_from_arena_impl(*column_string, *data_type); |
166 | } |
167 | |
168 | TEST(column_unique, column_unique_unique_deserialize_from_arena_Nullable_String_Test) |
169 | { |
170 | auto data_type = std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>()); |
171 | auto column_string = ColumnString::create(); |
172 | auto null_mask = ColumnUInt8::create(); |
173 | |
174 | size_t num_values = 1000000; |
175 | size_t mod_to = 1000; |
176 | |
177 | std::vector<size_t> indexes(num_values); |
178 | for (size_t i = 0; i < num_values; ++i) |
179 | { |
180 | String str = toString(i % mod_to); |
181 | column_string->insertData(str.data(), str.size()); |
182 | |
183 | null_mask->insertValue(i % 3 ? 1 : 0); |
184 | } |
185 | |
186 | auto column = ColumnNullable::create(std::move(column_string), std::move(null_mask)); |
187 | column_unique_unique_deserialize_from_arena_impl(*column, *data_type); |
188 | } |
189 | |