| 1 | #include <Columns/ColumnUnique.h> |
| 2 | #include <Columns/ColumnString.h> |
| 3 | #include <Columns/ColumnsNumber.h> |
| 4 | #include <Columns/ColumnNullable.h> |
| 5 | |
| 6 | #include <DataTypes/DataTypeString.h> |
| 7 | #include <DataTypes/DataTypesNumber.h> |
| 8 | #include <DataTypes/DataTypeNullable.h> |
| 9 | |
| 10 | #include <gtest/gtest.h> |
| 11 | |
| 12 | #include <unordered_map> |
| 13 | #include <vector> |
| 14 | using namespace DB; |
| 15 | |
| 16 | TEST(column_unique, column_unique_unique_insert_range_Test) |
| 17 | { |
| 18 | std::unordered_map<String, size_t> ref_map; |
| 19 | auto data_type = std::make_shared<DataTypeString>(); |
| 20 | auto column_unique = ColumnUnique<ColumnString>::create(*data_type); |
| 21 | auto column_string = ColumnString::create(); |
| 22 | |
| 23 | size_t num_values = 1000000; |
| 24 | size_t mod_to = 1000; |
| 25 | |
| 26 | std::vector<size_t> indexes(num_values); |
| 27 | for (size_t i = 0; i < num_values; ++i) |
| 28 | { |
| 29 | String str = toString(i % mod_to); |
| 30 | column_string->insertData(str.data(), str.size()); |
| 31 | |
| 32 | if (ref_map.count(str) == 0) |
| 33 | ref_map[str] = ref_map.size(); |
| 34 | |
| 35 | indexes[i]= ref_map[str]; |
| 36 | } |
| 37 | |
| 38 | auto idx = column_unique->uniqueInsertRangeFrom(*column_string, 0, num_values); |
| 39 | ASSERT_EQ(idx->size(), num_values); |
| 40 | |
| 41 | for (size_t i = 0; i < num_values; ++i) |
| 42 | { |
| 43 | ASSERT_EQ(indexes[i] + 1, idx->getUInt(i)) << "Different indexes at position " << i; |
| 44 | } |
| 45 | |
| 46 | auto & nested = column_unique->getNestedColumn(); |
| 47 | ASSERT_EQ(nested->size(), mod_to + 1); |
| 48 | |
| 49 | for (size_t i = 0; i < mod_to; ++i) |
| 50 | { |
| 51 | ASSERT_EQ(std::to_string(i), nested->getDataAt(i + 1).toString()); |
| 52 | } |
| 53 | } |
| 54 | |
| 55 | TEST(column_unique, column_unique_unique_insert_range_with_overflow_Test) |
| 56 | { |
| 57 | std::unordered_map<String, size_t> ref_map; |
| 58 | auto data_type = std::make_shared<DataTypeString>(); |
| 59 | auto column_unique = ColumnUnique<ColumnString>::create(*data_type); |
| 60 | auto column_string = ColumnString::create(); |
| 61 | |
| 62 | size_t num_values = 1000000; |
| 63 | size_t mod_to = 1000; |
| 64 | |
| 65 | std::vector<size_t> indexes(num_values); |
| 66 | for (size_t i = 0; i < num_values; ++i) |
| 67 | { |
| 68 | String str = toString(i % mod_to); |
| 69 | column_string->insertData(str.data(), str.size()); |
| 70 | |
| 71 | if (ref_map.count(str) == 0) |
| 72 | ref_map[str] = ref_map.size(); |
| 73 | |
| 74 | indexes[i]= ref_map[str]; |
| 75 | } |
| 76 | |
| 77 | size_t max_val = mod_to / 2; |
| 78 | size_t max_dict_size = max_val + 1; |
| 79 | auto idx_with_overflow = column_unique->uniqueInsertRangeWithOverflow(*column_string, 0, num_values, max_dict_size); |
| 80 | auto & idx = idx_with_overflow.indexes; |
| 81 | auto & add_keys = idx_with_overflow.overflowed_keys; |
| 82 | |
| 83 | ASSERT_EQ(idx->size(), num_values); |
| 84 | |
| 85 | for (size_t i = 0; i < num_values; ++i) |
| 86 | { |
| 87 | ASSERT_EQ(indexes[i] + 1, idx->getUInt(i)) << "Different indexes at position " << i; |
| 88 | } |
| 89 | |
| 90 | auto & nested = column_unique->getNestedColumn(); |
| 91 | ASSERT_EQ(nested->size(), max_dict_size); |
| 92 | ASSERT_EQ(add_keys->size(), mod_to - max_val); |
| 93 | |
| 94 | for (size_t i = 0; i < max_val; ++i) |
| 95 | { |
| 96 | ASSERT_EQ(std::to_string(i), nested->getDataAt(i + 1).toString()); |
| 97 | } |
| 98 | |
| 99 | for (size_t i = 0; i < mod_to - max_val; ++i) |
| 100 | { |
| 101 | ASSERT_EQ(std::to_string(max_val + i), add_keys->getDataAt(i).toString()); |
| 102 | } |
| 103 | } |
| 104 | |
| 105 | template <typename ColumnType> |
| 106 | void column_unique_unique_deserialize_from_arena_impl(ColumnType & column, const IDataType & data_type) |
| 107 | { |
| 108 | size_t num_values = column.size(); |
| 109 | |
| 110 | { |
| 111 | /// Check serialization is reversible. |
| 112 | Arena arena; |
| 113 | auto column_unique_pattern = ColumnUnique<ColumnString>::create(data_type); |
| 114 | auto column_unique = ColumnUnique<ColumnString>::create(data_type); |
| 115 | auto idx = column_unique_pattern->uniqueInsertRangeFrom(column, 0, num_values); |
| 116 | |
| 117 | const char * pos = nullptr; |
| 118 | for (size_t i = 0; i < num_values; ++i) |
| 119 | { |
| 120 | auto ref = column_unique_pattern->serializeValueIntoArena(idx->getUInt(i), arena, pos); |
| 121 | const char * new_pos; |
| 122 | column_unique->uniqueDeserializeAndInsertFromArena(ref.data, new_pos); |
| 123 | ASSERT_EQ(new_pos - ref.data, ref.size) << "Deserialized data has different sizes at position " << i; |
| 124 | |
| 125 | ASSERT_EQ(column_unique_pattern->getNestedNotNullableColumn()->getDataAt(idx->getUInt(i)), |
| 126 | column_unique->getNestedNotNullableColumn()->getDataAt(idx->getUInt(i))) |
| 127 | << "Deserialized data is different from pattern at position " << i; |
| 128 | |
| 129 | } |
| 130 | } |
| 131 | |
| 132 | { |
| 133 | /// Check serialization the same with ordinary column. |
| 134 | Arena arena_string; |
| 135 | Arena arena_lc; |
| 136 | auto column_unique = ColumnUnique<ColumnString>::create(data_type); |
| 137 | auto idx = column_unique->uniqueInsertRangeFrom(column, 0, num_values); |
| 138 | |
| 139 | const char * pos_string = nullptr; |
| 140 | const char * pos_lc = nullptr; |
| 141 | for (size_t i = 0; i < num_values; ++i) |
| 142 | { |
| 143 | auto ref_string = column.serializeValueIntoArena(i, arena_string, pos_string); |
| 144 | auto ref_lc = column_unique->serializeValueIntoArena(idx->getUInt(i), arena_lc, pos_lc); |
| 145 | ASSERT_EQ(ref_string, ref_lc) << "Serialized data is different from pattern at position " << i; |
| 146 | } |
| 147 | } |
| 148 | } |
| 149 | |
| 150 | TEST(column_unique, column_unique_unique_deserialize_from_arena_String_Test) |
| 151 | { |
| 152 | auto data_type = std::make_shared<DataTypeString>(); |
| 153 | auto column_string = ColumnString::create(); |
| 154 | |
| 155 | size_t num_values = 1000000; |
| 156 | size_t mod_to = 1000; |
| 157 | |
| 158 | std::vector<size_t> indexes(num_values); |
| 159 | for (size_t i = 0; i < num_values; ++i) |
| 160 | { |
| 161 | String str = toString(i % mod_to); |
| 162 | column_string->insertData(str.data(), str.size()); |
| 163 | } |
| 164 | |
| 165 | column_unique_unique_deserialize_from_arena_impl(*column_string, *data_type); |
| 166 | } |
| 167 | |
| 168 | TEST(column_unique, column_unique_unique_deserialize_from_arena_Nullable_String_Test) |
| 169 | { |
| 170 | auto data_type = std::make_shared<DataTypeNullable>(std::make_shared<DataTypeString>()); |
| 171 | auto column_string = ColumnString::create(); |
| 172 | auto null_mask = ColumnUInt8::create(); |
| 173 | |
| 174 | size_t num_values = 1000000; |
| 175 | size_t mod_to = 1000; |
| 176 | |
| 177 | std::vector<size_t> indexes(num_values); |
| 178 | for (size_t i = 0; i < num_values; ++i) |
| 179 | { |
| 180 | String str = toString(i % mod_to); |
| 181 | column_string->insertData(str.data(), str.size()); |
| 182 | |
| 183 | null_mask->insertValue(i % 3 ? 1 : 0); |
| 184 | } |
| 185 | |
| 186 | auto column = ColumnNullable::create(std::move(column_string), std::move(null_mask)); |
| 187 | column_unique_unique_deserialize_from_arena_impl(*column, *data_type); |
| 188 | } |
| 189 | |