1 | #pragma once |
2 | |
3 | #include <Columns/IColumn.h> |
4 | #include <Columns/ColumnArray.h> |
5 | #include <Columns/ColumnConst.h> |
6 | #include <Columns/ColumnNullable.h> |
7 | #include <Columns/ColumnsNumber.h> |
8 | #include <Columns/ColumnString.h> |
9 | #include <Columns/ColumnFixedString.h> |
10 | #include <DataTypes/IDataType.h> |
11 | #include <DataTypes/DataTypeArray.h> |
12 | #include <DataTypes/DataTypeNullable.h> |
13 | #include <DataTypes/DataTypeFixedString.h> |
14 | #include <DataTypes/DataTypeLowCardinality.h> |
15 | #include <DataTypes/DataTypesNumber.h> |
16 | #include <ext/bit_cast.h> |
17 | #include <Common/HashTable/Hash.h> |
18 | #include <Interpreters/BloomFilter.h> |
19 | |
20 | namespace DB |
21 | { |
22 | |
23 | namespace ErrorCodes |
24 | { |
25 | extern const int ILLEGAL_COLUMN; |
26 | } |
27 | |
28 | struct BloomFilterHash |
29 | { |
30 | static constexpr UInt64 bf_hash_seed[15] = { |
31 | 13635471485423070496ULL, 10336109063487487899ULL, 17779957404565211594ULL, 8988612159822229247ULL, 4954614162757618085ULL, |
32 | 12980113590177089081ULL, 9263883436177860930ULL, 3656772712723269762ULL, 10362091744962961274ULL, 7582936617938287249ULL, |
33 | 15033938188484401405ULL, 18286745649494826751ULL, 6852245486148412312ULL, 8886056245089344681ULL, 10151472371158292780ULL |
34 | }; |
35 | |
36 | static ColumnPtr hashWithField(const IDataType * data_type, const Field & field) |
37 | { |
38 | WhichDataType which(data_type); |
39 | UInt64 hash = 0; |
40 | bool unexpected_type = false; |
41 | |
42 | if (field.isNull()) |
43 | { |
44 | if (which.isInt() || which.isUInt() || which.isEnum() || which.isDateOrDateTime() || which.isFloat()) |
45 | hash = intHash64(0); |
46 | else if (which.isString()) |
47 | hash = CityHash_v1_0_2::CityHash64("" , 0); |
48 | else if (which.isFixedString()) |
49 | { |
50 | const auto * fixed_string_type = typeid_cast<const DataTypeFixedString *>(data_type); |
51 | const std::vector<char> value(fixed_string_type->getN(), 0); |
52 | hash = CityHash_v1_0_2::CityHash64(value.data(), value.size()); |
53 | } |
54 | else |
55 | unexpected_type = true; |
56 | } |
57 | else if (which.isUInt() || which.isDateOrDateTime()) |
58 | hash = intHash64(field.safeGet<UInt64>()); |
59 | else if (which.isInt() || which.isEnum()) |
60 | hash = intHash64(ext::bit_cast<UInt64>(field.safeGet<Int64>())); |
61 | else if (which.isFloat32() || which.isFloat64()) |
62 | hash = intHash64(ext::bit_cast<UInt64>(field.safeGet<Float64>())); |
63 | else if (which.isString() || which.isFixedString()) |
64 | { |
65 | const auto & value = field.safeGet<String>(); |
66 | hash = CityHash_v1_0_2::CityHash64(value.data(), value.size()); |
67 | } |
68 | else |
69 | unexpected_type = true; |
70 | |
71 | if (unexpected_type) |
72 | throw Exception("Unexpected type " + data_type->getName() + " of bloom filter index." , ErrorCodes::LOGICAL_ERROR); |
73 | |
74 | return ColumnConst::create(ColumnUInt64::create(1, hash), 1); |
75 | } |
76 | |
77 | static ColumnPtr hashWithColumn(const DataTypePtr & data_type, const ColumnPtr & column, size_t pos, size_t limit) |
78 | { |
79 | WhichDataType which(data_type); |
80 | if (which.isArray()) |
81 | { |
82 | const auto * array_col = typeid_cast<const ColumnArray *>(column.get()); |
83 | |
84 | if (checkAndGetColumn<ColumnNullable>(array_col->getData())) |
85 | throw Exception("Unexpected type " + data_type->getName() + " of bloom filter index." , ErrorCodes::LOGICAL_ERROR); |
86 | |
87 | const auto & offsets = array_col->getOffsets(); |
88 | limit = offsets[pos + limit - 1] - offsets[pos - 1]; /// PaddedPODArray allows access on index -1. |
89 | pos = offsets[pos - 1]; |
90 | |
91 | if (limit == 0) |
92 | { |
93 | auto index_column = ColumnUInt64::create(1); |
94 | ColumnUInt64::Container & index_column_vec = index_column->getData(); |
95 | index_column_vec[0] = 0; |
96 | return index_column; |
97 | } |
98 | } |
99 | |
100 | const ColumnPtr actual_col = BloomFilter::getPrimitiveColumn(column); |
101 | const DataTypePtr actual_type = BloomFilter::getPrimitiveType(data_type); |
102 | |
103 | auto index_column = ColumnUInt64::create(limit); |
104 | ColumnUInt64::Container & index_column_vec = index_column->getData(); |
105 | getAnyTypeHash<true>(actual_type.get(), actual_col.get(), index_column_vec, pos); |
106 | return index_column; |
107 | } |
108 | |
109 | template <bool is_first> |
110 | static void getAnyTypeHash(const IDataType * data_type, const IColumn * column, ColumnUInt64::Container & vec, size_t pos) |
111 | { |
112 | WhichDataType which(data_type); |
113 | |
114 | if (which.isUInt8()) getNumberTypeHash<UInt8, is_first>(column, vec, pos); |
115 | else if (which.isUInt16()) getNumberTypeHash<UInt16, is_first>(column, vec, pos); |
116 | else if (which.isUInt32()) getNumberTypeHash<UInt32, is_first>(column, vec, pos); |
117 | else if (which.isUInt64()) getNumberTypeHash<UInt64, is_first>(column, vec, pos); |
118 | else if (which.isInt8()) getNumberTypeHash<Int8, is_first>(column, vec, pos); |
119 | else if (which.isInt16()) getNumberTypeHash<Int16, is_first>(column, vec, pos); |
120 | else if (which.isInt32()) getNumberTypeHash<Int32, is_first>(column, vec, pos); |
121 | else if (which.isInt64()) getNumberTypeHash<Int64, is_first>(column, vec, pos); |
122 | else if (which.isEnum8()) getNumberTypeHash<Int8, is_first>(column, vec, pos); |
123 | else if (which.isEnum16()) getNumberTypeHash<Int16, is_first>(column, vec, pos); |
124 | else if (which.isDate()) getNumberTypeHash<UInt16, is_first>(column, vec, pos); |
125 | else if (which.isDateTime()) getNumberTypeHash<UInt32, is_first>(column, vec, pos); |
126 | else if (which.isFloat32()) getNumberTypeHash<Float32, is_first>(column, vec, pos); |
127 | else if (which.isFloat64()) getNumberTypeHash<Float64, is_first>(column, vec, pos); |
128 | else if (which.isString()) getStringTypeHash<is_first>(column, vec, pos); |
129 | else if (which.isFixedString()) getStringTypeHash<is_first>(column, vec, pos); |
130 | else throw Exception("Unexpected type " + data_type->getName() + " of bloom filter index." , ErrorCodes::LOGICAL_ERROR); |
131 | } |
132 | |
133 | template <typename Type, bool is_first> |
134 | static void getNumberTypeHash(const IColumn * column, ColumnUInt64::Container & vec, size_t pos) |
135 | { |
136 | const auto * index_column = typeid_cast<const ColumnVector<Type> *>(column); |
137 | |
138 | if (unlikely(!index_column)) |
139 | throw Exception("Illegal column type was passed to the bloom filter index." , ErrorCodes::ILLEGAL_COLUMN); |
140 | |
141 | const typename ColumnVector<Type>::Container & vec_from = index_column->getData(); |
142 | |
143 | /// Because we're missing the precision of float in the Field.h |
144 | /// to be consistent, we need to convert Float32 to Float64 processing, also see: BloomFilterHash::hashWithField |
145 | if constexpr (std::is_same_v<ColumnVector<Type>, ColumnFloat32>) |
146 | { |
147 | for (size_t index = 0, size = vec.size(); index < size; ++index) |
148 | { |
149 | UInt64 hash = intHash64(ext::bit_cast<UInt64>(Float64(vec_from[index + pos]))); |
150 | |
151 | if constexpr (is_first) |
152 | vec[index] = hash; |
153 | else |
154 | vec[index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(vec[index], hash)); |
155 | } |
156 | } |
157 | else |
158 | { |
159 | for (size_t index = 0, size = vec.size(); index < size; ++index) |
160 | { |
161 | UInt64 hash = intHash64(ext::bit_cast<UInt64>(vec_from[index + pos])); |
162 | |
163 | if constexpr (is_first) |
164 | vec[index] = hash; |
165 | else |
166 | vec[index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(vec[index], hash)); |
167 | } |
168 | } |
169 | } |
170 | |
171 | template <bool is_first> |
172 | static void getStringTypeHash(const IColumn * column, ColumnUInt64::Container & vec, size_t pos) |
173 | { |
174 | if (const auto * index_column = typeid_cast<const ColumnString *>(column)) |
175 | { |
176 | const ColumnString::Chars & data = index_column->getChars(); |
177 | const ColumnString::Offsets & offsets = index_column->getOffsets(); |
178 | |
179 | ColumnString::Offset current_offset = pos; |
180 | for (size_t index = 0, size = vec.size(); index < size; ++index) |
181 | { |
182 | UInt64 city_hash = CityHash_v1_0_2::CityHash64( |
183 | reinterpret_cast<const char *>(&data[current_offset]), offsets[index + pos] - current_offset - 1); |
184 | |
185 | if constexpr (is_first) |
186 | vec[index] = city_hash; |
187 | else |
188 | vec[index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(vec[index], city_hash)); |
189 | |
190 | current_offset = offsets[index + pos]; |
191 | } |
192 | } |
193 | else if (const auto * fixed_string_index_column = typeid_cast<const ColumnFixedString *>(column)) |
194 | { |
195 | size_t fixed_len = fixed_string_index_column->getN(); |
196 | const auto & data = fixed_string_index_column->getChars(); |
197 | |
198 | for (size_t index = 0, size = vec.size(); index < size; ++index) |
199 | { |
200 | UInt64 city_hash = CityHash_v1_0_2::CityHash64(reinterpret_cast<const char *>(&data[(index + pos) * fixed_len]), fixed_len); |
201 | |
202 | if constexpr (is_first) |
203 | vec[index] = city_hash; |
204 | else |
205 | vec[index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(vec[index], city_hash)); |
206 | } |
207 | } |
208 | else |
209 | throw Exception("Illegal column type was passed to the bloom filter index." , ErrorCodes::ILLEGAL_COLUMN); |
210 | } |
211 | |
212 | static std::pair<size_t, size_t> calculationBestPractices(double max_conflict_probability) |
213 | { |
214 | static const size_t MAX_BITS_PER_ROW = 20; |
215 | static const size_t MAX_HASH_FUNCTION_COUNT = 15; |
216 | |
217 | /// For the smallest index per level in probability_lookup_table |
218 | static const size_t min_probability_index_each_bits[] = {0, 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 12, 12, 13, 14}; |
219 | |
220 | static const long double probability_lookup_table[MAX_BITS_PER_ROW + 1][MAX_HASH_FUNCTION_COUNT] = |
221 | { |
222 | {1.0}, /// dummy, 0 bits per row |
223 | {1.0, 1.0}, |
224 | {1.0, 0.393, 0.400}, |
225 | {1.0, 0.283, 0.237, 0.253}, |
226 | {1.0, 0.221, 0.155, 0.147, 0.160}, |
227 | {1.0, 0.181, 0.109, 0.092, 0.092, 0.101}, // 5 |
228 | {1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638}, |
229 | {1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364}, |
230 | {1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229}, |
231 | {1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145}, |
232 | {1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846}, // 10 |
233 | {1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509}, |
234 | {1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314}, |
235 | {1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217, 0.00199, 0.00194}, |
236 | {1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146, 0.00129, 0.00121, 0.0012}, |
237 | {1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852, 0.000775, 0.000744}, // 15 |
238 | {1.0, 0.0606, 0.0138, 0.005, 0.00239, 0.00139, 0.000935, 0.000702, 0.000574, 0.000505, 0.00047, 0.000459}, |
239 | {1.0, 0.0571, 0.0123, 0.00423, 0.00193, 0.00107, 0.000692, 0.000499, 0.000394, 0.000335, 0.000302, 0.000287, 0.000284}, |
240 | {1.0, 0.054, 0.0111, 0.00362, 0.00158, 0.000839, 0.000519, 0.00036, 0.000275, 0.000226, 0.000198, 0.000183, 0.000176}, |
241 | {1.0, 0.0513, 0.00998, 0.00312, 0.0013, 0.000663, 0.000394, 0.000264, 0.000194, 0.000155, 0.000132, 0.000118, 0.000111, 0.000109}, |
242 | {1.0, 0.0488, 0.00906, 0.0027, 0.00108, 0.00053, 0.000303, 0.000196, 0.00014, 0.000108, 8.89e-05, 7.77e-05, 7.12e-05, 6.79e-05, 6.71e-05} // 20 |
243 | }; |
244 | |
245 | for (size_t bits_per_row = 1; bits_per_row < MAX_BITS_PER_ROW; ++bits_per_row) |
246 | { |
247 | if (probability_lookup_table[bits_per_row][min_probability_index_each_bits[bits_per_row]] <= max_conflict_probability) |
248 | { |
249 | size_t max_size_of_hash_functions = min_probability_index_each_bits[bits_per_row]; |
250 | for (size_t size_of_hash_functions = max_size_of_hash_functions; size_of_hash_functions > 0; --size_of_hash_functions) |
251 | if (probability_lookup_table[bits_per_row][size_of_hash_functions] > max_conflict_probability) |
252 | return std::pair<size_t, size_t>(bits_per_row, size_of_hash_functions + 1); |
253 | } |
254 | } |
255 | |
256 | return std::pair<size_t, size_t>(MAX_BITS_PER_ROW - 1, min_probability_index_each_bits[MAX_BITS_PER_ROW - 1]); |
257 | } |
258 | }; |
259 | |
260 | } |
261 | |