1#pragma once
2
3#include <Columns/IColumn.h>
4#include <Columns/ColumnArray.h>
5#include <Columns/ColumnConst.h>
6#include <Columns/ColumnNullable.h>
7#include <Columns/ColumnsNumber.h>
8#include <Columns/ColumnString.h>
9#include <Columns/ColumnFixedString.h>
10#include <DataTypes/IDataType.h>
11#include <DataTypes/DataTypeArray.h>
12#include <DataTypes/DataTypeNullable.h>
13#include <DataTypes/DataTypeFixedString.h>
14#include <DataTypes/DataTypeLowCardinality.h>
15#include <DataTypes/DataTypesNumber.h>
16#include <ext/bit_cast.h>
17#include <Common/HashTable/Hash.h>
18#include <Interpreters/BloomFilter.h>
19
20namespace DB
21{
22
23namespace ErrorCodes
24{
25 extern const int ILLEGAL_COLUMN;
26}
27
28struct BloomFilterHash
29{
30 static constexpr UInt64 bf_hash_seed[15] = {
31 13635471485423070496ULL, 10336109063487487899ULL, 17779957404565211594ULL, 8988612159822229247ULL, 4954614162757618085ULL,
32 12980113590177089081ULL, 9263883436177860930ULL, 3656772712723269762ULL, 10362091744962961274ULL, 7582936617938287249ULL,
33 15033938188484401405ULL, 18286745649494826751ULL, 6852245486148412312ULL, 8886056245089344681ULL, 10151472371158292780ULL
34 };
35
36 static ColumnPtr hashWithField(const IDataType * data_type, const Field & field)
37 {
38 WhichDataType which(data_type);
39 UInt64 hash = 0;
40 bool unexpected_type = false;
41
42 if (field.isNull())
43 {
44 if (which.isInt() || which.isUInt() || which.isEnum() || which.isDateOrDateTime() || which.isFloat())
45 hash = intHash64(0);
46 else if (which.isString())
47 hash = CityHash_v1_0_2::CityHash64("", 0);
48 else if (which.isFixedString())
49 {
50 const auto * fixed_string_type = typeid_cast<const DataTypeFixedString *>(data_type);
51 const std::vector<char> value(fixed_string_type->getN(), 0);
52 hash = CityHash_v1_0_2::CityHash64(value.data(), value.size());
53 }
54 else
55 unexpected_type = true;
56 }
57 else if (which.isUInt() || which.isDateOrDateTime())
58 hash = intHash64(field.safeGet<UInt64>());
59 else if (which.isInt() || which.isEnum())
60 hash = intHash64(ext::bit_cast<UInt64>(field.safeGet<Int64>()));
61 else if (which.isFloat32() || which.isFloat64())
62 hash = intHash64(ext::bit_cast<UInt64>(field.safeGet<Float64>()));
63 else if (which.isString() || which.isFixedString())
64 {
65 const auto & value = field.safeGet<String>();
66 hash = CityHash_v1_0_2::CityHash64(value.data(), value.size());
67 }
68 else
69 unexpected_type = true;
70
71 if (unexpected_type)
72 throw Exception("Unexpected type " + data_type->getName() + " of bloom filter index.", ErrorCodes::LOGICAL_ERROR);
73
74 return ColumnConst::create(ColumnUInt64::create(1, hash), 1);
75 }
76
77 static ColumnPtr hashWithColumn(const DataTypePtr & data_type, const ColumnPtr & column, size_t pos, size_t limit)
78 {
79 WhichDataType which(data_type);
80 if (which.isArray())
81 {
82 const auto * array_col = typeid_cast<const ColumnArray *>(column.get());
83
84 if (checkAndGetColumn<ColumnNullable>(array_col->getData()))
85 throw Exception("Unexpected type " + data_type->getName() + " of bloom filter index.", ErrorCodes::LOGICAL_ERROR);
86
87 const auto & offsets = array_col->getOffsets();
88 limit = offsets[pos + limit - 1] - offsets[pos - 1]; /// PaddedPODArray allows access on index -1.
89 pos = offsets[pos - 1];
90
91 if (limit == 0)
92 {
93 auto index_column = ColumnUInt64::create(1);
94 ColumnUInt64::Container & index_column_vec = index_column->getData();
95 index_column_vec[0] = 0;
96 return index_column;
97 }
98 }
99
100 const ColumnPtr actual_col = BloomFilter::getPrimitiveColumn(column);
101 const DataTypePtr actual_type = BloomFilter::getPrimitiveType(data_type);
102
103 auto index_column = ColumnUInt64::create(limit);
104 ColumnUInt64::Container & index_column_vec = index_column->getData();
105 getAnyTypeHash<true>(actual_type.get(), actual_col.get(), index_column_vec, pos);
106 return index_column;
107 }
108
109 template <bool is_first>
110 static void getAnyTypeHash(const IDataType * data_type, const IColumn * column, ColumnUInt64::Container & vec, size_t pos)
111 {
112 WhichDataType which(data_type);
113
114 if (which.isUInt8()) getNumberTypeHash<UInt8, is_first>(column, vec, pos);
115 else if (which.isUInt16()) getNumberTypeHash<UInt16, is_first>(column, vec, pos);
116 else if (which.isUInt32()) getNumberTypeHash<UInt32, is_first>(column, vec, pos);
117 else if (which.isUInt64()) getNumberTypeHash<UInt64, is_first>(column, vec, pos);
118 else if (which.isInt8()) getNumberTypeHash<Int8, is_first>(column, vec, pos);
119 else if (which.isInt16()) getNumberTypeHash<Int16, is_first>(column, vec, pos);
120 else if (which.isInt32()) getNumberTypeHash<Int32, is_first>(column, vec, pos);
121 else if (which.isInt64()) getNumberTypeHash<Int64, is_first>(column, vec, pos);
122 else if (which.isEnum8()) getNumberTypeHash<Int8, is_first>(column, vec, pos);
123 else if (which.isEnum16()) getNumberTypeHash<Int16, is_first>(column, vec, pos);
124 else if (which.isDate()) getNumberTypeHash<UInt16, is_first>(column, vec, pos);
125 else if (which.isDateTime()) getNumberTypeHash<UInt32, is_first>(column, vec, pos);
126 else if (which.isFloat32()) getNumberTypeHash<Float32, is_first>(column, vec, pos);
127 else if (which.isFloat64()) getNumberTypeHash<Float64, is_first>(column, vec, pos);
128 else if (which.isString()) getStringTypeHash<is_first>(column, vec, pos);
129 else if (which.isFixedString()) getStringTypeHash<is_first>(column, vec, pos);
130 else throw Exception("Unexpected type " + data_type->getName() + " of bloom filter index.", ErrorCodes::LOGICAL_ERROR);
131 }
132
133 template <typename Type, bool is_first>
134 static void getNumberTypeHash(const IColumn * column, ColumnUInt64::Container & vec, size_t pos)
135 {
136 const auto * index_column = typeid_cast<const ColumnVector<Type> *>(column);
137
138 if (unlikely(!index_column))
139 throw Exception("Illegal column type was passed to the bloom filter index.", ErrorCodes::ILLEGAL_COLUMN);
140
141 const typename ColumnVector<Type>::Container & vec_from = index_column->getData();
142
143 /// Because we're missing the precision of float in the Field.h
144 /// to be consistent, we need to convert Float32 to Float64 processing, also see: BloomFilterHash::hashWithField
145 if constexpr (std::is_same_v<ColumnVector<Type>, ColumnFloat32>)
146 {
147 for (size_t index = 0, size = vec.size(); index < size; ++index)
148 {
149 UInt64 hash = intHash64(ext::bit_cast<UInt64>(Float64(vec_from[index + pos])));
150
151 if constexpr (is_first)
152 vec[index] = hash;
153 else
154 vec[index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(vec[index], hash));
155 }
156 }
157 else
158 {
159 for (size_t index = 0, size = vec.size(); index < size; ++index)
160 {
161 UInt64 hash = intHash64(ext::bit_cast<UInt64>(vec_from[index + pos]));
162
163 if constexpr (is_first)
164 vec[index] = hash;
165 else
166 vec[index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(vec[index], hash));
167 }
168 }
169 }
170
171 template <bool is_first>
172 static void getStringTypeHash(const IColumn * column, ColumnUInt64::Container & vec, size_t pos)
173 {
174 if (const auto * index_column = typeid_cast<const ColumnString *>(column))
175 {
176 const ColumnString::Chars & data = index_column->getChars();
177 const ColumnString::Offsets & offsets = index_column->getOffsets();
178
179 ColumnString::Offset current_offset = pos;
180 for (size_t index = 0, size = vec.size(); index < size; ++index)
181 {
182 UInt64 city_hash = CityHash_v1_0_2::CityHash64(
183 reinterpret_cast<const char *>(&data[current_offset]), offsets[index + pos] - current_offset - 1);
184
185 if constexpr (is_first)
186 vec[index] = city_hash;
187 else
188 vec[index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(vec[index], city_hash));
189
190 current_offset = offsets[index + pos];
191 }
192 }
193 else if (const auto * fixed_string_index_column = typeid_cast<const ColumnFixedString *>(column))
194 {
195 size_t fixed_len = fixed_string_index_column->getN();
196 const auto & data = fixed_string_index_column->getChars();
197
198 for (size_t index = 0, size = vec.size(); index < size; ++index)
199 {
200 UInt64 city_hash = CityHash_v1_0_2::CityHash64(reinterpret_cast<const char *>(&data[(index + pos) * fixed_len]), fixed_len);
201
202 if constexpr (is_first)
203 vec[index] = city_hash;
204 else
205 vec[index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(vec[index], city_hash));
206 }
207 }
208 else
209 throw Exception("Illegal column type was passed to the bloom filter index.", ErrorCodes::ILLEGAL_COLUMN);
210 }
211
212 static std::pair<size_t, size_t> calculationBestPractices(double max_conflict_probability)
213 {
214 static const size_t MAX_BITS_PER_ROW = 20;
215 static const size_t MAX_HASH_FUNCTION_COUNT = 15;
216
217 /// For the smallest index per level in probability_lookup_table
218 static const size_t min_probability_index_each_bits[] = {0, 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 12, 12, 13, 14};
219
220 static const long double probability_lookup_table[MAX_BITS_PER_ROW + 1][MAX_HASH_FUNCTION_COUNT] =
221 {
222 {1.0}, /// dummy, 0 bits per row
223 {1.0, 1.0},
224 {1.0, 0.393, 0.400},
225 {1.0, 0.283, 0.237, 0.253},
226 {1.0, 0.221, 0.155, 0.147, 0.160},
227 {1.0, 0.181, 0.109, 0.092, 0.092, 0.101}, // 5
228 {1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638},
229 {1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364},
230 {1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229},
231 {1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145},
232 {1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846}, // 10
233 {1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509},
234 {1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314},
235 {1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217, 0.00199, 0.00194},
236 {1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146, 0.00129, 0.00121, 0.0012},
237 {1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852, 0.000775, 0.000744}, // 15
238 {1.0, 0.0606, 0.0138, 0.005, 0.00239, 0.00139, 0.000935, 0.000702, 0.000574, 0.000505, 0.00047, 0.000459},
239 {1.0, 0.0571, 0.0123, 0.00423, 0.00193, 0.00107, 0.000692, 0.000499, 0.000394, 0.000335, 0.000302, 0.000287, 0.000284},
240 {1.0, 0.054, 0.0111, 0.00362, 0.00158, 0.000839, 0.000519, 0.00036, 0.000275, 0.000226, 0.000198, 0.000183, 0.000176},
241 {1.0, 0.0513, 0.00998, 0.00312, 0.0013, 0.000663, 0.000394, 0.000264, 0.000194, 0.000155, 0.000132, 0.000118, 0.000111, 0.000109},
242 {1.0, 0.0488, 0.00906, 0.0027, 0.00108, 0.00053, 0.000303, 0.000196, 0.00014, 0.000108, 8.89e-05, 7.77e-05, 7.12e-05, 6.79e-05, 6.71e-05} // 20
243 };
244
245 for (size_t bits_per_row = 1; bits_per_row < MAX_BITS_PER_ROW; ++bits_per_row)
246 {
247 if (probability_lookup_table[bits_per_row][min_probability_index_each_bits[bits_per_row]] <= max_conflict_probability)
248 {
249 size_t max_size_of_hash_functions = min_probability_index_each_bits[bits_per_row];
250 for (size_t size_of_hash_functions = max_size_of_hash_functions; size_of_hash_functions > 0; --size_of_hash_functions)
251 if (probability_lookup_table[bits_per_row][size_of_hash_functions] > max_conflict_probability)
252 return std::pair<size_t, size_t>(bits_per_row, size_of_hash_functions + 1);
253 }
254 }
255
256 return std::pair<size_t, size_t>(MAX_BITS_PER_ROW - 1, min_probability_index_each_bits[MAX_BITS_PER_ROW - 1]);
257 }
258};
259
260}
261