BloomFilterHash.h source code [ClickHouse/dbms/src/Interpreters/BloomFilterHash.h]

1	#pragma once
2
3	#include <Columns/IColumn.h>
4	#include <Columns/ColumnArray.h>
5	#include <Columns/ColumnConst.h>
6	#include <Columns/ColumnNullable.h>
7	#include <Columns/ColumnsNumber.h>
8	#include <Columns/ColumnString.h>
9	#include <Columns/ColumnFixedString.h>
10	#include <DataTypes/IDataType.h>
11	#include <DataTypes/DataTypeArray.h>
12	#include <DataTypes/DataTypeNullable.h>
13	#include <DataTypes/DataTypeFixedString.h>
14	#include <DataTypes/DataTypeLowCardinality.h>
15	#include <DataTypes/DataTypesNumber.h>
16	#include <ext/bit_cast.h>
17	#include <Common/HashTable/Hash.h>
18	#include <Interpreters/BloomFilter.h>
19
20	namespace DB
21	{
22
/// Error codes thrown by the bloom filter hashing helpers in this header.
namespace ErrorCodes
{
    /// A column object does not have the concrete class expected for its data type.
    extern const int ILLEGAL_COLUMN;
    /// The data type is not one the bloom filter index can hash.
    /// Declared here so this header does not depend on some other include declaring it.
    extern const int LOGICAL_ERROR;
}
27
28	struct BloomFilterHash
29	{
30	static constexpr UInt64 bf_hash_seed[`15`] = {
31	`13635471485423070496ULL`, `10336109063487487899ULL`, `17779957404565211594ULL`, `8988612159822229247ULL`, `4954614162757618085ULL`,
32	`12980113590177089081ULL`, `9263883436177860930ULL`, `3656772712723269762ULL`, `10362091744962961274ULL`, `7582936617938287249ULL`,
33	`15033938188484401405ULL`, `18286745649494826751ULL`, `6852245486148412312ULL`, `8886056245089344681ULL`, `10151472371158292780ULL`
34	};
35
36	static ColumnPtr hashWithField(const IDataType * data_type, const Field & field)
37	{
38	WhichDataType which(data_type);
39	UInt64 hash = `0`;
40	bool unexpected_type = false;
41
42	if (field.isNull())
43	{
44	if (which.isInt() \|\| which.isUInt() \|\| which.isEnum() \|\| which.isDateOrDateTime() \|\| which.isFloat())
45	hash = intHash64(`0`);
46	else if (which.isString())
47	hash = CityHash_v1_0_2::CityHash64("", `0`);
48	else if (which.isFixedString())
49	{
50	const auto * fixed_string_type = typeid_cast<const DataTypeFixedString *>(data_type);
51	const std::vector<char> value(fixed_string_type->getN(), `0`);
52	hash = CityHash_v1_0_2::CityHash64(value.data(), value.size());
53	}
54	else
55	unexpected_type = true;
56	}
57	else if (which.isUInt() \|\| which.isDateOrDateTime())
58	hash = intHash64(field.safeGet<UInt64>());
59	else if (which.isInt() \|\| which.isEnum())
60	hash = intHash64(ext::bit_cast<UInt64>(field.safeGet<Int64>()));
61	else if (which.isFloat32() \|\| which.isFloat64())
62	hash = intHash64(ext::bit_cast<UInt64>(field.safeGet<Float64>()));
63	else if (which.isString() \|\| which.isFixedString())
64	{
65	const auto & value = field.safeGet<String>();
66	hash = CityHash_v1_0_2::CityHash64(value.data(), value.size());
67	}
68	else
69	unexpected_type = true;
70
71	if (unexpected_type)
72	throw Exception ("Unexpected type " + data_type->getName() + " of bloom filter index.", ErrorCodes::LOGICAL_ERROR);
73
74	return ColumnConst::create(ColumnUInt64::create(`1`, hash), `1`);
75	}
76
77	static ColumnPtr hashWithColumn(const DataTypePtr & data_type, const ColumnPtr & column, size_t pos, size_t limit)
78	{
79	WhichDataType which(data_type);
80	if (which.isArray())
81	{
82	const auto * array_col = typeid_cast<const ColumnArray *>(column.get());
83
84	if (checkAndGetColumn<ColumnNullable>(array_col->getData()))
85	throw Exception ("Unexpected type " + data_type ->getName() + " of bloom filter index.", ErrorCodes::LOGICAL_ERROR);
86
87	const auto & offsets = array_col->getOffsets();
88	limit = offsets [pos + limit - `1`] - offsets [pos - `1`]; /// PaddedPODArray allows access on index -1.
89	pos = offsets [pos - `1`];
90
91	if (limit == `0`)
92	{
93	auto index_column = ColumnUInt64::create(`1`);
94	ColumnUInt64::Container & index_column_vec = index_column ->getData();
95	index_column_vec [`0`] = `0`;
96	return index_column;
97	}
98	}
99
100	const ColumnPtr actual_col = BloomFilter::getPrimitiveColumn(column);
101	const DataTypePtr actual_type = BloomFilter::getPrimitiveType(data_type);
102
103	auto index_column = ColumnUInt64::create(limit);
104	ColumnUInt64::Container & index_column_vec = index_column ->getData();
105	getAnyTypeHash<true>(actual_type.get(), actual_col.get(), index_column_vec, pos);
106	return index_column;
107	}
108
109	template <bool is_first>
110	static void getAnyTypeHash(const IDataType * data_type, const IColumn * column, ColumnUInt64::Container & vec, size_t pos)
111	{
112	WhichDataType which(data_type);
113
114	if (which.isUInt8()) getNumberTypeHash<UInt8, is_first>(column, vec, pos);
115	else if (which.isUInt16()) getNumberTypeHash<UInt16, is_first>(column, vec, pos);
116	else if (which.isUInt32()) getNumberTypeHash<UInt32, is_first>(column, vec, pos);
117	else if (which.isUInt64()) getNumberTypeHash<UInt64, is_first>(column, vec, pos);
118	else if (which.isInt8()) getNumberTypeHash<Int8, is_first>(column, vec, pos);
119	else if (which.isInt16()) getNumberTypeHash<Int16, is_first>(column, vec, pos);
120	else if (which.isInt32()) getNumberTypeHash<Int32, is_first>(column, vec, pos);
121	else if (which.isInt64()) getNumberTypeHash<Int64, is_first>(column, vec, pos);
122	else if (which.isEnum8()) getNumberTypeHash<Int8, is_first>(column, vec, pos);
123	else if (which.isEnum16()) getNumberTypeHash<Int16, is_first>(column, vec, pos);
124	else if (which.isDate()) getNumberTypeHash<UInt16, is_first>(column, vec, pos);
125	else if (which.isDateTime()) getNumberTypeHash<UInt32, is_first>(column, vec, pos);
126	else if (which.isFloat32()) getNumberTypeHash<Float32, is_first>(column, vec, pos);
127	else if (which.isFloat64()) getNumberTypeHash<Float64, is_first>(column, vec, pos);
128	else if (which.isString()) getStringTypeHash<is_first>(column, vec, pos);
129	else if (which.isFixedString()) getStringTypeHash<is_first>(column, vec, pos);
130	else throw Exception ("Unexpected type " + data_type->getName() + " of bloom filter index.", ErrorCodes::LOGICAL_ERROR);
131	}
132
133	template <typename Type, bool is_first>
134	static void getNumberTypeHash(const IColumn * column, ColumnUInt64::Container & vec, size_t pos)
135	{
136	const auto * index_column = typeid_cast<const ColumnVector<Type> *>(column);
137
138	if (unlikely(!index_column))
139	throw Exception ("Illegal column type was passed to the bloom filter index.", ErrorCodes::ILLEGAL_COLUMN);
140
141	const typename ColumnVector<Type>::Container & vec_from = index_column->getData();
142
143	/// Because we're missing the precision of float in the Field.h
144	/// to be consistent, we need to convert Float32 to Float64 processing, also see: BloomFilterHash::hashWithField
145	if constexpr (std::is_same_v<ColumnVector<Type>, ColumnFloat32>)
146	{
147	for (size_t index = `0`, size = vec.size(); index < size; ++index)
148	{
149	UInt64 hash = intHash64(ext::bit_cast<UInt64>(Float64(vec_from[index + pos])));
150
151	if constexpr (is_first)
152	vec [index] = hash;
153	else
154	vec [index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128 (vec [index], hash));
155	}
156	}
157	else
158	{
159	for (size_t index = `0`, size = vec.size(); index < size; ++index)
160	{
161	UInt64 hash = intHash64(ext::bit_cast<UInt64>(vec_from[index + pos]));
162
163	if constexpr (is_first)
164	vec [index] = hash;
165	else
166	vec [index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128 (vec [index], hash));
167	}
168	}
169	}
170
171	template <bool is_first>
172	static void getStringTypeHash(const IColumn * column, ColumnUInt64::Container & vec, size_t pos)
173	{
174	if (const auto * index_column = typeid_cast<const ColumnString *>(column))
175	{
176	const ColumnString::Chars & data = index_column->getChars();
177	const ColumnString::Offsets & offsets = index_column->getOffsets();
178
179	ColumnString::Offset current_offset = pos;
180	for (size_t index = `0`, size = vec.size(); index < size; ++index)
181	{
182	UInt64 city_hash = CityHash_v1_0_2::CityHash64(
183	reinterpret_cast<const char *>(&data [current_offset]), offsets [index + pos] - current_offset - `1`);
184
185	if constexpr (is_first)
186	vec [index] = city_hash;
187	else
188	vec [index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128 (vec [index], city_hash));
189
190	current_offset = offsets [index + pos];
191	}
192	}
193	else if (const auto * fixed_string_index_column = typeid_cast<const ColumnFixedString *>(column))
194	{
195	size_t fixed_len = fixed_string_index_column->getN();
196	const auto & data = fixed_string_index_column->getChars();
197
198	for (size_t index = `0`, size = vec.size(); index < size; ++index)
199	{
200	UInt64 city_hash = CityHash_v1_0_2::CityHash64(reinterpret_cast<const char >(&data [(index + pos) fixed_len]), fixed_len);
201
202	if constexpr (is_first)
203	vec [index] = city_hash;
204	else
205	vec [index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128 (vec [index], city_hash));
206	}
207	}
208	else
209	throw Exception ("Illegal column type was passed to the bloom filter index.", ErrorCodes::ILLEGAL_COLUMN);
210	}
211
/// Chooses bloom filter parameters for a requested upper bound on the false-positive probability.
///
/// Scans a precomputed table indexed by [bits per row][number of hash functions], from the
/// fewest bits per row upwards. In the first row whose best probability meets the bound, it walks
/// down from the best hash-function count and returns one more than the first count found to
/// exceed the bound.
///
/// If no such count is found in a qualifying row, the scan simply moves on to the next row;
/// if no row yields a result, the parameters of row MAX_BITS_PER_ROW - 1 are returned.
///
/// @param max_conflict_probability  Largest acceptable false-positive probability.
/// @return {bits per row, number of hash functions}.
static std::pair<size_t, size_t> calculationBestPractices(double max_conflict_probability)
{
    static const size_t MAX_BITS_PER_ROW = 20;
    static const size_t MAX_HASH_FUNCTION_COUNT = 15;

    /// For each row of probability_lookup_table: the column index holding the smallest probability.
    static const size_t min_probability_index_each_bits[] =
    {
        0, 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 12, 12, 13, 14
    };

    static const long double probability_lookup_table[MAX_BITS_PER_ROW + 1][MAX_HASH_FUNCTION_COUNT] =
    {
        {1.0}, /// dummy, 0 bits per row
        {1.0, 1.0},
        {1.0, 0.393, 0.400},
        {1.0, 0.283, 0.237, 0.253},
        {1.0, 0.221, 0.155, 0.147, 0.160},
        {1.0, 0.181, 0.109, 0.092, 0.092, 0.101}, // 5
        {1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638},
        {1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364},
        {1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229},
        {1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145},
        {1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846}, // 10
        {1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509},
        {1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314},
        {1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217, 0.00199, 0.00194},
        {1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146, 0.00129, 0.00121, 0.0012},
        {1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852, 0.000775, 0.000744}, // 15
        {1.0, 0.0606, 0.0138, 0.005, 0.00239, 0.00139, 0.000935, 0.000702, 0.000574, 0.000505, 0.00047, 0.000459},
        {1.0, 0.0571, 0.0123, 0.00423, 0.00193, 0.00107, 0.000692, 0.000499, 0.000394, 0.000335, 0.000302, 0.000287, 0.000284},
        {1.0, 0.054, 0.0111, 0.00362, 0.00158, 0.000839, 0.000519, 0.00036, 0.000275, 0.000226, 0.000198, 0.000183, 0.000176},
        {1.0, 0.0513, 0.00998, 0.00312, 0.0013, 0.000663, 0.000394, 0.000264, 0.000194, 0.000155, 0.000132, 0.000118, 0.000111, 0.000109},
        {1.0, 0.0488, 0.00906, 0.0027, 0.00108, 0.00053, 0.000303, 0.000196, 0.00014, 0.000108, 8.89e-05, 7.77e-05, 7.12e-05, 6.79e-05, 6.71e-05} // 20
    };

    for (size_t bits = 1; bits != MAX_BITS_PER_ROW; ++bits)
    {
        const size_t best_hash_count = min_probability_index_each_bits[bits];

        /// Skip rows whose best achievable probability still misses the bound.
        if (!(probability_lookup_table[bits][best_hash_count] <= max_conflict_probability))
            continue;

        /// Column 0 is never inspected: the walk stops after a hash count of 1.
        for (size_t hash_count = best_hash_count; hash_count != 0; --hash_count)
        {
            if (probability_lookup_table[bits][hash_count] > max_conflict_probability)
                return {bits, hash_count + 1};
        }
    }

    const size_t fallback_bits = MAX_BITS_PER_ROW - 1;
    return {fallback_bits, min_probability_index_each_bits[fallback_bits]};
}
258	};
259
260	}
261

Browse the source code of ClickHouse/dbms/src/Interpreters/BloomFilterHash.h