| 1 | #pragma once |
| 2 | |
| 3 | #include <Columns/IColumn.h> |
| 4 | #include <Columns/ColumnArray.h> |
| 5 | #include <Columns/ColumnConst.h> |
| 6 | #include <Columns/ColumnNullable.h> |
| 7 | #include <Columns/ColumnsNumber.h> |
| 8 | #include <Columns/ColumnString.h> |
| 9 | #include <Columns/ColumnFixedString.h> |
| 10 | #include <DataTypes/IDataType.h> |
| 11 | #include <DataTypes/DataTypeArray.h> |
| 12 | #include <DataTypes/DataTypeNullable.h> |
| 13 | #include <DataTypes/DataTypeFixedString.h> |
| 14 | #include <DataTypes/DataTypeLowCardinality.h> |
| 15 | #include <DataTypes/DataTypesNumber.h> |
| 16 | #include <ext/bit_cast.h> |
| 17 | #include <Common/HashTable/Hash.h> |
| 18 | #include <Interpreters/BloomFilter.h> |
| 19 | |
| 20 | namespace DB |
| 21 | { |
| 22 | |
| 23 | namespace ErrorCodes |
| 24 | { |
| 25 | extern const int ILLEGAL_COLUMN; |
| 26 | } |
| 27 | |
| 28 | struct BloomFilterHash |
| 29 | { |
| 30 | static constexpr UInt64 bf_hash_seed[15] = { |
| 31 | 13635471485423070496ULL, 10336109063487487899ULL, 17779957404565211594ULL, 8988612159822229247ULL, 4954614162757618085ULL, |
| 32 | 12980113590177089081ULL, 9263883436177860930ULL, 3656772712723269762ULL, 10362091744962961274ULL, 7582936617938287249ULL, |
| 33 | 15033938188484401405ULL, 18286745649494826751ULL, 6852245486148412312ULL, 8886056245089344681ULL, 10151472371158292780ULL |
| 34 | }; |
| 35 | |
| 36 | static ColumnPtr hashWithField(const IDataType * data_type, const Field & field) |
| 37 | { |
| 38 | WhichDataType which(data_type); |
| 39 | UInt64 hash = 0; |
| 40 | bool unexpected_type = false; |
| 41 | |
| 42 | if (field.isNull()) |
| 43 | { |
| 44 | if (which.isInt() || which.isUInt() || which.isEnum() || which.isDateOrDateTime() || which.isFloat()) |
| 45 | hash = intHash64(0); |
| 46 | else if (which.isString()) |
| 47 | hash = CityHash_v1_0_2::CityHash64("" , 0); |
| 48 | else if (which.isFixedString()) |
| 49 | { |
| 50 | const auto * fixed_string_type = typeid_cast<const DataTypeFixedString *>(data_type); |
| 51 | const std::vector<char> value(fixed_string_type->getN(), 0); |
| 52 | hash = CityHash_v1_0_2::CityHash64(value.data(), value.size()); |
| 53 | } |
| 54 | else |
| 55 | unexpected_type = true; |
| 56 | } |
| 57 | else if (which.isUInt() || which.isDateOrDateTime()) |
| 58 | hash = intHash64(field.safeGet<UInt64>()); |
| 59 | else if (which.isInt() || which.isEnum()) |
| 60 | hash = intHash64(ext::bit_cast<UInt64>(field.safeGet<Int64>())); |
| 61 | else if (which.isFloat32() || which.isFloat64()) |
| 62 | hash = intHash64(ext::bit_cast<UInt64>(field.safeGet<Float64>())); |
| 63 | else if (which.isString() || which.isFixedString()) |
| 64 | { |
| 65 | const auto & value = field.safeGet<String>(); |
| 66 | hash = CityHash_v1_0_2::CityHash64(value.data(), value.size()); |
| 67 | } |
| 68 | else |
| 69 | unexpected_type = true; |
| 70 | |
| 71 | if (unexpected_type) |
| 72 | throw Exception("Unexpected type " + data_type->getName() + " of bloom filter index." , ErrorCodes::LOGICAL_ERROR); |
| 73 | |
| 74 | return ColumnConst::create(ColumnUInt64::create(1, hash), 1); |
| 75 | } |
| 76 | |
| 77 | static ColumnPtr hashWithColumn(const DataTypePtr & data_type, const ColumnPtr & column, size_t pos, size_t limit) |
| 78 | { |
| 79 | WhichDataType which(data_type); |
| 80 | if (which.isArray()) |
| 81 | { |
| 82 | const auto * array_col = typeid_cast<const ColumnArray *>(column.get()); |
| 83 | |
| 84 | if (checkAndGetColumn<ColumnNullable>(array_col->getData())) |
| 85 | throw Exception("Unexpected type " + data_type->getName() + " of bloom filter index." , ErrorCodes::LOGICAL_ERROR); |
| 86 | |
| 87 | const auto & offsets = array_col->getOffsets(); |
| 88 | limit = offsets[pos + limit - 1] - offsets[pos - 1]; /// PaddedPODArray allows access on index -1. |
| 89 | pos = offsets[pos - 1]; |
| 90 | |
| 91 | if (limit == 0) |
| 92 | { |
| 93 | auto index_column = ColumnUInt64::create(1); |
| 94 | ColumnUInt64::Container & index_column_vec = index_column->getData(); |
| 95 | index_column_vec[0] = 0; |
| 96 | return index_column; |
| 97 | } |
| 98 | } |
| 99 | |
| 100 | const ColumnPtr actual_col = BloomFilter::getPrimitiveColumn(column); |
| 101 | const DataTypePtr actual_type = BloomFilter::getPrimitiveType(data_type); |
| 102 | |
| 103 | auto index_column = ColumnUInt64::create(limit); |
| 104 | ColumnUInt64::Container & index_column_vec = index_column->getData(); |
| 105 | getAnyTypeHash<true>(actual_type.get(), actual_col.get(), index_column_vec, pos); |
| 106 | return index_column; |
| 107 | } |
| 108 | |
| 109 | template <bool is_first> |
| 110 | static void getAnyTypeHash(const IDataType * data_type, const IColumn * column, ColumnUInt64::Container & vec, size_t pos) |
| 111 | { |
| 112 | WhichDataType which(data_type); |
| 113 | |
| 114 | if (which.isUInt8()) getNumberTypeHash<UInt8, is_first>(column, vec, pos); |
| 115 | else if (which.isUInt16()) getNumberTypeHash<UInt16, is_first>(column, vec, pos); |
| 116 | else if (which.isUInt32()) getNumberTypeHash<UInt32, is_first>(column, vec, pos); |
| 117 | else if (which.isUInt64()) getNumberTypeHash<UInt64, is_first>(column, vec, pos); |
| 118 | else if (which.isInt8()) getNumberTypeHash<Int8, is_first>(column, vec, pos); |
| 119 | else if (which.isInt16()) getNumberTypeHash<Int16, is_first>(column, vec, pos); |
| 120 | else if (which.isInt32()) getNumberTypeHash<Int32, is_first>(column, vec, pos); |
| 121 | else if (which.isInt64()) getNumberTypeHash<Int64, is_first>(column, vec, pos); |
| 122 | else if (which.isEnum8()) getNumberTypeHash<Int8, is_first>(column, vec, pos); |
| 123 | else if (which.isEnum16()) getNumberTypeHash<Int16, is_first>(column, vec, pos); |
| 124 | else if (which.isDate()) getNumberTypeHash<UInt16, is_first>(column, vec, pos); |
| 125 | else if (which.isDateTime()) getNumberTypeHash<UInt32, is_first>(column, vec, pos); |
| 126 | else if (which.isFloat32()) getNumberTypeHash<Float32, is_first>(column, vec, pos); |
| 127 | else if (which.isFloat64()) getNumberTypeHash<Float64, is_first>(column, vec, pos); |
| 128 | else if (which.isString()) getStringTypeHash<is_first>(column, vec, pos); |
| 129 | else if (which.isFixedString()) getStringTypeHash<is_first>(column, vec, pos); |
| 130 | else throw Exception("Unexpected type " + data_type->getName() + " of bloom filter index." , ErrorCodes::LOGICAL_ERROR); |
| 131 | } |
| 132 | |
| 133 | template <typename Type, bool is_first> |
| 134 | static void getNumberTypeHash(const IColumn * column, ColumnUInt64::Container & vec, size_t pos) |
| 135 | { |
| 136 | const auto * index_column = typeid_cast<const ColumnVector<Type> *>(column); |
| 137 | |
| 138 | if (unlikely(!index_column)) |
| 139 | throw Exception("Illegal column type was passed to the bloom filter index." , ErrorCodes::ILLEGAL_COLUMN); |
| 140 | |
| 141 | const typename ColumnVector<Type>::Container & vec_from = index_column->getData(); |
| 142 | |
| 143 | /// Because we're missing the precision of float in the Field.h |
| 144 | /// to be consistent, we need to convert Float32 to Float64 processing, also see: BloomFilterHash::hashWithField |
| 145 | if constexpr (std::is_same_v<ColumnVector<Type>, ColumnFloat32>) |
| 146 | { |
| 147 | for (size_t index = 0, size = vec.size(); index < size; ++index) |
| 148 | { |
| 149 | UInt64 hash = intHash64(ext::bit_cast<UInt64>(Float64(vec_from[index + pos]))); |
| 150 | |
| 151 | if constexpr (is_first) |
| 152 | vec[index] = hash; |
| 153 | else |
| 154 | vec[index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(vec[index], hash)); |
| 155 | } |
| 156 | } |
| 157 | else |
| 158 | { |
| 159 | for (size_t index = 0, size = vec.size(); index < size; ++index) |
| 160 | { |
| 161 | UInt64 hash = intHash64(ext::bit_cast<UInt64>(vec_from[index + pos])); |
| 162 | |
| 163 | if constexpr (is_first) |
| 164 | vec[index] = hash; |
| 165 | else |
| 166 | vec[index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(vec[index], hash)); |
| 167 | } |
| 168 | } |
| 169 | } |
| 170 | |
| 171 | template <bool is_first> |
| 172 | static void getStringTypeHash(const IColumn * column, ColumnUInt64::Container & vec, size_t pos) |
| 173 | { |
| 174 | if (const auto * index_column = typeid_cast<const ColumnString *>(column)) |
| 175 | { |
| 176 | const ColumnString::Chars & data = index_column->getChars(); |
| 177 | const ColumnString::Offsets & offsets = index_column->getOffsets(); |
| 178 | |
| 179 | ColumnString::Offset current_offset = pos; |
| 180 | for (size_t index = 0, size = vec.size(); index < size; ++index) |
| 181 | { |
| 182 | UInt64 city_hash = CityHash_v1_0_2::CityHash64( |
| 183 | reinterpret_cast<const char *>(&data[current_offset]), offsets[index + pos] - current_offset - 1); |
| 184 | |
| 185 | if constexpr (is_first) |
| 186 | vec[index] = city_hash; |
| 187 | else |
| 188 | vec[index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(vec[index], city_hash)); |
| 189 | |
| 190 | current_offset = offsets[index + pos]; |
| 191 | } |
| 192 | } |
| 193 | else if (const auto * fixed_string_index_column = typeid_cast<const ColumnFixedString *>(column)) |
| 194 | { |
| 195 | size_t fixed_len = fixed_string_index_column->getN(); |
| 196 | const auto & data = fixed_string_index_column->getChars(); |
| 197 | |
| 198 | for (size_t index = 0, size = vec.size(); index < size; ++index) |
| 199 | { |
| 200 | UInt64 city_hash = CityHash_v1_0_2::CityHash64(reinterpret_cast<const char *>(&data[(index + pos) * fixed_len]), fixed_len); |
| 201 | |
| 202 | if constexpr (is_first) |
| 203 | vec[index] = city_hash; |
| 204 | else |
| 205 | vec[index] = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(vec[index], city_hash)); |
| 206 | } |
| 207 | } |
| 208 | else |
| 209 | throw Exception("Illegal column type was passed to the bloom filter index." , ErrorCodes::ILLEGAL_COLUMN); |
| 210 | } |
| 211 | |
| 212 | static std::pair<size_t, size_t> calculationBestPractices(double max_conflict_probability) |
| 213 | { |
| 214 | static const size_t MAX_BITS_PER_ROW = 20; |
| 215 | static const size_t MAX_HASH_FUNCTION_COUNT = 15; |
| 216 | |
| 217 | /// For the smallest index per level in probability_lookup_table |
| 218 | static const size_t min_probability_index_each_bits[] = {0, 0, 1, 2, 3, 3, 4, 5, 6, 6, 7, 8, 8, 9, 10, 10, 11, 12, 12, 13, 14}; |
| 219 | |
| 220 | static const long double probability_lookup_table[MAX_BITS_PER_ROW + 1][MAX_HASH_FUNCTION_COUNT] = |
| 221 | { |
| 222 | {1.0}, /// dummy, 0 bits per row |
| 223 | {1.0, 1.0}, |
| 224 | {1.0, 0.393, 0.400}, |
| 225 | {1.0, 0.283, 0.237, 0.253}, |
| 226 | {1.0, 0.221, 0.155, 0.147, 0.160}, |
| 227 | {1.0, 0.181, 0.109, 0.092, 0.092, 0.101}, // 5 |
| 228 | {1.0, 0.154, 0.0804, 0.0609, 0.0561, 0.0578, 0.0638}, |
| 229 | {1.0, 0.133, 0.0618, 0.0423, 0.0359, 0.0347, 0.0364}, |
| 230 | {1.0, 0.118, 0.0489, 0.0306, 0.024, 0.0217, 0.0216, 0.0229}, |
| 231 | {1.0, 0.105, 0.0397, 0.0228, 0.0166, 0.0141, 0.0133, 0.0135, 0.0145}, |
| 232 | {1.0, 0.0952, 0.0329, 0.0174, 0.0118, 0.00943, 0.00844, 0.00819, 0.00846}, // 10 |
| 233 | {1.0, 0.0869, 0.0276, 0.0136, 0.00864, 0.0065, 0.00552, 0.00513, 0.00509}, |
| 234 | {1.0, 0.08, 0.0236, 0.0108, 0.00646, 0.00459, 0.00371, 0.00329, 0.00314}, |
| 235 | {1.0, 0.074, 0.0203, 0.00875, 0.00492, 0.00332, 0.00255, 0.00217, 0.00199, 0.00194}, |
| 236 | {1.0, 0.0689, 0.0177, 0.00718, 0.00381, 0.00244, 0.00179, 0.00146, 0.00129, 0.00121, 0.0012}, |
| 237 | {1.0, 0.0645, 0.0156, 0.00596, 0.003, 0.00183, 0.00128, 0.001, 0.000852, 0.000775, 0.000744}, // 15 |
| 238 | {1.0, 0.0606, 0.0138, 0.005, 0.00239, 0.00139, 0.000935, 0.000702, 0.000574, 0.000505, 0.00047, 0.000459}, |
| 239 | {1.0, 0.0571, 0.0123, 0.00423, 0.00193, 0.00107, 0.000692, 0.000499, 0.000394, 0.000335, 0.000302, 0.000287, 0.000284}, |
| 240 | {1.0, 0.054, 0.0111, 0.00362, 0.00158, 0.000839, 0.000519, 0.00036, 0.000275, 0.000226, 0.000198, 0.000183, 0.000176}, |
| 241 | {1.0, 0.0513, 0.00998, 0.00312, 0.0013, 0.000663, 0.000394, 0.000264, 0.000194, 0.000155, 0.000132, 0.000118, 0.000111, 0.000109}, |
| 242 | {1.0, 0.0488, 0.00906, 0.0027, 0.00108, 0.00053, 0.000303, 0.000196, 0.00014, 0.000108, 8.89e-05, 7.77e-05, 7.12e-05, 6.79e-05, 6.71e-05} // 20 |
| 243 | }; |
| 244 | |
| 245 | for (size_t bits_per_row = 1; bits_per_row < MAX_BITS_PER_ROW; ++bits_per_row) |
| 246 | { |
| 247 | if (probability_lookup_table[bits_per_row][min_probability_index_each_bits[bits_per_row]] <= max_conflict_probability) |
| 248 | { |
| 249 | size_t max_size_of_hash_functions = min_probability_index_each_bits[bits_per_row]; |
| 250 | for (size_t size_of_hash_functions = max_size_of_hash_functions; size_of_hash_functions > 0; --size_of_hash_functions) |
| 251 | if (probability_lookup_table[bits_per_row][size_of_hash_functions] > max_conflict_probability) |
| 252 | return std::pair<size_t, size_t>(bits_per_row, size_of_hash_functions + 1); |
| 253 | } |
| 254 | } |
| 255 | |
| 256 | return std::pair<size_t, size_t>(MAX_BITS_PER_ROW - 1, min_probability_index_each_bits[MAX_BITS_PER_ROW - 1]); |
| 257 | } |
| 258 | }; |
| 259 | |
| 260 | } |
| 261 | |