| 1 | #include <Interpreters/BloomFilter.h> |
| 2 | #include <city.h> |
| 3 | #include <Columns/ColumnArray.h> |
| 4 | #include <Columns/ColumnNullable.h> |
| 5 | #include <Columns/ColumnLowCardinality.h> |
| 6 | #include <DataTypes/DataTypeArray.h> |
| 7 | #include <DataTypes/DataTypeNullable.h> |
| 8 | #include <DataTypes/DataTypeLowCardinality.h> |
| 9 | |
| 10 | |
| 11 | namespace DB |
| 12 | { |
| 13 | |
| 14 | static constexpr UInt64 SEED_GEN_A = 845897321; |
| 15 | static constexpr UInt64 SEED_GEN_B = 217728422; |
| 16 | |
| 17 | BloomFilter::BloomFilter(size_t size_, size_t hashes_, size_t seed_) |
| 18 | : size(size_), hashes(hashes_), seed(seed_), words((size + sizeof(UnderType) - 1) / sizeof(UnderType)), filter(words, 0) {} |
| 19 | |
| 20 | BloomFilter::BloomFilter(const BloomFilter & bloom_filter) |
| 21 | : size(bloom_filter.size), hashes(bloom_filter.hashes), seed(bloom_filter.seed), words(bloom_filter.words), filter(bloom_filter.filter) {} |
| 22 | |
| 23 | bool BloomFilter::find(const char * data, size_t len) |
| 24 | { |
| 25 | size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed); |
| 26 | size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B); |
| 27 | |
| 28 | for (size_t i = 0; i < hashes; ++i) |
| 29 | { |
| 30 | size_t pos = (hash1 + i * hash2 + i * i) % (8 * size); |
| 31 | if (!(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType)))))) |
| 32 | return false; |
| 33 | } |
| 34 | return true; |
| 35 | } |
| 36 | |
| 37 | void BloomFilter::add(const char * data, size_t len) |
| 38 | { |
| 39 | size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed); |
| 40 | size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B); |
| 41 | |
| 42 | for (size_t i = 0; i < hashes; ++i) |
| 43 | { |
| 44 | size_t pos = (hash1 + i * hash2 + i * i) % (8 * size); |
| 45 | filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType)))); |
| 46 | } |
| 47 | } |
| 48 | |
| 49 | void BloomFilter::clear() |
| 50 | { |
| 51 | filter.assign(words, 0); |
| 52 | } |
| 53 | |
| 54 | bool BloomFilter::contains(const BloomFilter & bf) |
| 55 | { |
| 56 | for (size_t i = 0; i < words; ++i) |
| 57 | { |
| 58 | if ((filter[i] & bf.filter[i]) != bf.filter[i]) |
| 59 | return false; |
| 60 | } |
| 61 | return true; |
| 62 | } |
| 63 | |
| 64 | UInt64 BloomFilter::isEmpty() const |
| 65 | { |
| 66 | for (size_t i = 0; i < words; ++i) |
| 67 | if (filter[i] != 0) |
| 68 | return false; |
| 69 | return true; |
| 70 | } |
| 71 | |
| 72 | bool operator== (const BloomFilter & a, const BloomFilter & b) |
| 73 | { |
| 74 | for (size_t i = 0; i < a.words; ++i) |
| 75 | if (a.filter[i] != b.filter[i]) |
| 76 | return false; |
| 77 | return true; |
| 78 | } |
| 79 | |
| 80 | void BloomFilter::addHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed) |
| 81 | { |
| 82 | size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size); |
| 83 | filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType)))); |
| 84 | } |
| 85 | |
| 86 | bool BloomFilter::findHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed) |
| 87 | { |
| 88 | size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size); |
| 89 | return bool(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType))))); |
| 90 | } |
| 91 | |
| 92 | DataTypePtr BloomFilter::getPrimitiveType(const DataTypePtr & data_type) |
| 93 | { |
| 94 | if (const auto * array_type = typeid_cast<const DataTypeArray *>(data_type.get())) |
| 95 | { |
| 96 | if (!typeid_cast<const DataTypeArray *>(array_type->getNestedType().get())) |
| 97 | return getPrimitiveType(array_type->getNestedType()); |
| 98 | else |
| 99 | throw Exception("Unexpected type " + data_type->getName() + " of bloom filter index." , ErrorCodes::LOGICAL_ERROR); |
| 100 | } |
| 101 | |
| 102 | if (const auto * nullable_type = typeid_cast<const DataTypeNullable *>(data_type.get())) |
| 103 | return getPrimitiveType(nullable_type->getNestedType()); |
| 104 | |
| 105 | if (const auto * low_cardinality_type = typeid_cast<const DataTypeLowCardinality *>(data_type.get())) |
| 106 | return getPrimitiveType(low_cardinality_type->getDictionaryType()); |
| 107 | |
| 108 | return data_type; |
| 109 | } |
| 110 | |
| 111 | ColumnPtr BloomFilter::getPrimitiveColumn(const ColumnPtr & column) |
| 112 | { |
| 113 | if (const auto * array_col = typeid_cast<const ColumnArray *>(column.get())) |
| 114 | return getPrimitiveColumn(array_col->getDataPtr()); |
| 115 | |
| 116 | if (const auto * nullable_col = typeid_cast<const ColumnNullable *>(column.get())) |
| 117 | return getPrimitiveColumn(nullable_col->getNestedColumnPtr()); |
| 118 | |
| 119 | if (const auto * low_cardinality_col = typeid_cast<const ColumnLowCardinality *>(column.get())) |
| 120 | return getPrimitiveColumn(low_cardinality_col->convertToFullColumnIfLowCardinality()); |
| 121 | |
| 122 | return column; |
| 123 | } |
| 124 | |
| 125 | } |
| 126 | |