1 | #include <Interpreters/BloomFilter.h> |
2 | #include <city.h> |
3 | #include <Columns/ColumnArray.h> |
4 | #include <Columns/ColumnNullable.h> |
5 | #include <Columns/ColumnLowCardinality.h> |
6 | #include <DataTypes/DataTypeArray.h> |
7 | #include <DataTypes/DataTypeNullable.h> |
8 | #include <DataTypes/DataTypeLowCardinality.h> |
9 | |
10 | |
11 | namespace DB |
12 | { |
13 | |
14 | static constexpr UInt64 SEED_GEN_A = 845897321; |
15 | static constexpr UInt64 SEED_GEN_B = 217728422; |
16 | |
17 | BloomFilter::BloomFilter(size_t size_, size_t hashes_, size_t seed_) |
18 | : size(size_), hashes(hashes_), seed(seed_), words((size + sizeof(UnderType) - 1) / sizeof(UnderType)), filter(words, 0) {} |
19 | |
20 | BloomFilter::BloomFilter(const BloomFilter & bloom_filter) |
21 | : size(bloom_filter.size), hashes(bloom_filter.hashes), seed(bloom_filter.seed), words(bloom_filter.words), filter(bloom_filter.filter) {} |
22 | |
23 | bool BloomFilter::find(const char * data, size_t len) |
24 | { |
25 | size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed); |
26 | size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B); |
27 | |
28 | for (size_t i = 0; i < hashes; ++i) |
29 | { |
30 | size_t pos = (hash1 + i * hash2 + i * i) % (8 * size); |
31 | if (!(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType)))))) |
32 | return false; |
33 | } |
34 | return true; |
35 | } |
36 | |
37 | void BloomFilter::add(const char * data, size_t len) |
38 | { |
39 | size_t hash1 = CityHash_v1_0_2::CityHash64WithSeed(data, len, seed); |
40 | size_t hash2 = CityHash_v1_0_2::CityHash64WithSeed(data, len, SEED_GEN_A * seed + SEED_GEN_B); |
41 | |
42 | for (size_t i = 0; i < hashes; ++i) |
43 | { |
44 | size_t pos = (hash1 + i * hash2 + i * i) % (8 * size); |
45 | filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType)))); |
46 | } |
47 | } |
48 | |
49 | void BloomFilter::clear() |
50 | { |
51 | filter.assign(words, 0); |
52 | } |
53 | |
54 | bool BloomFilter::contains(const BloomFilter & bf) |
55 | { |
56 | for (size_t i = 0; i < words; ++i) |
57 | { |
58 | if ((filter[i] & bf.filter[i]) != bf.filter[i]) |
59 | return false; |
60 | } |
61 | return true; |
62 | } |
63 | |
64 | UInt64 BloomFilter::isEmpty() const |
65 | { |
66 | for (size_t i = 0; i < words; ++i) |
67 | if (filter[i] != 0) |
68 | return false; |
69 | return true; |
70 | } |
71 | |
72 | bool operator== (const BloomFilter & a, const BloomFilter & b) |
73 | { |
74 | for (size_t i = 0; i < a.words; ++i) |
75 | if (a.filter[i] != b.filter[i]) |
76 | return false; |
77 | return true; |
78 | } |
79 | |
80 | void BloomFilter::addHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed) |
81 | { |
82 | size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size); |
83 | filter[pos / (8 * sizeof(UnderType))] |= (1ULL << (pos % (8 * sizeof(UnderType)))); |
84 | } |
85 | |
86 | bool BloomFilter::findHashWithSeed(const UInt64 & hash, const UInt64 & hash_seed) |
87 | { |
88 | size_t pos = CityHash_v1_0_2::Hash128to64(CityHash_v1_0_2::uint128(hash, hash_seed)) % (8 * size); |
89 | return bool(filter[pos / (8 * sizeof(UnderType))] & (1ULL << (pos % (8 * sizeof(UnderType))))); |
90 | } |
91 | |
92 | DataTypePtr BloomFilter::getPrimitiveType(const DataTypePtr & data_type) |
93 | { |
94 | if (const auto * array_type = typeid_cast<const DataTypeArray *>(data_type.get())) |
95 | { |
96 | if (!typeid_cast<const DataTypeArray *>(array_type->getNestedType().get())) |
97 | return getPrimitiveType(array_type->getNestedType()); |
98 | else |
99 | throw Exception("Unexpected type " + data_type->getName() + " of bloom filter index." , ErrorCodes::LOGICAL_ERROR); |
100 | } |
101 | |
102 | if (const auto * nullable_type = typeid_cast<const DataTypeNullable *>(data_type.get())) |
103 | return getPrimitiveType(nullable_type->getNestedType()); |
104 | |
105 | if (const auto * low_cardinality_type = typeid_cast<const DataTypeLowCardinality *>(data_type.get())) |
106 | return getPrimitiveType(low_cardinality_type->getDictionaryType()); |
107 | |
108 | return data_type; |
109 | } |
110 | |
111 | ColumnPtr BloomFilter::getPrimitiveColumn(const ColumnPtr & column) |
112 | { |
113 | if (const auto * array_col = typeid_cast<const ColumnArray *>(column.get())) |
114 | return getPrimitiveColumn(array_col->getDataPtr()); |
115 | |
116 | if (const auto * nullable_col = typeid_cast<const ColumnNullable *>(column.get())) |
117 | return getPrimitiveColumn(nullable_col->getNestedColumnPtr()); |
118 | |
119 | if (const auto * low_cardinality_col = typeid_cast<const ColumnLowCardinality *>(column.get())) |
120 | return getPrimitiveColumn(low_cardinality_col->convertToFullColumnIfLowCardinality()); |
121 | |
122 | return column; |
123 | } |
124 | |
125 | } |
126 | |