1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18#ifndef PARQUET_BLOOM_FILTER_H
19#define PARQUET_BLOOM_FILTER_H
20
21#include <cstdint>
22#include <memory>
23
24#include "arrow/util/bit-util.h"
25#include "arrow/util/logging.h"
26#include "parquet/exception.h"
27#include "parquet/hasher.h"
28#include "parquet/types.h"
29#include "parquet/util/memory.h"
30#include "parquet/util/visibility.h"
31
32namespace parquet {
33class OutputStream;
34
35// A Bloom filter is a compact structure to indicate whether an item is not in a set or
36// probably in a set. The Bloom filter usually consists of a bit set that represents a
37// set of elements, a hash strategy and a Bloom filter algorithm.
38class PARQUET_EXPORT BloomFilter {
39 public:
40 // Maximum Bloom filter size, it sets to HDFS default block size 128MB
41 // This value will be reconsidered when implementing Bloom filter producer.
42 static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024;
43
44 /// Determine whether an element exist in set or not.
45 ///
46 /// @param hash the element to contain.
47 /// @return false if value is definitely not in set, and true means PROBABLY
48 /// in set.
49 virtual bool FindHash(uint64_t hash) const = 0;
50
51 /// Insert element to set represented by Bloom filter bitset.
52 /// @param hash the hash of value to insert into Bloom filter.
53 virtual void InsertHash(uint64_t hash) = 0;
54
55 /// Write this Bloom filter to an output stream. A Bloom filter structure should
56 /// include bitset length, hash strategy, algorithm, and bitset.
57 ///
58 /// @param sink the output stream to write
59 virtual void WriteTo(OutputStream* sink) const = 0;
60
61 /// Get the number of bytes of bitset
62 virtual uint32_t GetBitsetSize() const = 0;
63
64 /// Compute hash for 32 bits value by using its plain encoding result.
65 ///
66 /// @param value the value to hash.
67 /// @return hash result.
68 virtual uint64_t Hash(int32_t value) const = 0;
69
70 /// Compute hash for 64 bits value by using its plain encoding result.
71 ///
72 /// @param value the value to hash.
73 /// @return hash result.
74 virtual uint64_t Hash(int64_t value) const = 0;
75
76 /// Compute hash for float value by using its plain encoding result.
77 ///
78 /// @param value the value to hash.
79 /// @return hash result.
80 virtual uint64_t Hash(float value) const = 0;
81
82 /// Compute hash for double value by using its plain encoding result.
83 ///
84 /// @param value the value to hash.
85 /// @return hash result.
86 virtual uint64_t Hash(double value) const = 0;
87
88 /// Compute hash for Int96 value by using its plain encoding result.
89 ///
90 /// @param value the value to hash.
91 /// @return hash result.
92 virtual uint64_t Hash(const Int96* value) const = 0;
93
94 /// Compute hash for ByteArray value by using its plain encoding result.
95 ///
96 /// @param value the value to hash.
97 /// @return hash result.
98 virtual uint64_t Hash(const ByteArray* value) const = 0;
99
100 /// Compute hash for fixed byte array value by using its plain encoding result.
101 ///
102 /// @param value the value address.
103 /// @param len the value length.
104 /// @return hash result.
105 virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0;
106
107 virtual ~BloomFilter() {}
108
109 protected:
110 // Hash strategy available for Bloom filter.
111 enum class HashStrategy : uint32_t { MURMUR3_X64_128 = 0 };
112
113 // Bloom filter algorithm.
114 enum class Algorithm : uint32_t { BLOCK = 0 };
115};
116
117// The BlockSplitBloomFilter is implemented using block-based Bloom filters from
118// Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to
119// hash the item to a tiny Bloom filter which size fit a single cache line or smaller.
120//
121// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
122// filter is 32 bytes to take advantage of 32-byte SIMD instructions.
123class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
124 public:
125 /// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as hash function.
126 BlockSplitBloomFilter();
127
128 /// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within
129 /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be
130 /// rounded up/down to lower/upper bound if num_bytes is out of range and also
131 /// will be rounded up to a power of 2.
132 ///
133 /// @param num_bytes The number of bytes to store Bloom filter bitset.
134 void Init(uint32_t num_bytes);
135
136 /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
137 /// bitset because the given bitset may not satisfy the 32-byte alignment requirement
138 /// which may lead to segfault when performing SIMD instructions. It is the caller's
139 /// responsibility to free the bitset passed in. This is used when reconstructing
140 /// a Bloom filter from a parquet file.
141 ///
142 /// @param bitset The given bitset to initialize the Bloom filter.
143 /// @param num_bytes The number of bytes of given bitset.
144 void Init(const uint8_t* bitset, uint32_t num_bytes);
145
146 // Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter.
147 static constexpr uint32_t kMinimumBloomFilterBytes = 32;
148
149 /// Calculate optimal size according to the number of distinct values and false
150 /// positive probability.
151 ///
152 /// @param ndv The number of distinct values.
153 /// @param fpp The false positive probability.
154 /// @return it always return a value between kMinimumBloomFilterBytes and
155 /// kMaximumBloomFilterBytes, and the return value is always a power of 2
156 static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
157 DCHECK(fpp > 0.0 && fpp < 1.0);
158 const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8));
159 uint32_t num_bits;
160
161 // Handle overflow.
162 if (m < 0 || m > kMaximumBloomFilterBytes << 3) {
163 num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3);
164 } else {
165 num_bits = static_cast<uint32_t>(m);
166 }
167
168 // Round up to lower bound
169 if (num_bits < kMinimumBloomFilterBytes << 3) {
170 num_bits = kMinimumBloomFilterBytes << 3;
171 }
172
173 // Get next power of 2 if bits is not power of 2.
174 if ((num_bits & (num_bits - 1)) != 0) {
175 num_bits = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bits));
176 }
177
178 // Round down to upper bound
179 if (num_bits > kMaximumBloomFilterBytes << 3) {
180 num_bits = kMaximumBloomFilterBytes << 3;
181 }
182
183 return num_bits;
184 }
185
186 bool FindHash(uint64_t hash) const override;
187 void InsertHash(uint64_t hash) override;
188 void WriteTo(OutputStream* sink) const override;
189 uint32_t GetBitsetSize() const override { return num_bytes_; }
190
191 uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
192 uint64_t Hash(float value) const override { return hasher_->Hash(value); }
193 uint64_t Hash(double value) const override { return hasher_->Hash(value); }
194 uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
195 uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
196 uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
197 uint64_t Hash(const FLBA* value, uint32_t len) const override {
198 return hasher_->Hash(value, len);
199 }
200
201 /// Deserialize the Bloom filter from an input stream. It is used when reconstructing
202 /// a Bloom filter from a parquet filter.
203 ///
204 /// @param input_stream The input stream from which to construct the Bloom filter
205 /// @return The BlockSplitBloomFilter.
206 static BlockSplitBloomFilter Deserialize(InputStream* input_stream);
207
208 private:
209 // Bytes in a tiny Bloom filter block.
210 static constexpr int kBytesPerFilterBlock = 32;
211
212 // The number of bits to be set in each tiny Bloom filter
213 static constexpr int kBitsSetPerBlock = 8;
214
215 // A mask structure used to set bits in each tiny Bloom filter.
216 struct BlockMask {
217 uint32_t item[kBitsSetPerBlock];
218 };
219
220 // The block-based algorithm needs eight odd SALT values to calculate eight indexes
221 // of bit to set, one bit in each 32-bit word.
222 static constexpr uint32_t SALT[kBitsSetPerBlock] = {
223 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU,
224 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U};
225
226 /// Set bits in mask array according to input key.
227 /// @param key the value to calculate mask values.
228 /// @param mask the mask array is used to set inside a block
229 void SetMask(uint32_t key, BlockMask& mask) const;
230
231 // Memory pool to allocate aligned buffer for bitset
232 ::arrow::MemoryPool* pool_;
233
234 // The underlying buffer of bitset.
235 std::shared_ptr<Buffer> data_;
236
237 // The number of bytes of Bloom filter bitset.
238 uint32_t num_bytes_;
239
240 // Hash strategy used in this Bloom filter.
241 HashStrategy hash_strategy_;
242
243 // Algorithm used in this Bloom filter.
244 Algorithm algorithm_;
245
246 // The hash pointer points to actual hash class used.
247 std::unique_ptr<Hasher> hasher_;
248};
249
250} // namespace parquet
251
252#endif // PARQUET_BLOOM_FILTER_H
253