bloom_filter.h source code [arrow/parquet/bloom_filter.h]

1	// Licensed to the Apache Software Foundation (ASF) under one
2	// or more contributor license agreements. See the NOTICE file
3	// distributed with this work for additional information
4	// regarding copyright ownership. The ASF licenses this file
5	// to you under the Apache License, Version 2.0 (the
6	// "License"); you may not use this file except in compliance
7	// with the License. You may obtain a copy of the License at
8	//
9	// http://www.apache.org/licenses/LICENSE-2.0
10	//
11	// Unless required by applicable law or agreed to in writing,
12	// software distributed under the License is distributed on an
13	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14	// KIND, either express or implied. See the License for the
15	// specific language governing permissions and limitations
16	// under the License.
17
18	#ifndef PARQUET_BLOOM_FILTER_H
19	#define PARQUET_BLOOM_FILTER_H
20
21	#include <cstdint>
22	#include <memory>
23
24	#include "arrow/util/bit-util.h"
25	#include "arrow/util/logging.h"
26	#include "parquet/exception.h"
27	#include "parquet/hasher.h"
28	#include "parquet/types.h"
29	#include "parquet/util/memory.h"
30	#include "parquet/util/visibility.h"
31
32	namespace parquet {
33	class OutputStream;
34
35	// A Bloom filter is a compact structure to indicate whether an item is not in a set or
36	// probably in a set. The Bloom filter usually consists of a bit set that represents a
37	// set of elements, a hash strategy and a Bloom filter algorithm.
38	class PARQUET_EXPORT BloomFilter {
39	public:
40	// Maximum Bloom filter size, it sets to HDFS default block size 128MB
41	// This value will be reconsidered when implementing Bloom filter producer.
42	static constexpr uint32_t kMaximumBloomFilterBytes = `128` * `1024` * `1024`;
43
44	/// Determine whether an element exist in set or not.
45	///
46	/// @param hash the element to contain.
47	/// @return false if value is definitely not in set, and true means PROBABLY
48	/// in set.
49	virtual bool FindHash(uint64_t hash) const = `0`;
50
51	/// Insert element to set represented by Bloom filter bitset.
52	/// @param hash the hash of value to insert into Bloom filter.
53	virtual void InsertHash(uint64_t hash) = `0`;
54
55	/// Write this Bloom filter to an output stream. A Bloom filter structure should
56	/// include bitset length, hash strategy, algorithm, and bitset.
57	///
58	/// @param sink the output stream to write
59	virtual void WriteTo(OutputStream* sink) const = `0`;
60
61	/// Get the number of bytes of bitset
62	virtual uint32_t GetBitsetSize() const = `0`;
63
64	/// Compute hash for 32 bits value by using its plain encoding result.
65	///
66	/// @param value the value to hash.
67	/// @return hash result.
68	virtual uint64_t Hash(int32_t value) const = `0`;
69
70	/// Compute hash for 64 bits value by using its plain encoding result.
71	///
72	/// @param value the value to hash.
73	/// @return hash result.
74	virtual uint64_t Hash(int64_t value) const = `0`;
75
76	/// Compute hash for float value by using its plain encoding result.
77	///
78	/// @param value the value to hash.
79	/// @return hash result.
80	virtual uint64_t Hash(float value) const = `0`;
81
82	/// Compute hash for double value by using its plain encoding result.
83	///
84	/// @param value the value to hash.
85	/// @return hash result.
86	virtual uint64_t Hash(double value) const = `0`;
87
88	/// Compute hash for Int96 value by using its plain encoding result.
89	///
90	/// @param value the value to hash.
91	/// @return hash result.
92	virtual uint64_t Hash(const Int96* value) const = `0`;
93
94	/// Compute hash for ByteArray value by using its plain encoding result.
95	///
96	/// @param value the value to hash.
97	/// @return hash result.
98	virtual uint64_t Hash(const ByteArray* value) const = `0`;
99
100	/// Compute hash for fixed byte array value by using its plain encoding result.
101	///
102	/// @param value the value address.
103	/// @param len the value length.
104	/// @return hash result.
105	virtual uint64_t Hash(const FLBA* value, uint32_t len) const = `0`;
106
107	virtual ~BloomFilter() {}
108
109	protected:
110	// Hash strategy available for Bloom filter.
111	enum class HashStrategy : uint32_t { MURMUR3_X64_128 = `0` };
112
113	// Bloom filter algorithm.
114	enum class Algorithm : uint32_t { BLOCK = `0` };
115	};
116
117	// The BlockSplitBloomFilter is implemented using block-based Bloom filters from
118	// Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to
119	// hash the item to a tiny Bloom filter which size fit a single cache line or smaller.
120	//
121	// This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom
122	// filter is 32 bytes to take advantage of 32-byte SIMD instructions.
123	class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter {
124	public:
125	/// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as hash function.
126	BlockSplitBloomFilter();
127
128	/// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within
129	/// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be
130	/// rounded up/down to lower/upper bound if num_bytes is out of range and also
131	/// will be rounded up to a power of 2.
132	///
133	/// @param num_bytes The number of bytes to store Bloom filter bitset.
134	void Init(uint32_t num_bytes);
135
136	/// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying
137	/// bitset because the given bitset may not satisfy the 32-byte alignment requirement
138	/// which may lead to segfault when performing SIMD instructions. It is the caller's
139	/// responsibility to free the bitset passed in. This is used when reconstructing
140	/// a Bloom filter from a parquet file.
141	///
142	/// @param bitset The given bitset to initialize the Bloom filter.
143	/// @param num_bytes The number of bytes of given bitset.
144	void Init(const uint8_t* bitset, uint32_t num_bytes);
145
146	// Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter.
147	static constexpr uint32_t kMinimumBloomFilterBytes = `32`;
148
149	/// Calculate optimal size according to the number of distinct values and false
150	/// positive probability.
151	///
152	/// @param ndv The number of distinct values.
153	/// @param fpp The false positive probability.
154	/// @return it always return a value between kMinimumBloomFilterBytes and
155	/// kMaximumBloomFilterBytes, and the return value is always a power of 2
156	static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) {
157	DCHECK(fpp > `0.0` && fpp < `1.0`);
158	const double m = -`8.0` * ndv / log(`1` - pow(fpp, `1.0` / `8`));
159	uint32_t num_bits;
160
161	// Handle overflow.
162	if (m < `0` \|\| m > kMaximumBloomFilterBytes << `3`) {
163	num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << `3`);
164	} else {
165	num_bits = static_cast<uint32_t>(m);
166	}
167
168	// Round up to lower bound
169	if (num_bits < kMinimumBloomFilterBytes << `3`) {
170	num_bits = kMinimumBloomFilterBytes << `3`;
171	}
172
173	// Get next power of 2 if bits is not power of 2.
174	if ((num_bits & (num_bits - `1`)) != `0`) {
175	num_bits = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bits));
176	}
177
178	// Round down to upper bound
179	if (num_bits > kMaximumBloomFilterBytes << `3`) {
180	num_bits = kMaximumBloomFilterBytes << `3`;
181	}
182
183	return num_bits;
184	}
185
186	bool FindHash(uint64_t hash) const override;
187	void InsertHash(uint64_t hash) override;
188	void WriteTo(OutputStream* sink) const override;
189	uint32_t GetBitsetSize() const override { return num_bytes_; }
190
191	uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); }
192	uint64_t Hash(float value) const override { return hasher_->Hash(value); }
193	uint64_t Hash(double value) const override { return hasher_->Hash(value); }
194	uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); }
195	uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); }
196	uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); }
197	uint64_t Hash(const FLBA* value, uint32_t len) const override {
198	return hasher_->Hash(value, len);
199	}
200
201	/// Deserialize the Bloom filter from an input stream. It is used when reconstructing
202	/// a Bloom filter from a parquet filter.
203	///
204	/// @param input_stream The input stream from which to construct the Bloom filter
205	/// @return The BlockSplitBloomFilter.
206	static BlockSplitBloomFilter Deserialize(InputStream* input_stream);
207
208	private:
209	// Bytes in a tiny Bloom filter block.
210	static constexpr int kBytesPerFilterBlock = `32`;
211
212	// The number of bits to be set in each tiny Bloom filter
213	static constexpr int kBitsSetPerBlock = `8`;
214
215	// A mask structure used to set bits in each tiny Bloom filter.
216	struct BlockMask {
217	uint32_t item[kBitsSetPerBlock];
218	};
219
220	// The block-based algorithm needs eight odd SALT values to calculate eight indexes
221	// of bit to set, one bit in each 32-bit word.
222	static constexpr uint32_t SALT[kBitsSetPerBlock] = {
223	`0x47b6137bU`, `0x44974d91U`, `0x8824ad5bU`, `0xa2b7289dU`,
224	`0x705495c7U`, `0x2df1424bU`, `0x9efc4947U`, `0x5c6bfb31U`};
225
226	/// Set bits in mask array according to input key.
227	/// @param key the value to calculate mask values.
228	/// @param mask the mask array is used to set inside a block
229	void SetMask(uint32_t key, BlockMask& mask) const;
230
231	// Memory pool to allocate aligned buffer for bitset
232	::arrow::MemoryPool* pool_;
233
234	// The underlying buffer of bitset.
235	std::shared_ptr<Buffer> data_;
236
237	// The number of bytes of Bloom filter bitset.
238	uint32_t num_bytes_;
239
240	// Hash strategy used in this Bloom filter.
241	HashStrategy hash_strategy_;
242
243	// Algorithm used in this Bloom filter.
244	Algorithm algorithm_;
245
246	// The hash pointer points to actual hash class used.
247	std::unique_ptr<Hasher> hasher_;
248	};
249
250	} // namespace parquet
251
252	#endif // PARQUET_BLOOM_FILTER_H
253

Browse the source code of arrow/parquet/bloom_filter.h