1 | // Licensed to the Apache Software Foundation (ASF) under one |
2 | // or more contributor license agreements. See the NOTICE file |
3 | // distributed with this work for additional information |
4 | // regarding copyright ownership. The ASF licenses this file |
5 | // to you under the Apache License, Version 2.0 (the |
6 | // "License"); you may not use this file except in compliance |
7 | // with the License. You may obtain a copy of the License at |
8 | // |
9 | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | // |
11 | // Unless required by applicable law or agreed to in writing, |
12 | // software distributed under the License is distributed on an |
13 | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | // KIND, either express or implied. See the License for the |
15 | // specific language governing permissions and limitations |
16 | // under the License. |
17 | |
18 | #ifndef PARQUET_BLOOM_FILTER_H |
19 | #define PARQUET_BLOOM_FILTER_H |
20 | |
21 | #include <cstdint> |
22 | #include <memory> |
23 | |
24 | #include "arrow/util/bit-util.h" |
25 | #include "arrow/util/logging.h" |
26 | #include "parquet/exception.h" |
27 | #include "parquet/hasher.h" |
28 | #include "parquet/types.h" |
29 | #include "parquet/util/memory.h" |
30 | #include "parquet/util/visibility.h" |
31 | |
32 | namespace parquet { |
33 | class OutputStream; |
34 | |
35 | // A Bloom filter is a compact structure to indicate whether an item is not in a set or |
36 | // probably in a set. The Bloom filter usually consists of a bit set that represents a |
37 | // set of elements, a hash strategy and a Bloom filter algorithm. |
38 | class PARQUET_EXPORT BloomFilter { |
39 | public: |
40 | // Maximum Bloom filter size, it sets to HDFS default block size 128MB |
41 | // This value will be reconsidered when implementing Bloom filter producer. |
42 | static constexpr uint32_t kMaximumBloomFilterBytes = 128 * 1024 * 1024; |
43 | |
44 | /// Determine whether an element exist in set or not. |
45 | /// |
46 | /// @param hash the element to contain. |
47 | /// @return false if value is definitely not in set, and true means PROBABLY |
48 | /// in set. |
49 | virtual bool FindHash(uint64_t hash) const = 0; |
50 | |
51 | /// Insert element to set represented by Bloom filter bitset. |
52 | /// @param hash the hash of value to insert into Bloom filter. |
53 | virtual void InsertHash(uint64_t hash) = 0; |
54 | |
55 | /// Write this Bloom filter to an output stream. A Bloom filter structure should |
56 | /// include bitset length, hash strategy, algorithm, and bitset. |
57 | /// |
58 | /// @param sink the output stream to write |
59 | virtual void WriteTo(OutputStream* sink) const = 0; |
60 | |
61 | /// Get the number of bytes of bitset |
62 | virtual uint32_t GetBitsetSize() const = 0; |
63 | |
64 | /// Compute hash for 32 bits value by using its plain encoding result. |
65 | /// |
66 | /// @param value the value to hash. |
67 | /// @return hash result. |
68 | virtual uint64_t Hash(int32_t value) const = 0; |
69 | |
70 | /// Compute hash for 64 bits value by using its plain encoding result. |
71 | /// |
72 | /// @param value the value to hash. |
73 | /// @return hash result. |
74 | virtual uint64_t Hash(int64_t value) const = 0; |
75 | |
76 | /// Compute hash for float value by using its plain encoding result. |
77 | /// |
78 | /// @param value the value to hash. |
79 | /// @return hash result. |
80 | virtual uint64_t Hash(float value) const = 0; |
81 | |
82 | /// Compute hash for double value by using its plain encoding result. |
83 | /// |
84 | /// @param value the value to hash. |
85 | /// @return hash result. |
86 | virtual uint64_t Hash(double value) const = 0; |
87 | |
88 | /// Compute hash for Int96 value by using its plain encoding result. |
89 | /// |
90 | /// @param value the value to hash. |
91 | /// @return hash result. |
92 | virtual uint64_t Hash(const Int96* value) const = 0; |
93 | |
94 | /// Compute hash for ByteArray value by using its plain encoding result. |
95 | /// |
96 | /// @param value the value to hash. |
97 | /// @return hash result. |
98 | virtual uint64_t Hash(const ByteArray* value) const = 0; |
99 | |
100 | /// Compute hash for fixed byte array value by using its plain encoding result. |
101 | /// |
102 | /// @param value the value address. |
103 | /// @param len the value length. |
104 | /// @return hash result. |
105 | virtual uint64_t Hash(const FLBA* value, uint32_t len) const = 0; |
106 | |
107 | virtual ~BloomFilter() {} |
108 | |
109 | protected: |
110 | // Hash strategy available for Bloom filter. |
111 | enum class HashStrategy : uint32_t { MURMUR3_X64_128 = 0 }; |
112 | |
113 | // Bloom filter algorithm. |
114 | enum class Algorithm : uint32_t { BLOCK = 0 }; |
115 | }; |
116 | |
117 | // The BlockSplitBloomFilter is implemented using block-based Bloom filters from |
118 | // Putze et al.'s "Cache-,Hash- and Space-Efficient Bloom filters". The basic idea is to |
119 | // hash the item to a tiny Bloom filter which size fit a single cache line or smaller. |
120 | // |
121 | // This implementation sets 8 bits in each tiny Bloom filter. Each tiny Bloom |
122 | // filter is 32 bytes to take advantage of 32-byte SIMD instructions. |
123 | class PARQUET_EXPORT BlockSplitBloomFilter : public BloomFilter { |
124 | public: |
125 | /// The constructor of BlockSplitBloomFilter. It uses murmur3_x64_128 as hash function. |
126 | BlockSplitBloomFilter(); |
127 | |
128 | /// Initialize the BlockSplitBloomFilter. The range of num_bytes should be within |
129 | /// [kMinimumBloomFilterBytes, kMaximumBloomFilterBytes], it will be |
130 | /// rounded up/down to lower/upper bound if num_bytes is out of range and also |
131 | /// will be rounded up to a power of 2. |
132 | /// |
133 | /// @param num_bytes The number of bytes to store Bloom filter bitset. |
134 | void Init(uint32_t num_bytes); |
135 | |
136 | /// Initialize the BlockSplitBloomFilter. It copies the bitset as underlying |
137 | /// bitset because the given bitset may not satisfy the 32-byte alignment requirement |
138 | /// which may lead to segfault when performing SIMD instructions. It is the caller's |
139 | /// responsibility to free the bitset passed in. This is used when reconstructing |
140 | /// a Bloom filter from a parquet file. |
141 | /// |
142 | /// @param bitset The given bitset to initialize the Bloom filter. |
143 | /// @param num_bytes The number of bytes of given bitset. |
144 | void Init(const uint8_t* bitset, uint32_t num_bytes); |
145 | |
146 | // Minimum Bloom filter size, it sets to 32 bytes to fit a tiny Bloom filter. |
147 | static constexpr uint32_t kMinimumBloomFilterBytes = 32; |
148 | |
149 | /// Calculate optimal size according to the number of distinct values and false |
150 | /// positive probability. |
151 | /// |
152 | /// @param ndv The number of distinct values. |
153 | /// @param fpp The false positive probability. |
154 | /// @return it always return a value between kMinimumBloomFilterBytes and |
155 | /// kMaximumBloomFilterBytes, and the return value is always a power of 2 |
156 | static uint32_t OptimalNumOfBits(uint32_t ndv, double fpp) { |
157 | DCHECK(fpp > 0.0 && fpp < 1.0); |
158 | const double m = -8.0 * ndv / log(1 - pow(fpp, 1.0 / 8)); |
159 | uint32_t num_bits; |
160 | |
161 | // Handle overflow. |
162 | if (m < 0 || m > kMaximumBloomFilterBytes << 3) { |
163 | num_bits = static_cast<uint32_t>(kMaximumBloomFilterBytes << 3); |
164 | } else { |
165 | num_bits = static_cast<uint32_t>(m); |
166 | } |
167 | |
168 | // Round up to lower bound |
169 | if (num_bits < kMinimumBloomFilterBytes << 3) { |
170 | num_bits = kMinimumBloomFilterBytes << 3; |
171 | } |
172 | |
173 | // Get next power of 2 if bits is not power of 2. |
174 | if ((num_bits & (num_bits - 1)) != 0) { |
175 | num_bits = static_cast<uint32_t>(::arrow::BitUtil::NextPower2(num_bits)); |
176 | } |
177 | |
178 | // Round down to upper bound |
179 | if (num_bits > kMaximumBloomFilterBytes << 3) { |
180 | num_bits = kMaximumBloomFilterBytes << 3; |
181 | } |
182 | |
183 | return num_bits; |
184 | } |
185 | |
186 | bool FindHash(uint64_t hash) const override; |
187 | void InsertHash(uint64_t hash) override; |
188 | void WriteTo(OutputStream* sink) const override; |
189 | uint32_t GetBitsetSize() const override { return num_bytes_; } |
190 | |
191 | uint64_t Hash(int64_t value) const override { return hasher_->Hash(value); } |
192 | uint64_t Hash(float value) const override { return hasher_->Hash(value); } |
193 | uint64_t Hash(double value) const override { return hasher_->Hash(value); } |
194 | uint64_t Hash(const Int96* value) const override { return hasher_->Hash(value); } |
195 | uint64_t Hash(const ByteArray* value) const override { return hasher_->Hash(value); } |
196 | uint64_t Hash(int32_t value) const override { return hasher_->Hash(value); } |
197 | uint64_t Hash(const FLBA* value, uint32_t len) const override { |
198 | return hasher_->Hash(value, len); |
199 | } |
200 | |
201 | /// Deserialize the Bloom filter from an input stream. It is used when reconstructing |
202 | /// a Bloom filter from a parquet filter. |
203 | /// |
204 | /// @param input_stream The input stream from which to construct the Bloom filter |
205 | /// @return The BlockSplitBloomFilter. |
206 | static BlockSplitBloomFilter Deserialize(InputStream* input_stream); |
207 | |
208 | private: |
209 | // Bytes in a tiny Bloom filter block. |
210 | static constexpr int kBytesPerFilterBlock = 32; |
211 | |
212 | // The number of bits to be set in each tiny Bloom filter |
213 | static constexpr int kBitsSetPerBlock = 8; |
214 | |
215 | // A mask structure used to set bits in each tiny Bloom filter. |
216 | struct BlockMask { |
217 | uint32_t item[kBitsSetPerBlock]; |
218 | }; |
219 | |
220 | // The block-based algorithm needs eight odd SALT values to calculate eight indexes |
221 | // of bit to set, one bit in each 32-bit word. |
222 | static constexpr uint32_t SALT[kBitsSetPerBlock] = { |
223 | 0x47b6137bU, 0x44974d91U, 0x8824ad5bU, 0xa2b7289dU, |
224 | 0x705495c7U, 0x2df1424bU, 0x9efc4947U, 0x5c6bfb31U}; |
225 | |
226 | /// Set bits in mask array according to input key. |
227 | /// @param key the value to calculate mask values. |
228 | /// @param mask the mask array is used to set inside a block |
229 | void SetMask(uint32_t key, BlockMask& mask) const; |
230 | |
231 | // Memory pool to allocate aligned buffer for bitset |
232 | ::arrow::MemoryPool* pool_; |
233 | |
234 | // The underlying buffer of bitset. |
235 | std::shared_ptr<Buffer> data_; |
236 | |
237 | // The number of bytes of Bloom filter bitset. |
238 | uint32_t num_bytes_; |
239 | |
240 | // Hash strategy used in this Bloom filter. |
241 | HashStrategy hash_strategy_; |
242 | |
243 | // Algorithm used in this Bloom filter. |
244 | Algorithm algorithm_; |
245 | |
246 | // The hash pointer points to actual hash class used. |
247 | std::unique_ptr<Hasher> hasher_; |
248 | }; |
249 | |
250 | } // namespace parquet |
251 | |
252 | #endif // PARQUET_BLOOM_FILTER_H |
253 | |