1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License |
4 | // (found in the LICENSE.Apache file in the root directory). |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | // Use of this source code is governed by a BSD-style license that can be |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | |
10 | #ifndef ROCKSDB_LITE |
11 | #include "table/cuckoo_table_reader.h" |
12 | |
13 | #include <algorithm> |
14 | #include <limits> |
15 | #include <string> |
16 | #include <utility> |
17 | #include <vector> |
18 | #include "rocksdb/iterator.h" |
19 | #include "rocksdb/table.h" |
20 | #include "table/internal_iterator.h" |
21 | #include "table/meta_blocks.h" |
22 | #include "table/cuckoo_table_factory.h" |
23 | #include "table/get_context.h" |
24 | #include "util/arena.h" |
25 | #include "util/coding.h" |
26 | |
27 | namespace rocksdb { |
28 | namespace { |
29 | const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1); |
30 | const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max(); |
31 | } |
32 | |
33 | extern const uint64_t kCuckooTableMagicNumber; |
34 | |
35 | CuckooTableReader::CuckooTableReader( |
36 | const ImmutableCFOptions& ioptions, |
37 | std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size, |
38 | const Comparator* comparator, |
39 | uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t)) |
40 | : file_(std::move(file)), |
41 | is_last_level_(false), |
42 | identity_as_first_hash_(false), |
43 | use_module_hash_(false), |
44 | num_hash_func_(0), |
45 | unused_key_("" ), |
46 | key_length_(0), |
47 | user_key_length_(0), |
48 | value_length_(0), |
49 | bucket_length_(0), |
50 | cuckoo_block_size_(0), |
51 | cuckoo_block_bytes_minus_one_(0), |
52 | table_size_(0), |
53 | ucomp_(comparator), |
54 | get_slice_hash_(get_slice_hash) { |
55 | if (!ioptions.allow_mmap_reads) { |
56 | status_ = Status::InvalidArgument("File is not mmaped" ); |
57 | } |
58 | TableProperties* props = nullptr; |
59 | status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber, |
60 | ioptions, &props); |
61 | if (!status_.ok()) { |
62 | return; |
63 | } |
64 | table_props_.reset(props); |
65 | auto& user_props = props->user_collected_properties; |
66 | auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc); |
67 | if (hash_funs == user_props.end()) { |
68 | status_ = Status::Corruption("Number of hash functions not found" ); |
69 | return; |
70 | } |
71 | num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data()); |
72 | auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey); |
73 | if (unused_key == user_props.end()) { |
74 | status_ = Status::Corruption("Empty bucket value not found" ); |
75 | return; |
76 | } |
77 | unused_key_ = unused_key->second; |
78 | |
79 | key_length_ = static_cast<uint32_t>(props->fixed_key_len); |
80 | auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength); |
81 | if (user_key_len == user_props.end()) { |
82 | status_ = Status::Corruption("User key length not found" ); |
83 | return; |
84 | } |
85 | user_key_length_ = *reinterpret_cast<const uint32_t*>( |
86 | user_key_len->second.data()); |
87 | |
88 | auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength); |
89 | if (value_length == user_props.end()) { |
90 | status_ = Status::Corruption("Value length not found" ); |
91 | return; |
92 | } |
93 | value_length_ = *reinterpret_cast<const uint32_t*>( |
94 | value_length->second.data()); |
95 | bucket_length_ = key_length_ + value_length_; |
96 | |
97 | auto hash_table_size = user_props.find( |
98 | CuckooTablePropertyNames::kHashTableSize); |
99 | if (hash_table_size == user_props.end()) { |
100 | status_ = Status::Corruption("Hash table size not found" ); |
101 | return; |
102 | } |
103 | table_size_ = *reinterpret_cast<const uint64_t*>( |
104 | hash_table_size->second.data()); |
105 | |
106 | auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel); |
107 | if (is_last_level == user_props.end()) { |
108 | status_ = Status::Corruption("Is last level not found" ); |
109 | return; |
110 | } |
111 | is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data()); |
112 | |
113 | auto identity_as_first_hash = user_props.find( |
114 | CuckooTablePropertyNames::kIdentityAsFirstHash); |
115 | if (identity_as_first_hash == user_props.end()) { |
116 | status_ = Status::Corruption("identity as first hash not found" ); |
117 | return; |
118 | } |
119 | identity_as_first_hash_ = *reinterpret_cast<const bool*>( |
120 | identity_as_first_hash->second.data()); |
121 | |
122 | auto use_module_hash = user_props.find( |
123 | CuckooTablePropertyNames::kUseModuleHash); |
124 | if (use_module_hash == user_props.end()) { |
125 | status_ = Status::Corruption("hash type is not found" ); |
126 | return; |
127 | } |
128 | use_module_hash_ = *reinterpret_cast<const bool*>( |
129 | use_module_hash->second.data()); |
130 | auto cuckoo_block_size = user_props.find( |
131 | CuckooTablePropertyNames::kCuckooBlockSize); |
132 | if (cuckoo_block_size == user_props.end()) { |
133 | status_ = Status::Corruption("Cuckoo block size not found" ); |
134 | return; |
135 | } |
136 | cuckoo_block_size_ = *reinterpret_cast<const uint32_t*>( |
137 | cuckoo_block_size->second.data()); |
138 | cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1; |
139 | status_ = file_->Read(0, file_size, &file_data_, nullptr); |
140 | } |
141 | |
142 | Status CuckooTableReader::Get(const ReadOptions& readOptions, const Slice& key, |
143 | GetContext* get_context, bool skip_filters) { |
144 | assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0)); |
145 | Slice user_key = ExtractUserKey(key); |
146 | for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) { |
147 | uint64_t offset = bucket_length_ * CuckooHash( |
148 | user_key, hash_cnt, use_module_hash_, table_size_, |
149 | identity_as_first_hash_, get_slice_hash_); |
150 | const char* bucket = &file_data_.data()[offset]; |
151 | for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_; |
152 | ++block_idx, bucket += bucket_length_) { |
153 | if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()), |
154 | Slice(bucket, user_key.size()))) { |
155 | return Status::OK(); |
156 | } |
157 | // Here, we compare only the user key part as we support only one entry |
158 | // per user key and we don't support snapshot. |
159 | if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) { |
160 | Slice value(bucket + key_length_, value_length_); |
161 | if (is_last_level_) { |
162 | // Sequence number is not stored at the last level, so we will use |
163 | // kMaxSequenceNumber since it is unknown. This could cause some |
164 | // transactions to fail to lock a key due to known sequence number. |
165 | // However, it is expected for anyone to use a CuckooTable in a |
166 | // TransactionDB. |
167 | get_context->SaveValue(value, kMaxSequenceNumber); |
168 | } else { |
169 | Slice full_key(bucket, key_length_); |
170 | ParsedInternalKey found_ikey; |
171 | ParseInternalKey(full_key, &found_ikey); |
172 | get_context->SaveValue(found_ikey, value); |
173 | } |
174 | // We don't support merge operations. So, we return here. |
175 | return Status::OK(); |
176 | } |
177 | } |
178 | } |
179 | return Status::OK(); |
180 | } |
181 | |
182 | void CuckooTableReader::Prepare(const Slice& key) { |
183 | // Prefetch the first Cuckoo Block. |
184 | Slice user_key = ExtractUserKey(key); |
185 | uint64_t addr = reinterpret_cast<uint64_t>(file_data_.data()) + |
186 | bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_, |
187 | identity_as_first_hash_, nullptr); |
188 | uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_; |
189 | for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) { |
190 | PREFETCH(reinterpret_cast<const char*>(addr), 0, 3); |
191 | } |
192 | } |
193 | |
194 | class CuckooTableIterator : public InternalIterator { |
195 | public: |
196 | explicit CuckooTableIterator(CuckooTableReader* reader); |
197 | ~CuckooTableIterator() {} |
198 | bool Valid() const override; |
199 | void SeekToFirst() override; |
200 | void SeekToLast() override; |
201 | void Seek(const Slice& target) override; |
202 | void SeekForPrev(const Slice& target) override; |
203 | void Next() override; |
204 | void Prev() override; |
205 | Slice key() const override; |
206 | Slice value() const override; |
207 | Status status() const override { return status_; } |
208 | void InitIfNeeded(); |
209 | |
210 | private: |
211 | struct BucketComparator { |
212 | BucketComparator(const Slice& file_data, const Comparator* ucomp, |
213 | uint32_t bucket_len, uint32_t user_key_len, |
214 | const Slice& target = Slice()) |
215 | : file_data_(file_data), |
216 | ucomp_(ucomp), |
217 | bucket_len_(bucket_len), |
218 | user_key_len_(user_key_len), |
219 | target_(target) {} |
220 | bool operator()(const uint32_t first, const uint32_t second) const { |
221 | const char* first_bucket = |
222 | (first == kInvalidIndex) ? target_.data() : |
223 | &file_data_.data()[first * bucket_len_]; |
224 | const char* second_bucket = |
225 | (second == kInvalidIndex) ? target_.data() : |
226 | &file_data_.data()[second * bucket_len_]; |
227 | return ucomp_->Compare(Slice(first_bucket, user_key_len_), |
228 | Slice(second_bucket, user_key_len_)) < 0; |
229 | } |
230 | private: |
231 | const Slice file_data_; |
232 | const Comparator* ucomp_; |
233 | const uint32_t bucket_len_; |
234 | const uint32_t user_key_len_; |
235 | const Slice target_; |
236 | }; |
237 | |
238 | const BucketComparator bucket_comparator_; |
239 | void PrepareKVAtCurrIdx(); |
240 | CuckooTableReader* reader_; |
241 | bool initialized_; |
242 | Status status_; |
243 | // Contains a map of keys to bucket_id sorted in key order. |
244 | std::vector<uint32_t> sorted_bucket_ids_; |
245 | // We assume that the number of items can be stored in uint32 (4 Billion). |
246 | uint32_t curr_key_idx_; |
247 | Slice curr_value_; |
248 | IterKey curr_key_; |
249 | // No copying allowed |
250 | CuckooTableIterator(const CuckooTableIterator&) = delete; |
251 | void operator=(const Iterator&) = delete; |
252 | }; |
253 | |
254 | CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader) |
255 | : bucket_comparator_(reader->file_data_, reader->ucomp_, |
256 | reader->bucket_length_, reader->user_key_length_), |
257 | reader_(reader), |
258 | initialized_(false), |
259 | curr_key_idx_(kInvalidIndex) { |
260 | sorted_bucket_ids_.clear(); |
261 | curr_value_.clear(); |
262 | curr_key_.Clear(); |
263 | } |
264 | |
265 | void CuckooTableIterator::InitIfNeeded() { |
266 | if (initialized_) { |
267 | return; |
268 | } |
269 | sorted_bucket_ids_.reserve(reader_->GetTableProperties()->num_entries); |
270 | uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1; |
271 | assert(num_buckets < kInvalidIndex); |
272 | const char* bucket = reader_->file_data_.data(); |
273 | for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) { |
274 | if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) { |
275 | sorted_bucket_ids_.push_back(bucket_id); |
276 | } |
277 | bucket += reader_->bucket_length_; |
278 | } |
279 | assert(sorted_bucket_ids_.size() == |
280 | reader_->GetTableProperties()->num_entries); |
281 | std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(), |
282 | bucket_comparator_); |
283 | curr_key_idx_ = kInvalidIndex; |
284 | initialized_ = true; |
285 | } |
286 | |
287 | void CuckooTableIterator::SeekToFirst() { |
288 | InitIfNeeded(); |
289 | curr_key_idx_ = 0; |
290 | PrepareKVAtCurrIdx(); |
291 | } |
292 | |
293 | void CuckooTableIterator::SeekToLast() { |
294 | InitIfNeeded(); |
295 | curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1; |
296 | PrepareKVAtCurrIdx(); |
297 | } |
298 | |
299 | void CuckooTableIterator::Seek(const Slice& target) { |
300 | InitIfNeeded(); |
301 | const BucketComparator seek_comparator( |
302 | reader_->file_data_, reader_->ucomp_, |
303 | reader_->bucket_length_, reader_->user_key_length_, |
304 | ExtractUserKey(target)); |
305 | auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(), |
306 | sorted_bucket_ids_.end(), |
307 | kInvalidIndex, |
308 | seek_comparator); |
309 | curr_key_idx_ = |
310 | static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it)); |
311 | PrepareKVAtCurrIdx(); |
312 | } |
313 | |
314 | void CuckooTableIterator::SeekForPrev(const Slice& target) { |
315 | // Not supported |
316 | assert(false); |
317 | } |
318 | |
319 | bool CuckooTableIterator::Valid() const { |
320 | return curr_key_idx_ < sorted_bucket_ids_.size(); |
321 | } |
322 | |
323 | void CuckooTableIterator::PrepareKVAtCurrIdx() { |
324 | if (!Valid()) { |
325 | curr_value_.clear(); |
326 | curr_key_.Clear(); |
327 | return; |
328 | } |
329 | uint32_t id = sorted_bucket_ids_[curr_key_idx_]; |
330 | const char* offset = reader_->file_data_.data() + |
331 | id * reader_->bucket_length_; |
332 | if (reader_->is_last_level_) { |
333 | // Always return internal key. |
334 | curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_), |
335 | 0, kTypeValue); |
336 | } else { |
337 | curr_key_.SetInternalKey(Slice(offset, reader_->key_length_)); |
338 | } |
339 | curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_); |
340 | } |
341 | |
342 | void CuckooTableIterator::Next() { |
343 | if (!Valid()) { |
344 | curr_value_.clear(); |
345 | curr_key_.Clear(); |
346 | return; |
347 | } |
348 | ++curr_key_idx_; |
349 | PrepareKVAtCurrIdx(); |
350 | } |
351 | |
352 | void CuckooTableIterator::Prev() { |
353 | if (curr_key_idx_ == 0) { |
354 | curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()); |
355 | } |
356 | if (!Valid()) { |
357 | curr_value_.clear(); |
358 | curr_key_.Clear(); |
359 | return; |
360 | } |
361 | --curr_key_idx_; |
362 | PrepareKVAtCurrIdx(); |
363 | } |
364 | |
365 | Slice CuckooTableIterator::key() const { |
366 | assert(Valid()); |
367 | return curr_key_.GetInternalKey(); |
368 | } |
369 | |
370 | Slice CuckooTableIterator::value() const { |
371 | assert(Valid()); |
372 | return curr_value_; |
373 | } |
374 | |
375 | extern InternalIterator* NewErrorInternalIterator(const Status& status, |
376 | Arena* arena); |
377 | |
378 | InternalIterator* CuckooTableReader::NewIterator( |
379 | const ReadOptions& read_options, Arena* arena, bool skip_filters) { |
380 | if (!status().ok()) { |
381 | return NewErrorInternalIterator( |
382 | Status::Corruption("CuckooTableReader status is not okay." ), arena); |
383 | } |
384 | CuckooTableIterator* iter; |
385 | if (arena == nullptr) { |
386 | iter = new CuckooTableIterator(this); |
387 | } else { |
388 | auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator)); |
389 | iter = new (iter_mem) CuckooTableIterator(this); |
390 | } |
391 | return iter; |
392 | } |
393 | |
394 | size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; } |
395 | |
396 | } // namespace rocksdb |
397 | #endif |
398 | |