// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifndef ROCKSDB_LITE
#include "table/cuckoo_table_reader.h"

#include <algorithm>
#include <limits>
#include <string>
#include <utility>
#include <vector>
#include "rocksdb/iterator.h"
#include "rocksdb/table.h"
#include "table/internal_iterator.h"
#include "table/meta_blocks.h"
#include "table/cuckoo_table_factory.h"
#include "table/get_context.h"
#include "util/arena.h"
#include "util/coding.h"

namespace rocksdb {
namespace {
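// Mask that rounds an address down to the start of the cache line it is in.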
const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1);
const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
}  // namespace

extern const uint64_t kCuckooTableMagicNumber;

CuckooTableReader::CuckooTableReader(
    const ImmutableCFOptions& ioptions,
    std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
    const Comparator* comparator,
    uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
    : file_(std::move(file)),
      is_last_level_(false),
      identity_as_first_hash_(false),
      use_module_hash_(false),
      num_hash_func_(0),
      unused_key_(""),
      key_length_(0),
      user_key_length_(0),
      value_length_(0),
      bucket_length_(0),
      cuckoo_block_size_(0),
      cuckoo_block_bytes_minus_one_(0),
      table_size_(0),
      ucomp_(comparator),
      get_slice_hash_(get_slice_hash) {
  if (!ioptions.allow_mmap_reads) {
    status_ = Status::InvalidArgument("File is not mmapped");
    return;
  }
  TableProperties* props = nullptr;
  status_ = ReadTableProperties(file_.get(), file_size,
                                kCuckooTableMagicNumber, ioptions, &props);
  if (!status_.ok()) {
    return;
  }
  table_props_.reset(props);
  auto& user_props = props->user_collected_properties;
  auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc);
  if (hash_funs == user_props.end()) {
    status_ = Status::Corruption("Number of hash functions not found");
    return;
  }
  num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data());
  auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey);
  if (unused_key == user_props.end()) {
    status_ = Status::Corruption("Empty bucket value not found");
    return;
  }
  unused_key_ = unused_key->second;

  key_length_ = static_cast<uint32_t>(props->fixed_key_len);
  auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength);
  if (user_key_len == user_props.end()) {
    status_ = Status::Corruption("User key length not found");
    return;
  }
  user_key_length_ = *reinterpret_cast<const uint32_t*>(
      user_key_len->second.data());

  auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength);
  if (value_length == user_props.end()) {
    status_ = Status::Corruption("Value length not found");
    return;
  }
  value_length_ = *reinterpret_cast<const uint32_t*>(
      value_length->second.data());
  bucket_length_ = key_length_ + value_length_;

  auto hash_table_size = user_props.find(
      CuckooTablePropertyNames::kHashTableSize);
  if (hash_table_size == user_props.end()) {
    status_ = Status::Corruption("Hash table size not found");
    return;
  }
  table_size_ = *reinterpret_cast<const uint64_t*>(
      hash_table_size->second.data());

  auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel);
  if (is_last_level == user_props.end()) {
    status_ = Status::Corruption("Is last level not found");
    return;
  }
  is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data());

  auto identity_as_first_hash = user_props.find(
      CuckooTablePropertyNames::kIdentityAsFirstHash);
  if (identity_as_first_hash == user_props.end()) {
    status_ = Status::Corruption("identity as first hash not found");
    return;
  }
  identity_as_first_hash_ = *reinterpret_cast<const bool*>(
      identity_as_first_hash->second.data());

  auto use_module_hash = user_props.find(
      CuckooTablePropertyNames::kUseModuleHash);
  if (use_module_hash == user_props.end()) {
    status_ = Status::Corruption("hash type is not found");
    return;
  }
  use_module_hash_ = *reinterpret_cast<const bool*>(
      use_module_hash->second.data());
  auto cuckoo_block_size = user_props.find(
      CuckooTablePropertyNames::kCuckooBlockSize);
  if (cuckoo_block_size == user_props.end()) {
    status_ = Status::Corruption("Cuckoo block size not found");
    return;
  }
  cuckoo_block_size_ = *reinterpret_cast<const uint32_t*>(
      cuckoo_block_size->second.data());
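  // Number of bytes spanned by one cuckoo block, minus one. Prepare() uses
  // this to determine the last byte of a block that needs to be prefetched.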
  cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1;
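  // Mmap reads are required (checked above), so Read() returns a slice that
  // points directly into the mapped file and file_data_ addresses the whole
  // table in memory.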
  status_ = file_->Read(0, file_size, &file_data_, nullptr);
}

Status CuckooTableReader::Get(const ReadOptions& readOptions, const Slice& key,
                              GetContext* get_context, bool skip_filters) {
  assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0));
  Slice user_key = ExtractUserKey(key);
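  // Probe the candidate buckets for this key: each hash function maps the
  // user key to the start of a cuckoo block, and the key, if stored, sits in
  // one of the cuckoo_block_size_ consecutive buckets of such a block.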
  for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
    uint64_t offset = bucket_length_ * CuckooHash(
        user_key, hash_cnt, use_module_hash_, table_size_,
        identity_as_first_hash_, get_slice_hash_);
    const char* bucket = &file_data_.data()[offset];
    for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
         ++block_idx, bucket += bucket_length_) {
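      // A bucket that still holds the reserved unused key is empty; treat
      // the lookup as a miss and stop probing.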
      if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()),
                        Slice(bucket, user_key.size()))) {
        return Status::OK();
      }
      // Compare only the user key part, since the table stores at most one
      // entry per user key and snapshots are not supported.
      if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) {
        Slice value(bucket + key_length_, value_length_);
        if (is_last_level_) {
          // Sequence number is not stored at the last level, so we use
          // kMaxSequenceNumber since it is unknown. This could cause some
          // transactions to fail to lock a key due to the unknown sequence
          // number; however, no one is expected to use a CuckooTable in a
          // TransactionDB.
          get_context->SaveValue(value, kMaxSequenceNumber);
        } else {
          Slice full_key(bucket, key_length_);
          ParsedInternalKey found_ikey;
          ParseInternalKey(full_key, &found_ikey);
          get_context->SaveValue(found_ikey, value);
        }
        // Merge operations are not supported, so we can return as soon as
        // the key is found.
        return Status::OK();
      }
    }
  }
  return Status::OK();
}

void CuckooTableReader::Prepare(const Slice& key) {
  // Prefetch the first Cuckoo Block.
  Slice user_key = ExtractUserKey(key);
  uint64_t addr = reinterpret_cast<uint64_t>(file_data_.data()) +
      bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_,
                                  identity_as_first_hash_, nullptr);
  uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_;
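  // Round the start address down to its cache line and prefetch every cache
  // line the cuckoo block touches.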
  for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) {
    PREFETCH(reinterpret_cast<const char*>(addr), 0, 3);
  }
}

class CuckooTableIterator : public InternalIterator {
 public:
  explicit CuckooTableIterator(CuckooTableReader* reader);
  ~CuckooTableIterator() {}
  bool Valid() const override;
  void SeekToFirst() override;
  void SeekToLast() override;
  void Seek(const Slice& target) override;
  void SeekForPrev(const Slice& target) override;
  void Next() override;
  void Prev() override;
  Slice key() const override;
  Slice value() const override;
  Status status() const override { return status_; }
  void InitIfNeeded();

 private:
  struct BucketComparator {
    BucketComparator(const Slice& file_data, const Comparator* ucomp,
                     uint32_t bucket_len, uint32_t user_key_len,
                     const Slice& target = Slice())
        : file_data_(file_data),
          ucomp_(ucomp),
          bucket_len_(bucket_len),
          user_key_len_(user_key_len),
          target_(target) {}
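    // Buckets are ordered by the user key they store. kInvalidIndex is a
    // sentinel meaning "compare against target_ instead of a bucket", which
    // lets Seek() reuse this comparator for lower_bound against an
    // arbitrary key.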
    bool operator()(const uint32_t first, const uint32_t second) const {
      const char* first_bucket =
          (first == kInvalidIndex) ? target_.data()
                                   : &file_data_.data()[first * bucket_len_];
      const char* second_bucket =
          (second == kInvalidIndex) ? target_.data()
                                    : &file_data_.data()[second * bucket_len_];
      return ucomp_->Compare(Slice(first_bucket, user_key_len_),
                             Slice(second_bucket, user_key_len_)) < 0;
    }

   private:
    const Slice file_data_;
    const Comparator* ucomp_;
    const uint32_t bucket_len_;
    const uint32_t user_key_len_;
    const Slice target_;
  };

  const BucketComparator bucket_comparator_;
  void PrepareKVAtCurrIdx();
  CuckooTableReader* reader_;
  bool initialized_;
  Status status_;
  // Ids of the occupied buckets, sorted by the user key stored in each
  // bucket.
  std::vector<uint32_t> sorted_bucket_ids_;
  // We assume that the number of items can be stored in uint32 (4 Billion).
  uint32_t curr_key_idx_;
  Slice curr_value_;
  IterKey curr_key_;
  // No copying allowed
  CuckooTableIterator(const CuckooTableIterator&) = delete;
  void operator=(const CuckooTableIterator&) = delete;
};

CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
    : bucket_comparator_(reader->file_data_, reader->ucomp_,
                         reader->bucket_length_, reader->user_key_length_),
      reader_(reader),
      initialized_(false),
      curr_key_idx_(kInvalidIndex) {
  sorted_bucket_ids_.clear();
  curr_value_.clear();
  curr_key_.Clear();
}

void CuckooTableIterator::InitIfNeeded() {
  if (initialized_) {
    return;
  }
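  // Scan every bucket once, collect the ids of the occupied buckets (those
  // not holding the reserved unused key), and sort them by user key so that
  // iteration visits entries in comparator order.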
  sorted_bucket_ids_.reserve(reader_->GetTableProperties()->num_entries);
  uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1;
  assert(num_buckets < kInvalidIndex);
  const char* bucket = reader_->file_data_.data();
  for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) {
    if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) {
      sorted_bucket_ids_.push_back(bucket_id);
    }
    bucket += reader_->bucket_length_;
  }
  assert(sorted_bucket_ids_.size() ==
         reader_->GetTableProperties()->num_entries);
  std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
            bucket_comparator_);
  curr_key_idx_ = kInvalidIndex;
  initialized_ = true;
}

void CuckooTableIterator::SeekToFirst() {
  InitIfNeeded();
  curr_key_idx_ = 0;
  PrepareKVAtCurrIdx();
}

void CuckooTableIterator::SeekToLast() {
  InitIfNeeded();
  curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1;
  PrepareKVAtCurrIdx();
}

void CuckooTableIterator::Seek(const Slice& target) {
  InitIfNeeded();
  const BucketComparator seek_comparator(
      reader_->file_data_, reader_->ucomp_,
      reader_->bucket_length_, reader_->user_key_length_,
      ExtractUserKey(target));
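  // kInvalidIndex stands in for the seek target inside seek_comparator, so
  // lower_bound returns the first bucket whose user key is >= target.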
  auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(),
                                  sorted_bucket_ids_.end(),
                                  kInvalidIndex,
                                  seek_comparator);
  curr_key_idx_ =
      static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it));
  PrepareKVAtCurrIdx();
}

void CuckooTableIterator::SeekForPrev(const Slice& target) {
  // Not supported
  assert(false);
}

bool CuckooTableIterator::Valid() const {
  return curr_key_idx_ < sorted_bucket_ids_.size();
}

void CuckooTableIterator::PrepareKVAtCurrIdx() {
  if (!Valid()) {
    curr_value_.clear();
    curr_key_.Clear();
    return;
  }
  uint32_t id = sorted_bucket_ids_[curr_key_idx_];
  const char* offset = reader_->file_data_.data() +
      id * reader_->bucket_length_;
  if (reader_->is_last_level_) {
    // Last-level buckets store only the user key, so synthesize an internal
    // key with sequence number 0 and type kTypeValue.
    curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_),
                             0, kTypeValue);
  } else {
    curr_key_.SetInternalKey(Slice(offset, reader_->key_length_));
  }
  curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_);
}

void CuckooTableIterator::Next() {
  if (!Valid()) {
    curr_value_.clear();
    curr_key_.Clear();
    return;
  }
  ++curr_key_idx_;
  PrepareKVAtCurrIdx();
}

void CuckooTableIterator::Prev() {
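  // Stepping back from the first entry, or from an already invalid position,
  // leaves the iterator invalid.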
  if (curr_key_idx_ == 0) {
    curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size());
  }
  if (!Valid()) {
    curr_value_.clear();
    curr_key_.Clear();
    return;
  }
  --curr_key_idx_;
  PrepareKVAtCurrIdx();
}

Slice CuckooTableIterator::key() const {
  assert(Valid());
  return curr_key_.GetInternalKey();
}

Slice CuckooTableIterator::value() const {
  assert(Valid());
  return curr_value_;
}

extern InternalIterator* NewErrorInternalIterator(const Status& status,
                                                  Arena* arena);

InternalIterator* CuckooTableReader::NewIterator(
    const ReadOptions& read_options, Arena* arena, bool skip_filters) {
  if (!status().ok()) {
    return NewErrorInternalIterator(
        Status::Corruption("CuckooTableReader status is not okay."), arena);
  }
  CuckooTableIterator* iter;
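  // With an arena, construct the iterator in arena-owned memory so it is
  // released with the arena; otherwise the caller owns the heap allocation.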
  if (arena == nullptr) {
    iter = new CuckooTableIterator(this);
  } else {
    auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator));
    iter = new (iter_mem) CuckooTableIterator(this);
  }
  return iter;
}

size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; }

}  // namespace rocksdb
#endif  // ROCKSDB_LITE