1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#ifndef ROCKSDB_LITE
11#include "table/cuckoo_table_reader.h"
12
13#include <algorithm>
14#include <limits>
15#include <string>
16#include <utility>
17#include <vector>
18#include "rocksdb/iterator.h"
19#include "rocksdb/table.h"
20#include "table/internal_iterator.h"
21#include "table/meta_blocks.h"
22#include "table/cuckoo_table_factory.h"
23#include "table/get_context.h"
24#include "util/arena.h"
25#include "util/coding.h"
26
27namespace rocksdb {
28namespace {
29const uint64_t CACHE_LINE_MASK = ~((uint64_t)CACHE_LINE_SIZE - 1);
30const uint32_t kInvalidIndex = std::numeric_limits<uint32_t>::max();
31}
32
33extern const uint64_t kCuckooTableMagicNumber;
34
35CuckooTableReader::CuckooTableReader(
36 const ImmutableCFOptions& ioptions,
37 std::unique_ptr<RandomAccessFileReader>&& file, uint64_t file_size,
38 const Comparator* comparator,
39 uint64_t (*get_slice_hash)(const Slice&, uint32_t, uint64_t))
40 : file_(std::move(file)),
41 is_last_level_(false),
42 identity_as_first_hash_(false),
43 use_module_hash_(false),
44 num_hash_func_(0),
45 unused_key_(""),
46 key_length_(0),
47 user_key_length_(0),
48 value_length_(0),
49 bucket_length_(0),
50 cuckoo_block_size_(0),
51 cuckoo_block_bytes_minus_one_(0),
52 table_size_(0),
53 ucomp_(comparator),
54 get_slice_hash_(get_slice_hash) {
55 if (!ioptions.allow_mmap_reads) {
56 status_ = Status::InvalidArgument("File is not mmaped");
57 }
58 TableProperties* props = nullptr;
59 status_ = ReadTableProperties(file_.get(), file_size, kCuckooTableMagicNumber,
60 ioptions, &props);
61 if (!status_.ok()) {
62 return;
63 }
64 table_props_.reset(props);
65 auto& user_props = props->user_collected_properties;
66 auto hash_funs = user_props.find(CuckooTablePropertyNames::kNumHashFunc);
67 if (hash_funs == user_props.end()) {
68 status_ = Status::Corruption("Number of hash functions not found");
69 return;
70 }
71 num_hash_func_ = *reinterpret_cast<const uint32_t*>(hash_funs->second.data());
72 auto unused_key = user_props.find(CuckooTablePropertyNames::kEmptyKey);
73 if (unused_key == user_props.end()) {
74 status_ = Status::Corruption("Empty bucket value not found");
75 return;
76 }
77 unused_key_ = unused_key->second;
78
79 key_length_ = static_cast<uint32_t>(props->fixed_key_len);
80 auto user_key_len = user_props.find(CuckooTablePropertyNames::kUserKeyLength);
81 if (user_key_len == user_props.end()) {
82 status_ = Status::Corruption("User key length not found");
83 return;
84 }
85 user_key_length_ = *reinterpret_cast<const uint32_t*>(
86 user_key_len->second.data());
87
88 auto value_length = user_props.find(CuckooTablePropertyNames::kValueLength);
89 if (value_length == user_props.end()) {
90 status_ = Status::Corruption("Value length not found");
91 return;
92 }
93 value_length_ = *reinterpret_cast<const uint32_t*>(
94 value_length->second.data());
95 bucket_length_ = key_length_ + value_length_;
96
97 auto hash_table_size = user_props.find(
98 CuckooTablePropertyNames::kHashTableSize);
99 if (hash_table_size == user_props.end()) {
100 status_ = Status::Corruption("Hash table size not found");
101 return;
102 }
103 table_size_ = *reinterpret_cast<const uint64_t*>(
104 hash_table_size->second.data());
105
106 auto is_last_level = user_props.find(CuckooTablePropertyNames::kIsLastLevel);
107 if (is_last_level == user_props.end()) {
108 status_ = Status::Corruption("Is last level not found");
109 return;
110 }
111 is_last_level_ = *reinterpret_cast<const bool*>(is_last_level->second.data());
112
113 auto identity_as_first_hash = user_props.find(
114 CuckooTablePropertyNames::kIdentityAsFirstHash);
115 if (identity_as_first_hash == user_props.end()) {
116 status_ = Status::Corruption("identity as first hash not found");
117 return;
118 }
119 identity_as_first_hash_ = *reinterpret_cast<const bool*>(
120 identity_as_first_hash->second.data());
121
122 auto use_module_hash = user_props.find(
123 CuckooTablePropertyNames::kUseModuleHash);
124 if (use_module_hash == user_props.end()) {
125 status_ = Status::Corruption("hash type is not found");
126 return;
127 }
128 use_module_hash_ = *reinterpret_cast<const bool*>(
129 use_module_hash->second.data());
130 auto cuckoo_block_size = user_props.find(
131 CuckooTablePropertyNames::kCuckooBlockSize);
132 if (cuckoo_block_size == user_props.end()) {
133 status_ = Status::Corruption("Cuckoo block size not found");
134 return;
135 }
136 cuckoo_block_size_ = *reinterpret_cast<const uint32_t*>(
137 cuckoo_block_size->second.data());
138 cuckoo_block_bytes_minus_one_ = cuckoo_block_size_ * bucket_length_ - 1;
139 status_ = file_->Read(0, file_size, &file_data_, nullptr);
140}
141
142Status CuckooTableReader::Get(const ReadOptions& readOptions, const Slice& key,
143 GetContext* get_context, bool skip_filters) {
144 assert(key.size() == key_length_ + (is_last_level_ ? 8 : 0));
145 Slice user_key = ExtractUserKey(key);
146 for (uint32_t hash_cnt = 0; hash_cnt < num_hash_func_; ++hash_cnt) {
147 uint64_t offset = bucket_length_ * CuckooHash(
148 user_key, hash_cnt, use_module_hash_, table_size_,
149 identity_as_first_hash_, get_slice_hash_);
150 const char* bucket = &file_data_.data()[offset];
151 for (uint32_t block_idx = 0; block_idx < cuckoo_block_size_;
152 ++block_idx, bucket += bucket_length_) {
153 if (ucomp_->Equal(Slice(unused_key_.data(), user_key.size()),
154 Slice(bucket, user_key.size()))) {
155 return Status::OK();
156 }
157 // Here, we compare only the user key part as we support only one entry
158 // per user key and we don't support snapshot.
159 if (ucomp_->Equal(user_key, Slice(bucket, user_key.size()))) {
160 Slice value(bucket + key_length_, value_length_);
161 if (is_last_level_) {
162 // Sequence number is not stored at the last level, so we will use
163 // kMaxSequenceNumber since it is unknown. This could cause some
164 // transactions to fail to lock a key due to known sequence number.
165 // However, it is expected for anyone to use a CuckooTable in a
166 // TransactionDB.
167 get_context->SaveValue(value, kMaxSequenceNumber);
168 } else {
169 Slice full_key(bucket, key_length_);
170 ParsedInternalKey found_ikey;
171 ParseInternalKey(full_key, &found_ikey);
172 get_context->SaveValue(found_ikey, value);
173 }
174 // We don't support merge operations. So, we return here.
175 return Status::OK();
176 }
177 }
178 }
179 return Status::OK();
180}
181
182void CuckooTableReader::Prepare(const Slice& key) {
183 // Prefetch the first Cuckoo Block.
184 Slice user_key = ExtractUserKey(key);
185 uint64_t addr = reinterpret_cast<uint64_t>(file_data_.data()) +
186 bucket_length_ * CuckooHash(user_key, 0, use_module_hash_, table_size_,
187 identity_as_first_hash_, nullptr);
188 uint64_t end_addr = addr + cuckoo_block_bytes_minus_one_;
189 for (addr &= CACHE_LINE_MASK; addr < end_addr; addr += CACHE_LINE_SIZE) {
190 PREFETCH(reinterpret_cast<const char*>(addr), 0, 3);
191 }
192}
193
194class CuckooTableIterator : public InternalIterator {
195 public:
196 explicit CuckooTableIterator(CuckooTableReader* reader);
197 ~CuckooTableIterator() {}
198 bool Valid() const override;
199 void SeekToFirst() override;
200 void SeekToLast() override;
201 void Seek(const Slice& target) override;
202 void SeekForPrev(const Slice& target) override;
203 void Next() override;
204 void Prev() override;
205 Slice key() const override;
206 Slice value() const override;
207 Status status() const override { return status_; }
208 void InitIfNeeded();
209
210 private:
211 struct BucketComparator {
212 BucketComparator(const Slice& file_data, const Comparator* ucomp,
213 uint32_t bucket_len, uint32_t user_key_len,
214 const Slice& target = Slice())
215 : file_data_(file_data),
216 ucomp_(ucomp),
217 bucket_len_(bucket_len),
218 user_key_len_(user_key_len),
219 target_(target) {}
220 bool operator()(const uint32_t first, const uint32_t second) const {
221 const char* first_bucket =
222 (first == kInvalidIndex) ? target_.data() :
223 &file_data_.data()[first * bucket_len_];
224 const char* second_bucket =
225 (second == kInvalidIndex) ? target_.data() :
226 &file_data_.data()[second * bucket_len_];
227 return ucomp_->Compare(Slice(first_bucket, user_key_len_),
228 Slice(second_bucket, user_key_len_)) < 0;
229 }
230 private:
231 const Slice file_data_;
232 const Comparator* ucomp_;
233 const uint32_t bucket_len_;
234 const uint32_t user_key_len_;
235 const Slice target_;
236 };
237
238 const BucketComparator bucket_comparator_;
239 void PrepareKVAtCurrIdx();
240 CuckooTableReader* reader_;
241 bool initialized_;
242 Status status_;
243 // Contains a map of keys to bucket_id sorted in key order.
244 std::vector<uint32_t> sorted_bucket_ids_;
245 // We assume that the number of items can be stored in uint32 (4 Billion).
246 uint32_t curr_key_idx_;
247 Slice curr_value_;
248 IterKey curr_key_;
249 // No copying allowed
250 CuckooTableIterator(const CuckooTableIterator&) = delete;
251 void operator=(const Iterator&) = delete;
252};
253
254CuckooTableIterator::CuckooTableIterator(CuckooTableReader* reader)
255 : bucket_comparator_(reader->file_data_, reader->ucomp_,
256 reader->bucket_length_, reader->user_key_length_),
257 reader_(reader),
258 initialized_(false),
259 curr_key_idx_(kInvalidIndex) {
260 sorted_bucket_ids_.clear();
261 curr_value_.clear();
262 curr_key_.Clear();
263}
264
265void CuckooTableIterator::InitIfNeeded() {
266 if (initialized_) {
267 return;
268 }
269 sorted_bucket_ids_.reserve(reader_->GetTableProperties()->num_entries);
270 uint64_t num_buckets = reader_->table_size_ + reader_->cuckoo_block_size_ - 1;
271 assert(num_buckets < kInvalidIndex);
272 const char* bucket = reader_->file_data_.data();
273 for (uint32_t bucket_id = 0; bucket_id < num_buckets; ++bucket_id) {
274 if (Slice(bucket, reader_->key_length_) != Slice(reader_->unused_key_)) {
275 sorted_bucket_ids_.push_back(bucket_id);
276 }
277 bucket += reader_->bucket_length_;
278 }
279 assert(sorted_bucket_ids_.size() ==
280 reader_->GetTableProperties()->num_entries);
281 std::sort(sorted_bucket_ids_.begin(), sorted_bucket_ids_.end(),
282 bucket_comparator_);
283 curr_key_idx_ = kInvalidIndex;
284 initialized_ = true;
285}
286
287void CuckooTableIterator::SeekToFirst() {
288 InitIfNeeded();
289 curr_key_idx_ = 0;
290 PrepareKVAtCurrIdx();
291}
292
293void CuckooTableIterator::SeekToLast() {
294 InitIfNeeded();
295 curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size()) - 1;
296 PrepareKVAtCurrIdx();
297}
298
299void CuckooTableIterator::Seek(const Slice& target) {
300 InitIfNeeded();
301 const BucketComparator seek_comparator(
302 reader_->file_data_, reader_->ucomp_,
303 reader_->bucket_length_, reader_->user_key_length_,
304 ExtractUserKey(target));
305 auto seek_it = std::lower_bound(sorted_bucket_ids_.begin(),
306 sorted_bucket_ids_.end(),
307 kInvalidIndex,
308 seek_comparator);
309 curr_key_idx_ =
310 static_cast<uint32_t>(std::distance(sorted_bucket_ids_.begin(), seek_it));
311 PrepareKVAtCurrIdx();
312}
313
314void CuckooTableIterator::SeekForPrev(const Slice& target) {
315 // Not supported
316 assert(false);
317}
318
319bool CuckooTableIterator::Valid() const {
320 return curr_key_idx_ < sorted_bucket_ids_.size();
321}
322
323void CuckooTableIterator::PrepareKVAtCurrIdx() {
324 if (!Valid()) {
325 curr_value_.clear();
326 curr_key_.Clear();
327 return;
328 }
329 uint32_t id = sorted_bucket_ids_[curr_key_idx_];
330 const char* offset = reader_->file_data_.data() +
331 id * reader_->bucket_length_;
332 if (reader_->is_last_level_) {
333 // Always return internal key.
334 curr_key_.SetInternalKey(Slice(offset, reader_->user_key_length_),
335 0, kTypeValue);
336 } else {
337 curr_key_.SetInternalKey(Slice(offset, reader_->key_length_));
338 }
339 curr_value_ = Slice(offset + reader_->key_length_, reader_->value_length_);
340}
341
342void CuckooTableIterator::Next() {
343 if (!Valid()) {
344 curr_value_.clear();
345 curr_key_.Clear();
346 return;
347 }
348 ++curr_key_idx_;
349 PrepareKVAtCurrIdx();
350}
351
352void CuckooTableIterator::Prev() {
353 if (curr_key_idx_ == 0) {
354 curr_key_idx_ = static_cast<uint32_t>(sorted_bucket_ids_.size());
355 }
356 if (!Valid()) {
357 curr_value_.clear();
358 curr_key_.Clear();
359 return;
360 }
361 --curr_key_idx_;
362 PrepareKVAtCurrIdx();
363}
364
365Slice CuckooTableIterator::key() const {
366 assert(Valid());
367 return curr_key_.GetInternalKey();
368}
369
370Slice CuckooTableIterator::value() const {
371 assert(Valid());
372 return curr_value_;
373}
374
375extern InternalIterator* NewErrorInternalIterator(const Status& status,
376 Arena* arena);
377
378InternalIterator* CuckooTableReader::NewIterator(
379 const ReadOptions& read_options, Arena* arena, bool skip_filters) {
380 if (!status().ok()) {
381 return NewErrorInternalIterator(
382 Status::Corruption("CuckooTableReader status is not okay."), arena);
383 }
384 CuckooTableIterator* iter;
385 if (arena == nullptr) {
386 iter = new CuckooTableIterator(this);
387 } else {
388 auto iter_mem = arena->AllocateAligned(sizeof(CuckooTableIterator));
389 iter = new (iter_mem) CuckooTableIterator(this);
390 }
391 return iter;
392}
393
394size_t CuckooTableReader::ApproximateMemoryUsage() const { return 0; }
395
396} // namespace rocksdb
397#endif
398