// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Decodes the blocks generated by block_builder.cc.

#include "table/block.h"
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "table/block_prefix_index.h"
#include "table/format.h"
#include "util/coding.h"
#include "util/logging.h"

namespace rocksdb {

// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively. Will not dereference past "limit".
//
// If any errors are detected, returns nullptr. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
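//
// Entry layout (written by block_builder.cc):
//   varint32 shared | varint32 non_shared | varint32 value_length |
//   key delta (non_shared bytes) | value (value_length bytes)
// For example, an entry with shared=2, non_shared=3 and a 5-byte value takes
// the fast path below and starts with the three bytes 0x02 0x03 0x05,
// followed by the 3 key-delta bytes and the 5 value bytes.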
static inline const char* DecodeEntry(const char* p, const char* limit,
                                      uint32_t* shared,
                                      uint32_t* non_shared,
                                      uint32_t* value_length) {
  if (limit - p < 3) return nullptr;
  *shared = reinterpret_cast<const unsigned char*>(p)[0];
  *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
  *value_length = reinterpret_cast<const unsigned char*>(p)[2];
  if ((*shared | *non_shared | *value_length) < 128) {
    // Fast path: all three values are encoded in one byte each
    p += 3;
  } else {
    if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
    if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
    if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr;
  }

  if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
    return nullptr;
  }
  return p;
}

void BlockIter::Next() {
  assert(Valid());
  ParseNextKey();
}

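// Backward iteration. Entries are delta encoded in forward order only, so
// Prev() cannot decode the previous entry in place. Instead it scans forward
// from the nearest restart point before current_, caching every decoded entry
// in prev_entries_ (keys that had to be materialized go into
// prev_entries_keys_buff_), so that a run of consecutive Prev() calls is
// served from the cache without rescanning.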
void BlockIter::Prev() {
  assert(Valid());

  assert(prev_entries_idx_ == -1 ||
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
  // Check if we can use cached prev_entries_
  if (prev_entries_idx_ > 0 &&
      prev_entries_[prev_entries_idx_].offset == current_) {
    // Read cached CachedPrevEntry
    prev_entries_idx_--;
    const CachedPrevEntry& current_prev_entry =
        prev_entries_[prev_entries_idx_];

    const char* key_ptr = nullptr;
    if (current_prev_entry.key_ptr != nullptr) {
      // The key is not delta encoded and stored in the data block
      key_ptr = current_prev_entry.key_ptr;
      key_pinned_ = true;
    } else {
      // The key is delta encoded and stored in prev_entries_keys_buff_
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
      key_pinned_ = false;
    }
    const Slice current_key(key_ptr, current_prev_entry.key_size);

    current_ = current_prev_entry.offset;
    key_.SetInternalKey(current_key, false /* copy */);
    value_ = current_prev_entry.value;

    return;
  }

  // Clear prev entries cache
  prev_entries_idx_ = -1;
  prev_entries_.clear();
  prev_entries_keys_buff_.clear();

  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }

  SeekToRestartPoint(restart_index_);

  do {
    if (!ParseNextKey()) {
      break;
    }
    Slice current_key = key();

    if (key_.IsKeyPinned()) {
      // The key is not delta encoded
      prev_entries_.emplace_back(current_, current_key.data(), 0,
                                 current_key.size(), value());
    } else {
      // The key is delta encoded, cache decoded key in buffer
      size_t new_key_offset = prev_entries_keys_buff_.size();
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());

      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
                                 current_key.size(), value());
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}

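// Seek() first narrows the search to a single restart interval, either via
// the prefix index (when one has been attached to this iterator) or via
// binary search over the restart-point keys, and then scans that interval
// linearly for the first key >= target.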
void BlockIter::Seek(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = false;
  if (prefix_index_) {
    ok = PrefixSeek(target, &index);
  } else {
    ok = BinarySeek(target, 0, num_restarts_ - 1, &index);
  }

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);
  // Linear search (within restart block) for first key >= target

  while (true) {
    if (!ParseNextKey() || Compare(key_.GetInternalKey(), target) >= 0) {
      return;
    }
  }
}

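// SeekForPrev() positions the iterator at the last key <= target. It first
// advances to the first key >= target; if no such key exists it falls back to
// SeekToLast(), otherwise it steps back with Prev() while the current key is
// strictly greater than target.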
void BlockIter::SeekForPrev(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = false;
  ok = BinarySeek(target, 0, num_restarts_ - 1, &index);

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);
  // Linear search (within restart block) for first key >= target

  while (ParseNextKey() && Compare(key_.GetInternalKey(), target) < 0) {
  }
  if (!Valid()) {
    SeekToLast();
  } else {
    while (Valid() && Compare(key_.GetInternalKey(), target) > 0) {
      Prev();
    }
  }
}

void BlockIter::SeekToFirst() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextKey();
}

void BlockIter::SeekToLast() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextKey() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

void BlockIter::CorruptionError() {
  current_ = restarts_;
  restart_index_ = num_restarts_;
  status_ = Status::Corruption("bad entry in block");
  key_.Clear();
  value_.clear();
}

bool BlockIter::ParseNextKey() {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  const char* limit = data_ + restarts_;  // Restarts come right after data
  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
  if (p == nullptr || key_.Size() < shared) {
    CorruptionError();
    return false;
  } else {
    if (shared == 0) {
      // If this key doesn't share any bytes with the previous key, we don't
      // need to decode it and can use its address in the block directly.
      key_.SetInternalKey(Slice(p, non_shared), false /* copy */);
      key_pinned_ = true;
    } else {
      // This key shares `shared` bytes with the previous key; we need to
      // decode it.
      key_.TrimAppend(shared, p, non_shared);
      key_pinned_ = false;
    }

    if (global_seqno_ != kDisableGlobalSequenceNumber) {
      // If we are reading a file with a global sequence number we should
      // expect that all encoded sequence numbers are zeros and any value
      // type is kTypeValue, kTypeMerge or kTypeDeletion
      assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0);

      ValueType value_type = ExtractValueType(key_.GetInternalKey());
      assert(value_type == ValueType::kTypeValue ||
             value_type == ValueType::kTypeMerge ||
             value_type == ValueType::kTypeDeletion);

      if (key_pinned_) {
        // TODO(tec): Investigate updating the seqno in the loaded block
        // directly instead of doing a copy and update.

        // We cannot use the key address in the block directly because
        // we have a global_seqno_ that will overwrite the encoded one.
        key_.OwnKey();
        key_pinned_ = false;
      }

      key_.UpdateInternalKey(global_seqno_, value_type);
    }

    value_ = Slice(p + non_shared, value_length);
    while (restart_index_ + 1 < num_restarts_ &&
           GetRestartPoint(restart_index_ + 1) < current_) {
      ++restart_index_;
    }
    return true;
  }
}

// Binary search in the restart array to find the restart point from which the
// linear scan should start: either the last restart point whose key is less
// than target (so the key of the next restart point is larger than target),
// or the first restart point whose key equals target.
bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right,
                           uint32_t* index) {
  assert(left <= right);

  while (left < right) {
    uint32_t mid = (left + right + 1) / 2;
    uint32_t region_offset = GetRestartPoint(mid);
    uint32_t shared, non_shared, value_length;
    const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
                                      &shared, &non_shared, &value_length);
    if (key_ptr == nullptr || (shared != 0)) {
      CorruptionError();
      return false;
    }
    Slice mid_key(key_ptr, non_shared);
    int cmp = Compare(mid_key, target);
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before "mid" are uninteresting.
      left = mid;
    } else if (cmp > 0) {
      // Key at "mid" is larger than "target". Therefore all blocks at or
      // after "mid" are uninteresting.
      right = mid - 1;
    } else {
      left = right = mid;
    }
  }

  *index = left;
  return true;
}

// Compare the target key with the restart key of restart block `block_index`.
// On corruption, sets status_ via CorruptionError() and returns 1 (i.e.
// treats the target as smaller); callers should also check status_.
int BlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
  uint32_t region_offset = GetRestartPoint(block_index);
  uint32_t shared, non_shared, value_length;
  const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
                                    &shared, &non_shared, &value_length);
  if (key_ptr == nullptr || (shared != 0)) {
    CorruptionError();
    return 1;  // Return target is smaller
  }
  Slice block_key(key_ptr, non_shared);
  return Compare(block_key, target);
}

// Binary search in block_ids to find the first block
// with a key >= target
bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
                                     uint32_t left, uint32_t right,
                                     uint32_t* index) {
  assert(left <= right);
  uint32_t left_bound = left;

  while (left <= right) {
    uint32_t mid = (right + left) / 2;

    int cmp = CompareBlockKey(block_ids[mid], target);
    if (!status_.ok()) {
      return false;
    }
    if (cmp < 0) {
      // The key at "mid" is smaller than "target". Therefore all
      // blocks before or at "mid" are uninteresting.
      left = mid + 1;
    } else {
      // The key at "mid" is >= "target". Therefore all blocks
      // after "mid" are uninteresting.
      // If there is only one block left, we found it.
      if (left == right) break;
      right = mid;
    }
  }

  if (left == right) {
    // We are in one of the two following cases:
    // (1) left is the first one of block_ids
    // (2) there is a gap of blocks between the block of `left` and `left-1`.
    // We can further distinguish between the key being in the block and the
    // key not existing, by comparing the target key with the key of the
    // previous block (the block just to the left of the block found).
    if (block_ids[left] > 0 &&
        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
        CompareBlockKey(block_ids[left] - 1, target) > 0) {
      current_ = restarts_;
      return false;
    }

    *index = block_ids[left];
    return true;
  } else {
    assert(left > right);
    // Mark iterator invalid
    current_ = restarts_;
    return false;
  }
}

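// PrefixSeek() asks the prefix index for the candidate restart blocks that
// may contain keys sharing the target's prefix, then binary-searches among
// those candidates. When the index returns no blocks, the iterator is
// invalidated so the caller sees a failed seek.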
bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
  assert(prefix_index_);
  uint32_t* block_ids = nullptr;
  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);

  if (num_blocks == 0) {
    current_ = restarts_;
    return false;
  } else {
    return BinaryBlockIndexSeek(target, block_ids, 0, num_blocks - 1, index);
  }
}

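// On-disk block layout (see block_builder.cc):
//   entry 0 .. entry N-1                   delta-encoded key/value entries
//   restart[0] .. restart[num_restarts-1]  fixed32 offsets of restart entries
//   num_restarts                           fixed32
// NumRestarts() therefore reads the trailing fixed32 of the block, and the
// restart array begins at restart_offset_ computed in the Block constructor.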
uint32_t Block::NumRestarts() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
}

Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
             size_t read_amp_bytes_per_bit, Statistics* statistics)
    : contents_(std::move(contents)),
      data_(contents_.data.data()),
      size_(contents_.data.size()),
      restart_offset_(0),
      global_seqno_(_global_seqno) {
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
    restart_offset_ =
        static_cast<uint32_t>(size_) - (1 + NumRestarts()) * sizeof(uint32_t);
    if (restart_offset_ > size_ - sizeof(uint32_t)) {
      // The size is too small for NumRestarts() and therefore
      // restart_offset_ wrapped around.
      size_ = 0;
    }
  }
  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
    read_amp_bitmap_.reset(new BlockReadAmpBitmap(
        restart_offset_, read_amp_bytes_per_bit, statistics));
  }
}

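// Returns an iterator over this block. If `iter` is non-null it is reused and
// re-initialized in place; otherwise a new heap-allocated BlockIter (or an
// error/empty iterator) is returned. When total_order_seek is set the prefix
// index is ignored, so Seek() falls back to plain binary search over the
// restart points.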
InternalIterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter,
                                     bool total_order_seek, Statistics* stats) {
  if (size_ < 2 * sizeof(uint32_t)) {
    if (iter != nullptr) {
      iter->SetStatus(Status::Corruption("bad block contents"));
      return iter;
    } else {
      return NewErrorInternalIterator(Status::Corruption("bad block contents"));
    }
  }
  const uint32_t num_restarts = NumRestarts();
  if (num_restarts == 0) {
    if (iter != nullptr) {
      iter->SetStatus(Status::OK());
      return iter;
    } else {
      return NewEmptyInternalIterator();
    }
  } else {
    BlockPrefixIndex* prefix_index_ptr =
        total_order_seek ? nullptr : prefix_index_.get();

    if (iter != nullptr) {
      iter->Initialize(cmp, data_, restart_offset_, num_restarts,
                       prefix_index_ptr, global_seqno_, read_amp_bitmap_.get());
    } else {
      iter = new BlockIter(cmp, data_, restart_offset_, num_restarts,
                           prefix_index_ptr, global_seqno_,
                           read_amp_bitmap_.get());
    }

    if (read_amp_bitmap_) {
      if (read_amp_bitmap_->GetStatistics() != stats) {
        // DB changed the Statistics pointer, we need to notify read_amp_bitmap_
        read_amp_bitmap_->SetStatistics(stats);
      }
    }
  }

  return iter;
}

void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) {
  prefix_index_.reset(prefix_index);
}

size_t Block::ApproximateMemoryUsage() const {
  size_t usage = usable_size();
  if (prefix_index_) {
    usage += prefix_index_->ApproximateMemoryUsage();
  }
  return usage;
}

}  // namespace rocksdb