// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Decodes the blocks generated by block_builder.cc.

#include "table/block.h"
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "table/block_prefix_index.h"
#include "table/format.h"
#include "util/coding.h"
#include "util/logging.h"

namespace rocksdb {

// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
32// "*value_length", respectively. Will not derefence past "limit".
33//
34// If any errors are detected, returns nullptr. Otherwise, returns a
35// pointer to the key delta (just past the three decoded values).
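//
// Entry layout, as written by block_builder.cc (illustrative sketch; the
// example values below are invented for exposition):
//   varint32 shared | varint32 non_shared | varint32 value_length
//   | key_delta (non_shared bytes) | value (value_length bytes)
// e.g. if the previous key is "apple" and the current key is "applied",
// the entry stores shared = 4 ("appl"), non_shared = 3, key_delta = "ied".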
static inline const char* DecodeEntry(const char* p, const char* limit,
                                      uint32_t* shared,
                                      uint32_t* non_shared,
                                      uint32_t* value_length) {
  if (limit - p < 3) return nullptr;
  *shared = reinterpret_cast<const unsigned char*>(p)[0];
  *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
  *value_length = reinterpret_cast<const unsigned char*>(p)[2];
  if ((*shared | *non_shared | *value_length) < 128) {
    // Fast path: all three values are encoded in one byte each
    p += 3;
  } else {
    if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
    if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
    if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr;
  }

  if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
    return nullptr;
  }
  return p;
}

void BlockIter::Next() {
  assert(Valid());
  ParseNextKey();
}

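// Prev() first tries to replay entries cached in prev_entries_ by an
// earlier backward scan. On a cache miss it backs up to the nearest
// restart point before current_, then re-parses entries forward (caching
// each one) until it reaches the entry immediately before the original
// position.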
void BlockIter::Prev() {
  assert(Valid());

  assert(prev_entries_idx_ == -1 ||
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
  // Check if we can use cached prev_entries_
  if (prev_entries_idx_ > 0 &&
      prev_entries_[prev_entries_idx_].offset == current_) {
    // Read cached CachedPrevEntry
    prev_entries_idx_--;
    const CachedPrevEntry& current_prev_entry =
        prev_entries_[prev_entries_idx_];

    const char* key_ptr = nullptr;
    if (current_prev_entry.key_ptr != nullptr) {
      // The key is not delta encoded and is stored directly in the data block
      key_ptr = current_prev_entry.key_ptr;
      key_pinned_ = true;
    } else {
      // The key is delta encoded and is stored in prev_entries_keys_buff_
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
      key_pinned_ = false;
    }
    const Slice current_key(key_ptr, current_prev_entry.key_size);

    current_ = current_prev_entry.offset;
    key_.SetInternalKey(current_key, false /* copy */);
    value_ = current_prev_entry.value;

    return;
  }

  // Clear prev entries cache
  prev_entries_idx_ = -1;
  prev_entries_.clear();
  prev_entries_keys_buff_.clear();

  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }

  SeekToRestartPoint(restart_index_);

  do {
    if (!ParseNextKey()) {
      break;
    }
    Slice current_key = key();

    if (key_.IsKeyPinned()) {
      // The key is not delta encoded
      prev_entries_.emplace_back(current_, current_key.data(), 0,
                                 current_key.size(), value());
    } else {
      // The key is delta encoded; cache the decoded key in the buffer
      size_t new_key_offset = prev_entries_keys_buff_.size();
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());

      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
                                 current_key.size(), value());
    }
    // Loop until end of current entry hits the start of original entry
  } while (NextEntryOffset() < original);
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}

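// Seek() positions the iterator at the first key >= target: it first
// narrows the search to one restart interval (via the prefix index when
// present, otherwise by binary search over the restart array), then scans
// that interval linearly.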
void BlockIter::Seek(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = false;
  if (prefix_index_) {
    ok = PrefixSeek(target, &index);
  } else {
    ok = BinarySeek(target, 0, num_restarts_ - 1, &index);
  }

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);
  // Linear search (within restart block) for first key >= target

  while (true) {
    if (!ParseNextKey() || Compare(key_.GetInternalKey(), target) >= 0) {
      return;
    }
  }
}

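// SeekForPrev() positions the iterator at the last key <= target: it first
// seeks forward to the first key >= target, then steps back while the
// current key is still greater than target. If every key in the block is
// smaller than target, the iterator lands on the last entry.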
void BlockIter::SeekForPrev(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = false;
  ok = BinarySeek(target, 0, num_restarts_ - 1, &index);

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);
  // Linear search (within restart block) for first key >= target

  while (ParseNextKey() && Compare(key_.GetInternalKey(), target) < 0) {
  }
  if (!Valid()) {
    SeekToLast();
  } else {
    while (Valid() && Compare(key_.GetInternalKey(), target) > 0) {
      Prev();
    }
  }
}

void BlockIter::SeekToFirst() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextKey();
}

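// SeekToLast() jumps to the last restart point, then walks forward to the
// final entry in the block.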
void BlockIter::SeekToLast() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextKey() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

void BlockIter::CorruptionError() {
  current_ = restarts_;
  restart_index_ = num_restarts_;
  status_ = Status::Corruption("bad entry in block");
  key_.Clear();
  value_.clear();
}

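// ParseNextKey() decodes the entry at NextEntryOffset() and rebuilds the
// key from the shared prefix of the previous key plus the stored delta.
// Illustrative example (values invented for exposition): if the previous
// key is "abcdef" and the entry stores shared = 3 with key_delta = "xy",
// the decoded key is "abcxy".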
bool BlockIter::ParseNextKey() {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  const char* limit = data_ + restarts_;  // Restarts come right after data
  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
  if (p == nullptr || key_.Size() < shared) {
    CorruptionError();
    return false;
  } else {
    if (shared == 0) {
      // If this key doesn't share any bytes with the previous key, we don't
      // need to decode it and can use its address in the block directly.
      key_.SetInternalKey(Slice(p, non_shared), false /* copy */);
      key_pinned_ = true;
    } else {
      // This key shares `shared` bytes with the previous key; we need to
      // decode it.
      key_.TrimAppend(shared, p, non_shared);
      key_pinned_ = false;
    }

    if (global_seqno_ != kDisableGlobalSequenceNumber) {
      // If we are reading a file with a global sequence number we should
      // expect that all encoded sequence numbers are zeros and any value
      // type is kTypeValue, kTypeMerge or kTypeDeletion
      assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0);

      ValueType value_type = ExtractValueType(key_.GetInternalKey());
      assert(value_type == ValueType::kTypeValue ||
             value_type == ValueType::kTypeMerge ||
             value_type == ValueType::kTypeDeletion);

      if (key_pinned_) {
        // TODO(tec): Investigate updating the seqno in the loaded block
        // directly instead of doing a copy and update.

        // We cannot use the key address in the block directly because
        // we have a global_seqno_ that will overwrite the encoded one.
        key_.OwnKey();
        key_pinned_ = false;
      }

      key_.UpdateInternalKey(global_seqno_, value_type);
    }

    value_ = Slice(p + non_shared, value_length);
    while (restart_index_ + 1 < num_restarts_ &&
           GetRestartPoint(restart_index_ + 1) < current_) {
      ++restart_index_;
    }
    return true;
  }
}

// Binary search in the restart array to find either the last restart point
// whose key is less than target (in which case the key of the next restart
// point is larger than target), or the first restart point whose key
// equals target.
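//
// Illustrative example (keys invented for exposition): with restart keys
// ["b", "d", "f"] and target "e", the search returns the index of "d"; the
// caller's linear scan within that restart interval then finds the first
// key >= "e".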
bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right,
                           uint32_t* index) {
  assert(left <= right);

  while (left < right) {
    uint32_t mid = (left + right + 1) / 2;
    uint32_t region_offset = GetRestartPoint(mid);
    uint32_t shared, non_shared, value_length;
    const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
                                      &shared, &non_shared, &value_length);
    if (key_ptr == nullptr || (shared != 0)) {
      CorruptionError();
      return false;
    }
    Slice mid_key(key_ptr, non_shared);
    int cmp = Compare(mid_key, target);
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before "mid" are uninteresting.
      left = mid;
    } else if (cmp > 0) {
      // Key at "mid" is larger than "target". Therefore all blocks at or
      // after "mid" are uninteresting.
      right = mid - 1;
    } else {
      left = right = mid;
    }
  }

  *index = left;
  return true;
}

// Compare the target key with the block key of block `block_index`.
// On corruption, returns 1 (i.e. treats the block key as larger than
// the target).
int BlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
  uint32_t region_offset = GetRestartPoint(block_index);
  uint32_t shared, non_shared, value_length;
  const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
                                    &shared, &non_shared, &value_length);
  if (key_ptr == nullptr || (shared != 0)) {
    CorruptionError();
    return 1;  // Return target is smaller
  }
  Slice block_key(key_ptr, non_shared);
  return Compare(block_key, target);
}

// Binary search in block_ids to find the first block
// with a key >= target
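//
// Illustrative example (ids invented for exposition): if block_ids is
// [2, 5] and the search lands on block 5, block 4 is absent from the list
// (a gap). If block 4's restart key is already larger than target, the
// target cannot be in block 5 either, so the seek reports failure.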
bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
                                     uint32_t left, uint32_t right,
                                     uint32_t* index) {
  assert(left <= right);
  uint32_t left_bound = left;

  while (left <= right) {
    uint32_t mid = (right + left) / 2;

    int cmp = CompareBlockKey(block_ids[mid], target);
    if (!status_.ok()) {
      return false;
    }
    if (cmp < 0) {
      // The key of block "mid" is smaller than "target". Therefore all
      // blocks before or at "mid" are uninteresting.
      left = mid + 1;
    } else {
      // The key of block "mid" is larger than or equal to "target".
      // Therefore all blocks after "mid" are uninteresting.
      // If there is only one block left, we found it.
      if (left == right) break;
      right = mid;
    }
  }

  if (left == right) {
    // We end up here in one of two cases:
    // (1) `left` is the first entry of block_ids, or
    // (2) there is a gap between the blocks of `left` and `left - 1`.
    // Either way, we can distinguish "key may be in this block" from "key
    // cannot exist" by comparing the target key against the key of the
    // block immediately preceding the block found.
    if (block_ids[left] > 0 &&
        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
        CompareBlockKey(block_ids[left] - 1, target) > 0) {
      current_ = restarts_;
      return false;
    }

    *index = block_ids[left];
    return true;
  } else {
    assert(left > right);
    // Mark iterator invalid
    current_ = restarts_;
    return false;
  }
}

bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
  assert(prefix_index_);
  uint32_t* block_ids = nullptr;
  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);

  if (num_blocks == 0) {
    current_ = restarts_;
    return false;
  } else {
    return BinaryBlockIndexSeek(target, block_ids, 0, num_blocks - 1, index);
  }
}

uint32_t Block::NumRestarts() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
}

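// Block layout, as produced by the block builder:
//   entries | restart array (one uint32 per restart point) | num_restarts
// The restart array therefore begins (1 + NumRestarts()) * sizeof(uint32_t)
// bytes before the end of the block, which is what restart_offset_ records.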
Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
             size_t read_amp_bytes_per_bit, Statistics* statistics)
    : contents_(std::move(contents)),
      data_(contents_.data.data()),
      size_(contents_.data.size()),
      restart_offset_(0),
      global_seqno_(_global_seqno) {
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
    restart_offset_ =
        static_cast<uint32_t>(size_) - (1 + NumRestarts()) * sizeof(uint32_t);
    if (restart_offset_ > size_ - sizeof(uint32_t)) {
      // The size is too small for NumRestarts() and therefore
      // restart_offset_ wrapped around.
      size_ = 0;
    }
  }
  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
    read_amp_bitmap_.reset(new BlockReadAmpBitmap(
        restart_offset_, read_amp_bytes_per_bit, statistics));
  }
}

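// NewIterator() reuses the caller-supplied BlockIter when one is given
// (allowing allocation-free iteration) and heap-allocates one otherwise.
// A corrupt block yields an error iterator; a block with no restart
// points yields an empty iterator.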
InternalIterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter,
                                     bool total_order_seek, Statistics* stats) {
  if (size_ < 2 * sizeof(uint32_t)) {
    if (iter != nullptr) {
      iter->SetStatus(Status::Corruption("bad block contents"));
      return iter;
    } else {
      return NewErrorInternalIterator(Status::Corruption("bad block contents"));
    }
  }
  const uint32_t num_restarts = NumRestarts();
  if (num_restarts == 0) {
    if (iter != nullptr) {
      iter->SetStatus(Status::OK());
      return iter;
    } else {
      return NewEmptyInternalIterator();
    }
  } else {
    BlockPrefixIndex* prefix_index_ptr =
        total_order_seek ? nullptr : prefix_index_.get();

    if (iter != nullptr) {
      iter->Initialize(cmp, data_, restart_offset_, num_restarts,
                       prefix_index_ptr, global_seqno_, read_amp_bitmap_.get());
    } else {
      iter = new BlockIter(cmp, data_, restart_offset_, num_restarts,
                           prefix_index_ptr, global_seqno_,
                           read_amp_bitmap_.get());
    }

    if (read_amp_bitmap_) {
      if (read_amp_bitmap_->GetStatistics() != stats) {
        // The DB changed the Statistics pointer; we need to notify
        // read_amp_bitmap_.
        read_amp_bitmap_->SetStatistics(stats);
      }
    }
  }

  return iter;
}

void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) {
  prefix_index_.reset(prefix_index);
}

size_t Block::ApproximateMemoryUsage() const {
  size_t usage = usable_size();
  if (prefix_index_) {
    usage += prefix_index_->ApproximateMemoryUsage();
  }
  return usage;
}

}  // namespace rocksdb