// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.
//
// Decodes the blocks generated by block_builder.cc.
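//
// A block is laid out as a sequence of delta-encoded entries followed by a
// trailer (block_builder.cc documents the writer side):
//
//   entry 0 .. entry N-1         delta-encoded key/value records
//   restart[0] .. restart[R-1]   fixed32 offsets of the restart entries
//   num_restarts                 fixed32 count R
//
// Each entry stores how many leading key bytes it shares with its
// predecessor, so readers reconstruct full keys by scanning forward from a
// restart point, where shared == 0.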

#include "table/block.h"
#include <algorithm>
#include <string>
#include <unordered_map>
#include <vector>

#include "monitoring/perf_context_imp.h"
#include "port/port.h"
#include "port/stack_trace.h"
#include "rocksdb/comparator.h"
#include "table/block_prefix_index.h"
#include "table/format.h"
#include "util/coding.h"
#include "util/logging.h"

namespace rocksdb {

// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively. Will not dereference past "limit".
//
// If any errors are detected, returns nullptr. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
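//
// For illustration (hypothetical data, not from any real block): with a
// previous key "apple", a current key "applause" and a value "v1", the
// entry is encoded as
//
//   0x04 0x04 0x02 "ause" "v1"
//
// i.e. shared = 4 ("appl"), non_shared = 4 and value_length = 2, each
// small enough (< 128) for the single-byte fast path below.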
static inline const char* DecodeEntry(const char* p, const char* limit,
                                      uint32_t* shared,
                                      uint32_t* non_shared,
                                      uint32_t* value_length) {
  if (limit - p < 3) return nullptr;
  *shared = reinterpret_cast<const unsigned char*>(p)[0];
  *non_shared = reinterpret_cast<const unsigned char*>(p)[1];
  *value_length = reinterpret_cast<const unsigned char*>(p)[2];
  if ((*shared | *non_shared | *value_length) < 128) {
    // Fast path: all three values are encoded in one byte each
    p += 3;
  } else {
    if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
    if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
    if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) {
      return nullptr;
    }
  }

  if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
    return nullptr;
  }
  return p;
}

void BlockIter::Next() {
  assert(Valid());
  ParseNextKey();
}

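// Entries are delta-encoded against their predecessor, so the block cannot
// be walked backwards directly. Prev() therefore backs up to the nearest
// restart point before the current entry and re-parses forward, caching
// each decoded entry in prev_entries_ so that a run of consecutive Prev()
// calls is served from the cache instead of re-scanning every time.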
void BlockIter::Prev() {
  assert(Valid());

  assert(prev_entries_idx_ == -1 ||
         static_cast<size_t>(prev_entries_idx_) < prev_entries_.size());
  // Check if we can use cached prev_entries_
  if (prev_entries_idx_ > 0 &&
      prev_entries_[prev_entries_idx_].offset == current_) {
    // Read cached CachedPrevEntry
    prev_entries_idx_--;
    const CachedPrevEntry& current_prev_entry =
        prev_entries_[prev_entries_idx_];

    const char* key_ptr = nullptr;
    if (current_prev_entry.key_ptr != nullptr) {
      // The key is not delta encoded and is stored in the data block
      key_ptr = current_prev_entry.key_ptr;
      key_pinned_ = true;
    } else {
      // The key is delta encoded and is stored in prev_entries_keys_buff_
      key_ptr = prev_entries_keys_buff_.data() + current_prev_entry.key_offset;
      key_pinned_ = false;
    }
    const Slice current_key(key_ptr, current_prev_entry.key_size);

    current_ = current_prev_entry.offset;
    key_.SetInternalKey(current_key, false /* copy */);
    value_ = current_prev_entry.value;

    return;
  }

  // Clear prev entries cache
  prev_entries_idx_ = -1;
  prev_entries_.clear();
  prev_entries_keys_buff_.clear();

  // Scan backwards to a restart point before current_
  const uint32_t original = current_;
  while (GetRestartPoint(restart_index_) >= original) {
    if (restart_index_ == 0) {
      // No more entries
      current_ = restarts_;
      restart_index_ = num_restarts_;
      return;
    }
    restart_index_--;
  }

  SeekToRestartPoint(restart_index_);

  do {
    if (!ParseNextKey()) {
      break;
    }
    Slice current_key = key();

    if (key_.IsKeyPinned()) {
      // The key is not delta encoded
      prev_entries_.emplace_back(current_, current_key.data(), 0,
                                 current_key.size(), value());
    } else {
      // The key is delta encoded; cache the decoded key in the buffer
      size_t new_key_offset = prev_entries_keys_buff_.size();
      prev_entries_keys_buff_.append(current_key.data(), current_key.size());

      prev_entries_.emplace_back(current_, nullptr, new_key_offset,
                                 current_key.size(), value());
    }
    // Loop until the end of the current entry hits the start of the
    // original entry
  } while (NextEntryOffset() < original);
  prev_entries_idx_ = static_cast<int32_t>(prev_entries_.size()) - 1;
}

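// Seek() works in two phases: a binary search over the restart array (or a
// prefix-index lookup, when one is attached) picks the restart interval
// that may contain the target, then a linear ParseNextKey() scan inside
// that interval stops at the first key >= target.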
void BlockIter::Seek(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = false;
  if (prefix_index_) {
    ok = PrefixSeek(target, &index);
  } else {
    ok = BinarySeek(target, 0, num_restarts_ - 1, &index);
  }

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);
  // Linear search (within restart block) for first key >= target

  while (true) {
    if (!ParseNextKey() || Compare(key_.GetInternalKey(), target) >= 0) {
      return;
    }
  }
}

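// SeekForPrev() positions the iterator at the last key <= target: it scans
// forward to the first key >= target, then steps back with Prev() while
// the key is still greater than the target. If every key in the block is
// smaller than the target, the iterator lands on the last entry.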
void BlockIter::SeekForPrev(const Slice& target) {
  PERF_TIMER_GUARD(block_seek_nanos);
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  uint32_t index = 0;
  bool ok = false;
  ok = BinarySeek(target, 0, num_restarts_ - 1, &index);

  if (!ok) {
    return;
  }
  SeekToRestartPoint(index);
  // Linear search (within restart block) for first key >= target

  while (ParseNextKey() && Compare(key_.GetInternalKey(), target) < 0) {
  }
  if (!Valid()) {
    SeekToLast();
  } else {
    while (Valid() && Compare(key_.GetInternalKey(), target) > 0) {
      Prev();
    }
  }
}

void BlockIter::SeekToFirst() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(0);
  ParseNextKey();
}

void BlockIter::SeekToLast() {
  if (data_ == nullptr) {  // Not init yet
    return;
  }
  SeekToRestartPoint(num_restarts_ - 1);
  while (ParseNextKey() && NextEntryOffset() < restarts_) {
    // Keep skipping
  }
}

void BlockIter::CorruptionError() {
  current_ = restarts_;
  restart_index_ = num_restarts_;
  status_ = Status::Corruption("bad entry in block");
  key_.Clear();
  value_.clear();
}

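// Decodes the entry at NextEntryOffset(). Returns false when the data area
// is exhausted (the iterator becomes invalid) or when the entry is corrupt
// (status_ is set via CorruptionError()). Also advances restart_index_ so
// that it keeps referring to the restart interval containing current_.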
bool BlockIter::ParseNextKey() {
  current_ = NextEntryOffset();
  const char* p = data_ + current_;
  const char* limit = data_ + restarts_;  // Restarts come right after data
  if (p >= limit) {
    // No more entries to return. Mark as invalid.
    current_ = restarts_;
    restart_index_ = num_restarts_;
    return false;
  }

  // Decode next entry
  uint32_t shared, non_shared, value_length;
  p = DecodeEntry(p, limit, &shared, &non_shared, &value_length);
  if (p == nullptr || key_.Size() < shared) {
    CorruptionError();
    return false;
  } else {
    if (shared == 0) {
      // If this key doesn't share any bytes with the previous key, we don't
      // need to decode it and can use its address in the block directly.
      key_.SetInternalKey(Slice(p, non_shared), false /* copy */);
      key_pinned_ = true;
    } else {
      // This key shares `shared` bytes with the previous key; we need to
      // decode it.
      key_.TrimAppend(shared, p, non_shared);
      key_pinned_ = false;
    }

    if (global_seqno_ != kDisableGlobalSequenceNumber) {
      // If we are reading a file with a global sequence number we should
      // expect that all encoded sequence numbers are zero and the value
      // type is kTypeValue, kTypeMerge or kTypeDeletion.
      assert(GetInternalKeySeqno(key_.GetInternalKey()) == 0);

      ValueType value_type = ExtractValueType(key_.GetInternalKey());
      assert(value_type == ValueType::kTypeValue ||
             value_type == ValueType::kTypeMerge ||
             value_type == ValueType::kTypeDeletion);

      if (key_pinned_) {
        // TODO(tec): Investigate updating the seqno in the loaded block
        // directly instead of doing a copy and update.

        // We cannot use the key address in the block directly because
        // we have a global_seqno_ that will overwrite the encoded one.
        key_.OwnKey();
        key_pinned_ = false;
      }

      key_.UpdateInternalKey(global_seqno_, value_type);
    }

    value_ = Slice(p + non_shared, value_length);
    while (restart_index_ + 1 < num_restarts_ &&
           GetRestartPoint(restart_index_ + 1) < current_) {
      ++restart_index_;
    }
    return true;
  }
}

// Binary search in the restart array to find the restart point to scan
// from: either the last restart point with a key less than the target
// (which means the key of the next restart point is larger than the
// target), or the first restart point with a key equal to the target.
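//
// For illustration (hypothetical keys): if the restart keys are "apple",
// "banana" and "cherry" and the target is "box", the search converges on
// "banana", the last restart key less than "box"; the linear scan that
// follows starts there and stops at the first key >= "box".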
bool BlockIter::BinarySeek(const Slice& target, uint32_t left, uint32_t right,
                           uint32_t* index) {
  assert(left <= right);

  while (left < right) {
    uint32_t mid = (left + right + 1) / 2;
    uint32_t region_offset = GetRestartPoint(mid);
    uint32_t shared, non_shared, value_length;
    const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
                                      &shared, &non_shared, &value_length);
    if (key_ptr == nullptr || (shared != 0)) {
      CorruptionError();
      return false;
    }
    Slice mid_key(key_ptr, non_shared);
    int cmp = Compare(mid_key, target);
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // restart points before "mid" are uninteresting.
      left = mid;
    } else if (cmp > 0) {
      // Key at "mid" is larger than "target". Therefore all
      // restart points at or after "mid" are uninteresting.
      right = mid - 1;
    } else {
      left = right = mid;
    }
  }

  *index = left;
  return true;
}

// Compares the target key with the block key (the restart key) of the
// restart block `block_index`. On corruption, sets the error status and
// returns 1, i.e. treats the target as smaller so callers stop searching.
int BlockIter::CompareBlockKey(uint32_t block_index, const Slice& target) {
  uint32_t region_offset = GetRestartPoint(block_index);
  uint32_t shared, non_shared, value_length;
  const char* key_ptr = DecodeEntry(data_ + region_offset, data_ + restarts_,
                                    &shared, &non_shared, &value_length);
  if (key_ptr == nullptr || (shared != 0)) {
    CorruptionError();
    return 1;  // Error: report the target as smaller
  }
  Slice block_key(key_ptr, non_shared);
  return Compare(block_key, target);
}

// Binary search in block_ids to find the first block
// with a key >= target
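//
// block_ids comes from the prefix index and may be non-contiguous (for
// illustration, something like {2, 5, 6}). If the search converges on a
// candidate whose physical predecessor is absent from that list, the code
// below additionally compares the target against the predecessor's restart
// key: when even that key is larger than the target, the target cannot lie
// in the candidate block, so the key does not exist and the seek fails.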
bool BlockIter::BinaryBlockIndexSeek(const Slice& target, uint32_t* block_ids,
                                     uint32_t left, uint32_t right,
                                     uint32_t* index) {
  assert(left <= right);
  uint32_t left_bound = left;

  while (left <= right) {
    uint32_t mid = (right + left) / 2;

    int cmp = CompareBlockKey(block_ids[mid], target);
    if (!status_.ok()) {
      return false;
    }
    if (cmp < 0) {
      // Key at "mid" is smaller than "target". Therefore all
      // blocks before or at "mid" are uninteresting.
      left = mid + 1;
    } else {
      // Key at "mid" is >= "target". Therefore all blocks
      // after "mid" are uninteresting.
      // If there is only one block left, we found it.
      if (left == right) break;
      right = mid;
    }
  }

  if (left == right) {
    // We are in one of two cases:
    // (1) left is the first entry of block_ids, or
    // (2) there is a gap between the blocks at `left - 1` and `left`.
    // Either way, we can tell a key that is in the found block apart from
    // one that does not exist by comparing the target against the restart
    // key of the block physically preceding the one we found.
    if (block_ids[left] > 0 &&
        (left == left_bound || block_ids[left - 1] != block_ids[left] - 1) &&
        CompareBlockKey(block_ids[left] - 1, target) > 0) {
      current_ = restarts_;
      return false;
    }

    *index = block_ids[left];
    return true;
  } else {
    assert(left > right);
    // Mark iterator invalid
    current_ = restarts_;
    return false;
  }
}

bool BlockIter::PrefixSeek(const Slice& target, uint32_t* index) {
  assert(prefix_index_);
  uint32_t* block_ids = nullptr;
  uint32_t num_blocks = prefix_index_->GetBlocks(target, &block_ids);

  if (num_blocks == 0) {
    current_ = restarts_;
    return false;
  } else {
    return BinaryBlockIndexSeek(target, block_ids, 0, num_blocks - 1, index);
  }
}

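// The block trailer stores the restart array followed by its length: the
// last fixed32 of the block holds the restart count, and the R fixed32
// offsets immediately before it locate the restart entries.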
uint32_t Block::NumRestarts() const {
  assert(size_ >= 2 * sizeof(uint32_t));
  return DecodeFixed32(data_ + size_ - sizeof(uint32_t));
}

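// For illustration (made-up numbers): a well-formed block with
// size_ == 100 and three restart points has
// restart_offset_ == 100 - (1 + 3) * 4 == 84, with the restart offsets in
// bytes [84, 96) and the restart count in the final bytes [96, 100). The
// wrap-around check below rejects blocks too small to hold that trailer.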
Block::Block(BlockContents&& contents, SequenceNumber _global_seqno,
             size_t read_amp_bytes_per_bit, Statistics* statistics)
    : contents_(std::move(contents)),
      data_(contents_.data.data()),
      size_(contents_.data.size()),
      restart_offset_(0),
      global_seqno_(_global_seqno) {
  if (size_ < sizeof(uint32_t)) {
    size_ = 0;  // Error marker
  } else {
    restart_offset_ =
        static_cast<uint32_t>(size_) - (1 + NumRestarts()) * sizeof(uint32_t);
    if (restart_offset_ > size_ - sizeof(uint32_t)) {
      // The size is too small for NumRestarts() and therefore
      // restart_offset_ wrapped around.
      size_ = 0;
    }
  }
  if (read_amp_bytes_per_bit != 0 && statistics && size_ != 0) {
    read_amp_bitmap_.reset(new BlockReadAmpBitmap(
        restart_offset_, read_amp_bytes_per_bit, statistics));
  }
}

InternalIterator* Block::NewIterator(const Comparator* cmp, BlockIter* iter,
                                     bool total_order_seek,
                                     Statistics* stats) {
  if (size_ < 2 * sizeof(uint32_t)) {
    if (iter != nullptr) {
      iter->SetStatus(Status::Corruption("bad block contents"));
      return iter;
    } else {
      return NewErrorInternalIterator(
          Status::Corruption("bad block contents"));
    }
  }
  const uint32_t num_restarts = NumRestarts();
  if (num_restarts == 0) {
    if (iter != nullptr) {
      iter->SetStatus(Status::OK());
      return iter;
    } else {
      return NewEmptyInternalIterator();
    }
  } else {
    BlockPrefixIndex* prefix_index_ptr =
        total_order_seek ? nullptr : prefix_index_.get();

    if (iter != nullptr) {
      iter->Initialize(cmp, data_, restart_offset_, num_restarts,
                       prefix_index_ptr, global_seqno_,
                       read_amp_bitmap_.get());
    } else {
      iter = new BlockIter(cmp, data_, restart_offset_, num_restarts,
                           prefix_index_ptr, global_seqno_,
                           read_amp_bitmap_.get());
    }

    if (read_amp_bitmap_) {
      if (read_amp_bitmap_->GetStatistics() != stats) {
        // DB changed the Statistics pointer, we need to notify
        // read_amp_bitmap_
        read_amp_bitmap_->SetStatistics(stats);
      }
    }
  }

  return iter;
}

void Block::SetBlockPrefixIndex(BlockPrefixIndex* prefix_index) {
  prefix_index_.reset(prefix_index);
}

size_t Block::ApproximateMemoryUsage() const {
  size_t usage = usable_size();
  if (prefix_index_) {
    usage += prefix_index_->ApproximateMemoryUsage();
  }
  return usage;
}

}  // namespace rocksdb