1// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2// This source code is licensed under both the GPLv2 (found in the
3// COPYING file in the root directory) and Apache 2.0 License
4// (found in the LICENSE.Apache file in the root directory).
5//
6// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7// Use of this source code is governed by a BSD-style license that can be
8// found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10#pragma once
11
12#include <assert.h>
13#include <inttypes.h>
14
15#include <list>
16#include <string>
17#include <unordered_map>
18
19#include "rocksdb/comparator.h"
20#include "table/block_based_table_factory.h"
21#include "table/block_builder.h"
22#include "table/format.h"
23
24namespace rocksdb {
25// The interface for building index.
26// Instruction for adding a new concrete IndexBuilder:
27// 1. Create a subclass instantiated from IndexBuilder.
28// 2. Add a new entry associated with that subclass in TableOptions::IndexType.
29// 3. Add a create function for the new subclass in CreateIndexBuilder.
30// Note: we can devise more advanced design to simplify the process for adding
31// new subclass, which will, on the other hand, increase the code complexity and
32// catch unwanted attention from readers. Given that we won't add/change
33// indexes frequently, it makes sense to just embrace a more straightforward
34// design that just works.
35class IndexBuilder {
36 public:
37 static IndexBuilder* CreateIndexBuilder(
38 BlockBasedTableOptions::IndexType index_type,
39 const rocksdb::InternalKeyComparator* comparator,
40 const InternalKeySliceTransform* int_key_slice_transform,
41 const BlockBasedTableOptions& table_opt);
42
43 // Index builder will construct a set of blocks which contain:
44 // 1. One primary index block.
45 // 2. (Optional) a set of metablocks that contains the metadata of the
46 // primary index.
47 struct IndexBlocks {
48 Slice index_block_contents;
49 std::unordered_map<std::string, Slice> meta_blocks;
50 };
51 explicit IndexBuilder(const InternalKeyComparator* comparator)
52 : comparator_(comparator) {}
53
54 virtual ~IndexBuilder() {}
55
56 // Add a new index entry to index block.
57 // To allow further optimization, we provide `last_key_in_current_block` and
58 // `first_key_in_next_block`, based on which the specific implementation can
59 // determine the best index key to be used for the index block.
60 // @last_key_in_current_block: this parameter maybe overridden with the value
61 // "substitute key".
62 // @first_key_in_next_block: it will be nullptr if the entry being added is
63 // the last one in the table
64 //
65 // REQUIRES: Finish() has not yet been called.
66 virtual void AddIndexEntry(std::string* last_key_in_current_block,
67 const Slice* first_key_in_next_block,
68 const BlockHandle& block_handle) = 0;
69
70 // This method will be called whenever a key is added. The subclasses may
71 // override OnKeyAdded() if they need to collect additional information.
72 virtual void OnKeyAdded(const Slice& key) {}
73
74 // Inform the index builder that all entries has been written. Block builder
75 // may therefore perform any operation required for block finalization.
76 //
77 // REQUIRES: Finish() has not yet been called.
78 inline Status Finish(IndexBlocks* index_blocks) {
79 // Throw away the changes to last_partition_block_handle. It has no effect
80 // on the first call to Finish anyway.
81 BlockHandle last_partition_block_handle;
82 return Finish(index_blocks, last_partition_block_handle);
83 }
84
85 // This override of Finish can be utilized to build the 2nd level index in
86 // PartitionIndexBuilder.
87 //
88 // index_blocks will be filled with the resulting index data. If the return
89 // value is Status::InComplete() then it means that the index is partitioned
90 // and the callee should keep calling Finish until Status::OK() is returned.
91 // In that case, last_partition_block_handle is pointer to the block written
92 // with the result of the last call to Finish. This can be utilized to build
93 // the second level index pointing to each block of partitioned indexes. The
94 // last call to Finish() that returns Status::OK() populates index_blocks with
95 // the 2nd level index content.
96 virtual Status Finish(IndexBlocks* index_blocks,
97 const BlockHandle& last_partition_block_handle) = 0;
98
99 // Get the estimated size for index block.
100 virtual size_t EstimatedSize() const = 0;
101
102 protected:
103 const InternalKeyComparator* comparator_;
104};
105
106// This index builder builds space-efficient index block.
107//
108// Optimizations:
109// 1. Made block's `block_restart_interval` to be 1, which will avoid linear
110// search when doing index lookup (can be disabled by setting
111// index_block_restart_interval).
112// 2. Shorten the key length for index block. Other than honestly using the
113// last key in the data block as the index key, we instead find a shortest
114// substitute key that serves the same function.
115class ShortenedIndexBuilder : public IndexBuilder {
116 public:
117 explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator,
118 int index_block_restart_interval)
119 : IndexBuilder(comparator),
120 index_block_builder_(index_block_restart_interval) {}
121
122 virtual void AddIndexEntry(std::string* last_key_in_current_block,
123 const Slice* first_key_in_next_block,
124 const BlockHandle& block_handle) override {
125 if (first_key_in_next_block != nullptr) {
126 comparator_->FindShortestSeparator(last_key_in_current_block,
127 *first_key_in_next_block);
128 } else {
129 comparator_->FindShortSuccessor(last_key_in_current_block);
130 }
131
132 std::string handle_encoding;
133 block_handle.EncodeTo(&handle_encoding);
134 index_block_builder_.Add(*last_key_in_current_block, handle_encoding);
135 }
136
137 using IndexBuilder::Finish;
138 virtual Status Finish(
139 IndexBlocks* index_blocks,
140 const BlockHandle& last_partition_block_handle) override {
141 index_blocks->index_block_contents = index_block_builder_.Finish();
142 return Status::OK();
143 }
144
145 virtual size_t EstimatedSize() const override {
146 return index_block_builder_.CurrentSizeEstimate();
147 }
148
149 friend class PartitionedIndexBuilder;
150
151 private:
152 BlockBuilder index_block_builder_;
153};
154
155// HashIndexBuilder contains a binary-searchable primary index and the
156// metadata for secondary hash index construction.
157// The metadata for hash index consists two parts:
158// - a metablock that compactly contains a sequence of prefixes. All prefixes
159// are stored consectively without any metadata (like, prefix sizes) being
160// stored, which is kept in the other metablock.
161// - a metablock contains the metadata of the prefixes, including prefix size,
162// restart index and number of block it spans. The format looks like:
163//
164// +-----------------+---------------------------+---------------------+
165// <=prefix 1
166// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
167// +-----------------+---------------------------+---------------------+
168// <=prefix 2
169// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
170// +-----------------+---------------------------+---------------------+
171// | |
172// | .... |
173// | |
174// +-----------------+---------------------------+---------------------+
175// <=prefix n
176// | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes |
177// +-----------------+---------------------------+---------------------+
178//
179// The reason of separating these two metablocks is to enable the efficiently
180// reuse the first metablock during hash index construction without unnecessary
181// data copy or small heap allocations for prefixes.
182class HashIndexBuilder : public IndexBuilder {
183 public:
184 explicit HashIndexBuilder(const InternalKeyComparator* comparator,
185 const SliceTransform* hash_key_extractor,
186 int index_block_restart_interval)
187 : IndexBuilder(comparator),
188 primary_index_builder_(comparator, index_block_restart_interval),
189 hash_key_extractor_(hash_key_extractor) {}
190
191 virtual void AddIndexEntry(std::string* last_key_in_current_block,
192 const Slice* first_key_in_next_block,
193 const BlockHandle& block_handle) override {
194 ++current_restart_index_;
195 primary_index_builder_.AddIndexEntry(last_key_in_current_block,
196 first_key_in_next_block, block_handle);
197 }
198
199 virtual void OnKeyAdded(const Slice& key) override {
200 auto key_prefix = hash_key_extractor_->Transform(key);
201 bool is_first_entry = pending_block_num_ == 0;
202
203 // Keys may share the prefix
204 if (is_first_entry || pending_entry_prefix_ != key_prefix) {
205 if (!is_first_entry) {
206 FlushPendingPrefix();
207 }
208
209 // need a hard copy otherwise the underlying data changes all the time.
210 // TODO(kailiu) ToString() is expensive. We may speed up can avoid data
211 // copy.
212 pending_entry_prefix_ = key_prefix.ToString();
213 pending_block_num_ = 1;
214 pending_entry_index_ = static_cast<uint32_t>(current_restart_index_);
215 } else {
216 // entry number increments when keys share the prefix reside in
217 // different data blocks.
218 auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1;
219 assert(last_restart_index <= current_restart_index_);
220 if (last_restart_index != current_restart_index_) {
221 ++pending_block_num_;
222 }
223 }
224 }
225
226 virtual Status Finish(
227 IndexBlocks* index_blocks,
228 const BlockHandle& last_partition_block_handle) override {
229 FlushPendingPrefix();
230 primary_index_builder_.Finish(index_blocks, last_partition_block_handle);
231 index_blocks->meta_blocks.insert(
232 {kHashIndexPrefixesBlock.c_str(), prefix_block_});
233 index_blocks->meta_blocks.insert(
234 {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_});
235 return Status::OK();
236 }
237
238 virtual size_t EstimatedSize() const override {
239 return primary_index_builder_.EstimatedSize() + prefix_block_.size() +
240 prefix_meta_block_.size();
241 }
242
243 private:
244 void FlushPendingPrefix() {
245 prefix_block_.append(pending_entry_prefix_.data(),
246 pending_entry_prefix_.size());
247 PutVarint32Varint32Varint32(
248 &prefix_meta_block_,
249 static_cast<uint32_t>(pending_entry_prefix_.size()),
250 pending_entry_index_, pending_block_num_);
251 }
252
253 ShortenedIndexBuilder primary_index_builder_;
254 const SliceTransform* hash_key_extractor_;
255
256 // stores a sequence of prefixes
257 std::string prefix_block_;
258 // stores the metadata of prefixes
259 std::string prefix_meta_block_;
260
261 // The following 3 variables keeps unflushed prefix and its metadata.
262 // The details of block_num and entry_index can be found in
263 // "block_hash_index.{h,cc}"
264 uint32_t pending_block_num_ = 0;
265 uint32_t pending_entry_index_ = 0;
266 std::string pending_entry_prefix_;
267
268 uint64_t current_restart_index_ = 0;
269};
270
271/**
272 * IndexBuilder for two-level indexing. Internally it creates a new index for
273 * each partition and Finish then in order when Finish is called on it
274 * continiously until Status::OK() is returned.
275 *
276 * The format on the disk would be I I I I I I IP where I is block containing a
277 * partition of indexes built using ShortenedIndexBuilder and IP is a block
278 * containing a secondary index on the partitions, built using
279 * ShortenedIndexBuilder.
280 */
281class PartitionedIndexBuilder : public IndexBuilder {
282 public:
283 static PartitionedIndexBuilder* CreateIndexBuilder(
284 const rocksdb::InternalKeyComparator* comparator,
285 const BlockBasedTableOptions& table_opt);
286
287 explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator,
288 const BlockBasedTableOptions& table_opt);
289
290 virtual ~PartitionedIndexBuilder();
291
292 virtual void AddIndexEntry(std::string* last_key_in_current_block,
293 const Slice* first_key_in_next_block,
294 const BlockHandle& block_handle) override;
295
296 virtual Status Finish(
297 IndexBlocks* index_blocks,
298 const BlockHandle& last_partition_block_handle) override;
299
300 virtual size_t EstimatedSize() const override;
301 size_t EstimateTopLevelIndexSize(uint64_t) const;
302 size_t NumPartitions() const;
303
304 inline bool ShouldCutFilterBlock() {
305 // Current policy is to align the partitions of index and filters
306 if (cut_filter_block) {
307 cut_filter_block = false;
308 return true;
309 }
310 return false;
311 }
312
313 std::string& GetPartitionKey() { return sub_index_last_key_; }
314
315 // Called when an external entity (such as filter partition builder) request
316 // cutting the next partition
317 void RequestPartitionCut();
318
319 private:
320 void MakeNewSubIndexBuilder();
321
322 struct Entry {
323 std::string key;
324 std::unique_ptr<ShortenedIndexBuilder> value;
325 };
326 std::list<Entry> entries_; // list of partitioned indexes and their keys
327 BlockBuilder index_block_builder_; // top-level index builder
328 // the active partition index builder
329 ShortenedIndexBuilder* sub_index_builder_;
330 // the last key in the active partition index builder
331 std::string sub_index_last_key_;
332 std::unique_ptr<FlushBlockPolicy> flush_policy_;
333 // true if Finish is called once but not complete yet.
334 bool finishing_indexes = false;
335 const BlockBasedTableOptions& table_opt_;
336 // true if an external entity (such as filter partition builder) request
337 // cutting the next partition
338 bool partition_cut_requested_ = true;
339 // true if it should cut the next filter partition block
340 bool cut_filter_block = false;
341};
342} // namespace rocksdb
343