1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
2 | // This source code is licensed under both the GPLv2 (found in the |
3 | // COPYING file in the root directory) and Apache 2.0 License |
4 | // (found in the LICENSE.Apache file in the root directory). |
5 | // |
6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
7 | // Use of this source code is governed by a BSD-style license that can be |
8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
9 | |
10 | #pragma once |
11 | |
12 | #include <assert.h> |
13 | #include <inttypes.h> |
14 | |
15 | #include <list> |
16 | #include <string> |
17 | #include <unordered_map> |
18 | |
19 | #include "rocksdb/comparator.h" |
20 | #include "table/block_based_table_factory.h" |
21 | #include "table/block_builder.h" |
22 | #include "table/format.h" |
23 | |
24 | namespace rocksdb { |
25 | // The interface for building index. |
26 | // Instruction for adding a new concrete IndexBuilder: |
27 | // 1. Create a subclass instantiated from IndexBuilder. |
28 | // 2. Add a new entry associated with that subclass in TableOptions::IndexType. |
29 | // 3. Add a create function for the new subclass in CreateIndexBuilder. |
30 | // Note: we can devise more advanced design to simplify the process for adding |
31 | // new subclass, which will, on the other hand, increase the code complexity and |
32 | // catch unwanted attention from readers. Given that we won't add/change |
33 | // indexes frequently, it makes sense to just embrace a more straightforward |
34 | // design that just works. |
35 | class IndexBuilder { |
36 | public: |
37 | static IndexBuilder* CreateIndexBuilder( |
38 | BlockBasedTableOptions::IndexType index_type, |
39 | const rocksdb::InternalKeyComparator* comparator, |
40 | const InternalKeySliceTransform* int_key_slice_transform, |
41 | const BlockBasedTableOptions& table_opt); |
42 | |
43 | // Index builder will construct a set of blocks which contain: |
44 | // 1. One primary index block. |
45 | // 2. (Optional) a set of metablocks that contains the metadata of the |
46 | // primary index. |
47 | struct IndexBlocks { |
48 | Slice index_block_contents; |
49 | std::unordered_map<std::string, Slice> meta_blocks; |
50 | }; |
51 | explicit IndexBuilder(const InternalKeyComparator* comparator) |
52 | : comparator_(comparator) {} |
53 | |
54 | virtual ~IndexBuilder() {} |
55 | |
56 | // Add a new index entry to index block. |
57 | // To allow further optimization, we provide `last_key_in_current_block` and |
58 | // `first_key_in_next_block`, based on which the specific implementation can |
59 | // determine the best index key to be used for the index block. |
60 | // @last_key_in_current_block: this parameter maybe overridden with the value |
61 | // "substitute key". |
62 | // @first_key_in_next_block: it will be nullptr if the entry being added is |
63 | // the last one in the table |
64 | // |
65 | // REQUIRES: Finish() has not yet been called. |
66 | virtual void AddIndexEntry(std::string* last_key_in_current_block, |
67 | const Slice* first_key_in_next_block, |
68 | const BlockHandle& block_handle) = 0; |
69 | |
70 | // This method will be called whenever a key is added. The subclasses may |
71 | // override OnKeyAdded() if they need to collect additional information. |
72 | virtual void OnKeyAdded(const Slice& key) {} |
73 | |
74 | // Inform the index builder that all entries has been written. Block builder |
75 | // may therefore perform any operation required for block finalization. |
76 | // |
77 | // REQUIRES: Finish() has not yet been called. |
78 | inline Status Finish(IndexBlocks* index_blocks) { |
79 | // Throw away the changes to last_partition_block_handle. It has no effect |
80 | // on the first call to Finish anyway. |
81 | BlockHandle last_partition_block_handle; |
82 | return Finish(index_blocks, last_partition_block_handle); |
83 | } |
84 | |
85 | // This override of Finish can be utilized to build the 2nd level index in |
86 | // PartitionIndexBuilder. |
87 | // |
88 | // index_blocks will be filled with the resulting index data. If the return |
89 | // value is Status::InComplete() then it means that the index is partitioned |
90 | // and the callee should keep calling Finish until Status::OK() is returned. |
91 | // In that case, last_partition_block_handle is pointer to the block written |
92 | // with the result of the last call to Finish. This can be utilized to build |
93 | // the second level index pointing to each block of partitioned indexes. The |
94 | // last call to Finish() that returns Status::OK() populates index_blocks with |
95 | // the 2nd level index content. |
96 | virtual Status Finish(IndexBlocks* index_blocks, |
97 | const BlockHandle& last_partition_block_handle) = 0; |
98 | |
99 | // Get the estimated size for index block. |
100 | virtual size_t EstimatedSize() const = 0; |
101 | |
102 | protected: |
103 | const InternalKeyComparator* comparator_; |
104 | }; |
105 | |
106 | // This index builder builds space-efficient index block. |
107 | // |
108 | // Optimizations: |
109 | // 1. Made block's `block_restart_interval` to be 1, which will avoid linear |
110 | // search when doing index lookup (can be disabled by setting |
111 | // index_block_restart_interval). |
112 | // 2. Shorten the key length for index block. Other than honestly using the |
113 | // last key in the data block as the index key, we instead find a shortest |
114 | // substitute key that serves the same function. |
115 | class ShortenedIndexBuilder : public IndexBuilder { |
116 | public: |
117 | explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator, |
118 | int index_block_restart_interval) |
119 | : IndexBuilder(comparator), |
120 | index_block_builder_(index_block_restart_interval) {} |
121 | |
122 | virtual void AddIndexEntry(std::string* last_key_in_current_block, |
123 | const Slice* first_key_in_next_block, |
124 | const BlockHandle& block_handle) override { |
125 | if (first_key_in_next_block != nullptr) { |
126 | comparator_->FindShortestSeparator(last_key_in_current_block, |
127 | *first_key_in_next_block); |
128 | } else { |
129 | comparator_->FindShortSuccessor(last_key_in_current_block); |
130 | } |
131 | |
132 | std::string handle_encoding; |
133 | block_handle.EncodeTo(&handle_encoding); |
134 | index_block_builder_.Add(*last_key_in_current_block, handle_encoding); |
135 | } |
136 | |
137 | using IndexBuilder::Finish; |
138 | virtual Status Finish( |
139 | IndexBlocks* index_blocks, |
140 | const BlockHandle& last_partition_block_handle) override { |
141 | index_blocks->index_block_contents = index_block_builder_.Finish(); |
142 | return Status::OK(); |
143 | } |
144 | |
145 | virtual size_t EstimatedSize() const override { |
146 | return index_block_builder_.CurrentSizeEstimate(); |
147 | } |
148 | |
149 | friend class PartitionedIndexBuilder; |
150 | |
151 | private: |
152 | BlockBuilder index_block_builder_; |
153 | }; |
154 | |
155 | // HashIndexBuilder contains a binary-searchable primary index and the |
156 | // metadata for secondary hash index construction. |
157 | // The metadata for hash index consists two parts: |
158 | // - a metablock that compactly contains a sequence of prefixes. All prefixes |
159 | // are stored consectively without any metadata (like, prefix sizes) being |
160 | // stored, which is kept in the other metablock. |
161 | // - a metablock contains the metadata of the prefixes, including prefix size, |
162 | // restart index and number of block it spans. The format looks like: |
163 | // |
164 | // +-----------------+---------------------------+---------------------+ |
165 | // <=prefix 1 |
166 | // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | |
167 | // +-----------------+---------------------------+---------------------+ |
168 | // <=prefix 2 |
169 | // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | |
170 | // +-----------------+---------------------------+---------------------+ |
171 | // | | |
172 | // | .... | |
173 | // | | |
174 | // +-----------------+---------------------------+---------------------+ |
175 | // <=prefix n |
176 | // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | |
177 | // +-----------------+---------------------------+---------------------+ |
178 | // |
179 | // The reason of separating these two metablocks is to enable the efficiently |
180 | // reuse the first metablock during hash index construction without unnecessary |
181 | // data copy or small heap allocations for prefixes. |
182 | class HashIndexBuilder : public IndexBuilder { |
183 | public: |
184 | explicit HashIndexBuilder(const InternalKeyComparator* comparator, |
185 | const SliceTransform* , |
186 | int index_block_restart_interval) |
187 | : IndexBuilder(comparator), |
188 | primary_index_builder_(comparator, index_block_restart_interval), |
189 | hash_key_extractor_(hash_key_extractor) {} |
190 | |
191 | virtual void AddIndexEntry(std::string* last_key_in_current_block, |
192 | const Slice* first_key_in_next_block, |
193 | const BlockHandle& block_handle) override { |
194 | ++current_restart_index_; |
195 | primary_index_builder_.AddIndexEntry(last_key_in_current_block, |
196 | first_key_in_next_block, block_handle); |
197 | } |
198 | |
199 | virtual void OnKeyAdded(const Slice& key) override { |
200 | auto key_prefix = hash_key_extractor_->Transform(key); |
201 | bool is_first_entry = pending_block_num_ == 0; |
202 | |
203 | // Keys may share the prefix |
204 | if (is_first_entry || pending_entry_prefix_ != key_prefix) { |
205 | if (!is_first_entry) { |
206 | FlushPendingPrefix(); |
207 | } |
208 | |
209 | // need a hard copy otherwise the underlying data changes all the time. |
210 | // TODO(kailiu) ToString() is expensive. We may speed up can avoid data |
211 | // copy. |
212 | pending_entry_prefix_ = key_prefix.ToString(); |
213 | pending_block_num_ = 1; |
214 | pending_entry_index_ = static_cast<uint32_t>(current_restart_index_); |
215 | } else { |
216 | // entry number increments when keys share the prefix reside in |
217 | // different data blocks. |
218 | auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1; |
219 | assert(last_restart_index <= current_restart_index_); |
220 | if (last_restart_index != current_restart_index_) { |
221 | ++pending_block_num_; |
222 | } |
223 | } |
224 | } |
225 | |
226 | virtual Status Finish( |
227 | IndexBlocks* index_blocks, |
228 | const BlockHandle& last_partition_block_handle) override { |
229 | FlushPendingPrefix(); |
230 | primary_index_builder_.Finish(index_blocks, last_partition_block_handle); |
231 | index_blocks->meta_blocks.insert( |
232 | {kHashIndexPrefixesBlock.c_str(), prefix_block_}); |
233 | index_blocks->meta_blocks.insert( |
234 | {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_}); |
235 | return Status::OK(); |
236 | } |
237 | |
238 | virtual size_t EstimatedSize() const override { |
239 | return primary_index_builder_.EstimatedSize() + prefix_block_.size() + |
240 | prefix_meta_block_.size(); |
241 | } |
242 | |
243 | private: |
244 | void FlushPendingPrefix() { |
245 | prefix_block_.append(pending_entry_prefix_.data(), |
246 | pending_entry_prefix_.size()); |
247 | PutVarint32Varint32Varint32( |
248 | &prefix_meta_block_, |
249 | static_cast<uint32_t>(pending_entry_prefix_.size()), |
250 | pending_entry_index_, pending_block_num_); |
251 | } |
252 | |
253 | ShortenedIndexBuilder primary_index_builder_; |
254 | const SliceTransform* ; |
255 | |
256 | // stores a sequence of prefixes |
257 | std::string prefix_block_; |
258 | // stores the metadata of prefixes |
259 | std::string prefix_meta_block_; |
260 | |
261 | // The following 3 variables keeps unflushed prefix and its metadata. |
262 | // The details of block_num and entry_index can be found in |
263 | // "block_hash_index.{h,cc}" |
264 | uint32_t pending_block_num_ = 0; |
265 | uint32_t pending_entry_index_ = 0; |
266 | std::string pending_entry_prefix_; |
267 | |
268 | uint64_t current_restart_index_ = 0; |
269 | }; |
270 | |
271 | /** |
272 | * IndexBuilder for two-level indexing. Internally it creates a new index for |
273 | * each partition and Finish then in order when Finish is called on it |
274 | * continiously until Status::OK() is returned. |
275 | * |
276 | * The format on the disk would be I I I I I I IP where I is block containing a |
277 | * partition of indexes built using ShortenedIndexBuilder and IP is a block |
278 | * containing a secondary index on the partitions, built using |
279 | * ShortenedIndexBuilder. |
280 | */ |
281 | class PartitionedIndexBuilder : public IndexBuilder { |
282 | public: |
283 | static PartitionedIndexBuilder* CreateIndexBuilder( |
284 | const rocksdb::InternalKeyComparator* comparator, |
285 | const BlockBasedTableOptions& table_opt); |
286 | |
287 | explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator, |
288 | const BlockBasedTableOptions& table_opt); |
289 | |
290 | virtual ~PartitionedIndexBuilder(); |
291 | |
292 | virtual void AddIndexEntry(std::string* last_key_in_current_block, |
293 | const Slice* first_key_in_next_block, |
294 | const BlockHandle& block_handle) override; |
295 | |
296 | virtual Status Finish( |
297 | IndexBlocks* index_blocks, |
298 | const BlockHandle& last_partition_block_handle) override; |
299 | |
300 | virtual size_t EstimatedSize() const override; |
301 | size_t EstimateTopLevelIndexSize(uint64_t) const; |
302 | size_t NumPartitions() const; |
303 | |
304 | inline bool ShouldCutFilterBlock() { |
305 | // Current policy is to align the partitions of index and filters |
306 | if (cut_filter_block) { |
307 | cut_filter_block = false; |
308 | return true; |
309 | } |
310 | return false; |
311 | } |
312 | |
313 | std::string& GetPartitionKey() { return sub_index_last_key_; } |
314 | |
315 | // Called when an external entity (such as filter partition builder) request |
316 | // cutting the next partition |
317 | void RequestPartitionCut(); |
318 | |
319 | private: |
320 | void MakeNewSubIndexBuilder(); |
321 | |
322 | struct Entry { |
323 | std::string key; |
324 | std::unique_ptr<ShortenedIndexBuilder> value; |
325 | }; |
326 | std::list<Entry> entries_; // list of partitioned indexes and their keys |
327 | BlockBuilder index_block_builder_; // top-level index builder |
328 | // the active partition index builder |
329 | ShortenedIndexBuilder* sub_index_builder_; |
330 | // the last key in the active partition index builder |
331 | std::string sub_index_last_key_; |
332 | std::unique_ptr<FlushBlockPolicy> flush_policy_; |
333 | // true if Finish is called once but not complete yet. |
334 | bool finishing_indexes = false; |
335 | const BlockBasedTableOptions& table_opt_; |
336 | // true if an external entity (such as filter partition builder) request |
337 | // cutting the next partition |
338 | bool partition_cut_requested_ = true; |
339 | // true if it should cut the next filter partition block |
340 | bool cut_filter_block = false; |
341 | }; |
342 | } // namespace rocksdb |
343 | |