| 1 | // Copyright (c) 2011-present, Facebook, Inc. All rights reserved. |
| 2 | // This source code is licensed under both the GPLv2 (found in the |
| 3 | // COPYING file in the root directory) and Apache 2.0 License |
| 4 | // (found in the LICENSE.Apache file in the root directory). |
| 5 | // |
| 6 | // Copyright (c) 2011 The LevelDB Authors. All rights reserved. |
| 7 | // Use of this source code is governed by a BSD-style license that can be |
| 8 | // found in the LICENSE file. See the AUTHORS file for names of contributors. |
| 9 | |
| 10 | #pragma once |
| 11 | |
| 12 | #include <assert.h> |
| 13 | #include <inttypes.h> |
| 14 | |
| 15 | #include <list> |
| 16 | #include <string> |
| 17 | #include <unordered_map> |
| 18 | |
| 19 | #include "rocksdb/comparator.h" |
| 20 | #include "table/block_based_table_factory.h" |
| 21 | #include "table/block_builder.h" |
| 22 | #include "table/format.h" |
| 23 | |
| 24 | namespace rocksdb { |
| 25 | // The interface for building index. |
| 26 | // Instruction for adding a new concrete IndexBuilder: |
| 27 | // 1. Create a subclass instantiated from IndexBuilder. |
| 28 | // 2. Add a new entry associated with that subclass in TableOptions::IndexType. |
| 29 | // 3. Add a create function for the new subclass in CreateIndexBuilder. |
| 30 | // Note: we can devise more advanced design to simplify the process for adding |
| 31 | // new subclass, which will, on the other hand, increase the code complexity and |
| 32 | // catch unwanted attention from readers. Given that we won't add/change |
| 33 | // indexes frequently, it makes sense to just embrace a more straightforward |
| 34 | // design that just works. |
| 35 | class IndexBuilder { |
| 36 | public: |
| 37 | static IndexBuilder* CreateIndexBuilder( |
| 38 | BlockBasedTableOptions::IndexType index_type, |
| 39 | const rocksdb::InternalKeyComparator* comparator, |
| 40 | const InternalKeySliceTransform* int_key_slice_transform, |
| 41 | const BlockBasedTableOptions& table_opt); |
| 42 | |
| 43 | // Index builder will construct a set of blocks which contain: |
| 44 | // 1. One primary index block. |
| 45 | // 2. (Optional) a set of metablocks that contains the metadata of the |
| 46 | // primary index. |
| 47 | struct IndexBlocks { |
| 48 | Slice index_block_contents; |
| 49 | std::unordered_map<std::string, Slice> meta_blocks; |
| 50 | }; |
| 51 | explicit IndexBuilder(const InternalKeyComparator* comparator) |
| 52 | : comparator_(comparator) {} |
| 53 | |
| 54 | virtual ~IndexBuilder() {} |
| 55 | |
| 56 | // Add a new index entry to index block. |
| 57 | // To allow further optimization, we provide `last_key_in_current_block` and |
| 58 | // `first_key_in_next_block`, based on which the specific implementation can |
| 59 | // determine the best index key to be used for the index block. |
| 60 | // @last_key_in_current_block: this parameter maybe overridden with the value |
| 61 | // "substitute key". |
| 62 | // @first_key_in_next_block: it will be nullptr if the entry being added is |
| 63 | // the last one in the table |
| 64 | // |
| 65 | // REQUIRES: Finish() has not yet been called. |
| 66 | virtual void AddIndexEntry(std::string* last_key_in_current_block, |
| 67 | const Slice* first_key_in_next_block, |
| 68 | const BlockHandle& block_handle) = 0; |
| 69 | |
| 70 | // This method will be called whenever a key is added. The subclasses may |
| 71 | // override OnKeyAdded() if they need to collect additional information. |
| 72 | virtual void OnKeyAdded(const Slice& key) {} |
| 73 | |
| 74 | // Inform the index builder that all entries has been written. Block builder |
| 75 | // may therefore perform any operation required for block finalization. |
| 76 | // |
| 77 | // REQUIRES: Finish() has not yet been called. |
| 78 | inline Status Finish(IndexBlocks* index_blocks) { |
| 79 | // Throw away the changes to last_partition_block_handle. It has no effect |
| 80 | // on the first call to Finish anyway. |
| 81 | BlockHandle last_partition_block_handle; |
| 82 | return Finish(index_blocks, last_partition_block_handle); |
| 83 | } |
| 84 | |
| 85 | // This override of Finish can be utilized to build the 2nd level index in |
| 86 | // PartitionIndexBuilder. |
| 87 | // |
| 88 | // index_blocks will be filled with the resulting index data. If the return |
| 89 | // value is Status::InComplete() then it means that the index is partitioned |
| 90 | // and the callee should keep calling Finish until Status::OK() is returned. |
| 91 | // In that case, last_partition_block_handle is pointer to the block written |
| 92 | // with the result of the last call to Finish. This can be utilized to build |
| 93 | // the second level index pointing to each block of partitioned indexes. The |
| 94 | // last call to Finish() that returns Status::OK() populates index_blocks with |
| 95 | // the 2nd level index content. |
| 96 | virtual Status Finish(IndexBlocks* index_blocks, |
| 97 | const BlockHandle& last_partition_block_handle) = 0; |
| 98 | |
| 99 | // Get the estimated size for index block. |
| 100 | virtual size_t EstimatedSize() const = 0; |
| 101 | |
| 102 | protected: |
| 103 | const InternalKeyComparator* comparator_; |
| 104 | }; |
| 105 | |
| 106 | // This index builder builds space-efficient index block. |
| 107 | // |
| 108 | // Optimizations: |
| 109 | // 1. Made block's `block_restart_interval` to be 1, which will avoid linear |
| 110 | // search when doing index lookup (can be disabled by setting |
| 111 | // index_block_restart_interval). |
| 112 | // 2. Shorten the key length for index block. Other than honestly using the |
| 113 | // last key in the data block as the index key, we instead find a shortest |
| 114 | // substitute key that serves the same function. |
| 115 | class ShortenedIndexBuilder : public IndexBuilder { |
| 116 | public: |
| 117 | explicit ShortenedIndexBuilder(const InternalKeyComparator* comparator, |
| 118 | int index_block_restart_interval) |
| 119 | : IndexBuilder(comparator), |
| 120 | index_block_builder_(index_block_restart_interval) {} |
| 121 | |
| 122 | virtual void AddIndexEntry(std::string* last_key_in_current_block, |
| 123 | const Slice* first_key_in_next_block, |
| 124 | const BlockHandle& block_handle) override { |
| 125 | if (first_key_in_next_block != nullptr) { |
| 126 | comparator_->FindShortestSeparator(last_key_in_current_block, |
| 127 | *first_key_in_next_block); |
| 128 | } else { |
| 129 | comparator_->FindShortSuccessor(last_key_in_current_block); |
| 130 | } |
| 131 | |
| 132 | std::string handle_encoding; |
| 133 | block_handle.EncodeTo(&handle_encoding); |
| 134 | index_block_builder_.Add(*last_key_in_current_block, handle_encoding); |
| 135 | } |
| 136 | |
| 137 | using IndexBuilder::Finish; |
| 138 | virtual Status Finish( |
| 139 | IndexBlocks* index_blocks, |
| 140 | const BlockHandle& last_partition_block_handle) override { |
| 141 | index_blocks->index_block_contents = index_block_builder_.Finish(); |
| 142 | return Status::OK(); |
| 143 | } |
| 144 | |
| 145 | virtual size_t EstimatedSize() const override { |
| 146 | return index_block_builder_.CurrentSizeEstimate(); |
| 147 | } |
| 148 | |
| 149 | friend class PartitionedIndexBuilder; |
| 150 | |
| 151 | private: |
| 152 | BlockBuilder index_block_builder_; |
| 153 | }; |
| 154 | |
| 155 | // HashIndexBuilder contains a binary-searchable primary index and the |
| 156 | // metadata for secondary hash index construction. |
| 157 | // The metadata for hash index consists two parts: |
| 158 | // - a metablock that compactly contains a sequence of prefixes. All prefixes |
| 159 | // are stored consectively without any metadata (like, prefix sizes) being |
| 160 | // stored, which is kept in the other metablock. |
| 161 | // - a metablock contains the metadata of the prefixes, including prefix size, |
| 162 | // restart index and number of block it spans. The format looks like: |
| 163 | // |
| 164 | // +-----------------+---------------------------+---------------------+ |
| 165 | // <=prefix 1 |
| 166 | // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | |
| 167 | // +-----------------+---------------------------+---------------------+ |
| 168 | // <=prefix 2 |
| 169 | // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | |
| 170 | // +-----------------+---------------------------+---------------------+ |
| 171 | // | | |
| 172 | // | .... | |
| 173 | // | | |
| 174 | // +-----------------+---------------------------+---------------------+ |
| 175 | // <=prefix n |
| 176 | // | length: 4 bytes | restart interval: 4 bytes | num-blocks: 4 bytes | |
| 177 | // +-----------------+---------------------------+---------------------+ |
| 178 | // |
| 179 | // The reason of separating these two metablocks is to enable the efficiently |
| 180 | // reuse the first metablock during hash index construction without unnecessary |
| 181 | // data copy or small heap allocations for prefixes. |
| 182 | class HashIndexBuilder : public IndexBuilder { |
| 183 | public: |
| 184 | explicit HashIndexBuilder(const InternalKeyComparator* comparator, |
| 185 | const SliceTransform* , |
| 186 | int index_block_restart_interval) |
| 187 | : IndexBuilder(comparator), |
| 188 | primary_index_builder_(comparator, index_block_restart_interval), |
| 189 | hash_key_extractor_(hash_key_extractor) {} |
| 190 | |
| 191 | virtual void AddIndexEntry(std::string* last_key_in_current_block, |
| 192 | const Slice* first_key_in_next_block, |
| 193 | const BlockHandle& block_handle) override { |
| 194 | ++current_restart_index_; |
| 195 | primary_index_builder_.AddIndexEntry(last_key_in_current_block, |
| 196 | first_key_in_next_block, block_handle); |
| 197 | } |
| 198 | |
| 199 | virtual void OnKeyAdded(const Slice& key) override { |
| 200 | auto key_prefix = hash_key_extractor_->Transform(key); |
| 201 | bool is_first_entry = pending_block_num_ == 0; |
| 202 | |
| 203 | // Keys may share the prefix |
| 204 | if (is_first_entry || pending_entry_prefix_ != key_prefix) { |
| 205 | if (!is_first_entry) { |
| 206 | FlushPendingPrefix(); |
| 207 | } |
| 208 | |
| 209 | // need a hard copy otherwise the underlying data changes all the time. |
| 210 | // TODO(kailiu) ToString() is expensive. We may speed up can avoid data |
| 211 | // copy. |
| 212 | pending_entry_prefix_ = key_prefix.ToString(); |
| 213 | pending_block_num_ = 1; |
| 214 | pending_entry_index_ = static_cast<uint32_t>(current_restart_index_); |
| 215 | } else { |
| 216 | // entry number increments when keys share the prefix reside in |
| 217 | // different data blocks. |
| 218 | auto last_restart_index = pending_entry_index_ + pending_block_num_ - 1; |
| 219 | assert(last_restart_index <= current_restart_index_); |
| 220 | if (last_restart_index != current_restart_index_) { |
| 221 | ++pending_block_num_; |
| 222 | } |
| 223 | } |
| 224 | } |
| 225 | |
| 226 | virtual Status Finish( |
| 227 | IndexBlocks* index_blocks, |
| 228 | const BlockHandle& last_partition_block_handle) override { |
| 229 | FlushPendingPrefix(); |
| 230 | primary_index_builder_.Finish(index_blocks, last_partition_block_handle); |
| 231 | index_blocks->meta_blocks.insert( |
| 232 | {kHashIndexPrefixesBlock.c_str(), prefix_block_}); |
| 233 | index_blocks->meta_blocks.insert( |
| 234 | {kHashIndexPrefixesMetadataBlock.c_str(), prefix_meta_block_}); |
| 235 | return Status::OK(); |
| 236 | } |
| 237 | |
| 238 | virtual size_t EstimatedSize() const override { |
| 239 | return primary_index_builder_.EstimatedSize() + prefix_block_.size() + |
| 240 | prefix_meta_block_.size(); |
| 241 | } |
| 242 | |
| 243 | private: |
| 244 | void FlushPendingPrefix() { |
| 245 | prefix_block_.append(pending_entry_prefix_.data(), |
| 246 | pending_entry_prefix_.size()); |
| 247 | PutVarint32Varint32Varint32( |
| 248 | &prefix_meta_block_, |
| 249 | static_cast<uint32_t>(pending_entry_prefix_.size()), |
| 250 | pending_entry_index_, pending_block_num_); |
| 251 | } |
| 252 | |
| 253 | ShortenedIndexBuilder primary_index_builder_; |
| 254 | const SliceTransform* ; |
| 255 | |
| 256 | // stores a sequence of prefixes |
| 257 | std::string prefix_block_; |
| 258 | // stores the metadata of prefixes |
| 259 | std::string prefix_meta_block_; |
| 260 | |
| 261 | // The following 3 variables keeps unflushed prefix and its metadata. |
| 262 | // The details of block_num and entry_index can be found in |
| 263 | // "block_hash_index.{h,cc}" |
| 264 | uint32_t pending_block_num_ = 0; |
| 265 | uint32_t pending_entry_index_ = 0; |
| 266 | std::string pending_entry_prefix_; |
| 267 | |
| 268 | uint64_t current_restart_index_ = 0; |
| 269 | }; |
| 270 | |
| 271 | /** |
| 272 | * IndexBuilder for two-level indexing. Internally it creates a new index for |
| 273 | * each partition and Finish then in order when Finish is called on it |
| 274 | * continiously until Status::OK() is returned. |
| 275 | * |
| 276 | * The format on the disk would be I I I I I I IP where I is block containing a |
| 277 | * partition of indexes built using ShortenedIndexBuilder and IP is a block |
| 278 | * containing a secondary index on the partitions, built using |
| 279 | * ShortenedIndexBuilder. |
| 280 | */ |
| 281 | class PartitionedIndexBuilder : public IndexBuilder { |
| 282 | public: |
| 283 | static PartitionedIndexBuilder* CreateIndexBuilder( |
| 284 | const rocksdb::InternalKeyComparator* comparator, |
| 285 | const BlockBasedTableOptions& table_opt); |
| 286 | |
| 287 | explicit PartitionedIndexBuilder(const InternalKeyComparator* comparator, |
| 288 | const BlockBasedTableOptions& table_opt); |
| 289 | |
| 290 | virtual ~PartitionedIndexBuilder(); |
| 291 | |
| 292 | virtual void AddIndexEntry(std::string* last_key_in_current_block, |
| 293 | const Slice* first_key_in_next_block, |
| 294 | const BlockHandle& block_handle) override; |
| 295 | |
| 296 | virtual Status Finish( |
| 297 | IndexBlocks* index_blocks, |
| 298 | const BlockHandle& last_partition_block_handle) override; |
| 299 | |
| 300 | virtual size_t EstimatedSize() const override; |
| 301 | size_t EstimateTopLevelIndexSize(uint64_t) const; |
| 302 | size_t NumPartitions() const; |
| 303 | |
| 304 | inline bool ShouldCutFilterBlock() { |
| 305 | // Current policy is to align the partitions of index and filters |
| 306 | if (cut_filter_block) { |
| 307 | cut_filter_block = false; |
| 308 | return true; |
| 309 | } |
| 310 | return false; |
| 311 | } |
| 312 | |
| 313 | std::string& GetPartitionKey() { return sub_index_last_key_; } |
| 314 | |
| 315 | // Called when an external entity (such as filter partition builder) request |
| 316 | // cutting the next partition |
| 317 | void RequestPartitionCut(); |
| 318 | |
| 319 | private: |
| 320 | void MakeNewSubIndexBuilder(); |
| 321 | |
| 322 | struct Entry { |
| 323 | std::string key; |
| 324 | std::unique_ptr<ShortenedIndexBuilder> value; |
| 325 | }; |
| 326 | std::list<Entry> entries_; // list of partitioned indexes and their keys |
| 327 | BlockBuilder index_block_builder_; // top-level index builder |
| 328 | // the active partition index builder |
| 329 | ShortenedIndexBuilder* sub_index_builder_; |
| 330 | // the last key in the active partition index builder |
| 331 | std::string sub_index_last_key_; |
| 332 | std::unique_ptr<FlushBlockPolicy> flush_policy_; |
| 333 | // true if Finish is called once but not complete yet. |
| 334 | bool finishing_indexes = false; |
| 335 | const BlockBasedTableOptions& table_opt_; |
| 336 | // true if an external entity (such as filter partition builder) request |
| 337 | // cutting the next partition |
| 338 | bool partition_cut_requested_ = true; |
| 339 | // true if it should cut the next filter partition block |
| 340 | bool cut_filter_block = false; |
| 341 | }; |
| 342 | } // namespace rocksdb |
| 343 | |