| 1 | /* |
| 2 | Copyright (c) 2012,2013 Monty Program Ab |
| 3 | |
| 4 | This program is free software; you can redistribute it and/or modify |
| 5 | it under the terms of the GNU General Public License as published by |
| 6 | the Free Software Foundation; version 2 of the License. |
| 7 | |
| 8 | This program is distributed in the hope that it will be useful, |
| 9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 11 | GNU General Public License for more details. |
| 12 | |
| 13 | You should have received a copy of the GNU General Public License |
| 14 | along with this program; if not, write to the Free Software |
| 15 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ |
| 16 | #pragma once |
| 17 | |
| 18 | /* C++ standard header files */ |
| 19 | #include <cstdlib> |
| 20 | #include <algorithm> |
| 21 | #include <atomic> |
| 22 | #include <map> |
| 23 | #include <mutex> |
| 24 | #include <string> |
| 25 | #include <unordered_set> |
| 26 | #include <utility> |
| 27 | #include <vector> |
| 28 | #include <array> |
| 29 | |
| 30 | /* C standard header files */ |
| 31 | #ifndef _WIN32 |
| 32 | #include <arpa/inet.h> |
| 33 | #endif |
| 34 | |
| 35 | /* MyRocks header files */ |
| 36 | #include "./ha_rocksdb.h" |
| 37 | #include "./properties_collector.h" |
| 38 | #include "./rdb_buff.h" |
| 39 | #include "./rdb_utils.h" |
| 40 | |
| 41 | namespace myrocks { |
| 42 | |
| 43 | class Rdb_dict_manager; |
| 44 | class Rdb_key_def; |
| 45 | class Rdb_field_packing; |
| 46 | class Rdb_cf_manager; |
| 47 | class Rdb_ddl_manager; |
| 48 | |
/*
  Length of the buffer used for GTIDs.
  NOTE(review): presumably measured in bytes -- confirm against the users of
  this constant.
*/
constexpr uint32_t GTID_BUF_LEN = 60;
| 50 | |
| 51 | /* |
| 52 | @brief |
| 53 | Field packing context. |
| 54 | The idea is to ensure that a call to rdb_index_field_pack_t function |
| 55 | is followed by a call to rdb_make_unpack_info_t. |
| 56 | |
| 57 | @detail |
| 58 | For some datatypes, unpack_info is produced as a side effect of |
| 59 | rdb_index_field_pack_t function call. |
| 60 | For other datatypes, packing is just calling make_sort_key(), while |
| 61 | rdb_make_unpack_info_t is a custom function. |
| 62 | In order to accommodate both cases, we require both calls to be made and |
| 63 | unpack_info is passed as context data between the two. |
| 64 | */ |
| 65 | class Rdb_pack_field_context { |
| 66 | public: |
| 67 | Rdb_pack_field_context(const Rdb_pack_field_context &) = delete; |
| 68 | Rdb_pack_field_context &operator=(const Rdb_pack_field_context &) = delete; |
| 69 | |
| 70 | explicit Rdb_pack_field_context(Rdb_string_writer *const writer_arg) |
| 71 | : writer(writer_arg) {} |
| 72 | |
| 73 | // NULL means we're not producing unpack_info. |
| 74 | Rdb_string_writer *writer; |
| 75 | }; |
| 76 | |
| 77 | struct Rdb_collation_codec; |
| 78 | struct Rdb_index_info; |
| 79 | |
| 80 | /* |
| 81 | C-style "virtual table" allowing different handling of packing logic based |
| 82 | on the field type. See Rdb_field_packing::setup() implementation. |
| 83 | */ |
| 84 | using rdb_make_unpack_info_t = |
| 85 | void (Rdb_key_def::*)(const Rdb_collation_codec *codec, const Field *field, |
| 86 | Rdb_pack_field_context *pack_ctx) const; |
| 87 | using rdb_index_field_unpack_t = int (Rdb_key_def::*)( |
| 88 | Rdb_field_packing *fpi, Field *field, uchar *field_ptr, |
| 89 | Rdb_string_reader *reader, Rdb_string_reader *unpack_reader) const; |
| 90 | using rdb_index_field_skip_t = |
| 91 | int (Rdb_key_def::*)(const Rdb_field_packing *fpi, const Field *field, |
| 92 | Rdb_string_reader *reader) const; |
| 93 | using rdb_index_field_pack_t = |
| 94 | void (Rdb_key_def::*)(Rdb_field_packing *fpi, Field *field, uchar *buf, |
| 95 | uchar **dst, Rdb_pack_field_context *pack_ctx) const; |
| 96 | |
| 97 | const uint RDB_INVALID_KEY_LEN = uint(-1); |
| 98 | |
/* How much one checksum occupies when stored in the record */
const size_t RDB_CHECKSUM_SIZE = sizeof(uint32_t);

/*
  How much the checksum data occupies in record, in total.
  It is storing two checksums plus 1 tag-byte.
*/
const size_t RDB_CHECKSUM_CHUNK_SIZE = 2 * RDB_CHECKSUM_SIZE + 1;

/*
  Checksum data starts from CHECKSUM_DATA_TAG which is followed by two CRC32
  checksums.
*/
const char RDB_CHECKSUM_DATA_TAG = 0x01;

/*
  Unpack data is variable length. The header is 1 tag-byte plus a two byte
  length field. The length field includes the header as well.
*/
const char RDB_UNPACK_DATA_TAG = 0x02;
const size_t RDB_UNPACK_DATA_LEN_SIZE = sizeof(uint16_t);
/* Total header size: tag byte + length field (3 bytes). */
const size_t RDB_UNPACK_HEADER_SIZE =
    sizeof(RDB_UNPACK_DATA_TAG) + RDB_UNPACK_DATA_LEN_SIZE;

/*
  This header format is 1 tag-byte plus a two byte length field plus a two byte
  covered bitmap. The length field includes the header size.
*/
const char RDB_UNPACK_COVERED_DATA_TAG = 0x03;
const size_t RDB_UNPACK_COVERED_DATA_LEN_SIZE = sizeof(uint16_t);
const size_t RDB_COVERED_BITMAP_SIZE = sizeof(uint16_t);
/* Total header size: tag byte + length field + covered bitmap (5 bytes). */
const size_t RDB_UNPACK_COVERED_HEADER_SIZE =
    sizeof(RDB_UNPACK_COVERED_DATA_TAG) + RDB_UNPACK_COVERED_DATA_LEN_SIZE +
    RDB_COVERED_BITMAP_SIZE;
| 133 | |
| 134 | /* |
| 135 | Data dictionary index info field sizes. |
| 136 | */ |
| 137 | const size_t RDB_SIZEOF_INDEX_INFO_VERSION = sizeof(uint16); |
| 138 | const size_t RDB_SIZEOF_INDEX_TYPE = sizeof(uchar); |
| 139 | const size_t RDB_SIZEOF_KV_VERSION = sizeof(uint16); |
| 140 | const size_t RDB_SIZEOF_INDEX_FLAGS = sizeof(uint32); |
| 141 | const size_t RDB_SIZEOF_AUTO_INCREMENT_VERSION = sizeof(uint16); |
| 142 | |
// Result codes that rdb_index_field_unpack_t functions may return.
enum {
  UNPACK_SUCCESS = 0,  // the field was unpacked
  UNPACK_FAILURE = 1,  // the field could not be unpacked
};
| 148 | |
| 149 | /* |
| 150 | An object of this class represents information about an index in an SQL |
| 151 | table. It provides services to encode and decode index tuples. |
| 152 | |
| 153 | Note: a table (as in, on-disk table) has a single Rdb_key_def object which |
| 154 | is shared across multiple TABLE* objects and may be used simultaneously from |
| 155 | different threads. |
| 156 | |
| 157 | There are several data encodings: |
| 158 | |
| 159 | === SQL LAYER === |
| 160 | SQL layer uses two encodings: |
| 161 | |
| 162 | - "Table->record format". This is the format that is used for the data in |
| 163 | the record buffers, table->record[i] |
| 164 | |
| 165 | - KeyTupleFormat (see opt_range.cc) - this is used in parameters to index |
| 166 | lookup functions, like handler::index_read_map(). |
| 167 | |
| 168 | === Inside RocksDB === |
| 169 | Primary Key is stored as a mapping: |
| 170 | |
| 171 | index_tuple -> StoredRecord |
| 172 | |
| 173 | StoredRecord is in Table->record format, except for blobs, which are stored |
| 174 | in-place. See ha_rocksdb::convert_record_to_storage_format for details. |
| 175 | |
| 176 | Secondary indexes are stored as one of two variants: |
| 177 | |
| 178 | index_tuple -> unpack_info |
| 179 | index_tuple -> empty_string |
| 180 | |
| 181 | index_tuple here is the form of key that can be compared with memcmp(), aka |
| 182 | "mem-comparable form". |
| 183 | |
| 184 | unpack_info is extra data that allows to restore the original value from its |
| 185 | mem-comparable form. It is present only if the index supports index-only |
| 186 | reads. |
| 187 | */ |
| 188 | |
| 189 | class Rdb_key_def { |
| 190 | public: |
| 191 | /* Convert a key from KeyTupleFormat to mem-comparable form */ |
| 192 | uint pack_index_tuple(TABLE *const tbl, uchar *const pack_buffer, |
| 193 | uchar *const packed_tuple, const uchar *const key_tuple, |
| 194 | const key_part_map &keypart_map) const; |
| 195 | |
| 196 | uchar *pack_field(Field *const field, Rdb_field_packing *pack_info, |
| 197 | uchar *tuple, uchar *const packed_tuple, |
| 198 | uchar *const pack_buffer, |
| 199 | Rdb_string_writer *const unpack_info, |
| 200 | uint *const n_null_fields) const; |
| 201 | /* Convert a key from Table->record format to mem-comparable form */ |
| 202 | uint pack_record(const TABLE *const tbl, uchar *const pack_buffer, |
| 203 | const uchar *const record, uchar *const packed_tuple, |
| 204 | Rdb_string_writer *const unpack_info, |
| 205 | const bool &should_store_row_debug_checksums, |
| 206 | const longlong &hidden_pk_id = 0, uint n_key_parts = 0, |
| 207 | uint *const n_null_fields = nullptr, |
| 208 | uint *const ttl_pk_offset = nullptr, |
| 209 | const char *const ttl_bytes = nullptr) const; |
| 210 | /* Pack the hidden primary key into mem-comparable form. */ |
| 211 | uint pack_hidden_pk(const longlong &hidden_pk_id, |
| 212 | uchar *const packed_tuple) const; |
| 213 | int unpack_field(Rdb_field_packing *const fpi, |
| 214 | Field *const field, |
| 215 | Rdb_string_reader* reader, |
| 216 | const uchar *const default_value, |
| 217 | Rdb_string_reader* unp_reader) const; |
| 218 | int unpack_record(TABLE *const table, uchar *const buf, |
| 219 | const rocksdb::Slice *const packed_key, |
| 220 | const rocksdb::Slice *const unpack_info, |
| 221 | const bool &verify_row_debug_checksums) const; |
| 222 | |
| 223 | static bool unpack_info_has_checksum(const rocksdb::Slice &unpack_info); |
| 224 | int compare_keys(const rocksdb::Slice *key1, const rocksdb::Slice *key2, |
| 225 | std::size_t *const column_index) const; |
| 226 | |
| 227 | size_t key_length(const TABLE *const table, const rocksdb::Slice &key) const; |
| 228 | |
| 229 | /* Get the key that is the "infimum" for this index */ |
| 230 | inline void get_infimum_key(uchar *const key, uint *const size) const { |
| 231 | rdb_netbuf_store_index(key, m_index_number); |
| 232 | *size = INDEX_NUMBER_SIZE; |
| 233 | } |
| 234 | |
| 235 | /* Get the key that is a "supremum" for this index */ |
| 236 | inline void get_supremum_key(uchar *const key, uint *const size) const { |
| 237 | rdb_netbuf_store_index(key, m_index_number + 1); |
| 238 | *size = INDEX_NUMBER_SIZE; |
| 239 | } |
| 240 | |
| 241 | /* |
| 242 | Get the first key that you need to position at to start iterating. |
| 243 | |
| 244 | Stores into *key a "supremum" or "infimum" key value for the index. |
| 245 | |
| 246 | @return Number of bytes in the key that are usable for bloom filter use. |
| 247 | */ |
| 248 | inline int get_first_key(uchar *const key, uint *const size) const { |
| 249 | if (m_is_reverse_cf) |
| 250 | get_supremum_key(key, size); |
| 251 | else |
| 252 | get_infimum_key(key, size); |
| 253 | |
| 254 | /* Find out how many bytes of infimum are the same as m_index_number */ |
| 255 | uchar unmodified_key[INDEX_NUMBER_SIZE]; |
| 256 | rdb_netbuf_store_index(unmodified_key, m_index_number); |
| 257 | int i; |
| 258 | for (i = 0; i < INDEX_NUMBER_SIZE; i++) { |
| 259 | if (key[i] != unmodified_key[i]) |
| 260 | break; |
| 261 | } |
| 262 | return i; |
| 263 | } |
| 264 | |
| 265 | /* Make a key that is right after the given key. */ |
| 266 | static int successor(uchar *const packed_tuple, const uint &len); |
| 267 | |
| 268 | /* Make a key that is right before the given key. */ |
| 269 | static int predecessor(uchar *const packed_tuple, const uint &len); |
| 270 | |
| 271 | /* |
| 272 | This can be used to compare prefixes. |
| 273 | if X is a prefix of Y, then we consider that X = Y. |
| 274 | */ |
| 275 | // b describes the lookup key, which can be a prefix of a. |
| 276 | // b might be outside of the index_number range, if successor() is called. |
| 277 | int cmp_full_keys(const rocksdb::Slice &a, const rocksdb::Slice &b) const { |
| 278 | DBUG_ASSERT(covers_key(a)); |
| 279 | |
| 280 | return memcmp(a.data(), b.data(), std::min(a.size(), b.size())); |
| 281 | } |
| 282 | |
| 283 | /* Check if given mem-comparable key belongs to this index */ |
| 284 | bool covers_key(const rocksdb::Slice &slice) const { |
| 285 | if (slice.size() < INDEX_NUMBER_SIZE) |
| 286 | return false; |
| 287 | |
| 288 | if (memcmp(slice.data(), m_index_number_storage_form, INDEX_NUMBER_SIZE)) |
| 289 | return false; |
| 290 | |
| 291 | return true; |
| 292 | } |
| 293 | |
| 294 | void get_lookup_bitmap(const TABLE *table, MY_BITMAP *map) const; |
| 295 | |
| 296 | bool covers_lookup(TABLE *const table, |
| 297 | const rocksdb::Slice *const unpack_info, |
| 298 | const MY_BITMAP *const map) const; |
| 299 | |
| 300 | inline bool use_covered_bitmap_format() const { |
| 301 | return m_index_type == INDEX_TYPE_SECONDARY && |
| 302 | m_kv_format_version >= SECONDARY_FORMAT_VERSION_UPDATE3; |
| 303 | } |
| 304 | |
| 305 | /* |
| 306 | Return true if the passed mem-comparable key |
| 307 | - is from this index, and |
| 308 | - it matches the passed key prefix (the prefix is also in mem-comparable |
| 309 | form) |
| 310 | */ |
| 311 | bool value_matches_prefix(const rocksdb::Slice &value, |
| 312 | const rocksdb::Slice &prefix) const { |
| 313 | return covers_key(value) && !cmp_full_keys(value, prefix); |
| 314 | } |
| 315 | |
| 316 | uint32 get_keyno() const { return m_keyno; } |
| 317 | |
| 318 | uint32 get_index_number() const { return m_index_number; } |
| 319 | |
| 320 | GL_INDEX_ID get_gl_index_id() const { |
| 321 | const GL_INDEX_ID gl_index_id = {m_cf_handle->GetID(), m_index_number}; |
| 322 | return gl_index_id; |
| 323 | } |
| 324 | |
| 325 | int read_memcmp_key_part(const TABLE *table_arg, Rdb_string_reader *reader, |
| 326 | const uint part_num) const; |
| 327 | |
| 328 | /* Must only be called for secondary keys: */ |
| 329 | uint get_primary_key_tuple(const TABLE *const tbl, |
| 330 | const Rdb_key_def &pk_descr, |
| 331 | const rocksdb::Slice *const key, |
| 332 | uchar *const pk_buffer) const; |
| 333 | |
| 334 | uint get_memcmp_sk_parts(const TABLE *table, const rocksdb::Slice &key, |
| 335 | uchar *sk_buffer, uint *n_null_fields) const; |
| 336 | |
| 337 | /* Return max length of mem-comparable form */ |
| 338 | uint max_storage_fmt_length() const { return m_maxlength; } |
| 339 | |
| 340 | uint get_key_parts() const { return m_key_parts; } |
| 341 | |
| 342 | uint get_ttl_field_offset() const { return m_ttl_field_offset; } |
| 343 | |
| 344 | /* |
| 345 | Get a field object for key part #part_no |
| 346 | |
| 347 | @detail |
| 348 | SQL layer thinks unique secondary indexes and indexes in partitioned |
| 349 | tables are not "Extended" with Primary Key columns. |
| 350 | |
| 351 | Internally, we always extend all indexes with PK columns. This function |
| 352 | uses our definition of how the index is Extended. |
| 353 | */ |
| 354 | inline Field *get_table_field_for_part_no(TABLE *table, uint part_no) const; |
| 355 | |
| 356 | const std::string &get_name() const { return m_name; } |
| 357 | |
| 358 | const rocksdb::SliceTransform *() const { |
| 359 | return m_prefix_extractor.get(); |
| 360 | } |
| 361 | |
| 362 | static size_t (char tag); |
| 363 | |
| 364 | Rdb_key_def &operator=(const Rdb_key_def &) = delete; |
| 365 | Rdb_key_def(const Rdb_key_def &k); |
| 366 | Rdb_key_def(uint indexnr_arg, uint keyno_arg, |
| 367 | rocksdb::ColumnFamilyHandle *cf_handle_arg, |
| 368 | uint16_t index_dict_version_arg, uchar index_type_arg, |
| 369 | uint16_t kv_format_version_arg, bool is_reverse_cf_arg, |
| 370 | bool is_per_partition_cf, const char *name, |
| 371 | Rdb_index_stats stats = Rdb_index_stats(), uint32 index_flags = 0, |
| 372 | uint32 ttl_rec_offset = UINT_MAX, uint64 ttl_duration = 0); |
| 373 | ~Rdb_key_def(); |
| 374 | |
| 375 | enum { |
| 376 | INDEX_NUMBER_SIZE = 4, |
| 377 | VERSION_SIZE = 2, |
| 378 | CF_NUMBER_SIZE = 4, |
| 379 | CF_FLAG_SIZE = 4, |
| 380 | PACKED_SIZE = 4, // one int |
| 381 | }; |
| 382 | |
| 383 | // bit flags for combining bools when writing to disk |
| 384 | enum { |
| 385 | REVERSE_CF_FLAG = 1, |
| 386 | AUTO_CF_FLAG = 2, // Deprecated |
| 387 | PER_PARTITION_CF_FLAG = 4, |
| 388 | }; |
| 389 | |
| 390 | // bit flags which denote myrocks specific fields stored in the record |
| 391 | // currently only used for TTL. |
| 392 | enum INDEX_FLAG { |
| 393 | TTL_FLAG = 1 << 0, |
| 394 | |
| 395 | // MAX_FLAG marks where the actual record starts |
| 396 | // This flag always needs to be set to the last index flag enum. |
| 397 | MAX_FLAG = TTL_FLAG << 1, |
| 398 | }; |
| 399 | |
| 400 | // Set of flags to ignore when comparing two CF-s and determining if |
| 401 | // they're same. |
| 402 | static const uint CF_FLAGS_TO_IGNORE = PER_PARTITION_CF_FLAG; |
| 403 | |
| 404 | // Data dictionary types |
| 405 | enum DATA_DICT_TYPE { |
| 406 | DDL_ENTRY_INDEX_START_NUMBER = 1, |
| 407 | INDEX_INFO = 2, |
| 408 | CF_DEFINITION = 3, |
| 409 | BINLOG_INFO_INDEX_NUMBER = 4, |
| 410 | DDL_DROP_INDEX_ONGOING = 5, |
| 411 | INDEX_STATISTICS = 6, |
| 412 | MAX_INDEX_ID = 7, |
| 413 | DDL_CREATE_INDEX_ONGOING = 8, |
| 414 | AUTO_INC = 9, |
| 415 | END_DICT_INDEX_ID = 255 |
| 416 | }; |
| 417 | |
| 418 | // Data dictionary schema version. Introduce newer versions |
| 419 | // if changing schema layout |
| 420 | enum { |
| 421 | DDL_ENTRY_INDEX_VERSION = 1, |
| 422 | CF_DEFINITION_VERSION = 1, |
| 423 | BINLOG_INFO_INDEX_NUMBER_VERSION = 1, |
| 424 | DDL_DROP_INDEX_ONGOING_VERSION = 1, |
| 425 | MAX_INDEX_ID_VERSION = 1, |
| 426 | DDL_CREATE_INDEX_ONGOING_VERSION = 1, |
| 427 | AUTO_INCREMENT_VERSION = 1, |
| 428 | // Version for index stats is stored in IndexStats struct |
| 429 | }; |
| 430 | |
| 431 | // Index info version. Introduce newer versions when changing the |
| 432 | // INDEX_INFO layout. Update INDEX_INFO_VERSION_LATEST to point to the |
| 433 | // latest version number. |
| 434 | enum { |
| 435 | INDEX_INFO_VERSION_INITIAL = 1, // Obsolete |
| 436 | INDEX_INFO_VERSION_KV_FORMAT, |
| 437 | INDEX_INFO_VERSION_GLOBAL_ID, |
| 438 | // There is no change to data format in this version, but this version |
| 439 | // verifies KV format version, whereas previous versions do not. A version |
| 440 | // bump is needed to prevent older binaries from skipping the KV version |
| 441 | // check inadvertently. |
| 442 | INDEX_INFO_VERSION_VERIFY_KV_FORMAT, |
| 443 | // This changes the data format to include a 8 byte TTL duration for tables |
| 444 | INDEX_INFO_VERSION_TTL, |
| 445 | // This changes the data format to include a bitmap before the TTL duration |
| 446 | // which will indicate in the future whether TTL or other special fields |
| 447 | // are turned on or off. |
| 448 | INDEX_INFO_VERSION_FIELD_FLAGS, |
| 449 | // This normally point to the latest (currently it does). |
| 450 | INDEX_INFO_VERSION_LATEST = INDEX_INFO_VERSION_FIELD_FLAGS, |
| 451 | }; |
| 452 | |
| 453 | // MyRocks index types |
| 454 | enum { |
| 455 | INDEX_TYPE_PRIMARY = 1, |
| 456 | INDEX_TYPE_SECONDARY = 2, |
| 457 | INDEX_TYPE_HIDDEN_PRIMARY = 3, |
| 458 | }; |
| 459 | |
| 460 | // Key/Value format version for each index type |
| 461 | enum { |
| 462 | PRIMARY_FORMAT_VERSION_INITIAL = 10, |
| 463 | // This change includes: |
| 464 | // - For columns that can be unpacked with unpack_info, PK |
| 465 | // stores the unpack_info. |
| 466 | // - DECIMAL datatype is no longer stored in the row (because |
| 467 | // it can be decoded from its mem-comparable form) |
| 468 | // - VARCHAR-columns use endspace-padding. |
| 469 | PRIMARY_FORMAT_VERSION_UPDATE1 = 11, |
| 470 | // This change includes: |
| 471 | // - Binary encoded variable length fields have a new format that avoids |
| 472 | // an inefficient where data that was a multiple of 8 bytes in length |
| 473 | // had an extra 9 bytes of encoded data. |
| 474 | PRIMARY_FORMAT_VERSION_UPDATE2 = 12, |
| 475 | // This change includes support for TTL |
| 476 | // - This means that when TTL is specified for the table an 8-byte TTL |
| 477 | // field is prepended in front of each value. |
| 478 | PRIMARY_FORMAT_VERSION_TTL = 13, |
| 479 | PRIMARY_FORMAT_VERSION_LATEST = PRIMARY_FORMAT_VERSION_TTL, |
| 480 | |
| 481 | SECONDARY_FORMAT_VERSION_INITIAL = 10, |
| 482 | // This change the SK format to include unpack_info. |
| 483 | SECONDARY_FORMAT_VERSION_UPDATE1 = 11, |
| 484 | // This change includes: |
| 485 | // - Binary encoded variable length fields have a new format that avoids |
| 486 | // an inefficient where data that was a multiple of 8 bytes in length |
| 487 | // had an extra 9 bytes of encoded data. |
| 488 | SECONDARY_FORMAT_VERSION_UPDATE2 = 12, |
| 489 | // This change includes support for TTL |
| 490 | // - This means that when TTL is specified for the table an 8-byte TTL |
| 491 | // field is prepended in front of each value. |
| 492 | SECONDARY_FORMAT_VERSION_TTL = 13, |
| 493 | SECONDARY_FORMAT_VERSION_LATEST = SECONDARY_FORMAT_VERSION_TTL, |
| 494 | // This change includes support for covering SK lookups for varchars. A |
| 495 | // 2-byte bitmap is added after the tag-byte to unpack_info only for |
| 496 | // records which have covered varchar columns. Currently waiting before |
| 497 | // enabling in prod. |
| 498 | SECONDARY_FORMAT_VERSION_UPDATE3 = 65535, |
| 499 | }; |
| 500 | |
| 501 | void setup(const TABLE *const table, const Rdb_tbl_def *const tbl_def); |
| 502 | |
| 503 | static uint (const TABLE *const table_arg, |
| 504 | const Rdb_tbl_def *const tbl_def_arg, |
| 505 | uint64 *ttl_duration); |
| 506 | static uint (const TABLE *const table_arg, |
| 507 | const Rdb_tbl_def *const tbl_def_arg, |
| 508 | std::string *ttl_column, uint *ttl_field_offset, |
| 509 | bool skip_checks = false); |
| 510 | inline bool has_ttl() const { return m_ttl_duration > 0; } |
| 511 | |
| 512 | static bool has_index_flag(uint32 index_flags, enum INDEX_FLAG flag); |
| 513 | static uint32 calculate_index_flag_offset(uint32 index_flags, |
| 514 | enum INDEX_FLAG flag, |
| 515 | uint *const field_length = nullptr); |
| 516 | void write_index_flag_field(Rdb_string_writer *const buf, |
| 517 | const uchar *const val, |
| 518 | enum INDEX_FLAG flag) const; |
| 519 | |
| 520 | static const std::string |
| 521 | gen_qualifier_for_table(const char *const qualifier, |
| 522 | const std::string &partition_name = "" ); |
| 523 | static const std::string |
| 524 | gen_cf_name_qualifier_for_partition(const std::string &s); |
| 525 | static const std::string |
| 526 | gen_ttl_duration_qualifier_for_partition(const std::string &s); |
| 527 | static const std::string |
| 528 | gen_ttl_col_qualifier_for_partition(const std::string &s); |
| 529 | |
| 530 | static const std::string ( |
| 531 | const std::string &, const TABLE *const table_arg, |
| 532 | const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found, |
| 533 | const char *const qualifier); |
| 534 | |
| 535 | rocksdb::ColumnFamilyHandle *get_cf() const { return m_cf_handle; } |
| 536 | |
| 537 | /* Check if keypart #kp can be unpacked from index tuple */ |
| 538 | inline bool can_unpack(const uint &kp) const; |
| 539 | /* Check if keypart #kp needs unpack info */ |
| 540 | inline bool has_unpack_info(const uint &kp) const; |
| 541 | |
| 542 | /* Check if given table has a primary key */ |
| 543 | static bool table_has_hidden_pk(const TABLE *const table); |
| 544 | |
| 545 | void report_checksum_mismatch(const bool &is_key, const char *const data, |
| 546 | const size_t data_size) const; |
| 547 | |
| 548 | /* Check if index is at least pk_min if it is a PK, |
| 549 | or at least sk_min if SK.*/ |
| 550 | bool index_format_min_check(const int &pk_min, const int &sk_min) const; |
| 551 | |
| 552 | void pack_with_make_sort_key( |
| 553 | Rdb_field_packing *const fpi, Field *const field, |
| 554 | uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst, |
| 555 | Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__))) const; |
| 556 | |
| 557 | void pack_with_varchar_encoding( |
| 558 | Rdb_field_packing *const fpi, Field *const field, uchar *buf, uchar **dst, |
| 559 | Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__))) const; |
| 560 | |
| 561 | void |
| 562 | pack_with_varchar_space_pad(Rdb_field_packing *const fpi, Field *const field, |
| 563 | uchar *buf, uchar **dst, |
| 564 | Rdb_pack_field_context *const pack_ctx) const; |
| 565 | |
| 566 | int unpack_integer(Rdb_field_packing *const fpi, Field *const field, |
| 567 | uchar *const to, Rdb_string_reader *const reader, |
| 568 | Rdb_string_reader *const unp_reader |
| 569 | MY_ATTRIBUTE((__unused__))) const; |
| 570 | |
| 571 | int unpack_double(Rdb_field_packing *const fpi MY_ATTRIBUTE((__unused__)), |
| 572 | Field *const field MY_ATTRIBUTE((__unused__)), |
| 573 | uchar *const field_ptr, Rdb_string_reader *const reader, |
| 574 | Rdb_string_reader *const unp_reader |
| 575 | MY_ATTRIBUTE((__unused__))) const; |
| 576 | |
| 577 | int unpack_float(Rdb_field_packing *const fpi, |
| 578 | Field *const field MY_ATTRIBUTE((__unused__)), |
| 579 | uchar *const field_ptr, Rdb_string_reader *const reader, |
| 580 | Rdb_string_reader *const unp_reader |
| 581 | MY_ATTRIBUTE((__unused__))) const; |
| 582 | |
| 583 | int unpack_binary_str(Rdb_field_packing *const fpi, Field *const field, |
| 584 | uchar *const to, Rdb_string_reader *const reader, |
| 585 | Rdb_string_reader *const unp_reader |
| 586 | MY_ATTRIBUTE((__unused__))) const; |
| 587 | |
| 588 | int unpack_binary_or_utf8_varchar( |
| 589 | Rdb_field_packing *const fpi, Field *const field, uchar *dst, |
| 590 | Rdb_string_reader *const reader, |
| 591 | Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__))) const; |
| 592 | |
| 593 | int unpack_binary_or_utf8_varchar_space_pad( |
| 594 | Rdb_field_packing *const fpi, Field *const field, uchar *dst, |
| 595 | Rdb_string_reader *const reader, |
| 596 | Rdb_string_reader *const unp_reader) const; |
| 597 | |
| 598 | int unpack_newdate(Rdb_field_packing *const fpi, |
| 599 | Field *const field MY_ATTRIBUTE((__unused__)), |
| 600 | uchar *const field_ptr, Rdb_string_reader *const reader, |
| 601 | Rdb_string_reader *const unp_reader |
| 602 | MY_ATTRIBUTE((__unused__))) const; |
| 603 | |
| 604 | int unpack_utf8_str(Rdb_field_packing *const fpi, Field *const field, |
| 605 | uchar *dst, Rdb_string_reader *const reader, |
| 606 | Rdb_string_reader *const unp_reader |
| 607 | MY_ATTRIBUTE((__unused__))) const; |
| 608 | |
| 609 | int unpack_unknown_varchar(Rdb_field_packing *const fpi, Field *const field, |
| 610 | uchar *dst, Rdb_string_reader *const reader, |
| 611 | Rdb_string_reader *const unp_reader) const; |
| 612 | |
| 613 | int unpack_simple_varchar_space_pad( |
| 614 | Rdb_field_packing *const fpi, Field *const field, uchar *dst, |
| 615 | Rdb_string_reader *const reader, |
| 616 | Rdb_string_reader *const unp_reader) const; |
| 617 | |
| 618 | int unpack_simple(Rdb_field_packing *const fpi, |
| 619 | Field *const field MY_ATTRIBUTE((__unused__)), |
| 620 | uchar *const dst, Rdb_string_reader *const reader, |
| 621 | Rdb_string_reader *const unp_reader) const; |
| 622 | |
| 623 | int unpack_unknown(Rdb_field_packing *const fpi, Field *const field, |
| 624 | uchar *const dst, Rdb_string_reader *const reader, |
| 625 | Rdb_string_reader *const unp_reader) const; |
| 626 | |
| 627 | int unpack_floating_point(uchar *const dst, Rdb_string_reader *const reader, |
| 628 | const size_t &size, const int &exp_digit, |
| 629 | const uchar *const zero_pattern, |
| 630 | const uchar *const zero_val, |
| 631 | void (*swap_func)(uchar *, const uchar *)) const; |
| 632 | |
| 633 | void make_unpack_simple_varchar(const Rdb_collation_codec *const codec, |
| 634 | const Field *const field, |
| 635 | Rdb_pack_field_context *const pack_ctx) const; |
| 636 | |
| 637 | void make_unpack_simple(const Rdb_collation_codec *const codec, |
| 638 | const Field *const field, |
| 639 | Rdb_pack_field_context *const pack_ctx) const; |
| 640 | |
| 641 | void make_unpack_unknown( |
| 642 | const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)), |
| 643 | const Field *const field, Rdb_pack_field_context *const pack_ctx) const; |
| 644 | |
| 645 | void make_unpack_unknown_varchar( |
| 646 | const Rdb_collation_codec *const codec MY_ATTRIBUTE((__unused__)), |
| 647 | const Field *const field, Rdb_pack_field_context *const pack_ctx) const; |
| 648 | |
| 649 | void dummy_make_unpack_info( |
| 650 | const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)), |
| 651 | const Field *field MY_ATTRIBUTE((__unused__)), |
| 652 | Rdb_pack_field_context *pack_ctx MY_ATTRIBUTE((__unused__))) const; |
| 653 | |
| 654 | int skip_max_length(const Rdb_field_packing *const fpi, |
| 655 | const Field *const field MY_ATTRIBUTE((__unused__)), |
| 656 | Rdb_string_reader *const reader) const; |
| 657 | |
| 658 | int skip_variable_length( |
| 659 | const Rdb_field_packing *const fpi MY_ATTRIBUTE((__unused__)), |
| 660 | const Field *const field, Rdb_string_reader *const reader) const; |
| 661 | |
| 662 | int skip_variable_space_pad(const Rdb_field_packing *const fpi, |
| 663 | const Field *const field, |
| 664 | Rdb_string_reader *const reader) const; |
| 665 | |
| 666 | inline bool use_legacy_varbinary_format() const { |
| 667 | return !index_format_min_check(PRIMARY_FORMAT_VERSION_UPDATE2, |
| 668 | SECONDARY_FORMAT_VERSION_UPDATE2); |
| 669 | } |
| 670 | |
| 671 | static inline bool is_unpack_data_tag(char c) { |
| 672 | return c == RDB_UNPACK_DATA_TAG || c == RDB_UNPACK_COVERED_DATA_TAG; |
| 673 | } |
| 674 | |
| 675 | private: |
| 676 | #ifndef DBUG_OFF |
| 677 | inline bool is_storage_available(const int &offset, const int &needed) const { |
| 678 | const int storage_length = static_cast<int>(max_storage_fmt_length()); |
| 679 | return (storage_length - offset) >= needed; |
| 680 | } |
| 681 | #else |
| 682 | inline bool is_storage_available(const int &offset, const int &needed) const { |
| 683 | return 1; |
| 684 | } |
| 685 | #endif // DBUG_OFF |
| 686 | |
| 687 | /* Global number of this index (used as prefix in StorageFormat) */ |
| 688 | const uint32 m_index_number; |
| 689 | |
| 690 | uchar m_index_number_storage_form[INDEX_NUMBER_SIZE]; |
| 691 | |
| 692 | rocksdb::ColumnFamilyHandle *m_cf_handle; |
| 693 | |
| 694 | void pack_legacy_variable_format(const uchar *src, size_t src_len, |
| 695 | uchar **dst) const; |
| 696 | |
| 697 | void pack_variable_format(const uchar *src, size_t src_len, |
| 698 | uchar **dst) const; |
| 699 | |
| 700 | uint calc_unpack_legacy_variable_format(uchar flag, bool *done) const; |
| 701 | |
| 702 | uint calc_unpack_variable_format(uchar flag, bool *done) const; |
| 703 | |
| 704 | public: |
| 705 | uint16_t m_index_dict_version; |
| 706 | uchar m_index_type; |
| 707 | /* KV format version for the index id */ |
| 708 | uint16_t m_kv_format_version; |
| 709 | /* If true, the column family stores data in the reverse order */ |
| 710 | bool m_is_reverse_cf; |
| 711 | |
| 712 | /* If true, then column family is created per partition. */ |
| 713 | bool m_is_per_partition_cf; |
| 714 | |
| 715 | std::string m_name; |
| 716 | mutable Rdb_index_stats m_stats; |
| 717 | |
| 718 | /* |
| 719 | Bitmap containing information about whether TTL or other special fields |
| 720 | are enabled for the given index. |
| 721 | */ |
| 722 | uint32 m_index_flags_bitmap; |
| 723 | |
| 724 | /* |
| 725 | How much space in bytes the index flag fields occupy. |
| 726 | */ |
| 727 | uint32 m_total_index_flags_length; |
| 728 | |
| 729 | /* |
| 730 | Offset in the records where the 8-byte TTL is stored (UINT_MAX if no TTL) |
| 731 | */ |
| 732 | uint32 m_ttl_rec_offset; |
| 733 | |
| 734 | /* Default TTL duration */ |
| 735 | uint64 m_ttl_duration; |
| 736 | |
| 737 | /* TTL column (if defined by user, otherwise implicit TTL is used) */ |
| 738 | std::string m_ttl_column; |
| 739 | |
| 740 | private: |
| 741 | friend class Rdb_tbl_def; // for m_index_number above |
| 742 | |
| 743 | /* Number of key parts in the primary key*/ |
| 744 | uint m_pk_key_parts; |
| 745 | |
| 746 | /* |
| 747 | pk_part_no[X]=Y means that keypart #X of this key is key part #Y of the |
| 748 | primary key. Y==-1 means this column is not present in the primary key. |
| 749 | */ |
| 750 | uint *m_pk_part_no; |
| 751 | |
| 752 | /* Array of index-part descriptors. */ |
| 753 | Rdb_field_packing *m_pack_info; |
| 754 | |
| 755 | uint m_keyno; /* number of this index in the table */ |
| 756 | |
| 757 | /* |
| 758 | Number of key parts in the index (including "index extension"). This is how |
| 759 | many elements are in the m_pack_info array. |
| 760 | */ |
| 761 | uint m_key_parts; |
| 762 | |
| 763 | /* |
| 764 | If TTL column is part of the PK, offset of the column within pk. |
| 765 | Default is UINT_MAX to denote that TTL col is not part of PK. |
| 766 | */ |
| 767 | uint m_ttl_pk_key_part_offset; |
| 768 | |
| 769 | /* |
| 770 | Index of the TTL column in table->s->fields, if it exists. |
| 771 | Default is UINT_MAX to denote that it does not exist. |
| 772 | */ |
| 773 | uint m_ttl_field_offset; |
| 774 | |
| 775 | /* Prefix extractor for the column family of the key definiton */ |
| 776 | std::shared_ptr<const rocksdb::SliceTransform> ; |
| 777 | |
| 778 | /* Maximum length of the mem-comparable form. */ |
| 779 | uint m_maxlength; |
| 780 | |
| 781 | /* mutex to protect setup */ |
| 782 | mysql_mutex_t m_mutex; |
| 783 | }; |
| 784 | |
| 785 | // "Simple" collations (those specified in strings/ctype-simple.c) are simple |
| 786 | // because their strnxfrm function maps one byte to one byte. However, the |
| 787 | // mapping is not injective, so the inverse function will take in an extra |
| 788 | // index parameter containing information to disambiguate what the original |
| 789 | // character was. |
| 790 | // |
| 791 | // The m_enc* members are for encoding. Generally, we want encoding to be: |
| 792 | // src -> (dst, idx) |
| 793 | // |
| 794 | // Since strnxfrm already gives us dst, we just need m_enc_idx[src] to give us |
| 795 | // idx. |
| 796 | // |
| 797 | // For the inverse, we have: |
| 798 | // (dst, idx) -> src |
| 799 | // |
| 800 | // We have m_dec_idx[idx][dst] = src to get our original character back. |
| 801 | // |
| 802 | struct Rdb_collation_codec { |
| 803 | const my_core::CHARSET_INFO *m_cs; |
| 804 | // The first element unpacks VARCHAR(n), the second one - CHAR(n). |
| 805 | std::array<rdb_make_unpack_info_t, 2> m_make_unpack_info_func; |
| 806 | std::array<rdb_index_field_unpack_t, 2> m_unpack_func; |
| 807 | |
| 808 | std::array<uchar, 256> m_enc_idx; |
| 809 | std::array<uchar, 256> m_enc_size; |
| 810 | |
| 811 | std::array<uchar, 256> m_dec_size; |
| 812 | std::vector<std::array<uchar, 256>> m_dec_idx; |
| 813 | }; |
| 814 | |
| 815 | extern mysql_mutex_t rdb_collation_data_mutex; |
| 816 | extern mysql_mutex_t rdb_mem_cmp_space_mutex; |
| 817 | extern std::array<const Rdb_collation_codec *, MY_ALL_CHARSETS_SIZE> |
| 818 | rdb_collation_data; |
| 819 | |
| 820 | class Rdb_field_packing { |
| 821 | public: |
| 822 | Rdb_field_packing(const Rdb_field_packing &) = delete; |
| 823 | Rdb_field_packing &operator=(const Rdb_field_packing &) = delete; |
| 824 | Rdb_field_packing() = default; |
| 825 | |
| 826 | /* Length of mem-comparable image of the field, in bytes */ |
| 827 | int m_max_image_len; |
| 828 | |
| 829 | /* Length of image in the unpack data */ |
| 830 | int m_unpack_data_len; |
| 831 | int m_unpack_data_offset; |
| 832 | |
| 833 | bool m_maybe_null; /* TRUE <=> NULL-byte is stored */ |
| 834 | |
| 835 | /* |
| 836 | Valid only for VARCHAR fields. |
| 837 | */ |
| 838 | const CHARSET_INFO *m_varchar_charset; |
| 839 | |
| 840 | // (Valid when Variable Length Space Padded Encoding is used): |
| 841 | uint m_segment_size; // size of segment used |
| 842 | |
| 843 | // number of bytes used to store number of trimmed (or added) |
| 844 | // spaces in the upack_info |
| 845 | bool m_unpack_info_uses_two_bytes; |
| 846 | |
| 847 | /* |
| 848 | True implies that an index-only read is always possible for this field. |
| 849 | False means an index-only read may be possible depending on the record and |
| 850 | field type. |
| 851 | */ |
| 852 | bool m_covered; |
| 853 | |
| 854 | const std::vector<uchar> *space_xfrm; |
| 855 | size_t space_xfrm_len; |
| 856 | size_t space_mb_len; |
| 857 | |
| 858 | const Rdb_collation_codec *m_charset_codec; |
| 859 | |
| 860 | /* |
| 861 | @return TRUE: this field makes use of unpack_info. |
| 862 | */ |
| 863 | bool uses_unpack_info() const { return (m_make_unpack_info_func != nullptr); } |
| 864 | |
| 865 | /* TRUE means unpack_info stores the original field value */ |
| 866 | bool m_unpack_info_stores_value; |
| 867 | |
| 868 | rdb_index_field_pack_t m_pack_func; |
| 869 | rdb_make_unpack_info_t m_make_unpack_info_func; |
| 870 | |
| 871 | /* |
| 872 | This function takes |
| 873 | - mem-comparable form |
| 874 | - unpack_info data |
| 875 | and restores the original value. |
| 876 | */ |
| 877 | rdb_index_field_unpack_t m_unpack_func; |
| 878 | |
| 879 | /* |
| 880 | This function skips over mem-comparable form. |
| 881 | */ |
| 882 | rdb_index_field_skip_t m_skip_func; |
| 883 | |
| 884 | private: |
| 885 | /* |
| 886 | Location of the field in the table (key number and key part number). |
| 887 | |
| 888 | Note that this describes not the field, but rather a position of field in |
| 889 | the index. Consider an example: |
| 890 | |
| 891 | col1 VARCHAR (100), |
| 892 | INDEX idx1 (col1)), |
| 893 | INDEX idx2 (col1(10)), |
| 894 | |
| 895 | Here, idx2 has a special Field object that is set to describe a 10-char |
| 896 | prefix of col1. |
| 897 | |
| 898 | We must also store the keynr. It is needed for implicit "extended keys". |
| 899 | Every key in MyRocks needs to include PK columns. Generally, SQL layer |
| 900 | includes PK columns as part of its "Extended Keys" feature, but sometimes |
| 901 | it does not (known examples are unique secondary indexes and partitioned |
| 902 | tables). |
| 903 | In that case, MyRocks's index descriptor has invisible suffix of PK |
| 904 | columns (and the point is that these columns are parts of PK, not parts |
| 905 | of the current index). |
| 906 | */ |
| 907 | uint m_keynr; |
| 908 | uint m_key_part; |
| 909 | |
| 910 | public: |
| 911 | bool setup(const Rdb_key_def *const key_descr, const Field *const field, |
| 912 | const uint &keynr_arg, const uint &key_part_arg, |
| 913 | const uint16 &key_length); |
| 914 | Field *get_field_in_table(const TABLE *const tbl) const; |
| 915 | void fill_hidden_pk_val(uchar **dst, const longlong &hidden_pk_id) const; |
| 916 | }; |
| 917 | |
| 918 | /* |
| 919 | Descriptor telling how to decode/encode a field to on-disk record storage |
| 920 | format. Not all information is in the structure yet, but eventually we |
| 921 | want to have as much as possible there to avoid virtual calls. |
| 922 | |
| 923 | For encoding/decoding of index tuples, see Rdb_key_def. |
| 924 | */ |
| 925 | class Rdb_field_encoder { |
| 926 | public: |
| 927 | Rdb_field_encoder(const Rdb_field_encoder &) = delete; |
| 928 | Rdb_field_encoder &operator=(const Rdb_field_encoder &) = delete; |
| 929 | /* |
| 930 | STORE_NONE is set when a column can be decoded solely from their |
| 931 | mem-comparable form. |
| 932 | STORE_SOME is set when a column can be decoded from their mem-comparable |
| 933 | form plus unpack_info. |
| 934 | STORE_ALL is set when a column cannot be decoded, so its original value |
| 935 | must be stored in the PK records. |
| 936 | */ |
| 937 | enum STORAGE_TYPE { |
| 938 | STORE_NONE, |
| 939 | STORE_SOME, |
| 940 | STORE_ALL, |
| 941 | }; |
| 942 | STORAGE_TYPE m_storage_type; |
| 943 | |
| 944 | uint m_null_offset; |
| 945 | uint16 m_field_index; |
| 946 | |
| 947 | uchar m_null_mask; // 0 means the field cannot be null |
| 948 | |
| 949 | my_core::enum_field_types m_field_type; |
| 950 | |
| 951 | uint m_pack_length_in_rec; |
| 952 | |
| 953 | bool maybe_null() const { return m_null_mask != 0; } |
| 954 | |
| 955 | bool uses_variable_len_encoding() const { |
| 956 | return (m_field_type == MYSQL_TYPE_BLOB || |
| 957 | m_field_type == MYSQL_TYPE_VARCHAR); |
| 958 | } |
| 959 | }; |
| 960 | |
| 961 | inline Field *Rdb_key_def::get_table_field_for_part_no(TABLE *table, |
| 962 | uint part_no) const { |
| 963 | DBUG_ASSERT(part_no < get_key_parts()); |
| 964 | return m_pack_info[part_no].get_field_in_table(table); |
| 965 | } |
| 966 | |
| 967 | inline bool Rdb_key_def::can_unpack(const uint &kp) const { |
| 968 | DBUG_ASSERT(kp < m_key_parts); |
| 969 | return (m_pack_info[kp].m_unpack_func != nullptr); |
| 970 | } |
| 971 | |
| 972 | inline bool Rdb_key_def::has_unpack_info(const uint &kp) const { |
| 973 | DBUG_ASSERT(kp < m_key_parts); |
| 974 | return m_pack_info[kp].uses_unpack_info(); |
| 975 | } |
| 976 | |
| 977 | /* |
| 978 | A table definition. This is an entry in the mapping |
| 979 | |
| 980 | dbname.tablename -> {index_nr, index_nr, ... } |
| 981 | |
| 982 | There is only one Rdb_tbl_def object for a given table. |
| 983 | That's why we keep auto_increment value here, too. |
| 984 | */ |
| 985 | |
| 986 | class Rdb_tbl_def { |
| 987 | private: |
| 988 | void check_if_is_mysql_system_table(); |
| 989 | |
| 990 | /* Stores 'dbname.tablename' */ |
| 991 | std::string m_dbname_tablename; |
| 992 | |
| 993 | /* Store the db name, table name, and partition name */ |
| 994 | std::string m_dbname; |
| 995 | std::string m_tablename; |
| 996 | std::string m_partition; |
| 997 | |
| 998 | void set_name(const std::string &name); |
| 999 | |
| 1000 | public: |
| 1001 | Rdb_tbl_def(const Rdb_tbl_def &) = delete; |
| 1002 | Rdb_tbl_def &operator=(const Rdb_tbl_def &) = delete; |
| 1003 | |
| 1004 | explicit Rdb_tbl_def(const std::string &name) |
| 1005 | : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0) { |
| 1006 | set_name(name); |
| 1007 | } |
| 1008 | |
| 1009 | Rdb_tbl_def(const char *const name, const size_t &len) |
| 1010 | : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0) { |
| 1011 | set_name(std::string(name, len)); |
| 1012 | } |
| 1013 | |
| 1014 | explicit Rdb_tbl_def(const rocksdb::Slice &slice, const size_t &pos = 0) |
| 1015 | : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0) { |
| 1016 | set_name(std::string(slice.data() + pos, slice.size() - pos)); |
| 1017 | } |
| 1018 | |
| 1019 | ~Rdb_tbl_def(); |
| 1020 | |
| 1021 | /* Number of indexes */ |
| 1022 | uint m_key_count; |
| 1023 | |
| 1024 | /* Array of index descriptors */ |
| 1025 | std::shared_ptr<Rdb_key_def> *m_key_descr_arr; |
| 1026 | |
| 1027 | std::atomic<longlong> m_hidden_pk_val; |
| 1028 | std::atomic<ulonglong> m_auto_incr_val; |
| 1029 | |
| 1030 | /* Is this a system table */ |
| 1031 | bool m_is_mysql_system_table; |
| 1032 | |
| 1033 | bool put_dict(Rdb_dict_manager *const dict, rocksdb::WriteBatch *const batch, |
| 1034 | uchar *const key, const size_t &keylen); |
| 1035 | |
| 1036 | const std::string &full_tablename() const { return m_dbname_tablename; } |
| 1037 | const std::string &base_dbname() const { return m_dbname; } |
| 1038 | const std::string &base_tablename() const { return m_tablename; } |
| 1039 | const std::string &base_partition() const { return m_partition; } |
| 1040 | GL_INDEX_ID get_autoincr_gl_index_id(); |
| 1041 | }; |
| 1042 | |
| 1043 | /* |
| 1044 | A thread-safe sequential number generator. Its performance is not a concern |
| 1045 | hence it is ok to protect it by a mutex. |
| 1046 | */ |
| 1047 | |
| 1048 | class Rdb_seq_generator { |
| 1049 | uint m_next_number = 0; |
| 1050 | |
| 1051 | mysql_mutex_t m_mutex; |
| 1052 | |
| 1053 | public: |
| 1054 | Rdb_seq_generator(const Rdb_seq_generator &) = delete; |
| 1055 | Rdb_seq_generator &operator=(const Rdb_seq_generator &) = delete; |
| 1056 | Rdb_seq_generator() = default; |
| 1057 | |
| 1058 | void init(const uint &initial_number) { |
| 1059 | mysql_mutex_init(0, &m_mutex, MY_MUTEX_INIT_FAST); |
| 1060 | m_next_number = initial_number; |
| 1061 | } |
| 1062 | |
| 1063 | uint get_and_update_next_number(Rdb_dict_manager *const dict); |
| 1064 | |
| 1065 | void cleanup() { mysql_mutex_destroy(&m_mutex); } |
| 1066 | }; |
| 1067 | |
| 1068 | interface Rdb_tables_scanner { |
| 1069 | virtual int add_table(Rdb_tbl_def * tdef) = 0; |
| 1070 | virtual ~Rdb_tables_scanner() {} /* Keep the compiler happy */ |
| 1071 | }; |
| 1072 | |
| 1073 | /* |
| 1074 | This contains a mapping of |
| 1075 | |
| 1076 | dbname.table_name -> array{Rdb_key_def}. |
| 1077 | |
| 1078 | objects are shared among all threads. |
| 1079 | */ |
| 1080 | |
| 1081 | class Rdb_ddl_manager { |
| 1082 | Rdb_dict_manager *m_dict = nullptr; |
| 1083 | my_core::HASH m_ddl_hash; // Contains Rdb_tbl_def elements |
| 1084 | // Maps index id to <table_name, index number> |
| 1085 | std::map<GL_INDEX_ID, std::pair<std::string, uint>> m_index_num_to_keydef; |
| 1086 | |
| 1087 | // Maps index id to key definitons not yet committed to data dictionary. |
| 1088 | // This is mainly used to store key definitions during ALTER TABLE. |
| 1089 | std::map<GL_INDEX_ID, std::shared_ptr<Rdb_key_def>> |
| 1090 | m_index_num_to_uncommitted_keydef; |
| 1091 | mysql_rwlock_t m_rwlock; |
| 1092 | |
| 1093 | Rdb_seq_generator m_sequence; |
| 1094 | // A queue of table stats to write into data dictionary |
| 1095 | // It is produced by event listener (ie compaction and flush threads) |
| 1096 | // and consumed by the rocksdb background thread |
| 1097 | std::map<GL_INDEX_ID, Rdb_index_stats> m_stats2store; |
| 1098 | |
| 1099 | const std::shared_ptr<Rdb_key_def> &find(GL_INDEX_ID gl_index_id); |
| 1100 | |
| 1101 | public: |
| 1102 | Rdb_ddl_manager(const Rdb_ddl_manager &) = delete; |
| 1103 | Rdb_ddl_manager &operator=(const Rdb_ddl_manager &) = delete; |
| 1104 | Rdb_ddl_manager() {} |
| 1105 | |
| 1106 | /* Load the data dictionary from on-disk storage */ |
| 1107 | bool init(Rdb_dict_manager *const dict_arg, Rdb_cf_manager *const cf_manager, |
| 1108 | const uint32_t &validate_tables); |
| 1109 | |
| 1110 | void cleanup(); |
| 1111 | |
| 1112 | Rdb_tbl_def *find(const std::string &table_name, const bool &lock = true); |
| 1113 | std::shared_ptr<const Rdb_key_def> safe_find(GL_INDEX_ID gl_index_id); |
| 1114 | void set_stats(const std::unordered_map<GL_INDEX_ID, Rdb_index_stats> &stats); |
| 1115 | void adjust_stats(const std::vector<Rdb_index_stats> &new_data, |
| 1116 | const std::vector<Rdb_index_stats> &deleted_data = |
| 1117 | std::vector<Rdb_index_stats>()); |
| 1118 | void persist_stats(const bool &sync = false); |
| 1119 | |
| 1120 | /* Modify the mapping and write it to on-disk storage */ |
| 1121 | int put_and_write(Rdb_tbl_def *const key_descr, |
| 1122 | rocksdb::WriteBatch *const batch); |
| 1123 | void remove(Rdb_tbl_def *const rec, rocksdb::WriteBatch *const batch, |
| 1124 | const bool &lock = true); |
| 1125 | bool rename(const std::string &from, const std::string &to, |
| 1126 | rocksdb::WriteBatch *const batch); |
| 1127 | |
| 1128 | uint get_and_update_next_number(Rdb_dict_manager *const dict) { |
| 1129 | return m_sequence.get_and_update_next_number(dict); |
| 1130 | } |
| 1131 | |
| 1132 | const std::string safe_get_table_name(const GL_INDEX_ID &gl_index_id); |
| 1133 | |
| 1134 | /* Walk the data dictionary */ |
| 1135 | int scan_for_tables(Rdb_tables_scanner *tables_scanner); |
| 1136 | |
| 1137 | void erase_index_num(const GL_INDEX_ID &gl_index_id); |
| 1138 | void add_uncommitted_keydefs( |
| 1139 | const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes); |
| 1140 | void remove_uncommitted_keydefs( |
| 1141 | const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes); |
| 1142 | |
| 1143 | private: |
| 1144 | /* Put the data into in-memory table (only) */ |
| 1145 | int put(Rdb_tbl_def *const key_descr, const bool &lock = true); |
| 1146 | |
| 1147 | /* Helper functions to be passed to my_core::HASH object */ |
| 1148 | static const uchar *get_hash_key(Rdb_tbl_def *const rec, size_t *const length, |
| 1149 | my_bool not_used MY_ATTRIBUTE((unused))); |
| 1150 | static void free_hash_elem(void *const data); |
| 1151 | |
| 1152 | bool validate_schemas(); |
| 1153 | |
| 1154 | bool validate_auto_incr(); |
| 1155 | }; |
| 1156 | |
| 1157 | /* |
| 1158 | Writing binlog information into RocksDB at commit(), |
| 1159 | and retrieving binlog information at crash recovery. |
  commit() and recovery are always executed by at most a single client
  at a time, so concurrency control is not needed.
| 1162 | |
| 1163 | Binlog info is stored in RocksDB as the following. |
| 1164 | key: BINLOG_INFO_INDEX_NUMBER |
| 1165 | value: packed single row: |
| 1166 | binlog_name_length (2 byte form) |
| 1167 | binlog_name |
| 1168 | binlog_position (4 byte form) |
| 1169 | binlog_gtid_length (2 byte form) |
| 1170 | binlog_gtid |
| 1171 | */ |
| 1172 | class Rdb_binlog_manager { |
| 1173 | public: |
| 1174 | Rdb_binlog_manager(const Rdb_binlog_manager &) = delete; |
| 1175 | Rdb_binlog_manager &operator=(const Rdb_binlog_manager &) = delete; |
| 1176 | Rdb_binlog_manager() = default; |
| 1177 | |
| 1178 | bool init(Rdb_dict_manager *const dict); |
| 1179 | void cleanup(); |
| 1180 | void update(const char *const binlog_name, const my_off_t binlog_pos, |
| 1181 | rocksdb::WriteBatchBase *const batch); |
| 1182 | bool read(char *const binlog_name, my_off_t *const binlog_pos, |
| 1183 | char *const binlog_gtid) const; |
| 1184 | void update_slave_gtid_info(const uint &id, const char *const db, |
| 1185 | const char *const gtid, |
| 1186 | rocksdb::WriteBatchBase *const write_batch); |
| 1187 | |
| 1188 | private: |
| 1189 | Rdb_dict_manager *m_dict = nullptr; |
| 1190 | uchar m_key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {0}; |
| 1191 | rocksdb::Slice m_key_slice; |
| 1192 | |
| 1193 | rocksdb::Slice pack_value(uchar *const buf, const char *const binlog_name, |
| 1194 | const my_off_t &binlog_pos, |
| 1195 | const char *const binlog_gtid) const; |
| 1196 | bool unpack_value(const uchar *const value, size_t value_size, |
| 1197 | char *const binlog_name, |
| 1198 | my_off_t *const binlog_pos, char *const binlog_gtid) const; |
| 1199 | |
| 1200 | std::atomic<Rdb_tbl_def *> m_slave_gtid_info_tbl; |
| 1201 | }; |
| 1202 | |
| 1203 | /* |
| 1204 | Rdb_dict_manager manages how MySQL on RocksDB (MyRocks) stores its |
| 1205 | internal data dictionary. |
  MyRocks stores the data dictionary in a dedicated system column family
  named __system__. The system column family is used only internally by
  MyRocks, and is not used by applications.
| 1209 | |
| 1210 | Currently MyRocks has the following data dictionary data models. |
| 1211 | |
| 1212 | 1. Table Name => internal index id mappings |
| 1213 | key: Rdb_key_def::DDL_ENTRY_INDEX_START_NUMBER(0x1) + dbname.tablename |
| 1214 | value: version, {cf_id, index_id}*n_indexes_of_the_table |
| 1215 | version is 2 bytes. cf_id and index_id are 4 bytes. |
| 1216 | |
| 1217 | 2. internal cf_id, index id => index information |
| 1218 | key: Rdb_key_def::INDEX_INFO(0x2) + cf_id + index_id |
| 1219 | value: version, index_type, kv_format_version, index_flags, ttl_duration |
| 1220 | index_type is 1 byte, version and kv_format_version are 2 bytes. |
| 1221 | index_flags is 4 bytes. |
| 1222 | ttl_duration is 8 bytes. |
| 1223 | |
| 1224 | 3. CF id => CF flags |
| 1225 | key: Rdb_key_def::CF_DEFINITION(0x3) + cf_id |
| 1226 | value: version, {is_reverse_cf, is_auto_cf (deprecated), is_per_partition_cf} |
| 1227 | cf_flags is 4 bytes in total. |
| 1228 | |
| 1229 | 4. Binlog entry (updated at commit) |
| 1230 | key: Rdb_key_def::BINLOG_INFO_INDEX_NUMBER (0x4) |
| 1231 | value: version, {binlog_name,binlog_pos,binlog_gtid} |
| 1232 | |
| 1233 | 5. Ongoing drop index entry |
| 1234 | key: Rdb_key_def::DDL_DROP_INDEX_ONGOING(0x5) + cf_id + index_id |
| 1235 | value: version |
| 1236 | |
| 1237 | 6. index stats |
| 1238 | key: Rdb_key_def::INDEX_STATISTICS(0x6) + cf_id + index_id |
| 1239 | value: version, {materialized PropertiesCollector::IndexStats} |
| 1240 | |
| 1241 | 7. maximum index id |
| 1242 | key: Rdb_key_def::MAX_INDEX_ID(0x7) |
| 1243 | value: index_id |
| 1244 | index_id is 4 bytes |
| 1245 | |
| 1246 | 8. Ongoing create index entry |
| 1247 | key: Rdb_key_def::DDL_CREATE_INDEX_ONGOING(0x8) + cf_id + index_id |
| 1248 | value: version |
| 1249 | |
| 1250 | 9. auto_increment values |
| 1251 | key: Rdb_key_def::AUTO_INC(0x9) + cf_id + index_id |
| 1252 | value: version, {max auto_increment so far} |
| 1253 | max auto_increment is 8 bytes |
| 1254 | |
| 1255 | Data dictionary operations are atomic inside RocksDB. For example, |
| 1256 | when creating a table with two indexes, it is necessary to call Put |
| 1257 | three times. They have to be atomic. Rdb_dict_manager has a wrapper function |
| 1258 | begin() and commit() to make it easier to do atomic operations. |
| 1259 | |
| 1260 | */ |
| 1261 | class Rdb_dict_manager { |
| 1262 | private: |
| 1263 | mysql_mutex_t m_mutex; |
| 1264 | rocksdb::DB *m_db = nullptr; |
| 1265 | rocksdb::ColumnFamilyHandle *m_system_cfh = nullptr; |
| 1266 | /* Utility to put INDEX_INFO and CF_DEFINITION */ |
| 1267 | |
| 1268 | uchar m_key_buf_max_index_id[Rdb_key_def::INDEX_NUMBER_SIZE] = {0}; |
| 1269 | rocksdb::Slice m_key_slice_max_index_id; |
| 1270 | |
| 1271 | static void dump_index_id(uchar *const netbuf, |
| 1272 | Rdb_key_def::DATA_DICT_TYPE dict_type, |
| 1273 | const GL_INDEX_ID &gl_index_id); |
| 1274 | void delete_with_prefix(rocksdb::WriteBatch *const batch, |
| 1275 | Rdb_key_def::DATA_DICT_TYPE dict_type, |
| 1276 | const GL_INDEX_ID &gl_index_id) const; |
| 1277 | /* Functions for fast DROP TABLE/INDEX */ |
| 1278 | void resume_drop_indexes() const; |
| 1279 | void log_start_drop_table(const std::shared_ptr<Rdb_key_def> *const key_descr, |
| 1280 | const uint32 &n_keys, |
| 1281 | const char *const log_action) const; |
| 1282 | void log_start_drop_index(GL_INDEX_ID gl_index_id, |
| 1283 | const char *log_action) const; |
| 1284 | |
| 1285 | public: |
| 1286 | Rdb_dict_manager(const Rdb_dict_manager &) = delete; |
| 1287 | Rdb_dict_manager &operator=(const Rdb_dict_manager &) = delete; |
| 1288 | Rdb_dict_manager() = default; |
| 1289 | |
| 1290 | bool init(rocksdb::DB *const rdb_dict, Rdb_cf_manager *const cf_manager); |
| 1291 | |
| 1292 | inline void cleanup() { mysql_mutex_destroy(&m_mutex); } |
| 1293 | |
| 1294 | inline void lock() { RDB_MUTEX_LOCK_CHECK(m_mutex); } |
| 1295 | |
| 1296 | inline void unlock() { RDB_MUTEX_UNLOCK_CHECK(m_mutex); } |
| 1297 | |
| 1298 | inline rocksdb::ColumnFamilyHandle *get_system_cf() const { |
| 1299 | return m_system_cfh; |
| 1300 | } |
| 1301 | |
| 1302 | /* Raw RocksDB operations */ |
| 1303 | std::unique_ptr<rocksdb::WriteBatch> begin() const; |
| 1304 | int commit(rocksdb::WriteBatch *const batch, const bool &sync = true) const; |
| 1305 | rocksdb::Status get_value(const rocksdb::Slice &key, |
| 1306 | std::string *const value) const; |
| 1307 | void put_key(rocksdb::WriteBatchBase *const batch, const rocksdb::Slice &key, |
| 1308 | const rocksdb::Slice &value) const; |
| 1309 | void delete_key(rocksdb::WriteBatchBase *batch, |
| 1310 | const rocksdb::Slice &key) const; |
| 1311 | rocksdb::Iterator *new_iterator() const; |
| 1312 | |
| 1313 | /* Internal Index id => CF */ |
| 1314 | void |
| 1315 | add_or_update_index_cf_mapping(rocksdb::WriteBatch *batch, |
| 1316 | struct Rdb_index_info *const index_info) const; |
| 1317 | void delete_index_info(rocksdb::WriteBatch *batch, |
| 1318 | const GL_INDEX_ID &index_id) const; |
| 1319 | bool get_index_info(const GL_INDEX_ID &gl_index_id, |
| 1320 | struct Rdb_index_info *const index_info) const; |
| 1321 | |
| 1322 | /* CF id => CF flags */ |
| 1323 | void add_cf_flags(rocksdb::WriteBatch *const batch, const uint &cf_id, |
| 1324 | const uint &cf_flags) const; |
| 1325 | bool get_cf_flags(const uint &cf_id, uint *const cf_flags) const; |
| 1326 | |
| 1327 | /* Functions for fast CREATE/DROP TABLE/INDEX */ |
| 1328 | void |
| 1329 | get_ongoing_index_operation(std::unordered_set<GL_INDEX_ID> *gl_index_ids, |
| 1330 | Rdb_key_def::DATA_DICT_TYPE dd_type) const; |
| 1331 | bool is_index_operation_ongoing(const GL_INDEX_ID &gl_index_id, |
| 1332 | Rdb_key_def::DATA_DICT_TYPE dd_type) const; |
| 1333 | void start_ongoing_index_operation(rocksdb::WriteBatch *batch, |
| 1334 | const GL_INDEX_ID &gl_index_id, |
| 1335 | Rdb_key_def::DATA_DICT_TYPE dd_type) const; |
| 1336 | void end_ongoing_index_operation(rocksdb::WriteBatch *const batch, |
| 1337 | const GL_INDEX_ID &gl_index_id, |
| 1338 | Rdb_key_def::DATA_DICT_TYPE dd_type) const; |
| 1339 | bool is_drop_index_empty() const; |
| 1340 | void add_drop_table(std::shared_ptr<Rdb_key_def> *const key_descr, |
| 1341 | const uint32 &n_keys, |
| 1342 | rocksdb::WriteBatch *const batch) const; |
| 1343 | void add_drop_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids, |
| 1344 | rocksdb::WriteBatch *const batch) const; |
| 1345 | void add_create_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids, |
| 1346 | rocksdb::WriteBatch *const batch) const; |
| 1347 | void |
| 1348 | finish_indexes_operation(const std::unordered_set<GL_INDEX_ID> &gl_index_ids, |
| 1349 | Rdb_key_def::DATA_DICT_TYPE dd_type) const; |
| 1350 | void rollback_ongoing_index_creation() const; |
| 1351 | |
| 1352 | inline void get_ongoing_drop_indexes( |
| 1353 | std::unordered_set<GL_INDEX_ID> *gl_index_ids) const { |
| 1354 | get_ongoing_index_operation(gl_index_ids, |
| 1355 | Rdb_key_def::DDL_DROP_INDEX_ONGOING); |
| 1356 | } |
| 1357 | inline void get_ongoing_create_indexes( |
| 1358 | std::unordered_set<GL_INDEX_ID> *gl_index_ids) const { |
| 1359 | get_ongoing_index_operation(gl_index_ids, |
| 1360 | Rdb_key_def::DDL_CREATE_INDEX_ONGOING); |
| 1361 | } |
| 1362 | inline void start_drop_index(rocksdb::WriteBatch *wb, |
| 1363 | const GL_INDEX_ID &gl_index_id) const { |
| 1364 | start_ongoing_index_operation(wb, gl_index_id, |
| 1365 | Rdb_key_def::DDL_DROP_INDEX_ONGOING); |
| 1366 | } |
| 1367 | inline void start_create_index(rocksdb::WriteBatch *wb, |
| 1368 | const GL_INDEX_ID &gl_index_id) const { |
| 1369 | start_ongoing_index_operation(wb, gl_index_id, |
| 1370 | Rdb_key_def::DDL_CREATE_INDEX_ONGOING); |
| 1371 | } |
| 1372 | inline void finish_drop_indexes( |
| 1373 | const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const { |
| 1374 | finish_indexes_operation(gl_index_ids, Rdb_key_def::DDL_DROP_INDEX_ONGOING); |
| 1375 | } |
| 1376 | inline void finish_create_indexes( |
| 1377 | const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const { |
| 1378 | finish_indexes_operation(gl_index_ids, |
| 1379 | Rdb_key_def::DDL_CREATE_INDEX_ONGOING); |
| 1380 | } |
| 1381 | inline bool is_drop_index_ongoing(const GL_INDEX_ID &gl_index_id) const { |
| 1382 | return is_index_operation_ongoing(gl_index_id, |
| 1383 | Rdb_key_def::DDL_DROP_INDEX_ONGOING); |
| 1384 | } |
| 1385 | inline bool is_create_index_ongoing(const GL_INDEX_ID &gl_index_id) const { |
| 1386 | return is_index_operation_ongoing(gl_index_id, |
| 1387 | Rdb_key_def::DDL_CREATE_INDEX_ONGOING); |
| 1388 | } |
| 1389 | |
| 1390 | bool get_max_index_id(uint32_t *const index_id) const; |
| 1391 | bool update_max_index_id(rocksdb::WriteBatch *const batch, |
| 1392 | const uint32_t &index_id) const; |
| 1393 | void add_stats(rocksdb::WriteBatch *const batch, |
| 1394 | const std::vector<Rdb_index_stats> &stats) const; |
| 1395 | Rdb_index_stats get_stats(GL_INDEX_ID gl_index_id) const; |
| 1396 | |
| 1397 | rocksdb::Status put_auto_incr_val(rocksdb::WriteBatchBase *batch, |
| 1398 | const GL_INDEX_ID &gl_index_id, |
| 1399 | ulonglong val, |
| 1400 | bool overwrite = false) const; |
| 1401 | bool get_auto_incr_val(const GL_INDEX_ID &gl_index_id, |
| 1402 | ulonglong *new_val) const; |
| 1403 | }; |
| 1404 | |
| 1405 | struct Rdb_index_info { |
| 1406 | GL_INDEX_ID m_gl_index_id; |
| 1407 | uint16_t m_index_dict_version = 0; |
| 1408 | uchar m_index_type = 0; |
| 1409 | uint16_t m_kv_version = 0; |
| 1410 | uint32 m_index_flags = 0; |
| 1411 | uint64 m_ttl_duration = 0; |
| 1412 | }; |
| 1413 | |
| 1414 | /* |
| 1415 | @brief |
| 1416 | Merge Operator for the auto_increment value in the system_cf |
| 1417 | |
| 1418 | @detail |
| 1419 | This class implements the rocksdb Merge Operator for auto_increment values |
| 1420 | that are stored to the data dictionary every transaction. |
| 1421 | |
| 1422 | The actual Merge function is triggered on compaction, memtable flushes, or |
| 1423 | when get() is called on the same key. |
| 1424 | |
| 1425 | */ |
| 1426 | class Rdb_system_merge_op : public rocksdb::AssociativeMergeOperator { |
| 1427 | public: |
| 1428 | /* |
| 1429 | Updates the new value associated with a key to be the maximum of the |
| 1430 | passed in value and the existing value. |
| 1431 | |
| 1432 | @param[IN] key |
| 1433 | @param[IN] existing_value existing value for a key; nullptr if nonexistent |
| 1434 | key |
| 1435 | @param[IN] value |
| 1436 | @param[OUT] new_value new value after Merge |
| 1437 | @param[IN] logger |
| 1438 | */ |
| 1439 | bool Merge(const rocksdb::Slice &key, const rocksdb::Slice *existing_value, |
| 1440 | const rocksdb::Slice &value, std::string *new_value, |
| 1441 | rocksdb::Logger *logger) const override { |
| 1442 | DBUG_ASSERT(new_value != nullptr); |
| 1443 | |
| 1444 | if (key.size() != Rdb_key_def::INDEX_NUMBER_SIZE * 3 || |
| 1445 | GetKeyType(key) != Rdb_key_def::AUTO_INC || |
| 1446 | value.size() != |
| 1447 | RDB_SIZEOF_AUTO_INCREMENT_VERSION + ROCKSDB_SIZEOF_AUTOINC_VALUE || |
| 1448 | GetVersion(value) > Rdb_key_def::AUTO_INCREMENT_VERSION) { |
| 1449 | abort(); |
| 1450 | } |
| 1451 | |
| 1452 | uint64_t merged_value = Deserialize(value); |
| 1453 | |
| 1454 | if (existing_value != nullptr) { |
| 1455 | if (existing_value->size() != RDB_SIZEOF_AUTO_INCREMENT_VERSION + |
| 1456 | ROCKSDB_SIZEOF_AUTOINC_VALUE || |
| 1457 | GetVersion(*existing_value) > Rdb_key_def::AUTO_INCREMENT_VERSION) { |
| 1458 | abort(); |
| 1459 | } |
| 1460 | |
| 1461 | merged_value = std::max(merged_value, Deserialize(*existing_value)); |
| 1462 | } |
| 1463 | Serialize(merged_value, new_value); |
| 1464 | return true; |
| 1465 | } |
| 1466 | |
| 1467 | virtual const char *Name() const override { return "Rdb_system_merge_op" ; } |
| 1468 | |
| 1469 | private: |
| 1470 | /* |
| 1471 | Serializes the integer data to the new_value buffer or the target buffer |
| 1472 | the merge operator will update to |
| 1473 | */ |
| 1474 | void Serialize(const uint64_t data, std::string *new_value) const { |
| 1475 | uchar value_buf[RDB_SIZEOF_AUTO_INCREMENT_VERSION + |
| 1476 | ROCKSDB_SIZEOF_AUTOINC_VALUE] = {0}; |
| 1477 | uchar *ptr = value_buf; |
| 1478 | /* fill in the auto increment version */ |
| 1479 | rdb_netbuf_store_uint16(ptr, Rdb_key_def::AUTO_INCREMENT_VERSION); |
| 1480 | ptr += RDB_SIZEOF_AUTO_INCREMENT_VERSION; |
| 1481 | /* fill in the auto increment value */ |
| 1482 | rdb_netbuf_store_uint64(ptr, data); |
| 1483 | ptr += ROCKSDB_SIZEOF_AUTOINC_VALUE; |
| 1484 | new_value->assign(reinterpret_cast<char *>(value_buf), ptr - value_buf); |
| 1485 | } |
| 1486 | |
| 1487 | /* |
| 1488 | Gets the value of auto_increment type in the data dictionary from the |
| 1489 | value slice |
| 1490 | |
| 1491 | @Note Only to be used on data dictionary keys for the auto_increment type |
| 1492 | */ |
| 1493 | uint64_t Deserialize(const rocksdb::Slice &s) const { |
| 1494 | return rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(s.data()) + |
| 1495 | RDB_SIZEOF_AUTO_INCREMENT_VERSION); |
| 1496 | } |
| 1497 | |
| 1498 | /* |
| 1499 | Gets the type of the key of the key in the data dictionary. |
| 1500 | |
| 1501 | @Note Only to be used on data dictionary keys for the auto_increment type |
| 1502 | */ |
| 1503 | uint16_t GetKeyType(const rocksdb::Slice &s) const { |
| 1504 | return rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(s.data())); |
| 1505 | } |
| 1506 | |
| 1507 | /* |
| 1508 | Gets the version of the auto_increment value in the data dictionary. |
| 1509 | |
| 1510 | @Note Only to be used on data dictionary value for the auto_increment type |
| 1511 | */ |
| 1512 | uint16_t GetVersion(const rocksdb::Slice &s) const { |
| 1513 | return rdb_netbuf_to_uint16(reinterpret_cast<const uchar *>(s.data())); |
| 1514 | } |
| 1515 | }; |
| 1516 | |
| 1517 | bool rdb_is_collation_supported(const my_core::CHARSET_INFO *const cs); |
| 1518 | |
| 1519 | } // namespace myrocks |
| 1520 | |