1/*
2 Copyright (c) 2012,2013 Monty Program Ab
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
16#pragma once
17
18/* C++ standard header files */
19#include <cstdlib>
20#include <algorithm>
21#include <atomic>
22#include <map>
23#include <mutex>
24#include <string>
25#include <unordered_set>
26#include <utility>
27#include <vector>
28#include <array>
29
30/* C standard header files */
31#ifndef _WIN32
32#include <arpa/inet.h>
33#endif
34
35/* MyRocks header files */
36#include "./ha_rocksdb.h"
37#include "./properties_collector.h"
38#include "./rdb_buff.h"
39#include "./rdb_utils.h"
40
41namespace myrocks {
42
43class Rdb_dict_manager;
44class Rdb_key_def;
45class Rdb_field_packing;
46class Rdb_cf_manager;
47class Rdb_ddl_manager;
48
/* Length used for GTID buffers (presumably in bytes; users are not visible
   in this header -- confirm at the call sites). */
constexpr uint32_t GTID_BUF_LEN = 60;
50
51/*
52 @brief
53 Field packing context.
54 The idea is to ensure that a call to rdb_index_field_pack_t function
55 is followed by a call to rdb_make_unpack_info_t.
56
57 @detail
58 For some datatypes, unpack_info is produced as a side effect of
59 rdb_index_field_pack_t function call.
60 For other datatypes, packing is just calling make_sort_key(), while
61 rdb_make_unpack_info_t is a custom function.
62 In order to accommodate both cases, we require both calls to be made and
63 unpack_info is passed as context data between the two.
64*/
65class Rdb_pack_field_context {
66public:
67 Rdb_pack_field_context(const Rdb_pack_field_context &) = delete;
68 Rdb_pack_field_context &operator=(const Rdb_pack_field_context &) = delete;
69
70 explicit Rdb_pack_field_context(Rdb_string_writer *const writer_arg)
71 : writer(writer_arg) {}
72
73 // NULL means we're not producing unpack_info.
74 Rdb_string_writer *writer;
75};
76
77struct Rdb_collation_codec;
78struct Rdb_index_info;
79
80/*
81 C-style "virtual table" allowing different handling of packing logic based
82 on the field type. See Rdb_field_packing::setup() implementation.
83 */
84using rdb_make_unpack_info_t =
85 void (Rdb_key_def::*)(const Rdb_collation_codec *codec, const Field *field,
86 Rdb_pack_field_context *pack_ctx) const;
87using rdb_index_field_unpack_t = int (Rdb_key_def::*)(
88 Rdb_field_packing *fpi, Field *field, uchar *field_ptr,
89 Rdb_string_reader *reader, Rdb_string_reader *unpack_reader) const;
90using rdb_index_field_skip_t =
91 int (Rdb_key_def::*)(const Rdb_field_packing *fpi, const Field *field,
92 Rdb_string_reader *reader) const;
93using rdb_index_field_pack_t =
94 void (Rdb_key_def::*)(Rdb_field_packing *fpi, Field *field, uchar *buf,
95 uchar **dst, Rdb_pack_field_context *pack_ctx) const;
96
97const uint RDB_INVALID_KEY_LEN = uint(-1);
98
/* Number of bytes a single checksum takes up inside a stored record. */
constexpr size_t RDB_CHECKSUM_SIZE = sizeof(uint32_t);

/*
  Total number of bytes the checksum data takes up in a record:
  one tag byte followed by two checksums.
*/
constexpr size_t RDB_CHECKSUM_CHUNK_SIZE = 1 + 2 * RDB_CHECKSUM_SIZE;

/*
  Tag byte that introduces the checksum data; it is followed by two CRC32
  checksums.
*/
constexpr char RDB_CHECKSUM_DATA_TAG = 0x01;

/*
  Unpack data has variable length. Its header consists of one tag byte and a
  two-byte length field; the stored length counts the header too.
*/
constexpr char RDB_UNPACK_DATA_TAG = 0x02;
constexpr size_t RDB_UNPACK_DATA_LEN_SIZE = sizeof(uint16_t);
constexpr size_t RDB_UNPACK_HEADER_SIZE =
    sizeof(RDB_UNPACK_DATA_TAG) + RDB_UNPACK_DATA_LEN_SIZE;

/*
  Covered-bitmap variant of the unpack header: one tag byte, a two-byte length
  field and a two-byte covered bitmap. The stored length counts the header.
*/
constexpr char RDB_UNPACK_COVERED_DATA_TAG = 0x03;
constexpr size_t RDB_UNPACK_COVERED_DATA_LEN_SIZE = sizeof(uint16_t);
constexpr size_t RDB_COVERED_BITMAP_SIZE = sizeof(uint16_t);
constexpr size_t RDB_UNPACK_COVERED_HEADER_SIZE =
    sizeof(RDB_UNPACK_COVERED_DATA_TAG) + RDB_UNPACK_COVERED_DATA_LEN_SIZE +
    RDB_COVERED_BITMAP_SIZE;
133
134/*
135 Data dictionary index info field sizes.
136*/
137const size_t RDB_SIZEOF_INDEX_INFO_VERSION = sizeof(uint16);
138const size_t RDB_SIZEOF_INDEX_TYPE = sizeof(uchar);
139const size_t RDB_SIZEOF_KV_VERSION = sizeof(uint16);
140const size_t RDB_SIZEOF_INDEX_FLAGS = sizeof(uint32);
141const size_t RDB_SIZEOF_AUTO_INCREMENT_VERSION = sizeof(uint16);
142
// Result codes returned by functions of type rdb_index_field_unpack_t.
enum {
  UNPACK_SUCCESS = 0,  // the field was unpacked
  UNPACK_FAILURE = 1,  // the field could not be unpacked
};
148
149/*
150 An object of this class represents information about an index in an SQL
151 table. It provides services to encode and decode index tuples.
152
153 Note: a table (as in, on-disk table) has a single Rdb_key_def object which
154 is shared across multiple TABLE* objects and may be used simultaneously from
155 different threads.
156
157 There are several data encodings:
158
159 === SQL LAYER ===
160 SQL layer uses two encodings:
161
162 - "Table->record format". This is the format that is used for the data in
163 the record buffers, table->record[i]
164
165 - KeyTupleFormat (see opt_range.cc) - this is used in parameters to index
166 lookup functions, like handler::index_read_map().
167
168 === Inside RocksDB ===
169 Primary Key is stored as a mapping:
170
171 index_tuple -> StoredRecord
172
173 StoredRecord is in Table->record format, except for blobs, which are stored
174 in-place. See ha_rocksdb::convert_record_to_storage_format for details.
175
176 Secondary indexes are stored as one of two variants:
177
178 index_tuple -> unpack_info
179 index_tuple -> empty_string
180
181 index_tuple here is the form of key that can be compared with memcmp(), aka
182 "mem-comparable form".
183
  unpack_info is extra data that makes it possible to restore the original
  value from its mem-comparable form. It is present only if the index supports
  index-only reads.
187*/
188
189class Rdb_key_def {
190public:
191 /* Convert a key from KeyTupleFormat to mem-comparable form */
192 uint pack_index_tuple(TABLE *const tbl, uchar *const pack_buffer,
193 uchar *const packed_tuple, const uchar *const key_tuple,
194 const key_part_map &keypart_map) const;
195
196 uchar *pack_field(Field *const field, Rdb_field_packing *pack_info,
197 uchar *tuple, uchar *const packed_tuple,
198 uchar *const pack_buffer,
199 Rdb_string_writer *const unpack_info,
200 uint *const n_null_fields) const;
201 /* Convert a key from Table->record format to mem-comparable form */
202 uint pack_record(const TABLE *const tbl, uchar *const pack_buffer,
203 const uchar *const record, uchar *const packed_tuple,
204 Rdb_string_writer *const unpack_info,
205 const bool &should_store_row_debug_checksums,
206 const longlong &hidden_pk_id = 0, uint n_key_parts = 0,
207 uint *const n_null_fields = nullptr,
208 uint *const ttl_pk_offset = nullptr,
209 const char *const ttl_bytes = nullptr) const;
210 /* Pack the hidden primary key into mem-comparable form. */
211 uint pack_hidden_pk(const longlong &hidden_pk_id,
212 uchar *const packed_tuple) const;
213 int unpack_field(Rdb_field_packing *const fpi,
214 Field *const field,
215 Rdb_string_reader* reader,
216 const uchar *const default_value,
217 Rdb_string_reader* unp_reader) const;
218 int unpack_record(TABLE *const table, uchar *const buf,
219 const rocksdb::Slice *const packed_key,
220 const rocksdb::Slice *const unpack_info,
221 const bool &verify_row_debug_checksums) const;
222
223 static bool unpack_info_has_checksum(const rocksdb::Slice &unpack_info);
224 int compare_keys(const rocksdb::Slice *key1, const rocksdb::Slice *key2,
225 std::size_t *const column_index) const;
226
227 size_t key_length(const TABLE *const table, const rocksdb::Slice &key) const;
228
229 /* Get the key that is the "infimum" for this index */
230 inline void get_infimum_key(uchar *const key, uint *const size) const {
231 rdb_netbuf_store_index(key, m_index_number);
232 *size = INDEX_NUMBER_SIZE;
233 }
234
235 /* Get the key that is a "supremum" for this index */
236 inline void get_supremum_key(uchar *const key, uint *const size) const {
237 rdb_netbuf_store_index(key, m_index_number + 1);
238 *size = INDEX_NUMBER_SIZE;
239 }
240
241 /*
242 Get the first key that you need to position at to start iterating.
243
244 Stores into *key a "supremum" or "infimum" key value for the index.
245
246 @return Number of bytes in the key that are usable for bloom filter use.
247 */
248 inline int get_first_key(uchar *const key, uint *const size) const {
249 if (m_is_reverse_cf)
250 get_supremum_key(key, size);
251 else
252 get_infimum_key(key, size);
253
254 /* Find out how many bytes of infimum are the same as m_index_number */
255 uchar unmodified_key[INDEX_NUMBER_SIZE];
256 rdb_netbuf_store_index(unmodified_key, m_index_number);
257 int i;
258 for (i = 0; i < INDEX_NUMBER_SIZE; i++) {
259 if (key[i] != unmodified_key[i])
260 break;
261 }
262 return i;
263 }
264
265 /* Make a key that is right after the given key. */
266 static int successor(uchar *const packed_tuple, const uint &len);
267
268 /* Make a key that is right before the given key. */
269 static int predecessor(uchar *const packed_tuple, const uint &len);
270
271 /*
272 This can be used to compare prefixes.
273 if X is a prefix of Y, then we consider that X = Y.
274 */
275 // b describes the lookup key, which can be a prefix of a.
276 // b might be outside of the index_number range, if successor() is called.
277 int cmp_full_keys(const rocksdb::Slice &a, const rocksdb::Slice &b) const {
278 DBUG_ASSERT(covers_key(a));
279
280 return memcmp(a.data(), b.data(), std::min(a.size(), b.size()));
281 }
282
283 /* Check if given mem-comparable key belongs to this index */
284 bool covers_key(const rocksdb::Slice &slice) const {
285 if (slice.size() < INDEX_NUMBER_SIZE)
286 return false;
287
288 if (memcmp(slice.data(), m_index_number_storage_form, INDEX_NUMBER_SIZE))
289 return false;
290
291 return true;
292 }
293
294 void get_lookup_bitmap(const TABLE *table, MY_BITMAP *map) const;
295
296 bool covers_lookup(TABLE *const table,
297 const rocksdb::Slice *const unpack_info,
298 const MY_BITMAP *const map) const;
299
300 inline bool use_covered_bitmap_format() const {
301 return m_index_type == INDEX_TYPE_SECONDARY &&
302 m_kv_format_version >= SECONDARY_FORMAT_VERSION_UPDATE3;
303 }
304
305 /*
306 Return true if the passed mem-comparable key
307 - is from this index, and
308 - it matches the passed key prefix (the prefix is also in mem-comparable
309 form)
310 */
311 bool value_matches_prefix(const rocksdb::Slice &value,
312 const rocksdb::Slice &prefix) const {
313 return covers_key(value) && !cmp_full_keys(value, prefix);
314 }
315
316 uint32 get_keyno() const { return m_keyno; }
317
318 uint32 get_index_number() const { return m_index_number; }
319
320 GL_INDEX_ID get_gl_index_id() const {
321 const GL_INDEX_ID gl_index_id = {m_cf_handle->GetID(), m_index_number};
322 return gl_index_id;
323 }
324
325 int read_memcmp_key_part(const TABLE *table_arg, Rdb_string_reader *reader,
326 const uint part_num) const;
327
328 /* Must only be called for secondary keys: */
329 uint get_primary_key_tuple(const TABLE *const tbl,
330 const Rdb_key_def &pk_descr,
331 const rocksdb::Slice *const key,
332 uchar *const pk_buffer) const;
333
334 uint get_memcmp_sk_parts(const TABLE *table, const rocksdb::Slice &key,
335 uchar *sk_buffer, uint *n_null_fields) const;
336
337 /* Return max length of mem-comparable form */
338 uint max_storage_fmt_length() const { return m_maxlength; }
339
340 uint get_key_parts() const { return m_key_parts; }
341
342 uint get_ttl_field_offset() const { return m_ttl_field_offset; }
343
344 /*
345 Get a field object for key part #part_no
346
347 @detail
348 SQL layer thinks unique secondary indexes and indexes in partitioned
349 tables are not "Extended" with Primary Key columns.
350
351 Internally, we always extend all indexes with PK columns. This function
352 uses our definition of how the index is Extended.
353 */
354 inline Field *get_table_field_for_part_no(TABLE *table, uint part_no) const;
355
356 const std::string &get_name() const { return m_name; }
357
358 const rocksdb::SliceTransform *get_extractor() const {
359 return m_prefix_extractor.get();
360 }
361
362 static size_t get_unpack_header_size(char tag);
363
364 Rdb_key_def &operator=(const Rdb_key_def &) = delete;
365 Rdb_key_def(const Rdb_key_def &k);
366 Rdb_key_def(uint indexnr_arg, uint keyno_arg,
367 rocksdb::ColumnFamilyHandle *cf_handle_arg,
368 uint16_t index_dict_version_arg, uchar index_type_arg,
369 uint16_t kv_format_version_arg, bool is_reverse_cf_arg,
370 bool is_per_partition_cf, const char *name,
371 Rdb_index_stats stats = Rdb_index_stats(), uint32 index_flags = 0,
372 uint32 ttl_rec_offset = UINT_MAX, uint64 ttl_duration = 0);
373 ~Rdb_key_def();
374
375 enum {
376 INDEX_NUMBER_SIZE = 4,
377 VERSION_SIZE = 2,
378 CF_NUMBER_SIZE = 4,
379 CF_FLAG_SIZE = 4,
380 PACKED_SIZE = 4, // one int
381 };
382
383 // bit flags for combining bools when writing to disk
384 enum {
385 REVERSE_CF_FLAG = 1,
386 AUTO_CF_FLAG = 2, // Deprecated
387 PER_PARTITION_CF_FLAG = 4,
388 };
389
390 // bit flags which denote myrocks specific fields stored in the record
391 // currently only used for TTL.
392 enum INDEX_FLAG {
393 TTL_FLAG = 1 << 0,
394
395 // MAX_FLAG marks where the actual record starts
396 // This flag always needs to be set to the last index flag enum.
397 MAX_FLAG = TTL_FLAG << 1,
398 };
399
400 // Set of flags to ignore when comparing two CF-s and determining if
401 // they're same.
402 static const uint CF_FLAGS_TO_IGNORE = PER_PARTITION_CF_FLAG;
403
404 // Data dictionary types
405 enum DATA_DICT_TYPE {
406 DDL_ENTRY_INDEX_START_NUMBER = 1,
407 INDEX_INFO = 2,
408 CF_DEFINITION = 3,
409 BINLOG_INFO_INDEX_NUMBER = 4,
410 DDL_DROP_INDEX_ONGOING = 5,
411 INDEX_STATISTICS = 6,
412 MAX_INDEX_ID = 7,
413 DDL_CREATE_INDEX_ONGOING = 8,
414 AUTO_INC = 9,
415 END_DICT_INDEX_ID = 255
416 };
417
418 // Data dictionary schema version. Introduce newer versions
419 // if changing schema layout
420 enum {
421 DDL_ENTRY_INDEX_VERSION = 1,
422 CF_DEFINITION_VERSION = 1,
423 BINLOG_INFO_INDEX_NUMBER_VERSION = 1,
424 DDL_DROP_INDEX_ONGOING_VERSION = 1,
425 MAX_INDEX_ID_VERSION = 1,
426 DDL_CREATE_INDEX_ONGOING_VERSION = 1,
427 AUTO_INCREMENT_VERSION = 1,
428 // Version for index stats is stored in IndexStats struct
429 };
430
431 // Index info version. Introduce newer versions when changing the
432 // INDEX_INFO layout. Update INDEX_INFO_VERSION_LATEST to point to the
433 // latest version number.
434 enum {
435 INDEX_INFO_VERSION_INITIAL = 1, // Obsolete
436 INDEX_INFO_VERSION_KV_FORMAT,
437 INDEX_INFO_VERSION_GLOBAL_ID,
438 // There is no change to data format in this version, but this version
439 // verifies KV format version, whereas previous versions do not. A version
440 // bump is needed to prevent older binaries from skipping the KV version
441 // check inadvertently.
442 INDEX_INFO_VERSION_VERIFY_KV_FORMAT,
443 // This changes the data format to include a 8 byte TTL duration for tables
444 INDEX_INFO_VERSION_TTL,
445 // This changes the data format to include a bitmap before the TTL duration
446 // which will indicate in the future whether TTL or other special fields
447 // are turned on or off.
448 INDEX_INFO_VERSION_FIELD_FLAGS,
449 // This normally point to the latest (currently it does).
450 INDEX_INFO_VERSION_LATEST = INDEX_INFO_VERSION_FIELD_FLAGS,
451 };
452
453 // MyRocks index types
454 enum {
455 INDEX_TYPE_PRIMARY = 1,
456 INDEX_TYPE_SECONDARY = 2,
457 INDEX_TYPE_HIDDEN_PRIMARY = 3,
458 };
459
460 // Key/Value format version for each index type
461 enum {
462 PRIMARY_FORMAT_VERSION_INITIAL = 10,
463 // This change includes:
464 // - For columns that can be unpacked with unpack_info, PK
465 // stores the unpack_info.
466 // - DECIMAL datatype is no longer stored in the row (because
467 // it can be decoded from its mem-comparable form)
468 // - VARCHAR-columns use endspace-padding.
469 PRIMARY_FORMAT_VERSION_UPDATE1 = 11,
470 // This change includes:
471 // - Binary encoded variable length fields have a new format that avoids
472 // an inefficient where data that was a multiple of 8 bytes in length
473 // had an extra 9 bytes of encoded data.
474 PRIMARY_FORMAT_VERSION_UPDATE2 = 12,
475 // This change includes support for TTL
476 // - This means that when TTL is specified for the table an 8-byte TTL
477 // field is prepended in front of each value.
478 PRIMARY_FORMAT_VERSION_TTL = 13,
479 PRIMARY_FORMAT_VERSION_LATEST = PRIMARY_FORMAT_VERSION_TTL,
480
481 SECONDARY_FORMAT_VERSION_INITIAL = 10,
482 // This change the SK format to include unpack_info.
483 SECONDARY_FORMAT_VERSION_UPDATE1 = 11,
484 // This change includes:
485 // - Binary encoded variable length fields have a new format that avoids
486 // an inefficient where data that was a multiple of 8 bytes in length
487 // had an extra 9 bytes of encoded data.
488 SECONDARY_FORMAT_VERSION_UPDATE2 = 12,
489 // This change includes support for TTL
490 // - This means that when TTL is specified for the table an 8-byte TTL
491 // field is prepended in front of each value.
492 SECONDARY_FORMAT_VERSION_TTL = 13,
493 SECONDARY_FORMAT_VERSION_LATEST = SECONDARY_FORMAT_VERSION_TTL,
494 // This change includes support for covering SK lookups for varchars. A
495 // 2-byte bitmap is added after the tag-byte to unpack_info only for
496 // records which have covered varchar columns. Currently waiting before
497 // enabling in prod.
498 SECONDARY_FORMAT_VERSION_UPDATE3 = 65535,
499 };
500
501 void setup(const TABLE *const table, const Rdb_tbl_def *const tbl_def);
502
503 static uint extract_ttl_duration(const TABLE *const table_arg,
504 const Rdb_tbl_def *const tbl_def_arg,
505 uint64 *ttl_duration);
506 static uint extract_ttl_col(const TABLE *const table_arg,
507 const Rdb_tbl_def *const tbl_def_arg,
508 std::string *ttl_column, uint *ttl_field_offset,
509 bool skip_checks = false);
510 inline bool has_ttl() const { return m_ttl_duration > 0; }
511
512 static bool has_index_flag(uint32 index_flags, enum INDEX_FLAG flag);
513 static uint32 calculate_index_flag_offset(uint32 index_flags,
514 enum INDEX_FLAG flag,
515 uint *const field_length = nullptr);
516 void write_index_flag_field(Rdb_string_writer *const buf,
517 const uchar *const val,
518 enum INDEX_FLAG flag) const;
519
520 static const std::string
521 gen_qualifier_for_table(const char *const qualifier,
522 const std::string &partition_name = "");
523 static const std::string
524 gen_cf_name_qualifier_for_partition(const std::string &s);
525 static const std::string
526 gen_ttl_duration_qualifier_for_partition(const std::string &s);
527 static const std::string
528 gen_ttl_col_qualifier_for_partition(const std::string &s);
529
530 static const std::string parse_comment_for_qualifier(
531 const std::string &comment, const TABLE *const table_arg,
532 const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found,
533 const char *const qualifier);
534
535 rocksdb::ColumnFamilyHandle *get_cf() const { return m_cf_handle; }
536
537 /* Check if keypart #kp can be unpacked from index tuple */
538 inline bool can_unpack(const uint &kp) const;
539 /* Check if keypart #kp needs unpack info */
540 inline bool has_unpack_info(const uint &kp) const;
541
542 /* Check if given table has a primary key */
543 static bool table_has_hidden_pk(const TABLE *const table);
544
545 void report_checksum_mismatch(const bool &is_key, const char *const data,
546 const size_t data_size) const;
547
548 /* Check if index is at least pk_min if it is a PK,
549 or at least sk_min if SK.*/
550 bool index_format_min_check(const int &pk_min, const int &sk_min) const;
551
552 void pack_with_make_sort_key(
553 Rdb_field_packing *const fpi, Field *const field,
554 uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst,
555 Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__))) const;
556
557 void pack_with_varchar_encoding(
558 Rdb_field_packing *const fpi, Field *const field, uchar *buf, uchar **dst,
559 Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__))) const;
560
561 void
562 pack_with_varchar_space_pad(Rdb_field_packing *const fpi, Field *const field,
563 uchar *buf, uchar **dst,
564 Rdb_pack_field_context *const pack_ctx) const;
565
566 int unpack_integer(Rdb_field_packing *const fpi, Field *const field,
567 uchar *const to, Rdb_string_reader *const reader,
568 Rdb_string_reader *const unp_reader
569 MY_ATTRIBUTE((__unused__))) const;
570
571 int unpack_double(Rdb_field_packing *const fpi MY_ATTRIBUTE((__unused__)),
572 Field *const field MY_ATTRIBUTE((__unused__)),
573 uchar *const field_ptr, Rdb_string_reader *const reader,
574 Rdb_string_reader *const unp_reader
575 MY_ATTRIBUTE((__unused__))) const;
576
577 int unpack_float(Rdb_field_packing *const fpi,
578 Field *const field MY_ATTRIBUTE((__unused__)),
579 uchar *const field_ptr, Rdb_string_reader *const reader,
580 Rdb_string_reader *const unp_reader
581 MY_ATTRIBUTE((__unused__))) const;
582
583 int unpack_binary_str(Rdb_field_packing *const fpi, Field *const field,
584 uchar *const to, Rdb_string_reader *const reader,
585 Rdb_string_reader *const unp_reader
586 MY_ATTRIBUTE((__unused__))) const;
587
588 int unpack_binary_or_utf8_varchar(
589 Rdb_field_packing *const fpi, Field *const field, uchar *dst,
590 Rdb_string_reader *const reader,
591 Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__))) const;
592
593 int unpack_binary_or_utf8_varchar_space_pad(
594 Rdb_field_packing *const fpi, Field *const field, uchar *dst,
595 Rdb_string_reader *const reader,
596 Rdb_string_reader *const unp_reader) const;
597
598 int unpack_newdate(Rdb_field_packing *const fpi,
599 Field *const field MY_ATTRIBUTE((__unused__)),
600 uchar *const field_ptr, Rdb_string_reader *const reader,
601 Rdb_string_reader *const unp_reader
602 MY_ATTRIBUTE((__unused__))) const;
603
604 int unpack_utf8_str(Rdb_field_packing *const fpi, Field *const field,
605 uchar *dst, Rdb_string_reader *const reader,
606 Rdb_string_reader *const unp_reader
607 MY_ATTRIBUTE((__unused__))) const;
608
609 int unpack_unknown_varchar(Rdb_field_packing *const fpi, Field *const field,
610 uchar *dst, Rdb_string_reader *const reader,
611 Rdb_string_reader *const unp_reader) const;
612
613 int unpack_simple_varchar_space_pad(
614 Rdb_field_packing *const fpi, Field *const field, uchar *dst,
615 Rdb_string_reader *const reader,
616 Rdb_string_reader *const unp_reader) const;
617
618 int unpack_simple(Rdb_field_packing *const fpi,
619 Field *const field MY_ATTRIBUTE((__unused__)),
620 uchar *const dst, Rdb_string_reader *const reader,
621 Rdb_string_reader *const unp_reader) const;
622
623 int unpack_unknown(Rdb_field_packing *const fpi, Field *const field,
624 uchar *const dst, Rdb_string_reader *const reader,
625 Rdb_string_reader *const unp_reader) const;
626
627 int unpack_floating_point(uchar *const dst, Rdb_string_reader *const reader,
628 const size_t &size, const int &exp_digit,
629 const uchar *const zero_pattern,
630 const uchar *const zero_val,
631 void (*swap_func)(uchar *, const uchar *)) const;
632
633 void make_unpack_simple_varchar(const Rdb_collation_codec *const codec,
634 const Field *const field,
635 Rdb_pack_field_context *const pack_ctx) const;
636
637 void make_unpack_simple(const Rdb_collation_codec *const codec,
638 const Field *const field,
639 Rdb_pack_field_context *const pack_ctx) const;
640
641 void make_unpack_unknown(
642 const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)),
643 const Field *const field, Rdb_pack_field_context *const pack_ctx) const;
644
645 void make_unpack_unknown_varchar(
646 const Rdb_collation_codec *const codec MY_ATTRIBUTE((__unused__)),
647 const Field *const field, Rdb_pack_field_context *const pack_ctx) const;
648
649 void dummy_make_unpack_info(
650 const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)),
651 const Field *field MY_ATTRIBUTE((__unused__)),
652 Rdb_pack_field_context *pack_ctx MY_ATTRIBUTE((__unused__))) const;
653
654 int skip_max_length(const Rdb_field_packing *const fpi,
655 const Field *const field MY_ATTRIBUTE((__unused__)),
656 Rdb_string_reader *const reader) const;
657
658 int skip_variable_length(
659 const Rdb_field_packing *const fpi MY_ATTRIBUTE((__unused__)),
660 const Field *const field, Rdb_string_reader *const reader) const;
661
662 int skip_variable_space_pad(const Rdb_field_packing *const fpi,
663 const Field *const field,
664 Rdb_string_reader *const reader) const;
665
666 inline bool use_legacy_varbinary_format() const {
667 return !index_format_min_check(PRIMARY_FORMAT_VERSION_UPDATE2,
668 SECONDARY_FORMAT_VERSION_UPDATE2);
669 }
670
671 static inline bool is_unpack_data_tag(char c) {
672 return c == RDB_UNPACK_DATA_TAG || c == RDB_UNPACK_COVERED_DATA_TAG;
673 }
674
675 private:
676#ifndef DBUG_OFF
677 inline bool is_storage_available(const int &offset, const int &needed) const {
678 const int storage_length = static_cast<int>(max_storage_fmt_length());
679 return (storage_length - offset) >= needed;
680 }
681#else
682 inline bool is_storage_available(const int &offset, const int &needed) const {
683 return 1;
684 }
685#endif // DBUG_OFF
686
687 /* Global number of this index (used as prefix in StorageFormat) */
688 const uint32 m_index_number;
689
690 uchar m_index_number_storage_form[INDEX_NUMBER_SIZE];
691
692 rocksdb::ColumnFamilyHandle *m_cf_handle;
693
694 void pack_legacy_variable_format(const uchar *src, size_t src_len,
695 uchar **dst) const;
696
697 void pack_variable_format(const uchar *src, size_t src_len,
698 uchar **dst) const;
699
700 uint calc_unpack_legacy_variable_format(uchar flag, bool *done) const;
701
702 uint calc_unpack_variable_format(uchar flag, bool *done) const;
703
704 public:
705 uint16_t m_index_dict_version;
706 uchar m_index_type;
707 /* KV format version for the index id */
708 uint16_t m_kv_format_version;
709 /* If true, the column family stores data in the reverse order */
710 bool m_is_reverse_cf;
711
712 /* If true, then column family is created per partition. */
713 bool m_is_per_partition_cf;
714
715 std::string m_name;
716 mutable Rdb_index_stats m_stats;
717
718 /*
719 Bitmap containing information about whether TTL or other special fields
720 are enabled for the given index.
721 */
722 uint32 m_index_flags_bitmap;
723
724 /*
725 How much space in bytes the index flag fields occupy.
726 */
727 uint32 m_total_index_flags_length;
728
729 /*
730 Offset in the records where the 8-byte TTL is stored (UINT_MAX if no TTL)
731 */
732 uint32 m_ttl_rec_offset;
733
734 /* Default TTL duration */
735 uint64 m_ttl_duration;
736
737 /* TTL column (if defined by user, otherwise implicit TTL is used) */
738 std::string m_ttl_column;
739
740 private:
741 friend class Rdb_tbl_def; // for m_index_number above
742
743 /* Number of key parts in the primary key*/
744 uint m_pk_key_parts;
745
746 /*
747 pk_part_no[X]=Y means that keypart #X of this key is key part #Y of the
748 primary key. Y==-1 means this column is not present in the primary key.
749 */
750 uint *m_pk_part_no;
751
752 /* Array of index-part descriptors. */
753 Rdb_field_packing *m_pack_info;
754
755 uint m_keyno; /* number of this index in the table */
756
757 /*
758 Number of key parts in the index (including "index extension"). This is how
759 many elements are in the m_pack_info array.
760 */
761 uint m_key_parts;
762
763 /*
764 If TTL column is part of the PK, offset of the column within pk.
765 Default is UINT_MAX to denote that TTL col is not part of PK.
766 */
767 uint m_ttl_pk_key_part_offset;
768
769 /*
770 Index of the TTL column in table->s->fields, if it exists.
771 Default is UINT_MAX to denote that it does not exist.
772 */
773 uint m_ttl_field_offset;
774
775 /* Prefix extractor for the column family of the key definiton */
776 std::shared_ptr<const rocksdb::SliceTransform> m_prefix_extractor;
777
778 /* Maximum length of the mem-comparable form. */
779 uint m_maxlength;
780
781 /* mutex to protect setup */
782 mysql_mutex_t m_mutex;
783};
784
785// "Simple" collations (those specified in strings/ctype-simple.c) are simple
786// because their strnxfrm function maps one byte to one byte. However, the
787// mapping is not injective, so the inverse function will take in an extra
788// index parameter containing information to disambiguate what the original
789// character was.
790//
791// The m_enc* members are for encoding. Generally, we want encoding to be:
792// src -> (dst, idx)
793//
794// Since strnxfrm already gives us dst, we just need m_enc_idx[src] to give us
795// idx.
796//
797// For the inverse, we have:
798// (dst, idx) -> src
799//
800// We have m_dec_idx[idx][dst] = src to get our original character back.
801//
802struct Rdb_collation_codec {
803 const my_core::CHARSET_INFO *m_cs;
804 // The first element unpacks VARCHAR(n), the second one - CHAR(n).
805 std::array<rdb_make_unpack_info_t, 2> m_make_unpack_info_func;
806 std::array<rdb_index_field_unpack_t, 2> m_unpack_func;
807
808 std::array<uchar, 256> m_enc_idx;
809 std::array<uchar, 256> m_enc_size;
810
811 std::array<uchar, 256> m_dec_size;
812 std::vector<std::array<uchar, 256>> m_dec_idx;
813};
814
815extern mysql_mutex_t rdb_collation_data_mutex;
816extern mysql_mutex_t rdb_mem_cmp_space_mutex;
817extern std::array<const Rdb_collation_codec *, MY_ALL_CHARSETS_SIZE>
818 rdb_collation_data;
819
820class Rdb_field_packing {
821public:
822 Rdb_field_packing(const Rdb_field_packing &) = delete;
823 Rdb_field_packing &operator=(const Rdb_field_packing &) = delete;
824 Rdb_field_packing() = default;
825
826 /* Length of mem-comparable image of the field, in bytes */
827 int m_max_image_len;
828
829 /* Length of image in the unpack data */
830 int m_unpack_data_len;
831 int m_unpack_data_offset;
832
833 bool m_maybe_null; /* TRUE <=> NULL-byte is stored */
834
835 /*
836 Valid only for VARCHAR fields.
837 */
838 const CHARSET_INFO *m_varchar_charset;
839
840 // (Valid when Variable Length Space Padded Encoding is used):
841 uint m_segment_size; // size of segment used
842
843 // number of bytes used to store number of trimmed (or added)
844 // spaces in the upack_info
845 bool m_unpack_info_uses_two_bytes;
846
847 /*
848 True implies that an index-only read is always possible for this field.
849 False means an index-only read may be possible depending on the record and
850 field type.
851 */
852 bool m_covered;
853
854 const std::vector<uchar> *space_xfrm;
855 size_t space_xfrm_len;
856 size_t space_mb_len;
857
858 const Rdb_collation_codec *m_charset_codec;
859
860 /*
861 @return TRUE: this field makes use of unpack_info.
862 */
863 bool uses_unpack_info() const { return (m_make_unpack_info_func != nullptr); }
864
865 /* TRUE means unpack_info stores the original field value */
866 bool m_unpack_info_stores_value;
867
868 rdb_index_field_pack_t m_pack_func;
869 rdb_make_unpack_info_t m_make_unpack_info_func;
870
871 /*
872 This function takes
873 - mem-comparable form
874 - unpack_info data
875 and restores the original value.
876 */
877 rdb_index_field_unpack_t m_unpack_func;
878
879 /*
880 This function skips over mem-comparable form.
881 */
882 rdb_index_field_skip_t m_skip_func;
883
884private:
885 /*
886 Location of the field in the table (key number and key part number).
887
888 Note that this describes not the field, but rather a position of field in
889 the index. Consider an example:
890
891 col1 VARCHAR (100),
892 INDEX idx1 (col1)),
893 INDEX idx2 (col1(10)),
894
895 Here, idx2 has a special Field object that is set to describe a 10-char
896 prefix of col1.
897
898 We must also store the keynr. It is needed for implicit "extended keys".
899 Every key in MyRocks needs to include PK columns. Generally, SQL layer
900 includes PK columns as part of its "Extended Keys" feature, but sometimes
901 it does not (known examples are unique secondary indexes and partitioned
902 tables).
903 In that case, MyRocks's index descriptor has invisible suffix of PK
904 columns (and the point is that these columns are parts of PK, not parts
905 of the current index).
906 */
907 uint m_keynr;
908 uint m_key_part;
909
910public:
911 bool setup(const Rdb_key_def *const key_descr, const Field *const field,
912 const uint &keynr_arg, const uint &key_part_arg,
913 const uint16 &key_length);
914 Field *get_field_in_table(const TABLE *const tbl) const;
915 void fill_hidden_pk_val(uchar **dst, const longlong &hidden_pk_id) const;
916};
917
918/*
919 Descriptor telling how to decode/encode a field to on-disk record storage
920 format. Not all information is in the structure yet, but eventually we
921 want to have as much as possible there to avoid virtual calls.
922
923 For encoding/decoding of index tuples, see Rdb_key_def.
924 */
925class Rdb_field_encoder {
926public:
927 Rdb_field_encoder(const Rdb_field_encoder &) = delete;
928 Rdb_field_encoder &operator=(const Rdb_field_encoder &) = delete;
929 /*
930 STORE_NONE is set when a column can be decoded solely from their
931 mem-comparable form.
932 STORE_SOME is set when a column can be decoded from their mem-comparable
933 form plus unpack_info.
934 STORE_ALL is set when a column cannot be decoded, so its original value
935 must be stored in the PK records.
936 */
937 enum STORAGE_TYPE {
938 STORE_NONE,
939 STORE_SOME,
940 STORE_ALL,
941 };
942 STORAGE_TYPE m_storage_type;
943
944 uint m_null_offset;
945 uint16 m_field_index;
946
947 uchar m_null_mask; // 0 means the field cannot be null
948
949 my_core::enum_field_types m_field_type;
950
951 uint m_pack_length_in_rec;
952
953 bool maybe_null() const { return m_null_mask != 0; }
954
955 bool uses_variable_len_encoding() const {
956 return (m_field_type == MYSQL_TYPE_BLOB ||
957 m_field_type == MYSQL_TYPE_VARCHAR);
958 }
959};
960
961inline Field *Rdb_key_def::get_table_field_for_part_no(TABLE *table,
962 uint part_no) const {
963 DBUG_ASSERT(part_no < get_key_parts());
964 return m_pack_info[part_no].get_field_in_table(table);
965}
966
967inline bool Rdb_key_def::can_unpack(const uint &kp) const {
968 DBUG_ASSERT(kp < m_key_parts);
969 return (m_pack_info[kp].m_unpack_func != nullptr);
970}
971
972inline bool Rdb_key_def::has_unpack_info(const uint &kp) const {
973 DBUG_ASSERT(kp < m_key_parts);
974 return m_pack_info[kp].uses_unpack_info();
975}
976
977/*
978 A table definition. This is an entry in the mapping
979
980 dbname.tablename -> {index_nr, index_nr, ... }
981
982 There is only one Rdb_tbl_def object for a given table.
983 That's why we keep auto_increment value here, too.
984*/
985
986class Rdb_tbl_def {
987private:
988 void check_if_is_mysql_system_table();
989
990 /* Stores 'dbname.tablename' */
991 std::string m_dbname_tablename;
992
993 /* Store the db name, table name, and partition name */
994 std::string m_dbname;
995 std::string m_tablename;
996 std::string m_partition;
997
998 void set_name(const std::string &name);
999
1000public:
1001 Rdb_tbl_def(const Rdb_tbl_def &) = delete;
1002 Rdb_tbl_def &operator=(const Rdb_tbl_def &) = delete;
1003
1004 explicit Rdb_tbl_def(const std::string &name)
1005 : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0) {
1006 set_name(name);
1007 }
1008
1009 Rdb_tbl_def(const char *const name, const size_t &len)
1010 : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0) {
1011 set_name(std::string(name, len));
1012 }
1013
1014 explicit Rdb_tbl_def(const rocksdb::Slice &slice, const size_t &pos = 0)
1015 : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0) {
1016 set_name(std::string(slice.data() + pos, slice.size() - pos));
1017 }
1018
1019 ~Rdb_tbl_def();
1020
1021 /* Number of indexes */
1022 uint m_key_count;
1023
1024 /* Array of index descriptors */
1025 std::shared_ptr<Rdb_key_def> *m_key_descr_arr;
1026
1027 std::atomic<longlong> m_hidden_pk_val;
1028 std::atomic<ulonglong> m_auto_incr_val;
1029
1030 /* Is this a system table */
1031 bool m_is_mysql_system_table;
1032
1033 bool put_dict(Rdb_dict_manager *const dict, rocksdb::WriteBatch *const batch,
1034 uchar *const key, const size_t &keylen);
1035
1036 const std::string &full_tablename() const { return m_dbname_tablename; }
1037 const std::string &base_dbname() const { return m_dbname; }
1038 const std::string &base_tablename() const { return m_tablename; }
1039 const std::string &base_partition() const { return m_partition; }
1040 GL_INDEX_ID get_autoincr_gl_index_id();
1041};
1042
1043/*
1044 A thread-safe sequential number generator. Its performance is not a concern
1045 hence it is ok to protect it by a mutex.
1046*/
1047
1048class Rdb_seq_generator {
1049 uint m_next_number = 0;
1050
1051 mysql_mutex_t m_mutex;
1052
1053public:
1054 Rdb_seq_generator(const Rdb_seq_generator &) = delete;
1055 Rdb_seq_generator &operator=(const Rdb_seq_generator &) = delete;
1056 Rdb_seq_generator() = default;
1057
1058 void init(const uint &initial_number) {
1059 mysql_mutex_init(0, &m_mutex, MY_MUTEX_INIT_FAST);
1060 m_next_number = initial_number;
1061 }
1062
1063 uint get_and_update_next_number(Rdb_dict_manager *const dict);
1064
1065 void cleanup() { mysql_mutex_destroy(&m_mutex); }
1066};
1067
1068interface Rdb_tables_scanner {
1069 virtual int add_table(Rdb_tbl_def * tdef) = 0;
1070 virtual ~Rdb_tables_scanner() {} /* Keep the compiler happy */
1071};
1072
1073/*
1074 This contains a mapping of
1075
1076 dbname.table_name -> array{Rdb_key_def}.
1077
1078 objects are shared among all threads.
1079*/
1080
1081class Rdb_ddl_manager {
1082 Rdb_dict_manager *m_dict = nullptr;
1083 my_core::HASH m_ddl_hash; // Contains Rdb_tbl_def elements
1084 // Maps index id to <table_name, index number>
1085 std::map<GL_INDEX_ID, std::pair<std::string, uint>> m_index_num_to_keydef;
1086
1087 // Maps index id to key definitons not yet committed to data dictionary.
1088 // This is mainly used to store key definitions during ALTER TABLE.
1089 std::map<GL_INDEX_ID, std::shared_ptr<Rdb_key_def>>
1090 m_index_num_to_uncommitted_keydef;
1091 mysql_rwlock_t m_rwlock;
1092
1093 Rdb_seq_generator m_sequence;
1094 // A queue of table stats to write into data dictionary
1095 // It is produced by event listener (ie compaction and flush threads)
1096 // and consumed by the rocksdb background thread
1097 std::map<GL_INDEX_ID, Rdb_index_stats> m_stats2store;
1098
1099 const std::shared_ptr<Rdb_key_def> &find(GL_INDEX_ID gl_index_id);
1100
1101public:
1102 Rdb_ddl_manager(const Rdb_ddl_manager &) = delete;
1103 Rdb_ddl_manager &operator=(const Rdb_ddl_manager &) = delete;
1104 Rdb_ddl_manager() {}
1105
1106 /* Load the data dictionary from on-disk storage */
1107 bool init(Rdb_dict_manager *const dict_arg, Rdb_cf_manager *const cf_manager,
1108 const uint32_t &validate_tables);
1109
1110 void cleanup();
1111
1112 Rdb_tbl_def *find(const std::string &table_name, const bool &lock = true);
1113 std::shared_ptr<const Rdb_key_def> safe_find(GL_INDEX_ID gl_index_id);
1114 void set_stats(const std::unordered_map<GL_INDEX_ID, Rdb_index_stats> &stats);
1115 void adjust_stats(const std::vector<Rdb_index_stats> &new_data,
1116 const std::vector<Rdb_index_stats> &deleted_data =
1117 std::vector<Rdb_index_stats>());
1118 void persist_stats(const bool &sync = false);
1119
1120 /* Modify the mapping and write it to on-disk storage */
1121 int put_and_write(Rdb_tbl_def *const key_descr,
1122 rocksdb::WriteBatch *const batch);
1123 void remove(Rdb_tbl_def *const rec, rocksdb::WriteBatch *const batch,
1124 const bool &lock = true);
1125 bool rename(const std::string &from, const std::string &to,
1126 rocksdb::WriteBatch *const batch);
1127
1128 uint get_and_update_next_number(Rdb_dict_manager *const dict) {
1129 return m_sequence.get_and_update_next_number(dict);
1130 }
1131
1132 const std::string safe_get_table_name(const GL_INDEX_ID &gl_index_id);
1133
1134 /* Walk the data dictionary */
1135 int scan_for_tables(Rdb_tables_scanner *tables_scanner);
1136
1137 void erase_index_num(const GL_INDEX_ID &gl_index_id);
1138 void add_uncommitted_keydefs(
1139 const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);
1140 void remove_uncommitted_keydefs(
1141 const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);
1142
1143private:
1144 /* Put the data into in-memory table (only) */
1145 int put(Rdb_tbl_def *const key_descr, const bool &lock = true);
1146
1147 /* Helper functions to be passed to my_core::HASH object */
1148 static const uchar *get_hash_key(Rdb_tbl_def *const rec, size_t *const length,
1149 my_bool not_used MY_ATTRIBUTE((unused)));
1150 static void free_hash_elem(void *const data);
1151
1152 bool validate_schemas();
1153
1154 bool validate_auto_incr();
1155};
1156
1157/*
1158 Writing binlog information into RocksDB at commit(),
1159 and retrieving binlog information at crash recovery.
1160 commit() and recovery are always executed by at most single client
1161 at the same time, so concurrency control is not needed.
1162
  Binlog info is stored in RocksDB as follows.
1164 key: BINLOG_INFO_INDEX_NUMBER
1165 value: packed single row:
1166 binlog_name_length (2 byte form)
1167 binlog_name
1168 binlog_position (4 byte form)
1169 binlog_gtid_length (2 byte form)
1170 binlog_gtid
1171*/
1172class Rdb_binlog_manager {
1173public:
1174 Rdb_binlog_manager(const Rdb_binlog_manager &) = delete;
1175 Rdb_binlog_manager &operator=(const Rdb_binlog_manager &) = delete;
1176 Rdb_binlog_manager() = default;
1177
1178 bool init(Rdb_dict_manager *const dict);
1179 void cleanup();
1180 void update(const char *const binlog_name, const my_off_t binlog_pos,
1181 rocksdb::WriteBatchBase *const batch);
1182 bool read(char *const binlog_name, my_off_t *const binlog_pos,
1183 char *const binlog_gtid) const;
1184 void update_slave_gtid_info(const uint &id, const char *const db,
1185 const char *const gtid,
1186 rocksdb::WriteBatchBase *const write_batch);
1187
1188private:
1189 Rdb_dict_manager *m_dict = nullptr;
1190 uchar m_key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
1191 rocksdb::Slice m_key_slice;
1192
1193 rocksdb::Slice pack_value(uchar *const buf, const char *const binlog_name,
1194 const my_off_t &binlog_pos,
1195 const char *const binlog_gtid) const;
1196 bool unpack_value(const uchar *const value, size_t value_size,
1197 char *const binlog_name,
1198 my_off_t *const binlog_pos, char *const binlog_gtid) const;
1199
1200 std::atomic<Rdb_tbl_def *> m_slave_gtid_info_tbl;
1201};
1202
1203/*
1204 Rdb_dict_manager manages how MySQL on RocksDB (MyRocks) stores its
1205 internal data dictionary.
1206 MyRocks stores data dictionary on dedicated system column family
1207 named __system__. The system column family is used by MyRocks
1208 internally only, and not used by applications.
1209
1210 Currently MyRocks has the following data dictionary data models.
1211
1212 1. Table Name => internal index id mappings
1213 key: Rdb_key_def::DDL_ENTRY_INDEX_START_NUMBER(0x1) + dbname.tablename
1214 value: version, {cf_id, index_id}*n_indexes_of_the_table
1215 version is 2 bytes. cf_id and index_id are 4 bytes.
1216
1217 2. internal cf_id, index id => index information
1218 key: Rdb_key_def::INDEX_INFO(0x2) + cf_id + index_id
1219 value: version, index_type, kv_format_version, index_flags, ttl_duration
1220 index_type is 1 byte, version and kv_format_version are 2 bytes.
1221 index_flags is 4 bytes.
1222 ttl_duration is 8 bytes.
1223
1224 3. CF id => CF flags
1225 key: Rdb_key_def::CF_DEFINITION(0x3) + cf_id
1226 value: version, {is_reverse_cf, is_auto_cf (deprecated), is_per_partition_cf}
1227 cf_flags is 4 bytes in total.
1228
1229 4. Binlog entry (updated at commit)
1230 key: Rdb_key_def::BINLOG_INFO_INDEX_NUMBER (0x4)
1231 value: version, {binlog_name,binlog_pos,binlog_gtid}
1232
1233 5. Ongoing drop index entry
1234 key: Rdb_key_def::DDL_DROP_INDEX_ONGOING(0x5) + cf_id + index_id
1235 value: version
1236
1237 6. index stats
1238 key: Rdb_key_def::INDEX_STATISTICS(0x6) + cf_id + index_id
1239 value: version, {materialized PropertiesCollector::IndexStats}
1240
1241 7. maximum index id
1242 key: Rdb_key_def::MAX_INDEX_ID(0x7)
1243 value: index_id
1244 index_id is 4 bytes
1245
1246 8. Ongoing create index entry
1247 key: Rdb_key_def::DDL_CREATE_INDEX_ONGOING(0x8) + cf_id + index_id
1248 value: version
1249
1250 9. auto_increment values
1251 key: Rdb_key_def::AUTO_INC(0x9) + cf_id + index_id
1252 value: version, {max auto_increment so far}
1253 max auto_increment is 8 bytes
1254
  Data dictionary operations must be atomic inside RocksDB. For example,
  creating a table with two indexes requires three Put calls, and all three
  have to be applied atomically. Rdb_dict_manager provides the wrapper
  functions begin() and commit() to make such atomic updates easier.
1259
1260*/
1261class Rdb_dict_manager {
1262private:
1263 mysql_mutex_t m_mutex;
1264 rocksdb::DB *m_db = nullptr;
1265 rocksdb::ColumnFamilyHandle *m_system_cfh = nullptr;
1266 /* Utility to put INDEX_INFO and CF_DEFINITION */
1267
1268 uchar m_key_buf_max_index_id[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
1269 rocksdb::Slice m_key_slice_max_index_id;
1270
1271 static void dump_index_id(uchar *const netbuf,
1272 Rdb_key_def::DATA_DICT_TYPE dict_type,
1273 const GL_INDEX_ID &gl_index_id);
1274 void delete_with_prefix(rocksdb::WriteBatch *const batch,
1275 Rdb_key_def::DATA_DICT_TYPE dict_type,
1276 const GL_INDEX_ID &gl_index_id) const;
1277 /* Functions for fast DROP TABLE/INDEX */
1278 void resume_drop_indexes() const;
1279 void log_start_drop_table(const std::shared_ptr<Rdb_key_def> *const key_descr,
1280 const uint32 &n_keys,
1281 const char *const log_action) const;
1282 void log_start_drop_index(GL_INDEX_ID gl_index_id,
1283 const char *log_action) const;
1284
1285public:
1286 Rdb_dict_manager(const Rdb_dict_manager &) = delete;
1287 Rdb_dict_manager &operator=(const Rdb_dict_manager &) = delete;
1288 Rdb_dict_manager() = default;
1289
1290 bool init(rocksdb::DB *const rdb_dict, Rdb_cf_manager *const cf_manager);
1291
1292 inline void cleanup() { mysql_mutex_destroy(&m_mutex); }
1293
1294 inline void lock() { RDB_MUTEX_LOCK_CHECK(m_mutex); }
1295
1296 inline void unlock() { RDB_MUTEX_UNLOCK_CHECK(m_mutex); }
1297
1298 inline rocksdb::ColumnFamilyHandle *get_system_cf() const {
1299 return m_system_cfh;
1300 }
1301
1302 /* Raw RocksDB operations */
1303 std::unique_ptr<rocksdb::WriteBatch> begin() const;
1304 int commit(rocksdb::WriteBatch *const batch, const bool &sync = true) const;
1305 rocksdb::Status get_value(const rocksdb::Slice &key,
1306 std::string *const value) const;
1307 void put_key(rocksdb::WriteBatchBase *const batch, const rocksdb::Slice &key,
1308 const rocksdb::Slice &value) const;
1309 void delete_key(rocksdb::WriteBatchBase *batch,
1310 const rocksdb::Slice &key) const;
1311 rocksdb::Iterator *new_iterator() const;
1312
1313 /* Internal Index id => CF */
1314 void
1315 add_or_update_index_cf_mapping(rocksdb::WriteBatch *batch,
1316 struct Rdb_index_info *const index_info) const;
1317 void delete_index_info(rocksdb::WriteBatch *batch,
1318 const GL_INDEX_ID &index_id) const;
1319 bool get_index_info(const GL_INDEX_ID &gl_index_id,
1320 struct Rdb_index_info *const index_info) const;
1321
1322 /* CF id => CF flags */
1323 void add_cf_flags(rocksdb::WriteBatch *const batch, const uint &cf_id,
1324 const uint &cf_flags) const;
1325 bool get_cf_flags(const uint &cf_id, uint *const cf_flags) const;
1326
1327 /* Functions for fast CREATE/DROP TABLE/INDEX */
1328 void
1329 get_ongoing_index_operation(std::unordered_set<GL_INDEX_ID> *gl_index_ids,
1330 Rdb_key_def::DATA_DICT_TYPE dd_type) const;
1331 bool is_index_operation_ongoing(const GL_INDEX_ID &gl_index_id,
1332 Rdb_key_def::DATA_DICT_TYPE dd_type) const;
1333 void start_ongoing_index_operation(rocksdb::WriteBatch *batch,
1334 const GL_INDEX_ID &gl_index_id,
1335 Rdb_key_def::DATA_DICT_TYPE dd_type) const;
1336 void end_ongoing_index_operation(rocksdb::WriteBatch *const batch,
1337 const GL_INDEX_ID &gl_index_id,
1338 Rdb_key_def::DATA_DICT_TYPE dd_type) const;
1339 bool is_drop_index_empty() const;
1340 void add_drop_table(std::shared_ptr<Rdb_key_def> *const key_descr,
1341 const uint32 &n_keys,
1342 rocksdb::WriteBatch *const batch) const;
1343 void add_drop_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
1344 rocksdb::WriteBatch *const batch) const;
1345 void add_create_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
1346 rocksdb::WriteBatch *const batch) const;
1347 void
1348 finish_indexes_operation(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
1349 Rdb_key_def::DATA_DICT_TYPE dd_type) const;
1350 void rollback_ongoing_index_creation() const;
1351
1352 inline void get_ongoing_drop_indexes(
1353 std::unordered_set<GL_INDEX_ID> *gl_index_ids) const {
1354 get_ongoing_index_operation(gl_index_ids,
1355 Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1356 }
1357 inline void get_ongoing_create_indexes(
1358 std::unordered_set<GL_INDEX_ID> *gl_index_ids) const {
1359 get_ongoing_index_operation(gl_index_ids,
1360 Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1361 }
1362 inline void start_drop_index(rocksdb::WriteBatch *wb,
1363 const GL_INDEX_ID &gl_index_id) const {
1364 start_ongoing_index_operation(wb, gl_index_id,
1365 Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1366 }
1367 inline void start_create_index(rocksdb::WriteBatch *wb,
1368 const GL_INDEX_ID &gl_index_id) const {
1369 start_ongoing_index_operation(wb, gl_index_id,
1370 Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1371 }
1372 inline void finish_drop_indexes(
1373 const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
1374 finish_indexes_operation(gl_index_ids, Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1375 }
1376 inline void finish_create_indexes(
1377 const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
1378 finish_indexes_operation(gl_index_ids,
1379 Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1380 }
1381 inline bool is_drop_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
1382 return is_index_operation_ongoing(gl_index_id,
1383 Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1384 }
1385 inline bool is_create_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
1386 return is_index_operation_ongoing(gl_index_id,
1387 Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1388 }
1389
1390 bool get_max_index_id(uint32_t *const index_id) const;
1391 bool update_max_index_id(rocksdb::WriteBatch *const batch,
1392 const uint32_t &index_id) const;
1393 void add_stats(rocksdb::WriteBatch *const batch,
1394 const std::vector<Rdb_index_stats> &stats) const;
1395 Rdb_index_stats get_stats(GL_INDEX_ID gl_index_id) const;
1396
1397 rocksdb::Status put_auto_incr_val(rocksdb::WriteBatchBase *batch,
1398 const GL_INDEX_ID &gl_index_id,
1399 ulonglong val,
1400 bool overwrite = false) const;
1401 bool get_auto_incr_val(const GL_INDEX_ID &gl_index_id,
1402 ulonglong *new_val) const;
1403};
1404
1405struct Rdb_index_info {
1406 GL_INDEX_ID m_gl_index_id;
1407 uint16_t m_index_dict_version = 0;
1408 uchar m_index_type = 0;
1409 uint16_t m_kv_version = 0;
1410 uint32 m_index_flags = 0;
1411 uint64 m_ttl_duration = 0;
1412};
1413
1414/*
1415 @brief
1416 Merge Operator for the auto_increment value in the system_cf
1417
1418 @detail
1419 This class implements the rocksdb Merge Operator for auto_increment values
1420 that are stored to the data dictionary every transaction.
1421
1422 The actual Merge function is triggered on compaction, memtable flushes, or
1423 when get() is called on the same key.
1424
1425 */
1426class Rdb_system_merge_op : public rocksdb::AssociativeMergeOperator {
1427 public:
1428 /*
1429 Updates the new value associated with a key to be the maximum of the
1430 passed in value and the existing value.
1431
1432 @param[IN] key
1433 @param[IN] existing_value existing value for a key; nullptr if nonexistent
1434 key
1435 @param[IN] value
1436 @param[OUT] new_value new value after Merge
1437 @param[IN] logger
1438 */
1439 bool Merge(const rocksdb::Slice &key, const rocksdb::Slice *existing_value,
1440 const rocksdb::Slice &value, std::string *new_value,
1441 rocksdb::Logger *logger) const override {
1442 DBUG_ASSERT(new_value != nullptr);
1443
1444 if (key.size() != Rdb_key_def::INDEX_NUMBER_SIZE * 3 ||
1445 GetKeyType(key) != Rdb_key_def::AUTO_INC ||
1446 value.size() !=
1447 RDB_SIZEOF_AUTO_INCREMENT_VERSION + ROCKSDB_SIZEOF_AUTOINC_VALUE ||
1448 GetVersion(value) > Rdb_key_def::AUTO_INCREMENT_VERSION) {
1449 abort();
1450 }
1451
1452 uint64_t merged_value = Deserialize(value);
1453
1454 if (existing_value != nullptr) {
1455 if (existing_value->size() != RDB_SIZEOF_AUTO_INCREMENT_VERSION +
1456 ROCKSDB_SIZEOF_AUTOINC_VALUE ||
1457 GetVersion(*existing_value) > Rdb_key_def::AUTO_INCREMENT_VERSION) {
1458 abort();
1459 }
1460
1461 merged_value = std::max(merged_value, Deserialize(*existing_value));
1462 }
1463 Serialize(merged_value, new_value);
1464 return true;
1465 }
1466
1467 virtual const char *Name() const override { return "Rdb_system_merge_op"; }
1468
1469 private:
1470 /*
1471 Serializes the integer data to the new_value buffer or the target buffer
1472 the merge operator will update to
1473 */
1474 void Serialize(const uint64_t data, std::string *new_value) const {
1475 uchar value_buf[RDB_SIZEOF_AUTO_INCREMENT_VERSION +
1476 ROCKSDB_SIZEOF_AUTOINC_VALUE] = {0};
1477 uchar *ptr = value_buf;
1478 /* fill in the auto increment version */
1479 rdb_netbuf_store_uint16(ptr, Rdb_key_def::AUTO_INCREMENT_VERSION);
1480 ptr += RDB_SIZEOF_AUTO_INCREMENT_VERSION;
1481 /* fill in the auto increment value */
1482 rdb_netbuf_store_uint64(ptr, data);
1483 ptr += ROCKSDB_SIZEOF_AUTOINC_VALUE;
1484 new_value->assign(reinterpret_cast<char *>(value_buf), ptr - value_buf);
1485 }
1486
1487 /*
1488 Gets the value of auto_increment type in the data dictionary from the
1489 value slice
1490
1491 @Note Only to be used on data dictionary keys for the auto_increment type
1492 */
1493 uint64_t Deserialize(const rocksdb::Slice &s) const {
1494 return rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(s.data()) +
1495 RDB_SIZEOF_AUTO_INCREMENT_VERSION);
1496 }
1497
1498 /*
1499 Gets the type of the key of the key in the data dictionary.
1500
1501 @Note Only to be used on data dictionary keys for the auto_increment type
1502 */
1503 uint16_t GetKeyType(const rocksdb::Slice &s) const {
1504 return rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(s.data()));
1505 }
1506
1507 /*
1508 Gets the version of the auto_increment value in the data dictionary.
1509
1510 @Note Only to be used on data dictionary value for the auto_increment type
1511 */
1512 uint16_t GetVersion(const rocksdb::Slice &s) const {
1513 return rdb_netbuf_to_uint16(reinterpret_cast<const uchar *>(s.data()));
1514 }
1515};
1516
1517bool rdb_is_collation_supported(const my_core::CHARSET_INFO *const cs);
1518
1519} // namespace myrocks
1520