1 | /* |
2 | Copyright (c) 2012,2013 Monty Program Ab |
3 | |
4 | This program is free software; you can redistribute it and/or modify |
5 | it under the terms of the GNU General Public License as published by |
6 | the Free Software Foundation; version 2 of the License. |
7 | |
8 | This program is distributed in the hope that it will be useful, |
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | GNU General Public License for more details. |
12 | |
13 | You should have received a copy of the GNU General Public License |
14 | along with this program; if not, write to the Free Software |
15 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ |
16 | #pragma once |
17 | |
18 | /* C++ standard header files */ |
19 | #include <cstdlib> |
20 | #include <algorithm> |
21 | #include <atomic> |
22 | #include <map> |
23 | #include <mutex> |
24 | #include <string> |
25 | #include <unordered_set> |
26 | #include <utility> |
27 | #include <vector> |
28 | #include <array> |
29 | |
30 | /* C standard header files */ |
31 | #ifndef _WIN32 |
32 | #include <arpa/inet.h> |
33 | #endif |
34 | |
35 | /* MyRocks header files */ |
36 | #include "./ha_rocksdb.h" |
37 | #include "./properties_collector.h" |
38 | #include "./rdb_buff.h" |
39 | #include "./rdb_utils.h" |
40 | |
41 | namespace myrocks { |
42 | |
43 | class Rdb_dict_manager; |
44 | class Rdb_key_def; |
45 | class Rdb_field_packing; |
46 | class Rdb_cf_manager; |
47 | class Rdb_ddl_manager; |
48 | |
/*
  Length of the buffers that hold a GTID.
  NOTE(review): the unit is presumably bytes; the users of this constant are
  outside this header, so confirm there.
*/
const uint32_t GTID_BUF_LEN = 60;
50 | |
51 | /* |
52 | @brief |
53 | Field packing context. |
54 | The idea is to ensure that a call to rdb_index_field_pack_t function |
55 | is followed by a call to rdb_make_unpack_info_t. |
56 | |
57 | @detail |
58 | For some datatypes, unpack_info is produced as a side effect of |
59 | rdb_index_field_pack_t function call. |
60 | For other datatypes, packing is just calling make_sort_key(), while |
61 | rdb_make_unpack_info_t is a custom function. |
62 | In order to accommodate both cases, we require both calls to be made and |
63 | unpack_info is passed as context data between the two. |
64 | */ |
65 | class Rdb_pack_field_context { |
66 | public: |
67 | Rdb_pack_field_context(const Rdb_pack_field_context &) = delete; |
68 | Rdb_pack_field_context &operator=(const Rdb_pack_field_context &) = delete; |
69 | |
70 | explicit Rdb_pack_field_context(Rdb_string_writer *const writer_arg) |
71 | : writer(writer_arg) {} |
72 | |
73 | // NULL means we're not producing unpack_info. |
74 | Rdb_string_writer *writer; |
75 | }; |
76 | |
77 | struct Rdb_collation_codec; |
78 | struct Rdb_index_info; |
79 | |
80 | /* |
81 | C-style "virtual table" allowing different handling of packing logic based |
82 | on the field type. See Rdb_field_packing::setup() implementation. |
83 | */ |
84 | using rdb_make_unpack_info_t = |
85 | void (Rdb_key_def::*)(const Rdb_collation_codec *codec, const Field *field, |
86 | Rdb_pack_field_context *pack_ctx) const; |
87 | using rdb_index_field_unpack_t = int (Rdb_key_def::*)( |
88 | Rdb_field_packing *fpi, Field *field, uchar *field_ptr, |
89 | Rdb_string_reader *reader, Rdb_string_reader *unpack_reader) const; |
90 | using rdb_index_field_skip_t = |
91 | int (Rdb_key_def::*)(const Rdb_field_packing *fpi, const Field *field, |
92 | Rdb_string_reader *reader) const; |
93 | using rdb_index_field_pack_t = |
94 | void (Rdb_key_def::*)(Rdb_field_packing *fpi, Field *field, uchar *buf, |
95 | uchar **dst, Rdb_pack_field_context *pack_ctx) const; |
96 | |
/* Sentinel key length: the largest value representable in a uint. */
const uint RDB_INVALID_KEY_LEN = uint(-1);
98 | |
/* How much one checksum occupies when stored in the record */
const size_t RDB_CHECKSUM_SIZE = sizeof(uint32_t);

/*
  How much the checksum data occupies in record, in total.
  It is storing two checksums plus 1 tag-byte.
*/
const size_t RDB_CHECKSUM_CHUNK_SIZE = 2 * RDB_CHECKSUM_SIZE + 1;

/*
  Checksum data starts from CHECKSUM_DATA_TAG which is followed by two CRC32
  checksums.
*/
const char RDB_CHECKSUM_DATA_TAG = 0x01;

/*
  Unpack data is variable length. The header is 1 tag-byte plus a two byte
  length field. The length field includes the header as well.
*/
const char RDB_UNPACK_DATA_TAG = 0x02;
const size_t RDB_UNPACK_DATA_LEN_SIZE = sizeof(uint16_t);
/*
  Total size of the unpack data header (tag byte + length field).
  NOTE(review): the identifier was missing in this copy of the file and has
  been restored; confirm the name against its users.
*/
const size_t RDB_UNPACK_HEADER_SIZE =
    sizeof(RDB_UNPACK_DATA_TAG) + RDB_UNPACK_DATA_LEN_SIZE;

/*
  This header format is 1 tag-byte plus a two byte length field plus a two byte
  covered bitmap. The length field includes the header size.
*/
const char RDB_UNPACK_COVERED_DATA_TAG = 0x03;
const size_t RDB_UNPACK_COVERED_DATA_LEN_SIZE = sizeof(uint16_t);
const size_t RDB_COVERED_BITMAP_SIZE = sizeof(uint16_t);
/*
  Total size of the covered unpack data header (tag byte + length field +
  covered bitmap).
  NOTE(review): the identifier was missing in this copy of the file and has
  been restored; confirm the name against its users.
*/
const size_t RDB_UNPACK_COVERED_HEADER_SIZE =
    sizeof(RDB_UNPACK_COVERED_DATA_TAG) + RDB_UNPACK_COVERED_DATA_LEN_SIZE +
    RDB_COVERED_BITMAP_SIZE;
133 | |
134 | /* |
135 | Data dictionary index info field sizes. |
136 | */ |
137 | const size_t RDB_SIZEOF_INDEX_INFO_VERSION = sizeof(uint16); |
138 | const size_t RDB_SIZEOF_INDEX_TYPE = sizeof(uchar); |
139 | const size_t RDB_SIZEOF_KV_VERSION = sizeof(uint16); |
140 | const size_t RDB_SIZEOF_INDEX_FLAGS = sizeof(uint32); |
141 | const size_t RDB_SIZEOF_AUTO_INCREMENT_VERSION = sizeof(uint16); |
142 | |
// Possible return values for rdb_index_field_unpack_t functions.
enum {
  UNPACK_SUCCESS = 0,
  UNPACK_FAILURE = 1,
};
148 | |
149 | /* |
150 | An object of this class represents information about an index in an SQL |
151 | table. It provides services to encode and decode index tuples. |
152 | |
153 | Note: a table (as in, on-disk table) has a single Rdb_key_def object which |
154 | is shared across multiple TABLE* objects and may be used simultaneously from |
155 | different threads. |
156 | |
157 | There are several data encodings: |
158 | |
159 | === SQL LAYER === |
160 | SQL layer uses two encodings: |
161 | |
162 | - "Table->record format". This is the format that is used for the data in |
163 | the record buffers, table->record[i] |
164 | |
165 | - KeyTupleFormat (see opt_range.cc) - this is used in parameters to index |
166 | lookup functions, like handler::index_read_map(). |
167 | |
168 | === Inside RocksDB === |
169 | Primary Key is stored as a mapping: |
170 | |
171 | index_tuple -> StoredRecord |
172 | |
173 | StoredRecord is in Table->record format, except for blobs, which are stored |
174 | in-place. See ha_rocksdb::convert_record_to_storage_format for details. |
175 | |
176 | Secondary indexes are stored as one of two variants: |
177 | |
178 | index_tuple -> unpack_info |
179 | index_tuple -> empty_string |
180 | |
181 | index_tuple here is the form of key that can be compared with memcmp(), aka |
182 | "mem-comparable form". |
183 | |
184 | unpack_info is extra data that allows to restore the original value from its |
185 | mem-comparable form. It is present only if the index supports index-only |
186 | reads. |
187 | */ |
188 | |
189 | class Rdb_key_def { |
190 | public: |
191 | /* Convert a key from KeyTupleFormat to mem-comparable form */ |
192 | uint pack_index_tuple(TABLE *const tbl, uchar *const pack_buffer, |
193 | uchar *const packed_tuple, const uchar *const key_tuple, |
194 | const key_part_map &keypart_map) const; |
195 | |
196 | uchar *pack_field(Field *const field, Rdb_field_packing *pack_info, |
197 | uchar *tuple, uchar *const packed_tuple, |
198 | uchar *const pack_buffer, |
199 | Rdb_string_writer *const unpack_info, |
200 | uint *const n_null_fields) const; |
201 | /* Convert a key from Table->record format to mem-comparable form */ |
202 | uint pack_record(const TABLE *const tbl, uchar *const pack_buffer, |
203 | const uchar *const record, uchar *const packed_tuple, |
204 | Rdb_string_writer *const unpack_info, |
205 | const bool &should_store_row_debug_checksums, |
206 | const longlong &hidden_pk_id = 0, uint n_key_parts = 0, |
207 | uint *const n_null_fields = nullptr, |
208 | uint *const ttl_pk_offset = nullptr, |
209 | const char *const ttl_bytes = nullptr) const; |
210 | /* Pack the hidden primary key into mem-comparable form. */ |
211 | uint pack_hidden_pk(const longlong &hidden_pk_id, |
212 | uchar *const packed_tuple) const; |
213 | int unpack_field(Rdb_field_packing *const fpi, |
214 | Field *const field, |
215 | Rdb_string_reader* reader, |
216 | const uchar *const default_value, |
217 | Rdb_string_reader* unp_reader) const; |
218 | int unpack_record(TABLE *const table, uchar *const buf, |
219 | const rocksdb::Slice *const packed_key, |
220 | const rocksdb::Slice *const unpack_info, |
221 | const bool &verify_row_debug_checksums) const; |
222 | |
223 | static bool unpack_info_has_checksum(const rocksdb::Slice &unpack_info); |
224 | int compare_keys(const rocksdb::Slice *key1, const rocksdb::Slice *key2, |
225 | std::size_t *const column_index) const; |
226 | |
227 | size_t key_length(const TABLE *const table, const rocksdb::Slice &key) const; |
228 | |
229 | /* Get the key that is the "infimum" for this index */ |
230 | inline void get_infimum_key(uchar *const key, uint *const size) const { |
231 | rdb_netbuf_store_index(key, m_index_number); |
232 | *size = INDEX_NUMBER_SIZE; |
233 | } |
234 | |
235 | /* Get the key that is a "supremum" for this index */ |
236 | inline void get_supremum_key(uchar *const key, uint *const size) const { |
237 | rdb_netbuf_store_index(key, m_index_number + 1); |
238 | *size = INDEX_NUMBER_SIZE; |
239 | } |
240 | |
241 | /* |
242 | Get the first key that you need to position at to start iterating. |
243 | |
244 | Stores into *key a "supremum" or "infimum" key value for the index. |
245 | |
246 | @return Number of bytes in the key that are usable for bloom filter use. |
247 | */ |
248 | inline int get_first_key(uchar *const key, uint *const size) const { |
249 | if (m_is_reverse_cf) |
250 | get_supremum_key(key, size); |
251 | else |
252 | get_infimum_key(key, size); |
253 | |
254 | /* Find out how many bytes of infimum are the same as m_index_number */ |
255 | uchar unmodified_key[INDEX_NUMBER_SIZE]; |
256 | rdb_netbuf_store_index(unmodified_key, m_index_number); |
257 | int i; |
258 | for (i = 0; i < INDEX_NUMBER_SIZE; i++) { |
259 | if (key[i] != unmodified_key[i]) |
260 | break; |
261 | } |
262 | return i; |
263 | } |
264 | |
265 | /* Make a key that is right after the given key. */ |
266 | static int successor(uchar *const packed_tuple, const uint &len); |
267 | |
268 | /* Make a key that is right before the given key. */ |
269 | static int predecessor(uchar *const packed_tuple, const uint &len); |
270 | |
271 | /* |
272 | This can be used to compare prefixes. |
273 | if X is a prefix of Y, then we consider that X = Y. |
274 | */ |
275 | // b describes the lookup key, which can be a prefix of a. |
276 | // b might be outside of the index_number range, if successor() is called. |
277 | int cmp_full_keys(const rocksdb::Slice &a, const rocksdb::Slice &b) const { |
278 | DBUG_ASSERT(covers_key(a)); |
279 | |
280 | return memcmp(a.data(), b.data(), std::min(a.size(), b.size())); |
281 | } |
282 | |
283 | /* Check if given mem-comparable key belongs to this index */ |
284 | bool covers_key(const rocksdb::Slice &slice) const { |
285 | if (slice.size() < INDEX_NUMBER_SIZE) |
286 | return false; |
287 | |
288 | if (memcmp(slice.data(), m_index_number_storage_form, INDEX_NUMBER_SIZE)) |
289 | return false; |
290 | |
291 | return true; |
292 | } |
293 | |
294 | void get_lookup_bitmap(const TABLE *table, MY_BITMAP *map) const; |
295 | |
296 | bool covers_lookup(TABLE *const table, |
297 | const rocksdb::Slice *const unpack_info, |
298 | const MY_BITMAP *const map) const; |
299 | |
300 | inline bool use_covered_bitmap_format() const { |
301 | return m_index_type == INDEX_TYPE_SECONDARY && |
302 | m_kv_format_version >= SECONDARY_FORMAT_VERSION_UPDATE3; |
303 | } |
304 | |
305 | /* |
306 | Return true if the passed mem-comparable key |
307 | - is from this index, and |
308 | - it matches the passed key prefix (the prefix is also in mem-comparable |
309 | form) |
310 | */ |
311 | bool value_matches_prefix(const rocksdb::Slice &value, |
312 | const rocksdb::Slice &prefix) const { |
313 | return covers_key(value) && !cmp_full_keys(value, prefix); |
314 | } |
315 | |
316 | uint32 get_keyno() const { return m_keyno; } |
317 | |
318 | uint32 get_index_number() const { return m_index_number; } |
319 | |
320 | GL_INDEX_ID get_gl_index_id() const { |
321 | const GL_INDEX_ID gl_index_id = {m_cf_handle->GetID(), m_index_number}; |
322 | return gl_index_id; |
323 | } |
324 | |
325 | int read_memcmp_key_part(const TABLE *table_arg, Rdb_string_reader *reader, |
326 | const uint part_num) const; |
327 | |
328 | /* Must only be called for secondary keys: */ |
329 | uint get_primary_key_tuple(const TABLE *const tbl, |
330 | const Rdb_key_def &pk_descr, |
331 | const rocksdb::Slice *const key, |
332 | uchar *const pk_buffer) const; |
333 | |
334 | uint get_memcmp_sk_parts(const TABLE *table, const rocksdb::Slice &key, |
335 | uchar *sk_buffer, uint *n_null_fields) const; |
336 | |
337 | /* Return max length of mem-comparable form */ |
338 | uint max_storage_fmt_length() const { return m_maxlength; } |
339 | |
340 | uint get_key_parts() const { return m_key_parts; } |
341 | |
342 | uint get_ttl_field_offset() const { return m_ttl_field_offset; } |
343 | |
344 | /* |
345 | Get a field object for key part #part_no |
346 | |
347 | @detail |
348 | SQL layer thinks unique secondary indexes and indexes in partitioned |
349 | tables are not "Extended" with Primary Key columns. |
350 | |
351 | Internally, we always extend all indexes with PK columns. This function |
352 | uses our definition of how the index is Extended. |
353 | */ |
354 | inline Field *get_table_field_for_part_no(TABLE *table, uint part_no) const; |
355 | |
356 | const std::string &get_name() const { return m_name; } |
357 | |
358 | const rocksdb::SliceTransform *() const { |
359 | return m_prefix_extractor.get(); |
360 | } |
361 | |
362 | static size_t (char tag); |
363 | |
364 | Rdb_key_def &operator=(const Rdb_key_def &) = delete; |
365 | Rdb_key_def(const Rdb_key_def &k); |
366 | Rdb_key_def(uint indexnr_arg, uint keyno_arg, |
367 | rocksdb::ColumnFamilyHandle *cf_handle_arg, |
368 | uint16_t index_dict_version_arg, uchar index_type_arg, |
369 | uint16_t kv_format_version_arg, bool is_reverse_cf_arg, |
370 | bool is_per_partition_cf, const char *name, |
371 | Rdb_index_stats stats = Rdb_index_stats(), uint32 index_flags = 0, |
372 | uint32 ttl_rec_offset = UINT_MAX, uint64 ttl_duration = 0); |
373 | ~Rdb_key_def(); |
374 | |
375 | enum { |
376 | INDEX_NUMBER_SIZE = 4, |
377 | VERSION_SIZE = 2, |
378 | CF_NUMBER_SIZE = 4, |
379 | CF_FLAG_SIZE = 4, |
380 | PACKED_SIZE = 4, // one int |
381 | }; |
382 | |
383 | // bit flags for combining bools when writing to disk |
384 | enum { |
385 | REVERSE_CF_FLAG = 1, |
386 | AUTO_CF_FLAG = 2, // Deprecated |
387 | PER_PARTITION_CF_FLAG = 4, |
388 | }; |
389 | |
390 | // bit flags which denote myrocks specific fields stored in the record |
391 | // currently only used for TTL. |
392 | enum INDEX_FLAG { |
393 | TTL_FLAG = 1 << 0, |
394 | |
395 | // MAX_FLAG marks where the actual record starts |
396 | // This flag always needs to be set to the last index flag enum. |
397 | MAX_FLAG = TTL_FLAG << 1, |
398 | }; |
399 | |
400 | // Set of flags to ignore when comparing two CF-s and determining if |
401 | // they're same. |
402 | static const uint CF_FLAGS_TO_IGNORE = PER_PARTITION_CF_FLAG; |
403 | |
404 | // Data dictionary types |
405 | enum DATA_DICT_TYPE { |
406 | DDL_ENTRY_INDEX_START_NUMBER = 1, |
407 | INDEX_INFO = 2, |
408 | CF_DEFINITION = 3, |
409 | BINLOG_INFO_INDEX_NUMBER = 4, |
410 | DDL_DROP_INDEX_ONGOING = 5, |
411 | INDEX_STATISTICS = 6, |
412 | MAX_INDEX_ID = 7, |
413 | DDL_CREATE_INDEX_ONGOING = 8, |
414 | AUTO_INC = 9, |
415 | END_DICT_INDEX_ID = 255 |
416 | }; |
417 | |
418 | // Data dictionary schema version. Introduce newer versions |
419 | // if changing schema layout |
420 | enum { |
421 | DDL_ENTRY_INDEX_VERSION = 1, |
422 | CF_DEFINITION_VERSION = 1, |
423 | BINLOG_INFO_INDEX_NUMBER_VERSION = 1, |
424 | DDL_DROP_INDEX_ONGOING_VERSION = 1, |
425 | MAX_INDEX_ID_VERSION = 1, |
426 | DDL_CREATE_INDEX_ONGOING_VERSION = 1, |
427 | AUTO_INCREMENT_VERSION = 1, |
428 | // Version for index stats is stored in IndexStats struct |
429 | }; |
430 | |
431 | // Index info version. Introduce newer versions when changing the |
432 | // INDEX_INFO layout. Update INDEX_INFO_VERSION_LATEST to point to the |
433 | // latest version number. |
434 | enum { |
435 | INDEX_INFO_VERSION_INITIAL = 1, // Obsolete |
436 | INDEX_INFO_VERSION_KV_FORMAT, |
437 | INDEX_INFO_VERSION_GLOBAL_ID, |
438 | // There is no change to data format in this version, but this version |
439 | // verifies KV format version, whereas previous versions do not. A version |
440 | // bump is needed to prevent older binaries from skipping the KV version |
441 | // check inadvertently. |
442 | INDEX_INFO_VERSION_VERIFY_KV_FORMAT, |
443 | // This changes the data format to include a 8 byte TTL duration for tables |
444 | INDEX_INFO_VERSION_TTL, |
445 | // This changes the data format to include a bitmap before the TTL duration |
446 | // which will indicate in the future whether TTL or other special fields |
447 | // are turned on or off. |
448 | INDEX_INFO_VERSION_FIELD_FLAGS, |
449 | // This normally point to the latest (currently it does). |
450 | INDEX_INFO_VERSION_LATEST = INDEX_INFO_VERSION_FIELD_FLAGS, |
451 | }; |
452 | |
453 | // MyRocks index types |
454 | enum { |
455 | INDEX_TYPE_PRIMARY = 1, |
456 | INDEX_TYPE_SECONDARY = 2, |
457 | INDEX_TYPE_HIDDEN_PRIMARY = 3, |
458 | }; |
459 | |
460 | // Key/Value format version for each index type |
461 | enum { |
462 | PRIMARY_FORMAT_VERSION_INITIAL = 10, |
463 | // This change includes: |
464 | // - For columns that can be unpacked with unpack_info, PK |
465 | // stores the unpack_info. |
466 | // - DECIMAL datatype is no longer stored in the row (because |
467 | // it can be decoded from its mem-comparable form) |
468 | // - VARCHAR-columns use endspace-padding. |
469 | PRIMARY_FORMAT_VERSION_UPDATE1 = 11, |
470 | // This change includes: |
471 | // - Binary encoded variable length fields have a new format that avoids |
472 | // an inefficient where data that was a multiple of 8 bytes in length |
473 | // had an extra 9 bytes of encoded data. |
474 | PRIMARY_FORMAT_VERSION_UPDATE2 = 12, |
475 | // This change includes support for TTL |
476 | // - This means that when TTL is specified for the table an 8-byte TTL |
477 | // field is prepended in front of each value. |
478 | PRIMARY_FORMAT_VERSION_TTL = 13, |
479 | PRIMARY_FORMAT_VERSION_LATEST = PRIMARY_FORMAT_VERSION_TTL, |
480 | |
481 | SECONDARY_FORMAT_VERSION_INITIAL = 10, |
482 | // This change the SK format to include unpack_info. |
483 | SECONDARY_FORMAT_VERSION_UPDATE1 = 11, |
484 | // This change includes: |
485 | // - Binary encoded variable length fields have a new format that avoids |
486 | // an inefficient where data that was a multiple of 8 bytes in length |
487 | // had an extra 9 bytes of encoded data. |
488 | SECONDARY_FORMAT_VERSION_UPDATE2 = 12, |
489 | // This change includes support for TTL |
490 | // - This means that when TTL is specified for the table an 8-byte TTL |
491 | // field is prepended in front of each value. |
492 | SECONDARY_FORMAT_VERSION_TTL = 13, |
493 | SECONDARY_FORMAT_VERSION_LATEST = SECONDARY_FORMAT_VERSION_TTL, |
494 | // This change includes support for covering SK lookups for varchars. A |
495 | // 2-byte bitmap is added after the tag-byte to unpack_info only for |
496 | // records which have covered varchar columns. Currently waiting before |
497 | // enabling in prod. |
498 | SECONDARY_FORMAT_VERSION_UPDATE3 = 65535, |
499 | }; |
500 | |
501 | void setup(const TABLE *const table, const Rdb_tbl_def *const tbl_def); |
502 | |
503 | static uint (const TABLE *const table_arg, |
504 | const Rdb_tbl_def *const tbl_def_arg, |
505 | uint64 *ttl_duration); |
506 | static uint (const TABLE *const table_arg, |
507 | const Rdb_tbl_def *const tbl_def_arg, |
508 | std::string *ttl_column, uint *ttl_field_offset, |
509 | bool skip_checks = false); |
510 | inline bool has_ttl() const { return m_ttl_duration > 0; } |
511 | |
512 | static bool has_index_flag(uint32 index_flags, enum INDEX_FLAG flag); |
513 | static uint32 calculate_index_flag_offset(uint32 index_flags, |
514 | enum INDEX_FLAG flag, |
515 | uint *const field_length = nullptr); |
516 | void write_index_flag_field(Rdb_string_writer *const buf, |
517 | const uchar *const val, |
518 | enum INDEX_FLAG flag) const; |
519 | |
520 | static const std::string |
521 | gen_qualifier_for_table(const char *const qualifier, |
522 | const std::string &partition_name = "" ); |
523 | static const std::string |
524 | gen_cf_name_qualifier_for_partition(const std::string &s); |
525 | static const std::string |
526 | gen_ttl_duration_qualifier_for_partition(const std::string &s); |
527 | static const std::string |
528 | gen_ttl_col_qualifier_for_partition(const std::string &s); |
529 | |
530 | static const std::string ( |
531 | const std::string &, const TABLE *const table_arg, |
532 | const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found, |
533 | const char *const qualifier); |
534 | |
535 | rocksdb::ColumnFamilyHandle *get_cf() const { return m_cf_handle; } |
536 | |
537 | /* Check if keypart #kp can be unpacked from index tuple */ |
538 | inline bool can_unpack(const uint &kp) const; |
539 | /* Check if keypart #kp needs unpack info */ |
540 | inline bool has_unpack_info(const uint &kp) const; |
541 | |
542 | /* Check if given table has a primary key */ |
543 | static bool table_has_hidden_pk(const TABLE *const table); |
544 | |
545 | void report_checksum_mismatch(const bool &is_key, const char *const data, |
546 | const size_t data_size) const; |
547 | |
548 | /* Check if index is at least pk_min if it is a PK, |
549 | or at least sk_min if SK.*/ |
550 | bool index_format_min_check(const int &pk_min, const int &sk_min) const; |
551 | |
552 | void pack_with_make_sort_key( |
553 | Rdb_field_packing *const fpi, Field *const field, |
554 | uchar *buf MY_ATTRIBUTE((__unused__)), uchar **dst, |
555 | Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__))) const; |
556 | |
557 | void pack_with_varchar_encoding( |
558 | Rdb_field_packing *const fpi, Field *const field, uchar *buf, uchar **dst, |
559 | Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__))) const; |
560 | |
561 | void |
562 | pack_with_varchar_space_pad(Rdb_field_packing *const fpi, Field *const field, |
563 | uchar *buf, uchar **dst, |
564 | Rdb_pack_field_context *const pack_ctx) const; |
565 | |
566 | int unpack_integer(Rdb_field_packing *const fpi, Field *const field, |
567 | uchar *const to, Rdb_string_reader *const reader, |
568 | Rdb_string_reader *const unp_reader |
569 | MY_ATTRIBUTE((__unused__))) const; |
570 | |
571 | int unpack_double(Rdb_field_packing *const fpi MY_ATTRIBUTE((__unused__)), |
572 | Field *const field MY_ATTRIBUTE((__unused__)), |
573 | uchar *const field_ptr, Rdb_string_reader *const reader, |
574 | Rdb_string_reader *const unp_reader |
575 | MY_ATTRIBUTE((__unused__))) const; |
576 | |
577 | int unpack_float(Rdb_field_packing *const fpi, |
578 | Field *const field MY_ATTRIBUTE((__unused__)), |
579 | uchar *const field_ptr, Rdb_string_reader *const reader, |
580 | Rdb_string_reader *const unp_reader |
581 | MY_ATTRIBUTE((__unused__))) const; |
582 | |
583 | int unpack_binary_str(Rdb_field_packing *const fpi, Field *const field, |
584 | uchar *const to, Rdb_string_reader *const reader, |
585 | Rdb_string_reader *const unp_reader |
586 | MY_ATTRIBUTE((__unused__))) const; |
587 | |
588 | int unpack_binary_or_utf8_varchar( |
589 | Rdb_field_packing *const fpi, Field *const field, uchar *dst, |
590 | Rdb_string_reader *const reader, |
591 | Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__))) const; |
592 | |
593 | int unpack_binary_or_utf8_varchar_space_pad( |
594 | Rdb_field_packing *const fpi, Field *const field, uchar *dst, |
595 | Rdb_string_reader *const reader, |
596 | Rdb_string_reader *const unp_reader) const; |
597 | |
598 | int unpack_newdate(Rdb_field_packing *const fpi, |
599 | Field *const field MY_ATTRIBUTE((__unused__)), |
600 | uchar *const field_ptr, Rdb_string_reader *const reader, |
601 | Rdb_string_reader *const unp_reader |
602 | MY_ATTRIBUTE((__unused__))) const; |
603 | |
604 | int unpack_utf8_str(Rdb_field_packing *const fpi, Field *const field, |
605 | uchar *dst, Rdb_string_reader *const reader, |
606 | Rdb_string_reader *const unp_reader |
607 | MY_ATTRIBUTE((__unused__))) const; |
608 | |
609 | int unpack_unknown_varchar(Rdb_field_packing *const fpi, Field *const field, |
610 | uchar *dst, Rdb_string_reader *const reader, |
611 | Rdb_string_reader *const unp_reader) const; |
612 | |
613 | int unpack_simple_varchar_space_pad( |
614 | Rdb_field_packing *const fpi, Field *const field, uchar *dst, |
615 | Rdb_string_reader *const reader, |
616 | Rdb_string_reader *const unp_reader) const; |
617 | |
618 | int unpack_simple(Rdb_field_packing *const fpi, |
619 | Field *const field MY_ATTRIBUTE((__unused__)), |
620 | uchar *const dst, Rdb_string_reader *const reader, |
621 | Rdb_string_reader *const unp_reader) const; |
622 | |
623 | int unpack_unknown(Rdb_field_packing *const fpi, Field *const field, |
624 | uchar *const dst, Rdb_string_reader *const reader, |
625 | Rdb_string_reader *const unp_reader) const; |
626 | |
627 | int unpack_floating_point(uchar *const dst, Rdb_string_reader *const reader, |
628 | const size_t &size, const int &exp_digit, |
629 | const uchar *const zero_pattern, |
630 | const uchar *const zero_val, |
631 | void (*swap_func)(uchar *, const uchar *)) const; |
632 | |
633 | void make_unpack_simple_varchar(const Rdb_collation_codec *const codec, |
634 | const Field *const field, |
635 | Rdb_pack_field_context *const pack_ctx) const; |
636 | |
637 | void make_unpack_simple(const Rdb_collation_codec *const codec, |
638 | const Field *const field, |
639 | Rdb_pack_field_context *const pack_ctx) const; |
640 | |
641 | void make_unpack_unknown( |
642 | const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)), |
643 | const Field *const field, Rdb_pack_field_context *const pack_ctx) const; |
644 | |
645 | void make_unpack_unknown_varchar( |
646 | const Rdb_collation_codec *const codec MY_ATTRIBUTE((__unused__)), |
647 | const Field *const field, Rdb_pack_field_context *const pack_ctx) const; |
648 | |
649 | void dummy_make_unpack_info( |
650 | const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)), |
651 | const Field *field MY_ATTRIBUTE((__unused__)), |
652 | Rdb_pack_field_context *pack_ctx MY_ATTRIBUTE((__unused__))) const; |
653 | |
654 | int skip_max_length(const Rdb_field_packing *const fpi, |
655 | const Field *const field MY_ATTRIBUTE((__unused__)), |
656 | Rdb_string_reader *const reader) const; |
657 | |
658 | int skip_variable_length( |
659 | const Rdb_field_packing *const fpi MY_ATTRIBUTE((__unused__)), |
660 | const Field *const field, Rdb_string_reader *const reader) const; |
661 | |
662 | int skip_variable_space_pad(const Rdb_field_packing *const fpi, |
663 | const Field *const field, |
664 | Rdb_string_reader *const reader) const; |
665 | |
666 | inline bool use_legacy_varbinary_format() const { |
667 | return !index_format_min_check(PRIMARY_FORMAT_VERSION_UPDATE2, |
668 | SECONDARY_FORMAT_VERSION_UPDATE2); |
669 | } |
670 | |
671 | static inline bool is_unpack_data_tag(char c) { |
672 | return c == RDB_UNPACK_DATA_TAG || c == RDB_UNPACK_COVERED_DATA_TAG; |
673 | } |
674 | |
675 | private: |
676 | #ifndef DBUG_OFF |
677 | inline bool is_storage_available(const int &offset, const int &needed) const { |
678 | const int storage_length = static_cast<int>(max_storage_fmt_length()); |
679 | return (storage_length - offset) >= needed; |
680 | } |
681 | #else |
682 | inline bool is_storage_available(const int &offset, const int &needed) const { |
683 | return 1; |
684 | } |
685 | #endif // DBUG_OFF |
686 | |
687 | /* Global number of this index (used as prefix in StorageFormat) */ |
688 | const uint32 m_index_number; |
689 | |
690 | uchar m_index_number_storage_form[INDEX_NUMBER_SIZE]; |
691 | |
692 | rocksdb::ColumnFamilyHandle *m_cf_handle; |
693 | |
694 | void pack_legacy_variable_format(const uchar *src, size_t src_len, |
695 | uchar **dst) const; |
696 | |
697 | void pack_variable_format(const uchar *src, size_t src_len, |
698 | uchar **dst) const; |
699 | |
700 | uint calc_unpack_legacy_variable_format(uchar flag, bool *done) const; |
701 | |
702 | uint calc_unpack_variable_format(uchar flag, bool *done) const; |
703 | |
704 | public: |
705 | uint16_t m_index_dict_version; |
706 | uchar m_index_type; |
707 | /* KV format version for the index id */ |
708 | uint16_t m_kv_format_version; |
709 | /* If true, the column family stores data in the reverse order */ |
710 | bool m_is_reverse_cf; |
711 | |
712 | /* If true, then column family is created per partition. */ |
713 | bool m_is_per_partition_cf; |
714 | |
715 | std::string m_name; |
716 | mutable Rdb_index_stats m_stats; |
717 | |
718 | /* |
719 | Bitmap containing information about whether TTL or other special fields |
720 | are enabled for the given index. |
721 | */ |
722 | uint32 m_index_flags_bitmap; |
723 | |
724 | /* |
725 | How much space in bytes the index flag fields occupy. |
726 | */ |
727 | uint32 m_total_index_flags_length; |
728 | |
729 | /* |
730 | Offset in the records where the 8-byte TTL is stored (UINT_MAX if no TTL) |
731 | */ |
732 | uint32 m_ttl_rec_offset; |
733 | |
734 | /* Default TTL duration */ |
735 | uint64 m_ttl_duration; |
736 | |
737 | /* TTL column (if defined by user, otherwise implicit TTL is used) */ |
738 | std::string m_ttl_column; |
739 | |
740 | private: |
741 | friend class Rdb_tbl_def; // for m_index_number above |
742 | |
743 | /* Number of key parts in the primary key*/ |
744 | uint m_pk_key_parts; |
745 | |
746 | /* |
747 | pk_part_no[X]=Y means that keypart #X of this key is key part #Y of the |
748 | primary key. Y==-1 means this column is not present in the primary key. |
749 | */ |
750 | uint *m_pk_part_no; |
751 | |
752 | /* Array of index-part descriptors. */ |
753 | Rdb_field_packing *m_pack_info; |
754 | |
755 | uint m_keyno; /* number of this index in the table */ |
756 | |
757 | /* |
758 | Number of key parts in the index (including "index extension"). This is how |
759 | many elements are in the m_pack_info array. |
760 | */ |
761 | uint m_key_parts; |
762 | |
763 | /* |
764 | If TTL column is part of the PK, offset of the column within pk. |
765 | Default is UINT_MAX to denote that TTL col is not part of PK. |
766 | */ |
767 | uint m_ttl_pk_key_part_offset; |
768 | |
769 | /* |
770 | Index of the TTL column in table->s->fields, if it exists. |
771 | Default is UINT_MAX to denote that it does not exist. |
772 | */ |
773 | uint m_ttl_field_offset; |
774 | |
775 | /* Prefix extractor for the column family of the key definiton */ |
776 | std::shared_ptr<const rocksdb::SliceTransform> ; |
777 | |
778 | /* Maximum length of the mem-comparable form. */ |
779 | uint m_maxlength; |
780 | |
781 | /* mutex to protect setup */ |
782 | mysql_mutex_t m_mutex; |
783 | }; |
784 | |
785 | // "Simple" collations (those specified in strings/ctype-simple.c) are simple |
786 | // because their strnxfrm function maps one byte to one byte. However, the |
787 | // mapping is not injective, so the inverse function will take in an extra |
788 | // index parameter containing information to disambiguate what the original |
789 | // character was. |
790 | // |
791 | // The m_enc* members are for encoding. Generally, we want encoding to be: |
792 | // src -> (dst, idx) |
793 | // |
794 | // Since strnxfrm already gives us dst, we just need m_enc_idx[src] to give us |
795 | // idx. |
796 | // |
797 | // For the inverse, we have: |
798 | // (dst, idx) -> src |
799 | // |
800 | // We have m_dec_idx[idx][dst] = src to get our original character back. |
801 | // |
struct Rdb_collation_codec {
  // Charset descriptor this codec belongs to.
  const my_core::CHARSET_INFO *m_cs;
  // The first element unpacks VARCHAR(n), the second one - CHAR(n).
  std::array<rdb_make_unpack_info_t, 2> m_make_unpack_info_func;
  std::array<rdb_index_field_unpack_t, 2> m_unpack_func;

  // Encoding side: m_enc_idx[src] yields the disambiguating idx for a source
  // byte (dst itself comes from strnxfrm).
  std::array<uchar, 256> m_enc_idx;
  // NOTE(review): presumably a per-source-byte size associated with the idx
  // value - confirm against the code that fills this table.
  std::array<uchar, 256> m_enc_size;

  // Decoding side: m_dec_idx[idx][dst] yields the original source byte.
  // NOTE(review): m_dec_size looks like the decoding counterpart of
  // m_enc_size - confirm against the code that fills this table.
  std::array<uchar, 256> m_dec_size;
  std::vector<std::array<uchar, 256>> m_dec_idx;
};
814 | |
// Globals that are only declared here; their definitions live in another
// translation unit.
// NOTE(review): judging by the names, the two mutexes guard the collation
// codec table below and the mem-comparable space data respectively - confirm
// against the definitions.
extern mysql_mutex_t rdb_collation_data_mutex;
extern mysql_mutex_t rdb_mem_cmp_space_mutex;
// Table of MY_ALL_CHARSETS_SIZE codec pointers.
// NOTE(review): presumably indexed by charset number - confirm.
extern std::array<const Rdb_collation_codec *, MY_ALL_CHARSETS_SIZE>
    rdb_collation_data;
819 | |
/*
  Describes how a single key part of an index is converted to and from its
  mem-comparable image. Rdb_key_def keeps an array of these descriptors
  (m_pack_info), one element per key part.
*/
class Rdb_field_packing {
 public:
  /* Descriptors cannot be copied. */
  Rdb_field_packing(const Rdb_field_packing &) = delete;
  Rdb_field_packing &operator=(const Rdb_field_packing &) = delete;
  /* Defaulted constructor; the data members carry no in-class initializers. */
  Rdb_field_packing() = default;

  /* Length of mem-comparable image of the field, in bytes */
  int m_max_image_len;

  /* Length of image in the unpack data */
  int m_unpack_data_len;
  /*
    NOTE(review): presumably the offset of that image within the unpack data -
    confirm against setup().
  */
  int m_unpack_data_offset;

  bool m_maybe_null; /* TRUE <=> NULL-byte is stored */

  /*
    Valid only for VARCHAR fields.
  */
  const CHARSET_INFO *m_varchar_charset;

  // (Valid when Variable Length Space Padded Encoding is used):
  uint m_segment_size;  // size of segment used

  // number of bytes used to store number of trimmed (or added)
  // spaces in the unpack_info
  bool m_unpack_info_uses_two_bytes;

  /*
    True implies that an index-only read is always possible for this field.
    False means an index-only read may be possible depending on the record and
    field type.
  */
  bool m_covered;

  /*
    NOTE(review): these three look like the mem-comparable image of the space
    character, its length, and the length of a space in the field's charset -
    confirm against the code that fills them in.
  */
  const std::vector<uchar> *space_xfrm;
  size_t space_xfrm_len;
  size_t space_mb_len;

  /* Collation codec used by this field (see Rdb_collation_codec). */
  const Rdb_collation_codec *m_charset_codec;

  /*
    @return TRUE: this field makes use of unpack_info.
    (Decided solely by whether m_make_unpack_info_func is set.)
  */
  bool uses_unpack_info() const { return (m_make_unpack_info_func != nullptr); }

  /* TRUE means unpack_info stores the original field value */
  bool m_unpack_info_stores_value;

  /*
    Function pointers used for packing the field and for producing its
    unpack_info. m_make_unpack_info_func may be nullptr, in which case
    uses_unpack_info() reports false.
  */
  rdb_index_field_pack_t m_pack_func;
  rdb_make_unpack_info_t m_make_unpack_info_func;

  /*
    This function takes
    - mem-comparable form
    - unpack_info data
    and restores the original value.
  */
  rdb_index_field_unpack_t m_unpack_func;

  /*
    This function skips over mem-comparable form.
  */
  rdb_index_field_skip_t m_skip_func;

 private:
  /*
    Location of the field in the table (key number and key part number).

    Note that this describes not the field, but rather a position of field in
    the index. Consider an example:

      col1 VARCHAR (100),
      INDEX idx1 (col1),
      INDEX idx2 (col1(10)),

    Here, idx2 has a special Field object that is set to describe a 10-char
    prefix of col1.

    We must also store the keynr. It is needed for implicit "extended keys".
    Every key in MyRocks needs to include PK columns. Generally, SQL layer
    includes PK columns as part of its "Extended Keys" feature, but sometimes
    it does not (known examples are unique secondary indexes and partitioned
    tables).
    In that case, MyRocks's index descriptor has invisible suffix of PK
    columns (and the point is that these columns are parts of PK, not parts
    of the current index).
  */
  uint m_keynr;
  uint m_key_part;

 public:
  /*
    The three functions below are defined out of line; see the implementation
    file for their exact contracts.
  */
  bool setup(const Rdb_key_def *const key_descr, const Field *const field,
             const uint &keynr_arg, const uint &key_part_arg,
             const uint16 &key_length);
  Field *get_field_in_table(const TABLE *const tbl) const;
  void fill_hidden_pk_val(uchar **dst, const longlong &hidden_pk_id) const;
};
917 | |
918 | /* |
919 | Descriptor telling how to decode/encode a field to on-disk record storage |
920 | format. Not all information is in the structure yet, but eventually we |
921 | want to have as much as possible there to avoid virtual calls. |
922 | |
923 | For encoding/decoding of index tuples, see Rdb_key_def. |
924 | */ |
925 | class Rdb_field_encoder { |
926 | public: |
927 | Rdb_field_encoder(const Rdb_field_encoder &) = delete; |
928 | Rdb_field_encoder &operator=(const Rdb_field_encoder &) = delete; |
929 | /* |
930 | STORE_NONE is set when a column can be decoded solely from their |
931 | mem-comparable form. |
932 | STORE_SOME is set when a column can be decoded from their mem-comparable |
933 | form plus unpack_info. |
934 | STORE_ALL is set when a column cannot be decoded, so its original value |
935 | must be stored in the PK records. |
936 | */ |
937 | enum STORAGE_TYPE { |
938 | STORE_NONE, |
939 | STORE_SOME, |
940 | STORE_ALL, |
941 | }; |
942 | STORAGE_TYPE m_storage_type; |
943 | |
944 | uint m_null_offset; |
945 | uint16 m_field_index; |
946 | |
947 | uchar m_null_mask; // 0 means the field cannot be null |
948 | |
949 | my_core::enum_field_types m_field_type; |
950 | |
951 | uint m_pack_length_in_rec; |
952 | |
953 | bool maybe_null() const { return m_null_mask != 0; } |
954 | |
955 | bool uses_variable_len_encoding() const { |
956 | return (m_field_type == MYSQL_TYPE_BLOB || |
957 | m_field_type == MYSQL_TYPE_VARCHAR); |
958 | } |
959 | }; |
960 | |
961 | inline Field *Rdb_key_def::get_table_field_for_part_no(TABLE *table, |
962 | uint part_no) const { |
963 | DBUG_ASSERT(part_no < get_key_parts()); |
964 | return m_pack_info[part_no].get_field_in_table(table); |
965 | } |
966 | |
967 | inline bool Rdb_key_def::can_unpack(const uint &kp) const { |
968 | DBUG_ASSERT(kp < m_key_parts); |
969 | return (m_pack_info[kp].m_unpack_func != nullptr); |
970 | } |
971 | |
972 | inline bool Rdb_key_def::has_unpack_info(const uint &kp) const { |
973 | DBUG_ASSERT(kp < m_key_parts); |
974 | return m_pack_info[kp].uses_unpack_info(); |
975 | } |
976 | |
977 | /* |
978 | A table definition. This is an entry in the mapping |
979 | |
980 | dbname.tablename -> {index_nr, index_nr, ... } |
981 | |
982 | There is only one Rdb_tbl_def object for a given table. |
983 | That's why we keep auto_increment value here, too. |
984 | */ |
985 | |
986 | class Rdb_tbl_def { |
987 | private: |
988 | void check_if_is_mysql_system_table(); |
989 | |
990 | /* Stores 'dbname.tablename' */ |
991 | std::string m_dbname_tablename; |
992 | |
993 | /* Store the db name, table name, and partition name */ |
994 | std::string m_dbname; |
995 | std::string m_tablename; |
996 | std::string m_partition; |
997 | |
998 | void set_name(const std::string &name); |
999 | |
1000 | public: |
1001 | Rdb_tbl_def(const Rdb_tbl_def &) = delete; |
1002 | Rdb_tbl_def &operator=(const Rdb_tbl_def &) = delete; |
1003 | |
1004 | explicit Rdb_tbl_def(const std::string &name) |
1005 | : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0) { |
1006 | set_name(name); |
1007 | } |
1008 | |
1009 | Rdb_tbl_def(const char *const name, const size_t &len) |
1010 | : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0) { |
1011 | set_name(std::string(name, len)); |
1012 | } |
1013 | |
1014 | explicit Rdb_tbl_def(const rocksdb::Slice &slice, const size_t &pos = 0) |
1015 | : m_key_descr_arr(nullptr), m_hidden_pk_val(0), m_auto_incr_val(0) { |
1016 | set_name(std::string(slice.data() + pos, slice.size() - pos)); |
1017 | } |
1018 | |
1019 | ~Rdb_tbl_def(); |
1020 | |
1021 | /* Number of indexes */ |
1022 | uint m_key_count; |
1023 | |
1024 | /* Array of index descriptors */ |
1025 | std::shared_ptr<Rdb_key_def> *m_key_descr_arr; |
1026 | |
1027 | std::atomic<longlong> m_hidden_pk_val; |
1028 | std::atomic<ulonglong> m_auto_incr_val; |
1029 | |
1030 | /* Is this a system table */ |
1031 | bool m_is_mysql_system_table; |
1032 | |
1033 | bool put_dict(Rdb_dict_manager *const dict, rocksdb::WriteBatch *const batch, |
1034 | uchar *const key, const size_t &keylen); |
1035 | |
1036 | const std::string &full_tablename() const { return m_dbname_tablename; } |
1037 | const std::string &base_dbname() const { return m_dbname; } |
1038 | const std::string &base_tablename() const { return m_tablename; } |
1039 | const std::string &base_partition() const { return m_partition; } |
1040 | GL_INDEX_ID get_autoincr_gl_index_id(); |
1041 | }; |
1042 | |
1043 | /* |
1044 | A thread-safe sequential number generator. Its performance is not a concern |
1045 | hence it is ok to protect it by a mutex. |
1046 | */ |
1047 | |
1048 | class Rdb_seq_generator { |
1049 | uint m_next_number = 0; |
1050 | |
1051 | mysql_mutex_t m_mutex; |
1052 | |
1053 | public: |
1054 | Rdb_seq_generator(const Rdb_seq_generator &) = delete; |
1055 | Rdb_seq_generator &operator=(const Rdb_seq_generator &) = delete; |
1056 | Rdb_seq_generator() = default; |
1057 | |
1058 | void init(const uint &initial_number) { |
1059 | mysql_mutex_init(0, &m_mutex, MY_MUTEX_INIT_FAST); |
1060 | m_next_number = initial_number; |
1061 | } |
1062 | |
1063 | uint get_and_update_next_number(Rdb_dict_manager *const dict); |
1064 | |
1065 | void cleanup() { mysql_mutex_destroy(&m_mutex); } |
1066 | }; |
1067 | |
1068 | interface Rdb_tables_scanner { |
1069 | virtual int add_table(Rdb_tbl_def * tdef) = 0; |
1070 | virtual ~Rdb_tables_scanner() {} /* Keep the compiler happy */ |
1071 | }; |
1072 | |
1073 | /* |
1074 | This contains a mapping of |
1075 | |
1076 | dbname.table_name -> array{Rdb_key_def}. |
1077 | |
1078 | objects are shared among all threads. |
1079 | */ |
1080 | |
class Rdb_ddl_manager {
  /* Dictionary manager; nullptr until set (presumably by init() - confirm). */
  Rdb_dict_manager *m_dict = nullptr;
  my_core::HASH m_ddl_hash;  // Contains Rdb_tbl_def elements
  // Maps index id to <table_name, index number>
  std::map<GL_INDEX_ID, std::pair<std::string, uint>> m_index_num_to_keydef;

  // Maps index id to key definitions not yet committed to data dictionary.
  // This is mainly used to store key definitions during ALTER TABLE.
  std::map<GL_INDEX_ID, std::shared_ptr<Rdb_key_def>>
      m_index_num_to_uncommitted_keydef;
  // NOTE(review): presumably guards the in-memory structures above - confirm
  // against the implementation.
  mysql_rwlock_t m_rwlock;

  // Sequence generator behind get_and_update_next_number().
  Rdb_seq_generator m_sequence;
  // A queue of table stats to write into data dictionary
  // It is produced by event listener (ie compaction and flush threads)
  // and consumed by the rocksdb background thread
  std::map<GL_INDEX_ID, Rdb_index_stats> m_stats2store;

  // Private lookup by index id; note that it returns a reference to a
  // shared_ptr rather than a copy.
  const std::shared_ptr<Rdb_key_def> &find(GL_INDEX_ID gl_index_id);

 public:
  /* Not copyable. */
  Rdb_ddl_manager(const Rdb_ddl_manager &) = delete;
  Rdb_ddl_manager &operator=(const Rdb_ddl_manager &) = delete;
  Rdb_ddl_manager() {}

  /* Load the data dictionary from on-disk storage */
  bool init(Rdb_dict_manager *const dict_arg, Rdb_cf_manager *const cf_manager,
            const uint32_t &validate_tables);

  void cleanup();

  /*
    Lookups and statistics maintenance; all defined out of line.
    NOTE(review): the `lock` arguments presumably control whether the call
    takes m_rwlock itself - confirm against the implementation.
  */
  Rdb_tbl_def *find(const std::string &table_name, const bool &lock = true);
  std::shared_ptr<const Rdb_key_def> safe_find(GL_INDEX_ID gl_index_id);
  void set_stats(const std::unordered_map<GL_INDEX_ID, Rdb_index_stats> &stats);
  void adjust_stats(const std::vector<Rdb_index_stats> &new_data,
                    const std::vector<Rdb_index_stats> &deleted_data =
                        std::vector<Rdb_index_stats>());
  void persist_stats(const bool &sync = false);

  /* Modify the mapping and write it to on-disk storage */
  int put_and_write(Rdb_tbl_def *const key_descr,
                    rocksdb::WriteBatch *const batch);
  void remove(Rdb_tbl_def *const rec, rocksdb::WriteBatch *const batch,
              const bool &lock = true);
  bool rename(const std::string &from, const std::string &to,
              rocksdb::WriteBatch *const batch);

  /* Forwards to the embedded sequence generator. */
  uint get_and_update_next_number(Rdb_dict_manager *const dict) {
    return m_sequence.get_and_update_next_number(dict);
  }

  const std::string safe_get_table_name(const GL_INDEX_ID &gl_index_id);

  /* Walk the data dictionary */
  int scan_for_tables(Rdb_tables_scanner *tables_scanner);

  void erase_index_num(const GL_INDEX_ID &gl_index_id);
  void add_uncommitted_keydefs(
      const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);
  void remove_uncommitted_keydefs(
      const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);

 private:
  /* Put the data into in-memory table (only) */
  int put(Rdb_tbl_def *const key_descr, const bool &lock = true);

  /* Helper functions to be passed to my_core::HASH object */
  static const uchar *get_hash_key(Rdb_tbl_def *const rec, size_t *const length,
                                   my_bool not_used MY_ATTRIBUTE((unused)));
  static void free_hash_elem(void *const data);

  bool validate_schemas();

  bool validate_auto_incr();
};
1156 | |
1157 | /* |
1158 | Writing binlog information into RocksDB at commit(), |
1159 | and retrieving binlog information at crash recovery. |
1160 | commit() and recovery are always executed by at most single client |
1161 | at the same time, so concurrency control is not needed. |
1162 | |
1163 | Binlog info is stored in RocksDB as the following. |
1164 | key: BINLOG_INFO_INDEX_NUMBER |
1165 | value: packed single row: |
1166 | binlog_name_length (2 byte form) |
1167 | binlog_name |
1168 | binlog_position (4 byte form) |
1169 | binlog_gtid_length (2 byte form) |
1170 | binlog_gtid |
1171 | */ |
1172 | class Rdb_binlog_manager { |
1173 | public: |
1174 | Rdb_binlog_manager(const Rdb_binlog_manager &) = delete; |
1175 | Rdb_binlog_manager &operator=(const Rdb_binlog_manager &) = delete; |
1176 | Rdb_binlog_manager() = default; |
1177 | |
1178 | bool init(Rdb_dict_manager *const dict); |
1179 | void cleanup(); |
1180 | void update(const char *const binlog_name, const my_off_t binlog_pos, |
1181 | rocksdb::WriteBatchBase *const batch); |
1182 | bool read(char *const binlog_name, my_off_t *const binlog_pos, |
1183 | char *const binlog_gtid) const; |
1184 | void update_slave_gtid_info(const uint &id, const char *const db, |
1185 | const char *const gtid, |
1186 | rocksdb::WriteBatchBase *const write_batch); |
1187 | |
1188 | private: |
1189 | Rdb_dict_manager *m_dict = nullptr; |
1190 | uchar m_key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {0}; |
1191 | rocksdb::Slice m_key_slice; |
1192 | |
1193 | rocksdb::Slice pack_value(uchar *const buf, const char *const binlog_name, |
1194 | const my_off_t &binlog_pos, |
1195 | const char *const binlog_gtid) const; |
1196 | bool unpack_value(const uchar *const value, size_t value_size, |
1197 | char *const binlog_name, |
1198 | my_off_t *const binlog_pos, char *const binlog_gtid) const; |
1199 | |
1200 | std::atomic<Rdb_tbl_def *> m_slave_gtid_info_tbl; |
1201 | }; |
1202 | |
1203 | /* |
1204 | Rdb_dict_manager manages how MySQL on RocksDB (MyRocks) stores its |
1205 | internal data dictionary. |
1206 | MyRocks stores data dictionary on dedicated system column family |
1207 | named __system__. The system column family is used by MyRocks |
1208 | internally only, and not used by applications. |
1209 | |
1210 | Currently MyRocks has the following data dictionary data models. |
1211 | |
1212 | 1. Table Name => internal index id mappings |
1213 | key: Rdb_key_def::DDL_ENTRY_INDEX_START_NUMBER(0x1) + dbname.tablename |
1214 | value: version, {cf_id, index_id}*n_indexes_of_the_table |
1215 | version is 2 bytes. cf_id and index_id are 4 bytes. |
1216 | |
1217 | 2. internal cf_id, index id => index information |
1218 | key: Rdb_key_def::INDEX_INFO(0x2) + cf_id + index_id |
1219 | value: version, index_type, kv_format_version, index_flags, ttl_duration |
1220 | index_type is 1 byte, version and kv_format_version are 2 bytes. |
1221 | index_flags is 4 bytes. |
1222 | ttl_duration is 8 bytes. |
1223 | |
1224 | 3. CF id => CF flags |
1225 | key: Rdb_key_def::CF_DEFINITION(0x3) + cf_id |
1226 | value: version, {is_reverse_cf, is_auto_cf (deprecated), is_per_partition_cf} |
1227 | cf_flags is 4 bytes in total. |
1228 | |
1229 | 4. Binlog entry (updated at commit) |
1230 | key: Rdb_key_def::BINLOG_INFO_INDEX_NUMBER (0x4) |
1231 | value: version, {binlog_name,binlog_pos,binlog_gtid} |
1232 | |
1233 | 5. Ongoing drop index entry |
1234 | key: Rdb_key_def::DDL_DROP_INDEX_ONGOING(0x5) + cf_id + index_id |
1235 | value: version |
1236 | |
1237 | 6. index stats |
1238 | key: Rdb_key_def::INDEX_STATISTICS(0x6) + cf_id + index_id |
1239 | value: version, {materialized PropertiesCollector::IndexStats} |
1240 | |
1241 | 7. maximum index id |
1242 | key: Rdb_key_def::MAX_INDEX_ID(0x7) |
1243 | value: index_id |
1244 | index_id is 4 bytes |
1245 | |
1246 | 8. Ongoing create index entry |
1247 | key: Rdb_key_def::DDL_CREATE_INDEX_ONGOING(0x8) + cf_id + index_id |
1248 | value: version |
1249 | |
1250 | 9. auto_increment values |
1251 | key: Rdb_key_def::AUTO_INC(0x9) + cf_id + index_id |
1252 | value: version, {max auto_increment so far} |
1253 | max auto_increment is 8 bytes |
1254 | |
1255 | Data dictionary operations are atomic inside RocksDB. For example, |
1256 | when creating a table with two indexes, it is necessary to call Put |
1257 | three times. They have to be atomic. Rdb_dict_manager has a wrapper function |
1258 | begin() and commit() to make it easier to do atomic operations. |
1259 | |
1260 | */ |
class Rdb_dict_manager {
 private:
  /* Mutex behind lock()/unlock(); destroyed by cleanup(). */
  mysql_mutex_t m_mutex;
  rocksdb::DB *m_db = nullptr;
  /* Handle returned by get_system_cf(). */
  rocksdb::ColumnFamilyHandle *m_system_cfh = nullptr;
  /* Utility to put INDEX_INFO and CF_DEFINITION */

  /* Zero-filled key buffer and slice member for the max index id entry. */
  uchar m_key_buf_max_index_id[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
  rocksdb::Slice m_key_slice_max_index_id;

  static void dump_index_id(uchar *const netbuf,
                            Rdb_key_def::DATA_DICT_TYPE dict_type,
                            const GL_INDEX_ID &gl_index_id);
  void delete_with_prefix(rocksdb::WriteBatch *const batch,
                          Rdb_key_def::DATA_DICT_TYPE dict_type,
                          const GL_INDEX_ID &gl_index_id) const;
  /* Functions for fast DROP TABLE/INDEX */
  void resume_drop_indexes() const;
  void log_start_drop_table(const std::shared_ptr<Rdb_key_def> *const key_descr,
                            const uint32 &n_keys,
                            const char *const log_action) const;
  void log_start_drop_index(GL_INDEX_ID gl_index_id,
                            const char *log_action) const;

 public:
  /* Not copyable. */
  Rdb_dict_manager(const Rdb_dict_manager &) = delete;
  Rdb_dict_manager &operator=(const Rdb_dict_manager &) = delete;
  Rdb_dict_manager() = default;

  bool init(rocksdb::DB *const rdb_dict, Rdb_cf_manager *const cf_manager);

  /* Destroys m_mutex. */
  inline void cleanup() { mysql_mutex_destroy(&m_mutex); }

  /* Acquire / release m_mutex through the RDB_MUTEX_*_CHECK macros. */
  inline void lock() { RDB_MUTEX_LOCK_CHECK(m_mutex); }

  inline void unlock() { RDB_MUTEX_UNLOCK_CHECK(m_mutex); }

  /* Returns the stored system column family handle (may be nullptr). */
  inline rocksdb::ColumnFamilyHandle *get_system_cf() const {
    return m_system_cfh;
  }

  /* Raw RocksDB operations */
  std::unique_ptr<rocksdb::WriteBatch> begin() const;
  int commit(rocksdb::WriteBatch *const batch, const bool &sync = true) const;
  rocksdb::Status get_value(const rocksdb::Slice &key,
                            std::string *const value) const;
  void put_key(rocksdb::WriteBatchBase *const batch, const rocksdb::Slice &key,
               const rocksdb::Slice &value) const;
  void delete_key(rocksdb::WriteBatchBase *batch,
                  const rocksdb::Slice &key) const;
  rocksdb::Iterator *new_iterator() const;

  /* Internal Index id => CF */
  void
  add_or_update_index_cf_mapping(rocksdb::WriteBatch *batch,
                                 struct Rdb_index_info *const index_info) const;
  void delete_index_info(rocksdb::WriteBatch *batch,
                         const GL_INDEX_ID &index_id) const;
  bool get_index_info(const GL_INDEX_ID &gl_index_id,
                      struct Rdb_index_info *const index_info) const;

  /* CF id => CF flags */
  void add_cf_flags(rocksdb::WriteBatch *const batch, const uint &cf_id,
                    const uint &cf_flags) const;
  bool get_cf_flags(const uint &cf_id, uint *const cf_flags) const;

  /* Functions for fast CREATE/DROP TABLE/INDEX */
  void
  get_ongoing_index_operation(std::unordered_set<GL_INDEX_ID> *gl_index_ids,
                              Rdb_key_def::DATA_DICT_TYPE dd_type) const;
  bool is_index_operation_ongoing(const GL_INDEX_ID &gl_index_id,
                                  Rdb_key_def::DATA_DICT_TYPE dd_type) const;
  void start_ongoing_index_operation(rocksdb::WriteBatch *batch,
                                     const GL_INDEX_ID &gl_index_id,
                                     Rdb_key_def::DATA_DICT_TYPE dd_type) const;
  void end_ongoing_index_operation(rocksdb::WriteBatch *const batch,
                                   const GL_INDEX_ID &gl_index_id,
                                   Rdb_key_def::DATA_DICT_TYPE dd_type) const;
  bool is_drop_index_empty() const;
  void add_drop_table(std::shared_ptr<Rdb_key_def> *const key_descr,
                      const uint32 &n_keys,
                      rocksdb::WriteBatch *const batch) const;
  void add_drop_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
                      rocksdb::WriteBatch *const batch) const;
  void add_create_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
                        rocksdb::WriteBatch *const batch) const;
  void
  finish_indexes_operation(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
                           Rdb_key_def::DATA_DICT_TYPE dd_type) const;
  void rollback_ongoing_index_creation() const;

  /*
    Convenience wrappers: each one forwards to the generic *_index_operation
    function above with the dictionary type fixed to either
    DDL_DROP_INDEX_ONGOING or DDL_CREATE_INDEX_ONGOING.
  */
  inline void get_ongoing_drop_indexes(
      std::unordered_set<GL_INDEX_ID> *gl_index_ids) const {
    get_ongoing_index_operation(gl_index_ids,
                                Rdb_key_def::DDL_DROP_INDEX_ONGOING);
  }
  inline void get_ongoing_create_indexes(
      std::unordered_set<GL_INDEX_ID> *gl_index_ids) const {
    get_ongoing_index_operation(gl_index_ids,
                                Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
  }
  inline void start_drop_index(rocksdb::WriteBatch *wb,
                               const GL_INDEX_ID &gl_index_id) const {
    start_ongoing_index_operation(wb, gl_index_id,
                                  Rdb_key_def::DDL_DROP_INDEX_ONGOING);
  }
  inline void start_create_index(rocksdb::WriteBatch *wb,
                                 const GL_INDEX_ID &gl_index_id) const {
    start_ongoing_index_operation(wb, gl_index_id,
                                  Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
  }
  inline void finish_drop_indexes(
      const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
    finish_indexes_operation(gl_index_ids, Rdb_key_def::DDL_DROP_INDEX_ONGOING);
  }
  inline void finish_create_indexes(
      const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
    finish_indexes_operation(gl_index_ids,
                             Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
  }
  inline bool is_drop_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
    return is_index_operation_ongoing(gl_index_id,
                                      Rdb_key_def::DDL_DROP_INDEX_ONGOING);
  }
  inline bool is_create_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
    return is_index_operation_ongoing(gl_index_id,
                                      Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
  }

  /* Maximum index id and index statistics; defined out of line. */
  bool get_max_index_id(uint32_t *const index_id) const;
  bool update_max_index_id(rocksdb::WriteBatch *const batch,
                           const uint32_t &index_id) const;
  void add_stats(rocksdb::WriteBatch *const batch,
                 const std::vector<Rdb_index_stats> &stats) const;
  Rdb_index_stats get_stats(GL_INDEX_ID gl_index_id) const;

  /* auto_increment values; defined out of line. */
  rocksdb::Status put_auto_incr_val(rocksdb::WriteBatchBase *batch,
                                    const GL_INDEX_ID &gl_index_id,
                                    ulonglong val,
                                    bool overwrite = false) const;
  bool get_auto_incr_val(const GL_INDEX_ID &gl_index_id,
                         ulonglong *new_val) const;
};
1404 | |
1405 | struct Rdb_index_info { |
1406 | GL_INDEX_ID m_gl_index_id; |
1407 | uint16_t m_index_dict_version = 0; |
1408 | uchar m_index_type = 0; |
1409 | uint16_t m_kv_version = 0; |
1410 | uint32 m_index_flags = 0; |
1411 | uint64 m_ttl_duration = 0; |
1412 | }; |
1413 | |
1414 | /* |
1415 | @brief |
1416 | Merge Operator for the auto_increment value in the system_cf |
1417 | |
1418 | @detail |
1419 | This class implements the rocksdb Merge Operator for auto_increment values |
1420 | that are stored to the data dictionary every transaction. |
1421 | |
1422 | The actual Merge function is triggered on compaction, memtable flushes, or |
1423 | when get() is called on the same key. |
1424 | |
1425 | */ |
1426 | class Rdb_system_merge_op : public rocksdb::AssociativeMergeOperator { |
1427 | public: |
1428 | /* |
1429 | Updates the new value associated with a key to be the maximum of the |
1430 | passed in value and the existing value. |
1431 | |
1432 | @param[IN] key |
1433 | @param[IN] existing_value existing value for a key; nullptr if nonexistent |
1434 | key |
1435 | @param[IN] value |
1436 | @param[OUT] new_value new value after Merge |
1437 | @param[IN] logger |
1438 | */ |
1439 | bool Merge(const rocksdb::Slice &key, const rocksdb::Slice *existing_value, |
1440 | const rocksdb::Slice &value, std::string *new_value, |
1441 | rocksdb::Logger *logger) const override { |
1442 | DBUG_ASSERT(new_value != nullptr); |
1443 | |
1444 | if (key.size() != Rdb_key_def::INDEX_NUMBER_SIZE * 3 || |
1445 | GetKeyType(key) != Rdb_key_def::AUTO_INC || |
1446 | value.size() != |
1447 | RDB_SIZEOF_AUTO_INCREMENT_VERSION + ROCKSDB_SIZEOF_AUTOINC_VALUE || |
1448 | GetVersion(value) > Rdb_key_def::AUTO_INCREMENT_VERSION) { |
1449 | abort(); |
1450 | } |
1451 | |
1452 | uint64_t merged_value = Deserialize(value); |
1453 | |
1454 | if (existing_value != nullptr) { |
1455 | if (existing_value->size() != RDB_SIZEOF_AUTO_INCREMENT_VERSION + |
1456 | ROCKSDB_SIZEOF_AUTOINC_VALUE || |
1457 | GetVersion(*existing_value) > Rdb_key_def::AUTO_INCREMENT_VERSION) { |
1458 | abort(); |
1459 | } |
1460 | |
1461 | merged_value = std::max(merged_value, Deserialize(*existing_value)); |
1462 | } |
1463 | Serialize(merged_value, new_value); |
1464 | return true; |
1465 | } |
1466 | |
1467 | virtual const char *Name() const override { return "Rdb_system_merge_op" ; } |
1468 | |
1469 | private: |
1470 | /* |
1471 | Serializes the integer data to the new_value buffer or the target buffer |
1472 | the merge operator will update to |
1473 | */ |
1474 | void Serialize(const uint64_t data, std::string *new_value) const { |
1475 | uchar value_buf[RDB_SIZEOF_AUTO_INCREMENT_VERSION + |
1476 | ROCKSDB_SIZEOF_AUTOINC_VALUE] = {0}; |
1477 | uchar *ptr = value_buf; |
1478 | /* fill in the auto increment version */ |
1479 | rdb_netbuf_store_uint16(ptr, Rdb_key_def::AUTO_INCREMENT_VERSION); |
1480 | ptr += RDB_SIZEOF_AUTO_INCREMENT_VERSION; |
1481 | /* fill in the auto increment value */ |
1482 | rdb_netbuf_store_uint64(ptr, data); |
1483 | ptr += ROCKSDB_SIZEOF_AUTOINC_VALUE; |
1484 | new_value->assign(reinterpret_cast<char *>(value_buf), ptr - value_buf); |
1485 | } |
1486 | |
1487 | /* |
1488 | Gets the value of auto_increment type in the data dictionary from the |
1489 | value slice |
1490 | |
1491 | @Note Only to be used on data dictionary keys for the auto_increment type |
1492 | */ |
1493 | uint64_t Deserialize(const rocksdb::Slice &s) const { |
1494 | return rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(s.data()) + |
1495 | RDB_SIZEOF_AUTO_INCREMENT_VERSION); |
1496 | } |
1497 | |
1498 | /* |
1499 | Gets the type of the key of the key in the data dictionary. |
1500 | |
1501 | @Note Only to be used on data dictionary keys for the auto_increment type |
1502 | */ |
1503 | uint16_t GetKeyType(const rocksdb::Slice &s) const { |
1504 | return rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(s.data())); |
1505 | } |
1506 | |
1507 | /* |
1508 | Gets the version of the auto_increment value in the data dictionary. |
1509 | |
1510 | @Note Only to be used on data dictionary value for the auto_increment type |
1511 | */ |
1512 | uint16_t GetVersion(const rocksdb::Slice &s) const { |
1513 | return rdb_netbuf_to_uint16(reinterpret_cast<const uchar *>(s.data())); |
1514 | } |
1515 | }; |
1516 | |
/*
  Declared here, defined in another translation unit.
  NOTE(review): going by the name, reports whether the collation `cs` is
  supported - confirm against the definition.
*/
bool rdb_is_collation_supported(const my_core::CHARSET_INFO *const cs);
1518 | |
1519 | } // namespace myrocks |
1520 | |