rdb_datadic.h source code [MariaDB/storage/rocksdb/rdb_datadic.h]

/*
   Copyright (c) 2012,2013 Monty Program Ab

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; version 2 of the License.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
16	#pragma once
17
/* C++ standard header files */
19	#include <cstdlib>
20	#include <algorithm>
21	#include <atomic>
22	#include <map>
23	#include <mutex>
24	#include <string>
25	#include <unordered_set>
26	#include <utility>
27	#include <vector>
28	#include <array>
29
/* C standard header files */
31	#ifndef _WIN32
32	#include <arpa/inet.h>
33	#endif
34
/* MyRocks header files */
36	#include "./ha_rocksdb.h"
37	#include "./properties_collector.h"
38	#include "./rdb_buff.h"
39	#include "./rdb_utils.h"
40
41	namespace myrocks {
42
/* Forward declarations of classes that are only named in this part of the
   header. */
class Rdb_dict_manager;
class Rdb_key_def;
class Rdb_field_packing;
class Rdb_cf_manager;
class Rdb_ddl_manager;

/* Buffer length used for GTIDs (the users of this constant are not visible in
   this part of the header). */
const uint32_t GTID_BUF_LEN = 60;
50
51	/*
52	@brief
53	Field packing context.
54	The idea is to ensure that a call to rdb_index_field_pack_t function
55	is followed by a call to rdb_make_unpack_info_t.
56
57	@detail
58	For some datatypes, unpack_info is produced as a side effect of
59	rdb_index_field_pack_t function call.
60	For other datatypes, packing is just calling make_sort_key(), while
61	rdb_make_unpack_info_t is a custom function.
62	In order to accommodate both cases, we require both calls to be made and
63	unpack_info is passed as context data between the two.
64	*/
65	class Rdb_pack_field_context {
66	public:
67	Rdb_pack_field_context(const Rdb_pack_field_context &) = delete;
68	Rdb_pack_field_context &operator=(const Rdb_pack_field_context &) = delete;
69
70	explicit Rdb_pack_field_context(Rdb_string_writer *const writer_arg)
71	: writer(writer_arg) {}
72
73	// NULL means we're not producing unpack_info.
74	Rdb_string_writer *writer;
75	};
76
77	struct Rdb_collation_codec;
78	struct Rdb_index_info;
79
80	/*
81	C-style "virtual table" allowing different handling of packing logic based
82	on the field type. See Rdb_field_packing::setup() implementation.
83	*/
84	using rdb_make_unpack_info_t =
85	void (Rdb_key_def::)(const* Rdb_collation_codec codec, const* Field *field,
86	Rdb_pack_field_context pack_ctx) const*;
87	using rdb_index_field_unpack_t = int (Rdb_key_def::*)(
88	Rdb_field_packing fpi, Field field, uchar *field_ptr,
89	Rdb_string_reader reader, Rdb_string_reader unpack_reader) const;
90	using rdb_index_field_skip_t =
91	int (Rdb_key_def::)(const* Rdb_field_packing fpi, const* Field *field,
92	Rdb_string_reader reader) const*;
93	using rdb_index_field_pack_t =
94	void (Rdb_key_def::)(Rdb_field_packing fpi, Field field, uchar buf,
95	uchar *dst, Rdb_pack_field_context pack_ctx) const;
96
97	const uint RDB_INVALID_KEY_LEN = uint(-`1`);
98
/* How much one checksum occupies when stored in the record */
const size_t RDB_CHECKSUM_SIZE = sizeof(uint32_t);

/*
  How much the checksum data occupies in record, in total.
  It is storing two checksums plus 1 tag-byte.
*/
const size_t RDB_CHECKSUM_CHUNK_SIZE = 2 * RDB_CHECKSUM_SIZE + 1;

/*
  Checksum data starts from CHECKSUM_DATA_TAG which is followed by two CRC32
  checksums.
*/
const char RDB_CHECKSUM_DATA_TAG = 0x01;

/*
  Unpack data is variable length. The header is 1 tag-byte plus a two byte
  length field. The length field includes the header as well.
*/
const char RDB_UNPACK_DATA_TAG = 0x02;
const size_t RDB_UNPACK_DATA_LEN_SIZE = sizeof(uint16_t);
const size_t RDB_UNPACK_HEADER_SIZE =
    sizeof(RDB_UNPACK_DATA_TAG) + RDB_UNPACK_DATA_LEN_SIZE;

/*
  This header format is 1 tag-byte plus a two byte length field plus a two byte
  covered bitmap. The length field includes the header size.
*/
const char RDB_UNPACK_COVERED_DATA_TAG = 0x03;
const size_t RDB_UNPACK_COVERED_DATA_LEN_SIZE = sizeof(uint16_t);
const size_t RDB_COVERED_BITMAP_SIZE = sizeof(uint16_t);
const size_t RDB_UNPACK_COVERED_HEADER_SIZE =
    sizeof(RDB_UNPACK_COVERED_DATA_TAG) + RDB_UNPACK_COVERED_DATA_LEN_SIZE +
    RDB_COVERED_BITMAP_SIZE;
133
134	/*
135	Data dictionary index info field sizes.
136	*/
137	const size_t RDB_SIZEOF_INDEX_INFO_VERSION = sizeof(uint16);
138	const size_t RDB_SIZEOF_INDEX_TYPE = sizeof(uchar);
139	const size_t RDB_SIZEOF_KV_VERSION = sizeof(uint16);
140	const size_t RDB_SIZEOF_INDEX_FLAGS = sizeof(uint32);
141	const size_t RDB_SIZEOF_AUTO_INCREMENT_VERSION = sizeof(uint16);
142
// Possible return values for rdb_index_field_unpack_t functions.
enum {
  UNPACK_SUCCESS = 0,
  UNPACK_FAILURE = 1,
};
148
149	/*
150	An object of this class represents information about an index in an SQL
151	table. It provides services to encode and decode index tuples.
152
153	Note: a table (as in, on-disk table) has a single Rdb_key_def object which
154	is shared across multiple TABLE objects and may be used simultaneously from*
155	different threads.
156
157	There are several data encodings:
158
159	=== SQL LAYER ===
160	SQL layer uses two encodings:
161
162	- "Table->record format". This is the format that is used for the data in
163	the record buffers, table->record[i]
164
165	- KeyTupleFormat (see opt_range.cc) - this is used in parameters to index
166	lookup functions, like handler::index_read_map().
167
168	=== Inside RocksDB ===
169	Primary Key is stored as a mapping:
170
171	index_tuple -> StoredRecord
172
173	StoredRecord is in Table->record format, except for blobs, which are stored
174	in-place. See ha_rocksdb::convert_record_to_storage_format for details.
175
176	Secondary indexes are stored as one of two variants:
177
178	index_tuple -> unpack_info
179	index_tuple -> empty_string
180
181	index_tuple here is the form of key that can be compared with memcmp(), aka
182	"mem-comparable form".
183
184	unpack_info is extra data that allows to restore the original value from its
185	mem-comparable form. It is present only if the index supports index-only
186	reads.
187	*/
188
189	class Rdb_key_def {
190	public:
191	/ Convert a key from KeyTupleFormat to mem-comparable form /
192	uint pack_index_tuple(TABLE *const tbl, uchar *const pack_buffer,
193	uchar *const packed_tuple, const uchar *const key_tuple,
194	const key_part_map &keypart_map) const;
195
196	uchar pack_field(Field const field, Rdb_field_packing *pack_info,
197	uchar tuple, uchar const packed_tuple,
198	uchar *const pack_buffer,
199	Rdb_string_writer *const unpack_info,
200	uint *const n_null_fields) const;
201	/ Convert a key from Table->record format to mem-comparable form /
202	uint pack_record(const TABLE *const tbl, uchar *const pack_buffer,
203	const uchar *const record, uchar *const packed_tuple,
204	Rdb_string_writer *const unpack_info,
205	const bool &should_store_row_debug_checksums,
206	const longlong &hidden_pk_id = `0`, uint n_key_parts = `0`,
207	uint *const n_null_fields = nullptr,
208	uint *const ttl_pk_offset = nullptr,
209	const char *const ttl_bytes = nullptr) const;
210	/ Pack the hidden primary key into mem-comparable form. /
211	uint pack_hidden_pk(const longlong &hidden_pk_id,
212	uchar *const packed_tuple) const;
213	int unpack_field(Rdb_field_packing *const fpi,
214	Field *const field,
215	Rdb_string_reader* reader,
216	const uchar *const default_value,
217	Rdb_string_reader* unp_reader) const;
218	int unpack_record(TABLE *const table, uchar *const buf,
219	const rocksdb::Slice *const packed_key,
220	const rocksdb::Slice *const unpack_info,
221	const bool &verify_row_debug_checksums) const;
222
223	static bool unpack_info_has_checksum(const rocksdb::Slice &unpack_info);
224	int compare_keys(const rocksdb::Slice key1, const* rocksdb::Slice *key2,
225	std::size_t *const column_index) const;
226
227	size_t key_length(const TABLE *const table, const rocksdb::Slice &key) const;
228
229	/ Get the key that is the "infimum" for this index /
230	inline void get_infimum_key(uchar *const key, uint *const size) const {
231	rdb_netbuf_store_index(key, m_index_number);
232	*size = INDEX_NUMBER_SIZE;
233	}
234
235	/ Get the key that is a "supremum" for this index /
236	inline void get_supremum_key(uchar *const key, uint *const size) const {
237	rdb_netbuf_store_index(key, m_index_number + `1`);
238	*size = INDEX_NUMBER_SIZE;
239	}
240
241	/*
242	Get the first key that you need to position at to start iterating.
243
244	Stores into key a "supremum" or "infimum" key value for the index.*
245
246	@return Number of bytes in the key that are usable for bloom filter use.
247	*/
248	inline int get_first_key(uchar *const key, uint *const size) const {
249	if (m_is_reverse_cf)
250	get_supremum_key(key, size);
251	else
252	get_infimum_key(key, size);
253
254	/ Find out how many bytes of infimum are the same as m_index_number /
255	uchar unmodified_key[INDEX_NUMBER_SIZE];
256	rdb_netbuf_store_index(unmodified_key, m_index_number);
257	int i;
258	for (i = `0`; i < INDEX_NUMBER_SIZE; i++) {
259	if (key[i] != unmodified_key[i])
260	break;
261	}
262	return i;
263	}
264
265	/ Make a key that is right after the given key. /
266	static int successor(uchar *const packed_tuple, const uint &len);
267
268	/ Make a key that is right before the given key. /
269	static int predecessor(uchar *const packed_tuple, const uint &len);
270
271	/*
272	This can be used to compare prefixes.
273	if X is a prefix of Y, then we consider that X = Y.
274	*/
275	// b describes the lookup key, which can be a prefix of a.
276	// b might be outside of the index_number range, if successor() is called.
277	int cmp_full_keys(const rocksdb::Slice &a, const rocksdb::Slice &b) const {
278	DBUG_ASSERT(covers_key(a));
279
280	return memcmp(a.data(), b.data(), std::min(a.size(), b.size()));
281	}
282
283	/ Check if given mem-comparable key belongs to this index /
284	bool covers_key(const rocksdb::Slice &slice) const {
285	if (slice.size() < INDEX_NUMBER_SIZE)
286	return false;
287
288	if (memcmp(slice.data(), m_index_number_storage_form, INDEX_NUMBER_SIZE))
289	return false;
290
291	return true;
292	}
293
294	void get_lookup_bitmap(const TABLE table, MY_BITMAP map) const;
295
296	bool covers_lookup(TABLE *const table,
297	const rocksdb::Slice *const unpack_info,
298	const MY_BITMAP *const map) const;
299
300	inline bool use_covered_bitmap_format() const {
301	return m_index_type == INDEX_TYPE_SECONDARY &&
302	m_kv_format_version >= SECONDARY_FORMAT_VERSION_UPDATE3;
303	}
304
305	/*
306	Return true if the passed mem-comparable key
307	- is from this index, and
308	- it matches the passed key prefix (the prefix is also in mem-comparable
309	form)
310	*/
311	bool value_matches_prefix(const rocksdb::Slice &value,
312	const rocksdb::Slice &prefix) const {
313	return covers_key(value) && !cmp_full_keys(value, prefix);
314	}
315
316	uint32 get_keyno() const { return m_keyno; }
317
318	uint32 get_index_number() const { return m_index_number; }
319
320	GL_INDEX_ID get_gl_index_id() const {
321	const GL_INDEX_ID gl_index_id = {m_cf_handle->GetID(), m_index_number};
322	return gl_index_id;
323	}
324
325	int read_memcmp_key_part(const TABLE table_arg, Rdb_string_reader reader,
326	const uint part_num) const;
327
328	/ Must only be called for secondary keys: /
329	uint get_primary_key_tuple(const TABLE *const tbl,
330	const Rdb_key_def &pk_descr,
331	const rocksdb::Slice *const key,
332	uchar *const pk_buffer) const;
333
334	uint get_memcmp_sk_parts(const TABLE table, const* rocksdb::Slice &key,
335	uchar sk_buffer, uint n_null_fields) const;
336
337	/ Return max length of mem-comparable form /
338	uint max_storage_fmt_length() const { return m_maxlength; }
339
340	uint get_key_parts() const { return m_key_parts; }
341
342	uint get_ttl_field_offset() const { return m_ttl_field_offset; }
343
344	/*
345	Get a field object for key part #part_no
346
347	@detail
348	SQL layer thinks unique secondary indexes and indexes in partitioned
349	tables are not "Extended" with Primary Key columns.
350
351	Internally, we always extend all indexes with PK columns. This function
352	uses our definition of how the index is Extended.
353	*/
354	inline Field get_table_field_for_part_no(TABLE table, uint part_no) const;
355
356	const std::string &get_name() const { return m_name; }
357
358	const rocksdb::SliceTransform get_extractor() const* {
359	return m_prefix_extractor.get();
360	}
361
362	static size_t get_unpack_header_size(char tag);
363
364	Rdb_key_def &operator=(const Rdb_key_def &) = delete;
365	Rdb_key_def(const Rdb_key_def &k);
366	Rdb_key_def(uint indexnr_arg, uint keyno_arg,
367	rocksdb::ColumnFamilyHandle *cf_handle_arg,
368	uint16_t index_dict_version_arg, uchar index_type_arg,
369	uint16_t kv_format_version_arg, bool is_reverse_cf_arg,
370	bool is_per_partition_cf, const char *name,
371	Rdb_index_stats stats = Rdb_index_stats (), uint32 index_flags = `0`,
372	uint32 ttl_rec_offset = UINT_MAX, uint64 ttl_duration = `0`);
373	~Rdb_key_def();
374
375	enum {
376	INDEX_NUMBER_SIZE = `4`,
377	VERSION_SIZE = `2`,
378	CF_NUMBER_SIZE = `4`,
379	CF_FLAG_SIZE = `4`,
380	PACKED_SIZE = `4`, // one int
381	};
382
383	// bit flags for combining bools when writing to disk
384	enum {
385	REVERSE_CF_FLAG = `1`,
386	AUTO_CF_FLAG = `2`, // Deprecated
387	PER_PARTITION_CF_FLAG = `4`,
388	};
389
390	// bit flags which denote myrocks specific fields stored in the record
391	// currently only used for TTL.
392	enum INDEX_FLAG {
393	TTL_FLAG = `1` << `0`,
394
395	// MAX_FLAG marks where the actual record starts
396	// This flag always needs to be set to the last index flag enum.
397	MAX_FLAG = TTL_FLAG << `1`,
398	};
399
400	// Set of flags to ignore when comparing two CF-s and determining if
401	// they're same.
402	static const uint CF_FLAGS_TO_IGNORE = PER_PARTITION_CF_FLAG;
403
404	// Data dictionary types
405	enum DATA_DICT_TYPE {
406	DDL_ENTRY_INDEX_START_NUMBER = `1`,
407	INDEX_INFO = `2`,
408	CF_DEFINITION = `3`,
409	BINLOG_INFO_INDEX_NUMBER = `4`,
410	DDL_DROP_INDEX_ONGOING = `5`,
411	INDEX_STATISTICS = `6`,
412	MAX_INDEX_ID = `7`,
413	DDL_CREATE_INDEX_ONGOING = `8`,
414	AUTO_INC = `9`,
415	END_DICT_INDEX_ID = `255`
416	};
417
418	// Data dictionary schema version. Introduce newer versions
419	// if changing schema layout
420	enum {
421	DDL_ENTRY_INDEX_VERSION = `1`,
422	CF_DEFINITION_VERSION = `1`,
423	BINLOG_INFO_INDEX_NUMBER_VERSION = `1`,
424	DDL_DROP_INDEX_ONGOING_VERSION = `1`,
425	MAX_INDEX_ID_VERSION = `1`,
426	DDL_CREATE_INDEX_ONGOING_VERSION = `1`,
427	AUTO_INCREMENT_VERSION = `1`,
428	// Version for index stats is stored in IndexStats struct
429	};
430
431	// Index info version. Introduce newer versions when changing the
432	// INDEX_INFO layout. Update INDEX_INFO_VERSION_LATEST to point to the
433	// latest version number.
434	enum {
435	INDEX_INFO_VERSION_INITIAL = `1`, // Obsolete
436	INDEX_INFO_VERSION_KV_FORMAT,
437	INDEX_INFO_VERSION_GLOBAL_ID,
438	// There is no change to data format in this version, but this version
439	// verifies KV format version, whereas previous versions do not. A version
440	// bump is needed to prevent older binaries from skipping the KV version
441	// check inadvertently.
442	INDEX_INFO_VERSION_VERIFY_KV_FORMAT,
443	// This changes the data format to include a 8 byte TTL duration for tables
444	INDEX_INFO_VERSION_TTL,
445	// This changes the data format to include a bitmap before the TTL duration
446	// which will indicate in the future whether TTL or other special fields
447	// are turned on or off.
448	INDEX_INFO_VERSION_FIELD_FLAGS,
449	// This normally point to the latest (currently it does).
450	INDEX_INFO_VERSION_LATEST = INDEX_INFO_VERSION_FIELD_FLAGS,
451	};
452
453	// MyRocks index types
454	enum {
455	INDEX_TYPE_PRIMARY = `1`,
456	INDEX_TYPE_SECONDARY = `2`,
457	INDEX_TYPE_HIDDEN_PRIMARY = `3`,
458	};
459
460	// Key/Value format version for each index type
461	enum {
462	PRIMARY_FORMAT_VERSION_INITIAL = `10`,
463	// This change includes:
464	// - For columns that can be unpacked with unpack_info, PK
465	// stores the unpack_info.
466	// - DECIMAL datatype is no longer stored in the row (because
467	// it can be decoded from its mem-comparable form)
468	// - VARCHAR-columns use endspace-padding.
469	PRIMARY_FORMAT_VERSION_UPDATE1 = `11`,
470	// This change includes:
471	// - Binary encoded variable length fields have a new format that avoids
472	// an inefficient where data that was a multiple of 8 bytes in length
473	// had an extra 9 bytes of encoded data.
474	PRIMARY_FORMAT_VERSION_UPDATE2 = `12`,
475	// This change includes support for TTL
476	// - This means that when TTL is specified for the table an 8-byte TTL
477	// field is prepended in front of each value.
478	PRIMARY_FORMAT_VERSION_TTL = `13`,
479	PRIMARY_FORMAT_VERSION_LATEST = PRIMARY_FORMAT_VERSION_TTL,
480
481	SECONDARY_FORMAT_VERSION_INITIAL = `10`,
482	// This change the SK format to include unpack_info.
483	SECONDARY_FORMAT_VERSION_UPDATE1 = `11`,
484	// This change includes:
485	// - Binary encoded variable length fields have a new format that avoids
486	// an inefficient where data that was a multiple of 8 bytes in length
487	// had an extra 9 bytes of encoded data.
488	SECONDARY_FORMAT_VERSION_UPDATE2 = `12`,
489	// This change includes support for TTL
490	// - This means that when TTL is specified for the table an 8-byte TTL
491	// field is prepended in front of each value.
492	SECONDARY_FORMAT_VERSION_TTL = `13`,
493	SECONDARY_FORMAT_VERSION_LATEST = SECONDARY_FORMAT_VERSION_TTL,
494	// This change includes support for covering SK lookups for varchars. A
495	// 2-byte bitmap is added after the tag-byte to unpack_info only for
496	// records which have covered varchar columns. Currently waiting before
497	// enabling in prod.
498	SECONDARY_FORMAT_VERSION_UPDATE3 = `65535`,
499	};
500
501	void setup(const TABLE *const table, const Rdb_tbl_def *const tbl_def);
502
503	static uint extract_ttl_duration(const TABLE *const table_arg,
504	const Rdb_tbl_def *const tbl_def_arg,
505	uint64 *ttl_duration);
506	static uint extract_ttl_col(const TABLE *const table_arg,
507	const Rdb_tbl_def *const tbl_def_arg,
508	std::string ttl_column, uint ttl_field_offset,
509	bool skip_checks = false);
510	inline bool has_ttl() const { return m_ttl_duration > `0`; }
511
512	static bool has_index_flag(uint32 index_flags, enum INDEX_FLAG flag);
513	static uint32 calculate_index_flag_offset(uint32 index_flags,
514	enum INDEX_FLAG flag,
515	uint *const field_length = nullptr);
516	void write_index_flag_field(Rdb_string_writer *const buf,
517	const uchar *const val,
518	enum INDEX_FLAG flag) const;
519
520	static const std::string
521	gen_qualifier_for_table(const char *const qualifier,
522	const std::string &partition_name = "");
523	static const std::string
524	gen_cf_name_qualifier_for_partition(const std::string &s);
525	static const std::string
526	gen_ttl_duration_qualifier_for_partition(const std::string &s);
527	static const std::string
528	gen_ttl_col_qualifier_for_partition(const std::string &s);
529
530	static const std::string parse_comment_for_qualifier(
531	const std::string &comment, const TABLE *const table_arg,
532	const Rdb_tbl_def *const tbl_def_arg, bool *per_part_match_found,
533	const char *const qualifier);
534
535	rocksdb::ColumnFamilyHandle get_cf() const* { return m_cf_handle; }
536
537	/ Check if keypart #kp can be unpacked from index tuple /
538	inline bool can_unpack(const uint &kp) const;
539	/ Check if keypart #kp needs unpack info /
540	inline bool has_unpack_info(const uint &kp) const;
541
542	/ Check if given table has a primary key /
543	static bool table_has_hidden_pk(const TABLE *const table);
544
545	void report_checksum_mismatch(const bool &is_key, const char *const data,
546	const size_t data_size) const;
547
548	/ Check if index is at least pk_min if it is a PK,*
549	or at least sk_min if SK./*
550	bool index_format_min_check(const int &pk_min, const int &sk_min) const;
551
552	void pack_with_make_sort_key(
553	Rdb_field_packing *const fpi, Field *const field,
554	uchar buf MY_ATTRIBUTE((__unused__)), uchar *dst,
555	Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__))) const;
556
557	void pack_with_varchar_encoding(
558	Rdb_field_packing *const fpi, Field *const field, uchar buf, uchar *dst,
559	Rdb_pack_field_context *const pack_ctx MY_ATTRIBUTE((__unused__))) const;
560
561	void
562	pack_with_varchar_space_pad(Rdb_field_packing *const fpi, Field *const field,
563	uchar buf, uchar *dst,
564	Rdb_pack_field_context *const pack_ctx) const;
565
566	int unpack_integer(Rdb_field_packing *const fpi, Field *const field,
567	uchar *const to, Rdb_string_reader *const reader,
568	Rdb_string_reader *const unp_reader
569	MY_ATTRIBUTE((__unused__))) const;
570
571	int unpack_double(Rdb_field_packing *const fpi MY_ATTRIBUTE((__unused__)),
572	Field *const field MY_ATTRIBUTE((__unused__)),
573	uchar *const field_ptr, Rdb_string_reader *const reader,
574	Rdb_string_reader *const unp_reader
575	MY_ATTRIBUTE((__unused__))) const;
576
577	int unpack_float(Rdb_field_packing *const fpi,
578	Field *const field MY_ATTRIBUTE((__unused__)),
579	uchar *const field_ptr, Rdb_string_reader *const reader,
580	Rdb_string_reader *const unp_reader
581	MY_ATTRIBUTE((__unused__))) const;
582
583	int unpack_binary_str(Rdb_field_packing *const fpi, Field *const field,
584	uchar *const to, Rdb_string_reader *const reader,
585	Rdb_string_reader *const unp_reader
586	MY_ATTRIBUTE((__unused__))) const;
587
588	int unpack_binary_or_utf8_varchar(
589	Rdb_field_packing *const fpi, Field *const field, uchar *dst,
590	Rdb_string_reader *const reader,
591	Rdb_string_reader *const unp_reader MY_ATTRIBUTE((__unused__))) const;
592
593	int unpack_binary_or_utf8_varchar_space_pad(
594	Rdb_field_packing *const fpi, Field *const field, uchar *dst,
595	Rdb_string_reader *const reader,
596	Rdb_string_reader *const unp_reader) const;
597
598	int unpack_newdate(Rdb_field_packing *const fpi,
599	Field *const field MY_ATTRIBUTE((__unused__)),
600	uchar *const field_ptr, Rdb_string_reader *const reader,
601	Rdb_string_reader *const unp_reader
602	MY_ATTRIBUTE((__unused__))) const;
603
604	int unpack_utf8_str(Rdb_field_packing *const fpi, Field *const field,
605	uchar dst, Rdb_string_reader const reader,
606	Rdb_string_reader *const unp_reader
607	MY_ATTRIBUTE((__unused__))) const;
608
609	int unpack_unknown_varchar(Rdb_field_packing *const fpi, Field *const field,
610	uchar dst, Rdb_string_reader const reader,
611	Rdb_string_reader *const unp_reader) const;
612
613	int unpack_simple_varchar_space_pad(
614	Rdb_field_packing *const fpi, Field *const field, uchar *dst,
615	Rdb_string_reader *const reader,
616	Rdb_string_reader *const unp_reader) const;
617
618	int unpack_simple(Rdb_field_packing *const fpi,
619	Field *const field MY_ATTRIBUTE((__unused__)),
620	uchar *const dst, Rdb_string_reader *const reader,
621	Rdb_string_reader *const unp_reader) const;
622
623	int unpack_unknown(Rdb_field_packing *const fpi, Field *const field,
624	uchar *const dst, Rdb_string_reader *const reader,
625	Rdb_string_reader *const unp_reader) const;
626
627	int unpack_floating_point(uchar *const dst, Rdb_string_reader *const reader,
628	const size_t &size, const int &exp_digit,
629	const uchar *const zero_pattern,
630	const uchar *const zero_val,
631	void (swap_func)(uchar , const uchar )) const*;
632
633	void make_unpack_simple_varchar(const Rdb_collation_codec *const codec,
634	const Field *const field,
635	Rdb_pack_field_context *const pack_ctx) const;
636
637	void make_unpack_simple(const Rdb_collation_codec *const codec,
638	const Field *const field,
639	Rdb_pack_field_context *const pack_ctx) const;
640
641	void make_unpack_unknown(
642	const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)),
643	const Field *const field, Rdb_pack_field_context *const pack_ctx) const;
644
645	void make_unpack_unknown_varchar(
646	const Rdb_collation_codec *const codec MY_ATTRIBUTE((__unused__)),
647	const Field *const field, Rdb_pack_field_context *const pack_ctx) const;
648
649	void dummy_make_unpack_info(
650	const Rdb_collation_codec *codec MY_ATTRIBUTE((__unused__)),
651	const Field *field MY_ATTRIBUTE((__unused__)),
652	Rdb_pack_field_context pack_ctx MY_ATTRIBUTE((__unused__))) const*;
653
654	int skip_max_length(const Rdb_field_packing *const fpi,
655	const Field *const field MY_ATTRIBUTE((__unused__)),
656	Rdb_string_reader *const reader) const;
657
658	int skip_variable_length(
659	const Rdb_field_packing *const fpi MY_ATTRIBUTE((__unused__)),
660	const Field *const field, Rdb_string_reader *const reader) const;
661
662	int skip_variable_space_pad(const Rdb_field_packing *const fpi,
663	const Field *const field,
664	Rdb_string_reader *const reader) const;
665
666	inline bool use_legacy_varbinary_format() const {
667	return !index_format_min_check(PRIMARY_FORMAT_VERSION_UPDATE2,
668	SECONDARY_FORMAT_VERSION_UPDATE2);
669	}
670
671	static inline bool is_unpack_data_tag(char c) {
672	return c == RDB_UNPACK_DATA_TAG \|\| c == RDB_UNPACK_COVERED_DATA_TAG;
673	}
674
675	private:
676	#ifndef DBUG_OFF
677	inline bool is_storage_available(const int &offset, const int &needed) const {
678	const int storage_length = static_cast<int>(max_storage_fmt_length());
679	return (storage_length - offset) >= needed;
680	}
681	#else
682	inline bool is_storage_available(const int &offset, const int &needed) const {
683	return `1`;
684	}
685	#endif // DBUG_OFF
686
687	/ Global number of this index (used as prefix in StorageFormat) /
688	const uint32 m_index_number;
689
690	uchar m_index_number_storage_form[INDEX_NUMBER_SIZE];
691
692	rocksdb::ColumnFamilyHandle *m_cf_handle;
693
694	void pack_legacy_variable_format(const uchar *src, size_t src_len,
695	uchar *dst) const*;
696
697	void pack_variable_format(const uchar *src, size_t src_len,
698	uchar *dst) const*;
699
700	uint calc_unpack_legacy_variable_format(uchar flag, bool done) const*;
701
702	uint calc_unpack_variable_format(uchar flag, bool done) const*;
703
704	public:
705	uint16_t m_index_dict_version;
706	uchar m_index_type;
707	/ KV format version for the index id /
708	uint16_t m_kv_format_version;
709	/ If true, the column family stores data in the reverse order /
710	bool m_is_reverse_cf;
711
712	/ If true, then column family is created per partition. /
713	bool m_is_per_partition_cf;
714
715	std::string m_name;
716	mutable Rdb_index_stats m_stats;
717
718	/*
719	Bitmap containing information about whether TTL or other special fields
720	are enabled for the given index.
721	*/
722	uint32 m_index_flags_bitmap;
723
724	/*
725	How much space in bytes the index flag fields occupy.
726	*/
727	uint32 m_total_index_flags_length;
728
729	/*
730	Offset in the records where the 8-byte TTL is stored (UINT_MAX if no TTL)
731	*/
732	uint32 m_ttl_rec_offset;
733
734	/ Default TTL duration /
735	uint64 m_ttl_duration;
736
737	/ TTL column (if defined by user, otherwise implicit TTL is used) /
738	std::string m_ttl_column;
739
740	private:
741	friend class Rdb_tbl_def; // for m_index_number above
742
743	/ Number of key parts in the primary key/
744	uint m_pk_key_parts;
745
746	/*
747	pk_part_no[X]=Y means that keypart #X of this key is key part #Y of the
748	primary key. Y==-1 means this column is not present in the primary key.
749	*/
750	uint *m_pk_part_no;
751
752	/ Array of index-part descriptors. /
753	Rdb_field_packing *m_pack_info;
754
755	uint m_keyno; / number of this index in the table /
756
757	/*
758	Number of key parts in the index (including "index extension"). This is how
759	many elements are in the m_pack_info array.
760	*/
761	uint m_key_parts;
762
763	/*
764	If TTL column is part of the PK, offset of the column within pk.
765	Default is UINT_MAX to denote that TTL col is not part of PK.
766	*/
767	uint m_ttl_pk_key_part_offset;
768
769	/*
770	Index of the TTL column in table->s->fields, if it exists.
771	Default is UINT_MAX to denote that it does not exist.
772	*/
773	uint m_ttl_field_offset;
774
775	/ Prefix extractor for the column family of the key definiton /
776	std::shared_ptr<const rocksdb::SliceTransform> m_prefix_extractor;
777
778	/ Maximum length of the mem-comparable form. /
779	uint m_maxlength;
780
781	/ mutex to protect setup /
782	mysql_mutex_t m_mutex;
783	};
784
785	// "Simple" collations (those specified in strings/ctype-simple.c) are simple
786	// because their strnxfrm function maps one byte to one byte. However, the
787	// mapping is not injective, so the inverse function will take in an extra
788	// index parameter containing information to disambiguate what the original
789	// character was.
790	//
791	// The m_enc members are for encoding. Generally, we want encoding to be:*
792	// src -> (dst, idx)
793	//
794	// Since strnxfrm already gives us dst, we just need m_enc_idx[src] to give us
795	// idx.
796	//
797	// For the inverse, we have:
798	// (dst, idx) -> src
799	//
800	// We have m_dec_idx[idx][dst] = src to get our original character back.
801	//
802	struct Rdb_collation_codec {
803	const my_core::CHARSET_INFO *m_cs;
804	// The first element unpacks VARCHAR(n), the second one - CHAR(n).
805	std::array<rdb_make_unpack_info_t, `2`> m_make_unpack_info_func;
806	std::array<rdb_index_field_unpack_t, `2`> m_unpack_func;
807
808	std::array<uchar, `256`> m_enc_idx;
809	std::array<uchar, `256`> m_enc_size;
810
811	std::array<uchar, `256`> m_dec_size;
812	std::vector<std::array<uchar, `256`>> m_dec_idx;
813	};
814
815	extern mysql_mutex_t rdb_collation_data_mutex;
816	extern mysql_mutex_t rdb_mem_cmp_space_mutex;
817	extern std::array<const Rdb_collation_codec *, MY_ALL_CHARSETS_SIZE>
818	rdb_collation_data;
819
820	class Rdb_field_packing {
821	public:
822	Rdb_field_packing(const Rdb_field_packing &) = delete;
823	Rdb_field_packing &operator=(const Rdb_field_packing &) = delete;
824	Rdb_field_packing() = default;
825
826	/ Length of mem-comparable image of the field, in bytes /
827	int m_max_image_len;
828
829	/ Length of image in the unpack data /
830	int m_unpack_data_len;
831	int m_unpack_data_offset;
832
833	bool m_maybe_null; / TRUE <=> NULL-byte is stored /
834
835	/*
836	Valid only for VARCHAR fields.
837	*/
838	const CHARSET_INFO *m_varchar_charset;
839
840	// (Valid when Variable Length Space Padded Encoding is used):
841	uint m_segment_size; // size of segment used
842
843	// number of bytes used to store number of trimmed (or added)
844	// spaces in the upack_info
845	bool m_unpack_info_uses_two_bytes;
846
847	/*
848	True implies that an index-only read is always possible for this field.
849	False means an index-only read may be possible depending on the record and
850	field type.
851	*/
852	bool m_covered;
853
854	const std::vector<uchar> *space_xfrm;
855	size_t space_xfrm_len;
856	size_t space_mb_len;
857
858	const Rdb_collation_codec *m_charset_codec;
859
860	/*
861	@return TRUE: this field makes use of unpack_info.
862	*/
863	bool uses_unpack_info() const { return (m_make_unpack_info_func != nullptr); }
864
865	/ TRUE means unpack_info stores the original field value /
866	bool m_unpack_info_stores_value;
867
868	rdb_index_field_pack_t m_pack_func;
869	rdb_make_unpack_info_t m_make_unpack_info_func;
870
871	/*
872	This function takes
873	- mem-comparable form
874	- unpack_info data
875	and restores the original value.
876	*/
877	rdb_index_field_unpack_t m_unpack_func;
878
879	/*
880	This function skips over mem-comparable form.
881	*/
882	rdb_index_field_skip_t m_skip_func;
883
884	private:
885	/*
886	Location of the field in the table (key number and key part number).
887
888	Note that this describes not the field, but rather a position of field in
889	the index. Consider an example:
890
891	col1 VARCHAR (100),
892	INDEX idx1 (col1)),
893	INDEX idx2 (col1(10)),
894
895	Here, idx2 has a special Field object that is set to describe a 10-char
896	prefix of col1.
897
898	We must also store the keynr. It is needed for implicit "extended keys".
899	Every key in MyRocks needs to include PK columns. Generally, SQL layer
900	includes PK columns as part of its "Extended Keys" feature, but sometimes
901	it does not (known examples are unique secondary indexes and partitioned
902	tables).
903	In that case, MyRocks's index descriptor has invisible suffix of PK
904	columns (and the point is that these columns are parts of PK, not parts
905	of the current index).
906	*/
907	uint m_keynr;
908	uint m_key_part;
909
910	public:
911	bool setup(const Rdb_key_def *const key_descr, const Field *const field,
912	const uint &keynr_arg, const uint &key_part_arg,
913	const uint16 &key_length);
914	Field get_field_in_table(const* TABLE *const tbl) const;
915	void fill_hidden_pk_val(uchar *dst, const* longlong &hidden_pk_id) const;
916	};
917
918	/*
919	Descriptor telling how to decode/encode a field to on-disk record storage
920	format. Not all information is in the structure yet, but eventually we
921	want to have as much as possible there to avoid virtual calls.
922
923	For encoding/decoding of index tuples, see Rdb_key_def.
924	*/
925	class Rdb_field_encoder {
926	public:
927	Rdb_field_encoder(const Rdb_field_encoder &) = delete;
928	Rdb_field_encoder &operator=(const Rdb_field_encoder &) = delete;
929	/*
930	STORE_NONE is set when a column can be decoded solely from their
931	mem-comparable form.
932	STORE_SOME is set when a column can be decoded from their mem-comparable
933	form plus unpack_info.
934	STORE_ALL is set when a column cannot be decoded, so its original value
935	must be stored in the PK records.
936	*/
937	enum STORAGE_TYPE {
938	STORE_NONE,
939	STORE_SOME,
940	STORE_ALL,
941	};
942	STORAGE_TYPE m_storage_type;
943
944	uint m_null_offset;
945	uint16 m_field_index;
946
947	uchar m_null_mask; // 0 means the field cannot be null
948
949	my_core::enum_field_types m_field_type;
950
951	uint m_pack_length_in_rec;
952
953	bool maybe_null() const { return m_null_mask != `0`; }
954
955	bool uses_variable_len_encoding() const {
956	return (m_field_type == MYSQL_TYPE_BLOB \|\|
957	m_field_type == MYSQL_TYPE_VARCHAR);
958	}
959	};
960
961	inline Field Rdb_key_def::get_table_field_for_part_no(TABLE table,
962	uint part_no) const {
963	DBUG_ASSERT(part_no < get_key_parts());
964	return m_pack_info[part_no].get_field_in_table(table);
965	}
966
967	inline bool Rdb_key_def::can_unpack(const uint &kp) const {
968	DBUG_ASSERT(kp < m_key_parts);
969	return (m_pack_info[kp].m_unpack_func != nullptr);
970	}
971
972	inline bool Rdb_key_def::has_unpack_info(const uint &kp) const {
973	DBUG_ASSERT(kp < m_key_parts);
974	return m_pack_info[kp].uses_unpack_info();
975	}
976
977	/*
978	A table definition. This is an entry in the mapping
979
980	dbname.tablename -> {index_nr, index_nr, ... }
981
982	There is only one Rdb_tbl_def object for a given table.
983	That's why we keep auto_increment value here, too.
984	*/
985
986	class Rdb_tbl_def {
987	private:
988	void check_if_is_mysql_system_table();
989
990	/ Stores 'dbname.tablename' /
991	std::string m_dbname_tablename;
992
993	/ Store the db name, table name, and partition name /
994	std::string m_dbname;
995	std::string m_tablename;
996	std::string m_partition;
997
998	void set_name(const std::string &name);
999
1000	public:
1001	Rdb_tbl_def(const Rdb_tbl_def &) = delete;
1002	Rdb_tbl_def &operator=(const Rdb_tbl_def &) = delete;
1003
1004	explicit Rdb_tbl_def(const std::string &name)
1005	: m_key_descr_arr(nullptr), m_hidden_pk_val (`0`), m_auto_incr_val (`0`) {
1006	set_name(name);
1007	}
1008
1009	Rdb_tbl_def(const char *const name, const size_t &len)
1010	: m_key_descr_arr(nullptr), m_hidden_pk_val (`0`), m_auto_incr_val (`0`) {
1011	set_name(std::string (name, len));
1012	}
1013
1014	explicit Rdb_tbl_def(const rocksdb::Slice &slice, const size_t &pos = `0`)
1015	: m_key_descr_arr(nullptr), m_hidden_pk_val (`0`), m_auto_incr_val (`0`) {
1016	set_name(std::string (slice.data() + pos, slice.size() - pos));
1017	}
1018
1019	~Rdb_tbl_def();
1020
1021	/ Number of indexes /
1022	uint m_key_count;
1023
1024	/ Array of index descriptors /
1025	std::shared_ptr<Rdb_key_def> *m_key_descr_arr;
1026
1027	std::atomic<longlong> m_hidden_pk_val;
1028	std::atomic<ulonglong> m_auto_incr_val;
1029
1030	/ Is this a system table /
1031	bool m_is_mysql_system_table;
1032
1033	bool put_dict(Rdb_dict_manager *const dict, rocksdb::WriteBatch *const batch,
1034	uchar *const key, const size_t &keylen);
1035
1036	const std::string &full_tablename() const { return m_dbname_tablename; }
1037	const std::string &base_dbname() const { return m_dbname; }
1038	const std::string &base_tablename() const { return m_tablename; }
1039	const std::string &base_partition() const { return m_partition; }
1040	GL_INDEX_ID get_autoincr_gl_index_id();
1041	};
1042
1043	/*
1044	A thread-safe sequential number generator. Its performance is not a concern
1045	hence it is ok to protect it by a mutex.
1046	*/
1047
1048	class Rdb_seq_generator {
1049	uint m_next_number = `0`;
1050
1051	mysql_mutex_t m_mutex;
1052
1053	public:
1054	Rdb_seq_generator(const Rdb_seq_generator &) = delete;
1055	Rdb_seq_generator &operator=(const Rdb_seq_generator &) = delete;
1056	Rdb_seq_generator() = default;
1057
1058	void init(const uint &initial_number) {
1059	mysql_mutex_init(`0`, &m_mutex, MY_MUTEX_INIT_FAST);
1060	m_next_number = initial_number;
1061	}
1062
1063	uint get_and_update_next_number(Rdb_dict_manager *const dict);
1064
1065	void cleanup() { mysql_mutex_destroy(&m_mutex); }
1066	};
1067
1068	interface Rdb_tables_scanner {
1069	virtual int add_table(Rdb_tbl_def * tdef) = `0`;
1070	virtual ~Rdb_tables_scanner() {} / Keep the compiler happy /
1071	};
1072
1073	/*
1074	This contains a mapping of
1075
1076	dbname.table_name -> array{Rdb_key_def}.
1077
1078	objects are shared among all threads.
1079	*/
1080
1081	class Rdb_ddl_manager {
1082	Rdb_dict_manager m_dict = nullptr*;
1083	my_core::HASH m_ddl_hash; // Contains Rdb_tbl_def elements
1084	// Maps index id to <table_name, index number>
1085	std::map<GL_INDEX_ID, std::pair<std::string, uint>> m_index_num_to_keydef;
1086
1087	// Maps index id to key definitons not yet committed to data dictionary.
1088	// This is mainly used to store key definitions during ALTER TABLE.
1089	std::map<GL_INDEX_ID, std::shared_ptr<Rdb_key_def>>
1090	m_index_num_to_uncommitted_keydef;
1091	mysql_rwlock_t m_rwlock;
1092
1093	Rdb_seq_generator m_sequence;
1094	// A queue of table stats to write into data dictionary
1095	// It is produced by event listener (ie compaction and flush threads)
1096	// and consumed by the rocksdb background thread
1097	std::map<GL_INDEX_ID, Rdb_index_stats> m_stats2store;
1098
1099	const std::shared_ptr<Rdb_key_def> &find(GL_INDEX_ID gl_index_id);
1100
1101	public:
1102	Rdb_ddl_manager(const Rdb_ddl_manager &) = delete;
1103	Rdb_ddl_manager &operator=(const Rdb_ddl_manager &) = delete;
1104	Rdb_ddl_manager() {}
1105
1106	/ Load the data dictionary from on-disk storage /
1107	bool init(Rdb_dict_manager *const dict_arg, Rdb_cf_manager *const cf_manager,
1108	const uint32_t &validate_tables);
1109
1110	void cleanup();
1111
1112	Rdb_tbl_def find(const* std::string &table_name, const bool &lock = true);
1113	std::shared_ptr<const Rdb_key_def> safe_find(GL_INDEX_ID gl_index_id);
1114	void set_stats(const std::unordered_map<GL_INDEX_ID, Rdb_index_stats> &stats);
1115	void adjust_stats(const std::vector<Rdb_index_stats> &new_data,
1116	const std::vector<Rdb_index_stats> &deleted_data =
1117	std::vector<Rdb_index_stats>());
1118	void persist_stats(const bool &sync = false);
1119
1120	/ Modify the mapping and write it to on-disk storage /
1121	int put_and_write(Rdb_tbl_def *const key_descr,
1122	rocksdb::WriteBatch *const batch);
1123	void remove(Rdb_tbl_def *const rec, rocksdb::WriteBatch *const batch,
1124	const bool &lock = true);
1125	bool rename(const std::string &from, const std::string &to,
1126	rocksdb::WriteBatch *const batch);
1127
1128	uint get_and_update_next_number(Rdb_dict_manager *const dict) {
1129	return m_sequence.get_and_update_next_number(dict);
1130	}
1131
1132	const std::string safe_get_table_name(const GL_INDEX_ID &gl_index_id);
1133
1134	/ Walk the data dictionary /
1135	int scan_for_tables(Rdb_tables_scanner *tables_scanner);
1136
1137	void erase_index_num(const GL_INDEX_ID &gl_index_id);
1138	void add_uncommitted_keydefs(
1139	const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);
1140	void remove_uncommitted_keydefs(
1141	const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes);
1142
1143	private:
1144	/ Put the data into in-memory table (only) /
1145	int put(Rdb_tbl_def *const key_descr, const bool &lock = true);
1146
1147	/ Helper functions to be passed to my_core::HASH object /
1148	static const uchar get_hash_key(Rdb_tbl_def const rec, size_t *const length,
1149	my_bool not_used MY_ATTRIBUTE((unused)));
1150	static void free_hash_elem(void *const data);
1151
1152	bool validate_schemas();
1153
1154	bool validate_auto_incr();
1155	};
1156
1157	/*
1158	Writing binlog information into RocksDB at commit(),
1159	and retrieving binlog information at crash recovery.
1160	commit() and recovery are always executed by at most single client
1161	at the same time, so concurrency control is not needed.
1162
1163	Binlog info is stored in RocksDB as the following.
1164	key: BINLOG_INFO_INDEX_NUMBER
1165	value: packed single row:
1166	binlog_name_length (2 byte form)
1167	binlog_name
1168	binlog_position (4 byte form)
1169	binlog_gtid_length (2 byte form)
1170	binlog_gtid
1171	*/
1172	class Rdb_binlog_manager {
1173	public:
1174	Rdb_binlog_manager(const Rdb_binlog_manager &) = delete;
1175	Rdb_binlog_manager &operator=(const Rdb_binlog_manager &) = delete;
1176	Rdb_binlog_manager() = default;
1177
1178	bool init(Rdb_dict_manager *const dict);
1179	void cleanup();
1180	void update(const char *const binlog_name, const my_off_t binlog_pos,
1181	rocksdb::WriteBatchBase *const batch);
1182	bool read(char *const binlog_name, my_off_t *const binlog_pos,
1183	char *const binlog_gtid) const;
1184	void update_slave_gtid_info(const uint &id, const char *const db,
1185	const char *const gtid,
1186	rocksdb::WriteBatchBase *const write_batch);
1187
1188	private:
1189	Rdb_dict_manager m_dict = nullptr*;
1190	uchar m_key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {`0`};
1191	rocksdb::Slice m_key_slice;
1192
1193	rocksdb::Slice pack_value(uchar *const buf, const char *const binlog_name,
1194	const my_off_t &binlog_pos,
1195	const char *const binlog_gtid) const;
1196	bool unpack_value(const uchar *const value, size_t value_size,
1197	char *const binlog_name,
1198	my_off_t *const binlog_pos, char *const binlog_gtid) const;
1199
1200	std::atomic<Rdb_tbl_def *> m_slave_gtid_info_tbl;
1201	};
1202
1203	/*
1204	Rdb_dict_manager manages how MySQL on RocksDB (MyRocks) stores its
1205	internal data dictionary.
1206	MyRocks stores data dictionary on dedicated system column family
1207	named __system__. The system column family is used by MyRocks
1208	internally only, and not used by applications.
1209
1210	Currently MyRocks has the following data dictionary data models.
1211
1212	1. Table Name => internal index id mappings
1213	key: Rdb_key_def::DDL_ENTRY_INDEX_START_NUMBER(0x1) + dbname.tablename
1214	value: version, {cf_id, index_id}n_indexes_of_the_table*
1215	version is 2 bytes. cf_id and index_id are 4 bytes.
1216
1217	2. internal cf_id, index id => index information
1218	key: Rdb_key_def::INDEX_INFO(0x2) + cf_id + index_id
1219	value: version, index_type, kv_format_version, index_flags, ttl_duration
1220	index_type is 1 byte, version and kv_format_version are 2 bytes.
1221	index_flags is 4 bytes.
1222	ttl_duration is 8 bytes.
1223
1224	3. CF id => CF flags
1225	key: Rdb_key_def::CF_DEFINITION(0x3) + cf_id
1226	value: version, {is_reverse_cf, is_auto_cf (deprecated), is_per_partition_cf}
1227	cf_flags is 4 bytes in total.
1228
1229	4. Binlog entry (updated at commit)
1230	key: Rdb_key_def::BINLOG_INFO_INDEX_NUMBER (0x4)
1231	value: version, {binlog_name,binlog_pos,binlog_gtid}
1232
1233	5. Ongoing drop index entry
1234	key: Rdb_key_def::DDL_DROP_INDEX_ONGOING(0x5) + cf_id + index_id
1235	value: version
1236
1237	6. index stats
1238	key: Rdb_key_def::INDEX_STATISTICS(0x6) + cf_id + index_id
1239	value: version, {materialized PropertiesCollector::IndexStats}
1240
1241	7. maximum index id
1242	key: Rdb_key_def::MAX_INDEX_ID(0x7)
1243	value: index_id
1244	index_id is 4 bytes
1245
1246	8. Ongoing create index entry
1247	key: Rdb_key_def::DDL_CREATE_INDEX_ONGOING(0x8) + cf_id + index_id
1248	value: version
1249
1250	9. auto_increment values
1251	key: Rdb_key_def::AUTO_INC(0x9) + cf_id + index_id
1252	value: version, {max auto_increment so far}
1253	max auto_increment is 8 bytes
1254
1255	Data dictionary operations are atomic inside RocksDB. For example,
1256	when creating a table with two indexes, it is necessary to call Put
1257	three times. They have to be atomic. Rdb_dict_manager has a wrapper function
1258	begin() and commit() to make it easier to do atomic operations.
1259
1260	*/
1261	class Rdb_dict_manager {
1262	private:
1263	mysql_mutex_t m_mutex;
1264	rocksdb::DB m_db = nullptr*;
1265	rocksdb::ColumnFamilyHandle m_system_cfh = nullptr*;
1266	/ Utility to put INDEX_INFO and CF_DEFINITION /
1267
1268	uchar m_key_buf_max_index_id[Rdb_key_def::INDEX_NUMBER_SIZE] = {`0`};
1269	rocksdb::Slice m_key_slice_max_index_id;
1270
1271	static void dump_index_id(uchar *const netbuf,
1272	Rdb_key_def::DATA_DICT_TYPE dict_type,
1273	const GL_INDEX_ID &gl_index_id);
1274	void delete_with_prefix(rocksdb::WriteBatch *const batch,
1275	Rdb_key_def::DATA_DICT_TYPE dict_type,
1276	const GL_INDEX_ID &gl_index_id) const;
1277	/ Functions for fast DROP TABLE/INDEX /
1278	void resume_drop_indexes() const;
1279	void log_start_drop_table(const std::shared_ptr<Rdb_key_def> *const key_descr,
1280	const uint32 &n_keys,
1281	const char *const log_action) const;
1282	void log_start_drop_index(GL_INDEX_ID gl_index_id,
1283	const char log_action) const*;
1284
1285	public:
1286	Rdb_dict_manager(const Rdb_dict_manager &) = delete;
1287	Rdb_dict_manager &operator=(const Rdb_dict_manager &) = delete;
1288	Rdb_dict_manager() = default;
1289
1290	bool init(rocksdb::DB *const rdb_dict, Rdb_cf_manager *const cf_manager);
1291
1292	inline void cleanup() { mysql_mutex_destroy(&m_mutex); }
1293
1294	inline void lock() { RDB_MUTEX_LOCK_CHECK(m_mutex); }
1295
1296	inline void unlock() { RDB_MUTEX_UNLOCK_CHECK(m_mutex); }
1297
1298	inline rocksdb::ColumnFamilyHandle get_system_cf() const* {
1299	return m_system_cfh;
1300	}
1301
1302	/ Raw RocksDB operations /
1303	std::unique_ptr<rocksdb::WriteBatch> begin() const;
1304	int commit(rocksdb::WriteBatch *const batch, const bool &sync = true) const;
1305	rocksdb::Status get_value(const rocksdb::Slice &key,
1306	std::string *const value) const;
1307	void put_key(rocksdb::WriteBatchBase *const batch, const rocksdb::Slice &key,
1308	const rocksdb::Slice &value) const;
1309	void delete_key(rocksdb::WriteBatchBase *batch,
1310	const rocksdb::Slice &key) const;
1311	rocksdb::Iterator new_iterator() const*;
1312
1313	/ Internal Index id => CF /
1314	void
1315	add_or_update_index_cf_mapping(rocksdb::WriteBatch *batch,
1316	struct Rdb_index_info *const index_info) const;
1317	void delete_index_info(rocksdb::WriteBatch *batch,
1318	const GL_INDEX_ID &index_id) const;
1319	bool get_index_info(const GL_INDEX_ID &gl_index_id,
1320	struct Rdb_index_info *const index_info) const;
1321
1322	/ CF id => CF flags /
1323	void add_cf_flags(rocksdb::WriteBatch *const batch, const uint &cf_id,
1324	const uint &cf_flags) const;
1325	bool get_cf_flags(const uint &cf_id, uint *const cf_flags) const;
1326
1327	/ Functions for fast CREATE/DROP TABLE/INDEX /
1328	void
1329	get_ongoing_index_operation(std::unordered_set<GL_INDEX_ID> *gl_index_ids,
1330	Rdb_key_def::DATA_DICT_TYPE dd_type) const;
1331	bool is_index_operation_ongoing(const GL_INDEX_ID &gl_index_id,
1332	Rdb_key_def::DATA_DICT_TYPE dd_type) const;
1333	void start_ongoing_index_operation(rocksdb::WriteBatch *batch,
1334	const GL_INDEX_ID &gl_index_id,
1335	Rdb_key_def::DATA_DICT_TYPE dd_type) const;
1336	void end_ongoing_index_operation(rocksdb::WriteBatch *const batch,
1337	const GL_INDEX_ID &gl_index_id,
1338	Rdb_key_def::DATA_DICT_TYPE dd_type) const;
1339	bool is_drop_index_empty() const;
1340	void add_drop_table(std::shared_ptr<Rdb_key_def> *const key_descr,
1341	const uint32 &n_keys,
1342	rocksdb::WriteBatch *const batch) const;
1343	void add_drop_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
1344	rocksdb::WriteBatch *const batch) const;
1345	void add_create_index(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
1346	rocksdb::WriteBatch *const batch) const;
1347	void
1348	finish_indexes_operation(const std::unordered_set<GL_INDEX_ID> &gl_index_ids,
1349	Rdb_key_def::DATA_DICT_TYPE dd_type) const;
1350	void rollback_ongoing_index_creation() const;
1351
1352	inline void get_ongoing_drop_indexes(
1353	std::unordered_set<GL_INDEX_ID> gl_index_ids) const* {
1354	get_ongoing_index_operation(gl_index_ids,
1355	Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1356	}
1357	inline void get_ongoing_create_indexes(
1358	std::unordered_set<GL_INDEX_ID> gl_index_ids) const* {
1359	get_ongoing_index_operation(gl_index_ids,
1360	Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1361	}
1362	inline void start_drop_index(rocksdb::WriteBatch *wb,
1363	const GL_INDEX_ID &gl_index_id) const {
1364	start_ongoing_index_operation(wb, gl_index_id,
1365	Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1366	}
1367	inline void start_create_index(rocksdb::WriteBatch *wb,
1368	const GL_INDEX_ID &gl_index_id) const {
1369	start_ongoing_index_operation(wb, gl_index_id,
1370	Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1371	}
1372	inline void finish_drop_indexes(
1373	const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
1374	finish_indexes_operation(gl_index_ids, Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1375	}
1376	inline void finish_create_indexes(
1377	const std::unordered_set<GL_INDEX_ID> &gl_index_ids) const {
1378	finish_indexes_operation(gl_index_ids,
1379	Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1380	}
1381	inline bool is_drop_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
1382	return is_index_operation_ongoing(gl_index_id,
1383	Rdb_key_def::DDL_DROP_INDEX_ONGOING);
1384	}
1385	inline bool is_create_index_ongoing(const GL_INDEX_ID &gl_index_id) const {
1386	return is_index_operation_ongoing(gl_index_id,
1387	Rdb_key_def::DDL_CREATE_INDEX_ONGOING);
1388	}
1389
1390	bool get_max_index_id(uint32_t *const index_id) const;
1391	bool update_max_index_id(rocksdb::WriteBatch *const batch,
1392	const uint32_t &index_id) const;
1393	void add_stats(rocksdb::WriteBatch *const batch,
1394	const std::vector<Rdb_index_stats> &stats) const;
1395	Rdb_index_stats get_stats(GL_INDEX_ID gl_index_id) const;
1396
1397	rocksdb::Status put_auto_incr_val(rocksdb::WriteBatchBase *batch,
1398	const GL_INDEX_ID &gl_index_id,
1399	ulonglong val,
1400	bool overwrite = false) const;
1401	bool get_auto_incr_val(const GL_INDEX_ID &gl_index_id,
1402	ulonglong new_val) const*;
1403	};
1404
1405	struct Rdb_index_info {
1406	GL_INDEX_ID m_gl_index_id;
1407	uint16_t m_index_dict_version = `0`;
1408	uchar m_index_type = `0`;
1409	uint16_t m_kv_version = `0`;
1410	uint32 m_index_flags = `0`;
1411	uint64 m_ttl_duration = `0`;
1412	};
1413
1414	/*
1415	@brief
1416	Merge Operator for the auto_increment value in the system_cf
1417
1418	@detail
1419	This class implements the rocksdb Merge Operator for auto_increment values
1420	that are stored to the data dictionary every transaction.
1421
1422	The actual Merge function is triggered on compaction, memtable flushes, or
1423	when get() is called on the same key.
1424
1425	*/
1426	class Rdb_system_merge_op : public rocksdb::AssociativeMergeOperator {
1427	public:
1428	/*
1429	Updates the new value associated with a key to be the maximum of the
1430	passed in value and the existing value.
1431
1432	@param[IN] key
1433	@param[IN] existing_value existing value for a key; nullptr if nonexistent
1434	key
1435	@param[IN] value
1436	@param[OUT] new_value new value after Merge
1437	@param[IN] logger
1438	*/
1439	bool Merge(const rocksdb::Slice &key, const rocksdb::Slice *existing_value,
1440	const rocksdb::Slice &value, std::string *new_value,
1441	rocksdb::Logger logger) const* override {
1442	DBUG_ASSERT(new_value != nullptr);
1443
1444	if (key.size() != Rdb_key_def::INDEX_NUMBER_SIZE * `3` \|\|
1445	GetKeyType(key) != Rdb_key_def::AUTO_INC \|\|
1446	value.size() !=
1447	RDB_SIZEOF_AUTO_INCREMENT_VERSION + ROCKSDB_SIZEOF_AUTOINC_VALUE \|\|
1448	GetVersion(value) > Rdb_key_def::AUTO_INCREMENT_VERSION) {
1449	abort();
1450	}
1451
1452	uint64_t merged_value = Deserialize(value);
1453
1454	if (existing_value != nullptr) {
1455	if (existing_value->size() != RDB_SIZEOF_AUTO_INCREMENT_VERSION +
1456	ROCKSDB_SIZEOF_AUTOINC_VALUE \|\|
1457	GetVersion(*existing_value) > Rdb_key_def::AUTO_INCREMENT_VERSION) {
1458	abort();
1459	}
1460
1461	merged_value = std::max(merged_value, Deserialize(*existing_value));
1462	}
1463	Serialize(merged_value, new_value);
1464	return true;
1465	}
1466
1467	virtual const char Name() const* override { return "Rdb_system_merge_op"; }
1468
1469	private:
1470	/*
1471	Serializes the integer data to the new_value buffer or the target buffer
1472	the merge operator will update to
1473	*/
1474	void Serialize(const uint64_t data, std::string new_value) const* {
1475	uchar value_buf[RDB_SIZEOF_AUTO_INCREMENT_VERSION +
1476	ROCKSDB_SIZEOF_AUTOINC_VALUE] = {`0`};
1477	uchar *ptr = value_buf;
1478	/ fill in the auto increment version /
1479	rdb_netbuf_store_uint16(ptr, Rdb_key_def::AUTO_INCREMENT_VERSION);
1480	ptr += RDB_SIZEOF_AUTO_INCREMENT_VERSION;
1481	/ fill in the auto increment value /
1482	rdb_netbuf_store_uint64(ptr, data);
1483	ptr += ROCKSDB_SIZEOF_AUTOINC_VALUE;
1484	new_value->assign(reinterpret_cast<char *>(value_buf), ptr - value_buf);
1485	}
1486
1487	/*
1488	Gets the value of auto_increment type in the data dictionary from the
1489	value slice
1490
1491	@Note Only to be used on data dictionary keys for the auto_increment type
1492	*/
1493	uint64_t Deserialize(const rocksdb::Slice &s) const {
1494	return rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(s.data()) +
1495	RDB_SIZEOF_AUTO_INCREMENT_VERSION);
1496	}
1497
1498	/*
1499	Gets the type of the key of the key in the data dictionary.
1500
1501	@Note Only to be used on data dictionary keys for the auto_increment type
1502	*/
1503	uint16_t GetKeyType(const rocksdb::Slice &s) const {
1504	return rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(s.data()));
1505	}
1506
1507	/*
1508	Gets the version of the auto_increment value in the data dictionary.
1509
1510	@Note Only to be used on data dictionary value for the auto_increment type
1511	*/
1512	uint16_t GetVersion(const rocksdb::Slice &s) const {
1513	return rdb_netbuf_to_uint16(reinterpret_cast<const uchar *>(s.data()));
1514	}
1515	};
1516
1517	bool rdb_is_collation_supported(const my_core::CHARSET_INFO *const cs);
1518
1519	} // namespace myrocks
1520

Browse the source code of MariaDB/storage/rocksdb/rdb_datadic.h