1/*
2 Copyright (c) 2012,2013 Monty Program Ab
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
16#pragma once
17
18#ifdef USE_PRAGMA_INTERFACE
19#pragma interface /* gcc class implementation */
20#endif
21
22/* C++ standard header files */
23#include <set>
24#include <string>
25#include <unordered_map>
26#include <unordered_set>
27#include <vector>
28
29/* MySQL header files */
30#include "./handler.h" /* handler */
31#include "./my_global.h" /* ulonglong */
32#include "./sql_string.h"
33
34#ifdef _WIN32
35#undef pthread_key_create
36#undef pthread_key_delete
37#undef pthread_setspecific
38#undef pthread_getspecific
39#endif
40
41
42/* RocksDB header files */
43#include "rocksdb/cache.h"
44#include "rocksdb/merge_operator.h"
45#include "rocksdb/perf_context.h"
46#include "rocksdb/sst_file_manager.h"
47#include "rocksdb/statistics.h"
48#include "rocksdb/utilities/options_util.h"
49#include "rocksdb/utilities/transaction_db.h"
50#include "rocksdb/utilities/write_batch_with_index.h"
51
52/* MyRocks header files */
53#include "./rdb_comparator.h"
54#include "./rdb_index_merge.h"
55#include "./rdb_io_watchdog.h"
56#include "./rdb_perf_context.h"
57#include "./rdb_sst_info.h"
58#include "./rdb_utils.h"
59#include "./ut0counter.h"
60
61class Field_blob;
62class Field_varstring;
63/**
64 @note MyRocks Coding Conventions:
65 MyRocks code follows the baseline MySQL coding conventions, available at
66 http://dev.mysql.com/doc/internals/en/coding-guidelines.html, with several
67 refinements (@see /storage/rocksdb/README file).
68*/
69
70namespace myrocks {
71
/*
 * Plain data record used for exporting transaction information for
 * information_schema.rocksdb_trx.
 *
 * NOTE(review): is_replication, skip_trx_api, read_only and deadlock_detect
 * are declared as int although their names read like boolean flags -- confirm
 * against the code that fills this struct before changing the types.
 */
struct Rdb_trx_info {
  std::string name;
  ulonglong trx_id;
  ulonglong write_count;
  ulonglong lock_count;
  int timeout_sec;
  std::string state;
  std::string waiting_key;
  ulonglong waiting_cf_id;
  int is_replication;
  int skip_trx_api;
  int read_only;
  int deadlock_detect;
  int num_ongoing_bulk_load;
  ulong thread_id;
  std::string query_str;
};

/*
 * Returns a vector of Rdb_trx_info records by value. Defined outside this
 * header; presumably one record per known transaction -- confirm in the
 * implementation.
 */
std::vector<Rdb_trx_info> rdb_get_all_trx_info();
95
/*
 * Plain data record used for exporting deadlock transaction information for
 * information_schema.rocksdb_deadlock.
 *
 * A record consists of a list of per-transaction entries (path) plus the id
 * stored in victim_trx_id.
 */
struct Rdb_deadlock_info {
  /* One entry of the deadlock path. */
  struct Rdb_dl_trx_info {
    ulonglong trx_id;
    std::string cf_name;
    std::string waiting_key;
    bool exclusive_lock;
    std::string index_name;
    std::string table_name;
  };
  std::vector <Rdb_dl_trx_info> path;
  ulonglong victim_trx_id;
};

/*
 * Returns a vector of Rdb_deadlock_info records by value. Defined outside
 * this header.
 */
std::vector<Rdb_deadlock_info> rdb_get_deadlock_info();
114
/*
  This is
  - the name of the default Column Family (the CF which stores indexes which
    didn't explicitly specify which CF they are in)
  - the name used to set the default column family parameter for per-cf
    arguments.

  Only declared here; the object is defined elsewhere.
*/
extern const std::string DEFAULT_CF_NAME;

/*
  This is the name of the Column Family used for storing the data dictionary.
  Only declared here; the object is defined elsewhere.
*/
extern const std::string DEFAULT_SYSTEM_CF_NAME;

/*
  This is the name of the hidden primary key for tables with no pk.
*/
const char *const HIDDEN_PK_NAME = "HIDDEN_PK_ID";

/*
  Column family name which means "put this index into its own column family".
  DEPRECATED!!!
  Only declared here; the object is defined elsewhere.
*/
extern const std::string PER_INDEX_CF_NAME;

/*
  Name for the background thread.
*/
const char *const BG_THREAD_NAME = "myrocks-bg";

/*
  Name for the drop index thread.
*/
const char *const INDEX_THREAD_NAME = "myrocks-index";

/*
  Separator between partition name and the qualifier. Sample usage:

  - p0_cfname=foo
  - p3_ttl_col=bar
*/
const char RDB_PER_PARTITION_QUALIFIER_NAME_SEP = '_';

/*
  Separator between qualifier name and value. Sample usage:

  - p0_cfname=foo
  - p3_ttl_col=bar
*/
const char RDB_QUALIFIER_VALUE_SEP = '=';

/*
  Separator between multiple qualifier assignments. Sample usage:

  - p0_cfname=foo;p1_cfname=bar;p2_cfname=baz
*/
const char RDB_QUALIFIER_SEP = ';';

/*
  Qualifier name for a custom per partition column family.
*/
const char *const RDB_CF_NAME_QUALIFIER = "cfname";

/*
  Qualifier name for a custom per partition ttl duration.
*/
const char *const RDB_TTL_DURATION_QUALIFIER = "ttl_duration";

/*
  Qualifier name for a custom per partition ttl column.
*/
const char *const RDB_TTL_COL_QUALIFIER = "ttl_col";
187
/*
  Default, minimal valid, and maximum valid sampling rate values when collecting
  statistics about table.
*/
#define RDB_DEFAULT_TBL_STATS_SAMPLE_PCT 10
#define RDB_TBL_STATS_SAMPLE_PCT_MIN 1
#define RDB_TBL_STATS_SAMPLE_PCT_MAX 100

/*
  Default and maximum values for rocksdb-compaction-sequential-deletes and
  rocksdb-compaction-sequential-deletes-window to add basic boundary checking.
*/
#define DEFAULT_COMPACTION_SEQUENTIAL_DELETES 0
#define MAX_COMPACTION_SEQUENTIAL_DELETES 2000000

#define DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW 0
#define MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW 2000000

/*
  Default and maximum values for various compaction and flushing related
  options. Numbers are based on the hardware we currently use and our internal
  benchmarks which indicate that parallelization helps with the speed of
  compactions.

  Ideally of course we'll use heuristic technique to determine the number of
  CPU-s and derive the values from there. This however has its own set of
  problems and we'll choose simplicity for now.
*/
#define MAX_BACKGROUND_JOBS 64

#define DEFAULT_SUBCOMPACTIONS 1
#define MAX_SUBCOMPACTIONS 64

/*
  Default value for rocksdb_sst_mgr_rate_bytes_per_sec = 0 (disabled).
*/
#define DEFAULT_SST_MGR_RATE_BYTES_PER_SEC 0

/*
  Defines the field sizes for serializing XID object to a string representation.
  string byte format: [field_size: field_value, ...]
  [
    8: XID.formatID,
    1: XID.gtrid_length,
    1: XID.bqual_length,
    XID.gtrid_length + XID.bqual_length: XID.data
  ]

  RDB_XIDHDR_LEN is the size of the fixed-length prefix, i.e. the sum of the
  three field sizes that precede XID.data.
*/
#define RDB_FORMATID_SZ 8
#define RDB_GTRID_SZ 1
#define RDB_BQUAL_SZ 1
#define RDB_XIDHDR_LEN (RDB_FORMATID_SZ + RDB_GTRID_SZ + RDB_BQUAL_SZ)
240
/*
  collations, used in MariaRocks.

  Each enumerator is given an explicit numeric value.
  NOTE(review): the values look like server collation ids -- confirm against
  the server's collation table before adding new entries.
*/
enum collations_used {
  COLLATION_UTF8MB4_BIN = 46,
  COLLATION_LATIN1_BIN = 47,
  COLLATION_UTF16LE_BIN = 55,
  COLLATION_UTF32_BIN = 61,
  COLLATION_UTF16_BIN = 62,
  COLLATION_BINARY = 63,
  COLLATION_UTF8_BIN = 83
};
251
/*
  To fix an unhandled exception we specify the upper bound as LONGLONG_MAX
  instead of ULONGLONG_MAX because the latter is -1 and causes an exception when
  cast to jlong (signed) of JNI

  The reason behind the cast issue is the lack of unsigned int support in Java.

  Note that the macro expands to a static_cast expression, so it can only be
  used where an expression is allowed (not in #if directives).
*/
#define MAX_RATE_LIMITER_BYTES_PER_SEC static_cast<uint64_t>(LONGLONG_MAX)

/*
  Hidden PK column (for tables with no primary key) is a longlong (aka 8 bytes).
  static_assert() in code will validate this assumption.
*/
#define ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN sizeof(longlong)

/*
  Bytes used to store TTL, in the beginning of all records for tables with TTL
  enabled.
*/
#define ROCKSDB_SIZEOF_TTL_RECORD sizeof(longlong)

/*
  sizeof(longlong), like the two macros above.
  NOTE(review): presumably the number of bytes used for a stored
  auto-increment value -- confirm at the use sites.
*/
#define ROCKSDB_SIZEOF_AUTOINC_VALUE sizeof(longlong)

/*
  Maximum index prefix length in bytes.
*/
#define MAX_INDEX_COL_LEN_LARGE 3072
#define MAX_INDEX_COL_LEN_SMALL 767
280
/*
  MyRocks specific error codes. NB! Please make sure that you will update
  HA_ERR_ROCKSDB_LAST when adding new ones. Also update the strings in
  rdb_error_messages to include any new error messages.

  Every code is expressed as an offset from HA_ERR_ROCKSDB_FIRST so that the
  range [HA_ERR_ROCKSDB_FIRST, HA_ERR_ROCKSDB_LAST] is contiguous and each
  code is unique. The RocksDB status codes used to be written as
  (HA_ERR_LAST + N); because HA_ERR_ROCKSDB_FIRST is (HA_ERR_LAST + 1) this
  made HA_ERR_ROCKSDB_STATUS_NOT_FOUND collide with
  HA_ERR_ROCKSDB_MERGE_FILE_ERR.

  NOTE(review): rdb_error_messages lives outside this header; it must provide
  exactly one string per code in the range above, in the same order.
*/
#define HA_ERR_ROCKSDB_FIRST (HA_ERR_LAST + 1)
#define HA_ERR_ROCKSDB_PK_REQUIRED (HA_ERR_ROCKSDB_FIRST + 0)
#define HA_ERR_ROCKSDB_TABLE_DATA_DIRECTORY_NOT_SUPPORTED                      \
  (HA_ERR_ROCKSDB_FIRST + 1)
#define HA_ERR_ROCKSDB_TABLE_INDEX_DIRECTORY_NOT_SUPPORTED                     \
  (HA_ERR_ROCKSDB_FIRST + 2)
#define HA_ERR_ROCKSDB_COMMIT_FAILED (HA_ERR_ROCKSDB_FIRST + 3)
#define HA_ERR_ROCKSDB_BULK_LOAD (HA_ERR_ROCKSDB_FIRST + 4)
#define HA_ERR_ROCKSDB_CORRUPT_DATA (HA_ERR_ROCKSDB_FIRST + 5)
#define HA_ERR_ROCKSDB_CHECKSUM_MISMATCH (HA_ERR_ROCKSDB_FIRST + 6)
#define HA_ERR_ROCKSDB_INVALID_TABLE (HA_ERR_ROCKSDB_FIRST + 7)
#define HA_ERR_ROCKSDB_PROPERTIES (HA_ERR_ROCKSDB_FIRST + 8)
#define HA_ERR_ROCKSDB_MERGE_FILE_ERR (HA_ERR_ROCKSDB_FIRST + 9)
/*
  Each error code below maps to a RocksDB status code found in:
  rocksdb/include/rocksdb/status.h
*/
#define HA_ERR_ROCKSDB_STATUS_NOT_FOUND (HA_ERR_ROCKSDB_FIRST + 10)
#define HA_ERR_ROCKSDB_STATUS_CORRUPTION (HA_ERR_ROCKSDB_FIRST + 11)
#define HA_ERR_ROCKSDB_STATUS_NOT_SUPPORTED (HA_ERR_ROCKSDB_FIRST + 12)
#define HA_ERR_ROCKSDB_STATUS_INVALID_ARGUMENT (HA_ERR_ROCKSDB_FIRST + 13)
#define HA_ERR_ROCKSDB_STATUS_IO_ERROR (HA_ERR_ROCKSDB_FIRST + 14)
#define HA_ERR_ROCKSDB_STATUS_NO_SPACE (HA_ERR_ROCKSDB_FIRST + 15)
#define HA_ERR_ROCKSDB_STATUS_MERGE_IN_PROGRESS (HA_ERR_ROCKSDB_FIRST + 16)
#define HA_ERR_ROCKSDB_STATUS_INCOMPLETE (HA_ERR_ROCKSDB_FIRST + 17)
#define HA_ERR_ROCKSDB_STATUS_SHUTDOWN_IN_PROGRESS (HA_ERR_ROCKSDB_FIRST + 18)
#define HA_ERR_ROCKSDB_STATUS_TIMED_OUT (HA_ERR_ROCKSDB_FIRST + 19)
#define HA_ERR_ROCKSDB_STATUS_ABORTED (HA_ERR_ROCKSDB_FIRST + 20)
#define HA_ERR_ROCKSDB_STATUS_LOCK_LIMIT (HA_ERR_ROCKSDB_FIRST + 21)
#define HA_ERR_ROCKSDB_STATUS_BUSY (HA_ERR_ROCKSDB_FIRST + 22)
#define HA_ERR_ROCKSDB_STATUS_DEADLOCK (HA_ERR_ROCKSDB_FIRST + 23)
#define HA_ERR_ROCKSDB_STATUS_EXPIRED (HA_ERR_ROCKSDB_FIRST + 24)
#define HA_ERR_ROCKSDB_STATUS_TRY_AGAIN (HA_ERR_ROCKSDB_FIRST + 25)
#define HA_ERR_ROCKSDB_LAST HA_ERR_ROCKSDB_STATUS_TRY_AGAIN
320
/**
  @brief
  Rdb_table_handler is a reference-counted structure storing information for
  each open table. All the objects are stored in a global hash map.

  The struct itself only holds the state; the code that maintains
  m_ref_count and the global map is outside this header.

  //TODO: join this with Rdb_tbl_def ?
*/
struct Rdb_table_handler {
  char *m_table_name;
  uint m_table_name_length;
  int m_ref_count;
  atomic_stat<int> m_lock_wait_timeout_counter;
  atomic_stat<int> m_deadlock_counter;

  my_core::THR_LOCK m_thr_lock; ///< MySQL latch needed by m_db_lock

  /* Stores cumulative table statistics */
  my_io_perf_atomic_t m_io_perf_read;
  my_io_perf_atomic_t m_io_perf_write;
  Rdb_atomic_perf_counters m_table_perf_context;

  /* Stores cached memtable estimate statistics */
  // NOTE(review): m_mtcache_lock presumably guards the three m_mtcache_*
  // values below -- confirm at the use sites.
  std::atomic_uint m_mtcache_lock;
  uint64_t m_mtcache_count;
  uint64_t m_mtcache_size;
  uint64_t m_mtcache_last_update;
};
348
/* Forward declarations of MyRocks types that are defined in other headers. */
class Rdb_key_def;
class Rdb_tbl_def;
class Rdb_transaction;
class Rdb_transaction_impl;
class Rdb_writebatch_impl;
class Rdb_field_encoder;

/* Storage engine name; this is what ha_rocksdb::table_type() returns. */
const char *const rocksdb_hton_name = "ROCKSDB";
357
/*
  Identifies an index by the pair (cf_id, index_id).

  Two ids are equal when both members match. Ordering is lexicographic:
  cf_id is compared first and index_id breaks ties. Only operator== and
  operator< contain real logic; the remaining comparisons are derived from
  them.
*/
typedef struct _gl_index_id_s {
  uint32_t cf_id;
  uint32_t index_id;

  bool operator==(const struct _gl_index_id_s &rhs) const {
    if (cf_id != rhs.cf_id) {
      return false;
    }
    return index_id == rhs.index_id;
  }

  bool operator!=(const struct _gl_index_id_s &rhs) const {
    return !(*this == rhs);
  }

  bool operator<(const struct _gl_index_id_s &rhs) const {
    if (cf_id != rhs.cf_id) {
      return cf_id < rhs.cf_id;
    }
    return index_id < rhs.index_id;
  }

  /* a <= b  <=>  !(b < a) */
  bool operator<=(const struct _gl_index_id_s &rhs) const {
    return !(rhs < *this);
  }

  /* a > b  <=>  b < a */
  bool operator>(const struct _gl_index_id_s &rhs) const {
    return rhs < *this;
  }

  /* a >= b  <=>  !(a < b) */
  bool operator>=(const struct _gl_index_id_s &rhs) const {
    return !(*this < rhs);
  }
} GL_INDEX_ID;
384
/*
  Kinds of row operations that are counted. The enumerators are consecutive,
  starting at 0, so they can be used as array indexes.

  ROWS_MAX is not an operation: it is the number of operation kinds and is
  used as the array size in st_global_stats. Keep it last.
*/
enum operation_type : int {
  ROWS_DELETED = 0,
  ROWS_INSERTED,
  ROWS_READ,
  ROWS_UPDATED,
  ROWS_DELETED_BLIND,
  ROWS_EXPIRED,
  ROWS_FILTERED,
  ROWS_HIDDEN_NO_SNAPSHOT,
  ROWS_MAX
};
396
/*
  Kinds of queries that are counted. QUERIES_MAX is the number of kinds (used
  as the array size of st_global_stats::queries), not a query kind itself.
*/
enum query_type : int { QUERIES_POINT = 0, QUERIES_RANGE, QUERIES_MAX };
398
/*
  Indexer policy passed as the third template argument of the ib_counter_t
  counters below: get_sched_indexer_t when HAVE_SCHED_GETCPU is defined,
  thread_id_indexer_t otherwise.
*/
#if defined(HAVE_SCHED_GETCPU)
#define RDB_INDEXER get_sched_indexer_t
#else
#define RDB_INDEXER thread_id_indexer_t
#endif

/* Global statistics struct used inside MyRocks */
struct st_global_stats {
  /* One counter per operation_type value. */
  ib_counter_t<ulonglong, 64, RDB_INDEXER> rows[ROWS_MAX];

  // system_rows_ stats are only for system
  // tables. They are not counted in rows_* stats.
  ib_counter_t<ulonglong, 64, RDB_INDEXER> system_rows[ROWS_MAX];

  /* One counter per query_type value. */
  ib_counter_t<ulonglong, 64, RDB_INDEXER> queries[QUERIES_MAX];

  ib_counter_t<ulonglong, 64, RDB_INDEXER> covered_secondary_key_lookups;
};
417
/*
  Struct used for exporting status to MySQL.

  The rows_* members mirror the operation_type enumerators one to one.
  Only four system_rows_* members exist (deleted/inserted/read/updated) even
  though st_global_stats::system_rows has ROWS_MAX slots.
*/
struct st_export_stats {
  ulonglong rows_deleted;
  ulonglong rows_inserted;
  ulonglong rows_read;
  ulonglong rows_updated;
  ulonglong rows_deleted_blind;
  ulonglong rows_expired;
  ulonglong rows_filtered;
  ulonglong rows_hidden_no_snapshot;

  ulonglong system_rows_deleted;
  ulonglong system_rows_inserted;
  ulonglong system_rows_read;
  ulonglong system_rows_updated;

  ulonglong queries_point;
  ulonglong queries_range;

  ulonglong covered_secondary_key_lookups;
};
439
/*
  Struct used for exporting RocksDB memory status.
  The members have no initializers, so the code that fills the struct is
  responsible for setting both of them.
*/
struct st_memory_stats {
  ulonglong memtable_total;
  ulonglong memtable_unflushed;
};
445
446/* Struct used for exporting RocksDB IO stalls stats */
447struct st_io_stall_stats {
448 ulonglong level0_slowdown;
449 ulonglong level0_slowdown_with_compaction;
450 ulonglong level0_numfiles;
451 ulonglong level0_numfiles_with_compaction;
452 ulonglong stop_for_pending_compaction_bytes;
453 ulonglong slowdown_for_pending_compaction_bytes;
454 ulonglong memtable_compaction;
455 ulonglong memtable_slowdown;
456 ulonglong total_stop;
457 ulonglong total_slowdown;
458
459 st_io_stall_stats()
460 : level0_slowdown(0), level0_slowdown_with_compaction(0),
461 level0_numfiles(0), level0_numfiles_with_compaction(0),
462 stop_for_pending_compaction_bytes(0),
463 slowdown_for_pending_compaction_bytes(0), memtable_compaction(0),
464 memtable_slowdown(0), total_stop(0), total_slowdown(0) {}
465};
466
467} // namespace myrocks
468
469#include "./rdb_buff.h"
470
471/* Provide hash function for GL_INDEX_ID so we can include it in sets */
472namespace std {
473template <> struct hash<myrocks::GL_INDEX_ID> {
474 std::size_t operator()(const myrocks::GL_INDEX_ID &gl_index_id) const {
475 const uint64_t val =
476 ((uint64_t)gl_index_id.cf_id << 32 | (uint64_t)gl_index_id.index_id);
477 return std::hash<uint64_t>()(val);
478 }
479};
480} // namespace std
481
482namespace myrocks {
483
484/**
485 @brief
486 Class definition for ROCKSDB storage engine plugin handler
487*/
488
489class ha_rocksdb : public my_core::handler {
490 my_core::THR_LOCK_DATA m_db_lock; ///< MySQL database lock
491
492 Rdb_table_handler *m_table_handler; ///< Open table handler
493
494 /* Iterator used for range scans and for full table/index scans */
495 rocksdb::Iterator *m_scan_it;
496
497 /* Same as handler::end_key but for start. Reverse-ordered scans need it */
498 key_range m_save_start_range;
499 const key_range *m_start_range;
500
501 /* Whether m_scan_it was created with skip_bloom=true */
502 bool m_scan_it_skips_bloom;
503
504 const rocksdb::Snapshot *m_scan_it_snapshot;
505
506 Rdb_tbl_def *m_tbl_def;
507
508 /* Primary Key encoder from KeyTupleFormat to StorageFormat */
509 std::shared_ptr<Rdb_key_def> m_pk_descr;
510
511 /* Array of index descriptors */
512 std::shared_ptr<Rdb_key_def> *m_key_descr_arr;
513
514 bool check_keyread_allowed(uint inx, uint part, bool all_parts) const;
515
516 /*
517 Number of key parts in PK. This is the same as
518 table->key_info[table->s->primary_key].keyparts
519 */
520 uint m_pk_key_parts;
521
522 /*
523 TRUE <=> Primary Key columns can be decoded from the index
524 */
525 mutable bool m_pk_can_be_decoded;
526
527 /*
528 TRUE <=> Some fields in the PK may require unpack_info.
529 */
530 bool m_maybe_unpack_info;
531
532 uchar *m_pk_tuple; /* Buffer for storing PK in KeyTupleFormat */
533 uchar *m_pk_packed_tuple; /* Buffer for storing PK in StorageFormat */
534 // ^^ todo: change it to 'char*'? TODO: ^ can we join this with last_rowkey?
535
536 /*
537 Temporary buffers for storing the key part of the Key/Value pair
538 for secondary indexes.
539 */
540 uchar *m_sk_packed_tuple;
541
542 /*
543 Temporary buffers for storing end key part of the Key/Value pair.
544 This is used for range scan only.
545 */
546 uchar *m_end_key_packed_tuple;
547
548 Rdb_string_writer m_sk_tails;
549 Rdb_string_writer m_pk_unpack_info;
550
  /*
    ha_rocksdb::index_read_map(.. HA_READ_KEY_EXACT or similar) will save here
    mem-comparable form of the index lookup tuple.
  */
555 uchar *m_sk_match_prefix;
556 uint m_sk_match_length;
557
558 /* Buffer space for the above */
559 uchar *m_sk_match_prefix_buf;
560
561 /* Second buffers, used by UPDATE. */
562 uchar *m_sk_packed_tuple_old;
563 Rdb_string_writer m_sk_tails_old;
564
565 /* Buffers used for duplicate checking during unique_index_creation */
566 uchar *m_dup_sk_packed_tuple;
567 uchar *m_dup_sk_packed_tuple_old;
568
  /* Buffers used for passing lower/upper bound eq conditions. */
570 uchar *m_eq_cond_lower_bound;
571 uchar *m_eq_cond_upper_bound;
572 rocksdb::Slice m_eq_cond_lower_bound_slice;
573 rocksdb::Slice m_eq_cond_upper_bound_slice;
574
575 /*
576 Temporary space for packing VARCHARs (we provide it to
577 pack_record()/pack_index_tuple() calls).
578 */
579 uchar *m_pack_buffer;
580
  /*
    Buffer holding the original TTL timestamp value (8 bytes) during UPDATE.
  */
584 char m_ttl_bytes[ROCKSDB_SIZEOF_TTL_RECORD];
585 /*
586 The TTL timestamp value can change if the explicit TTL column is
587 updated. If we detect this when updating the PK, we indicate it here so
588 we know we must always update any SK's.
589 */
590 bool m_ttl_bytes_updated;
591
592 /* rowkey of the last record we've read, in StorageFormat. */
593 String m_last_rowkey;
594
595 /* Buffer used by convert_record_to_storage_format() */
596 String m_storage_record;
597
598 /*
599 Last retrieved record, in table->record[0] data format.
600
601 This is used only when we get the record with rocksdb's Get() call (The
602 other option is when we get a rocksdb::Slice from an iterator)
603 */
604 rocksdb::PinnableSlice m_retrieved_record;
605
606 /* Type of locking to apply to rows */
607 enum { RDB_LOCK_NONE, RDB_LOCK_READ, RDB_LOCK_WRITE } m_lock_rows;
608
609 /* TRUE means we're doing an index-only read. FALSE means otherwise. */
610 bool m_keyread_only;
611
612 bool m_skip_scan_it_next_call;
613
614 /* TRUE means we are accessing the first row after a snapshot was created */
615 bool m_rnd_scan_is_new_snapshot;
616
617 /* TRUE means the replication slave will use Read Free Replication */
618 bool m_use_read_free_rpl;
619
620 /*
621 TRUE means we should skip unique key checks for this table if the
622 replication lag gets too large
623 */
624 bool m_skip_unique_check;
625
  /**
    @brief
    This is a bitmap of indexes (i.e. a set) whose keys (in future, values) may
    be changed by this statement. Indexes that are not in the bitmap do not need
    to be updated.
    @note Valid inside UPDATE statements, IFF (m_update_scope_is_valid == true).
  */
633 my_core::key_map m_update_scope;
634 bool m_update_scope_is_valid;
635
636 /* SST information used for bulk loading the primary key */
637 std::shared_ptr<Rdb_sst_info> m_sst_info;
638
639 /*
640 MySQL index number for duplicate key error
641 */
642 int m_dupp_errkey;
643
644 int create_key_defs(const TABLE *const table_arg,
645 Rdb_tbl_def *const tbl_def_arg,
646 const TABLE *const old_table_arg = nullptr,
647 const Rdb_tbl_def *const old_tbl_def_arg = nullptr) const
648 MY_ATTRIBUTE((__nonnull__(2, 3), __warn_unused_result__));
649 int secondary_index_read(const int keyno, uchar *const buf)
650 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
651 void setup_iterator_for_rnd_scan();
652 bool is_ascending(const Rdb_key_def &keydef,
653 enum ha_rkey_function find_flag) const
654 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
655 void setup_iterator_bounds(const Rdb_key_def &kd,
656 const rocksdb::Slice &eq_cond,
657 uchar *lower_bound_buf,
658 uchar *upper_bound_buf,
659 rocksdb::Slice *out_lower_bound,
660 rocksdb::Slice *out_upper_bound);
661 bool can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
662 const rocksdb::Slice &eq_cond,
663 const bool use_all_keys);
664 bool check_bloom_and_set_bounds(THD *thd, const Rdb_key_def &kd,
665 const rocksdb::Slice &eq_cond,
666 const bool use_all_keys,
667 uchar *lower_bound_buf,
668 uchar *upper_bound_buf,
669 rocksdb::Slice *out_lower_bound,
670 rocksdb::Slice *out_upper_bound);
671 void setup_scan_iterator(const Rdb_key_def &kd, rocksdb::Slice *slice,
672 const bool use_all_keys, const uint eq_cond_len)
673 MY_ATTRIBUTE((__nonnull__));
674 void release_scan_iterator(void);
675
676 rocksdb::Status
677 get_for_update(Rdb_transaction *const tx,
678 rocksdb::ColumnFamilyHandle *const column_family,
679 const rocksdb::Slice &key,
680 rocksdb::PinnableSlice *value) const;
681
682 int get_row_by_rowid(uchar *const buf, const char *const rowid,
683 const uint rowid_size, const bool skip_lookup = false,
684 const bool skip_ttl_check = true)
685 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
686 int get_row_by_rowid(uchar *const buf, const uchar *const rowid,
687 const uint rowid_size, const bool skip_lookup = false,
688 const bool skip_ttl_check = true)
689 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__)) {
690 return get_row_by_rowid(buf, reinterpret_cast<const char *>(rowid),
691 rowid_size, skip_lookup, skip_ttl_check);
692 }
693
694 void load_auto_incr_value();
695 ulonglong load_auto_incr_value_from_index();
696 void update_auto_incr_val(ulonglong val);
697 void update_auto_incr_val_from_field();
698 rocksdb::Status get_datadic_auto_incr(Rdb_transaction *const tx,
699 const GL_INDEX_ID &gl_index_id,
700 ulonglong *new_val) const;
701 longlong update_hidden_pk_val();
702 int load_hidden_pk_value() MY_ATTRIBUTE((__warn_unused_result__));
703 int read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id)
704 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
705 bool can_use_single_delete(const uint &index) const
706 MY_ATTRIBUTE((__warn_unused_result__));
707 bool is_blind_delete_enabled();
708 bool skip_unique_check() const MY_ATTRIBUTE((__warn_unused_result__));
709#ifdef MARIAROCKS_NOT_YET // MDEV-10975
710 void set_force_skip_unique_check(bool skip) override;
711#endif
712 bool commit_in_the_middle() MY_ATTRIBUTE((__warn_unused_result__));
713 bool do_bulk_commit(Rdb_transaction *const tx)
714 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
715 bool has_hidden_pk(const TABLE *const table) const
716 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
717
718 void update_row_stats(const operation_type &type);
719
720 void set_last_rowkey(const uchar *const old_data);
721
722 /*
723 Array of table->s->fields elements telling how to store fields in the
724 record.
725 */
726 Rdb_field_encoder *m_encoder_arr;
727
  /*
    Describes instructions on how to decode the field.
    Plain record with public members only; instances are stored in
    m_decoders_vect.
  */
  class READ_FIELD {
  public:
    /* Points to Rdb_field_encoder describing the field */
    Rdb_field_encoder *m_field_enc;
    /* if true, decode the field, otherwise skip it */
    bool m_decode;
    /* Skip this many bytes before reading (or skipping) this field */
    int m_skip;
  };
738
739 /*
740 This tells which table fields should be decoded (or skipped) when
741 decoding table row from (pk, encoded_row) pair. (Secondary keys are
742 just always decoded in full currently)
743 */
744 std::vector<READ_FIELD> m_decoders_vect;
745
746 /*
747 This tells if any field which is part of the key needs to be unpacked and
748 decoded.
749 */
750 bool m_key_requested = false;
751
752 /* Setup field_decoders based on type of scan and table->read_set */
753 void setup_read_decoders();
754
755 /*
756 For the active index, indicates which columns must be covered for the
757 current lookup to be covered. If the bitmap field is null, that means this
758 index does not cover the current lookup for any record.
759 */
760 MY_BITMAP m_lookup_bitmap = {nullptr, nullptr, nullptr, 0, 0};
761
762 /*
763 Number of bytes in on-disk (storage) record format that are used for
764 storing SQL NULL flags.
765 */
766 uint m_null_bytes_in_rec;
767
768 void get_storage_type(Rdb_field_encoder *const encoder, const uint &kp);
769 void setup_field_converters();
770 int alloc_key_buffers(const TABLE *const table_arg,
771 const Rdb_tbl_def *const tbl_def_arg,
772 bool alloc_alter_buffers = false)
773 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
774 void free_key_buffers();
775
776 // the buffer size should be at least 2*Rdb_key_def::INDEX_NUMBER_SIZE
777 rocksdb::Range get_range(const int &i, uchar buf[]) const;
778
779 /*
780 Perf timers for data reads
781 */
782 Rdb_io_perf m_io_perf;
783
784 /*
785 A counter of how many row checksums were checked for this table. Note that
786 this does not include checksums for secondary index entries.
787 */
788 my_core::ha_rows m_row_checksums_checked;
789
790 /*
791 Update stats
792 */
793 void update_stats(void);
794
795public:
796 /*
797 The following two are currently only used for getting the range bounds
798 from QUICK_SELECT_DESC.
799 We don't need to implement prepare_index_key_scan[_map] because it is
800 only used with HA_READ_KEY_EXACT and HA_READ_PREFIX_LAST where one
801 can infer the bounds of the range being scanned, anyway.
802 */
803 int prepare_index_scan() override;
804 int prepare_range_scan(const key_range *start_key,
805 const key_range *end_key) override;
806
  /*
    Controls whether writes include checksums. This is updated from the
    session variable at the start of each query.
  */
812 bool m_store_row_debug_checksums;
813
814 /* Same as above but for verifying checksums when reading */
815 bool m_verify_row_debug_checksums;
816 int m_checksums_pct;
817
818 ha_rocksdb(my_core::handlerton *const hton,
819 my_core::TABLE_SHARE *const table_arg);
820 ~ha_rocksdb() {
821 int err MY_ATTRIBUTE((__unused__));
822 err = finalize_bulk_load(false);
823 if (err != 0) {
824 sql_print_error("RocksDB: Error %d finalizing bulk load while closing "
825 "handler.",
826 err);
827 }
828 }
829
  /** @brief
    The name that will be used for display purposes.

    @return rocksdb_hton_name
  */
  const char *table_type() const /*override*/ {
    DBUG_ENTER_FUNC();
    // MariaDB: this function is not virtual, however ha_innodb
    // declares it (and then never uses!) psergey-merge-todo:.
    DBUG_RETURN(rocksdb_hton_name);
  }
839
  /*
    The following is only used by SHOW KEYS:
    The inx argument is ignored; the same type string is returned for every
    index.
  */
  const char *index_type(uint inx) override {
    DBUG_ENTER_FUNC();

    DBUG_RETURN("LSMTREE");
  }
846
847 /*
848 Not present in MariaDB:
849 const char **bas_ext() const override;
850 */
851
852 /*
853 Returns the name of the table's base name
854 */
855 const std::string &get_table_basename() const;
856
857 /** @brief
858 This is a list of flags that indicate what functionality the storage engine
859 implements. The current table flags are documented in handler.h
860 */
861 ulonglong table_flags() const override {
862 DBUG_ENTER_FUNC();
863
864 /*
865 HA_BINLOG_STMT_CAPABLE
866 We are saying that this engine is just statement capable to have
867 an engine that can only handle statement-based logging. This is
868 used in testing.
869 HA_REC_NOT_IN_SEQ
870 If we don't set it, filesort crashes, because it assumes rowids are
871 1..8 byte numbers
872 HA_PRIMARY_KEY_IN_READ_INDEX
873 This flag is always set, even for tables that:
874 - have no PK
875 - have some (or all) of PK that can't be decoded from the secondary
876 index.
877 */
878 DBUG_RETURN(HA_BINLOG_ROW_CAPABLE | HA_BINLOG_STMT_CAPABLE |
879 HA_REC_NOT_IN_SEQ | HA_CAN_INDEX_BLOBS |
880 HA_PRIMARY_KEY_IN_READ_INDEX |
881 HA_PRIMARY_KEY_REQUIRED_FOR_POSITION | HA_NULL_IN_KEY |
882 HA_PARTIAL_COLUMN_READ |
883 HA_TABLE_SCAN_ON_INDEX);
884 }
885
886private:
887 bool init_with_fields(); /* no 'override' in MariaDB */
888public:
889 /** @brief
890 This is a bitmap of flags that indicates how the storage engine
891 implements indexes. The current index flags are documented in
892 handler.h. If you do not implement indexes, just return zero here.
893
894 @details
895 part is the key part to check. First key part is 0.
896 If all_parts is set, MySQL wants to know the flags for the combined
897 index, up to and including 'part'.
898 */
899 ulong index_flags(uint inx, uint part, bool all_parts) const override;
900
  /*
    Returns the address of key_map_full (an object declared outside this
    header); the result does not depend on the state of this handler.
  */
  const key_map *keys_to_use_for_scanning() override {
    DBUG_ENTER_FUNC();

    DBUG_RETURN(&key_map_full);
  }
906
  /* Always returns true. */
  bool primary_key_is_clustered() override {
    DBUG_ENTER_FUNC();

    DBUG_RETURN(true);
  }
912
913 bool should_store_row_debug_checksums() const {
914 return m_store_row_debug_checksums && (rand() % 100 < m_checksums_pct);
915 }
916
917 int rename_table(const char *const from, const char *const to) override
918 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
919
920 int convert_blob_from_storage_format(my_core::Field_blob *const blob,
921 Rdb_string_reader *const reader,
922 bool decode)
923 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
924
925 int convert_varchar_from_storage_format(
926 my_core::Field_varstring *const field_var,
927 Rdb_string_reader *const reader, bool decode)
928 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
929
930 int convert_field_from_storage_format(my_core::Field *const field,
931 Rdb_string_reader *const reader,
932 bool decode, uint len)
933 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
934
935 int convert_record_from_storage_format(const rocksdb::Slice *const key,
936 const rocksdb::Slice *const value,
937 uchar *const buf)
938 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
939
940 int convert_record_from_storage_format(const rocksdb::Slice *const key,
941 uchar *const buf)
942 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
943
944 static const std::vector<std::string> parse_into_tokens(const std::string &s,
945 const char delim);
946
947 static const std::string generate_cf_name(const uint index,
948 const TABLE *const table_arg,
949 const Rdb_tbl_def *const tbl_def_arg,
950 bool *per_part_match_found);
951
952 static const char *get_key_name(const uint index,
953 const TABLE *const table_arg,
954 const Rdb_tbl_def *const tbl_def_arg)
955 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
956
957 static const char *get_key_comment(const uint index,
958 const TABLE *const table_arg,
959 const Rdb_tbl_def *const tbl_def_arg)
960 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
961
962 static const std::string get_table_comment(const TABLE *const table_arg)
963 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
964
965 static bool is_hidden_pk(const uint index, const TABLE *const table_arg,
966 const Rdb_tbl_def *const tbl_def_arg)
967 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
968
969 static uint pk_index(const TABLE *const table_arg,
970 const Rdb_tbl_def *const tbl_def_arg)
971 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
972
973 static bool is_pk(const uint index, const TABLE *table_arg,
974 const Rdb_tbl_def *tbl_def_arg)
975 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
976 /** @brief
977 unireg.cc will call max_supported_record_length(), max_supported_keys(),
978 max_supported_key_parts(), uint max_supported_key_length()
979 to make sure that the storage engine can handle the data it is about to
980 send. Return *real* limits of your storage engine here; MySQL will do
981 min(your_limits, MySQL_limits) automatically.
982 */
983 uint max_supported_record_length() const override {
984 DBUG_ENTER_FUNC();
985
986 DBUG_RETURN(HA_MAX_REC_LENGTH);
987 }
988
989 uint max_supported_keys() const override {
990 DBUG_ENTER_FUNC();
991
992 DBUG_RETURN(MAX_INDEXES);
993 }
994
995 uint max_supported_key_parts() const override {
996 DBUG_ENTER_FUNC();
997
998 DBUG_RETURN(MAX_REF_PARTS);
999 }
1000
1001 uint max_supported_key_part_length() const override;
1002
1003 /** @brief
1004 unireg.cc will call this to make sure that the storage engine can handle
1005 the data it is about to send. Return *real* limits of your storage engine
1006 here; MySQL will do min(your_limits, MySQL_limits) automatically.
1007
1008 @details
1009 There is no need to implement ..._key_... methods if your engine doesn't
1010 support indexes.
1011 */
1012 uint max_supported_key_length() const override {
1013 DBUG_ENTER_FUNC();
1014
1015 DBUG_RETURN(16 * 1024); /* just to return something*/
1016 }
1017
1018 /**
1019 TODO: return actual upper bound of number of records in the table.
1020 (e.g. save number of records seen on full table scan and/or use file size
1021 as upper bound)
1022 */
1023 ha_rows estimate_rows_upper_bound() override {
1024 DBUG_ENTER_FUNC();
1025
1026 DBUG_RETURN(HA_POS_ERROR);
1027 }
1028
1029 /* At the moment, we're ok with default handler::index_init() implementation.
1030 */
1031 int index_read_map(uchar *const buf, const uchar *const key,
1032 key_part_map keypart_map,
1033 enum ha_rkey_function find_flag) override
1034 MY_ATTRIBUTE((__warn_unused_result__));
1035
1036 int index_read_map_impl(uchar *const buf, const uchar *const key,
1037 key_part_map keypart_map,
1038 enum ha_rkey_function find_flag,
1039 const key_range *end_key)
1040 MY_ATTRIBUTE((__warn_unused_result__));
1041
1042 bool is_using_full_key(key_part_map keypart_map, uint actual_key_parts);
1043 int read_range_first(const key_range *const start_key,
1044 const key_range *const end_key, bool eq_range,
1045 bool sorted) override
1046 MY_ATTRIBUTE((__warn_unused_result__));
1047
1048 virtual double scan_time() override {
1049 DBUG_ENTER_FUNC();
1050
1051 DBUG_RETURN(
1052 static_cast<double>((stats.records + stats.deleted) / 20.0 + 10));
1053 }
1054
1055 virtual double read_time(uint, uint, ha_rows rows) override;
1056
1057 int open(const char *const name, int mode, uint test_if_locked) override
1058 MY_ATTRIBUTE((__warn_unused_result__));
1059 int close(void) override MY_ATTRIBUTE((__warn_unused_result__));
1060
1061 int write_row(uchar *const buf) override
1062 MY_ATTRIBUTE((__warn_unused_result__));
1063 int update_row(const uchar *const old_data, const uchar *const new_data) override
1064 MY_ATTRIBUTE((__warn_unused_result__));
1065 int delete_row(const uchar *const buf) override
1066 MY_ATTRIBUTE((__warn_unused_result__));
1067 rocksdb::Status delete_or_singledelete(uint index, Rdb_transaction *const tx,
1068 rocksdb::ColumnFamilyHandle *const cf,
1069 const rocksdb::Slice &key)
1070 MY_ATTRIBUTE((__warn_unused_result__));
1071
1072 int index_next(uchar *const buf) override
1073 MY_ATTRIBUTE((__warn_unused_result__));
1074 int index_next_with_direction(uchar *const buf, bool move_forward)
1075 MY_ATTRIBUTE((__warn_unused_result__));
1076 int index_prev(uchar *const buf) override
1077 MY_ATTRIBUTE((__warn_unused_result__));
1078
1079 int index_first(uchar *const buf) override
1080 MY_ATTRIBUTE((__warn_unused_result__));
1081 int index_last(uchar *const buf) override
1082 MY_ATTRIBUTE((__warn_unused_result__));
1083
1084 class Item *idx_cond_push(uint keyno, class Item *const idx_cond) override;
1085 /*
1086 Default implementation from cancel_pushed_idx_cond() suits us
1087 */
1088private:
1089 struct key_def_cf_info {
1090 rocksdb::ColumnFamilyHandle *cf_handle;
1091 bool is_reverse_cf;
1092 bool is_per_partition_cf;
1093 };
1094
1095 struct update_row_info {
1096 Rdb_transaction *tx;
1097 const uchar *new_data;
1098 const uchar *old_data;
1099 rocksdb::Slice new_pk_slice;
1100 rocksdb::Slice old_pk_slice;
1101 rocksdb::Slice old_pk_rec;
1102
1103 // "unpack_info" data for the new PK value
1104 Rdb_string_writer *new_pk_unpack_info;
1105
1106 longlong hidden_pk_id;
1107 bool skip_unique_check;
1108
1109 // In certain cases, TTL is enabled on a table, as well as an explicit TTL
1110 // column. The TTL column can be part of either the key or the value part
1111 // of the record. If it is part of the key, we store the offset here.
1112 //
1113 // Later on, we use this offset to store the TTL in the value part of the
1114 // record, which we can then access in the compaction filter.
1115 //
1116 // Set to UINT_MAX by default to indicate that the TTL is not in key.
1117 uint ttl_pk_offset = UINT_MAX;
1118 };
1119
1120 /*
1121 Used to check for duplicate entries during fast unique secondary index
1122 creation.
1123 */
1124 struct unique_sk_buf_info {
1125 bool sk_buf_switch = false;
1126 rocksdb::Slice sk_memcmp_key;
1127 rocksdb::Slice sk_memcmp_key_old;
1128 uchar *dup_sk_buf;
1129 uchar *dup_sk_buf_old;
1130
1131 /*
1132 This method is meant to be called back to back during inplace creation
1133 of unique indexes. It will switch between two buffers, which
1134 will each store the memcmp form of secondary keys, which are then
1135 converted to slices in sk_memcmp_key or sk_memcmp_key_old.
1136
1137 Switching buffers on each iteration allows us to retain the
1138 sk_memcmp_key_old value for duplicate comparison.
1139 */
1140 inline uchar *swap_and_get_sk_buf() {
1141 sk_buf_switch = !sk_buf_switch;
1142 return sk_buf_switch ? dup_sk_buf : dup_sk_buf_old;
1143 }
1144 };
1145
1146 int create_cfs(const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
1147 std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs)
1148 const MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1149
1150 int create_key_def(const TABLE *const table_arg, const uint &i,
1151 const Rdb_tbl_def *const tbl_def_arg,
1152 std::shared_ptr<Rdb_key_def> *const new_key_def,
1153 const struct key_def_cf_info &cf_info) const
1154 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1155
1156 int create_inplace_key_defs(
1157 const TABLE *const table_arg, Rdb_tbl_def *vtbl_def_arg,
1158 const TABLE *const old_table_arg,
1159 const Rdb_tbl_def *const old_tbl_def_arg,
1160 const std::array<key_def_cf_info, MAX_INDEXES + 1> &cfs) const
1161 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1162
1163 std::unordered_map<std::string, uint>
1164 get_old_key_positions(const TABLE *table_arg, const Rdb_tbl_def *tbl_def_arg,
1165 const TABLE *old_table_arg,
1166 const Rdb_tbl_def *old_tbl_def_arg) const
1167 MY_ATTRIBUTE((__nonnull__));
1168
1169 int compare_key_parts(const KEY *const old_key,
1170 const KEY *const new_key) const;
1171 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1172
1173 int compare_keys(const KEY *const old_key, const KEY *const new_key) const
1174 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1175
1176 int convert_record_to_storage_format(const struct update_row_info &row_info,
1177 rocksdb::Slice *const packed_rec)
1178 MY_ATTRIBUTE((__nonnull__));
1179
1180 bool should_hide_ttl_rec(const Rdb_key_def &kd,
1181 const rocksdb::Slice &ttl_rec_val,
1182 const int64_t curr_ts)
1183 MY_ATTRIBUTE((__warn_unused_result__));
1184 void rocksdb_skip_expired_records(const Rdb_key_def &kd,
1185 rocksdb::Iterator *const iter,
1186 bool seek_backward);
1187
1188 int index_first_intern(uchar *buf)
1189 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1190 int index_last_intern(uchar *buf)
1191 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1192
1193 int find_icp_matching_index_rec(const bool &move_forward, uchar *const buf)
1194 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1195
1196 void calc_updated_indexes();
1197 int update_write_row(const uchar *const old_data, const uchar *const new_data,
1198 const bool skip_unique_check)
1199 MY_ATTRIBUTE((__warn_unused_result__));
1200 int get_pk_for_update(struct update_row_info *const row_info);
1201 int check_and_lock_unique_pk(const uint &key_id,
1202 const struct update_row_info &row_info,
1203 bool *const found, bool *const pk_changed)
1204 MY_ATTRIBUTE((__warn_unused_result__));
1205 int check_and_lock_sk(const uint &key_id,
1206 const struct update_row_info &row_info,
1207 bool *const found)
1208 MY_ATTRIBUTE((__warn_unused_result__));
1209 int check_uniqueness_and_lock(const struct update_row_info &row_info,
1210 bool *const pk_changed)
1211 MY_ATTRIBUTE((__warn_unused_result__));
1212 bool over_bulk_load_threshold(int *err)
1213 MY_ATTRIBUTE((__warn_unused_result__));
1214 int check_duplicate_sk(const TABLE *table_arg, const Rdb_key_def &index,
1215 const rocksdb::Slice *key,
1216 struct unique_sk_buf_info *sk_info)
1217 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1218 int bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd,
1219 const rocksdb::Slice &key, const rocksdb::Slice &value,
1220 bool sort)
1221 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1222 void update_bytes_written(ulonglong bytes_written);
1223 int update_pk(const Rdb_key_def &kd, const struct update_row_info &row_info,
1224 const bool &pk_changed) MY_ATTRIBUTE((__warn_unused_result__));
1225 int update_sk(const TABLE *const table_arg, const Rdb_key_def &kd,
1226 const struct update_row_info &row_info)
1227 MY_ATTRIBUTE((__warn_unused_result__));
1228 int update_indexes(const struct update_row_info &row_info,
1229 const bool &pk_changed)
1230 MY_ATTRIBUTE((__warn_unused_result__));
1231
1232 int read_key_exact(const Rdb_key_def &kd, rocksdb::Iterator *const iter,
1233 const bool &using_full_key,
1234 const rocksdb::Slice &key_slice,
1235 const int64_t ttl_filter_ts)
1236 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1237 int read_before_key(const Rdb_key_def &kd, const bool &using_full_key,
1238 const rocksdb::Slice &key_slice,
1239 const int64_t ttl_filter_ts)
1240 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1241 int read_after_key(const Rdb_key_def &kd, const rocksdb::Slice &key_slice,
1242 const int64_t ttl_filter_ts)
1243 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1244 int position_to_correct_key(
1245 const Rdb_key_def &kd, const enum ha_rkey_function &find_flag,
1246 const bool &full_key_match, const uchar *const key,
1247 const key_part_map &keypart_map, const rocksdb::Slice &key_slice,
1248 bool *const move_forward, const int64_t ttl_filter_ts)
1249 MY_ATTRIBUTE((__warn_unused_result__));
1250
1251 int read_row_from_primary_key(uchar *const buf)
1252 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1253 int read_row_from_secondary_key(uchar *const buf, const Rdb_key_def &kd,
1254 bool move_forward)
1255 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1256
1257 int calc_eq_cond_len(const Rdb_key_def &kd,
1258 const enum ha_rkey_function &find_flag,
1259 const rocksdb::Slice &slice,
1260 const int &bytes_changed_by_succ,
1261 const key_range *const end_key,
1262 uint *const end_key_packed_size)
1263 MY_ATTRIBUTE((__warn_unused_result__));
1264
1265 Rdb_tbl_def *get_table_if_exists(const char *const tablename)
1266 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1267 void read_thd_vars(THD *const thd) MY_ATTRIBUTE((__nonnull__));
1268
1269 bool contains_foreign_key(THD *const thd)
1270 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1271
1272 int inplace_populate_sk(
1273 TABLE *const table_arg,
1274 const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes)
1275 MY_ATTRIBUTE((__nonnull__, __warn_unused_result__));
1276
1277 int finalize_bulk_load(bool print_client_error = true)
1278 MY_ATTRIBUTE((__warn_unused_result__));
1279
1280public:
1281 int index_init(uint idx, bool sorted) override
1282 MY_ATTRIBUTE((__warn_unused_result__));
1283 int index_end() override MY_ATTRIBUTE((__warn_unused_result__));
1284
1285 void unlock_row() override;
1286
1287 /** @brief
1288 Unlike index_init(), rnd_init() can be called two consecutive times
1289 without rnd_end() in between (it only makes sense if scan=1). In this
1290 case, the second call should prepare for the new table scan (e.g if
1291 rnd_init() allocates the cursor, the second call should position the
1292 cursor to the start of the table; no need to deallocate and allocate
1293 it again. This is a required method.
1294 */
1295 int rnd_init(bool scan) override MY_ATTRIBUTE((__warn_unused_result__));
1296 int rnd_end() override MY_ATTRIBUTE((__warn_unused_result__));
1297
1298 int rnd_next(uchar *const buf) override
1299 MY_ATTRIBUTE((__warn_unused_result__));
1300 int rnd_next_with_direction(uchar *const buf, bool move_forward)
1301 MY_ATTRIBUTE((__warn_unused_result__));
1302
1303 int rnd_pos(uchar *const buf, uchar *const pos) override
1304 MY_ATTRIBUTE((__warn_unused_result__));
1305 void position(const uchar *const record) override;
1306 int info(uint) override;
1307
1308 /* This function will always return success, therefore no annotation related
1309 * to checking the return value. Can't change the signature because it's
1310 * required by the interface. */
1311 int extra(enum ha_extra_function operation) override;
1312
1313 int start_stmt(THD *const thd, thr_lock_type lock_type) override
1314 MY_ATTRIBUTE((__warn_unused_result__));
1315 int external_lock(THD *const thd, int lock_type) override
1316 MY_ATTRIBUTE((__warn_unused_result__));
1317 int truncate() override MY_ATTRIBUTE((__warn_unused_result__));
1318
1319 int reset() override {
1320 DBUG_ENTER_FUNC();
1321
1322 /* Free blob data */
1323 m_retrieved_record.Reset();
1324
1325 DBUG_RETURN(HA_EXIT_SUCCESS);
1326 }
1327
1328 int check(THD *const thd, HA_CHECK_OPT *const check_opt) override
1329 MY_ATTRIBUTE((__warn_unused_result__));
1330 int remove_rows(Rdb_tbl_def *const tbl);
1331 ha_rows records_in_range(uint inx, key_range *const min_key,
1332 key_range *const max_key) override
1333 MY_ATTRIBUTE((__warn_unused_result__));
1334 int delete_table(const char *const from) override
1335 MY_ATTRIBUTE((__warn_unused_result__));
1336 int create(const char *const name, TABLE *const form,
1337 HA_CREATE_INFO *const create_info) override
1338 MY_ATTRIBUTE((__warn_unused_result__));
1339 bool check_if_incompatible_data(HA_CREATE_INFO *const info,
1340 uint table_changes) override
1341 MY_ATTRIBUTE((__warn_unused_result__));
1342
1343 THR_LOCK_DATA **store_lock(THD *const thd, THR_LOCK_DATA **to,
1344 enum thr_lock_type lock_type) override
1345 MY_ATTRIBUTE((__warn_unused_result__));
1346
1347 my_bool register_query_cache_table(THD *const thd, const char *table_key,
1348 uint key_length,
1349 qc_engine_callback *const engine_callback,
1350 ulonglong *const engine_data) override {
1351 DBUG_ENTER_FUNC();
1352
1353 /* Currently, we don't support query cache */
1354 DBUG_RETURN(FALSE);
1355 }
1356
1357 bool get_error_message(const int error, String *const buf) override
1358 MY_ATTRIBUTE((__nonnull__));
1359
1360 static int rdb_error_to_mysql(const rocksdb::Status &s,
1361 const char *msg = nullptr)
1362 MY_ATTRIBUTE((__warn_unused_result__));
1363
1364 void get_auto_increment(ulonglong offset, ulonglong increment,
1365 ulonglong nb_desired_values,
1366 ulonglong *const first_value,
1367 ulonglong *const nb_reserved_values) override;
1368 void update_create_info(HA_CREATE_INFO *const create_info) override;
1369 int optimize(THD *const thd, HA_CHECK_OPT *const check_opt) override
1370 MY_ATTRIBUTE((__warn_unused_result__));
1371 int analyze(THD *const thd, HA_CHECK_OPT *const check_opt) override
1372 MY_ATTRIBUTE((__warn_unused_result__));
1373 int calculate_stats(const TABLE *const table_arg, THD *const thd,
1374 HA_CHECK_OPT *const check_opt)
1375 MY_ATTRIBUTE((__warn_unused_result__));
1376
1377 enum_alter_inplace_result check_if_supported_inplace_alter(
1378 TABLE *altered_table,
1379 my_core::Alter_inplace_info *const ha_alter_info) override;
1380
1381 bool prepare_inplace_alter_table(
1382 TABLE *const altered_table,
1383 my_core::Alter_inplace_info *const ha_alter_info) override;
1384
1385 bool inplace_alter_table(
1386 TABLE *const altered_table,
1387 my_core::Alter_inplace_info *const ha_alter_info) override;
1388
1389 bool
1390 commit_inplace_alter_table(TABLE *const altered_table,
1391 my_core::Alter_inplace_info *const ha_alter_info,
1392 bool commit) override;
1393
1394#ifdef MARIAROCKS_NOT_YET // MDEV-10976
1395 void set_use_read_free_rpl(const char *const whitelist);
1396#endif
1397 void set_skip_unique_check_tables(const char *const whitelist);
1398
#ifdef MARIAROCKS_NOT_YET // MDEV-10976
public:
  /*
    Replication hooks. This whole group is compiled only when
    MARIAROCKS_NOT_YET is defined.
  */
  void rpl_before_delete_rows() override;
  void rpl_after_delete_rows() override;
  void rpl_before_update_rows() override;
  void rpl_after_update_rows() override;
  /* Introduced as virtual here; it is not marked as an override. */
  virtual bool use_read_free_rpl();
#endif // MARIAROCKS_NOT_YET
1407
1408private:
1409 /* Flags tracking if we are inside different replication operation */
1410 bool m_in_rpl_delete_rows;
1411 bool m_in_rpl_update_rows;
1412
1413 bool m_force_skip_unique_check;
1414};
1415
1416/*
1417 Helper class for in-place alter, for storing handler context between inplace
1418 alter calls
1419*/
1420struct Rdb_inplace_alter_ctx : public my_core::inplace_alter_handler_ctx {
1421 /* The new table definition */
1422 Rdb_tbl_def *const m_new_tdef;
1423
1424 /* Stores the original key definitions */
1425 std::shared_ptr<Rdb_key_def> *const m_old_key_descr;
1426
1427 /* Stores the new key definitions */
1428 std::shared_ptr<Rdb_key_def> *m_new_key_descr;
1429
1430 /* Stores the old number of key definitions */
1431 const uint m_old_n_keys;
1432
1433 /* Stores the new number of key definitions */
1434 const uint m_new_n_keys;
1435
1436 /* Stores the added key glids */
1437 const std::unordered_set<std::shared_ptr<Rdb_key_def>> m_added_indexes;
1438
1439 /* Stores the dropped key glids */
1440 const std::unordered_set<GL_INDEX_ID> m_dropped_index_ids;
1441
1442 /* Stores number of keys to add */
1443 const uint m_n_added_keys;
1444
1445 /* Stores number of keys to drop */
1446 const uint m_n_dropped_keys;
1447
1448 /* Stores the largest current auto increment value in the index */
1449 const ulonglong m_max_auto_incr;
1450
1451 Rdb_inplace_alter_ctx(
1452 Rdb_tbl_def *new_tdef, std::shared_ptr<Rdb_key_def> *old_key_descr,
1453 std::shared_ptr<Rdb_key_def> *new_key_descr, uint old_n_keys,
1454 uint new_n_keys,
1455 std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes,
1456 std::unordered_set<GL_INDEX_ID> dropped_index_ids, uint n_added_keys,
1457 uint n_dropped_keys, ulonglong max_auto_incr)
1458 : my_core::inplace_alter_handler_ctx(), m_new_tdef(new_tdef),
1459 m_old_key_descr(old_key_descr), m_new_key_descr(new_key_descr),
1460 m_old_n_keys(old_n_keys), m_new_n_keys(new_n_keys),
1461 m_added_indexes(added_indexes), m_dropped_index_ids(dropped_index_ids),
1462 m_n_added_keys(n_added_keys), m_n_dropped_keys(n_dropped_keys),
1463 m_max_auto_incr(max_auto_incr) {}
1464
1465 ~Rdb_inplace_alter_ctx() {}
1466
1467private:
1468 /* Disable Copying */
1469 Rdb_inplace_alter_ctx(const Rdb_inplace_alter_ctx &);
1470 Rdb_inplace_alter_ctx &operator=(const Rdb_inplace_alter_ctx &);
1471};
1472
1473// file name indicating RocksDB data corruption
1474std::string rdb_corruption_marker_file_name();
1475
1476const int MYROCKS_MARIADB_PLUGIN_MATURITY_LEVEL= MariaDB_PLUGIN_MATURITY_STABLE;
1477
1478extern bool prevent_myrocks_loading;
1479
1480void sql_print_verbose_info(const char *format, ...);
1481
1482} // namespace myrocks
1483
1484