1/*
2 Copyright (c) 2012, Monty Program Ab
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; version 2 of the License.
7
8 This program is distributed in the hope that it will be useful,
9 but WITHOUT ANY WARRANTY; without even the implied warranty of
10 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11 GNU General Public License for more details.
12
13 You should have received a copy of the GNU General Public License
14 along with this program; if not, write to the Free Software
15 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
16
17#ifdef USE_PRAGMA_IMPLEMENTATION
18#pragma implementation // gcc: Class implementation
19#endif
20
21#define MYSQL_SERVER 1
22
23/* For use of 'PRIu64': */
24#define __STDC_FORMAT_MACROS
25
26#include <my_global.h>
27
28#include <inttypes.h>
29
30/* The C++ file's header */
31#include "./ha_rocksdb.h"
32
33/* C++ standard header files */
34#include <algorithm>
35#include <limits>
36#include <map>
37#include <queue>
38#include <set>
39#include <string>
40#include <vector>
41
42/* MySQL includes */
43#include "./debug_sync.h"
44#include "./my_bit.h"
45#include "./my_stacktrace.h"
46#include "./my_sys.h"
47#include "./sql_audit.h"
48#include "./sql_table.h"
49#include "./sql_hset.h"
50#include <mysql/psi/mysql_table.h>
51#ifdef MARIAROCKS_NOT_YET
52#include <mysql/thread_pool_priv.h>
53#endif
54#include <mysys_err.h>
55
// Both MySQL and RocksDB define the same constant. To avoid compilation errors
// till we make the fix in RocksDB, we'll temporarily undefine it here.
58#undef CACHE_LINE_SIZE
59
60/* RocksDB includes */
61#include "monitoring/histogram.h"
62#include "rocksdb/compaction_filter.h"
63#include "rocksdb/env.h"
64#include "rocksdb/persistent_cache.h"
65#include "rocksdb/rate_limiter.h"
66#include "rocksdb/slice_transform.h"
67#include "rocksdb/thread_status.h"
68#include "rocksdb/utilities/checkpoint.h"
69#include "rocksdb/utilities/convenience.h"
70#include "rocksdb/utilities/memory_util.h"
71#include "rocksdb/utilities/sim_cache.h"
72#include "util/stop_watch.h"
73#include "./rdb_source_revision.h"
74
75/* MyRocks includes */
76#include "./event_listener.h"
77#include "./ha_rocksdb_proto.h"
78#include "./logger.h"
79#include "./rdb_cf_manager.h"
80#include "./rdb_cf_options.h"
81#include "./rdb_datadic.h"
82#include "./rdb_i_s.h"
83#include "./rdb_index_merge.h"
84#include "./rdb_mutex_wrapper.h"
85#include "./rdb_psi.h"
86#include "./rdb_threads.h"
87#include "./rdb_mariadb_server_port.h"
88
89// Internal MySQL APIs not exposed in any header.
90extern "C" {
91/**
92 Mark transaction to rollback and mark error as fatal to a sub-statement.
93 @param thd Thread handle
94 @param all TRUE <=> rollback main transaction.
95*/
96void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);
97
98/**
99 * Get the user thread's binary logging format
100 * @param thd user thread
101 * @return Value to be used as index into the binlog_format_names array
102*/
103int thd_binlog_format(const MYSQL_THD thd);
104
105/**
106 * Check if binary logging is filtered for thread's current db.
107 * @param thd Thread handle
108 * @retval 1 the query is not filtered, 0 otherwise.
109*/
110bool thd_binlog_filter_ok(const MYSQL_THD thd);
111}
112
113MYSQL_PLUGIN_IMPORT bool my_disable_leak_check;
114
115// Needed in rocksdb_init_func
116void ignore_db_dirs_append(const char *dirname_arg);
117
118
119namespace myrocks {
120
121static st_global_stats global_stats;
122static st_export_stats export_stats;
123static st_memory_stats memory_stats;
124static st_io_stall_stats io_stall_stats;
125
126const std::string DEFAULT_CF_NAME("default");
127const std::string DEFAULT_SYSTEM_CF_NAME("__system__");
128const std::string PER_INDEX_CF_NAME("$per_index_cf");
129
/**
  Updates row counters based on the table type and operation type.

  @param type  Row operation being recorded; must be a valid index
               below ROWS_MAX (enforced by the assert).

  Rows touched in MySQL system tables are accounted in a separate counter
  array (global_stats.system_rows) from regular user-table rows
  (global_stats.rows) so the two can be reported independently.
*/
void ha_rocksdb::update_row_stats(const operation_type &type) {
  DBUG_ASSERT(type < ROWS_MAX);
  // Find if we are modifying system databases.
  if (table->s && m_tbl_def->m_is_mysql_system_table)
    global_stats.system_rows[type].inc();
  else
    global_stats.rows[type].inc();
}
141
142void dbug_dump_database(rocksdb::DB *db);
143static handler *rocksdb_create_handler(my_core::handlerton *hton,
144 my_core::TABLE_SHARE *table_arg,
145 my_core::MEM_ROOT *mem_root);
146
147static rocksdb::CompactRangeOptions getCompactRangeOptions() {
148 rocksdb::CompactRangeOptions compact_range_options;
149 compact_range_options.bottommost_level_compaction =
150 rocksdb::BottommostLevelCompaction::kForce;
151 compact_range_options.exclusive_manual_compaction = false;
152 return compact_range_options;
153}
154
155///////////////////////////////////////////////////////////
156// Parameters and settings
157///////////////////////////////////////////////////////////
158static char *rocksdb_default_cf_options = nullptr;
159static char *rocksdb_override_cf_options = nullptr;
160static char *rocksdb_update_cf_options = nullptr;
161
162///////////////////////////////////////////////////////////
163// Globals
164///////////////////////////////////////////////////////////
165handlerton *rocksdb_hton;
166
167rocksdb::TransactionDB *rdb = nullptr;
168rocksdb::HistogramImpl *commit_latency_stats = nullptr;
169
170static std::shared_ptr<rocksdb::Statistics> rocksdb_stats;
171static std::unique_ptr<rocksdb::Env> flashcache_aware_env;
172static std::shared_ptr<Rdb_tbl_prop_coll_factory> properties_collector_factory;
173
174Rdb_dict_manager dict_manager;
175Rdb_cf_manager cf_manager;
176Rdb_ddl_manager ddl_manager;
177Rdb_binlog_manager binlog_manager;
178
179#if !defined(_WIN32) && !defined(__APPLE__)
180Rdb_io_watchdog *io_watchdog = nullptr;
181#endif
182/**
183 MyRocks background thread control
184 N.B. This is besides RocksDB's own background threads
185 (@see rocksdb::CancelAllBackgroundWork())
186*/
187
188static Rdb_background_thread rdb_bg_thread;
189
190// List of table names (using regex) that are exceptions to the strict
191// collation check requirement.
192Regex_list_handler *rdb_collation_exceptions;
193
194static const char **rdb_get_error_messages(int nr);
195
/*
  Trigger a memtable flush for every column family known to the CF manager.
  Best-effort: the rocksdb::Status returned by each Flush() call is ignored.
*/
static void rocksdb_flush_all_memtables() {
  const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
  for (const auto &cf_handle : cf_manager.get_all_cf()) {
    rdb->Flush(rocksdb::FlushOptions(), cf_handle);
  }
}
202
/*
  No-op "update" callback for the compact-column-family sysvar: the actual
  compaction is performed by the check callback
  (rocksdb_compact_column_family), so there is nothing to store here.
*/
static void rocksdb_compact_column_family_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
206
207static int rocksdb_compact_column_family(THD *const thd,
208 struct st_mysql_sys_var *const var,
209 void *const var_ptr,
210 struct st_mysql_value *const value) {
211 char buff[STRING_BUFFER_USUAL_SIZE];
212 int len = sizeof(buff);
213
214 DBUG_ASSERT(value != nullptr);
215
216 if (const char *const cf = value->val_str(value, buff, &len)) {
217 auto cfh = cf_manager.get_cf(cf);
218 if (cfh != nullptr && rdb != nullptr) {
219 sql_print_verbose_info("RocksDB: Manual compaction of column family: %s\n",
220 cf);
221 rdb->CompactRange(getCompactRangeOptions(), cfh, nullptr, nullptr);
222 }
223 }
224 return HA_EXIT_SUCCESS;
225}
226
227///////////////////////////////////////////////////////////
228// Hash map: table name => open table handler
229///////////////////////////////////////////////////////////
230
231namespace // anonymous namespace = not visible outside this source file
232{
233
234const ulong TABLE_HASH_SIZE = 32;
235typedef Hash_set<Rdb_table_handler> Rdb_table_set;
236
/*
  Tracks the shared Rdb_table_handler objects for all currently open tables,
  keyed by table name. Member function definitions live elsewhere in this
  file.
*/
struct Rdb_open_tables_map {
  /* Hash table used to track the handlers of open tables */
  Rdb_table_set m_hash;
  /* The mutex used to protect the hash table */
  mutable mysql_mutex_t m_mutex;

  // Hash_set callback: extracts the key and its length from an element.
  static uchar *get_hash_key(const Rdb_table_handler *const table_handler,
                             size_t *const length,
                             my_bool not_used MY_ATTRIBUTE((__unused__)));

  // Returns the handler for the given table name (presumably creating it on
  // first use -- definition is elsewhere in this file).
  Rdb_table_handler *get_table_handler(const char *const table_name);
  // Releases a reference obtained via get_table_handler().
  void release_table_handler(Rdb_table_handler *const table_handler);

  Rdb_open_tables_map() : m_hash(get_hash_key, system_charset_info) { }

  // Snapshot of the names of all currently open tables.
  std::vector<std::string> get_table_names(void) const;
};
254
255} // anonymous namespace
256
257static Rdb_open_tables_map rdb_open_tables;
258
/*
  Strip any trailing '/' characters from a directory path.
  "/a/b///" -> "/a/b"; a path of only slashes (or empty) becomes "".
*/
static std::string rdb_normalize_dir(std::string dir) {
  const auto last = dir.find_last_not_of('/');
  dir.erase(last == std::string::npos ? 0 : last + 1);
  return dir;
}
265
/*
  Check callback for the create-checkpoint sysvar: creates a RocksDB
  checkpoint (a consistent snapshot of the database) in the directory given
  as the variable's value.

  @return HA_EXIT_SUCCESS if the checkpoint was created; HA_EXIT_FAILURE
          otherwise (NULL value, no DB handle, or a RocksDB error -- in the
          error case my_error() has already been raised via
          rdb_error_to_mysql).
*/
static int rocksdb_create_checkpoint(
    THD *const thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const save MY_ATTRIBUTE((__unused__)),
    struct st_mysql_value *const value) {
  char buf[FN_REFLEN];
  int len = sizeof(buf);
  const char *const checkpoint_dir_raw = value->val_str(value, buf, &len);
  if (checkpoint_dir_raw) {
    if (rdb != nullptr) {
      std::string checkpoint_dir = rdb_normalize_dir(checkpoint_dir_raw);
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: creating checkpoint in directory : %s\n",
                            checkpoint_dir.c_str());
      rocksdb::Checkpoint *checkpoint;
      auto status = rocksdb::Checkpoint::Create(rdb, &checkpoint);
      // We can only return HA_EXIT_FAILURE/HA_EXIT_SUCCESS here which is why
      // the return code is ignored, but by calling into rdb_error_to_mysql,
      // it will call my_error for us, which will propagate up to the client.
      int rc __attribute__((__unused__));
      if (status.ok()) {
        status = checkpoint->CreateCheckpoint(checkpoint_dir.c_str());
        delete checkpoint;
        if (status.ok()) {
          sql_print_information(
              "RocksDB: created checkpoint in directory : %s\n",
              checkpoint_dir.c_str());
          return HA_EXIT_SUCCESS;
        } else {
          rc = ha_rocksdb::rdb_error_to_mysql(status);
        }
      } else {
        rc = ha_rocksdb::rdb_error_to_mysql(status);
      }
    }
  }
  return HA_EXIT_FAILURE;
}
304
/* This method is needed to indicate that the
   ROCKSDB_CREATE_CHECKPOINT command is not read-only.
   The real work happens in the check callback (rocksdb_create_checkpoint);
   this update callback intentionally does nothing. */
static void rocksdb_create_checkpoint_stub(THD *const thd,
                                           struct st_mysql_sys_var *const var,
                                           void *const var_ptr,
                                           const void *const save) {}
311
/*
  No-op update callback: the flush is performed by the paired check callback
  (rocksdb_force_flush_memtable_now), so there is nothing to store here.
*/
static void rocksdb_force_flush_memtable_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
315
/*
  Check callback for rocksdb_force_flush_memtable_now: flushes the memtables
  of all column families immediately. Always accepts the assignment.
*/
static int rocksdb_force_flush_memtable_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  sql_print_information("RocksDB: Manual memtable flush.");
  rocksdb_flush_all_memtables();
  return HA_EXIT_SUCCESS;
}
323
/*
  No-op update callback: the flush + L0 compaction is performed by the paired
  check callback (rocksdb_force_flush_memtable_and_lzero_now).
*/
static void rocksdb_force_flush_memtable_and_lzero_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
327
/*
  Check callback for rocksdb_force_flush_memtable_and_lzero_now: flushes all
  memtables, then compacts every L0 file of every column family down to L1.

  @return HA_EXIT_SUCCESS on success; HA_EXIT_FAILURE if CompactFiles()
          failed with anything other than Status::Aborted.
*/
static int rocksdb_force_flush_memtable_and_lzero_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  sql_print_information("RocksDB: Manual memtable and L0 flush.");
  rocksdb_flush_all_memtables();

  const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
  rocksdb::CompactionOptions c_options = rocksdb::CompactionOptions();
  rocksdb::ColumnFamilyMetaData metadata;
  rocksdb::ColumnFamilyDescriptor cf_descr;

  for (const auto &cf_handle : cf_manager.get_all_cf()) {
    rdb->GetColumnFamilyMetaData(cf_handle, &metadata);
    cf_handle->GetDescriptor(&cf_descr);
    // Cap output files at the CF's configured target file size.
    c_options.output_file_size_limit = cf_descr.options.target_file_size_base;

    DBUG_ASSERT(metadata.levels[0].level == 0);
    // Collect the full paths of all current L0 files of this CF.
    std::vector<std::string> file_names;
    for (auto &file : metadata.levels[0].files) {
      file_names.emplace_back(file.db_path + file.name);
    }

    if (!file_names.empty()) {
      rocksdb::Status s;
      // Compact the collected L0 files into level 1.
      s = rdb->CompactFiles(c_options, cf_handle, file_names, 1);

      // NOTE(review): Status::Aborted is tolerated -- presumably it occurs
      // when a concurrent background compaction claimed these files; confirm.
      if (!s.ok() && !s.IsAborted()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
        return HA_EXIT_FAILURE;
      }
    }
  }

  return HA_EXIT_SUCCESS;
}
363
364static void rocksdb_drop_index_wakeup_thread(
365 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
366 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
367 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save);
368
369static my_bool rocksdb_pause_background_work = 0;
370static mysql_mutex_t rdb_sysvars_mutex;
371
372static void rocksdb_set_pause_background_work(
373 my_core::THD *const,
374 struct st_mysql_sys_var *const,
375 void *const, const void *const save) {
376 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
377 const my_bool pause_requested = *static_cast<const my_bool *>(save);
378 if (rocksdb_pause_background_work != pause_requested) {
379 if (pause_requested) {
380 rdb->PauseBackgroundWork();
381 } else {
382 rdb->ContinueBackgroundWork();
383 }
384 rocksdb_pause_background_work = pause_requested;
385 }
386 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
387}
388
389static void rocksdb_set_compaction_options(THD *thd,
390 struct st_mysql_sys_var *var,
391 void *var_ptr, const void *save);
392
393static void rocksdb_set_table_stats_sampling_pct(THD *thd,
394 struct st_mysql_sys_var *var,
395 void *var_ptr,
396 const void *save);
397
398static void rocksdb_set_rate_limiter_bytes_per_sec(THD *thd,
399 struct st_mysql_sys_var *var,
400 void *var_ptr,
401 const void *save);
402
403static void rocksdb_set_sst_mgr_rate_bytes_per_sec(THD *thd,
404 struct st_mysql_sys_var *var,
405 void *var_ptr,
406 const void *save);
407
408static void rocksdb_set_delayed_write_rate(THD *thd,
409 struct st_mysql_sys_var *var,
410 void *var_ptr, const void *save);
411
412static void rocksdb_set_max_latest_deadlocks(THD *thd,
413 struct st_mysql_sys_var *var,
414 void *var_ptr, const void *save);
415
416static void rdb_set_collation_exception_list(const char *exception_list);
417static void rocksdb_set_collation_exception_list(THD *thd,
418 struct st_mysql_sys_var *var,
419 void *var_ptr,
420 const void *save);
421
422static int rocksdb_validate_update_cf_options(THD *thd,
423 struct st_mysql_sys_var *var,
424 void *save,
425 st_mysql_value *value);
426
427static void rocksdb_set_update_cf_options(THD *thd,
428 struct st_mysql_sys_var *var,
429 void *var_ptr, const void *save);
430
431static int rocksdb_check_bulk_load(THD *const thd,
432 struct st_mysql_sys_var *var
433 MY_ATTRIBUTE((__unused__)),
434 void *save,
435 struct st_mysql_value *value);
436
437static int rocksdb_check_bulk_load_allow_unsorted(
438 THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
439 void *save, struct st_mysql_value *value);
440
441static void rocksdb_set_max_background_jobs(THD *thd,
442 struct st_mysql_sys_var *const var,
443 void *const var_ptr,
444 const void *const save);
445static void rocksdb_set_bytes_per_sync(THD *thd,
446 struct st_mysql_sys_var *const var,
447 void *const var_ptr,
448 const void *const save);
449static void rocksdb_set_wal_bytes_per_sync(THD *thd,
450 struct st_mysql_sys_var *const var,
451 void *const var_ptr,
452 const void *const save);
453//////////////////////////////////////////////////////////////////////////////
454// Options definitions
455//////////////////////////////////////////////////////////////////////////////
456static long long rocksdb_block_cache_size;
457static long long rocksdb_sim_cache_size;
458static my_bool rocksdb_use_clock_cache;
459/* Use unsigned long long instead of uint64_t because of MySQL compatibility */
460static unsigned long long // NOLINT(runtime/int)
461 rocksdb_rate_limiter_bytes_per_sec;
462static unsigned long long // NOLINT(runtime/int)
463 rocksdb_sst_mgr_rate_bytes_per_sec;
464static unsigned long long rocksdb_delayed_write_rate;
465static uint32_t rocksdb_max_latest_deadlocks;
466static unsigned long // NOLINT(runtime/int)
467 rocksdb_persistent_cache_size_mb;
468static ulong rocksdb_info_log_level;
469static char *rocksdb_wal_dir;
470static char *rocksdb_persistent_cache_path;
471static ulong rocksdb_index_type;
472static uint32_t rocksdb_flush_log_at_trx_commit;
473static uint32_t rocksdb_debug_optimizer_n_rows;
474static my_bool rocksdb_force_compute_memtable_stats;
475static uint32_t rocksdb_force_compute_memtable_stats_cachetime;
476static my_bool rocksdb_debug_optimizer_no_zero_cardinality;
477static uint32_t rocksdb_wal_recovery_mode;
478static uint32_t rocksdb_access_hint_on_compaction_start;
479static char *rocksdb_compact_cf_name;
480static char *rocksdb_checkpoint_name;
481static my_bool rocksdb_signal_drop_index_thread;
482static my_bool rocksdb_strict_collation_check = 1;
483static my_bool rocksdb_ignore_unknown_options = 1;
484static my_bool rocksdb_enable_2pc = 0;
485static char *rocksdb_strict_collation_exceptions;
486static my_bool rocksdb_collect_sst_properties = 1;
487static my_bool rocksdb_force_flush_memtable_now_var = 0;
488static my_bool rocksdb_force_flush_memtable_and_lzero_now_var = 0;
489static my_bool rocksdb_enable_ttl = 1;
490static my_bool rocksdb_enable_ttl_read_filtering = 1;
491static int rocksdb_debug_ttl_rec_ts = 0;
492static int rocksdb_debug_ttl_snapshot_ts = 0;
493static int rocksdb_debug_ttl_read_filter_ts = 0;
494static my_bool rocksdb_debug_ttl_ignore_pk = 0;
495static my_bool rocksdb_reset_stats = 0;
496static uint32_t rocksdb_io_write_timeout_secs = 0;
497static uint32_t rocksdb_seconds_between_stat_computes = 3600;
498static long long rocksdb_compaction_sequential_deletes = 0l;
499static long long rocksdb_compaction_sequential_deletes_window = 0l;
500static long long rocksdb_compaction_sequential_deletes_file_size = 0l;
501static uint32_t rocksdb_validate_tables = 1;
502static char *rocksdb_datadir;
503static uint32_t rocksdb_table_stats_sampling_pct;
504static my_bool rocksdb_enable_bulk_load_api = 1;
505static my_bool rocksdb_print_snapshot_conflict_queries = 0;
506static my_bool rocksdb_large_prefix = 0;
507static my_bool rocksdb_allow_to_start_after_corruption = 0;
508static char* rocksdb_git_hash;
509
510char *compression_types_val=
511 const_cast<char*>(get_rocksdb_supported_compression_types());
512
513std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0);
514std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0);
515std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0);
516std::atomic<uint64_t> rocksdb_wal_group_syncs(0);
517
/*
  Build the base rocksdb::DBOptions MyRocks opens the database with.
  Individual settings may later be overridden via sys-vars / my.cnf.
*/
static std::unique_ptr<rocksdb::DBOptions> rdb_init_rocksdb_db_options(void) {
  auto o = std::unique_ptr<rocksdb::DBOptions>(new rocksdb::DBOptions());

  o->create_if_missing = true;
  // Event listener is handed the DDL manager so RocksDB callbacks can reach
  // MyRocks metadata.
  o->listeners.push_back(std::make_shared<Rdb_event_listener>(&ddl_manager));
  o->info_log_level = rocksdb::InfoLogLevel::INFO_LEVEL;
  o->max_subcompactions = DEFAULT_SUBCOMPACTIONS;
  o->max_open_files = -2; // auto-tune to 50% open_files_limit

  // NOTE(review): two_write_queues + manual_wal_flush look tied to MyRocks
  // controlling WAL flushing itself (see rocksdb_flush_log_at_trx_commit) --
  // confirm before changing.
  o->two_write_queues = true;
  o->manual_wal_flush = true;
  return o;
}
531
532/* DBOptions contains Statistics and needs to be destructed last */
533static std::unique_ptr<rocksdb::BlockBasedTableOptions> rocksdb_tbl_options =
534 std::unique_ptr<rocksdb::BlockBasedTableOptions>(
535 new rocksdb::BlockBasedTableOptions());
536static std::unique_ptr<rocksdb::DBOptions> rocksdb_db_options =
537 rdb_init_rocksdb_db_options();
538
539static std::shared_ptr<rocksdb::RateLimiter> rocksdb_rate_limiter;
540
541/* This enum needs to be kept up to date with rocksdb::InfoLogLevel */
542static const char *info_log_level_names[] = {"debug_level", "info_level",
543 "warn_level", "error_level",
544 "fatal_level", NullS};
545
546static TYPELIB info_log_level_typelib = {
547 array_elements(info_log_level_names) - 1, "info_log_level_typelib",
548 info_log_level_names, nullptr};
549
/*
  Update callback for rocksdb_info_log_level: stores the new level and
  applies it to the live RocksDB info log, under rdb_sysvars_mutex.
*/
static void rocksdb_set_rocksdb_info_log_level(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {
  DBUG_ASSERT(save != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  rocksdb_info_log_level = *static_cast<const uint64_t *>(save);
  rocksdb_db_options->info_log->SetInfoLogLevel(
      static_cast<const rocksdb::InfoLogLevel>(rocksdb_info_log_level));
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
561
/*
  Update callback for rocksdb_reset_stats: stores the new value and, when it
  is enabled, resets both the DB-internal stats and the statistics object.
*/
static void rocksdb_set_reset_stats(
    my_core::THD *const /* unused */,
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr, const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rdb != nullptr);
  DBUG_ASSERT(rocksdb_stats != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  // Presumably var_ptr aliases the rocksdb_reset_stats global, since that
  // global is read right below this store -- confirm against the sysvar
  // definition.
  *static_cast<bool *>(var_ptr) = *static_cast<const bool *>(save);

  if (rocksdb_reset_stats) {
    rocksdb::Status s = rdb->ResetStats();

    // RocksDB will always return success. Let's document this assumption here
    // as well so that we'll get immediately notified when contract changes.
    DBUG_ASSERT(s == rocksdb::Status::OK());

    s = rocksdb_stats->Reset();
    DBUG_ASSERT(s == rocksdb::Status::OK());
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
587
/*
  Update callback for rocksdb_io_write_timeout: stores the new timeout and
  re-arms the I/O watchdog (on platforms where the watchdog exists).
*/
static void rocksdb_set_io_write_timeout(
    my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rdb != nullptr);
#if !defined(_WIN32) && !defined(__APPLE__)
  DBUG_ASSERT(io_watchdog != nullptr);
#endif

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  const uint32_t new_val = *static_cast<const uint32_t *>(save);

  rocksdb_io_write_timeout_secs = new_val;
#if !defined(_WIN32) && !defined(__APPLE__)
  io_watchdog->reset_timeout(rocksdb_io_write_timeout_secs);
#endif
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
608
/*
  Possible values of rocksdb_flush_log_at_trx_commit; validated in
  rocksdb_validate_flush_log_at_trx_commit().
*/
enum rocksdb_flush_log_at_trx_commit_type : unsigned int {
  FLUSH_LOG_NEVER = 0,
  FLUSH_LOG_SYNC,
  FLUSH_LOG_BACKGROUND,
  FLUSH_LOG_MAX /* must be last */
};
615
616static int rocksdb_validate_flush_log_at_trx_commit(
617 THD *const thd,
618 struct st_mysql_sys_var *const var, /* in: pointer to system variable */
619 void *var_ptr, /* out: immediate result for update function */
620 struct st_mysql_value *const value /* in: incoming value */) {
621 long long new_value;
622
623 /* value is NULL */
624 if (value->val_int(value, &new_value)) {
625 return HA_EXIT_FAILURE;
626 }
627
628 if (rocksdb_db_options->allow_mmap_writes && new_value != FLUSH_LOG_NEVER) {
629 return HA_EXIT_FAILURE;
630 }
631
632 *static_cast<uint32_t *>(var_ptr) = static_cast<uint32_t>(new_value);
633 return HA_EXIT_SUCCESS;
634}
635
636static const char *index_type_names[] = {"kBinarySearch", "kHashSearch", NullS};
637
638static TYPELIB index_type_typelib = {array_elements(index_type_names) - 1,
639 "index_type_typelib", index_type_names,
640 nullptr};
641
/* Limits and defaults shared by the sys-var definitions below. */
const ulong RDB_MAX_LOCK_WAIT_SECONDS = 1024 * 1024 * 1024;
const ulong RDB_MAX_ROW_LOCKS = 1024 * 1024;
const ulong RDB_DEFAULT_BULK_LOAD_SIZE = 1000;
const ulong RDB_MAX_BULK_LOAD_SIZE = 1024 * 1024 * 1024;
const size_t RDB_DEFAULT_MERGE_BUF_SIZE = 64 * 1024 * 1024; /* 64MB */
const size_t RDB_MIN_MERGE_BUF_SIZE = 100; /* bytes */
const size_t RDB_DEFAULT_MERGE_COMBINE_READ_SIZE = 1024 * 1024 * 1024; /* 1GB */
const size_t RDB_MIN_MERGE_COMBINE_READ_SIZE = 100; /* bytes */
const size_t RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY = 0; /* ms */
const size_t RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY = 0; /* ms */
const int64 RDB_DEFAULT_BLOCK_CACHE_SIZE = 512 * 1024 * 1024; /* 512MB */
const int64 RDB_MIN_BLOCK_CACHE_SIZE = 1024;
const int RDB_MAX_CHECKSUMS_PCT = 100;
const ulong RDB_DEADLOCK_DETECT_DEPTH = 50;
656
657// TODO: 0 means don't wait at all, and we don't support it yet?
658static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG,
659 "Number of seconds to wait for lock", nullptr,
660 nullptr, /*default*/ 1, /*min*/ 1,
661 /*max*/ RDB_MAX_LOCK_WAIT_SECONDS, 0);
662
663static MYSQL_THDVAR_BOOL(deadlock_detect, PLUGIN_VAR_RQCMDARG,
664 "Enables deadlock detection", nullptr, nullptr, FALSE);
665
666static MYSQL_THDVAR_ULONG(deadlock_detect_depth, PLUGIN_VAR_RQCMDARG,
667 "Number of transactions deadlock detection will "
668 "traverse through before assuming deadlock",
669 nullptr, nullptr,
670 /*default*/ RDB_DEADLOCK_DETECT_DEPTH,
671 /*min*/ 2,
672 /*max*/ ULONG_MAX, 0);
673
674static MYSQL_THDVAR_BOOL(
675 trace_sst_api, PLUGIN_VAR_RQCMDARG,
676 "Generate trace output in the log for each call to the SstFileWriter",
677 nullptr, nullptr, FALSE);
678
679static MYSQL_THDVAR_BOOL(
680 bulk_load, PLUGIN_VAR_RQCMDARG,
681 "Use bulk-load mode for inserts. This disables "
682 "unique_checks and enables rocksdb_commit_in_the_middle.",
683 rocksdb_check_bulk_load, nullptr, FALSE);
684
685static MYSQL_THDVAR_BOOL(bulk_load_allow_unsorted, PLUGIN_VAR_RQCMDARG,
686 "Allow unsorted input during bulk-load. "
687 "Can be changed only when bulk load is disabled.",
688 rocksdb_check_bulk_load_allow_unsorted, nullptr,
689 FALSE);
690
691static MYSQL_SYSVAR_BOOL(enable_bulk_load_api, rocksdb_enable_bulk_load_api,
692 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
693 "Enables using SstFileWriter for bulk loading",
694 nullptr, nullptr, rocksdb_enable_bulk_load_api);
695
696static MYSQL_SYSVAR_STR(git_hash, rocksdb_git_hash,
697 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
698 "Git revision of the RocksDB library used by MyRocks",
699 nullptr, nullptr, ROCKSDB_GIT_HASH);
700
701static MYSQL_THDVAR_STR(tmpdir, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC,
702 "Directory for temporary files during DDL operations.",
703 nullptr, nullptr, "");
704
705static MYSQL_THDVAR_STR(
706 skip_unique_check_tables, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
707 "Skip unique constraint checking for the specified tables", nullptr,
708 nullptr, ".*");
709
710static MYSQL_THDVAR_BOOL(
711 commit_in_the_middle, PLUGIN_VAR_RQCMDARG,
712 "Commit rows implicitly every rocksdb_bulk_load_size, on bulk load/insert, "
713 "update and delete",
714 nullptr, nullptr, FALSE);
715
716static MYSQL_THDVAR_BOOL(
717 blind_delete_primary_key, PLUGIN_VAR_RQCMDARG,
718 "Deleting rows by primary key lookup, without reading rows (Blind Deletes)."
719 " Blind delete is disabled if the table has secondary key",
720 nullptr, nullptr, FALSE);
721
722static MYSQL_THDVAR_STR(
723 read_free_rpl_tables, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
724 "List of tables that will use read-free replication on the slave "
725 "(i.e. not lookup a row during replication)",
726 nullptr, nullptr, "");
727
728static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG,
729 "Skip using bloom filter for reads", nullptr, nullptr,
730 FALSE);
731
732static MYSQL_THDVAR_ULONG(max_row_locks, PLUGIN_VAR_RQCMDARG,
733 "Maximum number of locks a transaction can have",
734 nullptr, nullptr,
735 /*default*/ RDB_MAX_ROW_LOCKS,
736 /*min*/ 1,
737 /*max*/ RDB_MAX_ROW_LOCKS, 0);
738
739static MYSQL_THDVAR_ULONGLONG(
740 write_batch_max_bytes, PLUGIN_VAR_RQCMDARG,
741 "Maximum size of write batch in bytes. 0 means no limit.", nullptr, nullptr,
742 /* default */ 0, /* min */ 0, /* max */ SIZE_T_MAX, 1);
743
744static MYSQL_THDVAR_BOOL(
745 lock_scanned_rows, PLUGIN_VAR_RQCMDARG,
746 "Take and hold locks on rows that are scanned but not updated", nullptr,
747 nullptr, FALSE);
748
749static MYSQL_THDVAR_ULONG(bulk_load_size, PLUGIN_VAR_RQCMDARG,
750 "Max #records in a batch for bulk-load mode", nullptr,
751 nullptr,
752 /*default*/ RDB_DEFAULT_BULK_LOAD_SIZE,
753 /*min*/ 1,
754 /*max*/ RDB_MAX_BULK_LOAD_SIZE, 0);
755
756static MYSQL_THDVAR_ULONGLONG(
757 merge_buf_size, PLUGIN_VAR_RQCMDARG,
758 "Size to allocate for merge sort buffers written out to disk "
759 "during inplace index creation.",
760 nullptr, nullptr,
761 /* default (64MB) */ RDB_DEFAULT_MERGE_BUF_SIZE,
762 /* min (100B) */ RDB_MIN_MERGE_BUF_SIZE,
763 /* max */ SIZE_T_MAX, 1);
764
765static MYSQL_THDVAR_ULONGLONG(
766 merge_combine_read_size, PLUGIN_VAR_RQCMDARG,
767 "Size that we have to work with during combine (reading from disk) phase "
768 "of "
769 "external sort during fast index creation.",
770 nullptr, nullptr,
771 /* default (1GB) */ RDB_DEFAULT_MERGE_COMBINE_READ_SIZE,
772 /* min (100B) */ RDB_MIN_MERGE_COMBINE_READ_SIZE,
773 /* max */ SIZE_T_MAX, 1);
774
775static MYSQL_THDVAR_ULONGLONG(
776 merge_tmp_file_removal_delay_ms, PLUGIN_VAR_RQCMDARG,
777 "Fast index creation creates a large tmp file on disk during index "
778 "creation. Removing this large file all at once when index creation is "
779 "complete can cause trim stalls on Flash. This variable specifies a "
780 "duration to sleep (in milliseconds) between calling chsize() to truncate "
781 "the file in chunks. The chunk size is the same as merge_buf_size.",
782 nullptr, nullptr,
783 /* default (0ms) */ RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY,
784 /* min (0ms) */ RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY,
785 /* max */ SIZE_T_MAX, 1);
786
787static MYSQL_SYSVAR_BOOL(
788 create_if_missing,
789 *reinterpret_cast<my_bool *>(&rocksdb_db_options->create_if_missing),
790 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
791 "DBOptions::create_if_missing for RocksDB", nullptr, nullptr,
792 rocksdb_db_options->create_if_missing);
793
794static MYSQL_SYSVAR_BOOL(
795 two_write_queues,
796 *reinterpret_cast<my_bool *>(&rocksdb_db_options->two_write_queues),
797 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
798 "DBOptions::two_write_queues for RocksDB", nullptr, nullptr,
799 rocksdb_db_options->two_write_queues);
800
801static MYSQL_SYSVAR_BOOL(
802 manual_wal_flush,
803 *reinterpret_cast<my_bool *>(&rocksdb_db_options->manual_wal_flush),
804 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
805 "DBOptions::manual_wal_flush for RocksDB", nullptr, nullptr,
806 rocksdb_db_options->manual_wal_flush);
807
808static MYSQL_SYSVAR_BOOL(
809 create_missing_column_families,
810 *reinterpret_cast<my_bool *>(
811 &rocksdb_db_options->create_missing_column_families),
812 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
813 "DBOptions::create_missing_column_families for RocksDB", nullptr, nullptr,
814 rocksdb_db_options->create_missing_column_families);
815
816static MYSQL_SYSVAR_BOOL(
817 error_if_exists,
818 *reinterpret_cast<my_bool *>(&rocksdb_db_options->error_if_exists),
819 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
820 "DBOptions::error_if_exists for RocksDB", nullptr, nullptr,
821 rocksdb_db_options->error_if_exists);
822
823static MYSQL_SYSVAR_BOOL(
824 paranoid_checks,
825 *reinterpret_cast<my_bool *>(&rocksdb_db_options->paranoid_checks),
826 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
827 "DBOptions::paranoid_checks for RocksDB", nullptr, nullptr,
828 rocksdb_db_options->paranoid_checks);
829
830static MYSQL_SYSVAR_ULONGLONG(
831 rate_limiter_bytes_per_sec, rocksdb_rate_limiter_bytes_per_sec,
832 PLUGIN_VAR_RQCMDARG, "DBOptions::rate_limiter bytes_per_sec for RocksDB",
833 nullptr, rocksdb_set_rate_limiter_bytes_per_sec, /* default */ 0L,
834 /* min */ 0L, /* max */ MAX_RATE_LIMITER_BYTES_PER_SEC, 0);
835
836static MYSQL_SYSVAR_ULONGLONG(
837 sst_mgr_rate_bytes_per_sec, rocksdb_sst_mgr_rate_bytes_per_sec,
838 PLUGIN_VAR_RQCMDARG,
839 "DBOptions::sst_file_manager rate_bytes_per_sec for RocksDB", nullptr,
840 rocksdb_set_sst_mgr_rate_bytes_per_sec,
841 /* default */ DEFAULT_SST_MGR_RATE_BYTES_PER_SEC,
842 /* min */ 0L, /* max */ UINT64_MAX, 0);
843
844static MYSQL_SYSVAR_ULONGLONG(delayed_write_rate, rocksdb_delayed_write_rate,
845 PLUGIN_VAR_RQCMDARG,
846 "DBOptions::delayed_write_rate", nullptr,
847 rocksdb_set_delayed_write_rate,
848 rocksdb_db_options->delayed_write_rate, 0,
849 UINT64_MAX, 0);
850
851static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks,
852 PLUGIN_VAR_RQCMDARG,
853 "Maximum number of recent "
854 "deadlocks to store",
855 nullptr, rocksdb_set_max_latest_deadlocks,
856 rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0);
857
858static MYSQL_SYSVAR_ENUM(
859 info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG,
860 "Filter level for info logs to be written mysqld error log. "
861 "Valid values include 'debug_level', 'info_level', 'warn_level'"
862 "'error_level' and 'fatal_level'.",
863 nullptr, rocksdb_set_rocksdb_info_log_level,
864 rocksdb::InfoLogLevel::ERROR_LEVEL, &info_log_level_typelib);
865
/* Per-session perf-context level; kUninitialized means "use RocksDB's
   default". Max is one below kOutOfBounds, the enum's end sentinel. */
static MYSQL_THDVAR_INT(
    perf_context_level, PLUGIN_VAR_RQCMDARG,
    "Perf Context Level for rocksdb internal timer stat collection", nullptr,
    nullptr,
    /* default */ rocksdb::PerfLevel::kUninitialized,
    /* min */ rocksdb::PerfLevel::kUninitialized,
    /* max */ rocksdb::PerfLevel::kOutOfBounds - 1, 0);

/* WAL recovery policy; the min/max bounds are the first and last members of
   rocksdb::WALRecoveryMode, cast to uint for the UINT sysvar. */
static MYSQL_SYSVAR_UINT(
    wal_recovery_mode, rocksdb_wal_recovery_mode, PLUGIN_VAR_RQCMDARG,
    "DBOptions::wal_recovery_mode for RocksDB. Default is kAbsoluteConsistency",
    nullptr, nullptr,
    /* default */ (uint)rocksdb::WALRecoveryMode::kAbsoluteConsistency,
    /* min */ (uint)rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords,
    /* max */ (uint)rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords, 0);

/* Readahead (bytes) used during compaction reads; stored directly inside the
   shared DBOptions object. */
static MYSQL_SYSVAR_SIZE_T(compaction_readahead_size,
                           rocksdb_db_options->compaction_readahead_size,
                           PLUGIN_VAR_RQCMDARG,
                           "DBOptions::compaction_readahead_size for RocksDB",
                           nullptr, nullptr,
                           rocksdb_db_options->compaction_readahead_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

/* Use a dedicated table reader for compaction inputs (startup-only). */
static MYSQL_SYSVAR_BOOL(
    new_table_reader_for_compaction_inputs,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->new_table_reader_for_compaction_inputs),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::new_table_reader_for_compaction_inputs for RocksDB", nullptr,
    nullptr, rocksdb_db_options->new_table_reader_for_compaction_inputs);

/* fadvise hint applied to compaction input files; range covers the
   AccessHint enum (NONE..WILLNEED). */
static MYSQL_SYSVAR_UINT(
    access_hint_on_compaction_start, rocksdb_access_hint_on_compaction_start,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::access_hint_on_compaction_start for RocksDB", nullptr, nullptr,
    /* default */ (uint)rocksdb::Options::AccessHint::NORMAL,
    /* min */ (uint)rocksdb::Options::AccessHint::NONE,
    /* max */ (uint)rocksdb::Options::AccessHint::WILLNEED, 0);
905
/* Concurrent memtable writes: default here is explicitly false (not the
   DBOptions default) — presumably a deliberate MyRocks choice; confirm
   before changing. Startup-only. */
static MYSQL_SYSVAR_BOOL(
    allow_concurrent_memtable_write,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->allow_concurrent_memtable_write),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_concurrent_memtable_write for RocksDB", nullptr, nullptr,
    false);

/* Writer-thread adaptive yielding; also defaulted to false explicitly. */
static MYSQL_SYSVAR_BOOL(
    enable_write_thread_adaptive_yield,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->enable_write_thread_adaptive_yield),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::enable_write_thread_adaptive_yield for RocksDB", nullptr,
    nullptr, false);

/* Max open SST files; min is -2 because RocksDB uses negative sentinels
   (-1 = unlimited) for this option. */
static MYSQL_SYSVAR_INT(max_open_files, rocksdb_db_options->max_open_files,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::max_open_files for RocksDB", nullptr,
                        nullptr, rocksdb_db_options->max_open_files,
                        /* min */ -2, /* max */ INT_MAX, 0);

/* Total WAL size cap before forcing column family flushes. */
static MYSQL_SYSVAR_UINT64_T(max_total_wal_size,
                             rocksdb_db_options->max_total_wal_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::max_total_wal_size for RocksDB", nullptr,
                             nullptr, rocksdb_db_options->max_total_wal_size,
                             /* min */ 0, /* max */ LONGLONG_MAX, 0);

/* Use fsync() instead of fdatasync() for file syncs. */
static MYSQL_SYSVAR_BOOL(
    use_fsync, *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_fsync),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_fsync for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_fsync);

/* WAL directory; default is the string currently held in DBOptions. */
static MYSQL_SYSVAR_STR(wal_dir, rocksdb_wal_dir,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::wal_dir for RocksDB", nullptr, nullptr,
                        rocksdb_db_options->wal_dir.c_str());

/* Path of the optional persistent (e.g. on-flash) block cache; empty string
   disables it. */
static MYSQL_SYSVAR_STR(
    persistent_cache_path, rocksdb_persistent_cache_path,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Path for BlockBasedTableOptions::persistent_cache for RocksDB", nullptr,
    nullptr, "");

/* Size (MB) of the persistent cache configured above. */
static MYSQL_SYSVAR_ULONG(
    persistent_cache_size_mb, rocksdb_persistent_cache_size_mb,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Size of cache in MB for BlockBasedTableOptions::persistent_cache "
    "for RocksDB", nullptr, nullptr, rocksdb_persistent_cache_size_mb,
    /* min */ 0L, /* max */ ULONG_MAX, 0);
958
/* Period (usec) between obsolete-file GC passes. Startup-only. */
static MYSQL_SYSVAR_UINT64_T(
    delete_obsolete_files_period_micros,
    rocksdb_db_options->delete_obsolete_files_period_micros,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::delete_obsolete_files_period_micros for RocksDB", nullptr,
    nullptr, rocksdb_db_options->delete_obsolete_files_period_micros,
    /* min */ 0, /* max */ LONGLONG_MAX, 0);

/* Background flush+compaction thread budget; -1 lets RocksDB choose.
   Runtime-updatable via rocksdb_set_max_background_jobs. */
static MYSQL_SYSVAR_INT(max_background_jobs,
                        rocksdb_db_options->max_background_jobs,
                        PLUGIN_VAR_RQCMDARG,
                        "DBOptions::max_background_jobs for RocksDB", nullptr,
                        rocksdb_set_max_background_jobs,
                        rocksdb_db_options->max_background_jobs,
                        /* min */ -1, /* max */ MAX_BACKGROUND_JOBS, 0);

/* Max threads a single compaction job may be split across (min 1). */
static MYSQL_SYSVAR_UINT(max_subcompactions,
                         rocksdb_db_options->max_subcompactions,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::max_subcompactions for RocksDB", nullptr,
                         nullptr, rocksdb_db_options->max_subcompactions,
                         /* min */ 1, /* max */ MAX_SUBCOMPACTIONS, 0);

/* RocksDB info-LOG rotation controls (size, age, retained count). */
static MYSQL_SYSVAR_SIZE_T(max_log_file_size,
                           rocksdb_db_options->max_log_file_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::max_log_file_size for RocksDB", nullptr,
                           nullptr, rocksdb_db_options->max_log_file_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_SIZE_T(log_file_time_to_roll,
                           rocksdb_db_options->log_file_time_to_roll,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::log_file_time_to_roll for RocksDB",
                           nullptr, nullptr,
                           rocksdb_db_options->log_file_time_to_roll,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_SIZE_T(keep_log_file_num,
                           rocksdb_db_options->keep_log_file_num,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::keep_log_file_num for RocksDB", nullptr,
                           nullptr, rocksdb_db_options->keep_log_file_num,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

/* MANIFEST file rollover threshold. */
static MYSQL_SYSVAR_UINT64_T(max_manifest_file_size,
                             rocksdb_db_options->max_manifest_file_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::max_manifest_file_size for RocksDB",
                             nullptr, nullptr,
                             rocksdb_db_options->max_manifest_file_size,
                             /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

/* Sharding of the table cache (number of bits). */
static MYSQL_SYSVAR_INT(table_cache_numshardbits,
                        rocksdb_db_options->table_cache_numshardbits,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::table_cache_numshardbits for RocksDB",
                        nullptr, nullptr,
                        rocksdb_db_options->table_cache_numshardbits,
                        /* min */ 0, /* max */ INT_MAX, 0);

/* WAL archival retention controls (age in seconds, total size in MB).
   Note the sysvar names are lowercase while the DBOptions fields use the
   WAL_ prefix. */
static MYSQL_SYSVAR_UINT64_T(wal_ttl_seconds, rocksdb_db_options->WAL_ttl_seconds,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::WAL_ttl_seconds for RocksDB", nullptr,
                             nullptr, rocksdb_db_options->WAL_ttl_seconds,
                             /* min */ 0L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(wal_size_limit_mb,
                             rocksdb_db_options->WAL_size_limit_MB,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::WAL_size_limit_MB for RocksDB", nullptr,
                             nullptr, rocksdb_db_options->WAL_size_limit_MB,
                             /* min */ 0L, /* max */ LONGLONG_MAX, 0);

/* Preallocation size for the MANIFEST file. */
static MYSQL_SYSVAR_SIZE_T(manifest_preallocation_size,
                           rocksdb_db_options->manifest_preallocation_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::manifest_preallocation_size for RocksDB",
                           nullptr, nullptr,
                           rocksdb_db_options->manifest_preallocation_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

/* O_DIRECT for SST reads (bypasses the OS page cache). Startup-only. */
static MYSQL_SYSVAR_BOOL(
    use_direct_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_reads for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_direct_reads);
1047
/* O_DIRECT for flush/compaction writes. Startup-only. */
static MYSQL_SYSVAR_BOOL(
    use_direct_io_for_flush_and_compaction,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_io_for_flush_and_compaction),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_io_for_flush_and_compaction for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_direct_io_for_flush_and_compaction);

/* mmap-based file access toggles (reads and writes independently). */
static MYSQL_SYSVAR_BOOL(
    allow_mmap_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_reads for RocksDB", nullptr, nullptr,
    rocksdb_db_options->allow_mmap_reads);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_writes,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_writes),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_writes for RocksDB", nullptr, nullptr,
    rocksdb_db_options->allow_mmap_writes);

/* Set FD_CLOEXEC on opened files. */
static MYSQL_SYSVAR_BOOL(
    is_fd_close_on_exec,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->is_fd_close_on_exec),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::is_fd_close_on_exec for RocksDB", nullptr, nullptr,
    rocksdb_db_options->is_fd_close_on_exec);

/* How often (sec) RocksDB dumps stats to its info LOG. */
static MYSQL_SYSVAR_UINT(stats_dump_period_sec,
                         rocksdb_db_options->stats_dump_period_sec,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::stats_dump_period_sec for RocksDB",
                         nullptr, nullptr,
                         rocksdb_db_options->stats_dump_period_sec,
                         /* min */ 0, /* max */ INT_MAX, 0);

/* fadvise(RANDOM) on opened SST files. */
static MYSQL_SYSVAR_BOOL(
    advise_random_on_open,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->advise_random_on_open),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::advise_random_on_open for RocksDB", nullptr, nullptr,
    rocksdb_db_options->advise_random_on_open);

/* Global (cross-column-family) memtable memory budget; 0 = unlimited. */
static MYSQL_SYSVAR_SIZE_T(db_write_buffer_size,
                           rocksdb_db_options->db_write_buffer_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::db_write_buffer_size for RocksDB",
                           nullptr, nullptr,
                           rocksdb_db_options->db_write_buffer_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

/* Adaptive (spin-then-block) mutexes inside RocksDB. */
static MYSQL_SYSVAR_BOOL(
    use_adaptive_mutex,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_adaptive_mutex),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_adaptive_mutex for RocksDB", nullptr, nullptr,
    rocksdb_db_options->use_adaptive_mutex);

/* Incremental fsync thresholds for SST and WAL writes; both runtime-
   updatable via their update callbacks. */
static MYSQL_SYSVAR_UINT64_T(bytes_per_sync, rocksdb_db_options->bytes_per_sync,
                             PLUGIN_VAR_RQCMDARG,
                             "DBOptions::bytes_per_sync for RocksDB", nullptr,
                             rocksdb_set_bytes_per_sync,
                             rocksdb_db_options->bytes_per_sync,
                             /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(wal_bytes_per_sync,
                             rocksdb_db_options->wal_bytes_per_sync,
                             PLUGIN_VAR_RQCMDARG,
                             "DBOptions::wal_bytes_per_sync for RocksDB", nullptr,
                             rocksdb_set_wal_bytes_per_sync,
                             rocksdb_db_options->wal_bytes_per_sync,
                             /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

/* Thread-status tracking; default forced to true here regardless of the
   DBOptions default. */
static MYSQL_SYSVAR_BOOL(
    enable_thread_tracking,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->enable_thread_tracking),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::enable_thread_tracking for RocksDB", nullptr, nullptr, true);

/* Block cache size in bytes; the last macro argument is the block (rounding)
   size, so values are rounded to RDB_MIN_BLOCK_CACHE_SIZE. */
static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "block_cache size for RocksDB", nullptr, nullptr,
                             /* default */ RDB_DEFAULT_BLOCK_CACHE_SIZE,
                             /* min */ RDB_MIN_BLOCK_CACHE_SIZE,
                             /* max */ LONGLONG_MAX,
                             /* Block size */ RDB_MIN_BLOCK_CACHE_SIZE);

/* SimCache wrapper size for cache-hit-rate simulation; 0 disables it. */
static MYSQL_SYSVAR_LONGLONG(sim_cache_size, rocksdb_sim_cache_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "Simulated cache size for RocksDB", nullptr,
                             nullptr,
                             /* default */ 0,
                             /* min */ 0,
                             /* max */ LONGLONG_MAX,
                             /* Block size */ 0);

/* Use ClockCache instead of LRUCache for the block cache. */
static MYSQL_SYSVAR_BOOL(
    use_clock_cache,
    rocksdb_use_clock_cache,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Use ClockCache instead of default LRUCache for RocksDB",
    nullptr, nullptr, false);

/* Keep index/filter blocks in the block cache (default true here). */
static MYSQL_SYSVAR_BOOL(
    cache_index_and_filter_blocks,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->cache_index_and_filter_blocks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::cache_index_and_filter_blocks for RocksDB",
    nullptr, nullptr, true);
1158
// When pin_l0_filter_and_index_blocks_in_cache is true, RocksDB will use the
// LRU cache, but will always keep the filter & index blocks' handles checked
// out (i.e. it won't call ShardedLRUCache::Release), plus the parsed-out
// objects: the LRU cache will never flush them out, hence they're pinned.
//
// This fixes the mutex contention between ShardedLRUCache::Lookup and
// ShardedLRUCache::Release which reduced the QPS ratio (QPS using secondary
// index / QPS using PK).
/* Pin L0 filter/index blocks in the block cache (default true here);
   see the rationale in the comment above. Startup-only. */
static MYSQL_SYSVAR_BOOL(
    pin_l0_filter_and_index_blocks_in_cache,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->pin_l0_filter_and_index_blocks_in_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "pin_l0_filter_and_index_blocks_in_cache for RocksDB", nullptr, nullptr,
    true);
1174
/* SST index format; valid values come from index_type_typelib. */
static MYSQL_SYSVAR_ENUM(index_type, rocksdb_index_type,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "BlockBasedTableOptions::index_type for RocksDB",
                         nullptr, nullptr,
                         (ulong)rocksdb_tbl_options->index_type,
                         &index_type_typelib);

/* Allow hash collisions in the hash index (saves space). */
static MYSQL_SYSVAR_BOOL(
    hash_index_allow_collision,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->hash_index_allow_collision),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::hash_index_allow_collision for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options->hash_index_allow_collision);

/* Disable the block cache entirely. */
static MYSQL_SYSVAR_BOOL(
    no_block_cache,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->no_block_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::no_block_cache for RocksDB", nullptr, nullptr,
    rocksdb_tbl_options->no_block_cache);

/* Uncompressed data block size; minimum of 1 byte. */
static MYSQL_SYSVAR_SIZE_T(block_size, rocksdb_tbl_options->block_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "BlockBasedTableOptions::block_size for RocksDB",
                           nullptr, nullptr, rocksdb_tbl_options->block_size,
                           /* min */ 1L, /* max */ SIZE_T_MAX, 0);

/* Block fill tolerance (% of block_size) before starting a new block. */
static MYSQL_SYSVAR_INT(
    block_size_deviation, rocksdb_tbl_options->block_size_deviation,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_size_deviation for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options->block_size_deviation,
    /* min */ 0, /* max */ INT_MAX, 0);

/* Keys between restart points for delta-encoding within a block. */
static MYSQL_SYSVAR_INT(
    block_restart_interval, rocksdb_tbl_options->block_restart_interval,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_restart_interval for RocksDB", nullptr,
    nullptr, rocksdb_tbl_options->block_restart_interval,
    /* min */ 1, /* max */ INT_MAX, 0);

/* Bloom filter on whole keys (vs prefixes only). */
static MYSQL_SYSVAR_BOOL(
    whole_key_filtering,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->whole_key_filtering),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::whole_key_filtering for RocksDB", nullptr, nullptr,
    rocksdb_tbl_options->whole_key_filtering);

/* Column-family option strings: defaults for all CFs, per-CF overrides, and
   a runtime-updatable per-CF option setter (validated before apply). */
static MYSQL_SYSVAR_STR(default_cf_options, rocksdb_default_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "default cf options for RocksDB", nullptr, nullptr, "");

static MYSQL_SYSVAR_STR(override_cf_options, rocksdb_override_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "option overrides per cf for RocksDB", nullptr, nullptr,
                        "");

static MYSQL_SYSVAR_STR(update_cf_options, rocksdb_update_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC
                        /* psergey-merge: need this? : PLUGIN_VAR_ALLOCATED*/,
                        "Option updates per column family for RocksDB",
                        rocksdb_validate_update_cf_options,
                        rocksdb_set_update_cf_options, nullptr);

/* WAL sync policy at commit, analogous to innodb_flush_log_at_trx_commit;
   the validate callback rejects out-of-range values. */
static MYSQL_SYSVAR_UINT(flush_log_at_trx_commit,
                         rocksdb_flush_log_at_trx_commit, PLUGIN_VAR_RQCMDARG,
                         "Sync on transaction commit. Similar to "
                         "innodb_flush_log_at_trx_commit. 1: sync on commit, "
                         "0,2: not sync on commit",
                         rocksdb_validate_flush_log_at_trx_commit, nullptr,
                         /* default */ FLUSH_LOG_SYNC,
                         /* min */ FLUSH_LOG_NEVER,
                         /* max */ FLUSH_LOG_BACKGROUND, 0);
1249
/* Per-session write options; defaults mirror rocksdb::WriteOptions. */
static MYSQL_THDVAR_BOOL(write_disable_wal, PLUGIN_VAR_RQCMDARG,
                         "WriteOptions::disableWAL for RocksDB", nullptr,
                         nullptr, rocksdb::WriteOptions().disableWAL);

static MYSQL_THDVAR_BOOL(
    write_ignore_missing_column_families, PLUGIN_VAR_RQCMDARG,
    "WriteOptions::ignore_missing_column_families for RocksDB", nullptr,
    nullptr, rocksdb::WriteOptions().ignore_missing_column_families);

/* Per-session read behavior toggles. */
static MYSQL_THDVAR_BOOL(skip_fill_cache, PLUGIN_VAR_RQCMDARG,
                         "Skip filling block cache on read requests", nullptr,
                         nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    unsafe_for_binlog, PLUGIN_VAR_RQCMDARG,
    "Allowing statement based binary logging which may break consistency",
    nullptr, nullptr, FALSE);

/* Test/override knobs for the optimizer's row estimates; 0 = no override. */
static MYSQL_THDVAR_UINT(records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range(). "
                         "Set to a positive number to override",
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_THDVAR_UINT(force_index_records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range() "
                         "when FORCE INDEX is used.",
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_UINT(
    debug_optimizer_n_rows, rocksdb_debug_optimizer_n_rows,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
    "Test only to override rocksdb estimates of table size in a memtable",
    nullptr, nullptr, 0, /* min */ 0, /* max */ INT_MAX, 0);

/* Memtable-statistics computation controls. */
static MYSQL_SYSVAR_BOOL(force_compute_memtable_stats,
                         rocksdb_force_compute_memtable_stats,
                         PLUGIN_VAR_RQCMDARG,
                         "Force to always compute memtable stats",
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_UINT(force_compute_memtable_stats_cachetime,
                         rocksdb_force_compute_memtable_stats_cachetime,
                         PLUGIN_VAR_RQCMDARG,
                         "Time in usecs to cache memtable estimates", nullptr,
                         nullptr, /* default */ 60 * 1000 * 1000,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    debug_optimizer_no_zero_cardinality,
    rocksdb_debug_optimizer_no_zero_cardinality, PLUGIN_VAR_RQCMDARG,
    "In case if cardinality is zero, overrides it with some value", nullptr,
    nullptr, TRUE);

/* "Action" variables: assigning a value triggers the named operation via
   the validate callback; the *_stub update callback keeps the stored value
   inert. */
static MYSQL_SYSVAR_STR(compact_cf, rocksdb_compact_cf_name,
                        PLUGIN_VAR_RQCMDARG, "Compact column family",
                        rocksdb_compact_column_family,
                        rocksdb_compact_column_family_stub, "");

static MYSQL_SYSVAR_STR(create_checkpoint, rocksdb_checkpoint_name,
                        PLUGIN_VAR_RQCMDARG, "Checkpoint directory",
                        rocksdb_create_checkpoint,
                        rocksdb_create_checkpoint_stub, "");

static MYSQL_SYSVAR_BOOL(signal_drop_index_thread,
                         rocksdb_signal_drop_index_thread, PLUGIN_VAR_RQCMDARG,
                         "Wake up drop index thread", nullptr,
                         rocksdb_drop_index_wakeup_thread, FALSE);

static MYSQL_SYSVAR_BOOL(pause_background_work, rocksdb_pause_background_work,
                         PLUGIN_VAR_RQCMDARG,
                         "Disable all rocksdb background operations", nullptr,
                         rocksdb_set_pause_background_work, FALSE);

/* TTL master switch: drop expired records at compaction time. */
static MYSQL_SYSVAR_BOOL(
    enable_ttl, rocksdb_enable_ttl, PLUGIN_VAR_RQCMDARG,
    "Enable expired TTL records to be dropped during compaction.", nullptr,
    nullptr, TRUE);
1329
/* Hide expired TTL rows from reads (not just compaction). */
static MYSQL_SYSVAR_BOOL(
    enable_ttl_read_filtering, rocksdb_enable_ttl_read_filtering,
    PLUGIN_VAR_RQCMDARG,
    "For tables with TTL, expired records are skipped/filtered out during "
    "processing and in query results. Disabling this will allow these records "
    "to be seen, but as a result rows may disappear in the middle of "
    "transactions as they are dropped during compaction. Use with caution.",
    nullptr, nullptr, TRUE);

/* Debug-build-only clock-skew knobs for TTL testing (+/- 1 hour). */
static MYSQL_SYSVAR_INT(
    debug_ttl_rec_ts, rocksdb_debug_ttl_rec_ts, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Overrides the TTL of records to "
    "now() + debug_ttl_rec_ts. The value can be +/- to simulate "
    "a record inserted in the past vs a record inserted in the 'future'. "
    "A value of 0 denotes that the variable is not set. This variable is a "
    "no-op in non-debug builds.",
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_INT(
    debug_ttl_snapshot_ts, rocksdb_debug_ttl_snapshot_ts, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Sets the snapshot during compaction to "
    "now() + debug_set_ttl_snapshot_ts. The value can be +/- to simulate "
    "a snapshot in the past vs a snapshot created in the 'future'. "
    "A value of 0 denotes that the variable is not set. This variable is a "
    "no-op in non-debug builds.",
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_INT(
    debug_ttl_read_filter_ts, rocksdb_debug_ttl_read_filter_ts,
    PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Overrides the TTL read filtering time to "
    "time + debug_ttl_read_filter_ts. A value of 0 denotes that the variable "
    "is not set. This variable is a no-op in non-debug builds.",
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_BOOL(
    debug_ttl_ignore_pk, rocksdb_debug_ttl_ignore_pk, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. If true, compaction filtering will not occur "
    "on PK TTL data. This variable is a no-op in non-debug builds.",
    nullptr, nullptr, FALSE);

/* Action variable: setting it resets RocksDB's internal statistics. */
static MYSQL_SYSVAR_BOOL(
    reset_stats, rocksdb_reset_stats, PLUGIN_VAR_RQCMDARG,
    "Reset the RocksDB internal statistics without restarting the DB.", nullptr,
    rocksdb_set_reset_stats, FALSE);

/* Experimental I/O watchdog timeout in seconds; 0 disables it. */
static MYSQL_SYSVAR_UINT(io_write_timeout, rocksdb_io_write_timeout_secs,
                         PLUGIN_VAR_RQCMDARG,
                         "Timeout for experimental I/O watchdog.", nullptr,
                         rocksdb_set_io_write_timeout, /* default */ 0,
                         /* min */ 0L,
                         /* max */ UINT_MAX, 0);

/* Two-phase commit with the binlog. */
static MYSQL_SYSVAR_BOOL(enable_2pc, rocksdb_enable_2pc, PLUGIN_VAR_RQCMDARG,
                         "Enable two phase commit for MyRocks", nullptr,
                         nullptr, TRUE);

/* Tolerate unknown options found in existing OPTIONS files at open. */
static MYSQL_SYSVAR_BOOL(ignore_unknown_options, rocksdb_ignore_unknown_options,
                         PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                         "Enable ignoring unknown options passed to RocksDB",
                         nullptr, nullptr, TRUE);

/* Collation enforcement for indexed columns, with a regex exception list. */
static MYSQL_SYSVAR_BOOL(strict_collation_check, rocksdb_strict_collation_check,
                         PLUGIN_VAR_RQCMDARG,
                         "Enforce case sensitive collation for MyRocks indexes",
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_STR(strict_collation_exceptions,
                        rocksdb_strict_collation_exceptions,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
                        "List of tables (using regex) that are excluded "
                        "from the case sensitive collation enforcement",
                        nullptr, rocksdb_set_collation_exception_list, "");

/* Collect SST properties (used for stats) on every flush. */
static MYSQL_SYSVAR_BOOL(collect_sst_properties, rocksdb_collect_sst_properties,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Enables collecting SST file properties on each flush",
                         nullptr, nullptr, rocksdb_collect_sst_properties);

/* Action variable: setting it forces a memtable flush. */
static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_now, rocksdb_force_flush_memtable_now_var,
    PLUGIN_VAR_RQCMDARG,
    "Forces memstore flush which may block all write requests so be careful",
    rocksdb_force_flush_memtable_now, rocksdb_force_flush_memtable_now_stub,
    FALSE);
1415
/* Action variable: flush memtables, then compact all L0 files. */
static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_and_lzero_now,
    rocksdb_force_flush_memtable_and_lzero_now_var, PLUGIN_VAR_RQCMDARG,
    "Acts similar to force_flush_memtable_now, but also compacts all L0 files.",
    rocksdb_force_flush_memtable_and_lzero_now,
    rocksdb_force_flush_memtable_and_lzero_now_stub, FALSE);

/* Minimum interval between optimizer-statistics recomputations. */
static MYSQL_SYSVAR_UINT(
    seconds_between_stat_computes, rocksdb_seconds_between_stat_computes,
    PLUGIN_VAR_RQCMDARG,
    "Sets a number of seconds to wait between optimizer stats recomputation. "
    "Only changed indexes will be refreshed.",
    nullptr, nullptr, rocksdb_seconds_between_stat_computes,
    /* min */ 0L, /* max */ UINT_MAX, 0);

/* Deletion-triggered-compaction tuning: delete count per window, window
   size, and minimum file size. All share rocksdb_set_compaction_options. */
static MYSQL_SYSVAR_LONGLONG(compaction_sequential_deletes,
                             rocksdb_compaction_sequential_deletes,
                             PLUGIN_VAR_RQCMDARG,
                             "RocksDB will trigger compaction for the file if "
                             "it has more than this number sequential deletes "
                             "per window",
                             nullptr, rocksdb_set_compaction_options,
                             DEFAULT_COMPACTION_SEQUENTIAL_DELETES,
                             /* min */ 0L,
                             /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES, 0);

static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_window,
    rocksdb_compaction_sequential_deletes_window, PLUGIN_VAR_RQCMDARG,
    "Size of the window for counting rocksdb_compaction_sequential_deletes",
    nullptr, rocksdb_set_compaction_options,
    DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW,
    /* min */ 0L, /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW, 0);

/* min of -1 appears to be a sentinel — TODO confirm its meaning vs 0. */
static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_file_size,
    rocksdb_compaction_sequential_deletes_file_size, PLUGIN_VAR_RQCMDARG,
    "Minimum file size required for compaction_sequential_deletes", nullptr,
    rocksdb_set_compaction_options, 0L,
    /* min */ -1L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    compaction_sequential_deletes_count_sd,
    rocksdb_compaction_sequential_deletes_count_sd, PLUGIN_VAR_RQCMDARG,
    "Counting SingleDelete as rocksdb_compaction_sequential_deletes", nullptr,
    nullptr, rocksdb_compaction_sequential_deletes_count_sd);

/* Log queries hitting snapshot-conflict errors to the error log. */
static MYSQL_SYSVAR_BOOL(
    print_snapshot_conflict_queries, rocksdb_print_snapshot_conflict_queries,
    PLUGIN_VAR_RQCMDARG,
    "Logging queries that got snapshot conflict errors into *.err log", nullptr,
    nullptr, rocksdb_print_snapshot_conflict_queries);

/* Per-session row-checksum controls (debugging aids). */
static MYSQL_THDVAR_INT(checksums_pct, PLUGIN_VAR_RQCMDARG,
                        "How many percentages of rows to be checksummed",
                        nullptr, nullptr, RDB_MAX_CHECKSUMS_PCT,
                        /* min */ 0, /* max */ RDB_MAX_CHECKSUMS_PCT, 0);

static MYSQL_THDVAR_BOOL(store_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Include checksums when writing index/table records",
                         nullptr, nullptr, false /* default value */);

static MYSQL_THDVAR_BOOL(verify_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Verify checksums when reading index/table records",
                         nullptr, nullptr, false /* default value */);

/* Skip row locking on the master (ineffective on a slave). */
static MYSQL_THDVAR_BOOL(master_skip_tx_api, PLUGIN_VAR_RQCMDARG,
                         "Skipping holding any lock on row access. "
                         "Not effective on slave.",
                         nullptr, nullptr, false);
1486
1487static MYSQL_SYSVAR_UINT(
1488 validate_tables, rocksdb_validate_tables,
1489 PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
1490 "Verify all .frm files match all RocksDB tables (0 means no verification, "
1491 "1 means verify and fail on error, and 2 means verify but continue",
1492 nullptr, nullptr, 1 /* default value */, 0 /* min value */,
1493 2 /* max value */, 0);
1494
/* RocksDB data directory, relative to the server datadir by default. */
static MYSQL_SYSVAR_STR(datadir, rocksdb_datadir,
                        PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                        "RocksDB data directory", nullptr, nullptr,
                        "./#rocksdb");

/* Read-only informational variable listing compiled-in compression codecs. */
static MYSQL_SYSVAR_STR(supported_compression_types,
                        compression_types_val,
                        PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY,
                        "Compression algorithms supported by RocksDB",
                        nullptr, nullptr,
                        compression_types_val);

/* Sampling percentage for table-property statistics collection; the odd
   string fragmentation below is clang-format's handling of STRINGIFY_ARG
   inside a concatenated literal — the pieces join into one sentence. */
static MYSQL_SYSVAR_UINT(
    table_stats_sampling_pct, rocksdb_table_stats_sampling_pct,
    PLUGIN_VAR_RQCMDARG,
    "Percentage of entries to sample when collecting statistics about table "
    "properties. Specify either 0 to sample everything or percentage "
    "[" STRINGIFY_ARG(RDB_TBL_STATS_SAMPLE_PCT_MIN) ".." STRINGIFY_ARG(
        RDB_TBL_STATS_SAMPLE_PCT_MAX) "]. "
                                      "By default " STRINGIFY_ARG(
                                          RDB_DEFAULT_TBL_STATS_SAMPLE_PCT) "% "
                                                                            "of"
                                                                            " e"
                                                                            "nt"
                                                                            "ri"
                                                                            "es"
                                                                            " a"
                                                                            "re"
                                                                            " "
                                                                            "sa"
                                                                            "mp"
                                                                            "le"
                                                                            "d"
                                                                            ".",
    nullptr, rocksdb_set_table_stats_sampling_pct, /* default */
    RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0,
    /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0);

/* InnoDB-style large index prefix support (3072 vs 767 bytes). */
static MYSQL_SYSVAR_BOOL(
    large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG,
    "Support large index prefix length of 3072 bytes. If off, the maximum "
    "index prefix length is 767.",
    nullptr, nullptr, FALSE);

/* Allow the server to start even when RocksDB reports corruption. */
static MYSQL_SYSVAR_BOOL(
    allow_to_start_after_corruption, rocksdb_allow_to_start_after_corruption,
    PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
    "Allow server still to start successfully even if RocksDB corruption is "
    "detected.",
    nullptr, nullptr, FALSE);

/* Rough per-key-value on-disk overhead (bytes) used in size estimates. */
static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100;
1547
1548static struct st_mysql_sys_var *rocksdb_system_variables[] = {
1549 MYSQL_SYSVAR(lock_wait_timeout),
1550 MYSQL_SYSVAR(deadlock_detect),
1551 MYSQL_SYSVAR(deadlock_detect_depth),
1552 MYSQL_SYSVAR(max_row_locks),
1553 MYSQL_SYSVAR(write_batch_max_bytes),
1554 MYSQL_SYSVAR(lock_scanned_rows),
1555 MYSQL_SYSVAR(bulk_load),
1556 MYSQL_SYSVAR(bulk_load_allow_unsorted),
1557 MYSQL_SYSVAR(skip_unique_check_tables),
1558 MYSQL_SYSVAR(trace_sst_api),
1559 MYSQL_SYSVAR(commit_in_the_middle),
1560 MYSQL_SYSVAR(blind_delete_primary_key),
1561 MYSQL_SYSVAR(read_free_rpl_tables),
1562 MYSQL_SYSVAR(bulk_load_size),
1563 MYSQL_SYSVAR(merge_buf_size),
1564 MYSQL_SYSVAR(enable_bulk_load_api),
1565 MYSQL_SYSVAR(tmpdir),
1566 MYSQL_SYSVAR(merge_combine_read_size),
1567 MYSQL_SYSVAR(merge_tmp_file_removal_delay_ms),
1568 MYSQL_SYSVAR(skip_bloom_filter_on_read),
1569
1570 MYSQL_SYSVAR(create_if_missing),
1571 MYSQL_SYSVAR(two_write_queues),
1572 MYSQL_SYSVAR(manual_wal_flush),
1573 MYSQL_SYSVAR(create_missing_column_families),
1574 MYSQL_SYSVAR(error_if_exists),
1575 MYSQL_SYSVAR(paranoid_checks),
1576 MYSQL_SYSVAR(rate_limiter_bytes_per_sec),
1577 MYSQL_SYSVAR(sst_mgr_rate_bytes_per_sec),
1578 MYSQL_SYSVAR(delayed_write_rate),
1579 MYSQL_SYSVAR(max_latest_deadlocks),
1580 MYSQL_SYSVAR(info_log_level),
1581 MYSQL_SYSVAR(max_open_files),
1582 MYSQL_SYSVAR(max_total_wal_size),
1583 MYSQL_SYSVAR(use_fsync),
1584 MYSQL_SYSVAR(wal_dir),
1585 MYSQL_SYSVAR(persistent_cache_path),
1586 MYSQL_SYSVAR(persistent_cache_size_mb),
1587 MYSQL_SYSVAR(delete_obsolete_files_period_micros),
1588 MYSQL_SYSVAR(max_background_jobs),
1589 MYSQL_SYSVAR(max_log_file_size),
1590 MYSQL_SYSVAR(max_subcompactions),
1591 MYSQL_SYSVAR(log_file_time_to_roll),
1592 MYSQL_SYSVAR(keep_log_file_num),
1593 MYSQL_SYSVAR(max_manifest_file_size),
1594 MYSQL_SYSVAR(table_cache_numshardbits),
1595 MYSQL_SYSVAR(wal_ttl_seconds),
1596 MYSQL_SYSVAR(wal_size_limit_mb),
1597 MYSQL_SYSVAR(manifest_preallocation_size),
1598 MYSQL_SYSVAR(use_direct_reads),
1599 MYSQL_SYSVAR(use_direct_io_for_flush_and_compaction),
1600 MYSQL_SYSVAR(allow_mmap_reads),
1601 MYSQL_SYSVAR(allow_mmap_writes),
1602 MYSQL_SYSVAR(is_fd_close_on_exec),
1603 MYSQL_SYSVAR(stats_dump_period_sec),
1604 MYSQL_SYSVAR(advise_random_on_open),
1605 MYSQL_SYSVAR(db_write_buffer_size),
1606 MYSQL_SYSVAR(use_adaptive_mutex),
1607 MYSQL_SYSVAR(bytes_per_sync),
1608 MYSQL_SYSVAR(wal_bytes_per_sync),
1609 MYSQL_SYSVAR(enable_thread_tracking),
1610 MYSQL_SYSVAR(perf_context_level),
1611 MYSQL_SYSVAR(wal_recovery_mode),
1612 MYSQL_SYSVAR(access_hint_on_compaction_start),
1613 MYSQL_SYSVAR(new_table_reader_for_compaction_inputs),
1614 MYSQL_SYSVAR(compaction_readahead_size),
1615 MYSQL_SYSVAR(allow_concurrent_memtable_write),
1616 MYSQL_SYSVAR(enable_write_thread_adaptive_yield),
1617
1618 MYSQL_SYSVAR(block_cache_size),
1619 MYSQL_SYSVAR(sim_cache_size),
1620 MYSQL_SYSVAR(use_clock_cache),
1621 MYSQL_SYSVAR(cache_index_and_filter_blocks),
1622 MYSQL_SYSVAR(pin_l0_filter_and_index_blocks_in_cache),
1623 MYSQL_SYSVAR(index_type),
1624 MYSQL_SYSVAR(hash_index_allow_collision),
1625 MYSQL_SYSVAR(no_block_cache),
1626 MYSQL_SYSVAR(block_size),
1627 MYSQL_SYSVAR(block_size_deviation),
1628 MYSQL_SYSVAR(block_restart_interval),
1629 MYSQL_SYSVAR(whole_key_filtering),
1630
1631 MYSQL_SYSVAR(default_cf_options),
1632 MYSQL_SYSVAR(override_cf_options),
1633 MYSQL_SYSVAR(update_cf_options),
1634
1635 MYSQL_SYSVAR(flush_log_at_trx_commit),
1636 MYSQL_SYSVAR(write_disable_wal),
1637 MYSQL_SYSVAR(write_ignore_missing_column_families),
1638
1639 MYSQL_SYSVAR(skip_fill_cache),
1640 MYSQL_SYSVAR(unsafe_for_binlog),
1641
1642 MYSQL_SYSVAR(records_in_range),
1643 MYSQL_SYSVAR(force_index_records_in_range),
1644 MYSQL_SYSVAR(debug_optimizer_n_rows),
1645 MYSQL_SYSVAR(force_compute_memtable_stats),
1646 MYSQL_SYSVAR(force_compute_memtable_stats_cachetime),
1647 MYSQL_SYSVAR(debug_optimizer_no_zero_cardinality),
1648
1649 MYSQL_SYSVAR(compact_cf),
1650 MYSQL_SYSVAR(signal_drop_index_thread),
1651 MYSQL_SYSVAR(pause_background_work),
1652 MYSQL_SYSVAR(enable_2pc),
1653 MYSQL_SYSVAR(ignore_unknown_options),
1654 MYSQL_SYSVAR(strict_collation_check),
1655 MYSQL_SYSVAR(strict_collation_exceptions),
1656 MYSQL_SYSVAR(collect_sst_properties),
1657 MYSQL_SYSVAR(force_flush_memtable_now),
1658 MYSQL_SYSVAR(force_flush_memtable_and_lzero_now),
1659 MYSQL_SYSVAR(enable_ttl),
1660 MYSQL_SYSVAR(enable_ttl_read_filtering),
1661 MYSQL_SYSVAR(debug_ttl_rec_ts),
1662 MYSQL_SYSVAR(debug_ttl_snapshot_ts),
1663 MYSQL_SYSVAR(debug_ttl_read_filter_ts),
1664 MYSQL_SYSVAR(debug_ttl_ignore_pk),
1665 MYSQL_SYSVAR(reset_stats),
1666 MYSQL_SYSVAR(io_write_timeout),
1667 MYSQL_SYSVAR(seconds_between_stat_computes),
1668
1669 MYSQL_SYSVAR(compaction_sequential_deletes),
1670 MYSQL_SYSVAR(compaction_sequential_deletes_window),
1671 MYSQL_SYSVAR(compaction_sequential_deletes_file_size),
1672 MYSQL_SYSVAR(compaction_sequential_deletes_count_sd),
1673 MYSQL_SYSVAR(print_snapshot_conflict_queries),
1674
1675 MYSQL_SYSVAR(datadir),
1676 MYSQL_SYSVAR(supported_compression_types),
1677 MYSQL_SYSVAR(create_checkpoint),
1678
1679 MYSQL_SYSVAR(checksums_pct),
1680 MYSQL_SYSVAR(store_row_debug_checksums),
1681 MYSQL_SYSVAR(verify_row_debug_checksums),
1682 MYSQL_SYSVAR(master_skip_tx_api),
1683
1684 MYSQL_SYSVAR(validate_tables),
1685 MYSQL_SYSVAR(table_stats_sampling_pct),
1686
1687 MYSQL_SYSVAR(large_prefix),
1688 MYSQL_SYSVAR(allow_to_start_after_corruption),
1689 MYSQL_SYSVAR(git_hash),
1690 nullptr};
1691
1692static rocksdb::WriteOptions
1693rdb_get_rocksdb_write_options(my_core::THD *const thd) {
1694 rocksdb::WriteOptions opt;
1695
1696 opt.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
1697 opt.disableWAL = THDVAR(thd, write_disable_wal);
1698 opt.ignore_missing_column_families =
1699 THDVAR(thd, write_ignore_missing_column_families);
1700
1701 return opt;
1702}
1703
1704///////////////////////////////////////////////////////////////////////////////////////////
1705
1706/**
1707 @brief
1708 Function we use in the creation of our hash to get key.
1709*/
1710
1711uchar *
1712Rdb_open_tables_map::get_hash_key(const Rdb_table_handler *const table_handler,
1713 size_t *const length,
1714 my_bool not_used MY_ATTRIBUTE((__unused__))) {
1715 *length = table_handler->m_table_name_length;
1716 return reinterpret_cast<uchar *>(table_handler->m_table_name);
1717}
1718
1719/*
1720 Drop index thread's control
1721*/
1722
1723static Rdb_drop_index_thread rdb_drop_idx_thread;
1724
1725static void rocksdb_drop_index_wakeup_thread(
1726 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
1727 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
1728 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
1729 if (*static_cast<const bool *>(save)) {
1730 rdb_drop_idx_thread.signal();
1731 }
1732}
1733
1734static inline uint32_t rocksdb_perf_context_level(THD *const thd) {
1735 DBUG_ASSERT(thd != nullptr);
1736
1737 const int session_perf_context_level = THDVAR(thd, perf_context_level);
1738 if (session_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
1739 return session_perf_context_level;
1740 }
1741
1742 /*
1743 Fallback to global thdvar, if session specific one was not set to a valid
1744 value.
1745 */
1746
1747 const int global_perf_context_level = THDVAR(nullptr, perf_context_level);
1748 if (global_perf_context_level > rocksdb::PerfLevel::kUninitialized) {
1749 return global_perf_context_level;
1750 }
1751
1752 return rocksdb::PerfLevel::kDisable;
1753}
1754
1755/*
1756 Very short (functor-like) interface to be passed to
1757 Rdb_transaction::walk_tx_list()
1758*/
1759
1760interface Rdb_tx_list_walker {
1761 virtual ~Rdb_tx_list_walker() {}
1762 virtual void process_tran(const Rdb_transaction *const) = 0;
1763};
1764
1765/*
1766 This is a helper class that is passed to RocksDB to get notifications when
1767 a snapshot gets created.
1768*/
1769
1770class Rdb_snapshot_notifier : public rocksdb::TransactionNotifier {
1771 Rdb_transaction *m_owning_tx;
1772
1773 void SnapshotCreated(const rocksdb::Snapshot *snapshot) override;
1774
1775public:
1776 Rdb_snapshot_notifier(const Rdb_snapshot_notifier &) = delete;
1777 Rdb_snapshot_notifier &operator=(const Rdb_snapshot_notifier &) = delete;
1778
1779 explicit Rdb_snapshot_notifier(Rdb_transaction *const owning_tx)
1780 : m_owning_tx(owning_tx) {}
1781
1782 // If the owning Rdb_transaction gets destructed we need to not reference
1783 // it anymore.
1784 void detach() { m_owning_tx = nullptr; }
1785};
1786
1787
1788#ifdef MARIAROCKS_NOT_YET
1789// ER_LOCK_WAIT_TIMEOUT error also has a reason in facebook/mysql-5.6
1790#endif
1791String timeout_message(const char *command, const char *name1,
1792 const char *name2)
1793{
1794 String msg;
1795 msg.append("Timeout on ");
1796 msg.append(command);
1797 msg.append(": ");
1798 msg.append(name1);
1799 if (name2 && name2[0])
1800 {
1801 msg.append(".");
1802 msg.append(name2);
1803 }
1804 return msg;
1805}
1806
1807
/*
  This is the base class for transactions when interacting with rocksdb.

  It owns the per-transaction bookkeeping: write/lock counters, the read
  snapshot (m_read_opts.snapshot), pending auto-increment values, bulk
  load state, and RocksDB savepoints. Every live instance registers
  itself in the static s_tx_list so walk_tx_list() can visit all
  transactions.
*/
class Rdb_transaction {
protected:
  ulonglong m_write_count = 0;
  ulonglong m_insert_count = 0;
  ulonglong m_update_count = 0;
  ulonglong m_delete_count = 0;
  ulonglong m_lock_count = 0;
  // Largest auto-increment value seen per index; merged into the data
  // dictionary by merge_auto_incr_map() at prepare/commit time.
  std::unordered_map<GL_INDEX_ID, ulonglong> m_auto_incr_map;

  // True when a snapshot will be taken lazily on the next operation
  // (see acquire_snapshot() in the derived classes).
  bool m_is_delayed_snapshot = false;
  bool m_is_two_phase = false;

private:
  /* Number of RockDB savepoints taken */
  int m_n_savepoints;
  /*
    Number of write operations this transaction had when we took the last
    savepoint (the idea is not to take another savepoint if we haven't made
    any changes)
  */
  ulonglong m_writes_at_last_savepoint;

protected:
  THD *m_thd = nullptr;

  rocksdb::ReadOptions m_read_opts;

  // Registry of all live transactions; guarded by s_tx_list_mutex.
  static std::multiset<Rdb_transaction *> s_tx_list;
  static mysql_mutex_t s_tx_list_mutex;

  // Perf-context accumulator of the first table used in the current
  // query; see io_perf_start() for the rationale.
  Rdb_io_perf *m_tbl_io_perf;

  bool m_tx_read_only = false;

  int m_timeout_sec; /* Cached value of @@rocksdb_lock_wait_timeout */

  /* Maximum number of locks the transaction can have */
  ulonglong m_max_row_locks;

  bool m_is_tx_failed = false;
  bool m_rollback_only = false;

  std::shared_ptr<Rdb_snapshot_notifier> m_notifier;

  // This should be used only when updating binlog information.
  virtual rocksdb::WriteBatchBase *get_write_batch() = 0;
  virtual bool commit_no_binlog() = 0;
  virtual rocksdb::Iterator *
  get_iterator(const rocksdb::ReadOptions &options,
               rocksdb::ColumnFamilyHandle *column_family) = 0;

protected:
  /*
    The following two are helper functions to be overloaded by child classes.
    They should provide RocksDB's savepoint semantics.
  */
  virtual void do_set_savepoint() = 0;
  virtual void do_rollback_to_savepoint() = 0;

  /*
    @detail
    This function takes in the WriteBatch of the transaction to add
    all the AUTO_INCREMENT merges. It does so by iterating through
    m_auto_incr_map and then constructing key/value pairs to call merge upon.

    @param wb
  */
  rocksdb::Status merge_auto_incr_map(rocksdb::WriteBatchBase *const wb) {
    DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", return rocksdb::Status::OK(););

    // Iterate through the merge map merging all keys into data dictionary.
    rocksdb::Status s;
    for (auto &it : m_auto_incr_map) {
      s = dict_manager.put_auto_incr_val(wb, it.first, it.second);
      if (!s.ok()) {
        // Stop at the first failure; the map is left intact in this case.
        return s;
      }
    }
    m_auto_incr_map.clear();
    return s;
  }

 public:
  const char *m_mysql_log_file_name;
  my_off_t m_mysql_log_offset;
#ifdef MARIAROCKS_NOT_YET
  // TODO: MariaDB probably doesn't need these at all:
  const char *m_mysql_gtid;
  const char *m_mysql_max_gtid;
#endif
  String m_detailed_error;
  int64_t m_snapshot_timestamp = 0;
  bool m_ddl_transaction;

  /*
    Tracks the number of tables in use through external_lock.
    This should not be reset during start_tx().
  */
  int64_t m_n_mysql_tables_in_use = 0;

  /*
    MariaDB's group commit:
  */
  bool commit_ordered_done;
  bool commit_ordered_res;

  /*
    for distinction between rdb_transaction_impl and rdb_writebatch_impl
    when using walk tx list
  */
  virtual bool is_writebatch_trx() const = 0;

  static void init_mutex() {
    mysql_mutex_init(key_mutex_tx_list, &s_tx_list_mutex, MY_MUTEX_INIT_FAST);
  }

  static void term_mutex() {
    DBUG_ASSERT(s_tx_list.size() == 0);
    mysql_mutex_destroy(&s_tx_list_mutex);
  }

  // Invoke walker->process_tran() on every live transaction, holding
  // the tx-list mutex for the whole traversal.
  static void walk_tx_list(Rdb_tx_list_walker *walker) {
    DBUG_ASSERT(walker != nullptr);

    RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);

    for (auto it : s_tx_list)
      walker->process_tran(it);

    RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
  }

  // Translate a non-OK RocksDB status into a MySQL handler error code,
  // updating the per-table counters, global status counters and
  // m_detailed_error along the way.
  int set_status_error(THD *const thd, const rocksdb::Status &s,
                       const Rdb_key_def &kd, Rdb_tbl_def *const tbl_def,
                       Rdb_table_handler *const table_handler) {
    DBUG_ASSERT(!s.ok());
    DBUG_ASSERT(tbl_def != nullptr);

    if (s.IsTimedOut()) {
      /*
        SQL layer has weird expectations. If we return an error when
        doing a read in DELETE IGNORE, it will ignore the error ("because it's
        an IGNORE command!) but then will fail an assert, because "error code
        was returned, but no error happened". Do what InnoDB's
        convert_error_code_to_mysql() does: force a statement
        rollback before returning HA_ERR_LOCK_WAIT_TIMEOUT:
      */
      my_core::thd_mark_transaction_to_rollback(thd, false /*just statement*/);
      m_detailed_error.copy(timeout_message(
          "index", tbl_def->full_tablename().c_str(), kd.get_name().c_str()));
      table_handler->m_lock_wait_timeout_counter.inc();
      rocksdb_row_lock_wait_timeouts++;

      return HA_ERR_LOCK_WAIT_TIMEOUT;
    }

    if (s.IsDeadlock()) {
      my_core::thd_mark_transaction_to_rollback(thd,
                                                false /* just statement */);
      m_detailed_error = String();
      table_handler->m_deadlock_counter.inc();
      rocksdb_row_lock_deadlocks++;
      return HA_ERR_LOCK_DEADLOCK;
    } else if (s.IsBusy()) {
      // Snapshot conflict: reported to the SQL layer as a deadlock.
      rocksdb_snapshot_conflict_errors++;
      if (rocksdb_print_snapshot_conflict_queries) {
        char user_host_buff[MAX_USER_HOST_SIZE + 1];
        make_user_name(thd, user_host_buff);
        // NO_LINT_DEBUG
        sql_print_warning("Got snapshot conflict errors: User: %s "
                          "Query: %s",
                          user_host_buff, thd->query());
      }
      m_detailed_error = String(" (snapshot conflict)", system_charset_info);
      table_handler->m_deadlock_counter.inc();
      return HA_ERR_LOCK_DEADLOCK;
    }

    if (s.IsIOError() || s.IsCorruption()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
    }

    return ha_rocksdb::rdb_error_to_mysql(s);
  }

  THD *get_thd() const { return m_thd; }

  /* Used for tracking io_perf counters */
  void io_perf_start(Rdb_io_perf *const io_perf) {
    /*
      Since perf_context is tracked per thread, it is difficult and expensive
      to maintain perf_context on a per table basis. Therefore, roll all
      perf_context data into the first table used in a query. This works well
      for single table queries and is probably good enough for queries that hit
      multiple tables.

      perf_context stats gathering is started when the table lock is acquired
      or when ha_rocksdb::start_stmt is called in case of LOCK TABLES. They
      are recorded when the table lock is released, or when commit/rollback
      is called on the transaction, whichever comes first. Table lock release
      and commit/rollback can happen in different orders. In the case where
      the lock is released before commit/rollback is called, an extra step to
      gather stats during commit/rollback is needed.
    */
    if (m_tbl_io_perf == nullptr &&
        io_perf->start(rocksdb_perf_context_level(m_thd))) {
      m_tbl_io_perf = io_perf;
    }
  }

  void io_perf_end_and_record(void) {
    if (m_tbl_io_perf != nullptr) {
      m_tbl_io_perf->end_and_record(rocksdb_perf_context_level(m_thd));
      m_tbl_io_perf = nullptr;
    }
  }

  // Record and detach only if io_perf is the currently tracked counter.
  void io_perf_end_and_record(Rdb_io_perf *const io_perf) {
    if (m_tbl_io_perf == io_perf) {
      io_perf_end_and_record();
    }
  }

  void update_bytes_written(ulonglong bytes_written) {
    if (m_tbl_io_perf != nullptr) {
      m_tbl_io_perf->update_bytes_written(rocksdb_perf_context_level(m_thd),
                                          bytes_written);
    }
  }

  // Cache the lock-wait timeout and row-lock budget, and push the
  // timeout down to the underlying transaction object.
  void set_params(int timeout_sec_arg, int max_row_locks_arg) {
    m_timeout_sec = timeout_sec_arg;
    m_max_row_locks = max_row_locks_arg;
    set_lock_timeout(timeout_sec_arg);
  }

  virtual void set_lock_timeout(int timeout_sec_arg) = 0;

  ulonglong get_write_count() const { return m_write_count; }

  ulonglong get_insert_count() const { return m_insert_count; }

  ulonglong get_update_count() const { return m_update_count; }

  ulonglong get_delete_count() const { return m_delete_count; }

  void incr_insert_count() { ++m_insert_count; }

  void incr_update_count() { ++m_update_count; }

  void incr_delete_count() { ++m_delete_count; }

  int get_timeout_sec() const { return m_timeout_sec; }

  ulonglong get_lock_count() const { return m_lock_count; }

  virtual void set_sync(bool sync) = 0;

  virtual void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                            const std::string &rowkey) = 0;

  virtual bool prepare(const rocksdb::TransactionName &name) = 0;

  // Commit unless a failure was recorded, in which case roll back.
  // Returns commit()'s result, or false after a rollback.
  bool commit_or_rollback() {
    bool res;
    if (m_is_tx_failed) {
      rollback();
      res = false;
    } else
      res = commit();
    return res;
  }

  // Commit the transaction. An empty transaction is turned into a
  // rollback; a rollback-only transaction raises ER_ROLLBACK_ONLY and
  // rolls back. Returns true on error.
  bool commit() {
    if (get_write_count() == 0) {
      rollback();
      return false;
    } else if (m_rollback_only) {
      /*
        Transactions marked as rollback_only are expected to be rolled back at
        prepare(). But there are some exceptions like below that prepare() is
        never called and commit() is called instead.
        1. Binlog is disabled
        2. No modification exists in binlog cache for the transaction (#195)
        In both cases, rolling back transaction is safe. Nothing is written to
        binlog.
      */
      my_error(ER_ROLLBACK_ONLY, MYF(0));
      rollback();
      return true;
    } else {
#ifdef MARIAROCKS_NOT_YET
      /*
        Storing binlog position inside MyRocks is needed only for restoring
        MyRocks from backups. This feature is not supported yet.
      */
      mysql_bin_log_commit_pos(m_thd, &m_mysql_log_offset,
                               &m_mysql_log_file_name);
      binlog_manager.update(m_mysql_log_file_name, m_mysql_log_offset,
                            get_write_batch());
#endif
      return commit_no_binlog();
    }
  }

  virtual void rollback() = 0;

  // Invoked (via Rdb_snapshot_notifier) when RocksDB actually creates
  // the snapshot we asked for; records it and its creation time.
  void snapshot_created(const rocksdb::Snapshot *const snapshot) {
    DBUG_ASSERT(snapshot != nullptr);

    m_read_opts.snapshot = snapshot;
    rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
    m_is_delayed_snapshot = false;
  }

  virtual void acquire_snapshot(bool acquire_now) = 0;
  virtual void release_snapshot() = 0;

  bool has_snapshot() const { return m_read_opts.snapshot != nullptr; }

private:
  // The Rdb_sst_info structures we are currently loading. In a partitioned
  // table this can have more than one entry
  std::vector<std::shared_ptr<Rdb_sst_info>> m_curr_bulk_load;
  std::string m_curr_bulk_load_tablename;

  /* External merge sorts for bulk load: key ID -> merge sort instance */
  std::unordered_map<GL_INDEX_ID, Rdb_index_merge> m_key_merge;

public:
  // Return (via *key_merge) the merge-sort instance for the given
  // index, creating and initializing it on first use. Returns
  // HA_EXIT_SUCCESS or the error from Rdb_index_merge::init().
  int get_key_merge(GL_INDEX_ID kd_gl_id, rocksdb::ColumnFamilyHandle *cf,
                    Rdb_index_merge **key_merge) {
    int res;
    auto it = m_key_merge.find(kd_gl_id);
    if (it == m_key_merge.end()) {
      m_key_merge.emplace(
          std::piecewise_construct, std::make_tuple(kd_gl_id),
          std::make_tuple(
              get_rocksdb_tmpdir(), THDVAR(get_thd(), merge_buf_size),
              THDVAR(get_thd(), merge_combine_read_size),
              THDVAR(get_thd(), merge_tmp_file_removal_delay_ms), cf));
      it = m_key_merge.find(kd_gl_id);
      if ((res = it->second.init()) != 0) {
        return res;
      }
    }
    *key_merge = &it->second;
    return HA_EXIT_SUCCESS;
  }

  // Commit all outstanding bulk-load SST files and flush any pending
  // index-merge sort buffers into SST files as well. Returns the first
  // error seen; the per-table commit loop continues past errors.
  int finish_bulk_load(int print_client_error = true) {
    int rc = 0, rc2;

    std::vector<std::shared_ptr<Rdb_sst_info>>::iterator it;
    for (it = m_curr_bulk_load.begin(); it != m_curr_bulk_load.end(); it++) {
      rc2 = (*it)->commit(print_client_error);
      if (rc2 != 0 && rc == 0) {
        rc = rc2;
      }
    }
    m_curr_bulk_load.clear();
    m_curr_bulk_load_tablename.clear();
    DBUG_ASSERT(m_curr_bulk_load.size() == 0);

    // Flush the index_merge sort buffers
    if (!m_key_merge.empty()) {
      rocksdb::Slice merge_key;
      rocksdb::Slice merge_val;
      for (auto it = m_key_merge.begin(); it != m_key_merge.end(); it++) {
        GL_INDEX_ID index_id = it->first;
        std::shared_ptr<const Rdb_key_def> keydef =
            ddl_manager.safe_find(index_id);
        std::string table_name = ddl_manager.safe_get_table_name(index_id);

        // Unable to find key definition or table name since the
        // table could have been dropped.
        // TODO(herman): there is a race here between dropping the table
        // and detecting a drop here. If the table is dropped while bulk
        // loading is finishing, these keys being added here may
        // be missed by the compaction filter and not be marked for
        // removal. It is unclear how to lock the sql table from the storage
        // engine to prevent modifications to it while bulk load is occurring.
        if (keydef == nullptr || table_name.empty()) {
          rc2 = HA_ERR_ROCKSDB_BULK_LOAD;
          break;
        }
        const std::string &index_name = keydef->get_name();
        Rdb_index_merge &rdb_merge = it->second;

        // Rdb_sst_info expects a denormalized table name in the form of
        // "./database/table"
        std::replace(table_name.begin(), table_name.end(), '.', '/');
        table_name = "./" + table_name;
        Rdb_sst_info sst_info(rdb, table_name, index_name, rdb_merge.get_cf(),
                              *rocksdb_db_options,
                              THDVAR(get_thd(), trace_sst_api));

        while ((rc2 = rdb_merge.next(&merge_key, &merge_val)) == 0) {
          if ((rc2 = sst_info.put(merge_key, merge_val)) != 0) {
            break;
          }
        }

        // rc2 == -1 => finished ok; rc2 > 0 => error
        if (rc2 > 0 || (rc2 = sst_info.commit(print_client_error)) != 0) {
          if (rc == 0) {
            rc = rc2;
          }
          break;
        }
      }
      m_key_merge.clear();

      /*
        Explicitly tell jemalloc to clean up any unused dirty pages at this
        point.
        See https://reviews.facebook.net/D63723 for more details.
      */
      purge_all_jemalloc_arenas();
    }
    return rc;
  }

  // Register an Rdb_sst_info for the table being bulk-loaded; closes
  // out any in-progress bulk load of a different table first.
  int start_bulk_load(ha_rocksdb *const bulk_load,
                      std::shared_ptr<Rdb_sst_info> sst_info) {
    /*
      If we already have an open bulk load of a table and the name doesn't
      match the current one, close out the currently running one. This allows
      multiple bulk loads to occur on a partitioned table, but then closes
      them all out when we switch to another table.
    */
    DBUG_ASSERT(bulk_load != nullptr);

    if (!m_curr_bulk_load.empty() &&
        bulk_load->get_table_basename() != m_curr_bulk_load_tablename) {
      const auto res = finish_bulk_load();
      if (res != HA_EXIT_SUCCESS) {
        m_curr_bulk_load.clear();
        m_curr_bulk_load_tablename.clear();
        return res;
      }
    }

    /*
      This used to track ha_rocksdb handler objects, but those can be
      freed by the table cache while this was referencing them. Instead
      of tracking ha_rocksdb handler objects, this now tracks the
      Rdb_sst_info allocated, and both the ha_rocksdb handler and the
      Rdb_transaction both have shared pointers to them.

      On transaction complete, it will commit each Rdb_sst_info structure found.
      If the ha_rocksdb object is freed, etc., it will also commit
      the Rdb_sst_info. The Rdb_sst_info commit path needs to be idempotent.
    */
    m_curr_bulk_load.push_back(sst_info);
    m_curr_bulk_load_tablename = bulk_load->get_table_basename();
    return HA_EXIT_SUCCESS;
  }

  int num_ongoing_bulk_load() const { return m_curr_bulk_load.size(); }

  // Session-level tmpdir for external merge sorts; nullptr means "use
  // the server-wide --tmpdir".
  const char *get_rocksdb_tmpdir() const {
    const char *tmp_dir = THDVAR(get_thd(), tmpdir);

    /*
      We want to treat an empty string as nullptr, in these cases DDL operations
      will use the default --tmpdir passed to mysql instead.
    */
    if (tmp_dir != nullptr && *tmp_dir == '\0') {
      tmp_dir = nullptr;
    }
    return (tmp_dir);
  }

  /*
    Flush the data accumulated so far. This assumes we're doing a bulk insert.

    @detail
    This should work like transaction commit, except that we don't
    synchronize with the binlog (there is no API that would allow to have
    binlog flush the changes accumulated so far and return its current
    position)

    @todo
    Add test coverage for what happens when somebody attempts to do bulk
    inserts while inside a multi-statement transaction.
  */
  bool flush_batch() {
    if (get_write_count() == 0)
      return false;

    /* Commit the current transaction */
    if (commit_no_binlog())
      return true;

    /* Start another one */
    start_tx();
    return false;
  }

  // Remember the largest auto-increment value generated for the index
  // so far in this transaction.
  void set_auto_incr(const GL_INDEX_ID &gl_index_id, ulonglong curr_id) {
    m_auto_incr_map[gl_index_id] =
        std::max(m_auto_incr_map[gl_index_id], curr_id);
  }

#ifndef NDEBUG
  ulonglong get_auto_incr(const GL_INDEX_ID &gl_index_id) {
    if (m_auto_incr_map.count(gl_index_id) > 0) {
      return m_auto_incr_map[gl_index_id];
    }
    return 0;
  }
#endif

  virtual rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                              const rocksdb::Slice &key,
                              const rocksdb::Slice &value) = 0;
  virtual rocksdb::Status
  delete_key(rocksdb::ColumnFamilyHandle *const column_family,
             const rocksdb::Slice &key) = 0;
  virtual rocksdb::Status
  single_delete(rocksdb::ColumnFamilyHandle *const column_family,
                const rocksdb::Slice &key) = 0;

  virtual bool has_modifications() const = 0;

  virtual rocksdb::WriteBatchBase *get_indexed_write_batch() = 0;
  /*
    Return a WriteBatch that one can write to. The writes will skip any
    transaction locking. The writes will NOT be visible to the transaction.
  */
  rocksdb::WriteBatchBase *get_blind_write_batch() {
    return get_indexed_write_batch()->GetWriteBatch();
  }

  virtual rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                              const rocksdb::Slice &key,
                              rocksdb::PinnableSlice *const value) const = 0;
  virtual rocksdb::Status
  get_for_update(rocksdb::ColumnFamilyHandle *const column_family,
                 const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
                 bool exclusive) = 0;

  // Build an iterator over the column family, configured from the
  // transaction's read options plus the given flags.
  rocksdb::Iterator *
  get_iterator(rocksdb::ColumnFamilyHandle *const column_family,
               bool skip_bloom_filter, bool fill_cache,
               const rocksdb::Slice &eq_cond_lower_bound,
               const rocksdb::Slice &eq_cond_upper_bound,
               bool read_current = false, bool create_snapshot = true) {
    // Make sure we are not doing both read_current (which implies we don't
    // want a snapshot) and create_snapshot which makes sure we create
    // a snapshot
    DBUG_ASSERT(column_family != nullptr);
    DBUG_ASSERT(!read_current || !create_snapshot);

    if (create_snapshot)
      acquire_snapshot(true);

    rocksdb::ReadOptions options = m_read_opts;

    if (skip_bloom_filter) {
      options.total_order_seek = true;
      // NOTE(review): ReadOptions stores pointers to these bound Slices,
      // so the caller's Slices must outlive the returned iterator —
      // verify at the call sites.
      options.iterate_lower_bound = &eq_cond_lower_bound;
      options.iterate_upper_bound = &eq_cond_upper_bound;
    } else {
      // With this option, Iterator::Valid() returns false if key
      // is outside of the prefix bloom filter range set at Seek().
      // Must not be set to true if not using bloom filter.
      options.prefix_same_as_start = true;
    }
    options.fill_cache = fill_cache;
    if (read_current) {
      options.snapshot = nullptr;
    }
    return get_iterator(options, column_family);
  }

  virtual bool is_tx_started() const = 0;
  virtual void start_tx() = 0;
  virtual void start_stmt() = 0;

  void set_initial_savepoint() {
    /*
      Set the initial savepoint. If the first statement in the transaction
      fails, we need something to roll back to, without rolling back the
      entire transaction.
    */
    do_set_savepoint();
    m_n_savepoints= 1;
    m_writes_at_last_savepoint= m_write_count;
  }

  /*
    Called when a "top-level" statement inside a transaction completes
    successfully and its changes become part of the transaction's changes.
  */
  void make_stmt_savepoint_permanent() {

    // Take another RocksDB savepoint only if we had changes since the last
    // one. This is very important for long transactions doing lots of
    // SELECTs.
    if (m_writes_at_last_savepoint != m_write_count)
    {
      do_set_savepoint();
      m_writes_at_last_savepoint= m_write_count;
      m_n_savepoints++;
    }
  }


  /*
    Rollback to the savepoint we've set before the last statement
  */
  void rollback_to_stmt_savepoint() {
    if (m_writes_at_last_savepoint != m_write_count) {
      do_rollback_to_savepoint();
      // Maintain the invariant that there is always at least one
      // savepoint to roll back to (see set_initial_savepoint()).
      if (!--m_n_savepoints) {
        do_set_savepoint();
        m_n_savepoints= 1;
      }
      m_writes_at_last_savepoint= m_write_count;
    }
  }

  virtual void rollback_stmt() = 0;

  void set_tx_failed(bool failed_arg) { m_is_tx_failed = failed_arg; }

  bool can_prepare() const {
    if (m_rollback_only) {
      my_error(ER_ROLLBACK_ONLY, MYF(0));
      return false;
    }
    return true;
  }

  // Rolling back to an arbitrary savepoint is refused once the
  // transaction has modifications: report an error and mark the
  // transaction rollback-only. (The savepoint argument is unused.)
  int rollback_to_savepoint(void *const savepoint) {
    if (has_modifications()) {
      my_error(ER_ROLLBACK_TO_SAVEPOINT, MYF(0));
      m_rollback_only = true;
      return HA_EXIT_FAILURE;
    }
    return HA_EXIT_SUCCESS;
  }

  /*
    This is used by transactions started with "START TRANSACTION WITH "
    "CONSISTENT [ROCKSDB] SNAPSHOT". When tx_read_only is turned on,
    snapshot has to be created via DB::GetSnapshot(), not via Transaction
    API.
  */
  bool is_tx_read_only() const { return m_tx_read_only; }

  bool is_two_phase() const { return m_is_two_phase; }

  void set_tx_read_only(bool val) { m_tx_read_only = val; }

  // Registers the new transaction in the global list.
  explicit Rdb_transaction(THD *const thd)
      : m_thd(thd), m_tbl_io_perf(nullptr) {
    RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
    s_tx_list.insert(this);
    RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
  }

  // Unregisters the transaction from the global list.
  virtual ~Rdb_transaction() {
    RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
    s_tx_list.erase(this);
    RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
  }
};
2480
2481/*
2482 This is a rocksdb transaction. Its members represent the current transaction,
2483 which consists of:
2484 - the snapshot
2485 - the changes we've made but are not seeing yet.
2486
2487 The changes are made to individual tables, which store them here and then
2488 this object commits them on commit.
2489*/
2490class Rdb_transaction_impl : public Rdb_transaction {
2491 rocksdb::Transaction *m_rocksdb_tx = nullptr;
2492 rocksdb::Transaction *m_rocksdb_reuse_tx = nullptr;
2493
2494public:
2495 void set_lock_timeout(int timeout_sec_arg) override {
2496 if (m_rocksdb_tx)
2497 m_rocksdb_tx->SetLockTimeout(rdb_convert_sec_to_ms(m_timeout_sec));
2498 }
2499
2500 void set_sync(bool sync) override {
2501 if (m_rocksdb_tx)
2502 m_rocksdb_tx->GetWriteOptions()->sync = sync;
2503 }
2504
2505 void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
2506 const std::string &rowkey) override {
2507 if (!THDVAR(m_thd, lock_scanned_rows)) {
2508 m_rocksdb_tx->UndoGetForUpdate(column_family, rocksdb::Slice(rowkey));
2509 }
2510 }
2511
2512 virtual bool is_writebatch_trx() const override { return false; }
2513
private:
  // Hand the finished rocksdb::Transaction object over to
  // m_rocksdb_reuse_tx so the next transaction can recycle it
  // instead of allocating a new one.
  void release_tx(void) {
    // We are done with the current active transaction object. Preserve it
    // for later reuse.
    DBUG_ASSERT(m_rocksdb_reuse_tx == nullptr);
    m_rocksdb_reuse_tx = m_rocksdb_tx;
    m_rocksdb_tx = nullptr;
  }
2522
2523 bool prepare(const rocksdb::TransactionName &name) override {
2524 rocksdb::Status s;
2525 s = m_rocksdb_tx->SetName(name);
2526 if (!s.ok()) {
2527 rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
2528 return false;
2529 }
2530
2531 s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
2532 if (!s.ok()) {
2533 rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
2534 return false;
2535 }
2536
2537 s = m_rocksdb_tx->Prepare();
2538 if (!s.ok()) {
2539 rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
2540 return false;
2541 }
2542 return true;
2543 }
2544
  /*
    Commit the transaction to RocksDB without any binlog coordination.
    Counters and per-transaction state are reset whether the commit
    succeeds or fails, and the transaction object is preserved for reuse.
    @return false on success, true on error
  */
  bool commit_no_binlog() override {
    bool res = false;
    rocksdb::Status s;

    // Fold the in-memory auto-increment values into the write batch so
    // they are persisted atomically with the transaction.
    s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    // The snapshot must be released before committing.
    release_snapshot();
    s = m_rocksdb_tx->Commit();
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

error:
    /* Save the transaction object to be reused */
    release_tx();

    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    set_tx_read_only(false);
    m_rollback_only = false;
    return res;
  }
2577
2578public:
  /*
    Roll back the transaction: discard all pending changes and release all
    row locks. Safe to call when no RocksDB transaction is active (counters
    are still reset in that case).
  */
  void rollback() override {
    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    m_auto_incr_map.clear();
    m_ddl_transaction = false;
    if (m_rocksdb_tx) {
      release_snapshot();
      /* This will also release all of the locks: */
      m_rocksdb_tx->Rollback();

      /* Save the transaction object to be reused */
      release_tx();

      set_tx_read_only(false);
      m_rollback_only = false;
    }
  }
2599
2600 void acquire_snapshot(bool acquire_now) override {
2601 if (m_read_opts.snapshot == nullptr) {
2602 if (is_tx_read_only()) {
2603 snapshot_created(rdb->GetSnapshot());
2604 } else if (acquire_now) {
2605 m_rocksdb_tx->SetSnapshot();
2606 snapshot_created(m_rocksdb_tx->GetSnapshot());
2607 } else if (!m_is_delayed_snapshot) {
2608 m_rocksdb_tx->SetSnapshotOnNextOperation(m_notifier);
2609 m_is_delayed_snapshot = true;
2610 }
2611 }
2612 }
2613
  /*
    Release the transaction's read snapshot, if any. A read-only snapshot
    was taken directly from the DB and is returned there; a read-write
    snapshot belongs to the rocksdb::Transaction and is cleared on it.
    A pending delayed snapshot (SetSnapshotOnNextOperation) is also
    cancelled via ClearSnapshot().
  */
  void release_snapshot() override {
    bool need_clear = m_is_delayed_snapshot;

    if (m_read_opts.snapshot != nullptr) {
      m_snapshot_timestamp = 0;
      if (is_tx_read_only()) {
        rdb->ReleaseSnapshot(m_read_opts.snapshot);
        need_clear = false;
      } else {
        need_clear = true;
      }
      m_read_opts.snapshot = nullptr;
    }

    if (need_clear && m_rocksdb_tx != nullptr)
      m_rocksdb_tx->ClearSnapshot();
  }
2631
  /* @return true if this transaction currently holds a read snapshot */
  bool has_snapshot() { return m_read_opts.snapshot != nullptr; }
2633
2634 rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
2635 const rocksdb::Slice &key,
2636 const rocksdb::Slice &value) override {
2637 ++m_write_count;
2638 ++m_lock_count;
2639 if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
2640 return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
2641 return m_rocksdb_tx->Put(column_family, key, value);
2642 }
2643
2644 rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
2645 const rocksdb::Slice &key) override {
2646 ++m_write_count;
2647 ++m_lock_count;
2648 if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
2649 return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
2650 return m_rocksdb_tx->Delete(column_family, key);
2651 }
2652
2653 rocksdb::Status
2654 single_delete(rocksdb::ColumnFamilyHandle *const column_family,
2655 const rocksdb::Slice &key) override {
2656 ++m_write_count;
2657 ++m_lock_count;
2658 if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
2659 return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
2660 return m_rocksdb_tx->SingleDelete(column_family, key);
2661 }
2662
2663 bool has_modifications() const override {
2664 return m_rocksdb_tx->GetWriteBatch() &&
2665 m_rocksdb_tx->GetWriteBatch()->GetWriteBatch() &&
2666 m_rocksdb_tx->GetWriteBatch()->GetWriteBatch()->Count() > 0;
2667 }
2668
2669 rocksdb::WriteBatchBase *get_write_batch() override {
2670 if (is_two_phase()) {
2671 return m_rocksdb_tx->GetCommitTimeWriteBatch();
2672 }
2673 return m_rocksdb_tx->GetWriteBatch()->GetWriteBatch();
2674 }
2675
2676 /*
2677 Return a WriteBatch that one can write to. The writes will skip any
2678 transaction locking. The writes WILL be visible to the transaction.
2679 */
2680 rocksdb::WriteBatchBase *get_indexed_write_batch() override {
2681 ++m_write_count;
2682 return m_rocksdb_tx->GetWriteBatch();
2683 }
2684
  /*
    Point lookup through the transaction; sees the transaction's own
    uncommitted writes. Also bumps the point-query statistics counter.
  */
  rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      rocksdb::PinnableSlice *const value) const override {
    // clean PinnableSlice right before Get() for multiple gets per statement
    // the resources after the last Get in a statement are cleared in
    // handler::reset call
    value->Reset();
    global_stats.queries[QUERIES_POINT].inc();
    return m_rocksdb_tx->Get(m_read_opts, column_family, key, value);
  }
2695
  /*
    Locking read: fetch the row and acquire a (shared or exclusive,
    depending on `exclusive`) lock on it. Fails with a kLockLimit-aborted
    status once m_max_row_locks is exceeded. `value` may be nullptr when
    the caller only wants the lock, not the row contents.
  */
  rocksdb::Status
  get_for_update(rocksdb::ColumnFamilyHandle *const column_family,
                 const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
                 bool exclusive) override {
    if (++m_lock_count > m_max_row_locks)
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);

    if (value != nullptr) {
      // See comment in get(): reset between multiple gets per statement.
      value->Reset();
    }
    return m_rocksdb_tx->GetForUpdate(m_read_opts, column_family, key, value,
                                      exclusive);
  }
2709
  /*
    Create an iterator over `column_family` through the transaction (sees
    the transaction's own uncommitted writes). Also bumps the range-query
    statistics counter. Caller owns the returned iterator.
  */
  rocksdb::Iterator *
  get_iterator(const rocksdb::ReadOptions &options,
               rocksdb::ColumnFamilyHandle *const column_family) override {
    global_stats.queries[QUERIES_RANGE].inc();
    return m_rocksdb_tx->GetIterator(options, column_family);
  }
2716
  /* Non-owning accessor for the underlying RocksDB transaction (may be null). */
  const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; }
2718
  /* @return true if an underlying RocksDB transaction object exists */
  bool is_tx_started() const override { return (m_rocksdb_tx != nullptr); }
2720
  /*
    Begin a new RocksDB transaction, configured from session variables
    (lock timeout, deadlock detection, batch size limits, WAL settings).
    Reuses a previously released transaction object when one is available.
  */
  void start_tx() override {
    rocksdb::TransactionOptions tx_opts;
    rocksdb::WriteOptions write_opts;
    tx_opts.set_snapshot = false;
    tx_opts.lock_timeout = rdb_convert_sec_to_ms(m_timeout_sec);
    tx_opts.deadlock_detect = THDVAR(m_thd, deadlock_detect);
    tx_opts.deadlock_detect_depth = THDVAR(m_thd, deadlock_detect_depth);
    tx_opts.max_write_batch_size = THDVAR(m_thd, write_batch_max_bytes);

    // Sync only when the server is configured for durable commits.
    write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
    write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
    write_opts.ignore_missing_column_families =
        THDVAR(m_thd, write_ignore_missing_column_families);
    m_is_two_phase = rocksdb_enable_2pc;

    commit_ordered_done= false;

    /*
      If m_rocksdb_reuse_tx is null this will create a new transaction object.
      Otherwise it will reuse the existing one.
    */
    m_rocksdb_tx =
        rdb->BeginTransaction(write_opts, tx_opts, m_rocksdb_reuse_tx);
    m_rocksdb_reuse_tx = nullptr;

    m_read_opts = rocksdb::ReadOptions();

    set_initial_savepoint();

    m_ddl_transaction = false;
  }
2752
  /* Implementations of do_*savepoint based on RocksDB Transaction savepoints */
  void do_set_savepoint() override {
    m_rocksdb_tx->SetSavePoint();
  }
2757
  /* Pop back to the most recent savepoint set with do_set_savepoint(). */
  void do_rollback_to_savepoint() override {
    m_rocksdb_tx->RollbackToSavePoint();
  }
2761
2762 /*
2763 Start a statement inside a multi-statement transaction.
2764
2765 @todo: are we sure this is called once (and not several times) per
2766 statement start?
2767
2768 For hooking to start of statement that is its own transaction, see
2769 ha_rocksdb::external_lock().
2770 */
2771 void start_stmt() override {
2772 // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
2773 acquire_snapshot(false);
2774 }
2775
2776 /*
2777 This must be called when last statement is rolled back, but the transaction
2778 continues
2779 */
2780 void rollback_stmt() override {
2781 /* TODO: here we must release the locks taken since the start_stmt() call */
2782 if (m_rocksdb_tx) {
2783 const rocksdb::Snapshot *const org_snapshot = m_rocksdb_tx->GetSnapshot();
2784 rollback_to_stmt_savepoint();
2785
2786 const rocksdb::Snapshot *const cur_snapshot = m_rocksdb_tx->GetSnapshot();
2787 if (org_snapshot != cur_snapshot) {
2788 if (org_snapshot != nullptr)
2789 m_snapshot_timestamp = 0;
2790
2791 m_read_opts.snapshot = cur_snapshot;
2792 if (cur_snapshot != nullptr)
2793 rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
2794 else
2795 m_is_delayed_snapshot = true;
2796 }
2797 }
2798 }
2799
  /* Construct a transaction bound to `thd`; the RocksDB transaction itself
     is created lazily by start_tx(). */
  explicit Rdb_transaction_impl(THD *const thd)
      : Rdb_transaction(thd), m_rocksdb_tx(nullptr) {
    // Create a notifier that can be called when a snapshot gets generated.
    m_notifier = std::make_shared<Rdb_snapshot_notifier>(this);
  }
2805
  /* Roll back anything still pending, detach the snapshot notifier, and
     free the cached reusable transaction object. */
  virtual ~Rdb_transaction_impl() {
    rollback();

    // Theoretically the notifier could outlive the Rdb_transaction_impl
    // (because of the shared_ptr), so let it know it can't reference
    // the transaction anymore.
    m_notifier->detach();

    // Free any transaction memory that is still hanging around.
    delete m_rocksdb_reuse_tx;
    DBUG_ASSERT(m_rocksdb_tx == nullptr);
  }
2818};
2819
/* This is a rocksdb write batch. This class doesn't hold or wait on any
   transaction locks (skips rocksdb transaction API) thus giving better
   performance. The commit is done through rdb->GetBaseDB()->Write().

   Currently this is only used for replication threads which are guaranteed
   to be non-conflicting. Any further usage of this class should be thought
   through thoroughly.
*/
2828class Rdb_writebatch_impl : public Rdb_transaction {
2829 rocksdb::WriteBatchWithIndex *m_batch;
2830 rocksdb::WriteOptions write_opts;
2831 // Called after commit/rollback.
2832 void reset() {
2833 m_batch->Clear();
2834 m_read_opts = rocksdb::ReadOptions();
2835 m_ddl_transaction = false;
2836 }
2837
2838private:
2839 bool prepare(const rocksdb::TransactionName &name) override { return true; }
2840
2841 bool commit_no_binlog() override {
2842 bool res = false;
2843 rocksdb::Status s;
2844
2845 s = merge_auto_incr_map(m_batch->GetWriteBatch());
2846 if (!s.ok()) {
2847 rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
2848 res = true;
2849 goto error;
2850 }
2851
2852 release_snapshot();
2853
2854 s = rdb->GetBaseDB()->Write(write_opts, m_batch->GetWriteBatch());
2855 if (!s.ok()) {
2856 rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
2857 res = true;
2858 goto error;
2859 }
2860error:
2861 reset();
2862
2863 m_write_count = 0;
2864 m_insert_count = 0;
2865 m_update_count = 0;
2866 m_delete_count = 0;
2867 set_tx_read_only(false);
2868 m_rollback_only = false;
2869 return res;
2870 }
2871
2872protected:
2873 /* Implementations of do_*savepoint based on rocksdB::WriteBatch savepoints */
2874 void do_set_savepoint() override {
2875 m_batch->SetSavePoint();
2876 }
2877
2878 void do_rollback_to_savepoint() override {
2879 m_batch->RollbackToSavePoint();
2880 }
2881
2882public:
2883 bool is_writebatch_trx() const override { return true; }
2884
2885 void set_lock_timeout(int timeout_sec_arg) override {
2886 // Nothing to do here.
2887 }
2888
2889 void set_sync(bool sync) override { write_opts.sync = sync; }
2890
2891 void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
2892 const std::string &rowkey) override {
2893 // Nothing to do here since we don't hold any row locks.
2894 }
2895
2896 void rollback() override {
2897 m_write_count = 0;
2898 m_insert_count = 0;
2899 m_update_count = 0;
2900 m_delete_count = 0;
2901 m_lock_count = 0;
2902 release_snapshot();
2903
2904 reset();
2905 set_tx_read_only(false);
2906 m_rollback_only = false;
2907 }
2908
2909 void acquire_snapshot(bool acquire_now) override {
2910 if (m_read_opts.snapshot == nullptr)
2911 snapshot_created(rdb->GetSnapshot());
2912 }
2913
2914 void release_snapshot() override {
2915 if (m_read_opts.snapshot != nullptr) {
2916 rdb->ReleaseSnapshot(m_read_opts.snapshot);
2917 m_read_opts.snapshot = nullptr;
2918 }
2919 }
2920
2921 rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
2922 const rocksdb::Slice &key,
2923 const rocksdb::Slice &value) override {
2924 ++m_write_count;
2925 m_batch->Put(column_family, key, value);
2926 // Note Put/Delete in write batch doesn't return any error code. We simply
2927 // return OK here.
2928 return rocksdb::Status::OK();
2929 }
2930
2931 rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
2932 const rocksdb::Slice &key) override {
2933 ++m_write_count;
2934 m_batch->Delete(column_family, key);
2935 return rocksdb::Status::OK();
2936 }
2937
2938 rocksdb::Status
2939 single_delete(rocksdb::ColumnFamilyHandle *const column_family,
2940 const rocksdb::Slice &key) override {
2941 ++m_write_count;
2942 m_batch->SingleDelete(column_family, key);
2943 return rocksdb::Status::OK();
2944 }
2945
2946 bool has_modifications() const override {
2947 return m_batch->GetWriteBatch()->Count() > 0;
2948 }
2949
2950 rocksdb::WriteBatchBase *get_write_batch() override { return m_batch; }
2951
2952 rocksdb::WriteBatchBase *get_indexed_write_batch() override {
2953 ++m_write_count;
2954 return m_batch;
2955 }
2956
2957 rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
2958 const rocksdb::Slice &key,
2959 rocksdb::PinnableSlice *const value) const override {
2960 value->Reset();
2961 return m_batch->GetFromBatchAndDB(rdb, m_read_opts, column_family, key,
2962 value);
2963 }
2964
2965 rocksdb::Status
2966 get_for_update(rocksdb::ColumnFamilyHandle *const column_family,
2967 const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
2968 bool exclusive) override {
2969 return get(column_family, key, value);
2970 }
2971
2972 rocksdb::Iterator *
2973 get_iterator(const rocksdb::ReadOptions &options,
2974 rocksdb::ColumnFamilyHandle *const column_family) override {
2975 const auto it = rdb->NewIterator(options);
2976 return m_batch->NewIteratorWithBase(it);
2977 }
2978
2979 bool is_tx_started() const override { return (m_batch != nullptr); }
2980
2981 void start_tx() override {
2982 commit_ordered_done= false; // Do we need this here?
2983 reset();
2984 write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
2985 write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
2986 write_opts.ignore_missing_column_families =
2987 THDVAR(m_thd, write_ignore_missing_column_families);
2988
2989 set_initial_savepoint();
2990 }
2991
2992 void start_stmt() override {}
2993
2994 void rollback_stmt() override {
2995 if (m_batch)
2996 rollback_to_stmt_savepoint();
2997 }
2998
2999 explicit Rdb_writebatch_impl(THD *const thd)
3000 : Rdb_transaction(thd), m_batch(nullptr) {
3001 m_batch = new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0,
3002 true);
3003 }
3004
3005 virtual ~Rdb_writebatch_impl() {
3006 rollback();
3007 delete m_batch;
3008 }
3009};
3010
3011void Rdb_snapshot_notifier::SnapshotCreated(
3012 const rocksdb::Snapshot *const snapshot) {
3013 if (m_owning_tx != nullptr) {
3014 m_owning_tx->snapshot_created(snapshot);
3015 }
3016}
3017
/* Process-wide registry of active Rdb_transaction objects, and the mutex
   that protects it. */
std::multiset<Rdb_transaction *> Rdb_transaction::s_tx_list;
mysql_mutex_t Rdb_transaction::s_tx_list_mutex;
3020
/*
  Return a reference to the transaction-pointer slot that the server keeps
  for this storage engine inside the THD. The slot may hold nullptr when no
  transaction has been created for the connection yet.
*/
static Rdb_transaction *&get_tx_from_thd(THD *const thd) {
  return *reinterpret_cast<Rdb_transaction **>(
      my_core::thd_ha_data(thd, rocksdb_hton));
}
3025
3026namespace {
3027
/*
  RAII guard that starts RocksDB perf-context collection on construction and
  records the results on destruction. Two modes:
  - constructed with an Rdb_io_perf*: start/stop that counter set directly;
  - constructed with an Rdb_transaction*: delegate to the transaction's
    io_perf_start/io_perf_end_and_record (a no-op if recording is already
    active for the transaction).
*/
class Rdb_perf_context_guard {
  Rdb_io_perf m_io_perf;        // local counters used in transaction mode
  Rdb_io_perf *m_io_perf_ptr;   // external counters (non-transaction mode)
  Rdb_transaction *m_tx;        // non-null in transaction mode
  uint m_level;                 // perf context recording level

 public:
  Rdb_perf_context_guard(const Rdb_perf_context_guard &) = delete;
  Rdb_perf_context_guard &operator=(const Rdb_perf_context_guard &) = delete;

  explicit Rdb_perf_context_guard(Rdb_io_perf *io_perf, uint level)
      : m_io_perf_ptr(io_perf), m_tx(nullptr), m_level(level) {
    m_io_perf_ptr->start(m_level);
  }

  explicit Rdb_perf_context_guard(Rdb_transaction *tx, uint level)
      : m_io_perf_ptr(nullptr), m_tx(tx), m_level(level) {
    /*
      if perf_context information is already being recorded, this becomes a
      no-op
    */
    if (tx != nullptr) {
      tx->io_perf_start(&m_io_perf);
    }
  }

  ~Rdb_perf_context_guard() {
    if (m_tx != nullptr) {
      m_tx->io_perf_end_and_record();
    } else if (m_io_perf_ptr != nullptr) {
      m_io_perf_ptr->end_and_record(m_level);
    }
  }
};
3062
3063} // anonymous namespace
3064
3065/*
3066 TODO: maybe, call this in external_lock() and store in ha_rocksdb..
3067*/
3068
3069static Rdb_transaction *get_or_create_tx(THD *const thd) {
3070 Rdb_transaction *&tx = get_tx_from_thd(thd);
3071 // TODO: this is called too many times.. O(#rows)
3072 if (tx == nullptr) {
3073 bool rpl_skip_tx_api= false; // MARIAROCKS_NOT_YET.
3074 if ((rpl_skip_tx_api && thd->rgi_slave) ||
3075 false /* MARIAROCKS_NOT_YET: THDVAR(thd, master_skip_tx_api) && !thd->rgi_slave)*/)
3076 {
3077 tx = new Rdb_writebatch_impl(thd);
3078 }
3079 else
3080 {
3081 tx = new Rdb_transaction_impl(thd);
3082 }
3083 tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
3084 tx->start_tx();
3085 } else {
3086 tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
3087 if (!tx->is_tx_started()) {
3088 tx->start_tx();
3089 }
3090 }
3091
3092 return tx;
3093}
3094
3095static int rocksdb_close_connection(handlerton *const hton, THD *const thd) {
3096 Rdb_transaction *&tx = get_tx_from_thd(thd);
3097 if (tx != nullptr) {
3098 int rc = tx->finish_bulk_load(false);
3099 if (rc != 0) {
3100 // NO_LINT_DEBUG
3101 sql_print_error("RocksDB: Error %d finalizing last SST file while "
3102 "disconnecting",
3103 rc);
3104 }
3105
3106 delete tx;
3107 tx = nullptr;
3108 }
3109 return HA_EXIT_SUCCESS;
3110}
3111
3112/*
3113 * Serializes an xid to a string so that it can
3114 * be used as a rocksdb transaction name
3115 */
/*
 * Serializes an xid to a string so that it can
 * be used as a rocksdb transaction name
 *
 * Layout: 8-byte big-endian formatID, 1-byte gtrid length, 1-byte bqual
 * length, then the raw gtrid+bqual bytes. The inverse transform is
 * rdb_xid_from_string().
 */
static std::string rdb_xid_to_string(const XID &src) {
  DBUG_ASSERT(src.gtrid_length >= 0 && src.gtrid_length <= MAXGTRIDSIZE);
  DBUG_ASSERT(src.bqual_length >= 0 && src.bqual_length <= MAXBQUALSIZE);

  std::string buf;
  buf.reserve(RDB_XIDHDR_LEN + src.gtrid_length + src.bqual_length);

  /*
   * expand formatID to fill 8 bytes if it doesn't already
   * then reinterpret bit pattern as unsigned and store in network order
   */
  uchar fidbuf[RDB_FORMATID_SZ];
  int64 signed_fid8 = src.formatID;
  const uint64 raw_fid8 = *reinterpret_cast<uint64 *>(&signed_fid8);
  rdb_netbuf_store_uint64(fidbuf, raw_fid8);
  buf.append(reinterpret_cast<const char *>(fidbuf), RDB_FORMATID_SZ);

  // Lengths are stored as single bytes (bounded by the asserts above).
  buf.push_back(src.gtrid_length);
  buf.push_back(src.bqual_length);
  buf.append(src.data, (src.gtrid_length) + (src.bqual_length));
  return buf;
}
3138
3139#if 0
3140// MARIAROCKS: MariaDB doesn't have flush_wal method
3141/**
3142 Called by hton->flush_logs after MySQL group commit prepares a set of
3143 transactions.
3144*/
3145static bool rocksdb_flush_wal(handlerton* hton __attribute__((__unused__)))
3146 DBUG_ASSERT(rdb != nullptr);
3147
3148 rocksdb::Status s;
3149 /*
3150 target_lsn is set to 0 when MySQL wants to sync the wal files
3151 */
3152 if ((target_lsn == 0 && !rocksdb_db_options->allow_mmap_writes) ||
3153 rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) {
3154 rocksdb_wal_group_syncs++;
3155 s = rdb->FlushWAL(target_lsn == 0 ||
3156 rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
3157 }
3158
3159 if (!s.ok()) {
3160 rdb_log_status_error(s);
3161 return HA_EXIT_FAILURE;
3162 }
3163 return HA_EXIT_SUCCESS;
3164}
3165#endif
3166
3167/**
3168 For a slave, prepare() updates the slave_gtid_info table which tracks the
3169 replication progress.
3170*/
/*
  Handlerton prepare hook.

  When asked to prepare the whole transaction (or when this is a statement
  end under autocommit), XA-prepare the RocksDB transaction; otherwise just
  make the current statement's savepoint permanent.

  @param prepare_tx  true = prepare the whole transaction,
                     false = statement end inside a transaction
*/
static int rocksdb_prepare(handlerton* hton, THD* thd, bool prepare_tx)
{
  bool async=false; // This is "ASYNC_COMMIT" feature which is only present in webscalesql

  Rdb_transaction *&tx = get_tx_from_thd(thd);
  if (!tx->can_prepare()) {
    return HA_EXIT_FAILURE;
  }
  if (prepare_tx ||
      (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
    /* We were instructed to prepare the whole transaction, or
    this is an SQL statement end and autocommit is on */

#ifdef MARIAROCKS_NOT_YET
    /*
      Storing binlog position inside MyRocks is needed only for restoring
      MyRocks from backups. This feature is not supported yet.
    */
    std::vector<st_slave_gtid_info> slave_gtid_info;
    my_core::thd_slave_gtid_info(thd, &slave_gtid_info);
    for (const auto &it : slave_gtid_info) {
      rocksdb::WriteBatchBase *const write_batch = tx->get_blind_write_batch();
      binlog_manager.update_slave_gtid_info(it.id, it.db, it.gtid, write_batch);
    }
#endif

    if (tx->is_two_phase()) {

      /*
        MariaDB: the following branch is never taken.
        We always flush at Prepare and rely on RocksDB's internal Group Commit
        to do some grouping.
      */
      if (thd->durability_property == HA_IGNORE_DURABILITY || async) {
        tx->set_sync(false);
      }

      /*
        MariaDB: do not flush logs if we are running in a non-crash-safe mode.
      */
      if (!rocksdb_flush_log_at_trx_commit)
        tx->set_sync(false);

      // The serialized XID becomes the RocksDB transaction name, which is
      // how rocksdb_recover() finds prepared transactions after a crash.
      XID xid;
      thd_get_xid(thd, reinterpret_cast<MYSQL_XID *>(&xid));
      if (!tx->prepare(rdb_xid_to_string(xid))) {
        return HA_EXIT_FAILURE;
      }

      /*
        MariaDB: our Group Commit implementation does not use the
        hton->flush_logs call (at least currently) so the following is not
        needed (TODO: will we need this for binlog rotation?)
      */
#ifdef MARIAROCKS_NOT_YET
      if (thd->durability_property == HA_IGNORE_DURABILITY )
          (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER))
          &&
          THDVAR(thd, flush_log_at_trx_commit))
#endif
#ifdef MARIAROCKS_NOT_YET
      {
        // MariaRocks: disable the
        // "write/sync redo log before flushing binlog cache to file"
        // feature. See a869c56d361bb44f46c0efeb11a8f03561676247
        /**
          we set the log sequence as '1' just to trigger hton->flush_logs
        */
        thd_store_lsn(thd, 1, DB_TYPE_ROCKSDB);
      }
#endif
    }

    DEBUG_SYNC(thd, "rocksdb.prepared");
  }
  else
    tx->make_stmt_savepoint_permanent();
  return HA_EXIT_SUCCESS;
}
3250
3251/**
3252 do nothing for prepare/commit by xid
3253 this is needed to avoid crashes in XA scenarios
3254*/
3255static int rocksdb_commit_by_xid(handlerton *const hton, XID *const xid) {
3256 DBUG_ENTER_FUNC();
3257
3258 DBUG_ASSERT(hton != nullptr);
3259 DBUG_ASSERT(xid != nullptr);
3260 DBUG_ASSERT(commit_latency_stats != nullptr);
3261
3262 rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true);
3263
3264 const auto name = rdb_xid_to_string(*xid);
3265 DBUG_ASSERT(!name.empty());
3266
3267 rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
3268
3269 if (trx == nullptr) {
3270 DBUG_RETURN(HA_EXIT_FAILURE);
3271 }
3272
3273 const rocksdb::Status s = trx->Commit();
3274
3275 if (!s.ok()) {
3276 rdb_log_status_error(s);
3277 DBUG_RETURN(HA_EXIT_FAILURE);
3278 }
3279
3280 delete trx;
3281
3282 // `Add()` is implemented in a thread-safe manner.
3283 commit_latency_stats->Add(timer.ElapsedNanos() / 1000);
3284
3285 DBUG_RETURN(HA_EXIT_SUCCESS);
3286}
3287
3288static int
3289rocksdb_rollback_by_xid(handlerton *const hton MY_ATTRIBUTE((__unused__)),
3290 XID *const xid) {
3291 DBUG_ENTER_FUNC();
3292
3293 DBUG_ASSERT(hton != nullptr);
3294 DBUG_ASSERT(xid != nullptr);
3295 DBUG_ASSERT(rdb != nullptr);
3296
3297 const auto name = rdb_xid_to_string(*xid);
3298
3299 rocksdb::Transaction *const trx = rdb->GetTransactionByName(name);
3300
3301 if (trx == nullptr) {
3302 DBUG_RETURN(HA_EXIT_FAILURE);
3303 }
3304
3305 const rocksdb::Status s = trx->Rollback();
3306
3307 if (!s.ok()) {
3308 rdb_log_status_error(s);
3309 DBUG_RETURN(HA_EXIT_FAILURE);
3310 }
3311
3312 delete trx;
3313
3314 DBUG_RETURN(HA_EXIT_SUCCESS);
3315}
3316
3317/**
3318 Rebuilds an XID from a serialized version stored in a string.
3319*/
/**
  Rebuilds an XID from a serialized version stored in a string.
  Inverse of rdb_xid_to_string(): 8-byte big-endian formatID, one byte
  each for gtrid/bqual lengths, then the raw gtrid+bqual bytes.
*/
static void rdb_xid_from_string(const std::string &src, XID *const dst) {
  DBUG_ASSERT(dst != nullptr);
  uint offset = 0;
  uint64 raw_fid8 =
      rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(src.data()));
  // Reinterpret the unsigned bit pattern back as the original signed value.
  const int64 signed_fid8 = *reinterpret_cast<int64 *>(&raw_fid8);
  dst->formatID = signed_fid8;
  offset += RDB_FORMATID_SZ;
  dst->gtrid_length = src.at(offset);
  offset += RDB_GTRID_SZ;
  dst->bqual_length = src.at(offset);
  offset += RDB_BQUAL_SZ;

  DBUG_ASSERT(dst->gtrid_length >= 0 && dst->gtrid_length <= MAXGTRIDSIZE);
  DBUG_ASSERT(dst->bqual_length >= 0 && dst->bqual_length <= MAXBQUALSIZE);

  src.copy(dst->data, (dst->gtrid_length) + (dst->bqual_length),
           RDB_XIDHDR_LEN);
}
3339
3340/**
3341 Reading last committed binary log info from RocksDB system row.
3342 The info is needed for crash safe slave/master to work.
3343*/
/**
  Reading last committed binary log info from RocksDB system row.
  The info is needed for crash safe slave/master to work.

  Also fills `xid_list` (up to `len` entries) with the XIDs of transactions
  that RocksDB still holds in the prepared state, for XA recovery.
  @return the number of prepared transactions reported (0 when none or when
          no output buffer was supplied)
*/
static int rocksdb_recover(handlerton* hton, XID* xid_list, uint len)
#ifdef MARIAROCKS_NOT_YET
                           char* const binlog_file,
                           my_off_t *const binlog_pos,
                           Gtid *const binlog_max_gtid) {
#endif
{
#ifdef MARIAROCKS_NOT_YET
  if (binlog_file && binlog_pos) {
    char file_buf[FN_REFLEN + 1] = {0};
    my_off_t pos;
    char gtid_buf[FN_REFLEN + 1] = {0};
    if (binlog_manager.read(file_buf, &pos, gtid_buf)) {
      if (is_binlog_advanced(binlog_file, *binlog_pos, file_buf, pos)) {
        memcpy(binlog_file, file_buf, FN_REFLEN + 1);
        *binlog_pos = pos;
        fprintf(stderr, "RocksDB: Last binlog file position %llu,"
                        " file name %s\n",
                pos, file_buf);
        if (*gtid_buf) {
          global_sid_lock->rdlock();
          binlog_max_gtid->parse(global_sid_map, gtid_buf);
          global_sid_lock->unlock();
          fprintf(stderr, "RocksDB: Last MySQL Gtid %s\n", gtid_buf);
        }
      }
    }
  }
#endif

  if (len == 0 || xid_list == nullptr) {
    return HA_EXIT_SUCCESS;
  }

  // Each prepared transaction's name is a serialized XID (see
  // rdb_xid_to_string()); decode them back into the caller's array.
  std::vector<rocksdb::Transaction *> trans_list;
  rdb->GetAllPreparedTransactions(&trans_list);

  uint count = 0;
  for (auto &trans : trans_list) {
    if (count >= len) {
      break;
    }
    auto name = trans->GetName();
    rdb_xid_from_string(name, &xid_list[count]);
    count++;
  }
  return count;
}
3392
3393
3394/*
3395 Handle a commit checkpoint request from server layer.
3396
3397 InnoDB does this:
3398 We put the request in a queue, so that we can notify upper layer about
3399 checkpoint complete when we have flushed the redo log.
3400 If we have already flushed all relevant redo log, we notify immediately.
3401
3402 MariaRocks just flushes everything right away ATM
3403*/
3404
3405static void rocksdb_checkpoint_request(handlerton *hton,
3406 void *cookie)
3407{
3408 const rocksdb::Status s= rdb->SyncWAL();
3409 //TODO: what to do on error?
3410 if (s.ok())
3411 {
3412 rocksdb_wal_group_syncs++;
3413 commit_checkpoint_notify_ha(hton, cookie);
3414 }
3415}
3416
3417/*
3418 @param all: TRUE - commit the transaction
3419 FALSE - SQL statement ended
3420*/
3421static void rocksdb_commit_ordered(handlerton *hton, THD* thd, bool all)
3422{
3423 // Same assert as InnoDB has
3424 DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT |
3425 OPTION_BEGIN)));
3426 Rdb_transaction *&tx = get_tx_from_thd(thd);
3427 if (!tx->is_two_phase()) {
3428 /*
3429 ordered_commit is supposedly slower as it is done sequentially
3430 in order to preserve commit order.
3431
3432 if we are not required do 2-phase commit with the binlog, do not do
3433 anything here.
3434 */
3435 return;
3436 }
3437
3438 tx->set_sync(false);
3439
3440 /* This will note the master position also */
3441 tx->commit_ordered_res= tx->commit();
3442 tx->commit_ordered_done= true;
3443
3444}
3445
3446
/*
  Handlerton commit hook.

  If rocksdb_commit_ordered() already committed (group commit path), only
  report its result and wake waiters. Otherwise commit here: first unsynced
  (to establish commit order), then flush the WAL if the server is
  configured for durable commits.

  @param commit_tx  true = commit the whole transaction,
                    false = statement end inside a transaction
*/
static int rocksdb_commit(handlerton* hton, THD* thd, bool commit_tx)
{
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(hton != nullptr);
  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(commit_latency_stats != nullptr);

  rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true);

  /* note: h->external_lock(F_UNLCK) is called after this function is called) */
  Rdb_transaction *&tx = get_tx_from_thd(thd);

  /* this will trigger saving of perf_context information */
  Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

  if (tx != nullptr) {
    if (commit_tx || (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT |
                                                 OPTION_BEGIN))) {
      /*
        This will not add anything to commit_latency_stats, and this is correct
        right?
      */
      if (tx->commit_ordered_done)
      {
        thd_wakeup_subsequent_commits(thd, 0);
        DBUG_RETURN((tx->commit_ordered_res? HA_ERR_INTERNAL_ERROR: 0));
      }

      /*
        We get here
        - For a COMMIT statement that finishes a multi-statement transaction
        - For a statement that has its own transaction
      */

      // First, commit without syncing. This establishes the commit order
      tx->set_sync(false);
      if (tx->commit()) {
        DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
      }
      thd_wakeup_subsequent_commits(thd, 0);

      if (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC)
      {
        rocksdb::Status s= rdb->FlushWAL(true);
        if (!s.ok())
          DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
      }
    } else {
      /*
        We get here when committing a statement within a transaction.
      */
      tx->make_stmt_savepoint_permanent();
    }

    if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
      // For READ_COMMITTED, we release any existing snapshot so that we will
      // see any changes that occurred since the last statement.
      tx->release_snapshot();
    }
  }

  // `Add()` is implemented in a thread-safe manner.
  commit_latency_stats->Add(timer.ElapsedNanos() / 1000);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
3514
3515
/*
  Handlerton rollback hook.

  @param rollback_tx  true = roll back the whole transaction (ROLLBACK
                      statement), false = roll back only the last statement
                      (statement failure inside a transaction / autocommit
                      statement error)
*/
static int rocksdb_rollback(handlerton *const hton, THD *const thd,
                            bool rollback_tx) {
  Rdb_transaction *&tx = get_tx_from_thd(thd);
  Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

  if (tx != nullptr) {
    if (rollback_tx) {
      /*
        We get here, when
        - ROLLBACK statement is issued.

        Discard the changes made by the transaction
      */
      tx->rollback();
    } else {
      /*
        We get here when
        - a statement with AUTOCOMMIT=1 is being rolled back (because of some
          error)
        - a statement inside a transaction is rolled back
      */

      tx->rollback_stmt();
      tx->set_tx_failed(true);
    }

    if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
      // For READ_COMMITTED, we release any existing snapshot so that we will
      // see any changes that occurred since the last statement.
      tx->release_snapshot();
    }
  }
  return HA_EXIT_SUCCESS;
}
3550
3551static bool print_stats(THD *const thd, std::string const &type,
3552 std::string const &name, std::string const &status,
3553 stat_print_fn *stat_print) {
3554 return stat_print(thd, type.c_str(), type.size(), name.c_str(), name.size(),
3555 status.c_str(), status.size());
3556}
3557
/*
  printf-style formatting into a std::string. Uses a stack buffer for short
  results and heap-allocates only when the formatted output exceeds it.
*/
static std::string format_string(const char *const format, ...) {
  std::string res;
  va_list args;
  va_list args_copy;
  char static_buff[256];

  DBUG_ASSERT(format != nullptr);

  va_start(args, format);
  // A va_list is consumed by vsnprintf; keep a copy for the second pass.
  va_copy(args_copy, args);

  // Calculate how much space we will need
  int len = vsnprintf(nullptr, 0, format, args);
  va_end(args);

  if (len < 0) {
    res = std::string("<format error>");
  } else if (len == 0) {
    // Shortcut for an empty string
    res = std::string("");
  } else {
    // For short enough output use a static buffer
    char *buff = static_buff;
    std::unique_ptr<char[]> dynamic_buff = nullptr;

    len++; // Add one for null terminator

    // for longer output use an allocated buffer
    if (static_cast<uint>(len) > sizeof(static_buff)) {
      dynamic_buff.reset(new char[len]);
      buff = dynamic_buff.get();
    }

    // Now re-do the vsnprintf with the buffer which is now large enough
    (void)vsnprintf(buff, len, format, args_copy);

    // Convert to a std::string. Note we could have created a std::string
    // large enough and then converted the buffer to a 'char*' and created
    // the output in place. This would probably work but feels like a hack.
    // Since this isn't code that needs to be super-performant we are going
    // with this 'safer' method.
    res = std::string(buff);
  }

  va_end(args_copy);

  return res;
}
3606
3607class Rdb_snapshot_status : public Rdb_tx_list_walker {
3608private:
3609 std::string m_data;
3610
3611 static std::string current_timestamp(void) {
3612 static const char *const format = "%d-%02d-%02d %02d:%02d:%02d";
3613 time_t currtime;
3614 struct tm currtm;
3615
3616 time(&currtime);
3617
3618 localtime_r(&currtime, &currtm);
3619
3620 return format_string(format, currtm.tm_year + 1900, currtm.tm_mon + 1,
3621 currtm.tm_mday, currtm.tm_hour, currtm.tm_min,
3622 currtm.tm_sec);
3623 }
3624
3625 static std::string get_header(void) {
3626 return "\n============================================================\n" +
3627 current_timestamp() +
3628 " ROCKSDB TRANSACTION MONITOR OUTPUT\n"
3629 "============================================================\n"
3630 "---------\n"
3631 "SNAPSHOTS\n"
3632 "---------\n"
3633 "LIST OF SNAPSHOTS FOR EACH SESSION:\n";
3634 }
3635
3636 static std::string get_footer(void) {
3637 return "-----------------------------------------\n"
3638 "END OF ROCKSDB TRANSACTION MONITOR OUTPUT\n"
3639 "=========================================\n";
3640 }
3641
3642 static Rdb_deadlock_info::Rdb_dl_trx_info
3643 get_dl_txn_info(const rocksdb::DeadlockInfo &txn,
3644 const GL_INDEX_ID &gl_index_id) {
3645 Rdb_deadlock_info::Rdb_dl_trx_info txn_data;
3646
3647 txn_data.trx_id = txn.m_txn_id;
3648
3649 txn_data.table_name = ddl_manager.safe_get_table_name(gl_index_id);
3650 if (txn_data.table_name.empty()) {
3651 txn_data.table_name =
3652 "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id);
3653 }
3654
3655 auto kd = ddl_manager.safe_find(gl_index_id);
3656 txn_data.index_name =
3657 (kd) ? kd->get_name()
3658 : "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id);
3659
3660 rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(txn.m_cf_id);
3661 txn_data.cf_name = cfh->GetName();
3662
3663 txn_data.waiting_key =
3664 rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length());
3665
3666 txn_data.exclusive_lock = txn.m_exclusive;
3667
3668 return txn_data;
3669 }
3670
3671 static Rdb_deadlock_info
3672 get_dl_path_trx_info(const rocksdb::DeadlockPath &path_entry) {
3673 Rdb_deadlock_info deadlock_info;
3674
3675 for (auto it = path_entry.path.begin(); it != path_entry.path.end();
3676 it++) {
3677 auto txn = *it;
3678 const GL_INDEX_ID gl_index_id = {
3679 txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>(
3680 txn.m_waiting_key.c_str()))};
3681 deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id));
3682 }
3683 DBUG_ASSERT_IFF(path_entry.limit_exceeded, path_entry.path.empty());
3684 /* print the first txn in the path to display the full deadlock cycle */
3685 if (!path_entry.path.empty() && !path_entry.limit_exceeded) {
3686 auto deadlocking_txn = *(path_entry.path.end() - 1);
3687 deadlock_info.victim_trx_id = deadlocking_txn.m_txn_id;
3688 }
3689 return deadlock_info;
3690 }
3691
3692 public:
3693 Rdb_snapshot_status() : m_data(get_header()) {}
3694
3695 std::string getResult() { return m_data + get_footer(); }
3696
3697 /* Implement Rdb_transaction interface */
3698 /* Create one row in the snapshot status table */
3699 void process_tran(const Rdb_transaction *const tx) override {
3700 DBUG_ASSERT(tx != nullptr);
3701
3702 /* Calculate the duration the snapshot has existed */
3703 int64_t snapshot_timestamp = tx->m_snapshot_timestamp;
3704 if (snapshot_timestamp != 0) {
3705 int64_t curr_time;
3706 rdb->GetEnv()->GetCurrentTime(&curr_time);
3707
3708 char buffer[1024];
3709#ifdef MARIAROCKS_NOT_YET
3710 thd_security_context(tx->get_thd(), buffer, sizeof buffer, 0);
3711#endif
3712 m_data += format_string(
3713 "---SNAPSHOT, ACTIVE %lld sec\n"
3714 "%s\n"
3715 "lock count %llu, write count %llu\n"
3716 "insert count %llu, update count %llu, delete count %llu\n",
3717 (longlong)(curr_time - snapshot_timestamp), buffer, tx->get_lock_count(),
3718 tx->get_write_count(), tx->get_insert_count(), tx->get_update_count(),
3719 tx->get_delete_count());
3720 }
3721 }
3722
3723 void populate_deadlock_buffer() {
3724 auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
3725 m_data += "----------LATEST DETECTED DEADLOCKS----------\n";
3726
3727 for (auto path_entry : dlock_buffer) {
3728 std::string path_data;
3729 if (path_entry.limit_exceeded) {
3730 path_data += "\n-------DEADLOCK EXCEEDED MAX DEPTH-------\n";
3731 } else {
3732 path_data += "\n*** DEADLOCK PATH\n"
3733 "=========================================\n";
3734 const auto dl_info = get_dl_path_trx_info(path_entry);
3735 for (auto it = dl_info.path.begin(); it != dl_info.path.end(); it++) {
3736 const auto trx_info = *it;
3737 path_data += format_string(
3738 "TRANSACTION ID: %u\n"
3739 "COLUMN FAMILY NAME: %s\n"
3740 "WAITING KEY: %s\n"
3741 "LOCK TYPE: %s\n"
3742 "INDEX NAME: %s\n"
3743 "TABLE NAME: %s\n",
3744 trx_info.trx_id, trx_info.cf_name.c_str(),
3745 trx_info.waiting_key.c_str(),
3746 trx_info.exclusive_lock ? "EXCLUSIVE" : "SHARED",
3747 trx_info.index_name.c_str(), trx_info.table_name.c_str());
3748 if (it != dl_info.path.end() - 1) {
3749 path_data += "---------------WAITING FOR---------------\n";
3750 }
3751 }
3752 path_data +=
3753 format_string("\n--------TRANSACTION ID: %u GOT DEADLOCK---------\n",
3754 dl_info.victim_trx_id);
3755 }
3756 m_data += path_data;
3757 }
3758 }
3759
3760 std::vector<Rdb_deadlock_info> get_deadlock_info() {
3761 std::vector<Rdb_deadlock_info> deadlock_info;
3762 auto dlock_buffer = rdb->GetDeadlockInfoBuffer();
3763 for (auto path_entry : dlock_buffer) {
3764 if (!path_entry.limit_exceeded) {
3765 deadlock_info.push_back(get_dl_path_trx_info(path_entry));
3766 }
3767 }
3768 return deadlock_info;
3769 }
3770};
3771
3772/**
3773 * @brief
3774 * walks through all non-replication transactions and copies
3775 * out relevant information for information_schema.rocksdb_trx
3776 */
3777class Rdb_trx_info_aggregator : public Rdb_tx_list_walker {
3778private:
3779 std::vector<Rdb_trx_info> *m_trx_info;
3780
3781public:
3782 explicit Rdb_trx_info_aggregator(std::vector<Rdb_trx_info> *const trx_info)
3783 : m_trx_info(trx_info) {}
3784
3785 void process_tran(const Rdb_transaction *const tx) override {
3786 static const std::map<int, std::string> state_map = {
3787 {rocksdb::Transaction::STARTED, "STARTED"},
3788 {rocksdb::Transaction::AWAITING_PREPARE, "AWAITING_PREPARE"},
3789 {rocksdb::Transaction::PREPARED, "PREPARED"},
3790 {rocksdb::Transaction::AWAITING_COMMIT, "AWAITING_COMMIT"},
3791 {rocksdb::Transaction::COMMITED, "COMMITED"},
3792 {rocksdb::Transaction::AWAITING_ROLLBACK, "AWAITING_ROLLBACK"},
3793 {rocksdb::Transaction::ROLLEDBACK, "ROLLEDBACK"},
3794 };
3795
3796 DBUG_ASSERT(tx != nullptr);
3797
3798 THD *const thd = tx->get_thd();
3799 ulong thread_id = thd_get_thread_id(thd);
3800
3801 if (tx->is_writebatch_trx()) {
3802 const auto wb_impl = static_cast<const Rdb_writebatch_impl *>(tx);
3803 DBUG_ASSERT(wb_impl);
3804 m_trx_info->push_back(
3805 {"", /* name */
3806 0, /* trx_id */
3807 wb_impl->get_write_count(), 0, /* lock_count */
3808 0, /* timeout_sec */
3809 "", /* state */
3810 "", /* waiting_key */
3811 0, /* waiting_cf_id */
3812 1, /*is_replication */
3813 1, /* skip_trx_api */
3814 wb_impl->is_tx_read_only(), 0, /* deadlock detection */
3815 wb_impl->num_ongoing_bulk_load(), thread_id, "" /* query string */});
3816 } else {
3817 const auto tx_impl = static_cast<const Rdb_transaction_impl *>(tx);
3818 DBUG_ASSERT(tx_impl);
3819 const rocksdb::Transaction *rdb_trx = tx_impl->get_rdb_trx();
3820
3821 if (rdb_trx == nullptr) {
3822 return;
3823 }
3824
3825 char query_buf[NAME_LEN+1];
3826 thd_query_safe(thd, query_buf, sizeof(query_buf));
3827 std::string query_str(query_buf);
3828
3829 const auto state_it = state_map.find(rdb_trx->GetState());
3830 DBUG_ASSERT(state_it != state_map.end());
3831 const int is_replication = (thd->rgi_slave != nullptr);
3832 uint32_t waiting_cf_id;
3833 std::string waiting_key;
3834 rdb_trx->GetWaitingTxns(&waiting_cf_id, &waiting_key),
3835
3836 m_trx_info->push_back(
3837 {rdb_trx->GetName(), rdb_trx->GetID(), tx_impl->get_write_count(),
3838 tx_impl->get_lock_count(), tx_impl->get_timeout_sec(),
3839 state_it->second, waiting_key, waiting_cf_id, is_replication,
3840 0, /* skip_trx_api */
3841 tx_impl->is_tx_read_only(), rdb_trx->IsDeadlockDetect(),
3842 tx_impl->num_ongoing_bulk_load(), thread_id, query_str});
3843 }
3844 }
3845};
3846
3847/*
3848 returns a vector of info for all non-replication threads
3849 for use by information_schema.rocksdb_trx
3850*/
3851std::vector<Rdb_trx_info> rdb_get_all_trx_info() {
3852 std::vector<Rdb_trx_info> trx_info;
3853 Rdb_trx_info_aggregator trx_info_agg(&trx_info);
3854 Rdb_transaction::walk_tx_list(&trx_info_agg);
3855 return trx_info;
3856}
3857
3858
3859/*
3860 returns a vector of info of recent deadlocks
3861 for use by information_schema.rocksdb_deadlock
3862*/
3863std::vector<Rdb_deadlock_info> rdb_get_deadlock_info() {
3864 Rdb_snapshot_status showStatus;
3865 Rdb_transaction::walk_tx_list(&showStatus);
3866 return showStatus.get_deadlock_info();
3867}
3868
#ifdef MARIAROCKS_NOT_YET
/* Generate the snapshot status table for SHOW ENGINE ROCKSDB TRANSACTION
   STATUS: walk all transactions, append the recent-deadlock dump, and push
   the assembled text through stat_print. */
static bool rocksdb_show_snapshot_status(handlerton *const hton, THD *const thd,
                                         stat_print_fn *const stat_print) {
  Rdb_snapshot_status snapshot_status;
  Rdb_transaction::walk_tx_list(&snapshot_status);
  snapshot_status.populate_deadlock_buffer();

  /* Send the result data back to MySQL */
  return print_stats(thd, "rocksdb", "", snapshot_status.getResult(),
                     stat_print);
}
#endif
3882
3883/*
3884 This is called for SHOW ENGINE ROCKSDB STATUS | LOGS | etc.
3885
3886 For now, produce info about live files (which gives an imprecise idea about
3887 what column families are there).
3888*/
3889static bool rocksdb_show_status(handlerton *const hton, THD *const thd,
3890 stat_print_fn *const stat_print,
3891 enum ha_stat_type stat_type) {
3892 DBUG_ASSERT(hton != nullptr);
3893 DBUG_ASSERT(thd != nullptr);
3894 DBUG_ASSERT(stat_print != nullptr);
3895
3896 bool res = false;
3897 char buf[100] = {'\0'};
3898
3899 if (stat_type == HA_ENGINE_STATUS) {
3900 DBUG_ASSERT(rdb != nullptr);
3901
3902 std::string str;
3903
3904 /* Global DB Statistics */
3905 if (rocksdb_stats) {
3906 str = rocksdb_stats->ToString();
3907
3908 // Use the same format as internal RocksDB statistics entries to make
3909 // sure that output will look unified.
3910 DBUG_ASSERT(commit_latency_stats != nullptr);
3911
3912 snprintf(buf, sizeof(buf), "rocksdb.commit_latency statistics "
3913 "Percentiles :=> 50 : %.2f 95 : %.2f "
3914 "99 : %.2f 100 : %.2f\n",
3915 commit_latency_stats->Percentile(50),
3916 commit_latency_stats->Percentile(95),
3917 commit_latency_stats->Percentile(99),
3918 commit_latency_stats->Percentile(100));
3919 str.append(buf);
3920
3921 uint64_t v = 0;
3922
3923 // Retrieve additional stalling related numbers from RocksDB and append
3924 // them to the buffer meant for displaying detailed statistics. The intent
3925 // here is to avoid adding another row to the query output because of
3926 // just two numbers.
3927 //
3928 // NB! We're replacing hyphens with underscores in output to better match
3929 // the existing naming convention.
3930 if (rdb->GetIntProperty("rocksdb.is-write-stopped", &v)) {
3931 snprintf(buf, sizeof(buf), "rocksdb.is_write_stopped COUNT : %llu\n", (ulonglong)v);
3932 str.append(buf);
3933 }
3934
3935 if (rdb->GetIntProperty("rocksdb.actual-delayed-write-rate", &v)) {
3936 snprintf(buf, sizeof(buf), "rocksdb.actual_delayed_write_rate "
3937 "COUNT : %llu\n",
3938 (ulonglong)v);
3939 str.append(buf);
3940 }
3941
3942 res |= print_stats(thd, "STATISTICS", "rocksdb", str, stat_print);
3943 }
3944
3945 /* Per DB stats */
3946 if (rdb->GetProperty("rocksdb.dbstats", &str)) {
3947 res |= print_stats(thd, "DBSTATS", "rocksdb", str, stat_print);
3948 }
3949
3950 /* Per column family stats */
3951 for (const auto &cf_name : cf_manager.get_cf_names()) {
3952 rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name);
3953 if (cfh == nullptr) {
3954 continue;
3955 }
3956
3957 if (!rdb->GetProperty(cfh, "rocksdb.cfstats", &str)) {
3958 continue;
3959 }
3960
3961 res |= print_stats(thd, "CF_COMPACTION", cf_name, str, stat_print);
3962 }
3963
3964 /* Memory Statistics */
3965 std::vector<rocksdb::DB *> dbs;
3966 std::unordered_set<const rocksdb::Cache *> cache_set;
3967 size_t internal_cache_count = 0;
3968 size_t kDefaultInternalCacheSize = 8 * 1024 * 1024;
3969
3970 dbs.push_back(rdb);
3971 cache_set.insert(rocksdb_tbl_options->block_cache.get());
3972
3973 for (const auto &cf_handle : cf_manager.get_all_cf()) {
3974 rocksdb::ColumnFamilyDescriptor cf_desc;
3975 cf_handle->GetDescriptor(&cf_desc);
3976 auto *const table_factory = cf_desc.options.table_factory.get();
3977
3978 if (table_factory != nullptr) {
3979 std::string tf_name = table_factory->Name();
3980
3981 if (tf_name.find("BlockBasedTable") != std::string::npos) {
3982 const rocksdb::BlockBasedTableOptions *const bbt_opt =
3983 reinterpret_cast<rocksdb::BlockBasedTableOptions *>(
3984 table_factory->GetOptions());
3985
3986 if (bbt_opt != nullptr) {
3987 if (bbt_opt->block_cache.get() != nullptr) {
3988 cache_set.insert(bbt_opt->block_cache.get());
3989 } else {
3990 internal_cache_count++;
3991 }
3992 cache_set.insert(bbt_opt->block_cache_compressed.get());
3993 }
3994 }
3995 }
3996 }
3997
3998 std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
3999 str.clear();
4000 rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
4001 &temp_usage_by_type);
4002 snprintf(buf, sizeof(buf), "\nMemTable Total: %llu",
4003 (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal]);
4004 str.append(buf);
4005 snprintf(buf, sizeof(buf), "\nMemTable Unflushed: %llu",
4006 (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed]);
4007 str.append(buf);
4008 snprintf(buf, sizeof(buf), "\nTable Readers Total: %llu",
4009 (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kTableReadersTotal]);
4010 str.append(buf);
4011 snprintf(buf, sizeof(buf), "\nCache Total: %llu",
4012 (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kCacheTotal]);
4013 str.append(buf);
4014 snprintf(buf, sizeof(buf), "\nDefault Cache Capacity: %llu",
4015 (ulonglong)internal_cache_count * kDefaultInternalCacheSize);
4016 str.append(buf);
4017 res |= print_stats(thd, "MEMORY_STATS", "rocksdb", str, stat_print);
4018#ifdef MARIAROCKS_NOT_YET
4019 /* Show the background thread status */
4020 std::vector<rocksdb::ThreadStatus> thread_list;
4021 rocksdb::Status s = rdb->GetEnv()->GetThreadList(&thread_list);
4022
4023 if (!s.ok()) {
4024 sql_print_error("RocksDB: Returned error (%s) from GetThreadList.\n",
4025 s.ToString().c_str());
4026 res |= true;
4027 } else {
4028 /* For each background thread retrieved, print out its information */
4029 for (auto &it : thread_list) {
4030 /* Only look at background threads. Ignore user threads, if any. */
4031 if (it.thread_type > rocksdb::ThreadStatus::LOW_PRIORITY) {
4032 continue;
4033 }
4034
4035 str = "\nthread_type: " + it.GetThreadTypeName(it.thread_type) +
4036 "\ncf_name: " + it.cf_name +
4037 "\noperation_type: " + it.GetOperationName(it.operation_type) +
4038 "\noperation_stage: " +
4039 it.GetOperationStageName(it.operation_stage) +
4040 "\nelapsed_time_ms: " +
4041 it.MicrosToString(it.op_elapsed_micros);
4042
4043 for (auto &it_props :
4044 it.InterpretOperationProperties(it.operation_type,
4045 it.op_properties)) {
4046 str += "\n" + it_props.first + ": " + std::to_string(it_props.second);
4047 }
4048
4049 str += "\nstate_type: " + it.GetStateName(it.state_type);
4050
4051 res |= print_stats(thd, "BG_THREADS", std::to_string(it.thread_id),
4052 str, stat_print);
4053 }
4054 }
4055#endif
4056
4057#ifdef MARIAROCKS_NOT_YET
4058 } else if (stat_type == HA_ENGINE_TRX) {
4059 /* Handle the SHOW ENGINE ROCKSDB TRANSACTION STATUS command */
4060 res |= rocksdb_show_snapshot_status(hton, thd, stat_print);
4061#endif
4062 }
4063 return res;
4064}
4065
4066static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd,
4067 Rdb_transaction *const tx) {
4068 DBUG_ASSERT(tx != nullptr);
4069
4070 trans_register_ha(thd, FALSE, rocksdb_hton);
4071 if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
4072 tx->start_stmt();
4073 trans_register_ha(thd, TRUE, rocksdb_hton);
4074 }
4075}
4076
/* File name extensions exposed to the server; only the NullS terminator,
   i.e. MyRocks registers no per-table auxiliary files. */
static const char *ha_rocksdb_exts[] = {NullS};
4078
4079/*
4080 Supporting START TRANSACTION WITH CONSISTENT [ROCKSDB] SNAPSHOT
4081
4082 Features:
4083 1. Supporting START TRANSACTION WITH CONSISTENT SNAPSHOT
4084 2. Getting current binlog position in addition to #1.
4085
4086 The second feature is done by START TRANSACTION WITH
4087 CONSISTENT ROCKSDB SNAPSHOT. This is Facebook's extension, and
4088 it works like existing START TRANSACTION WITH CONSISTENT INNODB SNAPSHOT.
4089
4090 - When not setting engine, START TRANSACTION WITH CONSISTENT SNAPSHOT
4091 takes both InnoDB and RocksDB snapshots, and both InnoDB and RocksDB
4092 participate in transaction. When executing COMMIT, both InnoDB and
4093 RocksDB modifications are committed. Remember that XA is not supported yet,
4094 so mixing engines is not recommended anyway.
4095
4096 - When setting engine, START TRANSACTION WITH CONSISTENT.. takes
4097 snapshot for the specified engine only. But it starts both
4098 InnoDB and RocksDB transactions.
4099*/
4100static int rocksdb_start_tx_and_assign_read_view(
4101 handlerton *const hton, /*!< in: RocksDB handlerton */
4102 THD* thd) /*!< in: MySQL thread handle of the
4103 user for whom the transaction should
4104 be committed */
4105{
4106 ulong const tx_isolation = my_core::thd_tx_isolation(thd);
4107
4108 if (tx_isolation != ISO_REPEATABLE_READ) {
4109 my_error(ER_ISOLATION_LEVEL_WITH_CONSISTENT_SNAPSHOT, MYF(0));
4110 return HA_EXIT_FAILURE;
4111 }
4112 /*
4113 MariaDB: there is no need to call mysql_bin_log_lock_commits and then
4114 unlock back.
4115 SQL layer calls start_consistent_snapshot() for all engines, including the
4116 binlog under LOCK_commit_ordered mutex.
4117 The mutex prevents binlog commits from happening (right?) while the storage
4118 engine(s) allocate read snapshots. That way, each storage engine is
4119 synchronized with current binlog position.
4120 */
4121 mysql_mutex_assert_owner(&LOCK_commit_ordered);
4122
4123 Rdb_transaction *const tx = get_or_create_tx(thd);
4124 Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));
4125
4126 DBUG_ASSERT(!tx->has_snapshot());
4127 tx->set_tx_read_only(true);
4128 rocksdb_register_tx(hton, thd, tx);
4129 tx->acquire_snapshot(true);
4130
4131 return HA_EXIT_SUCCESS;
4132}
4133
4134/* Dummy SAVEPOINT support. This is needed for long running transactions
4135 * like mysqldump (https://bugs.mysql.com/bug.php?id=71017).
4136 * Current SAVEPOINT does not correctly handle ROLLBACK and does not return
4137 * errors. This needs to be addressed in future versions (Issue#96).
4138 */
4139static int rocksdb_savepoint(handlerton *const hton, THD *const thd,
4140 void *const savepoint) {
4141 return HA_EXIT_SUCCESS;
4142}
4143
4144static int rocksdb_rollback_to_savepoint(handlerton *const hton, THD *const thd,
4145 void *const savepoint) {
4146 Rdb_transaction *&tx = get_tx_from_thd(thd);
4147 return tx->rollback_to_savepoint(savepoint);
4148}
4149
4150static bool
4151rocksdb_rollback_to_savepoint_can_release_mdl(handlerton *const hton,
4152 THD *const thd) {
4153 return true;
4154}
4155
#ifdef MARIAROCKS_NOT_YET
/*
  This is called for INFORMATION_SCHEMA

  Walks every open MyRocks table and reports its accumulated I/O and lock
  statistics through the supplied per-table callback.  Statistics that only
  apply to InnoDB are passed as zeroed structures.
*/
static void rocksdb_update_table_stats(
    /* per-table stats callback */
    void (*cb)(const char *db, const char *tbl, bool is_partition,
               my_io_perf_t *r, my_io_perf_t *w, my_io_perf_t *r_blob,
               my_io_perf_t *r_primary, my_io_perf_t *r_secondary,
               page_stats_t *page_stats, comp_stats_t *comp_stats,
               int n_lock_wait, int n_lock_wait_timeout, int n_lock_deadlock,
               const char *engine)) {
  // NOTE(review): io_perf_read is not zero-initialized here; this assumes
  // every member read by the callback is assigned in the loop below --
  // confirm.
  my_io_perf_t io_perf_read;
  my_io_perf_t io_perf_write;
  my_io_perf_t io_perf;
  page_stats_t page_stats;
  comp_stats_t comp_stats;
  uint lock_wait_timeout_stats;
  uint deadlock_stats;
  uint lock_wait_stats;
  std::vector<std::string> tablenames;

  /*
    Most of these are for innodb, so setting them to 0.
    TODO: possibly separate out primary vs. secondary index reads
  */
  memset(&io_perf, 0, sizeof(io_perf));
  memset(&page_stats, 0, sizeof(page_stats));
  memset(&comp_stats, 0, sizeof(comp_stats));
  memset(&io_perf_write, 0, sizeof(io_perf_write));

  tablenames = rdb_open_tables.get_table_names();

  for (const auto &it : tablenames) {
    Rdb_table_handler *table_handler;
    std::string str, dbname, tablename, partname;
    char dbname_sys[NAME_LEN + 1];
    char tablename_sys[NAME_LEN + 1];
    bool is_partition;

    if (rdb_normalize_tablename(it, &str) != HA_EXIT_SUCCESS) {
      /* Function needs to return void because of the interface and we've
       * detected an error which shouldn't happen. There's no way to let
       * caller know that something failed.
       */
      SHIP_ASSERT(false);
      return;
    }

    // Tables whose names cannot be split into db/table[/partition] are
    // silently skipped.
    if (rdb_split_normalized_tablename(str, &dbname, &tablename, &partname)) {
      continue;
    }

    is_partition = (partname.size() != 0);

    table_handler = rdb_open_tables.get_table_handler(it.c_str());
    if (table_handler == nullptr) {
      continue;
    }

    io_perf_read.bytes = table_handler->m_io_perf_read.bytes.load();
    io_perf_read.requests = table_handler->m_io_perf_read.requests.load();
    io_perf_write.bytes = table_handler->m_io_perf_write.bytes.load();
    io_perf_write.requests = table_handler->m_io_perf_write.requests.load();
    lock_wait_timeout_stats = table_handler->m_lock_wait_timeout_counter.load();
    deadlock_stats = table_handler->m_deadlock_counter.load();
    lock_wait_stats =
        table_handler->m_table_perf_context.m_value[PC_KEY_LOCK_WAIT_COUNT]
            .load();

    /*
      Convert from rocksdb timer to mysql timer. RocksDB values are
      in nanoseconds, but table statistics expect the value to be
      in my_timer format.
    */
    io_perf_read.svc_time = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.svc_time.load() / 1000);
    io_perf_read.svc_time_max = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.svc_time_max.load() / 1000);
    io_perf_read.wait_time = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.wait_time.load() / 1000);
    io_perf_read.wait_time_max = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.wait_time_max.load() / 1000);
    io_perf_read.slow_ios = table_handler->m_io_perf_read.slow_ios.load();
    // Drop the reference acquired by get_table_handler() above.
    rdb_open_tables.release_table_handler(table_handler);

    /*
      Table stats expects our database and table name to be in system encoding,
      not filename format. Convert before calling callback.
    */
    my_core::filename_to_tablename(dbname.c_str(), dbname_sys,
                                   sizeof(dbname_sys));
    my_core::filename_to_tablename(tablename.c_str(), tablename_sys,
                                   sizeof(tablename_sys));
    (*cb)(dbname_sys, tablename_sys, is_partition, &io_perf_read,
          &io_perf_write, &io_perf, &io_perf, &io_perf, &page_stats,
          &comp_stats, lock_wait_stats, lock_wait_timeout_stats, deadlock_stats,
          rocksdb_hton_name);
  }
}
#endif
4257static rocksdb::Status check_rocksdb_options_compatibility(
4258 const char *const dbpath, const rocksdb::Options &main_opts,
4259 const std::vector<rocksdb::ColumnFamilyDescriptor> &cf_descr) {
4260 DBUG_ASSERT(rocksdb_datadir != nullptr);
4261
4262 rocksdb::DBOptions loaded_db_opt;
4263 std::vector<rocksdb::ColumnFamilyDescriptor> loaded_cf_descs;
4264 rocksdb::Status status =
4265 LoadLatestOptions(dbpath, rocksdb::Env::Default(), &loaded_db_opt,
4266 &loaded_cf_descs, rocksdb_ignore_unknown_options);
4267
4268 // If we're starting from scratch and there are no options saved yet then this
4269 // is a valid case. Therefore we can't compare the current set of options to
4270 // anything.
4271 if (status.IsNotFound()) {
4272 return rocksdb::Status::OK();
4273 }
4274
4275 if (!status.ok()) {
4276 return status;
4277 }
4278
4279 if (loaded_cf_descs.size() != cf_descr.size()) {
4280 return rocksdb::Status::NotSupported("Mismatched size of column family "
4281 "descriptors.");
4282 }
4283
4284 // Please see RocksDB documentation for more context about why we need to set
4285 // user-defined functions and pointer-typed options manually.
4286 for (size_t i = 0; i < loaded_cf_descs.size(); i++) {
4287 loaded_cf_descs[i].options.compaction_filter =
4288 cf_descr[i].options.compaction_filter;
4289 loaded_cf_descs[i].options.compaction_filter_factory =
4290 cf_descr[i].options.compaction_filter_factory;
4291 loaded_cf_descs[i].options.comparator = cf_descr[i].options.comparator;
4292 loaded_cf_descs[i].options.memtable_factory =
4293 cf_descr[i].options.memtable_factory;
4294 loaded_cf_descs[i].options.merge_operator =
4295 cf_descr[i].options.merge_operator;
4296 loaded_cf_descs[i].options.prefix_extractor =
4297 cf_descr[i].options.prefix_extractor;
4298 loaded_cf_descs[i].options.table_factory =
4299 cf_descr[i].options.table_factory;
4300 }
4301
4302 // This is the essence of the function - determine if it's safe to open the
4303 // database or not.
4304 status = CheckOptionsCompatibility(dbpath, rocksdb::Env::Default(), main_opts,
4305 loaded_cf_descs,
4306 rocksdb_ignore_unknown_options);
4307
4308 return status;
4309}
4310
/* When true, rocksdb_init_func() refuses to initialize the plugin again;
   presumably set when the plugin is unloaded (setter not in this chunk --
   see the "Loading MyRocks plugin after it has been unloaded" error). */
bool prevent_myrocks_loading= false;
4312
4313
4314/*
4315 Storage Engine initialization function, invoked when plugin is loaded.
4316*/
4317
4318static int rocksdb_init_func(void *const p) {
4319
4320 DBUG_ENTER_FUNC();
4321
4322 if (prevent_myrocks_loading)
4323 {
4324 my_error(ER_INTERNAL_ERROR, MYF(0),
4325 "Loading MyRocks plugin after it has been unloaded is not "
4326 "supported. Please restart mysqld");
4327 DBUG_RETURN(1);
4328 }
4329
4330 if (rdb_check_rocksdb_corruption()) {
4331 sql_print_error("RocksDB: There was a corruption detected in RockDB files. "
4332 "Check error log emitted earlier for more details.");
4333 if (rocksdb_allow_to_start_after_corruption) {
4334 sql_print_information(
4335 "RocksDB: Remove rocksdb_allow_to_start_after_corruption to prevent "
4336 "server operating if RocksDB corruption is detected.");
4337 } else {
4338 sql_print_error("RocksDB: The server will exit normally and stop restart "
4339 "attempts. Remove %s file from data directory and "
4340 "start mysqld manually.",
4341 rdb_corruption_marker_file_name().c_str());
4342 exit(0);
4343 }
4344 }
4345
4346 // Validate the assumption about the size of ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN.
4347 static_assert(sizeof(longlong) == 8, "Assuming that longlong is 8 bytes.");
4348
4349 init_rocksdb_psi_keys();
4350
4351 rocksdb_hton = (handlerton *)p;
4352 mysql_mutex_init(rdb_psi_open_tbls_mutex_key, &rdb_open_tables.m_mutex,
4353 MY_MUTEX_INIT_FAST);
4354#ifdef HAVE_PSI_INTERFACE
4355 rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key, rdb_signal_bg_psi_cond_key);
4356 rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key,
4357 rdb_signal_drop_idx_psi_cond_key);
4358#else
4359 rdb_bg_thread.init();
4360 rdb_drop_idx_thread.init();
4361#endif
4362 mysql_mutex_init(rdb_collation_data_mutex_key, &rdb_collation_data_mutex,
4363 MY_MUTEX_INIT_FAST);
4364 mysql_mutex_init(rdb_mem_cmp_space_mutex_key, &rdb_mem_cmp_space_mutex,
4365 MY_MUTEX_INIT_FAST);
4366
4367 const char* initial_rocksdb_datadir_for_ignore_dirs= rocksdb_datadir;
4368 if (!strncmp(rocksdb_datadir, "./", 2))
4369 initial_rocksdb_datadir_for_ignore_dirs += 2;
4370 ignore_db_dirs_append(initial_rocksdb_datadir_for_ignore_dirs);
4371
4372#if defined(HAVE_PSI_INTERFACE)
4373 rdb_collation_exceptions =
4374 new Regex_list_handler(key_rwlock_collation_exception_list);
4375#else
4376 rdb_collation_exceptions = new Regex_list_handler();
4377#endif
4378
4379 mysql_mutex_init(rdb_sysvars_psi_mutex_key, &rdb_sysvars_mutex,
4380 MY_MUTEX_INIT_FAST);
4381 Rdb_transaction::init_mutex();
4382
4383 rocksdb_hton->state = SHOW_OPTION_YES;
4384 rocksdb_hton->create = rocksdb_create_handler;
4385 rocksdb_hton->close_connection = rocksdb_close_connection;
4386
4387 rocksdb_hton->prepare = rocksdb_prepare;
4388 rocksdb_hton->prepare_ordered = NULL; // Do not need it
4389
4390 rocksdb_hton->commit_by_xid = rocksdb_commit_by_xid;
4391 rocksdb_hton->rollback_by_xid = rocksdb_rollback_by_xid;
4392 rocksdb_hton->recover = rocksdb_recover;
4393
4394 rocksdb_hton->commit_ordered= rocksdb_commit_ordered;
4395 rocksdb_hton->commit = rocksdb_commit;
4396
4397 rocksdb_hton->commit_checkpoint_request= rocksdb_checkpoint_request;
4398
4399 rocksdb_hton->rollback = rocksdb_rollback;
4400 rocksdb_hton->show_status = rocksdb_show_status;
4401 rocksdb_hton->start_consistent_snapshot =
4402 rocksdb_start_tx_and_assign_read_view;
4403 rocksdb_hton->savepoint_set = rocksdb_savepoint;
4404 rocksdb_hton->savepoint_rollback = rocksdb_rollback_to_savepoint;
4405 rocksdb_hton->savepoint_rollback_can_release_mdl =
4406 rocksdb_rollback_to_savepoint_can_release_mdl;
4407#ifdef MARIAROCKS_NOT_YET
4408 rocksdb_hton->update_table_stats = rocksdb_update_table_stats;
4409#endif // MARIAROCKS_NOT_YET
4410
4411 /*
4412 Not needed in MariaDB:
4413 rocksdb_hton->flush_logs = rocksdb_flush_wal;
4414 */
4415
4416 rocksdb_hton->flags = HTON_TEMPORARY_NOT_SUPPORTED |
4417 HTON_SUPPORTS_EXTENDED_KEYS | HTON_CAN_RECREATE;
4418
4419 rocksdb_hton->tablefile_extensions= ha_rocksdb_exts;
4420 DBUG_ASSERT(!mysqld_embedded);
4421
4422 if (rocksdb_db_options->max_open_files > (long)open_files_limit) {
4423 sql_print_information("RocksDB: rocksdb_max_open_files should not be "
4424 "greater than the open_files_limit, effective value "
4425 "of rocksdb_max_open_files is being set to "
4426 "open_files_limit / 2.");
4427 rocksdb_db_options->max_open_files = open_files_limit / 2;
4428 } else if (rocksdb_db_options->max_open_files == -2) {
4429 rocksdb_db_options->max_open_files = open_files_limit / 2;
4430 }
4431
4432 rocksdb_stats = rocksdb::CreateDBStatistics();
4433 rocksdb_db_options->statistics = rocksdb_stats;
4434
4435 if (rocksdb_rate_limiter_bytes_per_sec != 0) {
4436 rocksdb_rate_limiter.reset(
4437 rocksdb::NewGenericRateLimiter(rocksdb_rate_limiter_bytes_per_sec));
4438 rocksdb_db_options->rate_limiter = rocksdb_rate_limiter;
4439 }
4440
4441 rocksdb_db_options->delayed_write_rate = rocksdb_delayed_write_rate;
4442
4443 std::shared_ptr<Rdb_logger> myrocks_logger = std::make_shared<Rdb_logger>();
4444 rocksdb::Status s = rocksdb::CreateLoggerFromOptions(
4445 rocksdb_datadir, *rocksdb_db_options, &rocksdb_db_options->info_log);
4446 if (s.ok()) {
4447 myrocks_logger->SetRocksDBLogger(rocksdb_db_options->info_log);
4448 }
4449
4450 rocksdb_db_options->info_log = myrocks_logger;
4451 myrocks_logger->SetInfoLogLevel(
4452 static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level));
4453 rocksdb_db_options->wal_dir = rocksdb_wal_dir;
4454
4455 rocksdb_db_options->wal_recovery_mode =
4456 static_cast<rocksdb::WALRecoveryMode>(rocksdb_wal_recovery_mode);
4457
4458 rocksdb_db_options->access_hint_on_compaction_start =
4459 static_cast<rocksdb::Options::AccessHint>(
4460 rocksdb_access_hint_on_compaction_start);
4461
4462 if (rocksdb_db_options->allow_mmap_reads &&
4463 rocksdb_db_options->use_direct_reads) {
4464 // allow_mmap_reads implies !use_direct_reads and RocksDB will not open if
4465 // mmap_reads and direct_reads are both on. (NO_LINT_DEBUG)
4466 sql_print_error("RocksDB: Can't enable both use_direct_reads "
4467 "and allow_mmap_reads\n");
4468 DBUG_RETURN(HA_EXIT_FAILURE);
4469 }
4470
4471 if (rocksdb_db_options->allow_mmap_writes &&
4472 rocksdb_db_options->use_direct_io_for_flush_and_compaction) {
4473 // See above comment for allow_mmap_reads. (NO_LINT_DEBUG)
4474 sql_print_error("RocksDB: Can't enable both "
4475 "use_direct_io_for_flush_and_compaction and "
4476 "allow_mmap_writes\n");
4477 DBUG_RETURN(HA_EXIT_FAILURE);
4478 }
4479
4480 if (rocksdb_db_options->allow_mmap_writes &&
4481 rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) {
4482 // NO_LINT_DEBUG
4483 sql_print_error("RocksDB: rocksdb_flush_log_at_trx_commit needs to be 0 "
4484 "to use allow_mmap_writes");
4485 DBUG_RETURN(HA_EXIT_FAILURE);
4486 }
4487
4488 // sst_file_manager will move deleted rocksdb sst files to trash_dir
4489 // to be deleted in a background thread.
4490 std::string trash_dir = std::string(rocksdb_datadir) + "/trash";
4491 rocksdb_db_options->sst_file_manager.reset(NewSstFileManager(
4492 rocksdb_db_options->env, myrocks_logger, trash_dir,
4493 rocksdb_sst_mgr_rate_bytes_per_sec, true /* delete_existing_trash */));
4494
4495 std::vector<std::string> cf_names;
4496 rocksdb::Status status;
4497 status = rocksdb::DB::ListColumnFamilies(*rocksdb_db_options, rocksdb_datadir,
4498 &cf_names);
4499 if (!status.ok()) {
4500 /*
4501 When we start on an empty datadir, ListColumnFamilies returns IOError,
4502 and RocksDB doesn't provide any way to check what kind of error it was.
4503 Checking system errno happens to work right now.
4504 */
4505 if (status.IsIOError()
4506#ifndef _WIN32
4507 && errno == ENOENT
4508#endif
4509 ) {
4510 sql_print_information("RocksDB: Got ENOENT when listing column families");
4511 sql_print_information(
4512 "RocksDB: assuming that we're creating a new database");
4513 } else {
4514 rdb_log_status_error(status, "Error listing column families");
4515 DBUG_RETURN(HA_EXIT_FAILURE);
4516 }
4517 } else
4518 sql_print_information("RocksDB: %ld column families found",
4519 cf_names.size());
4520
4521 std::vector<rocksdb::ColumnFamilyDescriptor> cf_descr;
4522 std::vector<rocksdb::ColumnFamilyHandle *> cf_handles;
4523
4524 rocksdb_tbl_options->index_type =
4525 (rocksdb::BlockBasedTableOptions::IndexType)rocksdb_index_type;
4526
4527 if (!rocksdb_tbl_options->no_block_cache) {
4528 std::shared_ptr<rocksdb::Cache> block_cache = rocksdb_use_clock_cache
4529 ? rocksdb::NewClockCache(rocksdb_block_cache_size)
4530 : rocksdb::NewLRUCache(rocksdb_block_cache_size);
4531 if (rocksdb_sim_cache_size > 0) {
4532 // Simulated cache enabled
4533 // Wrap block cache inside a simulated cache and pass it to RocksDB
4534 rocksdb_tbl_options->block_cache =
4535 rocksdb::NewSimCache(block_cache, rocksdb_sim_cache_size, 6);
4536 } else {
4537 // Pass block cache to RocksDB
4538 rocksdb_tbl_options->block_cache = block_cache;
4539 }
4540 }
4541 // Using newer BlockBasedTable format version for better compression
4542 // and better memory allocation.
4543 // See:
4544 // https://github.com/facebook/rocksdb/commit/9ab5adfc59a621d12357580c94451d9f7320c2dd
4545 rocksdb_tbl_options->format_version = 2;
4546
4547 if (rocksdb_collect_sst_properties) {
4548 properties_collector_factory =
4549 std::make_shared<Rdb_tbl_prop_coll_factory>(&ddl_manager);
4550
4551 rocksdb_set_compaction_options(nullptr, nullptr, nullptr, nullptr);
4552
4553 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
4554
4555 DBUG_ASSERT(rocksdb_table_stats_sampling_pct <=
4556 RDB_TBL_STATS_SAMPLE_PCT_MAX);
4557 properties_collector_factory->SetTableStatsSamplingPct(
4558 rocksdb_table_stats_sampling_pct);
4559
4560 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
4561 }
4562
4563 if (rocksdb_persistent_cache_size_mb > 0) {
4564 std::shared_ptr<rocksdb::PersistentCache> pcache;
4565 uint64_t cache_size_bytes= rocksdb_persistent_cache_size_mb * 1024 * 1024;
4566 status = rocksdb::NewPersistentCache(
4567 rocksdb::Env::Default(), std::string(rocksdb_persistent_cache_path),
4568 cache_size_bytes, myrocks_logger, true, &pcache);
4569 if (!status.ok()) {
4570 // NO_LINT_DEBUG
4571 sql_print_error("RocksDB: Persistent cache returned error: (%s)",
4572 status.getState());
4573 DBUG_RETURN(HA_EXIT_FAILURE);
4574 }
4575 rocksdb_tbl_options->persistent_cache = pcache;
4576 } else if (strlen(rocksdb_persistent_cache_path)) {
4577 sql_print_error("RocksDB: Must specify rocksdb_persistent_cache_size_mb");
4578 DBUG_RETURN(HA_EXIT_FAILURE);
4579 }
4580
4581 std::unique_ptr<Rdb_cf_options> cf_options_map(new Rdb_cf_options());
4582 if (!cf_options_map->init(*rocksdb_tbl_options, properties_collector_factory,
4583 rocksdb_default_cf_options,
4584 rocksdb_override_cf_options)) {
4585 // NO_LINT_DEBUG
4586 sql_print_error("RocksDB: Failed to initialize CF options map.");
4587 DBUG_RETURN(HA_EXIT_FAILURE);
4588 }
4589
4590 /*
4591 If there are no column families, we're creating the new database.
4592 Create one column family named "default".
4593 */
4594 if (cf_names.size() == 0)
4595 cf_names.push_back(DEFAULT_CF_NAME);
4596
4597 std::vector<int> compaction_enabled_cf_indices;
4598 sql_print_information("RocksDB: Column Families at start:");
4599 for (size_t i = 0; i < cf_names.size(); ++i) {
4600 rocksdb::ColumnFamilyOptions opts;
4601 cf_options_map->get_cf_options(cf_names[i], &opts);
4602
4603 sql_print_information(" cf=%s", cf_names[i].c_str());
4604 sql_print_information(" write_buffer_size=%ld", opts.write_buffer_size);
4605 sql_print_information(" target_file_size_base=%" PRIu64,
4606 opts.target_file_size_base);
4607
4608 /*
4609 Temporarily disable compactions to prevent a race condition where
4610 compaction starts before compaction filter is ready.
4611 */
4612 if (!opts.disable_auto_compactions) {
4613 compaction_enabled_cf_indices.push_back(i);
4614 opts.disable_auto_compactions = true;
4615 }
4616 cf_descr.push_back(rocksdb::ColumnFamilyDescriptor(cf_names[i], opts));
4617 }
4618
4619 rocksdb::Options main_opts(*rocksdb_db_options,
4620 cf_options_map->get_defaults());
4621
4622 rocksdb::TransactionDBOptions tx_db_options;
4623 tx_db_options.transaction_lock_timeout = 2; // 2 seconds
4624 tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>();
4625
4626 status =
4627 check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr);
4628
4629 // We won't start if we'll determine that there's a chance of data corruption
4630 // because of incompatible options.
4631 if (!status.ok()) {
4632 rdb_log_status_error(
4633 status, "Compatibility check against existing database options failed");
4634 DBUG_RETURN(HA_EXIT_FAILURE);
4635 }
4636
4637 status = rocksdb::TransactionDB::Open(
4638 main_opts, tx_db_options, rocksdb_datadir, cf_descr, &cf_handles, &rdb);
4639
4640 if (!status.ok()) {
4641 rdb_log_status_error(status, "Error opening instance");
4642 DBUG_RETURN(HA_EXIT_FAILURE);
4643 }
4644 cf_manager.init(std::move(cf_options_map), &cf_handles);
4645
4646 if (dict_manager.init(rdb->GetBaseDB(), &cf_manager)) {
4647 // NO_LINT_DEBUG
4648 sql_print_error("RocksDB: Failed to initialize data dictionary.");
4649 DBUG_RETURN(HA_EXIT_FAILURE);
4650 }
4651
4652 if (binlog_manager.init(&dict_manager)) {
4653 // NO_LINT_DEBUG
4654 sql_print_error("RocksDB: Failed to initialize binlog manager.");
4655 DBUG_RETURN(HA_EXIT_FAILURE);
4656 }
4657
4658 if (ddl_manager.init(&dict_manager, &cf_manager, rocksdb_validate_tables)) {
4659 // NO_LINT_DEBUG
4660 sql_print_error("RocksDB: Failed to initialize DDL manager.");
4661 DBUG_RETURN(HA_EXIT_FAILURE);
4662 }
4663
4664 Rdb_sst_info::init(rdb);
4665
4666 /*
4667 Enable auto compaction, things needed for compaction filter are finished
4668 initializing
4669 */
4670 std::vector<rocksdb::ColumnFamilyHandle *> compaction_enabled_cf_handles;
4671 compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size());
4672 for (const auto &index : compaction_enabled_cf_indices) {
4673 compaction_enabled_cf_handles.push_back(cf_handles[index]);
4674 }
4675
4676 status = rdb->EnableAutoCompaction(compaction_enabled_cf_handles);
4677
4678 if (!status.ok()) {
4679 rdb_log_status_error(status, "Error enabling compaction");
4680 DBUG_RETURN(HA_EXIT_FAILURE);
4681 }
4682
4683 auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME
4684#ifdef HAVE_PSI_INTERFACE
4685 ,
4686 rdb_background_psi_thread_key
4687#endif
4688 );
4689 if (err != 0) {
4690 sql_print_error("RocksDB: Couldn't start the background thread: (errno=%d)",
4691 err);
4692 DBUG_RETURN(HA_EXIT_FAILURE);
4693 }
4694
4695 err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME
4696#ifdef HAVE_PSI_INTERFACE
4697 ,
4698 rdb_drop_idx_psi_thread_key
4699#endif
4700 );
4701 if (err != 0) {
4702 sql_print_error("RocksDB: Couldn't start the drop index thread: (errno=%d)",
4703 err);
4704 DBUG_RETURN(HA_EXIT_FAILURE);
4705 }
4706
4707 rdb_set_collation_exception_list(rocksdb_strict_collation_exceptions);
4708
4709 if (rocksdb_pause_background_work) {
4710 rdb->PauseBackgroundWork();
4711 }
4712
4713 // NO_LINT_DEBUG
4714 sql_print_information("RocksDB: global statistics using %s indexer",
4715 STRINGIFY_ARG(RDB_INDEXER));
4716#if defined(HAVE_SCHED_GETCPU)
4717 if (sched_getcpu() == -1) {
4718 // NO_LINT_DEBUG
4719 sql_print_information(
4720 "RocksDB: sched_getcpu() failed - "
4721 "global statistics will use thread_id_indexer_t instead");
4722 }
4723#endif
4724
4725 /**
4726 Rocksdb does not always shutdown its threads, when
4727 plugin is shut down. Disable server's leak check
4728 at exit to avoid crash.
4729 */
4730 my_disable_leak_check = true;
4731
4732 err = my_error_register(rdb_get_error_messages, HA_ERR_ROCKSDB_FIRST,
4733 HA_ERR_ROCKSDB_LAST);
4734 if (err != 0) {
4735 // NO_LINT_DEBUG
4736 sql_print_error("RocksDB: Couldn't initialize error messages");
4737 rdb_open_tables.m_hash.~Rdb_table_set();
4738 DBUG_RETURN(HA_EXIT_FAILURE);
4739 }
4740
4741
4742
4743 // Creating an instance of HistogramImpl should only happen after RocksDB
4744 // has been successfully initialized.
4745 commit_latency_stats = new rocksdb::HistogramImpl();
4746
4747 // Construct a list of directories which will be monitored by I/O watchdog
4748 // to make sure that we won't lose write access to them.
4749 std::vector<std::string> directories;
4750
4751 // 1. Data directory.
4752 directories.push_back(mysql_real_data_home);
4753
4754 // 2. Transaction logs.
4755 if (myrocks::rocksdb_wal_dir && *myrocks::rocksdb_wal_dir) {
4756 directories.push_back(myrocks::rocksdb_wal_dir);
4757 }
4758
4759#if !defined(_WIN32) && !defined(__APPLE__)
4760 io_watchdog = new Rdb_io_watchdog(directories);
4761 io_watchdog->reset_timeout(rocksdb_io_write_timeout_secs);
4762#endif
4763
4764 // NO_LINT_DEBUG
4765 sql_print_information("MyRocks storage engine plugin has been successfully "
4766 "initialized.");
4767
4768 DBUG_RETURN(HA_EXIT_SUCCESS);
4769}
4770
4771/*
4772 Storage Engine deinitialization function, invoked when plugin is unloaded.
4773*/
4774
static int rocksdb_done_func(void *const p) {
  // Tears the engine down in dependency order: stop helper threads, flush
  // data, delete leftover transaction objects, then free managers and the DB
  // handle itself. Note: the handlerton argument `p` is not used here.
  DBUG_ENTER_FUNC();

  int error = 0;

  // signal the drop index thread to stop
  rdb_drop_idx_thread.signal(true);

  // Flush all memtables for not losing data, even if WAL is disabled.
  rocksdb_flush_all_memtables();

  // Stop all rocksdb background work
  CancelAllBackgroundWork(rdb->GetBaseDB(), true);

  // Signal the background thread to stop and to persist all stats collected
  // from background flushes and compactions. This will add more keys to a new
  // memtable, but since the memtables were just flushed, it should not trigger
  // a flush that can stall due to background threads being stopped. As long
  // as these keys are stored in a WAL file, they can be retrieved on restart.
  rdb_bg_thread.signal(true);

  // Wait for the background thread to finish.
  auto err = rdb_bg_thread.join();
  if (err != 0) {
    // We'll log the message and continue because we're shutting down and
    // continuation is the optimal strategy.
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the background thread: (errno=%d)",
                    err);
  }

  // Wait for the drop index thread to finish.
  err = rdb_drop_idx_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the index thread: (errno=%d)", err);
  }

  // A non-empty open-tables hash here means some tables were never closed;
  // report failure but keep tearing down anyway.
  if (rdb_open_tables.m_hash.size()) {
    // Looks like we are getting unloaded and yet we have some open tables
    // left behind.
    error = 1;
  }

  /*
    MariaDB: When the plugin is unloaded with UNINSTALL SONAME command, some
    connections may still have Rdb_transaction objects.

    These objects are not genuine transactions (as SQL layer makes sure that
    a plugin that is being unloaded has no open tables), they are empty
    Rdb_transaction objects that were left there to save on object
    creation/deletion.

    Go through the list and delete them.
  */
  {
    class Rdb_trx_deleter: public Rdb_tx_list_walker {
    public:
      std::set<Rdb_transaction*> rdb_trxs;

      void process_tran(const Rdb_transaction *const tx) override {
        /*
          Check if the transaction is really empty. We only check
          non-WriteBatch-based transactions, because there is no easy way to
          check WriteBatch-based transactions.
        */
        if (!tx->is_writebatch_trx()) {
          const auto tx_impl = static_cast<const Rdb_transaction_impl *>(tx);
          DBUG_ASSERT(tx_impl);
          if (tx_impl->get_rdb_trx())
            DBUG_ASSERT(0);
        }
        rdb_trxs.insert((Rdb_transaction*)tx);
      };
    } deleter;

    Rdb_transaction::walk_tx_list(&deleter);

    // Collect into a set first, then delete: each delete removes the object
    // from s_tx_list (see comment below), so deleting while walking the list
    // would mutate the list being walked.
    for (std::set<Rdb_transaction*>::iterator it= deleter.rdb_trxs.begin();
         it != deleter.rdb_trxs.end();
         ++it)
    {
      // When a transaction is deleted, it removes itself from s_tx_list.
      delete *it;
    }
  }

  /*
    destructors for static objects can be called at _exit(),
    but we want to free the memory at dlclose()
  */
  rdb_open_tables.m_hash.~Rdb_table_set();
  mysql_mutex_destroy(&rdb_open_tables.m_mutex);
  mysql_mutex_destroy(&rdb_sysvars_mutex);


  delete rdb_collation_exceptions;

  mysql_mutex_destroy(&rdb_collation_data_mutex);
  mysql_mutex_destroy(&rdb_mem_cmp_space_mutex);

  Rdb_transaction::term_mutex();

  for (auto &it : rdb_collation_data) {
    delete it;
    it = nullptr;
  }

  // Clean up the managers, then delete the DB handle they were built on.
  ddl_manager.cleanup();
  binlog_manager.cleanup();
  dict_manager.cleanup();
  cf_manager.cleanup();

  delete rdb;
  rdb = nullptr;

  delete commit_latency_stats;
  commit_latency_stats = nullptr;

#if !defined(_WIN32) && !defined(__APPLE__)
  delete io_watchdog;
  io_watchdog = nullptr;
#endif

// Disown the cache data since we're shutting down.
// This results in memory leaks but it improved the shutdown time.
// Don't disown when running under valgrind
#ifndef HAVE_purify
  if (rocksdb_tbl_options->block_cache) {
    rocksdb_tbl_options->block_cache->DisownData();
  }
#endif /* HAVE_purify */

  /*
    MariaDB: don't clear rocksdb_db_options and rocksdb_tbl_options.
    MyRocks' plugin variables refer to them.

    The plugin cannot be loaded again (see prevent_myrocks_loading) but plugin
    variables are processed before myrocks::rocksdb_init_func is invoked, so
    they must point to valid memory.
  */
  //rocksdb_db_options = nullptr;
  rocksdb_db_options->statistics = nullptr;
  //rocksdb_tbl_options = nullptr;
  rocksdb_stats = nullptr;

  my_error_unregister(HA_ERR_ROCKSDB_FIRST, HA_ERR_ROCKSDB_LAST);

  /*
    Prevent loading the plugin after it has been loaded and then unloaded. This
    doesn't work currently.
  */
  prevent_myrocks_loading= true;

  DBUG_RETURN(error);
}
4931
4932static inline void rocksdb_smart_seek(bool seek_backward,
4933 rocksdb::Iterator *const iter,
4934 const rocksdb::Slice &key_slice) {
4935 if (seek_backward) {
4936 iter->SeekForPrev(key_slice);
4937 } else {
4938 iter->Seek(key_slice);
4939 }
4940}
4941
4942static inline void rocksdb_smart_next(bool seek_backward,
4943 rocksdb::Iterator *const iter) {
4944 if (seek_backward) {
4945 iter->Prev();
4946 } else {
4947 iter->Next();
4948 }
4949}
4950
#ifndef NDEBUG
// Debug-build hook: overwrite *status with a Corruption status so that tests
// can simulate RocksDB reporting corrupted data (used via DBUG_EXECUTE_IF,
// e.g. in is_valid() below).
static void dbug_change_status_to_corrupted(rocksdb::Status *status) {
  *status = rocksdb::Status::Corruption();
}
#endif
4957
4958// If the iterator is not valid it might be because of EOF but might be due
4959// to IOError or corruption. The good practice is always check it.
4960// https://github.com/facebook/rocksdb/wiki/Iterator#error-handling
4961static inline bool is_valid(rocksdb::Iterator *scan_it) {
4962 if (scan_it->Valid()) {
4963 return true;
4964 } else {
4965 rocksdb::Status s = scan_it->status();
4966 DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
4967 dbug_change_status_to_corrupted(&s););
4968 if (s.IsIOError() || s.IsCorruption()) {
4969 if (s.IsCorruption()) {
4970 rdb_persist_corruption_marker();
4971 }
4972 rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
4973 }
4974 return false;
4975 }
4976}
4977
4978/**
4979 @brief
4980 Example of simple lock controls. The "table_handler" it creates is a
4981 structure we will pass to each ha_rocksdb handler. Do you have to have
4982 one of these? Well, you have pieces that are used for locking, and
4983 they are needed to function.
4984*/
4985
Rdb_table_handler *
Rdb_open_tables_map::get_table_handler(const char *const table_name) {
  // Returns the shared, reference-counted Rdb_table_handler for table_name,
  // creating and registering it in the hash map if this is the first open.
  // Returns nullptr on allocation or hash-insert failure.
  Rdb_table_handler *table_handler;
  uint length;
  char *tmp_name;

  DBUG_ASSERT(table_name != nullptr);
  length = (uint)strlen(table_name);

  // First, look up the table in the hash map.
  RDB_MUTEX_LOCK_CHECK(m_mutex);
  if (!m_hash.size() || !(table_handler = m_hash.find(table_name, length))) {
    // Since we did not find it in the hash map, attempt to create and add it
    // to the hash map.
    // my_multi_malloc makes ONE zero-filled allocation holding both the
    // Rdb_table_handler struct and the table-name copy right after it;
    // release_table_handler() frees both with a single my_free().
    if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(my_multi_malloc(
              MYF(MY_WME | MY_ZEROFILL), &table_handler, sizeof(*table_handler),
              &tmp_name, length + 1, NullS)))) {
      // Allocating a new Rdb_table_handler and a new table name failed.
      RDB_MUTEX_UNLOCK_CHECK(m_mutex);
      return nullptr;
    }

    table_handler->m_ref_count = 0;
    table_handler->m_table_name_length = length;
    table_handler->m_table_name = tmp_name;
    strmov(table_handler->m_table_name, table_name);

    if (m_hash.insert(table_handler)) {
      // Inserting into the hash map failed.
      RDB_MUTEX_UNLOCK_CHECK(m_mutex);
      my_free(table_handler);
      return nullptr;
    }

    thr_lock_init(&table_handler->m_thr_lock);
#ifdef MARIAROCKS_NOT_YET
    table_handler->m_io_perf_read.init();
    table_handler->m_io_perf_write.init();
#endif
  }
  DBUG_ASSERT(table_handler->m_ref_count >= 0);
  // Reference counting happens under m_mutex; the matching decrement is in
  // release_table_handler().
  table_handler->m_ref_count++;

  RDB_MUTEX_UNLOCK_CHECK(m_mutex);

  return table_handler;
}
5033
std::vector<std::string> rdb_get_open_table_names(void) {
  // Thin free-function wrapper over the global open-tables map.
  return rdb_open_tables.get_table_names();
}
5037
std::vector<std::string> Rdb_open_tables_map::get_table_names(void) const {
  // Snapshot the names of all currently-open tables under the map mutex.
  size_t i;
  const Rdb_table_handler *table_handler;
  std::vector<std::string> names;

  RDB_MUTEX_LOCK_CHECK(m_mutex);
  // The loop relies on m_hash.at(i) yielding a null pointer once i runs past
  // the last entry; the assert below verifies exactly size() entries were
  // visited.
  for (i = 0; (table_handler = m_hash.at(i)); i++) {
    DBUG_ASSERT(table_handler != nullptr);
    names.push_back(table_handler->m_table_name);
  }
  DBUG_ASSERT(i == m_hash.size());
  RDB_MUTEX_UNLOCK_CHECK(m_mutex);

  return names;
}
5053
5054/*
5055 Inspired by innobase_get_int_col_max_value from InnoDB. This returns the
5056 maximum value a type can take on.
5057*/
5058static ulonglong rdb_get_int_col_max_value(const Field *field) {
5059 ulonglong max_value = 0;
5060 switch (field->key_type()) {
5061 case HA_KEYTYPE_BINARY:
5062 max_value = 0xFFULL;
5063 break;
5064 case HA_KEYTYPE_INT8:
5065 max_value = 0x7FULL;
5066 break;
5067 case HA_KEYTYPE_USHORT_INT:
5068 max_value = 0xFFFFULL;
5069 break;
5070 case HA_KEYTYPE_SHORT_INT:
5071 max_value = 0x7FFFULL;
5072 break;
5073 case HA_KEYTYPE_UINT24:
5074 max_value = 0xFFFFFFULL;
5075 break;
5076 case HA_KEYTYPE_INT24:
5077 max_value = 0x7FFFFFULL;
5078 break;
5079 case HA_KEYTYPE_ULONG_INT:
5080 max_value = 0xFFFFFFFFULL;
5081 break;
5082 case HA_KEYTYPE_LONG_INT:
5083 max_value = 0x7FFFFFFFULL;
5084 break;
5085 case HA_KEYTYPE_ULONGLONG:
5086 max_value = 0xFFFFFFFFFFFFFFFFULL;
5087 break;
5088 case HA_KEYTYPE_LONGLONG:
5089 max_value = 0x7FFFFFFFFFFFFFFFULL;
5090 break;
5091 case HA_KEYTYPE_FLOAT:
5092 max_value = 0x1000000ULL;
5093 break;
5094 case HA_KEYTYPE_DOUBLE:
5095 max_value = 0x20000000000000ULL;
5096 break;
5097 default:
5098 abort();
5099 }
5100
5101 return max_value;
5102}
5103
void ha_rocksdb::load_auto_incr_value() {
  // Initialize m_tbl_def->m_auto_incr_val for this table: prefer the value
  // persisted in the data dictionary, fall back to scanning the index, and
  // default to 1 if both come up empty.
  ulonglong auto_incr = 0;
  bool validate_last = false, use_datadic = true;
#ifndef NDEBUG
  // Debug builds always re-read the index so the dictionary value can be
  // cross-checked; the dbug point simulates a pre-dictionary upgrade.
  DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", use_datadic = false;);
  validate_last = true;
#endif

  if (use_datadic && dict_manager.get_auto_incr_val(
                         m_tbl_def->get_autoincr_gl_index_id(), &auto_incr)) {
    update_auto_incr_val(auto_incr);
  }

  // If we find nothing in the data dictionary, or if we are in debug mode,
  // then call index_last to get the last value.
  //
  // This is needed when upgrading from a server that did not support
  // persistent auto_increment, of if the table is empty.
  //
  // For debug mode, we are just verifying that the data dictionary value is
  // greater than or equal to the maximum value in the table.
  if (auto_incr == 0 || validate_last) {
    auto_incr = load_auto_incr_value_from_index();
    update_auto_incr_val(auto_incr);
  }

  // If we failed to find anything from the data dictionary and index, then
  // initialize auto_increment to 1.
  if (m_tbl_def->m_auto_incr_val == 0) {
    update_auto_incr_val(1);
  }
}
5136
ulonglong ha_rocksdb::load_auto_incr_value_from_index() {
  // Read the largest stored value of the auto-increment column via an
  // index-only index_last() scan and return that value + 1 (capped at the
  // column type's maximum). Returns 0 when the table is empty.
  //
  // Save handler state that this lookup clobbers; restored below.
  const int save_active_index = active_index;
  active_index = table->s->next_number_index;
  const uint8 save_table_status = table->status;
  ulonglong last_val = 0;

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();
  if (is_new_snapshot) {
    tx->acquire_snapshot(true);
  }

  // Do a lookup. We only need index column, so it should be index-only.
  // (another reason to make it index-only is that table->read_set is not set
  // appropriately and non-index-only lookup will not read the value)
  const bool save_keyread_only = m_keyread_only;
  m_keyread_only = true;
  m_key_requested = true;

  if (!index_last(table->record[0])) {
    Field *field =
        table->key_info[table->s->next_number_index].key_part[0].field;
    ulonglong max_val = rdb_get_int_col_max_value(field);
    my_bitmap_map *const old_map =
        dbug_tmp_use_all_columns(table, table->read_set);
    last_val = field->val_int();
    // Advance past the stored maximum unless that would wrap the type.
    if (last_val != max_val) {
      last_val++;
    }
#ifndef NDEBUG
    // Debug builds: verify the data-dictionary value (if present and not
    // overridden by this transaction) is not behind the table contents.
    ulonglong dd_val;
    if (last_val <= max_val) {
      const auto &gl_index_id = m_tbl_def->get_autoincr_gl_index_id();
      if (dict_manager.get_auto_incr_val(gl_index_id, &dd_val) &&
          tx->get_auto_incr(gl_index_id) == 0) {
        DBUG_ASSERT(dd_val >= last_val);
      }
    }
#endif
    dbug_tmp_restore_column_map(table->read_set, old_map);
  }

  // Restore the handler state saved above.
  m_keyread_only = save_keyread_only;
  if (is_new_snapshot) {
    tx->release_snapshot();
  }

  table->status = save_table_status;
  active_index = save_active_index;

  /*
    Do what ha_rocksdb::index_end() does.
    (Why don't we use index_init/index_end? class handler defines index_init
    as private, for some reason).
  */
  release_scan_iterator();

  return last_val;
}
5196
5197void ha_rocksdb::update_auto_incr_val(ulonglong val) {
5198 ulonglong auto_incr_val = m_tbl_def->m_auto_incr_val;
5199 while (
5200 auto_incr_val < val &&
5201 !m_tbl_def->m_auto_incr_val.compare_exchange_weak(auto_incr_val, val)) {
5202 // Do nothing - just loop until auto_incr_val is >= val or we successfully
5203 // set it
5204 }
5205}
5206
void ha_rocksdb::update_auto_incr_val_from_field() {
  // After a row write, read the auto-increment column from table->record and
  // push (value + 1) into both the transaction and the in-memory counter.
  Field *field;
  ulonglong new_val, max_val;
  field = table->key_info[table->s->next_number_index].key_part[0].field;
  max_val = rdb_get_int_col_max_value(field);

  // Temporarily mark all columns readable so val_int() works even when the
  // column is not in table->read_set.
  my_bitmap_map *const old_map =
      dbug_tmp_use_all_columns(table, table->read_set);
  new_val = field->val_int();
  // don't increment if we would wrap around
  if (new_val != max_val) {
    new_val++;
  }

  dbug_tmp_restore_column_map(table->read_set, old_map);

  // Only update if positive value was set for auto_incr column.
  if (new_val <= max_val) {
    Rdb_transaction *const tx = get_or_create_tx(table->in_use);
    tx->set_auto_incr(m_tbl_def->get_autoincr_gl_index_id(), new_val);

    // Update the in memory auto_incr value in m_tbl_def.
    update_auto_incr_val(new_val);
  }
}
5232
int ha_rocksdb::load_hidden_pk_value() {
  // Seed m_tbl_def->m_hidden_pk_val with (largest stored hidden PK) + 1 so
  // new rows get fresh hidden-PK values. Returns HA_EXIT_SUCCESS or an error
  // from decoding the row key.
  const int save_active_index = active_index;
  // Scan the last key (presumably the hidden PK index — confirm against
  // m_tbl_def's key layout).
  active_index = m_tbl_def->m_key_count - 1;
  const uint8 save_table_status = table->status;

  /*
    We should read the latest committed value in the database.
    That is, if we have an open transaction with a snapshot, we should not use
    it as we may get old data. Start a new transaction to read the latest
    value.
  */
  Rdb_transaction *const temp_tx = new Rdb_transaction_impl(table->in_use);
  temp_tx->start_tx();
  // Temporarily swap the THD's transaction pointer to the fresh transaction;
  // it is restored (and temp_tx deleted) on every exit path below.
  Rdb_transaction *&tx = get_tx_from_thd(table->in_use);
  Rdb_transaction *save_tx= tx;
  tx= temp_tx;

  longlong hidden_pk_id = 1;
  // Do a lookup.
  if (!index_last(table->record[0])) {
    /*
      Decode PK field from the key
    */
    auto err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
    if (err) {
      delete tx;
      tx= save_tx;
      return err;
    }

    hidden_pk_id++;
  }

  // Lock-free "raise to at least hidden_pk_id" on the shared counter.
  longlong old = m_tbl_def->m_hidden_pk_val;
  while (old < hidden_pk_id &&
         !m_tbl_def->m_hidden_pk_val.compare_exchange_weak(old, hidden_pk_id)) {
  }

  delete tx;
  tx= save_tx;

  // Restore the handler state saved above.
  table->status = save_table_status;
  active_index = save_active_index;

  release_scan_iterator();

  return HA_EXIT_SUCCESS;
}
5281
/* Allocate the next hidden PK value from m_tbl_def->m_hidden_pk_val. */
5283longlong ha_rocksdb::update_hidden_pk_val() {
5284 DBUG_ASSERT(has_hidden_pk(table));
5285 const longlong new_val = m_tbl_def->m_hidden_pk_val++;
5286 return new_val;
5287}
5288
/* Extract the hidden PK id from the row key stored in m_last_rowkey. */
// Decode the hidden PK id from the row key in m_last_rowkey.
// @param[out] hidden_pk_id  the decoded id
// @return HA_EXIT_SUCCESS, or HA_ERR_ROCKSDB_CORRUPT_DATA when the key is
//         too short to contain the index-number prefix plus an 8-byte id.
int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) {
  DBUG_ASSERT(hidden_pk_id != nullptr);
  DBUG_ASSERT(table != nullptr);
  DBUG_ASSERT(has_hidden_pk(table));

  rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(), m_last_rowkey.length());

  // Get hidden primary key from old key slice
  // Key layout: [index number, INDEX_NUMBER_SIZE bytes][8-byte packed id].
  Rdb_string_reader reader(&rowkey_slice);
  if ((!reader.read(Rdb_key_def::INDEX_NUMBER_SIZE)))
    return HA_ERR_ROCKSDB_CORRUPT_DATA;

  const int length= 8; /* was Field_longlong::PACK_LENGTH in FB MySQL tree */
  const uchar *from = reinterpret_cast<const uchar *>(reader.read(length));
  if (from == nullptr) {
    /* Mem-comparable image doesn't have enough bytes */
    return HA_ERR_ROCKSDB_CORRUPT_DATA;
  }

  *hidden_pk_id = rdb_netbuf_read_uint64(&from);
  return HA_EXIT_SUCCESS;
}
5312
5313/**
5314 @brief
5315 Free lock controls. We call this whenever we close a table. If the table had
5316 the last reference to the table_handler, then we free the memory associated
5317 with it.
5318*/
5319
void Rdb_open_tables_map::release_table_handler(
    Rdb_table_handler *const table_handler) {
  // Drop one reference taken by get_table_handler(); on the last release the
  // entry is removed from the hash and its memory freed.
  RDB_MUTEX_LOCK_CHECK(m_mutex);

  DBUG_ASSERT(table_handler != nullptr);
  DBUG_ASSERT(table_handler->m_ref_count > 0);
  if (!--table_handler->m_ref_count) {
    // Last reference was released. Tear down the hash entry.
    const auto ret MY_ATTRIBUTE((__unused__)) = m_hash.remove(table_handler);
    DBUG_ASSERT(!ret); // the hash entry must actually be found and deleted
    my_core::thr_lock_delete(&table_handler->m_thr_lock);
    // The struct and the name string live in one allocation (made by
    // my_multi_malloc in get_table_handler), so one my_free releases both.
    my_free(table_handler);
  }

  RDB_MUTEX_UNLOCK_CHECK(m_mutex);
}
5336
// Handlerton factory hook: construct a new ha_rocksdb handler instance on
// the supplied MEM_ROOT (placement new — the MEM_ROOT owns the storage).
static handler *rocksdb_create_handler(my_core::handlerton *const hton,
                                       my_core::TABLE_SHARE *const table_arg,
                                       my_core::MEM_ROOT *const mem_root) {
  return new (mem_root) ha_rocksdb(hton, table_arg);
}
5342
// Handler constructor. All pointer members start out null and all flags
// false; the packed-tuple buffers are not allocated here (presumably they
// are set up later, e.g. when the table is opened — confirm elsewhere in
// this file).
ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
                       my_core::TABLE_SHARE *const table_arg)
    : handler(hton, table_arg), m_table_handler(nullptr), m_scan_it(nullptr),
      m_scan_it_skips_bloom(false), m_scan_it_snapshot(nullptr),
      m_tbl_def(nullptr), m_pk_descr(nullptr), m_key_descr_arr(nullptr),
      m_pk_can_be_decoded(false), m_maybe_unpack_info(false),
      m_pk_tuple(nullptr), m_pk_packed_tuple(nullptr),
      m_sk_packed_tuple(nullptr), m_end_key_packed_tuple(nullptr),
      m_sk_match_prefix(nullptr), m_sk_match_prefix_buf(nullptr),
      m_sk_packed_tuple_old(nullptr), m_dup_sk_packed_tuple(nullptr),
      m_dup_sk_packed_tuple_old(nullptr), m_eq_cond_lower_bound(nullptr),
      m_eq_cond_upper_bound(nullptr), m_pack_buffer(nullptr),
      m_lock_rows(RDB_LOCK_NONE), m_keyread_only(FALSE), m_encoder_arr(nullptr),
      m_row_checksums_checked(0), m_in_rpl_delete_rows(false),
      m_in_rpl_update_rows(false), m_force_skip_unique_check(false) {}
5358
5359
// Returns m_tbl_def->base_tablename() (presumably the table name without the
// schema prefix — confirm in Rdb_tbl_def).
const std::string &ha_rocksdb::get_table_basename() const {
  return m_tbl_def->base_tablename();
}
5363
5364/**
5365 @return
5366 false OK
  other Error unpacking the data
5368*/
5369bool ha_rocksdb::init_with_fields() {
5370 DBUG_ENTER_FUNC();
5371
5372 const uint pk = table_share->primary_key;
5373 if (pk != MAX_KEY) {
5374 const uint key_parts = table_share->key_info[pk].user_defined_key_parts;
5375 check_keyread_allowed(pk /*PK*/, key_parts - 1, true);
5376 } else
5377 m_pk_can_be_decoded = false;
5378
5379 cached_table_flags = table_flags();
5380
5381 DBUG_RETURN(false); /* Ok */
5382}
5383
5384/*
5385 If the key is a TTL key, we may need to filter it out.
5386
5387 The purpose of read filtering for tables with TTL is to ensure that
5388 during a transaction a key which has expired already but not removed by
5389 compaction yet is not returned to the user.
5390
5391 Without this the user might be hit with problems such as disappearing
5392 rows within a transaction, etc, because the compaction filter ignores
5393 snapshots when filtering keys.
5394*/
// @param kd           key definition; must have TTL enabled
// @param ttl_rec_val  record value slice; the 8-byte TTL timestamp sits at
//                     offset kd.m_ttl_rec_offset
// @param curr_ts      snapshot (or current) timestamp; 0 means no snapshot
// @return true if the record has expired and must be hidden from the reader
bool ha_rocksdb::should_hide_ttl_rec(const Rdb_key_def &kd,
                                     const rocksdb::Slice &ttl_rec_val,
                                     const int64_t curr_ts) {
  DBUG_ASSERT(kd.has_ttl());
  DBUG_ASSERT(kd.m_ttl_rec_offset != UINT_MAX);

  /*
    Curr_ts can only be 0 if there are no snapshots open.
    should_hide_ttl_rec can only be called when there is >=1 snapshots, unless
    we are filtering on the write path (single INSERT/UPDATE) in which case
    we are passed in the current time as curr_ts.

    In the event curr_ts is 0, we always decide not to filter the record. We
    also log a warning and increment a diagnostic counter.
  */
  if (curr_ts == 0) {
    update_row_stats(ROWS_HIDDEN_NO_SNAPSHOT);
    return false;
  }

  if (!rdb_is_ttl_read_filtering_enabled() || !rdb_is_ttl_enabled()) {
    return false;
  }

  Rdb_string_reader reader(&ttl_rec_val);

  /*
    Find where the 8-byte ttl is for each record in this index.
  */
  uint64 ts;
  if (!reader.read(kd.m_ttl_rec_offset) || reader.read_uint64(&ts)) {
    /*
      This condition should never be reached since all TTL records have an
      8 byte ttl field in front. Don't filter the record out, and log an error.
    */
    std::string buf;
    buf = rdb_hexdump(ttl_rec_val.data(), ttl_rec_val.size(),
                      RDB_MAX_HEXDUMP_LEN);
    const GL_INDEX_ID gl_index_id = kd.get_gl_index_id();
    // NO_LINT_DEBUG
    sql_print_error("Decoding ttl from PK value failed, "
                    "for index (%u,%u), val: %s",
                    gl_index_id.cf_id, gl_index_id.index_id, buf.c_str());
    DBUG_ASSERT(0);
    return false;
  }

  /* Hide record if it has expired before the current snapshot time. */
  // Debug builds can shift the expiration point via
  // rdb_dbug_set_ttl_read_filter_ts() to exercise filtering in tests.
  uint64 read_filter_ts = 0;
#ifndef NDEBUG
  read_filter_ts += rdb_dbug_set_ttl_read_filter_ts();
#endif
  bool is_hide_ttl =
      ts + kd.m_ttl_duration + read_filter_ts <= static_cast<uint64>(curr_ts);
  if (is_hide_ttl) {
    update_row_stats(ROWS_FILTERED);
  }
  return is_hide_ttl;
}
5454
5455void ha_rocksdb::rocksdb_skip_expired_records(const Rdb_key_def &kd,
5456 rocksdb::Iterator *const iter,
5457 bool seek_backward) {
5458 if (kd.has_ttl()) {
5459 while (iter->Valid() &&
5460 should_hide_ttl_rec(
5461 kd, iter->value(),
5462 get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
5463 rocksdb_smart_next(seek_backward, iter);
5464 }
5465 }
5466}
5467
5468/**
5469 Convert record from table->record[0] form into a form that can be written
5470 into rocksdb.
5471
5472 @param pk_packed_slice Packed PK tuple. We need it in order to compute
5473 and store its CRC.
5474 @param packed_rec OUT Data slice with record data.
5475*/
5476
int ha_rocksdb::convert_record_to_storage_format(
    const struct update_row_info &row_info, rocksdb::Slice *const packed_rec) {
  DBUG_ASSERT_IMP(m_maybe_unpack_info, row_info.new_pk_unpack_info);
  DBUG_ASSERT(m_pk_descr != nullptr);

  /*
    Value layout produced by this function:
      [ TTL (8 bytes, optional) ][ NULL-bits ][ unpack_info (optional) ]
      [ field values ... ][ checksum chunk (optional) ]
  */
  const rocksdb::Slice &pk_packed_slice = row_info.new_pk_slice;
  Rdb_string_writer *const pk_unpack_info = row_info.new_pk_unpack_info;
  bool has_ttl = m_pk_descr->has_ttl();
  bool has_ttl_column = !m_pk_descr->m_ttl_column.empty();
  bool ttl_in_pk = has_ttl_column && (row_info.ttl_pk_offset != UINT_MAX);

  m_storage_record.length(0);

  if (has_ttl) {
    /* If it's a TTL record, reserve space for 8 byte TTL value in front. */
    m_storage_record.fill(ROCKSDB_SIZEOF_TTL_RECORD + m_null_bytes_in_rec, 0);
    m_ttl_bytes_updated = false;

    /*
      If the TTL is contained within the key, we use the offset to find the
      TTL value and place it in the beginning of the value record.
    */
    if (ttl_in_pk) {
      Rdb_string_reader reader(&pk_packed_slice);
      const char *ts;
      if (!reader.read(row_info.ttl_pk_offset) ||
          !(ts = reader.read(ROCKSDB_SIZEOF_TTL_RECORD))) {
        /* Should not happen: the TTL bytes must be present in the key. */
        std::string buf;
        buf = rdb_hexdump(pk_packed_slice.data(), pk_packed_slice.size(),
                          RDB_MAX_HEXDUMP_LEN);
        const GL_INDEX_ID gl_index_id = m_pk_descr->get_gl_index_id();
        // NO_LINT_DEBUG
        sql_print_error("Decoding ttl from PK failed during insert, "
                        "for index (%u,%u), key: %s",
                        gl_index_id.cf_id, gl_index_id.index_id, buf.c_str());
        return HA_EXIT_FAILURE;
      }

      /* Copy the TTL bytes into the reserved slot at the front. */
      char *const data = const_cast<char *>(m_storage_record.ptr());
      memcpy(data, ts, ROCKSDB_SIZEOF_TTL_RECORD);
#ifndef NDEBUG
      // Adjust for test case if needed
      rdb_netbuf_store_uint64(
          reinterpret_cast<uchar *>(data),
          rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(data)) +
              rdb_dbug_set_ttl_rec_ts());
#endif
      // Also store in m_ttl_bytes to propagate to update_sk
      memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
    } else if (!has_ttl_column) {
      /*
        For implicitly generated TTL records we need to copy over the old
        TTL value from the old record in the event of an update. It was stored
        in m_ttl_bytes.

        Otherwise, generate a timestamp using the current time.
      */
      if (!row_info.old_pk_slice.empty()) {
        /* Update path: keep the previous record's TTL timestamp. */
        char *const data = const_cast<char *>(m_storage_record.ptr());
        memcpy(data, m_ttl_bytes, sizeof(uint64));
      } else {
        /* Insert path: stamp with the current wall-clock time. */
        uint64 ts = static_cast<uint64>(std::time(nullptr));
#ifndef NDEBUG
        ts += rdb_dbug_set_ttl_rec_ts();
#endif
        char *const data = const_cast<char *>(m_storage_record.ptr());
        rdb_netbuf_store_uint64(reinterpret_cast<uchar *>(data), ts);
        // Also store in m_ttl_bytes to propagate to update_sk
        memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
      }
    }
  } else {
    /* All NULL bits are initially 0 */
    m_storage_record.fill(m_null_bytes_in_rec, 0);
  }

  // If a primary key may have non-empty unpack_info for certain values,
  // (m_maybe_unpack_info=TRUE), we write the unpack_info block. The block
  // itself was prepared in Rdb_key_def::pack_record.
  if (m_maybe_unpack_info) {
    m_storage_record.append(reinterpret_cast<char *>(pk_unpack_info->ptr()),
                            pk_unpack_info->get_current_pos());
  }

  /* Append every field that is not fully reconstructible from the key. */
  for (uint i = 0; i < table->s->fields; i++) {
    /* Don't pack decodable PK key parts */
    if (m_encoder_arr[i].m_storage_type != Rdb_field_encoder::STORE_ALL) {
      continue;
    }

    Field *const field = table->field[i];
    if (m_encoder_arr[i].maybe_null()) {
      /* NULL-bits live right after the (optional) TTL prefix. */
      char *data = const_cast<char *>(m_storage_record.ptr());
      if (has_ttl) {
        data += ROCKSDB_SIZEOF_TTL_RECORD;
      }

      if (field->is_null()) {
        data[m_encoder_arr[i].m_null_offset] |= m_encoder_arr[i].m_null_mask;
        /* Don't write anything for NULL values */
        continue;
      }
    }

    if (m_encoder_arr[i].m_field_type == MYSQL_TYPE_BLOB) {
      my_core::Field_blob *blob = (my_core::Field_blob *)field;
      /* Get the number of bytes needed to store length*/
      const uint length_bytes = blob->pack_length() - portable_sizeof_char_ptr;

      /* Store the length of the value */
      m_storage_record.append(reinterpret_cast<char *>(blob->ptr),
                              length_bytes);

      /* Store the blob value itself */
      char *data_ptr;
      memcpy(&data_ptr, blob->ptr + length_bytes, sizeof(uchar **));
      m_storage_record.append(data_ptr, blob->get_length());
    } else if (m_encoder_arr[i].m_field_type == MYSQL_TYPE_VARCHAR) {
      Field_varstring *const field_var = (Field_varstring *)field;
      uint data_len;
      /* field_var->length_bytes is 1 or 2 */
      if (field_var->length_bytes == 1) {
        data_len = field_var->ptr[0];
      } else {
        DBUG_ASSERT(field_var->length_bytes == 2);
        data_len = uint2korr(field_var->ptr);
      }
      /* Length prefix and value are contiguous in the record buffer. */
      m_storage_record.append(reinterpret_cast<char *>(field_var->ptr),
                              field_var->length_bytes + data_len);
    } else {
      /* Copy the field data */
      const uint len = field->pack_length_in_rec();
      m_storage_record.append(reinterpret_cast<char *>(field->ptr), len);

      /*
        Check if this is the TTL field within the table, if so store the TTL
        in the front of the record as well here.
      */
      if (has_ttl && has_ttl_column &&
          i == m_pk_descr->get_ttl_field_offset()) {
        DBUG_ASSERT(len == ROCKSDB_SIZEOF_TTL_RECORD);
        DBUG_ASSERT(field->real_type() == MYSQL_TYPE_LONGLONG);
        DBUG_ASSERT(m_pk_descr->get_ttl_field_offset() != UINT_MAX);

        char *const data = const_cast<char *>(m_storage_record.ptr());
        uint64 ts = uint8korr(field->ptr);
#ifndef NDEBUG
        ts += rdb_dbug_set_ttl_rec_ts();
#endif
        rdb_netbuf_store_uint64(reinterpret_cast<uchar *>(data), ts);

        // If this is an update and the timestamp has been updated, take note
        // so we can avoid updating SKs unnecessarily.
        if (!row_info.old_pk_slice.empty()) {
          m_ttl_bytes_updated =
              memcmp(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
        }
        // Store timestamp in m_ttl_bytes to propagate to update_sk
        memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
      }
    }
  }

  /* Optionally append [tag][key crc32][value crc32] for debug checksums. */
  if (should_store_row_debug_checksums()) {
    const uint32_t key_crc32 = my_core::crc32(
        0, rdb_slice_to_uchar_ptr(&pk_packed_slice), pk_packed_slice.size());
    const uint32_t val_crc32 =
        my_core::crc32(0, rdb_mysql_str_to_uchar_str(&m_storage_record),
                       m_storage_record.length());
    uchar key_crc_buf[RDB_CHECKSUM_SIZE];
    uchar val_crc_buf[RDB_CHECKSUM_SIZE];
    rdb_netbuf_store_uint32(key_crc_buf, key_crc32);
    rdb_netbuf_store_uint32(val_crc_buf, val_crc32);
    m_storage_record.append((const char *)&RDB_CHECKSUM_DATA_TAG, 1);
    m_storage_record.append((const char *)key_crc_buf, RDB_CHECKSUM_SIZE);
    m_storage_record.append((const char *)val_crc_buf, RDB_CHECKSUM_SIZE);
  }

  *packed_rec =
      rocksdb::Slice(m_storage_record.ptr(), m_storage_record.length());

  return HA_EXIT_SUCCESS;
}
5660
5661/*
5662 @brief
5663 Setup which fields will be unpacked when reading rows
5664
5665 @detail
5666 Three special cases when we still unpack all fields:
5667 - When this table is being updated (m_lock_rows==RDB_LOCK_WRITE).
5668 - When @@rocksdb_verify_row_debug_checksums is ON (In this mode, we need to
5669 read all fields to find whether there is a row checksum at the end. We could
5670 skip the fields instead of decoding them, but currently we do decoding.)
5671 - On index merge as bitmap is cleared during that operation
5672
5673 @seealso
5674 ha_rocksdb::setup_field_converters()
5675 ha_rocksdb::convert_record_from_storage_format()
5676*/
void ha_rocksdb::setup_read_decoders() {
  m_decoders_vect.clear();
  /* Set below if any key-image-stored field is needed by this read. */
  m_key_requested = false;

  /* Index one past the last decoder that actually decodes something. */
  int last_useful = 0;
  /* Fixed-width bytes accumulated to skip before the next decoder runs. */
  int skip_size = 0;

  for (uint i = 0; i < table->s->fields; i++) {
    // bitmap is cleared on index merge, but it still needs to decode columns
    const bool field_requested =
        m_lock_rows == RDB_LOCK_WRITE || m_verify_row_debug_checksums ||
        bitmap_is_clear_all(table->read_set) ||
        bitmap_is_set(table->read_set, table->field[i]->field_index);

    // We only need the decoder if the whole record is stored.
    if (m_encoder_arr[i].m_storage_type != Rdb_field_encoder::STORE_ALL) {
      // the field potentially needs unpacking
      if (field_requested) {
        // the field is in the read set
        m_key_requested = true;
      }
      continue;
    }

    if (field_requested) {
      // We will need to decode this field
      m_decoders_vect.push_back({&m_encoder_arr[i], true, skip_size});
      last_useful = m_decoders_vect.size();
      skip_size = 0;
    } else {
      if (m_encoder_arr[i].uses_variable_len_encoding() ||
          m_encoder_arr[i].maybe_null()) {
        // For variable-length field, we need to read the data and skip it
        m_decoders_vect.push_back({&m_encoder_arr[i], false, skip_size});
        skip_size = 0;
      } else {
        // Fixed-width field can be skipped without looking at it.
        // Add appropriate skip_size to the next field.
        skip_size += m_encoder_arr[i].m_pack_length_in_rec;
      }
    }
  }

  // It could be that the last few elements are varchars that just do
  // skipping. Remove them.
  m_decoders_vect.erase(m_decoders_vect.begin() + last_useful,
                        m_decoders_vect.end());
}
5725
5726#ifndef NDEBUG
5727void dbug_append_garbage_at_end(rocksdb::PinnableSlice *on_disk_rec) {
5728 std::string str(on_disk_rec->data(), on_disk_rec->size());
5729 on_disk_rec->Reset();
5730 str.append("abc");
5731 on_disk_rec->PinSelf(rocksdb::Slice(str));
5732}
5733
// Debug helper: truncate the whole record to zero length, simulating a
// corrupt/short read from storage.
void dbug_truncate_record(rocksdb::PinnableSlice *on_disk_rec) {
  on_disk_rec->remove_suffix(on_disk_rec->size());
}
5737
5738void dbug_modify_rec_varchar12(rocksdb::PinnableSlice *on_disk_rec) {
5739 std::string res;
5740 // The record is NULL-byte followed by VARCHAR(10).
5741 // Put the NULL-byte
5742 res.append("\0", 1);
5743 // Then, add a valid VARCHAR(12) value.
5744 res.append("\xC", 1);
5745 res.append("123456789ab", 12);
5746
5747 on_disk_rec->Reset();
5748 on_disk_rec->PinSelf(rocksdb::Slice(res));
5749}
5750
5751void dbug_modify_key_varchar8(String &on_disk_rec) {
5752 std::string res;
5753 // The key starts with index number
5754 res.append(on_disk_rec.ptr(), Rdb_key_def::INDEX_NUMBER_SIZE);
5755
5756 // Then, a mem-comparable form of a varchar(8) value.
5757 res.append("ABCDE\0\0\0\xFC", 9);
5758 on_disk_rec.length(0);
5759 on_disk_rec.append(res.data(), res.size());
5760}
5761
// Debug helper: raise a deliberate error to test inplace-ALTER failure paths.
void dbug_create_err_inplace_alter() {
  my_printf_error(ER_UNKNOWN_ERROR,
                  "Intentional failure in inplace alter occurred.", MYF(0));
}
5766#endif
5767
/*
  Unpack this->m_retrieved_record into 'buf' (table->record[0] format).
  This overload only injects debug-build record corruption before
  delegating to the three-argument version below.
*/
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, uchar *const buf) {

  /* Debug hooks: corrupt the record to exercise corruption detection. */
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read1",
                  dbug_append_garbage_at_end(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read2",
                  dbug_truncate_record(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read3",
                  dbug_modify_rec_varchar12(&m_retrieved_record););

  return convert_record_from_storage_format(key, &m_retrieved_record, buf);
}
5780
5781int ha_rocksdb::convert_blob_from_storage_format(
5782 my_core::Field_blob *const blob,
5783 Rdb_string_reader *const reader,
5784 bool decode)
5785{
5786 /* Get the number of bytes needed to store length*/
5787 const uint length_bytes = blob->pack_length() - portable_sizeof_char_ptr;
5788
5789 const char *data_len_str;
5790 if (!(data_len_str = reader->read(length_bytes))) {
5791 return HA_ERR_ROCKSDB_CORRUPT_DATA;
5792 }
5793
5794 memcpy(blob->ptr, data_len_str, length_bytes);
5795
5796 const uint32 data_len = blob->get_length(
5797 reinterpret_cast<const uchar*>(data_len_str), length_bytes);
5798 const char *blob_ptr;
5799 if (!(blob_ptr = reader->read(data_len))) {
5800 return HA_ERR_ROCKSDB_CORRUPT_DATA;
5801 }
5802
5803 if (decode) {
5804 // set 8-byte pointer to 0, like innodb does (relevant for 32-bit
5805 // platforms)
5806 memset(blob->ptr + length_bytes, 0, 8);
5807 memcpy(blob->ptr + length_bytes, &blob_ptr, sizeof(uchar **));
5808 }
5809
5810 return HA_EXIT_SUCCESS;
5811}
5812
5813int ha_rocksdb::convert_varchar_from_storage_format(
5814 my_core::Field_varstring *const field_var,
5815 Rdb_string_reader *const reader,
5816 bool decode)
5817{
5818 const char *data_len_str;
5819 if (!(data_len_str = reader->read(field_var->length_bytes)))
5820 return HA_ERR_ROCKSDB_CORRUPT_DATA;
5821
5822 uint data_len;
5823 /* field_var->length_bytes is 1 or 2 */
5824 if (field_var->length_bytes == 1) {
5825 data_len = (uchar)data_len_str[0];
5826 } else {
5827 DBUG_ASSERT(field_var->length_bytes == 2);
5828 data_len = uint2korr(data_len_str);
5829 }
5830
5831 if (data_len > field_var->field_length) {
5832 /* The data on disk is longer than table DDL allows? */
5833 return HA_ERR_ROCKSDB_CORRUPT_DATA;
5834 }
5835
5836 if (!reader->read(data_len)) {
5837 return HA_ERR_ROCKSDB_CORRUPT_DATA;
5838 }
5839
5840 if (decode) {
5841 memcpy(field_var->ptr, data_len_str, field_var->length_bytes + data_len);
5842 }
5843
5844 return HA_EXIT_SUCCESS;
5845}
5846
5847int ha_rocksdb::convert_field_from_storage_format(
5848 my_core::Field *const field,
5849 Rdb_string_reader *const reader,
5850 bool decode,
5851 uint len)
5852{
5853 const char *data_bytes;
5854 if (len > 0) {
5855 if ((data_bytes = reader->read(len)) == nullptr) {
5856 return HA_ERR_ROCKSDB_CORRUPT_DATA;
5857 }
5858
5859 if (decode)
5860 memcpy(field->ptr, data_bytes, len);
5861 }
5862
5863 return HA_EXIT_SUCCESS;
5864}
5865
5866/*
5867 @brief
5868 Unpack the record in this->m_retrieved_record and this->m_last_rowkey from
5869 storage format into buf (which can be table->record[0] or table->record[1]).
5870
5871 @param key Table record's key in mem-comparable form.
5872 @param buf Store record in table->record[0] format here
5873
5874 @detail
5875 If the table has blobs, the unpacked data in buf may keep pointers to the
5876 data in this->m_retrieved_record.
5877
5878 The key is only needed to check its checksum value (the checksum is in
5879 m_retrieved_record).
5880
5881 @seealso
5882 ha_rocksdb::setup_read_decoders() Sets up data structures which tell which
5883 columns to decode.
5884
5885 @return
5886 0 OK
 other Error unpacking the data
5888*/
5889
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, const rocksdb::Slice *const value,
    uchar *const buf) {
  DBUG_ASSERT(key != nullptr);
  DBUG_ASSERT(buf != nullptr);

  Rdb_string_reader reader(value);

  /*
    Decode PK fields from the key
  */
  DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_read1",
                  dbug_modify_key_varchar8(m_last_rowkey););

  const rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(),
                                    m_last_rowkey.length());
  const char *unpack_info = nullptr;
  uint16 unpack_info_len = 0;
  rocksdb::Slice unpack_slice;

  /* If it's a TTL record, skip the 8 byte TTL value */
  const char *ttl_bytes;
  if (m_pk_descr->has_ttl()) {
    if ((ttl_bytes = reader.read(ROCKSDB_SIZEOF_TTL_RECORD))) {
      /* Keep a copy so update paths can compare/propagate the TTL. */
      memcpy(m_ttl_bytes, ttl_bytes, ROCKSDB_SIZEOF_TTL_RECORD);
    } else {
      return HA_ERR_ROCKSDB_CORRUPT_DATA;
    }
  }

  /* Other fields are decoded from the value */
  const char *null_bytes = nullptr;
  if (m_null_bytes_in_rec && !(null_bytes = reader.read(m_null_bytes_in_rec))) {
    return HA_ERR_ROCKSDB_CORRUPT_DATA;
  }

  /* Optional unpack_info block: [tag byte][2-byte length][data...] */
  if (m_maybe_unpack_info) {
    unpack_info = reader.get_current_ptr();
    if (!unpack_info || !Rdb_key_def::is_unpack_data_tag(unpack_info[0]) ||
        !reader.read(Rdb_key_def::get_unpack_header_size(unpack_info[0]))) {
      return HA_ERR_ROCKSDB_CORRUPT_DATA;
    }

    unpack_info_len =
        rdb_netbuf_to_uint16(reinterpret_cast<const uchar *>(unpack_info + 1));
    unpack_slice = rocksdb::Slice(unpack_info, unpack_info_len);

    /* Skip past the unpack data (header bytes were consumed above). */
    reader.read(unpack_info_len -
                Rdb_key_def::get_unpack_header_size(unpack_info[0]));
  }

  /* Restore PK columns from the mem-comparable key, if any are needed. */
  int err = HA_EXIT_SUCCESS;
  if (m_key_requested) {
    err = m_pk_descr->unpack_record(table, buf, &rowkey_slice,
                                    unpack_info ? &unpack_slice : nullptr,
                                    false /* verify_checksum */);
  }

  if (err != HA_EXIT_SUCCESS) {
    return err;
  }

  /*
    Walk the pre-computed decoder list (see setup_read_decoders()); each
    entry either decodes a field into 'buf' or just skips its bytes.
  */
  for (auto it = m_decoders_vect.begin(); it != m_decoders_vect.end(); it++) {
    const Rdb_field_encoder *const field_dec = it->m_field_enc;
    const bool decode = it->m_decode;
    /*
      null_bytes is only dereferenced for nullable fields; a nullable field
      implies m_null_bytes_in_rec > 0, so null_bytes was read above.
    */
    const bool isNull =
        field_dec->maybe_null() &&
        ((null_bytes[field_dec->m_null_offset] & field_dec->m_null_mask) != 0);

    Field *const field = table->field[field_dec->m_field_index];

    /* Skip the bytes we need to skip */
    if (it->m_skip && !reader.read(it->m_skip)) {
      return HA_ERR_ROCKSDB_CORRUPT_DATA;
    }

    /* Temporarily point the Field at 'buf' instead of table->record[0]. */
    uint field_offset = field->ptr - table->record[0];
    uint null_offset = field->null_offset();
    bool maybe_null = field->real_maybe_null();
    field->move_field(buf + field_offset,
                      maybe_null ? buf + null_offset : nullptr,
                      field->null_bit);
    // WARNING! - Don't return before restoring field->ptr and field->null_ptr!

    if (isNull) {
      if (decode) {
        /* This sets the NULL-bit of this record */
        field->set_null();
        /*
          Besides that, set the field value to default value. CHECKSUM TABLE
          depends on this.
        */
        memcpy(field->ptr, table->s->default_values + field_offset,
               field->pack_length());
      }
    } else {
      if (decode) {
        field->set_notnull();
      }

      if (field_dec->m_field_type == MYSQL_TYPE_BLOB) {
        err = convert_blob_from_storage_format(
            (my_core::Field_blob *) field, &reader, decode);
      } else if (field_dec->m_field_type == MYSQL_TYPE_VARCHAR) {
        err = convert_varchar_from_storage_format(
            (my_core::Field_varstring *) field, &reader, decode);
      } else {
        err = convert_field_from_storage_format(
            field, &reader, decode, field_dec->m_pack_length_in_rec);
      }
    }

    // Restore field->ptr and field->null_ptr
    field->move_field(table->record[0] + field_offset,
                      maybe_null ? table->record[0] + null_offset : nullptr,
                      field->null_bit);

    if (err != HA_EXIT_SUCCESS) {
      return err;
    }
  }

  /* Optional trailing checksum chunk: [tag][key crc32][value crc32]. */
  if (m_verify_row_debug_checksums) {
    if (reader.remaining_bytes() == RDB_CHECKSUM_CHUNK_SIZE &&
        reader.read(1)[0] == RDB_CHECKSUM_DATA_TAG) {
      uint32_t stored_key_chksum =
          rdb_netbuf_to_uint32((const uchar *)reader.read(RDB_CHECKSUM_SIZE));
      uint32_t stored_val_chksum =
          rdb_netbuf_to_uint32((const uchar *)reader.read(RDB_CHECKSUM_SIZE));

      const uint32_t computed_key_chksum =
          my_core::crc32(0, rdb_slice_to_uchar_ptr(key), key->size());
      /* The value checksum covers everything except the checksum chunk. */
      const uint32_t computed_val_chksum =
          my_core::crc32(0, rdb_slice_to_uchar_ptr(value),
                         value->size() - RDB_CHECKSUM_CHUNK_SIZE);

      DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_checksum1",
                      stored_key_chksum++;);

      if (stored_key_chksum != computed_key_chksum) {
        m_pk_descr->report_checksum_mismatch(true, key->data(), key->size());
        return HA_ERR_ROCKSDB_CHECKSUM_MISMATCH;
      }

      DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_checksum2",
                      stored_val_chksum++;);
      if (stored_val_chksum != computed_val_chksum) {
        m_pk_descr->report_checksum_mismatch(false, value->data(),
                                             value->size());
        return HA_ERR_ROCKSDB_CHECKSUM_MISMATCH;
      }

      m_row_checksums_checked++;
    }
    /* Any leftover bytes mean the record was not fully consumed. */
    if (reader.remaining_bytes())
      return HA_ERR_ROCKSDB_CORRUPT_DATA;
  }

  return HA_EXIT_SUCCESS;
}
6050
6051void ha_rocksdb::get_storage_type(Rdb_field_encoder *const encoder,
6052 const uint &kp) {
6053 // STORE_SOME uses unpack_info.
6054 if (m_pk_descr->has_unpack_info(kp)) {
6055 DBUG_ASSERT(m_pk_descr->can_unpack(kp));
6056 encoder->m_storage_type = Rdb_field_encoder::STORE_SOME;
6057 m_maybe_unpack_info = true;
6058 } else if (m_pk_descr->can_unpack(kp)) {
6059 encoder->m_storage_type = Rdb_field_encoder::STORE_NONE;
6060 }
6061}
6062
6063/*
6064 Setup data needed to convert table->record[] to and from record storage
6065 format.
6066
6067 @seealso
6068 ha_rocksdb::convert_record_to_storage_format,
6069 ha_rocksdb::convert_record_from_storage_format
6070*/
6071
void ha_rocksdb::setup_field_converters() {
  uint i;
  uint null_bytes = 0;
  uchar cur_null_mask = 0x1;

  DBUG_ASSERT(m_encoder_arr == nullptr);
  m_encoder_arr = static_cast<Rdb_field_encoder *>(
      my_malloc(table->s->fields * sizeof(Rdb_field_encoder), MYF(0)));
  if (m_encoder_arr == nullptr) {
    /*
      NOTE(review): allocation failure leaves m_encoder_arr == nullptr and
      returns silently; callers appear to assume success — confirm how this
      is handled upstream.
    */
    return;
  }

  for (i = 0; i < table->s->fields; i++) {
    Field *const field = table->field[i];
    /* Default: the whole field value is stored in the record's value part. */
    m_encoder_arr[i].m_storage_type = Rdb_field_encoder::STORE_ALL;

    /*
      Check if this field is
      - a part of primary key, and
      - it can be decoded back from its key image.
      If both hold, we don't need to store this field in the value part of
      RocksDB's key-value pair.

      If hidden pk exists, we skip this check since the field will never be
      part of the hidden pk.
    */
    if (!has_hidden_pk(table)) {
      KEY *const pk_info = &table->key_info[table->s->primary_key];
      for (uint kp = 0; kp < pk_info->user_defined_key_parts; kp++) {
        /* key_part->fieldnr is counted from 1 */
        if (field->field_index + 1 == pk_info->key_part[kp].fieldnr) {
          get_storage_type(&m_encoder_arr[i], kp);
          break;
        }
      }
    }

    m_encoder_arr[i].m_field_type = field->real_type();
    m_encoder_arr[i].m_field_index = i;
    m_encoder_arr[i].m_pack_length_in_rec = field->pack_length_in_rec();

    /* Assign each nullable field one bit in the record's NULL-bits bytes. */
    if (field->real_maybe_null()) {
      m_encoder_arr[i].m_null_mask = cur_null_mask;
      m_encoder_arr[i].m_null_offset = null_bytes;
      if (cur_null_mask == 0x80) {
        cur_null_mask = 0x1;
        null_bytes++;
      } else
        cur_null_mask = cur_null_mask << 1;
    } else {
      m_encoder_arr[i].m_null_mask = 0;
    }
  }

  /* Count the last, unfinished NULL-bits byte */
  if (cur_null_mask != 0x1)
    null_bytes++;

  m_null_bytes_in_rec = null_bytes;
}
6132
6133int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg,
6134 const Rdb_tbl_def *const tbl_def_arg,
6135 bool alloc_alter_buffers) {
6136 DBUG_ENTER_FUNC();
6137
6138 DBUG_ASSERT(m_pk_tuple == nullptr);
6139 DBUG_ASSERT(tbl_def_arg != nullptr);
6140
6141 std::shared_ptr<Rdb_key_def> *const kd_arr = tbl_def_arg->m_key_descr_arr;
6142
6143 uint key_len = 0;
6144 uint max_packed_sk_len = 0;
6145 uint pack_key_len = 0;
6146
6147 m_pk_descr = kd_arr[pk_index(table_arg, tbl_def_arg)];
6148 if (has_hidden_pk(table_arg)) {
6149 m_pk_key_parts = 1;
6150 } else {
6151 m_pk_key_parts =
6152 table->key_info[table->s->primary_key].user_defined_key_parts;
6153 key_len = table->key_info[table->s->primary_key].key_length;
6154 }
6155
6156 // move this into get_table_handler() ??
6157 m_pk_descr->setup(table_arg, tbl_def_arg);
6158
6159 m_pk_tuple = reinterpret_cast<uchar *>(my_malloc(key_len, MYF(0)));
6160
6161 pack_key_len = m_pk_descr->max_storage_fmt_length();
6162 m_pk_packed_tuple =
6163 reinterpret_cast<uchar *>(my_malloc(pack_key_len, MYF(0)));
6164
6165 /* Sometimes, we may use m_sk_packed_tuple for storing packed PK */
6166 max_packed_sk_len = pack_key_len;
6167 for (uint i = 0; i < table_arg->s->keys; i++) {
6168 if (i == table_arg->s->primary_key) /* Primary key was processed above */
6169 continue;
6170
6171 // TODO: move this into get_table_handler() ??
6172 kd_arr[i]->setup(table_arg, tbl_def_arg);
6173
6174 const uint packed_len = kd_arr[i]->max_storage_fmt_length();
6175 if (packed_len > max_packed_sk_len) {
6176 max_packed_sk_len = packed_len;
6177 }
6178 }
6179
6180 m_sk_packed_tuple =
6181 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6182 m_sk_match_prefix_buf =
6183 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6184 m_sk_packed_tuple_old =
6185 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6186 m_end_key_packed_tuple =
6187 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6188 m_pack_buffer =
6189 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6190
6191 m_eq_cond_upper_bound =
6192 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6193 m_eq_cond_lower_bound =
6194 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6195
6196 /*
6197 If inplace alter is happening, allocate special buffers for unique
6198 secondary index duplicate checking.
6199 */
6200 if (alloc_alter_buffers) {
6201 m_dup_sk_packed_tuple =
6202 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6203 m_dup_sk_packed_tuple_old =
6204 reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0)));
6205 }
6206
6207 if (m_pk_tuple == nullptr || m_pk_packed_tuple == nullptr ||
6208 m_sk_packed_tuple == nullptr || m_sk_packed_tuple_old == nullptr ||
6209 m_end_key_packed_tuple == nullptr || m_pack_buffer == nullptr ||
6210 m_eq_cond_upper_bound == nullptr || m_eq_cond_lower_bound == nullptr ||
6211 (alloc_alter_buffers && (m_dup_sk_packed_tuple == nullptr ||
6212 m_dup_sk_packed_tuple_old == nullptr))) {
6213 // One or more of the above allocations failed. Clean up and exit
6214 free_key_buffers();
6215
6216 DBUG_RETURN(HA_ERR_OUT_OF_MEM);
6217 }
6218
6219 DBUG_RETURN(HA_EXIT_SUCCESS);
6220}
6221
6222void ha_rocksdb::free_key_buffers() {
6223 my_free(m_pk_tuple);
6224 m_pk_tuple = nullptr;
6225
6226 my_free(m_pk_packed_tuple);
6227 m_pk_packed_tuple = nullptr;
6228
6229 my_free(m_sk_packed_tuple);
6230 m_sk_packed_tuple = nullptr;
6231
6232 my_free(m_sk_match_prefix_buf);
6233 m_sk_match_prefix_buf = nullptr;
6234
6235 my_free(m_sk_packed_tuple_old);
6236 m_sk_packed_tuple_old = nullptr;
6237
6238 my_free(m_end_key_packed_tuple);
6239 m_end_key_packed_tuple = nullptr;
6240
6241 my_free(m_pack_buffer);
6242 m_pack_buffer = nullptr;
6243
6244 my_free(m_dup_sk_packed_tuple);
6245 m_dup_sk_packed_tuple = nullptr;
6246
6247 my_free(m_dup_sk_packed_tuple_old);
6248 m_dup_sk_packed_tuple_old = nullptr;
6249
6250 my_free(m_eq_cond_upper_bound);
6251 m_eq_cond_upper_bound = nullptr;
6252
6253 my_free(m_eq_cond_lower_bound);
6254 m_eq_cond_lower_bound = nullptr;
6255}
6256
6257#ifdef MARIAROCKS_NOT_YET
6258void ha_rocksdb::set_use_read_free_rpl(const char *const whitelist) {
6259 DBUG_ASSERT(whitelist != nullptr);
6260
6261#if defined(HAVE_PSI_INTERFACE)
6262 Regex_list_handler regex_handler(key_rwlock_read_free_rpl_tables);
6263#else
6264 Regex_list_handler regex_handler;
6265#endif
6266
6267 if (!regex_handler.set_patterns(whitelist)) {
6268 warn_about_bad_patterns(&regex_handler, "read_free_rpl_tables");
6269 }
6270
6271 m_use_read_free_rpl = regex_handler.matches(m_tbl_def->base_tablename());
6272}
6273#endif
6274
6275void ha_rocksdb::set_skip_unique_check_tables(const char *const whitelist) {
6276 DBUG_ASSERT(whitelist != nullptr);
6277
6278#if defined(HAVE_PSI_INTERFACE)
6279 Regex_list_handler regex_handler(key_rwlock_skip_unique_check_tables);
6280#else
6281 Regex_list_handler regex_handler;
6282#endif
6283
6284 if (!regex_handler.set_patterns(whitelist)) {
6285 warn_about_bad_patterns(&regex_handler, "skip_unique_check_tables");
6286 }
6287
6288 m_skip_unique_check = regex_handler.matches(m_tbl_def->base_tablename());
6289}
6290
6291/**
6292 @return
6293 HA_EXIT_SUCCESS OK
6294 other HA_ERR error code (can be SE-specific)
6295*/
6296int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) {
6297 DBUG_ENTER_FUNC();
6298
6299 int err = close();
6300 if (err) {
6301 DBUG_RETURN(err);
6302 }
6303
6304 m_table_handler = rdb_open_tables.get_table_handler(name);
6305
6306 if (m_table_handler == nullptr) {
6307 DBUG_RETURN(HA_ERR_OUT_OF_MEM);
6308 }
6309
6310 my_core::thr_lock_data_init(&m_table_handler->m_thr_lock, &m_db_lock,
6311 nullptr);
6312 m_io_perf.init(&m_table_handler->m_table_perf_context,
6313 &m_table_handler->m_io_perf_read,
6314 &m_table_handler->m_io_perf_write, &stats);
6315 Rdb_perf_context_guard guard(&m_io_perf,
6316 rocksdb_perf_context_level(ha_thd()));
6317
6318 std::string fullname;
6319 err = rdb_normalize_tablename(name, &fullname);
6320 if (err != HA_EXIT_SUCCESS) {
6321 DBUG_RETURN(err);
6322 }
6323
6324 m_tbl_def = ddl_manager.find(fullname);
6325 if (m_tbl_def == nullptr) {
6326 my_error(ER_INTERNAL_ERROR, MYF(0),
6327 "Attempt to open a table that is not present in RocksDB-SE data "
6328 "dictionary");
6329 DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE);
6330 }
6331 m_lock_rows = RDB_LOCK_NONE;
6332
6333 m_key_descr_arr = m_tbl_def->m_key_descr_arr;
6334
6335 /*
6336 Full table scan actually uses primary key
6337 (UPDATE needs to know this, otherwise it will go into infinite loop on
6338 queries like "UPDATE tbl SET pk=pk+100")
6339 */
6340 key_used_on_scan = table->s->primary_key;
6341
6342 // close() above has already called free_key_buffers(). No need to do it here.
6343 err = alloc_key_buffers(table, m_tbl_def);
6344
6345 if (err) {
6346 DBUG_RETURN(err);
6347 }
6348
6349 /*
6350 init_with_fields() is used to initialize table flags based on the field
6351 definitions in table->field[].
6352 It is called by open_binary_frm(), but that function calls the method for
6353 a temporary ha_rocksdb object which is later destroyed.
6354
6355 If we are here in ::open(), then init_with_fields() has not been called
6356 for this object. Call it ourselves, we want all member variables to be
6357 properly initialized.
6358 */
6359 init_with_fields();
6360
6361 setup_field_converters();
6362
6363 /*
6364 MariaDB: adjust field->part_of_key for PK columns. We can only do it here
6365 because SE API is just relying on the HA_PRIMARY_KEY_IN_READ_INDEX which
6366 does not allow to distinguish between unpack'able and non-unpack'able
6367 columns.
6368 Upstream uses handler->init_with_fields() but we don't have that call.
6369 */
6370 {
6371 if (!has_hidden_pk(table)) {
6372 KEY *const pk_info = &table->key_info[table->s->primary_key];
6373 for (uint kp = 0; kp < pk_info->user_defined_key_parts; kp++) {
6374 if (!m_pk_descr->can_unpack(kp)) {
6375 //
6376 uint field_index= pk_info->key_part[kp].field->field_index;
6377 table->field[field_index]->part_of_key.clear_all();
6378 table->field[field_index]->part_of_key.set_bit(table->s->primary_key);
6379 }
6380 }
6381 }
6382
6383 for (uint key= 0; key < table->s->keys; key++) {
6384 KEY *const key_info = &table->key_info[key];
6385 if (key == table->s->primary_key)
6386 continue;
6387 for (uint kp = 0; kp < key_info->usable_key_parts; kp++) {
6388 uint field_index= key_info->key_part[kp].field->field_index;
6389 if (m_key_descr_arr[key]->can_unpack(kp)) {
6390 table->field[field_index]->part_of_key.set_bit(key);
6391 } else {
6392 table->field[field_index]->part_of_key.clear_bit(key);
6393 }
6394 }
6395 }
6396 }
6397
6398 info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);
6399
6400 /*
6401 The following load_XXX code calls row decode functions, and they do
6402 that without having done ::external_lock() or index_init()/rnd_init().
6403 (Note: this also means we're doing a read when there was no
6404 setup_field_converters() call)
6405
6406 Initialize the necessary variables for them:
6407 */
6408 m_verify_row_debug_checksums = false;
6409
6410 /* Load auto_increment value only once on first use. */
6411 if (table->found_next_number_field && m_tbl_def->m_auto_incr_val == 0) {
6412 load_auto_incr_value();
6413 }
6414
6415 /* Load hidden pk only once on first use. */
6416 if (has_hidden_pk(table) && m_tbl_def->m_hidden_pk_val == 0 &&
6417 (err = load_hidden_pk_value()) != HA_EXIT_SUCCESS) {
6418 free_key_buffers();
6419 DBUG_RETURN(err);
6420 }
6421
6422 /* Index block size in MyRocks: used by MySQL in query optimization */
6423 stats.block_size = rocksdb_tbl_options->block_size;
6424
6425#ifdef MARIAROCKS_NOT_YET // MDEV-10976
6426 /* Determine at open whether we can use Read Free Replication or not */
6427 set_use_read_free_rpl(THDVAR(ha_thd(), read_free_rpl_tables));
6428#endif
6429
6430 /* Determine at open whether we should skip unique checks for this table */
6431 set_skip_unique_check_tables(THDVAR(ha_thd(), skip_unique_check_tables));
6432
6433 DBUG_RETURN(HA_EXIT_SUCCESS);
6434}
6435
/**
  Close the handler: drop references to shared metadata and release all
  per-handler buffers allocated in open()/setup_field_converters().

  @return HA_EXIT_SUCCESS  always
*/
int ha_rocksdb::close(void) {
  DBUG_ENTER_FUNC();

  // Key descriptors are owned by the Rdb_tbl_def (shared via ddl_manager);
  // this handler only drops its references here.
  m_pk_descr = nullptr;
  m_key_descr_arr = nullptr;

  free_key_buffers();

  // Release the field-encoder array; my_free() tolerates nullptr so this is
  // safe even if setup_field_converters() was never reached.
  my_free(m_encoder_arr);
  m_encoder_arr = nullptr;

  if (m_table_handler != nullptr) {
    // Drop our reference on the shared table handler (refcounted by
    // rdb_open_tables; presumably freed when the count reaches zero).
    rdb_open_tables.release_table_handler(m_table_handler);
    m_table_handler = nullptr;
  }

  // These are needed to suppress valgrind errors in rocksdb.partition
  m_storage_record.free();
  m_last_rowkey.free();
  m_sk_tails.free();
  m_sk_tails_old.free();
  m_pk_unpack_info.free();

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
6461
/*
  Error message table for the MyRocks-specific handler error codes.
  Indexed by (error - HA_ERR_ROCKSDB_FIRST); the entry order therefore MUST
  match the declaration order of the HA_ERR_ROCKSDB_* codes (enforced by the
  static_assert below).
*/
static const char *rdb_error_messages[] = {
    "Table must have a PRIMARY KEY.",
    "Specifying DATA DIRECTORY for an individual table is not supported.",
    "Specifying INDEX DIRECTORY for an individual table is not supported.",
    "RocksDB commit failed.",
    "Failure during bulk load operation.",
    "Found data corruption.",
    "CRC checksum mismatch.",
    "Invalid table.",
    "Could not access RocksDB properties.",
    "File I/O error during merge/sort operation.",
    // The following entries mirror rocksdb::Status codes (see
    // rdb_error_to_mysql() for the mapping).
    "RocksDB status: not found.",
    "RocksDB status: corruption.",
    "RocksDB status: invalid argument.",
    "RocksDB status: io error.",
    "RocksDB status: no space.",
    "RocksDB status: merge in progress.",
    "RocksDB status: incomplete.",
    "RocksDB status: shutdown in progress.",
    "RocksDB status: timed out.",
    "RocksDB status: aborted.",
    "RocksDB status: lock limit reached.",
    "RocksDB status: busy.",
    "RocksDB status: deadlock.",
    "RocksDB status: expired.",
    "RocksDB status: try again.",
};

// Keep the message table and the error-code range in lockstep.
static_assert((sizeof(rdb_error_messages) / sizeof(rdb_error_messages[0])) ==
                  ((HA_ERR_ROCKSDB_LAST - HA_ERR_ROCKSDB_FIRST) + 1),
              "Number of error messages doesn't match number of error codes");
6493
// psergey-merge: probably not needed in MariaDB, since
// rdb_get_error_messages() below serves the same purpose.
6496#if 0
6497static const char *rdb_get_error_message(int nr) {
6498 return rdb_error_messages[nr - HA_ERR_ROCKSDB_FIRST];
6499}
6500#endif
6501
// Returns the whole SE-specific error message table; 'nr' is unused here
// (presumably the caller indexes the returned array itself -- the signature
// appears dictated by a handlerton callback, TODO confirm).
static const char **rdb_get_error_messages(int nr) { return rdb_error_messages; }
6503
6504bool ha_rocksdb::get_error_message(const int error, String *const buf) {
6505 DBUG_ENTER_FUNC();
6506
6507 static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST,
6508 "HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST");
6509 static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_LAST,
6510 "HA_ERR_ROCKSDB_LAST > HA_ERR_LAST");
6511
6512 DBUG_ASSERT(buf != nullptr);
6513
6514 if (error == HA_ERR_LOCK_WAIT_TIMEOUT || error == HA_ERR_LOCK_DEADLOCK) {
6515 Rdb_transaction *const tx = get_tx_from_thd(ha_thd());
6516 DBUG_ASSERT(tx != nullptr);
6517 buf->append(tx->m_detailed_error);
6518 DBUG_RETURN(true);
6519 }
6520
6521 if (error >= HA_ERR_ROCKSDB_FIRST && error <= HA_ERR_ROCKSDB_LAST) {
6522 buf->append(rdb_error_messages[error - HA_ERR_ROCKSDB_FIRST]);
6523 }
6524
6525 // We can be called with the values which are < HA_ERR_FIRST because most
6526 // MySQL internal functions will just return HA_EXIT_FAILURE in case of
6527 // an error.
6528
6529 DBUG_RETURN(false);
6530}
6531
6532/*
6533 Generalized way to convert RocksDB status errors into MySQL error code, and
6534 print error message.
6535
6536 Each error code below maps to a RocksDB status code found in:
6537 rocksdb/include/rocksdb/status.h
6538*/
/**
  Map a failed rocksdb::Status to a MyRocks handler error code and raise a
  MySQL error describing it.

  @param s        the (non-OK) RocksDB status to translate
  @param opt_msg  optional context message included in the raised error
  @return the HA_ERR_ROCKSDB_STATUS_* code, or -1 for an unrecognized
          status code (debug builds assert first)
*/
int ha_rocksdb::rdb_error_to_mysql(const rocksdb::Status &s,
                                   const char *opt_msg) {
  // Callers must only invoke this for failures; kOk below is kept only for
  // switch completeness.
  DBUG_ASSERT(!s.ok());

  int err;
  switch (s.code()) {
  case rocksdb::Status::Code::kOk:
    err = HA_EXIT_SUCCESS;
    break;
  case rocksdb::Status::Code::kNotFound:
    err = HA_ERR_ROCKSDB_STATUS_NOT_FOUND;
    break;
  case rocksdb::Status::Code::kCorruption:
    err = HA_ERR_ROCKSDB_STATUS_CORRUPTION;
    break;
  case rocksdb::Status::Code::kNotSupported:
    err = HA_ERR_ROCKSDB_STATUS_NOT_SUPPORTED;
    break;
  case rocksdb::Status::Code::kInvalidArgument:
    err = HA_ERR_ROCKSDB_STATUS_INVALID_ARGUMENT;
    break;
  case rocksdb::Status::Code::kIOError:
    // Disk-full is distinguished via the status subcode.
    err = (s.IsNoSpace()) ? HA_ERR_ROCKSDB_STATUS_NO_SPACE
                          : HA_ERR_ROCKSDB_STATUS_IO_ERROR;
    break;
  case rocksdb::Status::Code::kMergeInProgress:
    err = HA_ERR_ROCKSDB_STATUS_MERGE_IN_PROGRESS;
    break;
  case rocksdb::Status::Code::kIncomplete:
    err = HA_ERR_ROCKSDB_STATUS_INCOMPLETE;
    break;
  case rocksdb::Status::Code::kShutdownInProgress:
    err = HA_ERR_ROCKSDB_STATUS_SHUTDOWN_IN_PROGRESS;
    break;
  case rocksdb::Status::Code::kTimedOut:
    err = HA_ERR_ROCKSDB_STATUS_TIMED_OUT;
    break;
  case rocksdb::Status::Code::kAborted:
    // Hitting the transaction lock limit is reported as an Aborted subcode.
    err = (s.IsLockLimit()) ? HA_ERR_ROCKSDB_STATUS_LOCK_LIMIT
                            : HA_ERR_ROCKSDB_STATUS_ABORTED;
    break;
  case rocksdb::Status::Code::kBusy:
    // Deadlock is reported as a Busy subcode.
    err = (s.IsDeadlock()) ? HA_ERR_ROCKSDB_STATUS_DEADLOCK
                           : HA_ERR_ROCKSDB_STATUS_BUSY;
    break;
  case rocksdb::Status::Code::kExpired:
    err = HA_ERR_ROCKSDB_STATUS_EXPIRED;
    break;
  case rocksdb::Status::Code::kTryAgain:
    err = HA_ERR_ROCKSDB_STATUS_TRY_AGAIN;
    break;
  default:
    // New/unknown RocksDB status code: fail loudly in debug builds,
    // return a generic failure in release builds.
    DBUG_ASSERT(0);
    return -1;
  }

  // Surface the RocksDB status text (and optional context) to the client.
  if (opt_msg) {
    my_error(ER_RDB_STATUS_MSG, MYF(0), opt_msg, s.code(),
             s.ToString().c_str());
  } else {
    my_error(ER_RDB_STATUS_GENERAL, MYF(0), s.code(), s.ToString().c_str());
  }

  return err;
}
6604
/* MyRocks supports only the following collations for indexed columns */
// These are binary collations, whose sort order presumably matches the
// byte-wise (memcmp) ordering RocksDB keys require -- other collations go
// through rdb_is_collation_supported() instead.
static const std::set<uint> RDB_INDEX_COLLATIONS = {
    COLLATION_BINARY, COLLATION_UTF8_BIN, COLLATION_LATIN1_BIN};
6608
6609static bool
6610rdb_is_index_collation_supported(const my_core::Field *const field) {
6611 const my_core::enum_field_types type = field->real_type();
6612 /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */
6613 if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING ||
6614 type == MYSQL_TYPE_BLOB) {
6615
6616 return (RDB_INDEX_COLLATIONS.find(field->charset()->number) !=
6617 RDB_INDEX_COLLATIONS.end()) ||
6618 rdb_is_collation_supported(field->charset());
6619 }
6620 return true;
6621}
6622
6623
6624static bool
6625rdb_field_uses_nopad_collation(const my_core::Field *const field) {
6626 const my_core::enum_field_types type = field->real_type();
6627 /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */
6628 if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING ||
6629 type == MYSQL_TYPE_BLOB) {
6630
6631 /*
6632 This is technically a NOPAD collation but it's a binary collation
6633 that we can handle.
6634 */
6635 if (RDB_INDEX_COLLATIONS.find(field->charset()->number) !=
6636 RDB_INDEX_COLLATIONS.end())
6637 return false;
6638
6639 return (field->charset()->state & MY_CS_NOPAD);
6640 }
6641 return false;
6642}
6643
6644
6645/*
6646 Create structures needed for storing data in rocksdb. This is called when the
6647 table is created. The structures will be shared by all TABLE* objects.
6648
6649 @param
6650 table_arg Table with definition
6651 db_table "dbname.tablename"
6652 len strlen of the above
6653 tbl_def_arg tbl_def whose key_descr is being created/populated
6654 old_tbl_def_arg tbl_def from which keys are being copied over from
6655 (for use during inplace alter)
6656
6657 @return
6658 0 - Ok
6659 other - error, either given table ddl is not supported by rocksdb or OOM.
6660*/
6661int ha_rocksdb::create_key_defs(
6662 const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
6663 const TABLE *const old_table_arg /* = nullptr */,
6664 const Rdb_tbl_def *const old_tbl_def_arg
6665 /* = nullptr */) const {
6666 DBUG_ENTER_FUNC();
6667
6668 DBUG_ASSERT(table_arg != nullptr);
6669 DBUG_ASSERT(table_arg->s != nullptr);
6670
6671 uint i;
6672
6673 /*
6674 These need to be one greater than MAX_INDEXES since the user can create
6675 MAX_INDEXES secondary keys and no primary key which would cause us
6676 to generate a hidden one.
6677 */
6678 std::array<key_def_cf_info, MAX_INDEXES + 1> cfs;
6679
6680 /*
6681 NOTE: All new column families must be created before new index numbers are
6682 allocated to each key definition. See below for more details.
6683 http://github.com/MySQLOnRocksDB/mysql-5.6/issues/86#issuecomment-138515501
6684 */
6685 if (create_cfs(table_arg, tbl_def_arg, &cfs)) {
6686 DBUG_RETURN(HA_EXIT_FAILURE);
6687 }
6688
6689 if (!old_tbl_def_arg) {
6690 /*
6691 old_tbl_def doesn't exist. this means we are in the process of creating
6692 a new table.
6693
6694 Get the index numbers (this will update the next_index_number)
6695 and create Rdb_key_def structures.
6696 */
6697 for (i = 0; i < tbl_def_arg->m_key_count; i++) {
6698 if (create_key_def(table_arg, i, tbl_def_arg, &m_key_descr_arr[i],
6699 cfs[i])) {
6700 DBUG_RETURN(HA_EXIT_FAILURE);
6701 }
6702 }
6703 } else {
6704 /*
6705 old_tbl_def exists. This means we are creating a new tbl_def as part of
6706 in-place alter table. Copy over existing keys from the old_tbl_def and
6707 generate the necessary new key definitions if any.
6708 */
6709 if (create_inplace_key_defs(table_arg, tbl_def_arg, old_table_arg,
6710 old_tbl_def_arg, cfs)) {
6711 DBUG_RETURN(HA_EXIT_FAILURE);
6712 }
6713 }
6714
6715 DBUG_RETURN(HA_EXIT_SUCCESS);
6716}
6717
6718/*
6719 Checks index parameters and creates column families needed for storing data
6720 in rocksdb if necessary.
6721
6722 @param in
6723 table_arg Table with definition
6724 db_table Table name
6725 tbl_def_arg Table def structure being populated
6726
6727 @param out
6728 cfs CF info for each key definition in 'key_info' order
6729
6730 @return
6731 0 - Ok
6732 other - error
6733*/
int ha_rocksdb::create_cfs(
    const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
    std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs) const {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table_arg != nullptr);
  DBUG_ASSERT(table_arg->s != nullptr);
  DBUG_ASSERT(tbl_def_arg != nullptr);

  // User-visible table name, converted lazily from the on-disk filename
  // only if at least one real (non-hidden-PK, non-tmp) key is checked.
  char tablename_sys[NAME_LEN + 1];
  bool tsys_set= false;

  /*
    The first loop checks the index parameters and creates
    column families if necessary.
  */
  for (uint i = 0; i < tbl_def_arg->m_key_count; i++) {
    rocksdb::ColumnFamilyHandle *cf_handle;

    // Collation checks apply only to user-defined keys on regular tables:
    // skip the hidden PK and internal #sql temporary tables.
    if (!is_hidden_pk(i, table_arg, tbl_def_arg) &&
        tbl_def_arg->base_tablename().find(tmp_file_prefix) != 0) {
      if (!tsys_set)
      {
        tsys_set= true;
        my_core::filename_to_tablename(tbl_def_arg->base_tablename().c_str(),
                                       tablename_sys, sizeof(tablename_sys));
      }

      for (uint part = 0; part < table_arg->key_info[i].ext_key_parts;
           part++)
      {
        /* MariaDB: disallow NOPAD collations */
        if (rdb_field_uses_nopad_collation(
              table_arg->key_info[i].key_part[part].field))
        {
          my_error(ER_MYROCKS_CANT_NOPAD_COLLATION, MYF(0));
          DBUG_RETURN(HA_EXIT_FAILURE);
        }

        // Unsupported collations produce a warning (not an error) unless
        // the table is in the configured exception list.
        if (rocksdb_strict_collation_check &&
            !rdb_is_index_collation_supported(
                table_arg->key_info[i].key_part[part].field) &&
            !rdb_collation_exceptions->matches(tablename_sys)) {

          char buf[1024];
          my_snprintf(buf, sizeof(buf),
                      "Indexed column %s.%s uses a collation that does not "
                      "allow index-only access in secondary key and has "
                      "reduced disk space efficiency in primary key.",
                      tbl_def_arg->full_tablename().c_str(),
                      table_arg->key_info[i].key_part[part].field->field_name.str);

          my_error(ER_INTERNAL_ERROR, MYF(ME_JUST_WARNING), buf);
        }
      }
    }

    // Internal consistency check to make sure that data in TABLE and
    // Rdb_tbl_def structures matches. Either both are missing or both are
    // specified. Yes, this is critical enough to make it into SHIP_ASSERT.
    SHIP_ASSERT(IF_PARTITIONING(!table_arg->part_info,true) == tbl_def_arg->base_partition().empty());

    // Generate the name for the column family to use.
    bool per_part_match_found = false;
    std::string cf_name = generate_cf_name(i, table_arg, tbl_def_arg,
                                           &per_part_match_found);

    // Prevent create from using the system column family.
    if (cf_name == DEFAULT_SYSTEM_CF_NAME) {
      my_error(ER_WRONG_ARGUMENTS, MYF(0),
               "column family not valid for storing index data.");
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    // Here's how `get_or_create_cf` will use the input parameters:
    //
    // `cf_name` - will be used as a CF name.
    cf_handle = cf_manager.get_or_create_cf(rdb, cf_name);

    if (!cf_handle) {
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    // Record the CF choice for this key so create_key_def() can use it.
    auto &cf = (*cfs)[i];

    cf.cf_handle = cf_handle;
    cf.is_reverse_cf = Rdb_cf_manager::is_cf_name_reverse(cf_name.c_str());
    cf.is_per_partition_cf = per_part_match_found;
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
6826
6827/*
6828 Create key definition needed for storing data in rocksdb during ADD index
6829 inplace operations.
6830
6831 @param in
6832 table_arg Table with definition
6833 tbl_def_arg New table def structure being populated
6834 old_tbl_def_arg Old(current) table def structure
6835 cfs Struct array which contains column family information
6836
6837 @return
6838 0 - Ok
6839 other - error, either given table ddl is not supported by rocksdb or OOM.
6840*/
int ha_rocksdb::create_inplace_key_defs(
    const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg, const Rdb_tbl_def *const old_tbl_def_arg,
    const std::array<key_def_cf_info, MAX_INDEXES + 1> &cfs) const {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table_arg != nullptr);
  DBUG_ASSERT(tbl_def_arg != nullptr);
  DBUG_ASSERT(old_tbl_def_arg != nullptr);

  std::shared_ptr<Rdb_key_def> *const old_key_descr =
      old_tbl_def_arg->m_key_descr_arr;
  std::shared_ptr<Rdb_key_def> *const new_key_descr =
      tbl_def_arg->m_key_descr_arr;
  // Map key-name -> position in the old definition, restricted to keys that
  // are identical in old and new tables (see get_old_key_positions()).
  const std::unordered_map<std::string, uint> old_key_pos =
      get_old_key_positions(table_arg, tbl_def_arg, old_table_arg,
                            old_tbl_def_arg);

  uint i;
  for (i = 0; i < tbl_def_arg->m_key_count; i++) {
    const auto &it = old_key_pos.find(get_key_name(i, table_arg, tbl_def_arg));

    if (it != old_key_pos.end()) {
      /*
        Found matching index in old table definition, so copy it over to the
        new one created.
      */
      const Rdb_key_def &okd = *old_key_descr[it->second];

      const GL_INDEX_ID gl_index_id = okd.get_gl_index_id();
      struct Rdb_index_info index_info;
      // The authoritative index metadata lives in the data dictionary;
      // failure to find it indicates dictionary corruption.
      if (!dict_manager.get_index_info(gl_index_id, &index_info)) {
        // NO_LINT_DEBUG
        sql_print_error("RocksDB: Could not get index information "
                        "for Index Number (%u,%u), table %s",
                        gl_index_id.cf_id, gl_index_id.index_id,
                        old_tbl_def_arg->full_tablename().c_str());
        DBUG_RETURN(HA_EXIT_FAILURE);
      }

      // Recompute where the TTL timestamp sits in the record, if the index
      // has the TTL flag set.
      uint32 ttl_rec_offset =
          Rdb_key_def::has_index_flag(index_info.m_index_flags,
                                      Rdb_key_def::TTL_FLAG)
              ? Rdb_key_def::calculate_index_flag_offset(
                    index_info.m_index_flags, Rdb_key_def::TTL_FLAG)
              : UINT_MAX;

      /*
        We can't use the copy constructor because we need to update the
        keynr within the pack_info for each field and the keyno of the keydef
        itself.
      */
      new_key_descr[i] = std::make_shared<Rdb_key_def>(
          okd.get_index_number(), i, okd.get_cf(),
          index_info.m_index_dict_version, index_info.m_index_type,
          index_info.m_kv_version, okd.m_is_reverse_cf,
          okd.m_is_per_partition_cf, okd.m_name.c_str(),
          dict_manager.get_stats(gl_index_id), index_info.m_index_flags,
          ttl_rec_offset, index_info.m_ttl_duration);
    } else if (create_key_def(table_arg, i, tbl_def_arg, &new_key_descr[i],
                              cfs[i])) {
      // No reusable old key: build a brand-new key definition.
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    DBUG_ASSERT(new_key_descr[i] != nullptr);
    new_key_descr[i]->setup(table_arg, tbl_def_arg);
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
6911
/**
  Build a map from key name to its position in the OLD table definition,
  containing only keys that can be carried over unchanged during an
  in-place ALTER (same name, flags, algorithm, comment and key parts).
*/
std::unordered_map<std::string, uint> ha_rocksdb::get_old_key_positions(
    const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg,
    const Rdb_tbl_def *const old_tbl_def_arg) const {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table_arg != nullptr);
  DBUG_ASSERT(old_table_arg != nullptr);
  DBUG_ASSERT(tbl_def_arg != nullptr);
  DBUG_ASSERT(old_tbl_def_arg != nullptr);

  std::shared_ptr<Rdb_key_def> *const old_key_descr =
      old_tbl_def_arg->m_key_descr_arr;
  std::unordered_map<std::string, uint> old_key_pos;
  std::unordered_map<std::string, uint> new_key_pos;
  uint i;

  // Index the new table's keys by name for quick lookup below.
  for (i = 0; i < tbl_def_arg->m_key_count; i++) {
    new_key_pos[get_key_name(i, table_arg, tbl_def_arg)] = i;
  }

  for (i = 0; i < old_tbl_def_arg->m_key_count; i++) {
    // The hidden PK has no user-visible definition to compare, so it is
    // always carried over as-is.
    if (is_hidden_pk(i, old_table_arg, old_tbl_def_arg)) {
      old_key_pos[old_key_descr[i]->m_name] = i;
      continue;
    }

    /*
      In case of matching key name, need to check key parts of keys as well,
      in case a simultaneous drop + add is performed, where the key name is the
      same but the key parts are different.

      Example:
      CREATE TABLE t1 (a INT, b INT, KEY ka(a)) ENGINE=RocksDB;
      ALTER TABLE t1 DROP INDEX ka, ADD INDEX ka(b), ALGORITHM=INPLACE;
    */
    const KEY *const old_key = &old_table_arg->key_info[i];
    const auto &it = new_key_pos.find(old_key->name.str);
    if (it == new_key_pos.end()) {
      continue;
    }

    KEY *const new_key = &table_arg->key_info[it->second];

    /*
      Check that the key is identical between old and new tables.
      If not, we still need to create a new index.

      The exception is if there is an index changed from unique to non-unique,
      in these cases we don't need to rebuild as they are stored the same way in
      RocksDB.
    */
    bool unique_to_non_unique =
        ((old_key->flags ^ new_key->flags) == HA_NOSAME) &&
        (old_key->flags & HA_NOSAME);

    if (compare_keys(old_key, new_key) && !unique_to_non_unique) {
      continue;
    }

    /* Check to make sure key parts match. */
    if (compare_key_parts(old_key, new_key)) {
      continue;
    }

    // Keys match: record the old position under the shared name.
    old_key_pos[old_key->name.str] = i;
  }

  DBUG_RETURN(old_key_pos);
}
6982
6983/* Check to see if two keys are identical. */
6984int ha_rocksdb::compare_keys(const KEY *const old_key,
6985 const KEY *const new_key) const {
6986 DBUG_ENTER_FUNC();
6987
6988 DBUG_ASSERT(old_key != nullptr);
6989 DBUG_ASSERT(new_key != nullptr);
6990
6991 /* Check index name. */
6992 if (strcmp(old_key->name.str, new_key->name.str) != 0) {
6993 DBUG_RETURN(HA_EXIT_FAILURE);
6994 }
6995
6996 /* If index algorithms are different then keys are different. */
6997 if (old_key->algorithm != new_key->algorithm) {
6998 DBUG_RETURN(HA_EXIT_FAILURE);
6999 }
7000
7001 /* Check that the key is identical between old and new tables. */
7002 if ((old_key->flags ^ new_key->flags) & HA_KEYFLAG_MASK) {
7003 DBUG_RETURN(HA_EXIT_FAILURE);
7004 }
7005
7006 /* Check index comment. (for column family changes) */
7007 std::string old_comment(old_key->comment.str, old_key->comment.length);
7008 std::string new_comment(new_key->comment.str, new_key->comment.length);
7009 if (old_comment.compare(new_comment) != 0) {
7010 DBUG_RETURN(HA_EXIT_FAILURE);
7011 }
7012
7013 DBUG_RETURN(HA_EXIT_SUCCESS);
7014}
7015
7016/* Check two keys to ensure that key parts within keys match */
7017int ha_rocksdb::compare_key_parts(const KEY *const old_key,
7018 const KEY *const new_key) const {
7019 DBUG_ENTER_FUNC();
7020
7021 DBUG_ASSERT(old_key != nullptr);
7022 DBUG_ASSERT(new_key != nullptr);
7023
7024 /* Skip if key parts do not match, as it is a different key */
7025 if (new_key->user_defined_key_parts != old_key->user_defined_key_parts) {
7026 DBUG_RETURN(HA_EXIT_FAILURE);
7027 }
7028
7029 /* Check to see that key parts themselves match */
7030 for (uint i = 0; i < old_key->user_defined_key_parts; i++) {
7031 if (strcmp(old_key->key_part[i].field->field_name.str,
7032 new_key->key_part[i].field->field_name.str) != 0) {
7033 DBUG_RETURN(HA_EXIT_FAILURE);
7034 }
7035
7036 /* Check if prefix index key part length has changed */
7037 if (old_key->key_part[i].length != new_key->key_part[i].length) {
7038 DBUG_RETURN(HA_EXIT_FAILURE);
7039 }
7040 }
7041
7042 DBUG_RETURN(HA_EXIT_SUCCESS);
7043}
7044
7045/*
7046 Create key definition needed for storing data in rocksdb.
7047 This can be called either during CREATE table or doing ADD index operations.
7048
7049 @param in
7050 table_arg Table with definition
7051 i Position of index being created inside table_arg->key_info
7052 tbl_def_arg Table def structure being populated
7053 cf_info Struct which contains column family information
7054
7055 @param out
7056 new_key_def Newly created index definition.
7057
7058 @return
7059 0 - Ok
7060 other - error, either given table ddl is not supported by rocksdb or OOM.
7061*/
7062int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint &i,
7063 const Rdb_tbl_def *const tbl_def_arg,
7064 std::shared_ptr<Rdb_key_def> *const new_key_def,
7065 const struct key_def_cf_info &cf_info) const {
7066 DBUG_ENTER_FUNC();
7067
7068 DBUG_ASSERT(new_key_def != nullptr);
7069 DBUG_ASSERT(*new_key_def == nullptr);
7070
7071 uint64 ttl_duration = 0;
7072 std::string ttl_column;
7073 uint ttl_field_offset;
7074
7075 uint err;
7076 if ((err = Rdb_key_def::extract_ttl_duration(table_arg, tbl_def_arg,
7077 &ttl_duration))) {
7078 DBUG_RETURN(err);
7079 }
7080
7081 if ((err = Rdb_key_def::extract_ttl_col(table_arg, tbl_def_arg, &ttl_column,
7082 &ttl_field_offset))) {
7083 DBUG_RETURN(err);
7084 }
7085
7086 /* We don't currently support TTL on tables with hidden primary keys. */
7087 if (ttl_duration > 0 && is_hidden_pk(i, table_arg, tbl_def_arg)) {
7088 my_error(ER_RDB_TTL_UNSUPPORTED, MYF(0));
7089 DBUG_RETURN(HA_EXIT_FAILURE);
7090 }
7091
7092 /*
7093 If TTL duration is not specified but TTL column was specified, throw an
7094 error because TTL column requires duration.
7095 */
7096 if (ttl_duration == 0 && !ttl_column.empty()) {
7097 my_error(ER_RDB_TTL_COL_FORMAT, MYF(0), ttl_column.c_str());
7098 DBUG_RETURN(HA_EXIT_FAILURE);
7099 }
7100
7101 const uint index_id = ddl_manager.get_and_update_next_number(&dict_manager);
7102 const uint16_t index_dict_version = Rdb_key_def::INDEX_INFO_VERSION_LATEST;
7103 uchar index_type;
7104 uint16_t kv_version;
7105
7106 if (is_hidden_pk(i, table_arg, tbl_def_arg)) {
7107 index_type = Rdb_key_def::INDEX_TYPE_HIDDEN_PRIMARY;
7108 kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
7109 } else if (i == table_arg->s->primary_key) {
7110 index_type = Rdb_key_def::INDEX_TYPE_PRIMARY;
7111 uint16 pk_latest_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST;
7112 kv_version = pk_latest_version;
7113 } else {
7114 index_type = Rdb_key_def::INDEX_TYPE_SECONDARY;
7115 uint16 sk_latest_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_LATEST;
7116 kv_version = sk_latest_version;
7117 }
7118
7119 // Use PRIMARY_FORMAT_VERSION_UPDATE1 here since it is the same value as
7120 // SECONDARY_FORMAT_VERSION_UPDATE1 so it doesn't matter if this is a
7121 // primary key or secondary key.
7122 DBUG_EXECUTE_IF("MYROCKS_LEGACY_VARBINARY_FORMAT", {
7123 kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_UPDATE1;
7124 });
7125
7126 DBUG_EXECUTE_IF("MYROCKS_NO_COVERED_BITMAP_FORMAT", {
7127 if (index_type == Rdb_key_def::INDEX_TYPE_SECONDARY) {
7128 kv_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_UPDATE2;
7129 }
7130 });
7131
7132 uint32 index_flags = (ttl_duration > 0 ? Rdb_key_def::TTL_FLAG : 0);
7133
7134 uint32 ttl_rec_offset =
7135 Rdb_key_def::has_index_flag(index_flags, Rdb_key_def::TTL_FLAG)
7136 ? Rdb_key_def::calculate_index_flag_offset(index_flags,
7137 Rdb_key_def::TTL_FLAG)
7138 : UINT_MAX;
7139
7140 const char *const key_name = get_key_name(i, table_arg, m_tbl_def);
7141 *new_key_def = std::make_shared<Rdb_key_def>(
7142 index_id, i, cf_info.cf_handle, index_dict_version, index_type,
7143 kv_version, cf_info.is_reverse_cf, cf_info.is_per_partition_cf, key_name,
7144 Rdb_index_stats(), index_flags, ttl_rec_offset, ttl_duration);
7145
7146 if (!ttl_column.empty()) {
7147 (*new_key_def)->m_ttl_column = ttl_column;
7148 }
7149
7150 DBUG_RETURN(HA_EXIT_SUCCESS);
7151}
7152
7153int rdb_normalize_tablename(const std::string &tablename,
7154 std::string *const strbuf) {
7155 DBUG_ASSERT(strbuf != nullptr);
7156
7157 if (tablename.size() < 2 || tablename[0] != '.' ||
7158 (tablename[1] != FN_LIBCHAR && tablename[1] != FN_LIBCHAR2)) {
7159 DBUG_ASSERT(0); // We were not passed table name?
7160 return HA_ERR_ROCKSDB_INVALID_TABLE;
7161 }
7162
7163 size_t pos = tablename.find_first_of(FN_LIBCHAR, 2);
7164 if (pos == std::string::npos) {
7165 pos = tablename.find_first_of(FN_LIBCHAR2, 2);
7166 }
7167
7168 if (pos == std::string::npos) {
7169 DBUG_ASSERT(0); // We were not passed table name?
7170 return HA_ERR_ROCKSDB_INVALID_TABLE;
7171 }
7172
7173 *strbuf = tablename.substr(2, pos - 2) + "." + tablename.substr(pos + 1);
7174
7175 return HA_EXIT_SUCCESS;
7176}
7177
7178/*
7179 Check to see if the user's original statement includes foreign key
7180 references
7181*/
/**
  Scan the raw query text of the current statement for a
  "FOREIGN KEY [<identifier>] (" clause.

  @param thd  connection whose current statement text is scanned
  @return true if the statement appears to contain a foreign key clause
*/
bool ha_rocksdb::contains_foreign_key(THD *const thd) {
  bool success;
  const char *str = thd_query_string(thd)->str;

  DBUG_ASSERT(str != nullptr);

  while (*str != '\0') {
    // Scan from our current pos looking for 'FOREIGN'
    str = rdb_find_in_string(str, "FOREIGN", &success);
    if (!success) {
      return false;
    }

    // Skip past the found "FOREIGN"
    str = rdb_check_next_token(&my_charset_bin, str, "FOREIGN", &success);
    DBUG_ASSERT(success);

    // 'FOREIGN' must be a standalone word (followed by whitespace);
    // note: a non-space here terminates the whole scan, so an identifier
    // like FOREIGNX ends the search rather than continuing it.
    if (!my_isspace(&my_charset_bin, *str)) {
      return false;
    }

    // See if the next token is 'KEY'
    str = rdb_check_next_token(&my_charset_bin, str, "KEY", &success);
    if (!success) {
      // 'FOREIGN' not followed by 'KEY': keep scanning from here.
      continue;
    }

    // See if the next token is '('
    str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
    if (!success) {
      // There is an optional index id after 'FOREIGN KEY', skip it
      str = rdb_skip_id(&my_charset_bin, str);

      // Now check for '(' again
      str = rdb_check_next_token(&my_charset_bin, str, "(", &success);
    }

    // If we have found 'FOREIGN KEY [<word>] (' we can be confident we have
    // a foreign key clause.
    return success;
  }

  // We never found a valid foreign key clause
  return false;
}
7227
7228/**
7229 @brief
7230 splits the normalized table name of <dbname>.<tablename>#P#<part_no> into
7231 the <dbname>, <tablename> and <part_no> components.
7232
7233 @param dbbuf returns database name/table_schema
7234 @param tablebuf returns tablename
7235 @param partitionbuf returns partition suffix if there is one
7236 @return HA_EXIT_SUCCESS on success, non-zero on failure to split
7237*/
7238int rdb_split_normalized_tablename(const std::string &fullname,
7239 std::string *const db,
7240 std::string *const table,
7241 std::string *const partition) {
7242 DBUG_ASSERT(!fullname.empty());
7243
7244#define RDB_PARTITION_STR "#P#"
7245
7246 /* Normalize returns dbname.tablename. */
7247 size_t dotpos = fullname.find('.');
7248
7249 /* Invalid table name? */
7250 if (dotpos == std::string::npos) {
7251 return HA_ERR_ROCKSDB_INVALID_TABLE;
7252 }
7253
7254 // Table must have a database name associated with it.
7255 DBUG_ASSERT(dotpos > 0);
7256
7257 if (db != nullptr) {
7258 *db = fullname.substr(0, dotpos);
7259 }
7260
7261 dotpos++;
7262
7263 const size_t partpos =
7264 fullname.find(RDB_PARTITION_STR, dotpos, strlen(RDB_PARTITION_STR));
7265
7266 if (partpos != std::string::npos) {
7267 DBUG_ASSERT(partpos >= dotpos);
7268
7269 if (table != nullptr) {
7270 *table = fullname.substr(dotpos, partpos - dotpos);
7271 }
7272
7273 if (partition != nullptr) {
7274 *partition = fullname.substr(partpos + strlen(RDB_PARTITION_STR));
7275 }
7276 } else if (table != nullptr) {
7277 *table = fullname.substr(dotpos);
7278 }
7279
7280 return HA_EXIT_SUCCESS;
7281}
7282
7283/**
7284 @brief
7285 create() is called to create a database. The variable name will have the name
7286 of the table.
7287
7288 @details
7289 When create() is called you do not need to worry about
7290 opening the table. Also, the .frm file will have already been
7291 created so adjusting create_info is not necessary. You can overwrite
7292 the .frm file at this point if you wish to change the table
7293 definition, but there are no methods currently provided for doing
7294 so.
7295
7296 Called from handle.cc by ha_create_table().
7297
7298 @return
7299 HA_EXIT_SUCCESS OK
7300 other HA_ERR error code (can be SE-specific)
7301
7302 @see
7303 ha_create_table() in handle.cc
7304*/
7305
7306int ha_rocksdb::create(const char *const name, TABLE *const table_arg,
7307 HA_CREATE_INFO *const create_info) {
7308 DBUG_ENTER_FUNC();
7309
7310 DBUG_ASSERT(table_arg != nullptr);
7311 DBUG_ASSERT(create_info != nullptr);
7312
7313 if (create_info->data_file_name) {
7314 // DATA DIRECTORY is used to create tables under a specific location
7315 // outside the MySQL data directory. We don't support this for MyRocks.
7316 // The `rocksdb_datadir` setting should be used to configure RocksDB data
7317 // directory.
7318 print_error(HA_ERR_ROCKSDB_TABLE_DATA_DIRECTORY_NOT_SUPPORTED,
7319 MYF(ME_JUST_WARNING));
7320 DBUG_RETURN(HA_WRONG_CREATE_OPTION);
7321 }
7322
7323 if (create_info->index_file_name) {
7324 // Similar check for INDEX DIRECTORY as well.
7325 print_error(HA_ERR_ROCKSDB_TABLE_INDEX_DIRECTORY_NOT_SUPPORTED,
7326 MYF(ME_JUST_WARNING));
7327 DBUG_RETURN(HA_WRONG_CREATE_OPTION);
7328 }
7329
7330 std::string str;
7331 int err;
7332
7333 THD *const thd = my_core::thd_get_current_thd();
7334
7335 if (get_table_if_exists(name)) {
7336 if (thd->lex->sql_command == SQLCOM_TRUNCATE) {
7337 err = delete_table(name);
7338 if (err != HA_EXIT_SUCCESS) {
7339 DBUG_RETURN(err);
7340 }
7341 } else {
7342 err = rdb_normalize_tablename(name, &str);
7343 if (err != HA_EXIT_SUCCESS) {
7344 DBUG_RETURN(err);
7345 }
7346 my_error(ER_METADATA_INCONSISTENCY, MYF(0), str.c_str(), name);
7347 DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA);
7348 }
7349 }
7350
7351 /*
7352 Construct dbname.tablename ourselves, because parititioning
7353 passes strings like "./test/t14#P#p0" for individual partitions,
7354 while table_arg->s->table_name has none of that.
7355 */
7356 err = rdb_normalize_tablename(name, &str);
7357 if (err != HA_EXIT_SUCCESS) {
7358 DBUG_RETURN(err);
7359 }
7360
7361 if (contains_foreign_key(thd)) {
7362 my_error(ER_NOT_SUPPORTED_YET, MYF(0),
7363 "FOREIGN KEY for the RocksDB storage engine");
7364 DBUG_RETURN(HA_ERR_UNSUPPORTED);
7365 }
7366
7367 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
7368 rocksdb::WriteBatch *const batch = wb.get();
7369
7370 /* Create table/key descriptions and put them into the data dictionary */
7371 m_tbl_def = new Rdb_tbl_def(str);
7372
7373 uint n_keys = table_arg->s->keys;
7374
7375 /*
7376 If no primary key found, create a hidden PK and place it inside table
7377 definition
7378 */
7379 if (has_hidden_pk(table_arg)) {
7380 n_keys += 1;
7381 }
7382
7383 m_key_descr_arr = new std::shared_ptr<Rdb_key_def>[n_keys];
7384 m_tbl_def->m_key_count = n_keys;
7385 m_tbl_def->m_key_descr_arr = m_key_descr_arr;
7386
7387 err = create_key_defs(table_arg, m_tbl_def);
7388 if (err != HA_EXIT_SUCCESS) {
7389 goto error;
7390 }
7391
7392 m_pk_descr = m_key_descr_arr[pk_index(table_arg, m_tbl_def)];
7393
7394 if (create_info->auto_increment_value) {
7395 bool autoinc_upgrade_test = false;
7396 m_tbl_def->m_auto_incr_val = create_info->auto_increment_value;
7397 DBUG_EXECUTE_IF("myrocks_autoinc_upgrade", autoinc_upgrade_test = true;);
7398 if (!autoinc_upgrade_test) {
7399 auto s = dict_manager.put_auto_incr_val(
7400 batch, m_tbl_def->get_autoincr_gl_index_id(),
7401 m_tbl_def->m_auto_incr_val);
7402 if (!s.ok()) {
7403 goto error;
7404 }
7405 }
7406 }
7407
7408 dict_manager.lock();
7409 err = ddl_manager.put_and_write(m_tbl_def, batch);
7410 if (err != HA_EXIT_SUCCESS) {
7411 dict_manager.unlock();
7412 goto error;
7413 }
7414
7415 err = dict_manager.commit(batch);
7416 if (err != HA_EXIT_SUCCESS) {
7417 dict_manager.unlock();
7418 goto error;
7419 }
7420
7421 dict_manager.unlock();
7422
7423 DBUG_RETURN(HA_EXIT_SUCCESS);
7424
7425error:
7426 /* Delete what we have allocated so far */
7427 delete m_tbl_def;
7428 m_tbl_def = nullptr;
7429 m_key_descr_arr = nullptr;
7430
7431 DBUG_RETURN(err);
7432}
7433
7434/**
7435 @note
7436 This function is used only when the table has not yet been opened, and
7437 keyread_allowed bitmap doesn't have the correct values yet.
7438
7439 See comment in ha_rocksdb::index_flags() for details.
7440*/
7441
7442bool ha_rocksdb::check_keyread_allowed(uint inx, uint part,
7443 bool all_parts) const {
7444 bool res = true;
7445 KEY *const key_info = &table_share->key_info[inx];
7446
7447 Rdb_field_packing dummy1;
7448 res = dummy1.setup(nullptr, key_info->key_part[part].field, inx, part,
7449 key_info->key_part[part].length);
7450
7451 if (res && all_parts) {
7452 for (uint i = 0; i < part; i++) {
7453 Field *field;
7454 if ((field = key_info->key_part[i].field)) {
7455 Rdb_field_packing dummy;
7456 if (!dummy.setup(nullptr, field, inx, i,
7457 key_info->key_part[i].length)) {
7458 /* Cannot do index-only reads for this column */
7459 res = false;
7460 break;
7461 }
7462 }
7463 }
7464 }
7465
7466 const uint pk = table_share->primary_key;
7467 if (inx == pk && all_parts &&
7468 part + 1 == table_share->key_info[pk].user_defined_key_parts) {
7469 m_pk_can_be_decoded = res;
7470 }
7471
7472 return res;
7473}
7474
7475int ha_rocksdb::read_key_exact(const Rdb_key_def &kd,
7476 rocksdb::Iterator *const iter,
7477 const bool &full_key_match,
7478 const rocksdb::Slice &key_slice,
7479 const int64_t ttl_filter_ts) {
7480 DBUG_ASSERT(iter != nullptr);
7481
7482 /*
7483 We are looking for the first record such that
7484 index_tuple= lookup_tuple.
7485 lookup_tuple may be a prefix of the index.
7486 */
7487 rocksdb_smart_seek(kd.m_is_reverse_cf, iter, key_slice);
7488
7489 while (iter->Valid() && kd.value_matches_prefix(iter->key(), key_slice)) {
7490 /*
7491 If TTL is enabled we need to check if the given key has already expired
7492 from the POV of the current transaction. If it has, try going to the next
7493 key.
7494 */
7495 if (kd.has_ttl() && should_hide_ttl_rec(kd, iter->value(), ttl_filter_ts)) {
7496 rocksdb_smart_next(kd.m_is_reverse_cf, iter);
7497 continue;
7498 }
7499
7500 return HA_EXIT_SUCCESS;
7501 }
7502
7503 /*
7504 Got a record that is not equal to the lookup value, or even a record
7505 from another table.index.
7506 */
7507 return HA_ERR_KEY_NOT_FOUND;
7508}
7509
7510int ha_rocksdb::read_before_key(const Rdb_key_def &kd,
7511 const bool &full_key_match,
7512 const rocksdb::Slice &key_slice,
7513 const int64_t ttl_filter_ts) {
7514 /*
7515 We are looking for record with the biggest t.key such that
7516 t.key < lookup_tuple.
7517 */
7518 rocksdb_smart_seek(!kd.m_is_reverse_cf, m_scan_it, key_slice);
7519
7520 while (is_valid(m_scan_it)) {
7521 /*
7522 We are using full key and we've hit an exact match, or...
7523
7524 If TTL is enabled we need to check if the given key has already expired
7525 from the POV of the current transaction. If it has, try going to the next
7526 key.
7527 */
7528 if ((full_key_match &&
7529 kd.value_matches_prefix(m_scan_it->key(), key_slice)) ||
7530 (kd.has_ttl() &&
7531 should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts))) {
7532 rocksdb_smart_next(!kd.m_is_reverse_cf, m_scan_it);
7533 continue;
7534 }
7535
7536 return HA_EXIT_SUCCESS;
7537 }
7538
7539 return HA_ERR_KEY_NOT_FOUND;
7540}
7541
7542int ha_rocksdb::read_after_key(const Rdb_key_def &kd,
7543 const rocksdb::Slice &key_slice,
7544 const int64_t ttl_filter_ts) {
7545 /*
7546 We are looking for the first record such that
7547
7548 index_tuple $GT lookup_tuple
7549
7550 with HA_READ_AFTER_KEY, $GT = '>',
7551 with HA_READ_KEY_OR_NEXT, $GT = '>='
7552 */
7553 rocksdb_smart_seek(kd.m_is_reverse_cf, m_scan_it, key_slice);
7554
7555 /*
7556 If TTL is enabled we need to check if the given key has already expired
7557 from the POV of the current transaction. If it has, try going to the next
7558 key.
7559 */
7560 while (is_valid(m_scan_it) && kd.has_ttl() &&
7561 should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts)) {
7562 rocksdb_smart_next(kd.m_is_reverse_cf, m_scan_it);
7563 }
7564
7565 return is_valid(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_KEY_NOT_FOUND;
7566}
7567
/**
  Move m_scan_it from its initial seek position to the record that the
  SQL-layer lookup mode asks for, dispatching on find_flag.

  @param kd              definition of the index being read
  @param find_flag       SQL-layer lookup mode (HA_READ_KEY_EXACT, ...)
  @param full_key_match  true if the lookup tuple covers the whole key
  @param key             original (unpacked) lookup key; used only by
                         HA_READ_PREFIX_LAST to re-pack the search prefix
  @param keypart_map     bitmap of key parts present in 'key'
  @param key_slice       packed lookup tuple
  @param[out] move_forward  scan direction implied by find_flag
  @param ttl_filter_ts   timestamp used to filter out TTL-expired records

  @return HA_EXIT_SUCCESS, HA_ERR_KEY_NOT_FOUND, HA_ERR_UNSUPPORTED, or
          another HA_ERR error code
*/
int ha_rocksdb::position_to_correct_key(
    const Rdb_key_def &kd, const enum ha_rkey_function &find_flag,
    const bool &full_key_match, const uchar *const key,
    const key_part_map &keypart_map, const rocksdb::Slice &key_slice,
    bool *const move_forward, const int64_t ttl_filter_ts) {
  int rc = 0;

  *move_forward = true;

  switch (find_flag) {
  case HA_READ_KEY_EXACT:
    rc =
        read_key_exact(kd, m_scan_it, full_key_match, key_slice, ttl_filter_ts);
    break;
  case HA_READ_BEFORE_KEY:
    *move_forward = false;
    rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
    if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
      /* The record we've got is not from this index */
      rc = HA_ERR_KEY_NOT_FOUND;
    }
    break;
  case HA_READ_AFTER_KEY:
  case HA_READ_KEY_OR_NEXT:
    rc = read_after_key(kd, key_slice, ttl_filter_ts);
    if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
      /* The record we've got is not from this index */
      rc = HA_ERR_KEY_NOT_FOUND;
    }
    break;
  case HA_READ_KEY_OR_PREV:
  case HA_READ_PREFIX:
    /* This flag is not used by the SQL layer, so we don't support it yet. */
    rc = HA_ERR_UNSUPPORTED;
    break;
  case HA_READ_PREFIX_LAST:
  case HA_READ_PREFIX_LAST_OR_PREV:
    *move_forward = false;
    /*
      Find the last record with the specified index prefix lookup.
      - HA_READ_PREFIX_LAST requires that the record has the
        prefix=lookup (if there are no such records,
        HA_ERR_KEY_NOT_FOUND should be returned).
      - HA_READ_PREFIX_LAST_OR_PREV has no such requirement. If there are no
        records with prefix=lookup, we should return the last record
        before that.
    */
    rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
    if (rc == 0) {
      const rocksdb::Slice &rkey = m_scan_it->key();
      if (!kd.covers_key(rkey)) {
        /* The record we've got is not from this index */
        rc = HA_ERR_KEY_NOT_FOUND;
      } else if (find_flag == HA_READ_PREFIX_LAST) {
        /* Re-pack the original lookup prefix so we can verify the match
           (key_slice may have been mutated by kd.successor() upstream). */
        uint size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                        key, keypart_map);
        rocksdb::Slice lookup_tuple(reinterpret_cast<char *>(m_sk_packed_tuple),
                                    size);

        // We need to compare the key we've got with the original search prefix.
        if (!kd.value_matches_prefix(rkey, lookup_tuple)) {
          rc = HA_ERR_KEY_NOT_FOUND;
        }
      }
    }
    break;
  default:
    DBUG_ASSERT(0);
    break;
  }

  return rc;
}
7641
7642int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd,
7643 const enum ha_rkey_function &find_flag,
7644 const rocksdb::Slice &slice,
7645 const int &bytes_changed_by_succ,
7646 const key_range *const end_key,
7647 uint *const end_key_packed_size) {
7648 if (find_flag == HA_READ_KEY_EXACT)
7649 return slice.size();
7650
7651 if (find_flag == HA_READ_PREFIX_LAST) {
7652 /*
7653 We have made the kd.successor(m_sk_packed_tuple) call above.
7654
7655 The slice is at least Rdb_key_def::INDEX_NUMBER_SIZE bytes long.
7656 */
7657 return slice.size() - bytes_changed_by_succ;
7658 }
7659
7660 if (end_key) {
7661 *end_key_packed_size =
7662 kd.pack_index_tuple(table, m_pack_buffer, m_end_key_packed_tuple,
7663 end_key->key, end_key->keypart_map);
7664
7665 /*
7666 Calculating length of the equal conditions here. 4 byte index id is
7667 included.
7668 Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
7669 WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16
7670 WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12
7671 Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2)
7672 WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes)
7673 */
7674 rocksdb::Slice end_slice(reinterpret_cast<char *>(m_end_key_packed_tuple),
7675 *end_key_packed_size);
7676 return slice.difference_offset(end_slice);
7677 }
7678
7679 /*
7680 On range scan without any end key condition, there is no
7681 eq cond, and eq cond length is the same as index_id size (4 bytes).
7682 Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3)
7683 WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4
7684 */
7685 return Rdb_key_def::INDEX_NUMBER_SIZE;
7686}
7687
7688int ha_rocksdb::read_row_from_primary_key(uchar *const buf) {
7689 DBUG_ASSERT(buf != nullptr);
7690
7691 int rc;
7692 const rocksdb::Slice &rkey = m_scan_it->key();
7693 const uint pk_size = rkey.size();
7694 const char *pk_data = rkey.data();
7695
7696 memcpy(m_pk_packed_tuple, pk_data, pk_size);
7697 m_last_rowkey.copy(pk_data, pk_size, &my_charset_bin);
7698
7699 if (m_lock_rows != RDB_LOCK_NONE) {
7700 /* We need to put a lock and re-read */
7701 rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
7702 } else {
7703 /* Unpack from the row we've read */
7704 const rocksdb::Slice &value = m_scan_it->value();
7705 rc = convert_record_from_storage_format(&rkey, &value, buf);
7706 }
7707
7708 return rc;
7709}
7710
7711int ha_rocksdb::read_row_from_secondary_key(uchar *const buf,
7712 const Rdb_key_def &kd,
7713 bool move_forward) {
7714 DBUG_ASSERT(buf != nullptr);
7715
7716 int rc = 0;
7717 uint pk_size;
7718
7719 /* Get the key columns and primary key value */
7720 const rocksdb::Slice &rkey = m_scan_it->key();
7721 const rocksdb::Slice &value = m_scan_it->value();
7722
7723 bool covered_lookup =
7724 m_keyread_only || kd.covers_lookup(table, &value, &m_lookup_bitmap);
7725 if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
7726 pk_size =
7727 kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
7728 if (pk_size == RDB_INVALID_KEY_LEN) {
7729 rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
7730 } else {
7731 rc = kd.unpack_record(table, buf, &rkey, &value,
7732 m_verify_row_debug_checksums);
7733 global_stats.covered_secondary_key_lookups.inc();
7734 }
7735 } else {
7736 if (kd.m_is_reverse_cf)
7737 move_forward = !move_forward;
7738
7739 rc = find_icp_matching_index_rec(move_forward, buf);
7740 if (!rc) {
7741 const rocksdb::Slice &rkey = m_scan_it->key();
7742 pk_size = kd.get_primary_key_tuple(table, *m_pk_descr, &rkey,
7743 m_pk_packed_tuple);
7744 if (pk_size == RDB_INVALID_KEY_LEN) {
7745 rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
7746 } else {
7747 rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
7748 }
7749 }
7750 }
7751
7752 if (!rc) {
7753 m_last_rowkey.copy((const char *)m_pk_packed_tuple, pk_size,
7754 &my_charset_bin);
7755 }
7756
7757 return rc;
7758}
7759
7760/**
7761 @note
7762 The problem with this function is that SQL layer calls it, when
7763 - the table has not been yet opened (no ::open() call done)
7764 - this->table_share already exists, but it is in the process of being
7765 filled, so some of fields are still NULL.
7766 - In particular, table_share->key_info[inx].key_part[] is filled only up
7767 to part #part. Subsequent key parts are not yet filled.
7768
7769 To complicate things further, SQL layer will call index_flags() with
7770 all_parts=TRUE. Essentially, we're asked to provide flags for reading
7771 keyparts whose datatype is not yet known.
7772
7773 We walk around this problem by using check_keyread_allowed(), which uses
7774 table_share object and is careful not to step on unitialized data.
7775
7776 When we get a call with all_parts=TRUE, we try to analyze all parts but
7777 ignore those that have key_part->field==nullptr (these are not initialized
7778 yet).
7779*/
7780
7781ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const {
7782 DBUG_ENTER_FUNC();
7783
7784 ulong base_flags = HA_READ_NEXT | // doesn't seem to be used
7785 HA_READ_ORDER | HA_READ_RANGE | HA_READ_PREV;
7786
7787 if (check_keyread_allowed(inx, part, all_parts))
7788 base_flags |= HA_KEYREAD_ONLY;
7789
7790 if (inx == table_share->primary_key) {
7791 /*
7792 Index-only reads on primary key are the same as table scan for us. Still,
7793 we need to explicitly "allow" them, otherwise SQL layer will miss some
7794 plans.
7795 */
7796 base_flags |= HA_KEYREAD_ONLY | HA_CLUSTERED_INDEX;
7797 } else {
7798 /*
7799 We can Index Condition Pushdown any key except the primary. With primary
7800 key, we get (pk, record) pair immediately, there is no place to put the
7801 ICP check.
7802 */
7803 base_flags |= HA_DO_INDEX_COND_PUSHDOWN;
7804 }
7805
7806 DBUG_RETURN(base_flags);
7807}
7808
7809/**
7810 @brief
7811 Read next index tuple through the secondary index.
7812
7813 @details
7814 m_scan_it points at the index key-value pair that we should read the (pk,row)
7815 pair for.
7816*/
int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
  DBUG_ASSERT(buf != nullptr);
  DBUG_ASSERT(table != nullptr);
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  /* Use STATUS_NOT_FOUND when record not found or some error occurred */
  table->status = STATUS_NOT_FOUND;

  if (is_valid(m_scan_it)) {
    rocksdb::Slice key = m_scan_it->key();

    /* Check if we've ran out of records of this index */
    if (m_key_descr_arr[keyno]->covers_key(key)) {
      int rc = 0;

      // TODO: We could here check if we have ran out of range we're scanning
      /* Extract the primary key packed inside the secondary index entry. */
      const uint size = m_key_descr_arr[keyno]->get_primary_key_tuple(
          table, *m_pk_descr, &key, m_pk_packed_tuple);
      if (size == RDB_INVALID_KEY_LEN) {
        return HA_ERR_ROCKSDB_CORRUPT_DATA;
      }

      /* Remember the PK so a later rnd_pos()/position() can find this row. */
      m_last_rowkey.copy((const char *)m_pk_packed_tuple, size,
                         &my_charset_bin);

      rocksdb::Slice value = m_scan_it->value();
      /* Index-only read is possible when keyread is forced or the index
         covers every column the query needs (per m_lookup_bitmap). */
      bool covered_lookup =
          m_keyread_only || m_key_descr_arr[keyno]->covers_lookup(
                                table, &value, &m_lookup_bitmap);
      if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
        /* Decode the row straight from the secondary index entry. */
        rc = m_key_descr_arr[keyno]->unpack_record(
            table, buf, &key, &value, m_verify_row_debug_checksums);
        global_stats.covered_secondary_key_lookups.inc();
      } else {
        /* Fetch (and, when locking, lock) the base row via the primary key. */
        rc = get_row_by_rowid(buf, m_pk_packed_tuple, size);
      }

      if (!rc) {
        table->status = 0;
#ifdef MARIAROCKS_NOT_YET
        stats.rows_read++;
        stats.rows_index_next++;
#endif
        update_row_stats(ROWS_READ);
      }
      return rc;
    }
  }
  return HA_ERR_END_OF_FILE;
}
7868
7869bool ha_rocksdb::is_using_full_key(key_part_map keypart_map,
7870 uint actual_key_parts)
7871{
7872 return (keypart_map == HA_WHOLE_KEY) ||
7873 (keypart_map == ((key_part_map(1) << actual_key_parts)
7874 - 1));
7875}
7876
7877/*
7878 ha_rocksdb::read_range_first overrides handler::read_range_first.
7879 The only difference from handler::read_range_first is that
7880 ha_rocksdb::read_range_first passes end_key to
7881 ha_rocksdb::index_read_map_impl function.
7882
7883 @return
7884 HA_EXIT_SUCCESS OK
7885 other HA_ERR error code (can be SE-specific)
7886*/
int ha_rocksdb::read_range_first(const key_range *const start_key,
                                 const key_range *const end_key,
                                 bool eq_range_arg, bool sorted) {
  DBUG_ENTER_FUNC();

  int result;

  eq_range = eq_range_arg;

  /* MariaDB: Pass RANGE_SCAN_ASC when we support ICP on backwards scans */
  set_end_range(end_key);

  range_key_part = table->key_info[active_index].key_part;

  if (!start_key) // Read first record
    result = ha_index_first(table->record[0]);
  else {
#ifdef MARIAROCKS_NOT_YET
    if (is_using_prohibited_gap_locks(
            is_using_full_unique_key(active_index, start_key->keypart_map,
                                     start_key->flag))) {
      DBUG_RETURN(HA_ERR_LOCK_DEADLOCK);
    }
#endif
    increment_statistics(&SSV::ha_read_key_count);

    /* Unlike handler::read_range_first(), pass end_key down so
       index_read_map_impl() can compute the equal-condition length. */
    MYSQL_TABLE_IO_WAIT(m_psi, PSI_TABLE_FETCH_ROW, active_index, 0, {
      result =
          index_read_map_impl(table->record[0], start_key->key,
                              start_key->keypart_map, start_key->flag, end_key);
    })
  }
  /* Map "key not found" to "end of file" as the range-read contract expects */
  if (result)
    DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND) ? HA_ERR_END_OF_FILE : result);

  if (compare_key(end_range) <= 0) {
    DBUG_RETURN(HA_EXIT_SUCCESS);
  } else {
    /*
      The last read row does not fall in the range. So request
      storage engine to release row lock if possible.
    */
    unlock_row();
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }
}
7933
7934int ha_rocksdb::prepare_index_scan()
7935{
7936 range_key_part= table->key_info[active_index].key_part;
7937 set_end_range(NULL);
7938 return 0;
7939}
7940
7941
7942int ha_rocksdb::prepare_range_scan(const key_range *start_key,
7943 const key_range *end_key)
7944{
7945 range_key_part= table->key_info[active_index].key_part;
7946
7947 if (start_key)
7948 {
7949 m_save_start_range= *start_key;
7950 m_start_range= &m_save_start_range;
7951 }
7952 else
7953 m_start_range= NULL;
7954
7955 set_end_range(end_key);
7956 return 0;
7957}
7958
7959
7960/**
7961 @return
7962 HA_EXIT_SUCCESS OK
7963 other HA_ERR error code (can be SE-specific)
7964*/
7965
int ha_rocksdb::index_read_map(uchar *const buf, const uchar *const key,
                               key_part_map keypart_map,
                               enum ha_rkey_function find_flag) {
  DBUG_ENTER_FUNC();

  /* No explicit end-of-range key is known here; index_read_map_impl() falls
     back to end_range (set via set_end_range()) when end_key is nullptr. */
  DBUG_RETURN(index_read_map_impl(buf, key, keypart_map, find_flag, nullptr));
}
7973
7974/*
7975 See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL
7976 index navigation commands are converted into RocksDB lookup commands.
7977
7978 This function takes end_key as an argument, and it is set on range scan.
7979 MyRocks needs to decide whether prefix bloom filter can be used or not.
7980 To decide to use prefix bloom filter or not, calculating equal condition
7981 length
7982 is needed. On equal lookups (find_flag == HA_READ_KEY_EXACT), equal
7983 condition length is the same as rocksdb::Slice.size() of the start key.
7984 On range scan, equal condition length is MIN(start_key, end_key) of the
7985 rocksdb::Slice expression.
7986
7987 @return
7988 HA_EXIT_SUCCESS OK
7989 other HA_ERR error code (can be SE-specific)
7990*/
int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key,
                                    key_part_map keypart_map,
                                    enum ha_rkey_function find_flag,
                                    const key_range *end_key) {
  DBUG_ENTER_FUNC();

  int rc = 0;

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  const uint actual_key_parts = kd.get_key_parts();
  bool using_full_key = is_using_full_key(keypart_map, actual_key_parts);

  /* No end key from the caller: fall back to the active end-of-range
     condition (set through set_end_range()), if any. */
  if (!end_key)
    end_key = end_range;

  /* By default, we don't need the retrieved records to match the prefix */
  m_sk_match_prefix = nullptr;
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  if (active_index == table->s->primary_key && find_flag == HA_READ_KEY_EXACT &&
      using_full_key) {
    /*
      Equality lookup over primary key, using full tuple.
      This is a special case, use DB::Get.
    */
    const uint size = kd.pack_index_tuple(table, m_pack_buffer,
                                          m_pk_packed_tuple, key, keypart_map);
    bool skip_lookup = is_blind_delete_enabled();

    rc = get_row_by_rowid(buf, m_pk_packed_tuple, size,
                          skip_lookup, false);

    if (!rc && !skip_lookup) {
#ifdef MARIAROCKS_NOT_YET
      stats.rows_read++;
      stats.rows_index_first++;
#endif
      update_row_stats(ROWS_READ);
    }
    DBUG_RETURN(rc);
  }

  /*
    Unique secondary index performs lookups without the extended key fields
  */
  uint packed_size;
  if (active_index != table->s->primary_key &&
      table->key_info[active_index].flags & HA_NOSAME &&
      find_flag == HA_READ_KEY_EXACT && using_full_key) {
    key_part_map tmp_map = (key_part_map(1) << table->key_info[active_index]
                                                   .user_defined_key_parts) -
                           1;
    packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                      key, tmp_map);
    /* The key definition can have more (extended) parts than the
       user-defined ones just packed; the key is then not "full". */
    if (table->key_info[active_index].user_defined_key_parts !=
        kd.get_key_parts())
      using_full_key = false;
  } else {
    packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                      key, keypart_map);
  }

  if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
      (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST)) {
    /*
      We are doing a point index lookup, and ICP is enabled. It is possible
      that this call will be followed by ha_rocksdb->index_next_same() call.

      Do what InnoDB does: save the lookup tuple now. We will need it in
      index_next_same/find_icp_matching_index_rec in order to stop scanning
      as soon as index record doesn't match the lookup tuple.

      When not using ICP, handler::index_next_same() will make sure that rows
      that don't match the lookup prefix are not returned.
    */
    m_sk_match_prefix = m_sk_match_prefix_buf;
    m_sk_match_length = packed_size;
    memcpy(m_sk_match_prefix, m_sk_packed_tuple, packed_size);
  }

  int bytes_changed_by_succ = 0;
  if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
      find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
    /* See below: calc_eq_cond_len() compensates for this successor() call */
    bytes_changed_by_succ = kd.successor(m_sk_packed_tuple, packed_size);
  }

  rocksdb::Slice slice(reinterpret_cast<const char *>(m_sk_packed_tuple),
                       packed_size);

  uint end_key_packed_size = 0;
  /* HA_READ_PREFIX_LAST_OR_PREV scans backwards, so the scan's "end" is the
     saved start of the range (see prepare_range_scan()). */
  const key_range *cur_end_key= end_key;
  if (find_flag == HA_READ_PREFIX_LAST_OR_PREV)
    cur_end_key= m_start_range;

  const uint eq_cond_len =
      calc_eq_cond_len(kd, find_flag, slice, bytes_changed_by_succ, cur_end_key,
                       &end_key_packed_size);

  bool use_all_keys = false;
  if (find_flag == HA_READ_KEY_EXACT &&
      my_count_bits(keypart_map) == kd.get_key_parts())
    use_all_keys = true;

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    /*
      This will open the iterator and position it at a record that's equal or
      greater than the lookup tuple.
    */
    setup_scan_iterator(kd, &slice, use_all_keys, eq_cond_len);

    /*
      Once we are positioned on from above, move to the position we really
      want: See storage/rocksdb/rocksdb-range-access.txt
    */
    bool move_forward;
    rc =
        position_to_correct_key(kd, find_flag, using_full_key, key, keypart_map,
                                slice, &move_forward, tx->m_snapshot_timestamp);

    if (rc) {
      /* This status is returned on any error */
      table->status = STATUS_NOT_FOUND;
      DBUG_RETURN(rc);
    }

    m_skip_scan_it_next_call = false;

    /*
      Now get the data for the row into 'buf'. If we were using a primary key
      then we have all the rows we need. For a secondary key we now need to
      lookup the primary key.
    */
    if (active_index == table->s->primary_key)
      rc = read_row_from_primary_key(buf);
    else
      rc = read_row_from_secondary_key(buf, kd, move_forward);

    if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
      break; /* Exit the loop */

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (rc) {
    /* the only possible error condition is record-not-found */
    table->status = STATUS_NOT_FOUND;
  } else {
    table->status = 0;
#ifdef MARIAROCKS_NOT_YET
    stats.rows_read++;
    stats.rows_index_first++;
#endif
    update_row_stats(ROWS_READ);
  }

  DBUG_RETURN(rc);
}
8157
8158/*
8159 @brief
8160 Scan the secondary index until we find an index record that satisfies ICP
8161
8162 @param move_forward TRUE <=> move m_scan_it forward
8163 FALSE <=> move m_scan_it backward
8164 @param buf Record buffer (must be the same buffer that
8165 pushed index condition points to, in practice
8166 it is table->record[0])
8167
8168 @detail
8169 Move the current iterator m_scan_it until we get an index tuple that
8170 satisfies the pushed Index Condition.
8171 (if there is no pushed index condition, return right away)
8172
8173 @return
8174 0 - Index tuple satisfies ICP, can do index read.
8175 other - error code
8176*/
8177
int ha_rocksdb::find_icp_matching_index_rec(const bool &move_forward,
                                            uchar *const buf) {
  DBUG_ASSERT(buf != nullptr);

  /* Without a pushed condition on this index there is nothing to filter. */
  if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) {
    const Rdb_key_def &kd = *m_key_descr_arr[active_index];

    while (1) {
      /* Step over TTL-expired records in the scan direction first. */
      rocksdb_skip_expired_records(kd, m_scan_it, !move_forward);

      if (!is_valid(m_scan_it)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      const rocksdb::Slice rkey = m_scan_it->key();

      /* Ran off the end of this index: the key belongs to another index id. */
      if (!kd.covers_key(rkey)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }

      if (m_sk_match_prefix) {
        /* Point-lookup mode: stop as soon as the index tuple no longer
           matches the lookup prefix saved by index_read_map_impl(). */
        const rocksdb::Slice prefix((const char *)m_sk_match_prefix,
                                    m_sk_match_length);
        if (!kd.value_matches_prefix(rkey, prefix)) {
          table->status = STATUS_NOT_FOUND;
          return HA_ERR_END_OF_FILE;
        }
      }

      /* Unpack the index tuple into 'buf' so the pushed index condition can
         be evaluated against it. */
      const rocksdb::Slice value = m_scan_it->value();
      int err = kd.unpack_record(table, buf, &rkey, &value,
                                 m_verify_row_debug_checksums);
      if (err != HA_EXIT_SUCCESS) {
        return err;
      }

      const enum icp_result icp_status= handler_index_cond_check(this);
      if (icp_status == ICP_NO_MATCH) {
        rocksdb_smart_next(!move_forward, m_scan_it);
        continue; /* Get the next (or prev) index tuple */
      }
      else if (icp_status == ICP_OUT_OF_RANGE || icp_status == ICP_ABORTED_BY_USER) {
        /* We have walked out of range we are scanning */
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      else /* icp_status == ICP_MATCH */
      {
        /* Index Condition is satisfied. We have rc==0, proceed to fetch the
         * row. */
        break;
      }
      /*
        TODO: should we have this here, or RockDB handles this internally?
        if (my_core::thd_killed(current_thd))
        {
          rc= HA_ERR_INTERNAL_ERROR; // doesn't matter
          break;
        }
      */
    }
  }
  return HA_EXIT_SUCCESS;
}
8243
8244/**
8245 @return
8246 HA_EXIT_SUCCESS OK
8247 other HA_ERR error code (can be SE-specific)
8248*/
8249
8250/**
8251 @return
8252 HA_ADMIN_OK OK
8253 other HA_ADMIN error code
8254*/
8255int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) {
8256 DBUG_ENTER_FUNC();
8257
8258 DBUG_ASSERT(thd != nullptr);
8259 DBUG_ASSERT(check_opt != nullptr);
8260
8261 const uint pk = pk_index(table, m_tbl_def);
8262 String rowkey_copy;
8263 String sec_key_copy;
8264 const char *const table_name = table->s->table_name.str;
8265
8266 // Only when debugging: don't use snapshot when reading
8267 // Rdb_transaction *tx= get_or_create_tx(table->in_use);
8268 // tx->snapshot= nullptr;
8269
8270 const bool save_verify_row_debug_checksums = m_verify_row_debug_checksums;
8271 m_verify_row_debug_checksums = true;
8272
8273 /* For each secondary index, check that we can get a PK value from it */
8274 // NO_LINT_DEBUG
8275 sql_print_verbose_info("CHECKTABLE %s: Checking table %s", table_name,
8276 table_name);
8277 ha_rows UNINIT_VAR(row_checksums_at_start); // set/used iff first_index==true
8278 ha_rows row_checksums = ha_rows(-1);
8279 bool first_index = true;
8280
8281 for (uint keyno = 0; keyno < table->s->keys; keyno++) {
8282 if (keyno != pk) {
8283 extra(HA_EXTRA_KEYREAD);
8284 ha_index_init(keyno, true);
8285 ha_rows rows = 0;
8286 ha_rows checksums = 0;
8287 if (first_index)
8288 row_checksums_at_start = m_row_checksums_checked;
8289 int res;
8290 // NO_LINT_DEBUG
8291 sql_print_verbose_info("CHECKTABLE %s: Checking index %s", table_name,
8292 table->key_info[keyno].name);
8293 while (1) {
8294 if (!rows)
8295 res = index_first(table->record[0]);
8296 else
8297 res = index_next(table->record[0]);
8298
8299 if (res == HA_ERR_END_OF_FILE)
8300 break;
8301 if (res) {
8302 // error
8303 // NO_LINT_DEBUG
8304 sql_print_error("CHECKTABLE %s: .. row %lld: index scan error %d",
8305 table_name, rows, res);
8306 goto error;
8307 }
8308 rocksdb::Slice key = m_scan_it->key();
8309 sec_key_copy.copy(key.data(), key.size(), &my_charset_bin);
8310 rowkey_copy.copy(m_last_rowkey.ptr(), m_last_rowkey.length(),
8311 &my_charset_bin);
8312
8313 if (m_key_descr_arr[keyno]->unpack_info_has_checksum(
8314 m_scan_it->value())) {
8315 checksums++;
8316 }
8317
8318 if ((res = get_row_by_rowid(table->record[0], rowkey_copy.ptr(),
8319 rowkey_copy.length()))) {
8320 // NO_LINT_DEBUG
8321 sql_print_error("CHECKTABLE %s: .. row %lld: "
8322 "failed to fetch row by rowid",
8323 table_name, rows);
8324 goto error;
8325 }
8326
8327 longlong hidden_pk_id = 0;
8328 if (has_hidden_pk(table) &&
8329 read_hidden_pk_id_from_rowkey(&hidden_pk_id))
8330 goto error;
8331
8332 /* Check if we get the same PK value */
8333 uint packed_size = m_pk_descr->pack_record(
8334 table, m_pack_buffer, table->record[0], m_pk_packed_tuple, nullptr,
8335 false, hidden_pk_id);
8336 if (packed_size != rowkey_copy.length() ||
8337 memcmp(m_pk_packed_tuple, rowkey_copy.ptr(), packed_size)) {
8338 // NO_LINT_DEBUG
8339 sql_print_error("CHECKTABLE %s: .. row %lld: PK value mismatch",
8340 table_name, rows);
8341 goto print_and_error;
8342 }
8343
8344 /* Check if we get the same secondary key value */
8345 packed_size = m_key_descr_arr[keyno]->pack_record(
8346 table, m_pack_buffer, table->record[0], m_sk_packed_tuple,
8347 &m_sk_tails, false, hidden_pk_id);
8348 if (packed_size != sec_key_copy.length() ||
8349 memcmp(m_sk_packed_tuple, sec_key_copy.ptr(), packed_size)) {
8350 // NO_LINT_DEBUG
8351 sql_print_error("CHECKTABLE %s: .. row %lld: "
8352 "secondary index value mismatch",
8353 table_name, rows);
8354 goto print_and_error;
8355 }
8356 rows++;
8357 continue;
8358
8359 print_and_error : {
8360 std::string buf;
8361 buf = rdb_hexdump(rowkey_copy.ptr(), rowkey_copy.length(),
8362 RDB_MAX_HEXDUMP_LEN);
8363 // NO_LINT_DEBUG
8364 sql_print_error("CHECKTABLE %s: rowkey: %s", table_name, buf.c_str());
8365
8366 buf = rdb_hexdump(m_retrieved_record.data(), m_retrieved_record.size(),
8367 RDB_MAX_HEXDUMP_LEN);
8368 // NO_LINT_DEBUG
8369 sql_print_error("CHECKTABLE %s: record: %s", table_name, buf.c_str());
8370
8371 buf = rdb_hexdump(sec_key_copy.ptr(), sec_key_copy.length(),
8372 RDB_MAX_HEXDUMP_LEN);
8373 // NO_LINT_DEBUG
8374 sql_print_error("CHECKTABLE %s: index: %s", table_name, buf.c_str());
8375
8376 goto error;
8377 }
8378 }
8379 // NO_LINT_DEBUG
8380 sql_print_verbose_info("CHECKTABLE %s: ... %lld index entries checked "
8381 "(%lld had checksums)",
8382 table_name, rows, checksums);
8383
8384 if (first_index) {
8385 row_checksums = m_row_checksums_checked - row_checksums_at_start;
8386 first_index = false;
8387 }
8388 ha_index_end();
8389 }
8390 }
8391 if (row_checksums != ha_rows(-1)) {
8392 // NO_LINT_DEBUG
8393 sql_print_verbose_info("CHECKTABLE %s: %lld table records had checksums",
8394 table_name, row_checksums);
8395 }
8396 extra(HA_EXTRA_NO_KEYREAD);
8397
8398 m_verify_row_debug_checksums = save_verify_row_debug_checksums;
8399 /*
8400 TODO: we should check also for PK records that are missing in the secondary
8401 indexes.
8402 For that, need to walk through the PK and check that every PK record has a
8403 proper counterpart in each secondary index.
8404 */
8405 DBUG_RETURN(HA_ADMIN_OK);
8406error:
8407 m_verify_row_debug_checksums = save_verify_row_debug_checksums;
8408 ha_index_or_rnd_end();
8409 extra(HA_EXTRA_NO_KEYREAD);
8410
8411 DBUG_RETURN(HA_ADMIN_CORRUPT);
8412}
8413
/*
  Write 'len' bytes of 'str' to 'out' as a double-quoted, human-readable
  token: printable ASCII characters are emitted verbatim, all other bytes
  (control characters, space, DEL and bytes >= 0x80) as backslash-escaped
  decimal byte values.

  @param out  destination stream (already open for writing)
  @param str  bytes to dump; may contain embedded NULs
  @param len  number of bytes to dump

  Note: each byte is converted through 'unsigned char' so that high-bit
  bytes print as positive values (e.g. "\200" -> "\128") instead of the
  negative numbers that plain 'char' comparison/printing produces on
  platforms where char is signed.
*/
static void dbug_dump_str(FILE *const out, const char *const str, int len) {
  fprintf(out, "\"");
  for (int i = 0; i < len; i++) {
    const unsigned char c = static_cast<unsigned char>(str[i]);
    if (c > 32 && c < 127)
      fprintf(out, "%c", c);
    else
      fprintf(out, "\\%u", c);  // escape as decimal byte value
  }
  fprintf(out, "\"");
}
8424
8425/*
8426 Debugging help: dump the whole database into a human-readable file.
8427 Usage:
8428 dbug_dump_database(rdb);
8429*/
8430
8431void dbug_dump_database(rocksdb::DB *const db) {
8432 FILE *const out = fopen("/tmp/rocksdb.dump", "wt");
8433 if (!out)
8434 return;
8435
8436 rocksdb::Iterator *it = db->NewIterator(rocksdb::ReadOptions());
8437 for (it->SeekToFirst(); it->Valid(); it->Next()) {
8438 rocksdb::Slice key = it->key();
8439 rocksdb::Slice val = it->value();
8440 dbug_dump_str(out, key.data(), key.size());
8441 fprintf(out, " -> ");
8442 dbug_dump_str(out, val.data(), val.size());
8443 fprintf(out, "\n");
8444 }
8445
8446 delete it;
8447 fclose(out);
8448}
8449
8450rocksdb::Status ha_rocksdb::get_for_update(
8451 Rdb_transaction *const tx, rocksdb::ColumnFamilyHandle *const column_family,
8452 const rocksdb::Slice &key, rocksdb::PinnableSlice *const value) const {
8453 DBUG_ASSERT(m_lock_rows != RDB_LOCK_NONE);
8454 const bool exclusive = m_lock_rows != RDB_LOCK_READ;
8455
8456 rocksdb::Status s = tx->get_for_update(column_family, key, value, exclusive);
8457
8458 // If we have a lock conflict and we are running in READ COMMITTTED mode
8459 // release and reacquire the snapshot and then retry the get_for_update().
8460 if (s.IsBusy() && !s.IsDeadlock() &&
8461 my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED) {
8462 tx->release_snapshot();
8463 tx->acquire_snapshot(false);
8464
8465 s = tx->get_for_update(column_family, key, value, exclusive);
8466 }
8467
8468 return s;
8469}
8470
8471bool ha_rocksdb::is_blind_delete_enabled()
8472{
8473 THD *thd = ha_thd();
8474 return (THDVAR(thd, blind_delete_primary_key) &&
8475 thd->lex->sql_command == SQLCOM_DELETE &&
8476 thd->lex->table_count == 1 &&
8477 table->s->keys == 1 &&
8478 !has_hidden_pk(table) &&
8479 !thd->rgi_slave);
8480}
8481
8482/*
8483 Given a rowid (i.e. packed PK) as a parameter, get the record.
8484
8485 @return
8486 HA_EXIT_SUCCESS OK
8487 other HA_ERR error code (can be SE-specific)
8488*/
8489
int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
                                 const uint rowid_size, const bool skip_lookup,
                                 const bool skip_ttl_check) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);
  DBUG_ASSERT(rowid != nullptr);
  DBUG_ASSERT(table != nullptr);

  int rc;

  rocksdb::Slice key_slice(rowid, rowid_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  DEBUG_SYNC(ha_thd(), "rocksdb.get_row_by_rowid");
  DBUG_EXECUTE_IF("dbug.rocksdb.get_row_by_rowid", {
    THD *thd = ha_thd();
    const char act[] = "now signal Reached "
                       "wait_for signal.rocksdb.get_row_by_rowid_let_running";
    DBUG_ASSERT(opt_debug_sync_timeout > 0);
    DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
  };);

  bool found;
  rocksdb::Status s;

  /* Pretend row found without looking up */
  if (skip_lookup)
  {
#ifdef MARIAROCKS_NOT_YET
    stats.rows_deleted_blind++;
#endif
    /* Blind-delete path: report success without reading the row. */
    update_row_stats(ROWS_DELETED_BLIND);
    m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
    table->status = 0;
    DBUG_RETURN(0);
  }

  /*
    A plain read goes through the snapshot; a locking read must take a row
    lock and see the latest committed data instead.
  */
  if (m_lock_rows == RDB_LOCK_NONE) {
    tx->acquire_snapshot(true);
    s = tx->get(m_pk_descr->get_cf(), key_slice, &m_retrieved_record);
  } else {
    s = get_for_update(tx, m_pk_descr->get_cf(), key_slice,
                       &m_retrieved_record);
  }

  DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
                  dbug_change_status_to_corrupted(&s););

  /* Any status other than OK / NotFound is a real error. */
  if (!s.IsNotFound() && !s.ok()) {
    DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                     m_table_handler));
  }
  found = !s.IsNotFound();

  table->status = STATUS_NOT_FOUND;
  if (found) {
    /* If we found the record, but it's expired, pretend we didn't find it. */
    if (!skip_ttl_check && m_pk_descr->has_ttl() &&
        should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
                            tx->m_snapshot_timestamp)) {
      DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
    }

    m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
    rc = convert_record_from_storage_format(&key_slice, buf);

    if (!rc) {
      table->status = 0;
    }
  } else {
    /*
      Note: we don't need to unlock the row. It is intentional that we keep
      locks on rows that don't exist.
    */
    rc = HA_ERR_KEY_NOT_FOUND;
  }

  DBUG_RETURN(rc);
}
8572
8573/**
8574 @return
8575 HA_EXIT_SUCCESS OK
8576 other HA_ERR error code (can be SE-specific)
8577*/
8578int ha_rocksdb::index_next(uchar *const buf) {
8579 DBUG_ENTER_FUNC();
8580
8581 bool moves_forward = true;
8582 if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
8583 moves_forward = false;
8584 }
8585
8586 int rc = index_next_with_direction(buf, moves_forward);
8587 if (rc == HA_ERR_KEY_NOT_FOUND)
8588 rc = HA_ERR_END_OF_FILE;
8589
8590 DBUG_RETURN(rc);
8591}
8592
8593/**
8594 @return
8595 HA_EXIT_SUCCESS OK
8596 other HA_ERR error code (can be SE-specific)
8597*/
8598int ha_rocksdb::index_prev(uchar *const buf) {
8599 DBUG_ENTER_FUNC();
8600
8601 bool moves_forward = false;
8602 if (m_key_descr_arr[active_index]->m_is_reverse_cf) {
8603 moves_forward = true;
8604 }
8605
8606 int rc = index_next_with_direction(buf, moves_forward);
8607 if (rc == HA_ERR_KEY_NOT_FOUND)
8608 rc = HA_ERR_END_OF_FILE;
8609
8610 DBUG_RETURN(rc);
8611}
8612
8613int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) {
8614 DBUG_ENTER_FUNC();
8615
8616 int rc;
8617
8618 if (active_index == pk_index(table, m_tbl_def)) {
8619 rc = rnd_next_with_direction(buf, move_forward);
8620 } else {
8621 if (m_skip_scan_it_next_call) {
8622 m_skip_scan_it_next_call = false;
8623 } else {
8624 if (move_forward)
8625 m_scan_it->Next(); /* this call cannot fail */
8626 else
8627 m_scan_it->Prev();
8628 }
8629 rocksdb_skip_expired_records(*m_key_descr_arr[active_index], m_scan_it,
8630 !move_forward);
8631 rc = find_icp_matching_index_rec(move_forward, buf);
8632 if (!rc)
8633 rc = secondary_index_read(active_index, buf);
8634 }
8635
8636 DBUG_RETURN(rc);
8637}
8638
8639/**
8640 @return
8641 HA_EXIT_SUCCESS OK
8642 other HA_ERR error code (can be SE-specific)
8643*/
8644int ha_rocksdb::index_first(uchar *const buf) {
8645 DBUG_ENTER_FUNC();
8646
8647 m_sk_match_prefix = nullptr;
8648 int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
8649 ? index_last_intern(buf)
8650 : index_first_intern(buf);
8651 if (rc == HA_ERR_KEY_NOT_FOUND)
8652 rc = HA_ERR_END_OF_FILE;
8653
8654 DBUG_RETURN(rc);
8655}
8656
8657/**
8658 @return
8659 HA_EXIT_SUCCESS OK
8660 other HA_ERR error code (can be SE-specific)
8661*/
8662int ha_rocksdb::index_last(uchar *const buf) {
8663 DBUG_ENTER_FUNC();
8664
8665 m_sk_match_prefix = nullptr;
8666 int rc = m_key_descr_arr[active_index]->m_is_reverse_cf
8667 ? index_first_intern(buf)
8668 : index_last_intern(buf);
8669 if (rc == HA_ERR_KEY_NOT_FOUND)
8670 rc = HA_ERR_END_OF_FILE;
8671
8672 DBUG_RETURN(rc);
8673}
8674
8675/*
8676 Start scanning from the "first" value.
8677
8678 The 'first' here means "the first from start of the key space".
8679 For reverse-ordered key spaces, the first value will be the biggest, 'zzzz'.
8680
  A picture of a forward-ordered keyspace (remember, the keys have the form
  'indexnr-keyval'. Suppose the index we are at has number n)
8683
8684 (n-1) - ...
8685 ( n ) <--- 1. (n) doesn't exist in the db but it would be here.
8686 ( n ) - aaa <--- 2. Seek("n") will put us here on the first index
8687 ( n ) - bbb record.
8688 ( n ) - cc
8689
8690 So, need to do: Seek(n);
8691
8692 A backward-ordered keyspace:
8693
8694 (n+1) - bbb
8695 (n+1) - aaa
8696 (n+1) <--- (n+1) doesn't exist in the db but would be here.
8697 ( n ) - ccc <--- 1. We need to be here.
8698 ( n ) - bbb
8699 ( n ) - aaa
8700 ( n )
8701
8702 So, need to: Seek(n+1);
8703
8704*/
8705
int ha_rocksdb::index_first_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);

  uchar *key;
  uint key_size;
  int rc;

  /* Pick the packed-key buffer that matches the active index type. */
  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  int key_start_matching_bytes = kd.get_first_key(key, &key_size);

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    setup_scan_iterator(kd, &index_key, false, key_start_matching_bytes);
    m_scan_it->Seek(index_key);
    /* Seek() positioned us on the first record; don't advance past it. */
    m_skip_scan_it_next_call = true;

    rc = index_next_with_direction(buf, true);
    if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
      break; // exit the loop

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (!rc) {
    /*
      index_next is always incremented on success, so decrement if it is
      index_first instead
    */
#ifdef MARIAROCKS_NOT_YET
    stats.rows_index_first++;
    stats.rows_index_next--;
#endif
  }

  DBUG_RETURN(rc);
}
8761
8762/**
8763 @details
8764 Start scanning from the "last" value
8765
8766 The 'last' here means "the last from start of the key space".
8767 For reverse-ordered key spaces, we will actually read the smallest value.
8768
  A picture of a forward-ordered keyspace (remember, the keys have the form
  'indexnr-keyval'. Suppose we are at a key that has number n)
8771
8772 (n-1)-something
8773 ( n )-aaa
8774 ( n )-bbb
8775 ( n )-ccc <----------- Need to seek to here.
8776 (n+1) <---- Doesn't exist, but would be here.
8777 (n+1)-smth, or no value at all
8778
8779 RocksDB's Iterator::SeekForPrev($val) seeks to "at $val or last value that's
8780 smaller". We can't seek to "(n)-ccc" directly, because we don't know what
8781 is the value of 'ccc' (the biggest record with prefix (n)). Instead, we seek
8782 to "(n+1)", which is the least possible value that's greater than any value
8783 in index #n.
8784
8785 So, need to: it->SeekForPrev(n+1)
8786
8787 A backward-ordered keyspace:
8788
8789 (n+1)-something
8790 ( n ) - ccc
8791 ( n ) - bbb
8792 ( n ) - aaa <---------------- (*) Need to seek here.
8793 ( n ) <--- Doesn't exist, but would be here.
8794 (n-1)-smth, or no value at all
8795
8796 So, need to: it->SeekForPrev(n)
8797*/
8798
int ha_rocksdb::index_last_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);

  uchar *key;
  uint key_size;
  int rc;

  /* Pick the packed-key buffer that matches the active index type. */
  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  /*
    See the function comment above: for a forward CF we SeekForPrev to the
    supremum key (n+1); for a reverse CF, to the infimum key (n).
  */
  if (kd.m_is_reverse_cf) {
    kd.get_infimum_key(key, &key_size);
  } else {
    kd.get_supremum_key(key, &key_size);
  }

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    setup_scan_iterator(kd, &index_key, false, Rdb_key_def::INDEX_NUMBER_SIZE);
    m_scan_it->SeekForPrev(index_key);
    m_skip_scan_it_next_call = false;

    if (is_pk(active_index, table, m_tbl_def)) {
      m_skip_scan_it_next_call = true;
      rc = rnd_next_with_direction(buf, false);
    } else {
      rc = find_icp_matching_index_rec(false /*move_forward*/, buf);
      if (!rc)
        rc = secondary_index_read(active_index, buf);
    }

    if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
      break; /* exit the loop */

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (!rc) {
    /*
      index_next is always incremented on success, so decrement if it is
      index_first instead
    */
#ifdef MARIAROCKS_NOT_YET
    stats.rows_index_first++;
    stats.rows_index_next--;
#endif
  }

  DBUG_RETURN(rc);
}
8866
8867void ha_rocksdb::unlock_row() {
8868 DBUG_ENTER_FUNC();
8869
8870 if (m_lock_rows != RDB_LOCK_NONE) {
8871 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
8872 tx->release_lock(m_pk_descr->get_cf(),
8873 std::string(m_last_rowkey.ptr(), m_last_rowkey.length()));
8874 }
8875
8876 DBUG_VOID_RETURN;
8877}
8878
8879/*
8880 Returning true if SingleDelete can be used.
8881 - Secondary Indexes can always use SingleDelete.
8882 - If the index is PRIMARY KEY, and if all of the columns of the table
8883 are covered by the PRIMARY KEY, SingleDelete can be used.
8884*/
8885bool ha_rocksdb::can_use_single_delete(const uint &index) const {
8886 return (index != pk_index(table, m_tbl_def) ||
8887 (!has_hidden_pk(table) &&
8888 table->key_info[index].ext_key_parts == table->s->fields));
8889}
8890
bool ha_rocksdb::skip_unique_check() const {
  /*
    We want to skip unique checks if:
    1) bulk_load is on
    2) this table is in the whitelist of tables to skip and the replication
       lag has reached a large enough value (see unique_check_lag_threshold
       and unique_check_lag_reset_threshold)
    3) the user set unique_checks option to 0, and the table does not have
       any indexes. If the table has secondary keys, then those might become
       inconsistent/corrupted
  */
  return THDVAR(table->in_use, bulk_load) ||
         (m_force_skip_unique_check && m_skip_unique_check) ||
         (my_core::thd_test_options(table->in_use,
                                    OPTION_RELAXED_UNIQUE_CHECKS) &&
          m_tbl_def->m_key_count == 1);
}
8908
#ifdef MARIAROCKS_NOT_YET // MDEV-10975
/* Force unique-key checks to be skipped (used by read-free replication). */
void ha_rocksdb::set_force_skip_unique_check(bool skip) {
  DBUG_ENTER_FUNC();

  m_force_skip_unique_check = skip;

  DBUG_VOID_RETURN;
}
#endif
8918
8919bool ha_rocksdb::commit_in_the_middle() {
8920 return THDVAR(table->in_use, bulk_load) ||
8921 THDVAR(table->in_use, commit_in_the_middle);
8922}
8923
8924/*
8925 Executing bulk commit if it should.
8926 @retval true if bulk commit failed
8927 @retval false if bulk commit was skipped or succeeded
8928*/
8929bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) {
8930 DBUG_ASSERT(tx != nullptr);
8931 return commit_in_the_middle() &&
8932 tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) &&
8933 tx->flush_batch();
8934}
8935
8936/*
8937 If table was created without primary key, SQL layer represents the primary
8938 key number as MAX_INDEXES. Hence, this function returns true if the table
8939 does not contain a primary key. (In which case we generate a hidden
8940 'auto-incremented' pk.)
8941*/
8942bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const {
8943 DBUG_ASSERT(table != nullptr);
8944 return Rdb_key_def::table_has_hidden_pk(table);
8945}
8946
8947/*
8948 Returns true if given index number is a hidden_pk.
8949 - This is used when a table is created with no primary key.
8950*/
8951bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg,
8952 const Rdb_tbl_def *const tbl_def_arg) {
8953 DBUG_ASSERT(table_arg != nullptr);
8954 DBUG_ASSERT(table_arg->s != nullptr);
8955 DBUG_ASSERT(tbl_def_arg != nullptr);
8956
8957 return (table_arg->s->primary_key == MAX_INDEXES &&
8958 index == tbl_def_arg->m_key_count - 1);
8959}
8960
8961/* Returns index of primary key */
8962uint ha_rocksdb::pk_index(const TABLE *const table_arg,
8963 const Rdb_tbl_def *const tbl_def_arg) {
8964 DBUG_ASSERT(table_arg != nullptr);
8965 DBUG_ASSERT(table_arg->s != nullptr);
8966 DBUG_ASSERT(tbl_def_arg != nullptr);
8967
8968 return table_arg->s->primary_key == MAX_INDEXES ? tbl_def_arg->m_key_count - 1
8969 : table_arg->s->primary_key;
8970}
8971
8972/* Returns true if given index number is a primary key */
8973bool ha_rocksdb::is_pk(const uint index, const TABLE *const table_arg,
8974 const Rdb_tbl_def *const tbl_def_arg) {
8975 DBUG_ASSERT(table_arg != nullptr);
8976 DBUG_ASSERT(table_arg->s != nullptr);
8977 DBUG_ASSERT(tbl_def_arg != nullptr);
8978
8979 return index == table_arg->s->primary_key ||
8980 is_hidden_pk(index, table_arg, tbl_def_arg);
8981}
8982
8983uint ha_rocksdb::max_supported_key_part_length() const {
8984 DBUG_ENTER_FUNC();
8985 DBUG_RETURN(rocksdb_large_prefix ? MAX_INDEX_COL_LEN_LARGE
8986 : MAX_INDEX_COL_LEN_SMALL);
8987}
8988
8989const char *ha_rocksdb::get_key_name(const uint index,
8990 const TABLE *const table_arg,
8991 const Rdb_tbl_def *const tbl_def_arg) {
8992 DBUG_ASSERT(table_arg != nullptr);
8993 DBUG_ASSERT(tbl_def_arg != nullptr);
8994
8995 if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
8996 return HIDDEN_PK_NAME;
8997 }
8998
8999 DBUG_ASSERT(table_arg->key_info != nullptr);
9000 DBUG_ASSERT(table_arg->key_info[index].name.str != nullptr);
9001
9002 return table_arg->key_info[index].name.str;
9003}
9004
9005const char *ha_rocksdb::get_key_comment(const uint index,
9006 const TABLE *const table_arg,
9007 const Rdb_tbl_def *const tbl_def_arg) {
9008 DBUG_ASSERT(table_arg != nullptr);
9009 DBUG_ASSERT(tbl_def_arg != nullptr);
9010
9011 if (is_hidden_pk(index, table_arg, tbl_def_arg)) {
9012 return nullptr;
9013 }
9014
9015 DBUG_ASSERT(table_arg->key_info != nullptr);
9016
9017 return table_arg->key_info[index].comment.str;
9018}
9019
/*
  Derive the column family name for the given index from its comment.
  Returns "" to mean "use the default column family".
  *per_part_match_found is set to true when a partition-specific CF
  qualifier matched.
*/
const std::string ha_rocksdb::generate_cf_name(const uint index,
                                               const TABLE *const table_arg,
                                               const Rdb_tbl_def *const tbl_def_arg,
                                               bool *per_part_match_found) {
  DBUG_ASSERT(table_arg != nullptr);
  DBUG_ASSERT(tbl_def_arg != nullptr);
  DBUG_ASSERT(per_part_match_found != nullptr);

  // When creating CF-s the caller needs to know if there was a custom CF name
  // specified for a given partition.
  *per_part_match_found = false;

  // Index comment is used to define the column family name specification(s).
  // If there was no comment, we get an empty string, and it means "use the
  // default column family".
  const char *const comment = get_key_comment(index, table_arg, tbl_def_arg);

  // `get_key_comment` can return `nullptr`, that's why this.
  std::string key_comment = comment ? comment : "";

  std::string cf_name = Rdb_key_def::parse_comment_for_qualifier(
      key_comment, table_arg, tbl_def_arg, per_part_match_found,
      RDB_CF_NAME_QUALIFIER);

  if (IF_PARTITIONING(table_arg->part_info,nullptr) != nullptr && !*per_part_match_found) {
    // At this point we tried to search for a custom CF name for a partition,
    // but none was specified. Therefore default one will be used.
    return "";
  }

  // If we didn't find any partitioned/non-partitioned qualifiers, return the
  // comment itself. NOTE: this currently handles returning the cf name
  // specified in the index comment in the case of no partitions, which doesn't
  // use any qualifiers at the moment. (aka its a special case)
  if (cf_name.empty() && !key_comment.empty()) {
    return key_comment;
  }

  return cf_name;
}
9060
/* The table-level COMMENT string, as stored in the table share. */
const std::string ha_rocksdb::get_table_comment(const TABLE *const table_arg) {
  DBUG_ASSERT(table_arg != nullptr);
  DBUG_ASSERT(table_arg->s != nullptr);

  return table_arg->s->comment.str;
}
9067
9068/**
9069 @return
9070 HA_EXIT_SUCCESS OK
9071 other HA_ERR error code (can be SE-specific)
9072*/
9073int ha_rocksdb::write_row(uchar *const buf) {
9074 DBUG_ENTER_FUNC();
9075
9076 DBUG_ASSERT(buf != nullptr);
9077 DBUG_ASSERT(buf == table->record[0]);
9078 DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);
9079
9080 /*
9081 Note: "buf == table->record[0]" is copied from innodb. I am not aware of
9082 any use cases where this condition is not true.
9083 */
9084 if (table->next_number_field && buf == table->record[0]) {
9085 int err;
9086 if ((err = update_auto_increment())) {
9087 DBUG_RETURN(err);
9088 }
9089 }
9090
9091 const int rv = update_write_row(nullptr, buf, skip_unique_check());
9092
9093 if (rv == 0) {
9094#ifdef MARIAROCKS_NOT_YET
9095 stats.rows_inserted++;
9096#endif
9097 update_row_stats(ROWS_INSERTED);
9098 }
9099
9100 DBUG_RETURN(rv);
9101}
9102
9103/**
9104 Constructing m_last_rowkey (MyRocks key expression) from
9105 before_update|delete image (MySQL row expression).
9106 m_last_rowkey is normally set during lookup phase, such as
9107 rnd_next_with_direction() and rnd_pos(). With Read Free Replication,
9108 these read functions are skipped and update_rows(), delete_rows() are
9109 called without setting m_last_rowkey. This function sets m_last_rowkey
9110 for Read Free Replication.
9111*/
void ha_rocksdb::set_last_rowkey(const uchar *const old_data) {
/* Entire body is compiled out in MariaDB: Read Free Replication is not
   ported yet (MARIAROCKS_NOT_YET), so this is currently a no-op. */
#ifdef MARIAROCKS_NOT_YET
  if (old_data && use_read_free_rpl()) {
    const int old_pk_size = m_pk_descr->pack_record(
        table, m_pack_buffer, old_data, m_pk_packed_tuple, nullptr, false);
    m_last_rowkey.copy((const char *)m_pk_packed_tuple, old_pk_size,
                       &my_charset_bin);
  }
#endif
}
9122
/*
  Compute the packed primary-key value (row_info->new_pk_slice) for an
  insert or update, filling in row_info->hidden_pk_id as needed.
  Returns HA_EXIT_SUCCESS or an HA_ERR error code.
*/
int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) {
  int size;

  /*
    Get new row key for any insert, and any update where the pk is not hidden.
    Row key for updates with hidden pk is handled below.
  */
  if (!has_hidden_pk(table)) {
    row_info->hidden_pk_id = 0;

    row_info->new_pk_unpack_info = &m_pk_unpack_info;

    size =
        m_pk_descr->pack_record(table, m_pack_buffer, row_info->new_data,
                                m_pk_packed_tuple, row_info->new_pk_unpack_info,
                                false, 0, 0, nullptr, &row_info->ttl_pk_offset);
  } else if (row_info->old_data == nullptr) {
    /* Insert into a hidden-pk table: allocate the next hidden pk value. */
    row_info->hidden_pk_id = update_hidden_pk_val();
    size =
        m_pk_descr->pack_hidden_pk(row_info->hidden_pk_id, m_pk_packed_tuple);
  } else {
    /*
      If hidden primary key, rowkey for new record will always be the same as
      before
    */
    size = row_info->old_pk_slice.size();
    memcpy(m_pk_packed_tuple, row_info->old_pk_slice.data(), size);
    int err = read_hidden_pk_id_from_rowkey(&row_info->hidden_pk_id);
    if (err) {
      return err;
    }
  }

  row_info->new_pk_slice =
      rocksdb::Slice((const char *)m_pk_packed_tuple, size);

  return HA_EXIT_SUCCESS;
}
9161
/*
  Lock the new primary-key value and check whether a row with that key
  already exists.

  @param[out] found       true if a row with the new pk already exists
  @param[out] pk_changed  true if this is an UPDATE that changes the pk
  @return HA_EXIT_SUCCESS or an HA_ERR error code.
  Side effect: on success the existing row (if any) is left in
  m_retrieved_record for the caller's TTL check.
*/
int ha_rocksdb::check_and_lock_unique_pk(const uint &key_id,
                                         const struct update_row_info &row_info,
                                         bool *const found,
                                         bool *const pk_changed) {
  DBUG_ASSERT(found != nullptr);
  DBUG_ASSERT(pk_changed != nullptr);

  *pk_changed = false;

  /*
    For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
    always require locking.
  */
  if (row_info.old_pk_slice.size() > 0) {
    /*
      If the keys are the same, then no lock is needed
    */
    if (!Rdb_pk_comparator::bytewise_compare(row_info.new_pk_slice,
                                             row_info.old_pk_slice)) {
      *found = false;
      return HA_EXIT_SUCCESS;
    }

    *pk_changed = true;
  }

  /*
    Perform a read to determine if a duplicate entry exists. For primary
    keys, a point lookup will be sufficient.

    note: we intentionally don't set options.snapshot here. We want to read
    the latest committed data.
  */

  /*
    To prevent race conditions like below, it is necessary to
    take a lock for a target row. get_for_update() holds a gap lock if
    target key does not exist, so below conditions should never
    happen.

    1) T1 Get(empty) -> T2 Get(empty) -> T1 Put(insert) -> T1 commit
       -> T2 Put(overwrite) -> T2 commit
    2) T1 Get(empty) -> T1 Put(insert, not committed yet) -> T2 Get(empty)
       -> T2 Put(insert, blocked) -> T1 commit -> T2 commit(overwrite)
  */
  const rocksdb::Status s =
      get_for_update(row_info.tx, m_pk_descr->get_cf(), row_info.new_pk_slice,
                     &m_retrieved_record);
  if (!s.ok() && !s.IsNotFound()) {
    return row_info.tx->set_status_error(
        table->in_use, s, *m_key_descr_arr[key_id], m_tbl_def, m_table_handler);
  }

  *found = !s.IsNotFound();
  return HA_EXIT_SUCCESS;
}
9218
/*
  For a unique secondary index, lock the new key value and check whether a
  duplicate entry exists.

  @param[out] found  true if another row with the same unique key exists
  @return HA_EXIT_SUCCESS or an HA_ERR error code.
*/
int ha_rocksdb::check_and_lock_sk(const uint &key_id,
                                  const struct update_row_info &row_info,
                                  bool *const found) {
  DBUG_ASSERT(found != nullptr);
  *found = false;

  /*
    Can skip checking this key if none of the key fields have changed.
  */
  if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id)) {
    return HA_EXIT_SUCCESS;
  }

  KEY *key_info = nullptr;
  uint n_null_fields = 0;
  uint user_defined_key_parts = 1;

  key_info = &table->key_info[key_id];
  user_defined_key_parts = key_info->user_defined_key_parts;
  /*
    If there are no uniqueness requirements, there's no need to obtain a
    lock for this key.
  */
  if (!(key_info->flags & HA_NOSAME)) {
    return HA_EXIT_SUCCESS;
  }

  const Rdb_key_def &kd = *m_key_descr_arr[key_id];

  /*
    Calculate the new key for obtaining the lock

    For unique secondary indexes, the key used for locking does not
    include the extended fields.
  */
  int size =
      kd.pack_record(table, m_pack_buffer, row_info.new_data, m_sk_packed_tuple,
                     nullptr, false, 0, user_defined_key_parts, &n_null_fields);
  if (n_null_fields > 0) {
    /*
      If any fields are marked as NULL this will never match another row as
      to NULL never matches anything else including another NULL.
    */
    return HA_EXIT_SUCCESS;
  }

  const rocksdb::Slice new_slice =
      rocksdb::Slice((const char *)m_sk_packed_tuple, size);

  /*
    For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
    always require locking.
  */
  if (row_info.old_data != nullptr) {
    size = kd.pack_record(table, m_pack_buffer, row_info.old_data,
                          m_sk_packed_tuple_old, nullptr, false, 0,
                          user_defined_key_parts);
    const rocksdb::Slice old_slice =
        rocksdb::Slice((const char *)m_sk_packed_tuple_old, size);

    /*
      For updates, if the keys are the same, then no lock is needed

      Also check to see if the key has any fields set to NULL. If it does, then
      this key is unique since NULL is not equal to each other, so no lock is
      needed.
    */
    if (!Rdb_pk_comparator::bytewise_compare(new_slice, old_slice)) {
      return HA_EXIT_SUCCESS;
    }
  }

  /*
    Perform a read to determine if a duplicate entry exists - since this is
    a secondary indexes a range scan is needed.

    note: we intentionally don't set options.snapshot here. We want to read
    the latest committed data.
  */

  const bool all_parts_used = (user_defined_key_parts == kd.get_key_parts());

  /*
    This iterator seems expensive since we need to allocate and free
    memory for each unique index.

    If this needs to be optimized, for keys without NULL fields, the
    extended primary key fields can be migrated to the value portion of the
    key. This enables using Get() instead of Seek() as in the primary key
    case.

    The bloom filter may need to be disabled for this lookup.
  */
  uchar min_bound_buf[MAX_KEY_LENGTH];
  uchar max_bound_buf[MAX_KEY_LENGTH];
  rocksdb::Slice min_bound_slice;
  rocksdb::Slice max_bound_slice;
  const bool total_order_seek = !check_bloom_and_set_bounds(
      ha_thd(), kd, new_slice, all_parts_used,
      min_bound_buf,
      max_bound_buf,
      &min_bound_slice,
      &max_bound_slice);
  const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);

  /* Take the lock on the key first, before scanning for duplicates. */
  const rocksdb::Status s =
      get_for_update(row_info.tx, kd.get_cf(), new_slice, nullptr);
  if (!s.ok() && !s.IsNotFound()) {
    return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
                                         m_table_handler);
  }

  rocksdb::Iterator *const iter = row_info.tx->get_iterator(
      kd.get_cf(), total_order_seek, fill_cache,
      min_bound_slice, max_bound_slice,
      true /* read current data */,
      false /* acquire snapshot */);
  /*
    Need to scan the transaction to see if there is a duplicate key.
    Also need to scan RocksDB and verify the key has not been deleted
    in the transaction.
  */
  iter->Seek(new_slice);
  *found = !read_key_exact(kd, iter, all_parts_used, new_slice,
                           row_info.tx->m_snapshot_timestamp);
  delete iter;

  return HA_EXIT_SUCCESS;
}
9348
/**
  Check the uniqueness constraints of every unique index of the table for the
  row described by row_info, taking row locks on the new key values so that
  no concurrent writer can insert a duplicate between this check and the
  actual write.

  @param row_info    IN   describes the row being inserted/updated
  @param pk_changed  OUT  set by check_and_lock_unique_pk(): whether the new
                          row's PK differs from the old row's PK

  @return HA_EXIT_SUCCESS, HA_ERR_FOUND_DUPP_KEY on a duplicate, or another
          error code propagated from the per-index check.
*/
int ha_rocksdb::check_uniqueness_and_lock(
    const struct update_row_info &row_info, bool *const pk_changed) {
  /*
    Go through each index and determine if the index has uniqueness
    requirements. If it does, then try to obtain a row lock on the new values.
    Once all locks have been obtained, then perform the changes needed to
    update/insert the row.
  */
  for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
    bool found;
    int rc;

    if (is_pk(key_id, table, m_tbl_def)) {
      rc = check_and_lock_unique_pk(key_id, row_info, &found, pk_changed);
    } else {
      rc = check_and_lock_sk(key_id, row_info, &found);
    }

    if (rc != HA_EXIT_SUCCESS) {
      return rc;
    }

    /*
      If the pk key has ttl, we may need to pretend the row wasn't
      found if it is already expired. The pk record is read into
      m_retrieved_record by check_and_lock_unique_pk().

      When the transaction has no snapshot yet, fall back to the current
      wall-clock time for the expiry check.
    */
    if (is_pk(key_id, table, m_tbl_def) && found && m_pk_descr->has_ttl() &&
        should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
                            (row_info.tx->m_snapshot_timestamp
                                 ? row_info.tx->m_snapshot_timestamp
                                 : static_cast<int64_t>(std::time(nullptr))))) {
      found = false;
    }

    if (found) {
      /* There is a row with this key already, so error out. */
      errkey = key_id;
      m_dupp_errkey = errkey;
      return HA_ERR_FOUND_DUPP_KEY;
    }
  }

  return HA_EXIT_SUCCESS;
}
9394
9395int ha_rocksdb::check_duplicate_sk(const TABLE *table_arg,
9396 const Rdb_key_def &index,
9397 const rocksdb::Slice *key,
9398 struct unique_sk_buf_info *sk_info) {
9399 uint n_null_fields = 0;
9400 const rocksdb::Comparator *index_comp = index.get_cf()->GetComparator();
9401
9402 /* Get proper SK buffer. */
9403 uchar *sk_buf = sk_info->swap_and_get_sk_buf();
9404
9405 /* Get memcmp form of sk without extended pk tail */
9406 uint sk_memcmp_size =
9407 index.get_memcmp_sk_parts(table_arg, *key, sk_buf, &n_null_fields);
9408
9409 sk_info->sk_memcmp_key =
9410 rocksdb::Slice(reinterpret_cast<char *>(sk_buf), sk_memcmp_size);
9411
9412 if (sk_info->sk_memcmp_key_old.size() > 0 && n_null_fields == 0 &&
9413 index_comp->Compare(sk_info->sk_memcmp_key, sk_info->sk_memcmp_key_old) ==
9414 0) {
9415 return 1;
9416 }
9417
9418 sk_info->sk_memcmp_key_old = sk_info->sk_memcmp_key;
9419 return 0;
9420}
9421
/**
  Add one key/value pair to an ongoing bulk load.

  @param tx     transaction the bulk load belongs to
  @param kd     index definition the key belongs to
  @param key    mem-comparable key bytes
  @param value  value bytes
  @param sort   true: buffer the pair for later sort-merge (unsorted input);
                false: write it straight to the SST writer, which requires
                the caller to supply keys in index order

  @return HA_EXIT_SUCCESS or an error from the SST/merge machinery
*/
int ha_rocksdb::bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd,
                              const rocksdb::Slice &key,
                              const rocksdb::Slice &value, bool sort) {
  DBUG_ENTER_FUNC();
  int res;
  rocksdb::ColumnFamilyHandle *cf = kd.get_cf();

  // In the case of unsorted inserts, m_sst_info allocated here is not
  // used to store the keys. It is still used to indicate when tables
  // are switched.
  if (m_sst_info == nullptr || m_sst_info->is_committed()) {
    m_sst_info.reset(new Rdb_sst_info(rdb, m_table_handler->m_table_name,
                                      kd.get_name(), cf, *rocksdb_db_options,
                                      THDVAR(ha_thd(), trace_sst_api)));
    // Register this handler's bulk load with the transaction so it can be
    // finalized/committed later.
    res = tx->start_bulk_load(this, m_sst_info);
    if (res != HA_EXIT_SUCCESS) {
      DBUG_RETURN(res);
    }
  }
  DBUG_ASSERT(m_sst_info);

  if (sort) {
    // Unsorted input: accumulate into the per-index merge structure;
    // the data is written out when the bulk load is finalized.
    Rdb_index_merge *key_merge;
    DBUG_ASSERT(cf != nullptr);

    res = tx->get_key_merge(kd.get_gl_index_id(), cf, &key_merge);
    if (res == HA_EXIT_SUCCESS) {
      res = key_merge->add(key, value);
    }
  } else {
    // Sorted input: append directly to the SST file writer.
    res = m_sst_info->put(key, value);
  }

  DBUG_RETURN(res);
}
9457
9458int ha_rocksdb::finalize_bulk_load(bool print_client_error) {
9459 DBUG_ENTER_FUNC();
9460
9461 int res = HA_EXIT_SUCCESS;
9462
9463 /* Skip if there are no possible ongoing bulk loads */
9464 if (m_sst_info) {
9465 res = m_sst_info->commit(print_client_error);
9466 m_sst_info.reset();
9467 }
9468 DBUG_RETURN(res);
9469}
9470
/**
  Write the primary-key record for an INSERT or UPDATE, deleting the old PK
  record first when necessary.

  @param kd          primary key definition
  @param row_info    old/new row images, pre-packed PK slices and transaction
  @param pk_changed  whether the new row's PK differs from the old row's

  @return HA_EXIT_SUCCESS, HA_ERR_FOUND_DUPP_KEY, or another HA_ERR code
*/
int ha_rocksdb::update_pk(const Rdb_key_def &kd,
                          const struct update_row_info &row_info,
                          const bool &pk_changed) {
  const uint key_id = kd.get_keyno();
  const bool hidden_pk = is_hidden_pk(key_id, table, m_tbl_def);
  ulonglong bytes_written = 0;

  /*
    If the PK has changed, or if this PK uses single deletes and this is an
    update, the old key needs to be deleted. In the single delete case, it
    might be possible to have this sequence of keys: PUT(X), PUT(X), SD(X),
    resulting in the first PUT(X) showing up.
  */
  if (!hidden_pk && (pk_changed || ((row_info.old_pk_slice.size() > 0) &&
                                    can_use_single_delete(key_id)))) {
    const rocksdb::Status s = delete_or_singledelete(
        key_id, row_info.tx, kd.get_cf(), row_info.old_pk_slice);
    if (!s.ok()) {
      return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
                                           m_table_handler);
    } else {
      bytes_written = row_info.old_pk_slice.size();
    }
  }

  // Keep the table's auto-increment counter in sync with the value written.
  if (table->next_number_field) {
    update_auto_incr_val_from_field();
  }

  int rc = HA_EXIT_SUCCESS;
  rocksdb::Slice value_slice;
  /* Prepare the new record to be written into RocksDB */
  if ((rc = convert_record_to_storage_format(row_info, &value_slice))) {
    return rc;
  }

  const auto cf = m_pk_descr->get_cf();
  if (rocksdb_enable_bulk_load_api && THDVAR(table->in_use, bulk_load) &&
      !hidden_pk) {
    /*
      Write the primary key directly to an SST file using an SstFileWriter
    */
    rc = bulk_load_key(row_info.tx, kd, row_info.new_pk_slice, value_slice,
                       THDVAR(table->in_use, bulk_load_allow_unsorted));
  } else if (row_info.skip_unique_check || row_info.tx->m_ddl_transaction) {
    /*
      It is responsibility of the user to make sure that the data being
      inserted doesn't violate any unique keys.
    */
    row_info.tx->get_indexed_write_batch()->Put(cf, row_info.new_pk_slice,
                                                value_slice);
  } else {
    // Regular path: the transactional put can detect conflicts; IsBusy()
    // is translated into a duplicate-key error on the primary key.
    const auto s = row_info.tx->put(cf, row_info.new_pk_slice, value_slice);
    if (!s.ok()) {
      if (s.IsBusy()) {
        errkey = table->s->primary_key;
        m_dupp_errkey = errkey;
        rc = HA_ERR_FOUND_DUPP_KEY;
      } else {
        rc = row_info.tx->set_status_error(table->in_use, s, *m_pk_descr,
                                           m_tbl_def, m_table_handler);
      }
    }
  }

  if (rc == HA_EXIT_SUCCESS) {
    row_info.tx->update_bytes_written(
        bytes_written + row_info.new_pk_slice.size() + value_slice.size());
  }
  return rc;
}
9542
/**
  Write the secondary-index entry for a row, deleting the old entry first
  when this is an UPDATE. Skips the write entirely when neither the key
  fields nor (for TTL tables) the TTL timestamp changed.

  @param table_arg  table the key belongs to
  @param kd         secondary key definition
  @param row_info   old/new row images, hidden PK id and transaction

  @return HA_EXIT_SUCCESS (all writes go through the write batch and are
          reported at commit time)
*/
int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd,
                          const struct update_row_info &row_info) {
  int new_packed_size;
  int old_packed_size;

  rocksdb::Slice new_key_slice;
  rocksdb::Slice new_value_slice;
  rocksdb::Slice old_key_slice;

  const uint key_id = kd.get_keyno();

  ulonglong bytes_written = 0;

  /*
    Can skip updating this key if none of the key fields have changed and, if
    this table has TTL, the TTL timestamp has not changed.
  */
  if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id) &&
      (!kd.has_ttl() || !m_ttl_bytes_updated)) {
    return HA_EXIT_SUCCESS;
  }

  const bool store_row_debug_checksums = should_store_row_debug_checksums();

  // Pack the new key into m_sk_packed_tuple / m_sk_tails.
  new_packed_size =
      kd.pack_record(table_arg, m_pack_buffer, row_info.new_data,
                     m_sk_packed_tuple, &m_sk_tails, store_row_debug_checksums,
                     row_info.hidden_pk_id, 0, nullptr, nullptr, m_ttl_bytes);

  if (row_info.old_data != nullptr) {
    // The old value
    old_packed_size = kd.pack_record(
        table_arg, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old,
        &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0,
        nullptr, nullptr, m_ttl_bytes);

    /*
      Check if we are going to write the same value. This can happen when
      one does
        UPDATE tbl SET col='foo'
      and we are looking at the row that already has col='foo'.

      We also need to compare the unpack info. Suppose, the collation is
      case-insensitive, and unpack info contains information about whether
      the letters were uppercase and lowercase.  Then, both 'foo' and 'FOO'
      will have the same key value, but different data in unpack_info.

      (note: anyone changing bytewise_compare should take this code into
      account)
    */
    if (old_packed_size == new_packed_size &&
        m_sk_tails_old.get_current_pos() == m_sk_tails.get_current_pos() &&
        !(kd.has_ttl() && m_ttl_bytes_updated) &&
        memcmp(m_sk_packed_tuple_old, m_sk_packed_tuple, old_packed_size) ==
            0 &&
        memcmp(m_sk_tails_old.ptr(), m_sk_tails.ptr(),
               m_sk_tails.get_current_pos()) == 0) {
      return HA_EXIT_SUCCESS;
    }

    /*
      Deleting entries from secondary index should skip locking, but
      be visible to the transaction.
      (also note that DDL statements do not delete rows, so this is not a DDL
      statement)
    */
    old_key_slice = rocksdb::Slice(
        reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size);

    row_info.tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
                                                         old_key_slice);

    bytes_written = old_key_slice.size();
  }

  // Write the new index entry (key + unpack info as the value).
  new_key_slice = rocksdb::Slice(
      reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
  new_value_slice =
      rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
                     m_sk_tails.get_current_pos());

  row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice,
                                              new_value_slice);

  row_info.tx->update_bytes_written(bytes_written + new_key_slice.size() +
                                    new_value_slice.size());

  return HA_EXIT_SUCCESS;
}
9632
9633int ha_rocksdb::update_indexes(const struct update_row_info &row_info,
9634 const bool &pk_changed) {
9635 int rc;
9636
9637 // The PK must be updated first to pull out the TTL value.
9638 rc = update_pk(*m_pk_descr, row_info, pk_changed);
9639 if (rc != HA_EXIT_SUCCESS) {
9640 return rc;
9641 }
9642
9643 // Update the remaining indexes.
9644 for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) {
9645 if (is_pk(key_id, table, m_tbl_def)) {
9646 continue;
9647 }
9648
9649 rc = update_sk(table, *m_key_descr_arr[key_id], row_info);
9650 if (rc != HA_EXIT_SUCCESS) {
9651 return rc;
9652 }
9653 }
9654
9655 return HA_EXIT_SUCCESS;
9656}
9657
/**
  Common implementation for INSERT (old_data == nullptr) and UPDATE
  (old_data != nullptr): build the row key(s), check/lock unique keys,
  then write all indexes through the transaction.

  @param old_data           old row image, or nullptr for an INSERT
  @param new_data           new row image
  @param skip_unique_check  skip the uniqueness checks/locks entirely

  @return HA_EXIT_SUCCESS or an HA_ERR error code
*/
int ha_rocksdb::update_write_row(const uchar *const old_data,
                                 const uchar *const new_data,
                                 const bool skip_unique_check) {
  DBUG_ENTER_FUNC();

  bool pk_changed = false;
  struct update_row_info row_info;

  row_info.old_data = old_data;
  row_info.new_data = new_data;
  row_info.skip_unique_check = skip_unique_check;
  row_info.new_pk_unpack_info = nullptr;

  set_last_rowkey(old_data);

  row_info.tx = get_or_create_tx(table->in_use);

  if (old_data != nullptr) {
    // For an UPDATE the old PK is whatever row was last read.
    row_info.old_pk_slice =
        rocksdb::Slice(m_last_rowkey.ptr(), m_last_rowkey.length());

    /* Determine which indexes need updating. */
    calc_updated_indexes();
  }

  /*
    Get the new row key into row_info.new_pk_slice
  */
  int rc = get_pk_for_update(&row_info);
  if (rc != HA_EXIT_SUCCESS) {
    DBUG_RETURN(rc);
  }

  if (!skip_unique_check) {
    /*
      Check to see if we are going to have failures because of unique
      keys.  Also lock the appropriate key values.
    */
    rc = check_uniqueness_and_lock(row_info, &pk_changed);
    if (rc != HA_EXIT_SUCCESS) {
      DBUG_RETURN(rc);
    }
  }

  DEBUG_SYNC(ha_thd(), "rocksdb.update_write_row_after_unique_check");

  /*
    At this point, all locks have been obtained, and all checks for duplicate
    keys have been performed. No further errors can be allowed to occur from
    here because updates to the transaction will be made and those updates
    cannot be easily removed without rolling back the entire transaction.
  */
  rc = update_indexes(row_info, pk_changed);
  if (rc != HA_EXIT_SUCCESS) {
    DBUG_RETURN(rc);
  }

  // Track statement counters for commit-in-the-middle bookkeeping.
  if (old_data != nullptr) {
    row_info.tx->incr_update_count();
  } else {
    row_info.tx->incr_insert_count();
  }

  if (do_bulk_commit(row_info.tx)) {
    DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
9727
9728/*
9729 Setting iterator upper/lower bounds for Seek/SeekForPrev.
9730 This makes RocksDB to avoid scanning tombstones outside of
9731 the given key ranges, when prefix_same_as_start=true was not passed
9732 (when prefix bloom filter can not be used).
9733 Inversing upper/lower bound is necessary on reverse order CF.
9734 This covers HA_READ_PREFIX_LAST* case as well. For example,
9735 if given query eq condition was 12 bytes and condition was
9736 0x0000b3eb003f65c5e78858b8, and if doing HA_READ_PREFIX_LAST,
9737 eq_cond_len was 11 (see calc_eq_cond_len() for details).
9738 If the index was reverse order, upper bound would be
9739 0x0000b3eb003f65c5e78857, and lower bound would be
9740 0x0000b3eb003f65c5e78859. These cover given eq condition range.
9741
  @param kd              IN  Key definition (used for successor/predecessor
                             computation and reverse-CF detection)
  @param eq_cond         IN  Equal-condition prefix the bounds are built from
  @param lower_bound_buf IN  Buffer backing the lower bound
  @param upper_bound_buf IN  Buffer backing the upper bound
  @param out_lower_bound OUT Lower bound slice (points into one of the buffers)
  @param out_upper_bound OUT Upper bound slice (points into one of the buffers)
*/
9747void ha_rocksdb::setup_iterator_bounds(const Rdb_key_def &kd,
9748 const rocksdb::Slice &eq_cond,
9749 uchar *lower_bound_buf,
9750 uchar *upper_bound_buf,
9751 rocksdb::Slice *out_lower_bound,
9752 rocksdb::Slice *out_upper_bound) {
9753 uint eq_cond_len = eq_cond.size();
9754 memcpy(upper_bound_buf, eq_cond.data(), eq_cond_len);
9755 kd.successor(upper_bound_buf, eq_cond_len);
9756 memcpy(lower_bound_buf, eq_cond.data(), eq_cond_len);
9757 kd.predecessor(lower_bound_buf, eq_cond_len);
9758
9759 if (kd.m_is_reverse_cf) {
9760 *out_upper_bound =
9761 rocksdb::Slice((const char *)lower_bound_buf, eq_cond_len);
9762 *out_lower_bound =
9763 rocksdb::Slice((const char *)upper_bound_buf, eq_cond_len);
9764 } else {
9765 *out_upper_bound =
9766 rocksdb::Slice((const char *)upper_bound_buf, eq_cond_len);
9767 *out_lower_bound =
9768 rocksdb::Slice((const char *)lower_bound_buf, eq_cond_len);
9769 }
9770}
9771
9772/*
9773 Open a cursor
9774*/
9775
/**
  Open (or reuse) the scan iterator for the given key definition.

  @param kd           index being scanned
  @param slice        IN  lookup key; must be at least eq_cond_len bytes
  @param use_all_keys whether the whole key is specified (affects whether the
                      prefix bloom filter can be used)
  @param eq_cond_len  length of the equal-condition prefix within *slice
*/
void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd,
                                     rocksdb::Slice *const slice,
                                     const bool use_all_keys,
                                     const uint eq_cond_len) {
  DBUG_ASSERT(slice != nullptr);
  DBUG_ASSERT(slice->size() >= eq_cond_len);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);

  bool skip_bloom = true;

  // Decide whether the prefix bloom filter can be used for this condition,
  // and set the iterator bounds in either case.
  const rocksdb::Slice eq_cond(slice->data(), eq_cond_len);
  if (check_bloom_and_set_bounds(ha_thd(), kd, eq_cond, use_all_keys,
                                 m_eq_cond_lower_bound,
                                 m_eq_cond_upper_bound,
                                 &m_eq_cond_lower_bound_slice,
                                 &m_eq_cond_upper_bound_slice)) {
    skip_bloom = false;
  }

  /*
    In some cases, setup_scan_iterator() is called multiple times from
    the same query but the bloom filter can not always be used.
    Example: id2 is VARCHAR(30) and PRIMARY KEY (id1, id2), with the query
      select count(*) from t2 WHERE id1=100 and
        id2 IN ('00000000000000000000', '100');
    Here setup_scan_iterator() is called twice, the first time for
    (id1, id2)=(100, '00000000000000000000') and the second time for
    (100, '100').
    If the prefix bloom filter length is 24 bytes, the bloom filter can be
    used for the first condition but not for the second one.
    If the bloom filter condition changed, it is currently necessary to
    destroy and re-create the iterator.
  */
  if (m_scan_it_skips_bloom != skip_bloom) {
    release_scan_iterator();
  }

  /*
    SQL layer can call rnd_init() multiple times in a row.
    In that case, re-use the iterator, but re-position it at the table start.
  */
  if (!m_scan_it) {
    const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
    if (commit_in_the_middle()) {
      // With commit-in-the-middle, take an explicit DB snapshot that survives
      // intermediate commits of the transaction.
      DBUG_ASSERT(m_scan_it_snapshot == nullptr);
      m_scan_it_snapshot = rdb->GetSnapshot();

      auto read_opts = rocksdb::ReadOptions();
      read_opts.total_order_seek = true;  // TODO: set based on WHERE conditions
      read_opts.snapshot = m_scan_it_snapshot;
      m_scan_it = rdb->NewIterator(read_opts, kd.get_cf());
    } else {
      m_scan_it = tx->get_iterator(kd.get_cf(), skip_bloom, fill_cache,
                                   m_eq_cond_lower_bound_slice,
                                   m_eq_cond_upper_bound_slice);
    }
    m_scan_it_skips_bloom = skip_bloom;
  }
}
9839
9840void ha_rocksdb::release_scan_iterator() {
9841 delete m_scan_it;
9842 m_scan_it = nullptr;
9843
9844 if (m_scan_it_snapshot) {
9845 rdb->ReleaseSnapshot(m_scan_it_snapshot);
9846 m_scan_it_snapshot = nullptr;
9847 }
9848}
9849
9850void ha_rocksdb::setup_iterator_for_rnd_scan() {
9851 uint key_size;
9852
9853 int key_start_matching_bytes = m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size);
9854
9855 rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size);
9856
9857 setup_scan_iterator(*m_pk_descr, &table_key, false,
9858 key_start_matching_bytes);
9859 m_scan_it->Seek(table_key);
9860 m_skip_scan_it_next_call = true;
9861}
9862
9863/**
9864 @return
9865 HA_EXIT_SUCCESS OK
9866 other HA_ERR error code (can be SE-specific)
9867*/
9868int ha_rocksdb::rnd_init(bool scan) {
9869 DBUG_ENTER_FUNC();
9870
9871 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
9872 setup_read_decoders();
9873
9874 if (scan) {
9875 m_rnd_scan_is_new_snapshot = !tx->has_snapshot();
9876 setup_iterator_for_rnd_scan();
9877 } else {
9878 /* We don't need any preparations for rnd_pos() calls. */
9879 }
9880
9881 // If m_lock_rows is on then we will be doing a get_for_update when accessing
9882 // the index, so don't acquire the snapshot right away. Otherwise acquire
9883 // the snapshot immediately.
9884 tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
9885
9886 DBUG_RETURN(HA_EXIT_SUCCESS);
9887}
9888
9889/**
9890 @return
9891 HA_EXIT_SUCCESS OK
9892 other HA_ERR error code (can be SE-specific)
9893*/
9894int ha_rocksdb::rnd_next(uchar *const buf) {
9895 DBUG_ENTER_FUNC();
9896
9897 int rc;
9898 for (;;) {
9899 rc = rnd_next_with_direction(buf, true);
9900 if (rc != HA_ERR_LOCK_DEADLOCK || !m_rnd_scan_is_new_snapshot)
9901 break; /* exit the loop */
9902
9903 // release the snapshot and iterator and then regenerate them
9904 Rdb_transaction *tx = get_or_create_tx(table->in_use);
9905 tx->release_snapshot();
9906 release_scan_iterator();
9907 setup_iterator_for_rnd_scan();
9908 }
9909
9910 m_rnd_scan_is_new_snapshot = false;
9911
9912 if (rc == HA_ERR_KEY_NOT_FOUND)
9913 rc = HA_ERR_END_OF_FILE;
9914
9915 DBUG_RETURN(rc);
9916}
9917
9918/*
9919 See also secondary_index_read().
9920*/
/**
  Move the scan iterator one step (forward or backward), skipping
  TTL-expired rows, and unpack the row it lands on into buf.

  @param buf           OUT row buffer (table->record[0] format)
  @param move_forward  true: Next(); false: Prev()

  @return HA_EXIT_SUCCESS, HA_ERR_END_OF_FILE, or an error from locking /
          record conversion
*/
int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
  DBUG_ENTER_FUNC();

  int rc;

  table->status = STATUS_NOT_FOUND;
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  if (!m_scan_it || !is_valid(m_scan_it)) {
    /*
      We can get here when SQL layer has called

        h->index_init(PRIMARY);
        h->index_read_map(full index tuple, HA_READ_KEY_EXACT);

      In this case, we should return EOF.
    */
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }

  // Loop until we produce a row or run off the end of the table; `continue`
  // below skips TTL-expired or concurrently-deleted rows.
  for (;;) {
    if (m_skip_scan_it_next_call) {
      // The iterator was already positioned by a Seek(); consume the flag.
      m_skip_scan_it_next_call = false;
    } else {
      if (move_forward)
        m_scan_it->Next(); /* this call cannot fail */
      else
        m_scan_it->Prev(); /* this call cannot fail */
    }

    if (!is_valid(m_scan_it)) {
      rc = HA_ERR_END_OF_FILE;
      break;
    }

    /* check if we're out of this table */
    const rocksdb::Slice key = m_scan_it->key();
    if (!m_pk_descr->covers_key(key)) {
      rc = HA_ERR_END_OF_FILE;
      break;
    }

    if (m_lock_rows != RDB_LOCK_NONE) {
      /*
        Lock the row we've just read.

        Now we call get_for_update which will 1) Take a lock and 2) Will fail
        if the row was deleted since the snapshot was taken.
      */
      Rdb_transaction *const tx = get_or_create_tx(table->in_use);
      DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete");

      // Skip rows whose TTL has expired as of the snapshot timestamp.
      if (m_pk_descr->has_ttl() &&
          should_hide_ttl_rec(*m_pk_descr, m_scan_it->value(),
                              tx->m_snapshot_timestamp)) {
        continue;
      }

      const rocksdb::Status s =
          get_for_update(tx, m_pk_descr->get_cf(), key, &m_retrieved_record);
      if (s.IsNotFound() &&
          my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED) {
        // This occurs if we accessed a row, tried to lock it, failed,
        // released and reacquired the snapshot (because of READ COMMITTED
        // mode) and the row was deleted by someone else in the meantime.
        // If so, we just want to move on to the next row.
        continue;
      }

      if (!s.ok()) {
        DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr,
                                         m_tbl_def, m_table_handler));
      }

      // If we called get_for_update() use the value from that call not from
      // the iterator as it may be stale since we don't have a snapshot
      // when m_lock_rows is not RDB_LOCK_NONE.
      m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
      rc = convert_record_from_storage_format(&key, buf);
    } else {
      // Use the value from the iterator
      rocksdb::Slice value = m_scan_it->value();

      // Skip rows whose TTL has expired as of the snapshot timestamp.
      if (m_pk_descr->has_ttl() &&
          should_hide_ttl_rec(
              *m_pk_descr, value,
              get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
        continue;
      }

      m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
      rc = convert_record_from_storage_format(&key, &value, buf);
    }

    table->status = 0;
    break;
  }

  if (!rc) {
#ifdef MARIAROCKS_NOT_YET
    stats.rows_read++;
    stats.rows_index_next++;
#endif
    update_row_stats(ROWS_READ);
  }

  DBUG_RETURN(rc);
}
10030
10031int ha_rocksdb::rnd_end() {
10032 DBUG_ENTER_FUNC();
10033
10034 release_scan_iterator();
10035
10036 DBUG_RETURN(HA_EXIT_SUCCESS);
10037}
10038
10039/**
10040 @return
10041 HA_EXIT_SUCCESS OK
10042 other HA_ERR error code (can be SE-specific)
10043*/
10044int ha_rocksdb::index_init(uint idx, bool sorted) {
10045 DBUG_ENTER_FUNC();
10046
10047 Rdb_transaction *const tx = get_or_create_tx(table->in_use);
10048 DBUG_ASSERT(tx != nullptr);
10049
10050 setup_read_decoders();
10051 m_start_range= NULL;
10052
10053 if (!m_keyread_only) {
10054 m_key_descr_arr[idx]->get_lookup_bitmap(table, &m_lookup_bitmap);
10055 }
10056
10057 // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update
10058 // when accessing the index, so don't acquire the snapshot right away.
10059 // Otherwise acquire the snapshot immediately.
10060 tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE);
10061
10062 active_index = idx;
10063
10064 DBUG_RETURN(HA_EXIT_SUCCESS);
10065}
10066
10067/**
10068 @return
10069 HA_EXIT_SUCCESS OK
10070*/
10071int ha_rocksdb::index_end() {
10072 DBUG_ENTER_FUNC();
10073
10074 release_scan_iterator();
10075
10076 bitmap_free(&m_lookup_bitmap);
10077
10078 active_index = MAX_KEY;
10079 in_range_check_pushed_down = FALSE;
10080 m_start_range= NULL;
10081
10082 DBUG_RETURN(HA_EXIT_SUCCESS);
10083}
10084
10085/**
10086 @return
10087 HA_EXIT_SUCCESS OK
10088 other HA_ERR error code (can be SE-specific)
10089*/
10090int ha_rocksdb::truncate() {
10091 DBUG_ENTER_FUNC();
10092
10093 DBUG_ASSERT(m_tbl_def != nullptr);
10094
10095 m_tbl_def->m_auto_incr_val = 1;
10096 m_tbl_def->m_hidden_pk_val = 1;
10097
10098 DBUG_RETURN(remove_rows(m_tbl_def));
10099}
10100
10101/*
10102 Delete the row we've last read. The row is also passed as parameter.
10103
10104 @detail
10105 The caller guarantees table buf points to the row that was just read.
10106 The row is either table->record[0] or table->record[1].
10107 (Check out InnoDB: row_update_for_mysql() has "UT_NOT_USED(mysql_rec)"
10108
10109 @return
10110 HA_EXIT_SUCCESS OK
10111 other HA_ERR error code (can be SE-specific)
10112*/
int ha_rocksdb::delete_row(const uchar *const buf) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);

  // Refresh m_last_rowkey from the row being deleted (needed when the
  // record came from somewhere other than the last read, e.g. RBR apply).
  set_last_rowkey(buf);

  rocksdb::Slice key_slice(m_last_rowkey.ptr(), m_last_rowkey.length());
  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  ulonglong bytes_written = 0;

  // Delete the primary-key record first.
  const uint index = pk_index(table, m_tbl_def);
  rocksdb::Status s =
      delete_or_singledelete(index, tx, m_pk_descr->get_cf(), key_slice);
  if (!s.ok()) {
    DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                     m_table_handler));
  } else {
    bytes_written = key_slice.size();
  }

  // Secondary-index keys embed the hidden PK id; extract it from the row
  // key when the table has a hidden PK and any secondary indexes exist.
  longlong hidden_pk_id = 0;
  if (m_tbl_def->m_key_count > 1 && has_hidden_pk(table)) {
    int err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
    if (err) {
      DBUG_RETURN(err);
    }
  }

  // Delete the record for every secondary index
  for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
    if (!is_pk(i, table, m_tbl_def)) {
      int packed_size;
      const Rdb_key_def &kd = *m_key_descr_arr[i];
      packed_size = kd.pack_record(table, m_pack_buffer, buf, m_sk_packed_tuple,
                                   nullptr, false, hidden_pk_id);
      rocksdb::Slice secondary_key_slice(
          reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size);
      /* Deleting on secondary key doesn't need any locks: */
      tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
                                                  secondary_key_slice);
      bytes_written += secondary_key_slice.size();
    }
  }

  tx->incr_delete_count();

  if (do_bulk_commit(tx)) {
    DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD);
  }
#ifdef MARIAROCKS_NOT_YET
  stats.rows_deleted++;
#endif
  update_row_stats(ROWS_DELETED);
  tx->update_bytes_written(bytes_written);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
10171
10172rocksdb::Status ha_rocksdb::delete_or_singledelete(
10173 uint index, Rdb_transaction *const tx,
10174 rocksdb::ColumnFamilyHandle *const column_family,
10175 const rocksdb::Slice &key) {
10176 if (can_use_single_delete(index))
10177 return tx->single_delete(column_family, key);
10178 return tx->delete_key(column_family, key);
10179}
10180
10181void ha_rocksdb::update_stats(void) {
10182 DBUG_ENTER_FUNC();
10183
10184 stats.records = 0;
10185 stats.index_file_length = 0ul;
10186 stats.data_file_length = 0ul;
10187 stats.mean_rec_length = 0;
10188
10189 for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
10190 if (is_pk(i, table, m_tbl_def)) {
10191 stats.data_file_length = m_pk_descr->m_stats.m_actual_disk_size;
10192 stats.records = m_pk_descr->m_stats.m_rows;
10193 } else {
10194 stats.index_file_length += m_key_descr_arr[i]->m_stats.m_actual_disk_size;
10195 }
10196 }
10197
10198 DBUG_VOID_RETURN;
10199}
10200
10201/**
10202 @return
10203 HA_EXIT_SUCCESS OK
10204 HA_EXIT_FAILURE Error
10205*/
int ha_rocksdb::info(uint flag) {
  DBUG_ENTER_FUNC();

  if (!table)
    DBUG_RETURN(HA_EXIT_FAILURE);

  if (flag & HA_STATUS_VARIABLE) {
    /*
      Test only to simulate corrupted stats
    */
    DBUG_EXECUTE_IF("myrocks_simulate_negative_stats",
                    m_pk_descr->m_stats.m_actual_disk_size =
                        -m_pk_descr->m_stats.m_actual_disk_size;);

    update_stats();

    /*
      If any stats are negative due to bad cached stats, re-run analyze table
      and re-retrieve the stats.
    */
    if (static_cast<longlong>(stats.data_file_length) < 0 ||
        static_cast<longlong>(stats.index_file_length) < 0 ||
        static_cast<longlong>(stats.records) < 0) {
      if (analyze(nullptr, nullptr)) {
        DBUG_RETURN(HA_EXIT_FAILURE);
      }

      update_stats();
    }

    // if number of records is hardcoded, we do not want to force computation
    // of memtable cardinalities
    if (stats.records == 0 ||
        (rocksdb_force_compute_memtable_stats &&
         rocksdb_debug_optimizer_n_rows == 0))
    {
      // First, compute SST files stats
      uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
      auto r = get_range(pk_index(table, m_tbl_def), buf);
      uint64_t sz = 0;
      uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;
      // recompute SST files stats only if records count is 0
      if (stats.records == 0) {
        rdb->GetApproximateSizes(m_pk_descr->get_cf(), &r, 1, &sz,
                                 include_flags);
        // Estimate the row count from the size using an assumed
        // bytes-per-row constant.
        stats.records+= sz/ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
        stats.data_file_length+= sz;
      }
      // Second, compute memtable stats. This call is expensive, so cache
      // values computed for some time.
      uint64_t cachetime = rocksdb_force_compute_memtable_stats_cachetime;
      uint64_t time = (cachetime == 0) ? 0 : my_interval_timer() / 1000;
      if (cachetime == 0 ||
          time > m_table_handler->m_mtcache_last_update + cachetime) {
        uint64_t memtableCount;
        uint64_t memtableSize;

        // the stats below are calculated from a skiplist which is a
        // probabilistic data structure, so the results vary between test runs
        // it also can return 0 for quite large tables which means that
        // cardinality for memtable-only indexes will be reported as 0
        rdb->GetApproximateMemTableStats(m_pk_descr->get_cf(), r,
                                         &memtableCount, &memtableSize);

        // Atomically update all of these fields at the same time
        if (cachetime > 0) {
          // Only the first concurrent updater writes the cache; the
          // fetch_add/fetch_sub pair acts as a reader/writer guard.
          if (m_table_handler->m_mtcache_lock.fetch_add(
                  1, std::memory_order_acquire) == 0) {
            m_table_handler->m_mtcache_count = memtableCount;
            m_table_handler->m_mtcache_size = memtableSize;
            m_table_handler->m_mtcache_last_update = time;
          }
          m_table_handler->m_mtcache_lock.fetch_sub(1,
                                                    std::memory_order_release);
        }

        stats.records += memtableCount;
        stats.data_file_length += memtableSize;
      } else {
        // Cached data is still valid, so use it instead
        stats.records += m_table_handler->m_mtcache_count;
        stats.data_file_length += m_table_handler->m_mtcache_size;
      }

      if (rocksdb_debug_optimizer_n_rows > 0)
        stats.records = rocksdb_debug_optimizer_n_rows;
    }

    if (stats.records != 0)
      stats.mean_rec_length = stats.data_file_length / stats.records;
  }
  if (flag & HA_STATUS_CONST) {
    ref_length = m_pk_descr->max_storage_fmt_length();

    // Report per-keypart cardinality (rec_per_key) for every real index.
    for (uint i = 0; i < m_tbl_def->m_key_count; i++) {
      if (is_hidden_pk(i, table, m_tbl_def)) {
        continue;
      }
      KEY *const k = &table->key_info[i];
      for (uint j = 0; j < k->ext_key_parts; j++) {
        const Rdb_index_stats &k_stats = m_key_descr_arr[i]->m_stats;
        uint x = k_stats.m_distinct_keys_per_prefix.size() > j &&
                         k_stats.m_distinct_keys_per_prefix[j] > 0
                     ? k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j]
                     : 0;
        if (x > stats.records)
          x = stats.records;
        if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) ||
            rocksdb_debug_optimizer_n_rows > 0) {
          // Fake cardinality implementation. For example, (idx1, idx2, idx3)
          // index
          /*
            Make MariaRocks behave the same way as MyRocks does:
            1. SQL layer thinks that unique secondary indexes are not extended
               with PK columns (both in MySQL and MariaDB)
            2. MariaDB also thinks that indexes with partially-covered columns
               are not extended with PK columns. Use the same number of
               keyparts that MyRocks would use.
          */
          uint ext_key_parts2;
          if (k->flags & HA_NOSAME)
            ext_key_parts2= k->ext_key_parts;  // This is #1
          else
            ext_key_parts2= m_key_descr_arr[i]->get_key_parts();  // This is #2.

          // will have rec_per_key for (idx1)=4, (idx1,2)=2, and (idx1,2,3)=1.
          // rec_per_key for the whole index is 1, and multiplied by 2^n if
          // n suffix columns of the index are not used.
          x = 1 << (ext_key_parts2 - j - 1);
        }
        k->rec_per_key[j] = x;
      }
    }
  }

  if (flag & HA_STATUS_ERRKEY) {
    /*
      Currently we support only primary keys so we know which key had a
      uniqueness violation.
    */
    errkey = m_dupp_errkey;
    dup_ref = m_pk_tuple;  // TODO(?): this should store packed PK.
  }

  if (flag & HA_STATUS_AUTO) {
    stats.auto_increment_value = m_tbl_def->m_auto_incr_val;
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
10356
10357void ha_rocksdb::position(const uchar *const record) {
10358 DBUG_ENTER_FUNC();
10359
10360 longlong hidden_pk_id = 0;
10361 if (has_hidden_pk(table) && read_hidden_pk_id_from_rowkey(&hidden_pk_id))
10362 DBUG_ASSERT(false); // should never reach here
10363
10364 /*
10365 Get packed primary key value from the record.
10366
10367 (Note: m_last_rowkey has the packed PK of last-read-row, which allows to
10368 handle most cases, but there is an exception to this: when slave applies
10369 RBR events, it fills the record and then calls position(); rnd_pos())
10370
10371 Also note that we *can* rely on PK column values being available. This is
10372 because table_flags() includes HA_PRIMARY_KEY_REQUIRED_FOR_POSITION bit.
10373 When that is true, table->prepare_for_position() adds PK columns into the
10374 read set (this may potentially disable index-only access when PK column
10375 cannot be restored from its mem-comparable form in the secondary indexes).
10376 */
10377 const uint packed_size = m_pk_descr->pack_record(
10378 table, m_pack_buffer, record, ref, nullptr, false, hidden_pk_id);
10379
10380 /*
10381 It could be that mem-comparable form of PK occupies less than ref_length
10382 bytes. Fill the remainder with zeros.
10383 */
10384 if (ref_length > packed_size)
10385 memset(ref + packed_size, 0, ref_length - packed_size);
10386
10387 DBUG_VOID_RETURN;
10388}
10389
10390/**
10391 @return
10392 HA_EXIT_SUCCESS OK
10393 other HA_ERR error code (can be SE-specific)
10394*/
10395int ha_rocksdb::rnd_pos(uchar *const buf, uchar *const pos) {
10396 DBUG_ENTER_FUNC();
10397
10398 int rc;
10399 size_t len;
10400
10401#ifdef MARIAROCKS_NOT_YET
10402 stats.rows_requested++;
10403#endif
10404 len = m_pk_descr->key_length(table,
10405 rocksdb::Slice((const char *)pos, ref_length));
10406 if (len == size_t(-1)) {
10407 DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA); /* Data corruption? */
10408 }
10409
10410 rc = get_row_by_rowid(buf, pos, len);
10411
10412 if (!rc) {
10413#ifdef MARIAROCKS_NOT_YET
10414 stats.rows_read++;
10415#endif
10416 update_row_stats(ROWS_READ);
10417 }
10418
10419 DBUG_RETURN(rc);
10420}
10421
10422/*
10423 @brief
10424 Calculate (if needed) the bitmap of indexes that are modified by the
10425 current query
10426
10427 @detail
10428 The calculation done by checking index definitions against the
10429 table->write_set
10430 column bitmap.
10431*/
10432
10433void ha_rocksdb::calc_updated_indexes() {
10434 if (!m_update_scope_is_valid) {
10435 m_update_scope_is_valid = true;
10436 m_update_scope.clear_all();
10437
10438 for (uint keynr = 0; keynr < table->s->keys; keynr++) {
10439 const Rdb_key_def &kd = *m_key_descr_arr[keynr];
10440 /* Walk over all key parts, including the "extended key" suffix */
10441 const uint key_parts = kd.get_key_parts();
10442 for (uint kp = 0; kp < key_parts; kp++) {
10443 if (has_hidden_pk(table) && kp + 1 == key_parts)
10444 break;
10445
10446 Field *const field = kd.get_table_field_for_part_no(table, kp);
10447 if (bitmap_is_set(table->write_set, field->field_index)) {
10448 m_update_scope.set_bit(keynr);
10449 break;
10450 }
10451 }
10452 }
10453 }
10454}
10455
10456/**
10457 @return
10458 HA_EXIT_SUCCESS OK
10459 other HA_ERR error code (can be SE-specific)
10460*/
10461int ha_rocksdb::update_row(const uchar *const old_data,
10462 const uchar *const new_data) {
10463 DBUG_ENTER_FUNC();
10464
10465 DBUG_ASSERT(old_data != nullptr);
10466 DBUG_ASSERT(new_data != nullptr);
10467 DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE);
10468 /*
10469 old_data points to record we're updating. It is the same as the record
10470 we've just read (for multi-table UPDATE, too, because SQL layer will make
10471 an rnd_pos() call to re-read the record before calling update_row())
10472 */
10473 DBUG_ASSERT(new_data == table->record[0]);
10474
10475 const int rv = update_write_row(old_data, new_data, false);
10476
10477 if (rv == 0) {
10478#ifdef MARIAROCKS_NOT_YET
10479 stats.rows_updated++;
10480#endif
10481 update_row_stats(ROWS_UPDATED);
10482 }
10483
10484 DBUG_RETURN(rv);
10485}
10486
10487/*
10488 MariaDB's temporary: MyRocks has this function in sql/handler.cc:
10489*/
10490
10491bool can_hold_read_locks_on_select(THD *thd, thr_lock_type lock_type)
10492{
10493 return (lock_type == TL_READ_WITH_SHARED_LOCKS
10494 || lock_type == TL_READ_NO_INSERT
10495 || (lock_type != TL_IGNORE
10496 && thd->lex->sql_command != SQLCOM_SELECT));
10497}
10498
10499
/*
  The following function was copied from ha_blackhole::store_lock:

  It does two separate things:
  1. Decides which internal row-locking mode MyRocks will use for the
     statement (m_lock_rows: NONE / READ / WRITE), and
  2. Tells the SQL-layer table lock manager which table lock to take,
     possibly downgrading the requested lock to allow more concurrency.
*/
THR_LOCK_DATA **ha_rocksdb::store_lock(THD *const thd, THR_LOCK_DATA **to,
                                       enum thr_lock_type lock_type) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(to != nullptr);

  bool in_lock_tables = my_core::thd_in_lock_tables(thd);

  /* First, make a decision about MyRocks's internal locking */
  if (lock_type >= TL_WRITE_ALLOW_WRITE) {
    /* Any write-level table lock implies locking the rows we touch. */
    m_lock_rows = RDB_LOCK_WRITE;
  } else if (lock_type == TL_READ_WITH_SHARED_LOCKS) {
    /* e.g. SELECT ... LOCK IN SHARE MODE */
    m_lock_rows = RDB_LOCK_READ;
  } else {
    m_lock_rows = RDB_LOCK_NONE;
    if (THDVAR(thd, lock_scanned_rows)) {
      /*
        The following logic was copied directly from
        ha_innobase::store_lock_with_x_type() in
        storage/innobase/handler/ha_innodb.cc and causes MyRocks to leave
        locks in place on rows that are in a table that is not being updated.
      */
      const uint sql_command = my_core::thd_sql_command(thd);
      if ((lock_type == TL_READ && in_lock_tables) ||
          (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
          can_hold_read_locks_on_select(thd, lock_type)) {
        ulong tx_isolation = my_core::thd_tx_isolation(thd);
        if (sql_command != SQLCOM_CHECKSUM &&
            ((my_core::thd_test_options(thd, OPTION_BIN_LOG) &&
              tx_isolation > ISO_READ_COMMITTED) ||
             tx_isolation == ISO_SERIALIZABLE ||
             (lock_type != TL_READ && lock_type != TL_READ_NO_INSERT) ||
             (sql_command != SQLCOM_INSERT_SELECT &&
              sql_command != SQLCOM_REPLACE_SELECT &&
              sql_command != SQLCOM_UPDATE && sql_command != SQLCOM_DELETE &&
              sql_command != SQLCOM_CREATE_TABLE))) {
          m_lock_rows = RDB_LOCK_READ;
        }
      }
    }
  }

  /* Then, tell the SQL layer what kind of locking it should use: */
  if (lock_type != TL_IGNORE && m_db_lock.type == TL_UNLOCK) {
    /*
      Here is where we get into the guts of a row level lock.
      If TL_UNLOCK is set
      If we are not doing a LOCK TABLE or DISCARD/IMPORT
      TABLESPACE, then allow multiple writers
    */

    if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) &&
        !in_lock_tables && !my_core::thd_tablespace_op(thd)) {
      lock_type = TL_WRITE_ALLOW_WRITE;
    }

    /*
      In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
      MySQL would use the lock TL_READ_NO_INSERT on t2, and that
      would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
      to t2. Convert the lock to a normal read lock to allow
      concurrent inserts to t2.
    */

    if (lock_type == TL_READ_NO_INSERT && !in_lock_tables) {
      lock_type = TL_READ;
    }

    m_db_lock.type = lock_type;
  }

  /* Hand the (single) lock descriptor back to the SQL layer. */
  *to++ = &m_db_lock;

  DBUG_RETURN(to);
}
10577
10578void ha_rocksdb::read_thd_vars(THD *const thd) {
10579 m_store_row_debug_checksums = THDVAR(thd, store_row_debug_checksums);
10580 m_verify_row_debug_checksums = THDVAR(thd, verify_row_debug_checksums);
10581 m_checksums_pct = THDVAR(thd, checksums_pct);
10582}
10583
10584
10585/**
10586 @return
10587 HA_EXIT_SUCCESS OK
10588 other HA_ERR error code (cannot be SE-specific)
10589*/
10590int ha_rocksdb::external_lock(THD *const thd, int lock_type) {
10591 DBUG_ENTER_FUNC();
10592
10593 DBUG_ASSERT(thd != nullptr);
10594
10595 int res = HA_EXIT_SUCCESS;
10596 int binlog_format = my_core::thd_binlog_format(thd);
10597 bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog);
10598
10599 /*
10600 A note about (*) below: In fb/mysql, LOCK TABLE myrocks_table WRITE
10601 - produces an error when @@binlog_format='ROW'.
10602 - does not produce an error with @@binlog_format='STATEMENT'
10603
10604 We cannot easily provide this behavior in MariaDB, because LOCK ROWS
10605 changes @@binlog_format for the duration of the statement.
10606
10607 Instead we will make the check in ha_rocksdb::start_stmt and will produce
10608 an error in the first DML statement done under LOCK TABLE
10609
10610 (**) - The same happens for other "DDL-type" commands, so allow them here
10611 as well.
10612 */
10613 if (lock_type == F_WRLCK && !thd->rgi_slave && !unsafe_for_binlog &&
10614 binlog_format != BINLOG_FORMAT_ROW &&
10615 binlog_format != BINLOG_FORMAT_UNSPEC &&
10616 thd->lex->sql_command != SQLCOM_LOCK_TABLES && // (*)
10617 thd->lex->sql_command != SQLCOM_ANALYZE && // (**)
10618 thd->lex->sql_command != SQLCOM_OPTIMIZE && // (**)
10619 thd->lex->sql_command != SQLCOM_FLUSH && // (**)
10620 my_core::thd_binlog_filter_ok(thd)) {
10621 my_error(ER_REQUIRE_ROW_BINLOG_FORMAT, MYF(0));
10622 DBUG_RETURN(HA_ERR_UNSUPPORTED);
10623 }
10624
10625 if (lock_type == F_UNLCK) {
10626 Rdb_transaction *const tx = get_tx_from_thd(thd);
10627
10628 if (tx) {
10629 tx->io_perf_end_and_record(&m_io_perf);
10630 tx->m_n_mysql_tables_in_use--;
10631 if (tx->m_n_mysql_tables_in_use == 0 &&
10632 !my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) {
10633 /*
10634 Do like InnoDB: when we get here, it's time to commit a
10635 single-statement transaction.
10636
10637 If the statement involved multiple tables, this code will be executed
10638 for each of them, but that's ok because non-first tx->commit() calls
10639 will be no-ops.
10640 */
10641 if (tx->commit_or_rollback()) {
10642 res = HA_ERR_INTERNAL_ERROR;
10643 }
10644 }
10645 }
10646 } else {
10647 if (my_core::thd_tx_isolation(thd) < ISO_READ_COMMITTED ||
10648 my_core::thd_tx_isolation(thd) > ISO_REPEATABLE_READ) {
10649 my_error(ER_ISOLATION_MODE_NOT_SUPPORTED, MYF(0),
10650 tx_isolation_names[my_core::thd_tx_isolation(thd)]);
10651 DBUG_RETURN(HA_ERR_UNSUPPORTED);
10652 }
10653 /*
10654 It's nice to do the following on start of every statement. The problem
10655 is, handler->start_stmt() is not called for INSERTs.
10656 So, we put this code here.
10657 */
10658 Rdb_transaction *const tx = get_or_create_tx(thd);
10659 read_thd_vars(thd);
10660
10661 m_update_scope_is_valid = false;
10662
10663 if (skip_unique_check()) {
10664 if ((thd->lex->sql_command == SQLCOM_INSERT ||
10665 thd->lex->sql_command == SQLCOM_LOAD ||
10666 thd->lex->sql_command == SQLCOM_REPLACE) &&
10667 (thd->lex->duplicates == DUP_REPLACE ||
10668 thd->lex->duplicates == DUP_UPDATE)) {
10669 my_error(ER_ON_DUPLICATE_DISABLED, MYF(0), thd->query());
10670 DBUG_RETURN(HA_ERR_UNSUPPORTED);
10671 }
10672 }
10673
10674 if (lock_type == F_WRLCK) {
10675 if (tx->is_tx_read_only()) {
10676 my_error(ER_UPDATES_WITH_CONSISTENT_SNAPSHOT, MYF(0));
10677 DBUG_RETURN(HA_ERR_UNSUPPORTED);
10678 }
10679
10680 /*
10681 SQL layer signals us to take a write lock. It does so when starting DML
10682 statement. We should put locks on the rows we're reading.
10683
10684 Note: sometimes, external_lock() can be called without a prior
10685 ::store_lock call. That's why we need to set lock_* members here, too.
10686 */
10687 m_lock_rows = RDB_LOCK_WRITE;
10688
10689 if (thd->lex->sql_command == SQLCOM_CREATE_INDEX ||
10690 thd->lex->sql_command == SQLCOM_DROP_INDEX ||
10691 thd->lex->sql_command == SQLCOM_ALTER_TABLE) {
10692 tx->m_ddl_transaction = true;
10693 }
10694 }
10695 tx->m_n_mysql_tables_in_use++;
10696 rocksdb_register_tx(rocksdb_hton, thd, tx);
10697 tx->io_perf_start(&m_io_perf);
10698 }
10699
10700 DBUG_RETURN(res);
10701}
10702
10703/**
10704 @note
10705 A quote from ha_innobase::start_stmt():
10706 <quote>
10707 MySQL calls this function at the start of each SQL statement inside LOCK
10708 TABLES. Inside LOCK TABLES the ::external_lock method does not work to
10709 mark SQL statement borders.
10710 </quote>
10711
10712 @return
10713 HA_EXIT_SUCCESS OK
10714*/
10715
10716int ha_rocksdb::start_stmt(THD *const thd, thr_lock_type lock_type) {
10717 DBUG_ENTER_FUNC();
10718
10719 /*
10720 MariaDB: the following is a copy of the check in ha_rocksdb::external_lock:
10721 */
10722 int binlog_format = my_core::thd_binlog_format(thd);
10723 bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog);
10724 if (lock_type >= TL_WRITE_ALLOW_WRITE &&
10725 !thd->rgi_slave && !unsafe_for_binlog &&
10726 binlog_format != BINLOG_FORMAT_ROW &&
10727 binlog_format != BINLOG_FORMAT_UNSPEC &&
10728 my_core::thd_binlog_filter_ok(thd)) {
10729 my_error(ER_REQUIRE_ROW_BINLOG_FORMAT, MYF(0));
10730 DBUG_RETURN(HA_ERR_UNSUPPORTED);
10731 }
10732
10733 DBUG_ASSERT(thd != nullptr);
10734
10735 Rdb_transaction *const tx = get_or_create_tx(thd);
10736 read_thd_vars(thd);
10737 rocksdb_register_tx(ht, thd, tx);
10738 tx->io_perf_start(&m_io_perf);
10739
10740 DBUG_RETURN(HA_EXIT_SUCCESS);
10741}
10742
10743rocksdb::Range get_range(uint32_t i,
10744 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
10745 int offset1, int offset2) {
10746 uchar *buf_begin = buf;
10747 uchar *buf_end = buf + Rdb_key_def::INDEX_NUMBER_SIZE;
10748 rdb_netbuf_store_index(buf_begin, i + offset1);
10749 rdb_netbuf_store_index(buf_end, i + offset2);
10750
10751 return rocksdb::Range(
10752 rocksdb::Slice((const char *)buf_begin, Rdb_key_def::INDEX_NUMBER_SIZE),
10753 rocksdb::Slice((const char *)buf_end, Rdb_key_def::INDEX_NUMBER_SIZE));
10754}
10755
/* Convenience overload: build the range from a key definition's index
   number instead of a raw index id. */
static rocksdb::Range get_range(const Rdb_key_def &kd,
                                uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
                                int offset1, int offset2) {
  return get_range(kd.get_index_number(), buf, offset1, offset2);
}
10761
10762rocksdb::Range get_range(const Rdb_key_def &kd,
10763 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) {
10764 if (kd.m_is_reverse_cf) {
10765 return myrocks::get_range(kd, buf, 1, 0);
10766 } else {
10767 return myrocks::get_range(kd, buf, 0, 1);
10768 }
10769}
10770
/* Range covering all keys of the i-th index of this table.
   NOTE(review): passing an int by const reference is an odd choice; the
   signature matches the declaration in the header, so it is kept as-is. */
rocksdb::Range
ha_rocksdb::get_range(const int &i,
                      uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) const {
  return myrocks::get_range(*m_key_descr_arr[i], buf);
}
10776
10777/*
10778 This function is called with total_order_seek=true, but
10779 upper/lower bound setting is not necessary.
10780 Boundary set is useful when there is no matching key,
10781 but in drop_index_thread's case, it means index is marked as removed,
10782 so no further seek will happen for the index id.
10783*/
10784static bool is_myrocks_index_empty(
10785 rocksdb::ColumnFamilyHandle *cfh, const bool is_reverse_cf,
10786 const rocksdb::ReadOptions &read_opts,
10787 const uint index_id)
10788{
10789 bool index_removed = false;
10790 uchar key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {0};
10791 rdb_netbuf_store_uint32(key_buf, index_id);
10792 const rocksdb::Slice key =
10793 rocksdb::Slice(reinterpret_cast<char *>(key_buf), sizeof(key_buf));
10794 std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(read_opts, cfh));
10795 rocksdb_smart_seek(is_reverse_cf, it.get(), key);
10796 if (!it->Valid()) {
10797 index_removed = true;
10798 } else {
10799 if (memcmp(it->key().data(), key_buf,
10800 Rdb_key_def::INDEX_NUMBER_SIZE)) {
10801 // Key does not have same prefix
10802 index_removed = true;
10803 }
10804 }
10805 return index_removed;
10806}
10807
10808/*
10809 Drop index thread's main logic
10810*/
10811
10812void Rdb_drop_index_thread::run() {
10813 RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
10814
10815 for (;;) {
10816 // The stop flag might be set by shutdown command
10817 // after drop_index_thread releases signal_mutex
10818 // (i.e. while executing expensive Seek()). To prevent drop_index_thread
10819 // from entering long cond_timedwait, checking if stop flag
10820 // is true or not is needed, with drop_index_interrupt_mutex held.
10821 if (m_stop) {
10822 break;
10823 }
10824
10825 timespec ts;
10826 int sec= dict_manager.is_drop_index_empty()
10827 ? 24 * 60 * 60 // no filtering
10828 : 60; // filtering
10829 set_timespec(ts,sec);
10830
10831 const auto ret MY_ATTRIBUTE((__unused__)) =
10832 mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts);
10833 if (m_stop) {
10834 break;
10835 }
10836 // make sure, no program error is returned
10837 DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
10838 RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
10839
10840 std::unordered_set<GL_INDEX_ID> indices;
10841 dict_manager.get_ongoing_drop_indexes(&indices);
10842 if (!indices.empty()) {
10843 std::unordered_set<GL_INDEX_ID> finished;
10844 rocksdb::ReadOptions read_opts;
10845 read_opts.total_order_seek = true; // disable bloom filter
10846
10847 for (const auto d : indices) {
10848 uint32 cf_flags = 0;
10849 if (!dict_manager.get_cf_flags(d.cf_id, &cf_flags)) {
10850 sql_print_error("RocksDB: Failed to get column family flags "
10851 "from cf id %u. MyRocks data dictionary may "
10852 "get corrupted.",
10853 d.cf_id);
10854 abort();
10855 }
10856 rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(d.cf_id);
10857 DBUG_ASSERT(cfh);
10858 const bool is_reverse_cf = cf_flags & Rdb_key_def::REVERSE_CF_FLAG;
10859
10860 if (is_myrocks_index_empty(cfh, is_reverse_cf, read_opts, d.index_id))
10861 {
10862 finished.insert(d);
10863 continue;
10864 }
10865 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
10866 rocksdb::Range range = get_range(d.index_id, buf, is_reverse_cf ? 1 : 0,
10867 is_reverse_cf ? 0 : 1);
10868 rocksdb::Status status = DeleteFilesInRange(rdb->GetBaseDB(), cfh,
10869 &range.start, &range.limit);
10870 if (!status.ok()) {
10871 if (status.IsShutdownInProgress()) {
10872 break;
10873 }
10874 rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
10875 }
10876 status = rdb->CompactRange(getCompactRangeOptions(), cfh, &range.start,
10877 &range.limit);
10878 if (!status.ok()) {
10879 if (status.IsShutdownInProgress()) {
10880 break;
10881 }
10882 rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD);
10883 }
10884 if (is_myrocks_index_empty(cfh, is_reverse_cf, read_opts, d.index_id))
10885 {
10886 finished.insert(d);
10887 }
10888 }
10889
10890 if (!finished.empty()) {
10891 dict_manager.finish_drop_indexes(finished);
10892 }
10893 }
10894 RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
10895 }
10896
10897 RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);
10898}
10899
10900Rdb_tbl_def *ha_rocksdb::get_table_if_exists(const char *const tablename) {
10901 DBUG_ASSERT(tablename != nullptr);
10902
10903 std::string str;
10904 if (rdb_normalize_tablename(tablename, &str) != HA_EXIT_SUCCESS) {
10905 // We were not passed table name?
10906 DBUG_ASSERT(0);
10907 return nullptr;
10908 }
10909
10910 return ddl_manager.find(str);
10911}
10912
10913/*
10914 Note: the following function is called when the table is not open. That is,
10915 this->table==nullptr, pk_key_descr==nullptr, etc.
10916
10917 tablename points to line in form "./dbname/tablename".
10918
10919 @return
10920 HA_EXIT_SUCCESS OK
10921 other HA_ERR error code (can be SE-specific)
10922*/
10923
10924int ha_rocksdb::delete_table(const char *const tablename) {
10925 DBUG_ENTER_FUNC();
10926
10927 DBUG_ASSERT(tablename != nullptr);
10928
10929 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
10930 rocksdb::WriteBatch *const batch = wb.get();
10931
10932 /* Find the table in the hash */
10933 Rdb_tbl_def *const tbl = get_table_if_exists(tablename);
10934 if (!tbl)
10935 DBUG_RETURN(HA_ERR_NO_SUCH_TABLE);
10936 dict_manager.add_drop_table(tbl->m_key_descr_arr, tbl->m_key_count, batch);
10937
10938 /*
10939 Remove the table entry in data dictionary (this will also remove it from
10940 the persistent data dictionary).
10941 */
10942 ddl_manager.remove(tbl, batch, true);
10943
10944 int err = dict_manager.commit(batch);
10945 if (err) {
10946 DBUG_RETURN(err);
10947 }
10948
10949 rdb_drop_idx_thread.signal();
10950
10951 DBUG_RETURN(HA_EXIT_SUCCESS);
10952}
10953
/*
  Delete every row of every index of the given table, one key at a time.
  (Presumably used when the table contents must be cleared while keeping
  its definition — TODO confirm against callers.)

  @return HA_EXIT_SUCCESS on success, or an SE error from the transaction.
*/
int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) {
  const rocksdb::WriteOptions wo =
      rdb_get_rocksdb_write_options(handler::ha_thd());

  rocksdb::ReadOptions opts;
  // Full range scan of each index; bloom filters would not help here.
  opts.total_order_seek = true;
  Rdb_transaction *const tx = get_or_create_tx(table->in_use);

  char key_buf[MAX_KEY_LENGTH];
  uint key_len;
  ulonglong bytes_written = 0;
  /*
    Remove all records in each index.
    (This is not crash-safe, but it doesn't matter, because bulk row
    deletion will be handled on rocksdb side)
  */
  for (uint i = 0; i < tbl->m_key_count; i++) {
    const Rdb_key_def &kd = *tbl->m_key_descr_arr[i];
    // Start from the smallest possible key of this index.
    kd.get_infimum_key(reinterpret_cast<uchar *>(key_buf), &key_len);
    rocksdb::ColumnFamilyHandle *cf = kd.get_cf();
    const rocksdb::Slice table_key(key_buf, key_len);
    // Constrain the iterator to this index's key range.
    setup_iterator_bounds(kd, table_key,
                          m_eq_cond_lower_bound,
                          m_eq_cond_upper_bound,
                          &m_eq_cond_lower_bound_slice,
                          &m_eq_cond_upper_bound_slice);
    opts.iterate_lower_bound = &m_eq_cond_lower_bound_slice;
    opts.iterate_upper_bound = &m_eq_cond_upper_bound_slice;
    std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(opts, cf));

    it->Seek(table_key);
    while (it->Valid()) {
      const rocksdb::Slice key = it->key();
      // Stop once we leave this index's key space.
      if (!kd.covers_key(key)) {
        break;
      }

      rocksdb::Status s;
      // can_use_single_delete() decides per index whether the cheaper
      // SingleDelete operation is safe for its keys.
      if (can_use_single_delete(i)) {
        s = rdb->SingleDelete(wo, cf, key);
      } else {
        s = rdb->Delete(wo, cf, key);
      }

      if (!s.ok()) {
        return tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                    m_table_handler);
      }
      bytes_written += key.size();
      it->Next();
    }
  }

  // Account the deleted key bytes against the transaction's write volume.
  tx->update_bytes_written(bytes_written);

  return HA_EXIT_SUCCESS;
}
11011
11012/**
11013 @return
11014 HA_EXIT_SUCCESS OK
11015 other HA_ERR error code (cannot be SE-specific)
11016*/
11017int ha_rocksdb::rename_table(const char *const from, const char *const to) {
11018 DBUG_ENTER_FUNC();
11019
11020 DBUG_ASSERT(from != nullptr);
11021 DBUG_ASSERT(to != nullptr);
11022
11023 std::string from_str;
11024 std::string to_str;
11025 std::string from_db;
11026 std::string to_db;
11027
11028 int rc = rdb_normalize_tablename(from, &from_str);
11029 if (rc != HA_EXIT_SUCCESS) {
11030 DBUG_RETURN(rc);
11031 }
11032
11033 rc = rdb_split_normalized_tablename(from_str, &from_db);
11034 if (rc != HA_EXIT_SUCCESS) {
11035 DBUG_RETURN(rc);
11036 }
11037
11038 rc = rdb_normalize_tablename(to, &to_str);
11039 if (rc != HA_EXIT_SUCCESS) {
11040 DBUG_RETURN(rc);
11041 }
11042
11043 rc = rdb_split_normalized_tablename(to_str, &to_db);
11044 if (rc != HA_EXIT_SUCCESS) {
11045 DBUG_RETURN(rc);
11046 }
11047
11048 // If the user changed the database part of the name then validate that the
11049 // 'to' database exists.
11050 if (from_db != to_db && !rdb_database_exists(to_db)) {
11051 // If we return a RocksDB specific error code here we get
11052 // "error: 206 - Unknown error 206". InnoDB gets
11053 // "error -1 - Unknown error -1" so let's match them.
11054 DBUG_RETURN(-1);
11055 }
11056
11057 DBUG_EXECUTE_IF("gen_sql_table_name", to_str = to_str + "#sql-test";);
11058
11059 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
11060 rocksdb::WriteBatch *const batch = wb.get();
11061 dict_manager.lock();
11062
11063 if (ddl_manager.rename(from_str, to_str, batch)) {
11064 rc = HA_ERR_NO_SUCH_TABLE;
11065 } else {
11066 rc = dict_manager.commit(batch);
11067 }
11068 dict_manager.unlock();
11069
11070 DBUG_RETURN(rc);
11071}
11072
11073/**
11074 check_if_incompatible_data() called if ALTER TABLE can't detect otherwise
11075 if new and old definition are compatible
11076
11077 @details If there are no other explicit signs like changed number of
11078 fields this function will be called by compare_tables()
11079 (sql/sql_tables.cc) to decide should we rewrite whole table or only .frm
11080 file.
11081
11082*/
11083
11084bool ha_rocksdb::check_if_incompatible_data(HA_CREATE_INFO *const info,
11085 uint table_changes) {
11086 DBUG_ENTER_FUNC();
11087
11088 DBUG_ASSERT(info != nullptr);
11089
11090 // this function is needed only for online alter-table
11091 DBUG_RETURN(COMPATIBLE_DATA_NO);
11092}
11093
11094/**
11095 @return
11096 HA_EXIT_SUCCESS OK
11097*/
11098int ha_rocksdb::extra(enum ha_extra_function operation) {
11099 DBUG_ENTER_FUNC();
11100
11101 switch (operation) {
11102 case HA_EXTRA_KEYREAD:
11103 m_keyread_only = true;
11104 break;
11105 case HA_EXTRA_NO_KEYREAD:
11106 m_keyread_only = false;
11107 break;
11108 case HA_EXTRA_FLUSH:
11109 /*
11110 If the table has blobs, then they are part of m_retrieved_record.
11111 This call invalidates them.
11112 */
11113 m_retrieved_record.Reset();
11114 break;
11115 default:
11116 break;
11117 }
11118
11119 DBUG_RETURN(HA_EXIT_SUCCESS);
11120}
11121
11122/*
11123 Given a starting key and an ending key, estimate the number of rows that
11124 will exist between the two keys.
11125*/
11126ha_rows ha_rocksdb::records_in_range(uint inx, key_range *const min_key,
11127 key_range *const max_key) {
11128 DBUG_ENTER_FUNC();
11129
11130 ha_rows ret = THDVAR(ha_thd(), records_in_range);
11131 if (ret) {
11132 DBUG_RETURN(ret);
11133 }
11134 if (table->force_index) {
11135 const ha_rows force_rows = THDVAR(ha_thd(), force_index_records_in_range);
11136 if (force_rows) {
11137 DBUG_RETURN(force_rows);
11138 }
11139 }
11140
11141 const Rdb_key_def &kd = *m_key_descr_arr[inx];
11142
11143 uint size1 = 0;
11144 if (min_key) {
11145 size1 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
11146 min_key->key, min_key->keypart_map);
11147 if (min_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
11148 min_key->flag == HA_READ_PREFIX_LAST ||
11149 min_key->flag == HA_READ_AFTER_KEY) {
11150 kd.successor(m_sk_packed_tuple, size1);
11151 }
11152 } else {
11153 kd.get_infimum_key(m_sk_packed_tuple, &size1);
11154 }
11155
11156 uint size2 = 0;
11157 if (max_key) {
11158 size2 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple_old,
11159 max_key->key, max_key->keypart_map);
11160 if (max_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
11161 max_key->flag == HA_READ_PREFIX_LAST ||
11162 max_key->flag == HA_READ_AFTER_KEY) {
11163 kd.successor(m_sk_packed_tuple_old, size2);
11164 }
11165 // pad the upper key with FFFFs to make sure it is more than the lower
11166 if (size1 > size2) {
11167 memset(m_sk_packed_tuple_old + size2, 0xff, size1 - size2);
11168 size2 = size1;
11169 }
11170 } else {
11171 kd.get_supremum_key(m_sk_packed_tuple_old, &size2);
11172 }
11173
11174 const rocksdb::Slice slice1((const char *)m_sk_packed_tuple, size1);
11175 const rocksdb::Slice slice2((const char *)m_sk_packed_tuple_old, size2);
11176
11177 // slice1 >= slice2 means no row will match
11178 if (slice1.compare(slice2) >= 0) {
11179 DBUG_RETURN(HA_EXIT_SUCCESS);
11180 }
11181
11182 rocksdb::Range r(kd.m_is_reverse_cf ? slice2 : slice1,
11183 kd.m_is_reverse_cf ? slice1 : slice2);
11184
11185 uint64_t sz = 0;
11186 auto disk_size = kd.m_stats.m_actual_disk_size;
11187 if (disk_size == 0)
11188 disk_size = kd.m_stats.m_data_size;
11189 auto rows = kd.m_stats.m_rows;
11190 if (rows == 0 || disk_size == 0) {
11191 rows = 1;
11192 disk_size = ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
11193 }
11194
11195 // Getting statistics, including from Memtables
11196 uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;
11197 rdb->GetApproximateSizes(kd.get_cf(), &r, 1, &sz, include_flags);
11198 ret = rows * sz / disk_size;
11199 uint64_t memTableCount;
11200 rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memTableCount, &sz);
11201 ret += memTableCount;
11202
11203 /*
11204 GetApproximateSizes() gives estimates so ret might exceed stats.records.
11205 MySQL then decides to use full index scan rather than range scan, which
11206 is not efficient for most cases.
11207 To prevent this, changing estimated records slightly smaller than
11208 stats.records.
11209 */
11210 if (ret >= stats.records) {
11211 ret = stats.records * 0.99;
11212 }
11213
11214 if (rocksdb_debug_optimizer_n_rows > 0) {
11215 ret = rocksdb_debug_optimizer_n_rows;
11216 } else if (ret == 0) {
11217 ret = 1;
11218 }
11219
11220 DBUG_RETURN(ret);
11221}
11222
11223void ha_rocksdb::update_create_info(HA_CREATE_INFO *const create_info) {
11224 DBUG_ENTER_FUNC();
11225
11226 DBUG_ASSERT(create_info != nullptr);
11227
11228 if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) {
11229 create_info->auto_increment_value = m_tbl_def->m_auto_incr_val;
11230 }
11231
11232 DBUG_VOID_RETURN;
11233}
11234
11235/**
11236 @brief
11237 Doing manual compaction on OPTIMIZE TABLE in RocksDB.
11238 Compaction itself is executed by background thread in RocksDB, but
11239 CompactRange() waits until compaction completes so this function
11240 may take a long time.
11241 Since RocksDB dataset is allocated per index id, OPTIMIZE TABLE
11242 triggers manual compaction for all indexes of the table.
11243 @details
11244 Compaction range is from the beginning of the index id to
11245 the first row of the next index id. When using reverse order
11246 column family, the first row of the next index id should be
11247 the last row of the previous index id.
11248
11249 @return
11250 HA_ADMIN_OK OK
11251 other HA_ADMIN error code
11252*/
11253int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) {
11254 DBUG_ENTER_FUNC();
11255
11256 DBUG_ASSERT(thd != nullptr);
11257 DBUG_ASSERT(check_opt != nullptr);
11258
11259 for (uint i = 0; i < table->s->keys; i++) {
11260 uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
11261 auto range = get_range(i, buf);
11262 const rocksdb::Status s = rdb->CompactRange(getCompactRangeOptions(),
11263 m_key_descr_arr[i]->get_cf(),
11264 &range.start, &range.limit);
11265 if (!s.ok()) {
11266 DBUG_RETURN(rdb_error_to_mysql(s));
11267 }
11268 }
11269
11270 DBUG_RETURN(HA_EXIT_SUCCESS);
11271}
11272
11273int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd,
11274 HA_CHECK_OPT *const check_opt) {
11275 DBUG_ENTER_FUNC();
11276
11277 // find per column family key ranges which need to be queried
11278 std::unordered_map<rocksdb::ColumnFamilyHandle *, std::vector<rocksdb::Range>>
11279 ranges;
11280 std::unordered_set<GL_INDEX_ID> ids_to_check;
11281 std::vector<uchar> buf(table_arg->s->keys * 2 *
11282 Rdb_key_def::INDEX_NUMBER_SIZE);
11283 std::unordered_map<GL_INDEX_ID, Rdb_index_stats> stats;
11284 for (uint i = 0; i < table_arg->s->keys; i++) {
11285 const auto bufp = &buf[i * 2 * Rdb_key_def::INDEX_NUMBER_SIZE];
11286 const Rdb_key_def &kd = *m_key_descr_arr[i];
11287 const GL_INDEX_ID index_id = kd.get_gl_index_id();
11288 ranges[kd.get_cf()].push_back(get_range(i, bufp));
11289
11290 ids_to_check.insert(index_id);
11291 // Initialize the stats to 0. If there are no files that contain
11292 // this gl_index_id, then 0 should be stored for the cached stats.
11293 stats[index_id] = Rdb_index_stats(index_id);
11294 DBUG_ASSERT(kd.get_key_parts() > 0);
11295 stats[index_id].m_distinct_keys_per_prefix.resize(kd.get_key_parts());
11296 }
11297
11298 // get RocksDB table properties for these ranges
11299 rocksdb::TablePropertiesCollection props;
11300 for (auto it : ranges) {
11301 const auto old_size MY_ATTRIBUTE((__unused__)) = props.size();
11302 const auto status = rdb->GetPropertiesOfTablesInRange(
11303 it.first, &it.second[0], it.second.size(), &props);
11304 DBUG_ASSERT(props.size() >= old_size);
11305 if (!status.ok()) {
11306 DBUG_RETURN(
11307 rdb_error_to_mysql(status, "Could not access RocksDB properties"));
11308 }
11309 }
11310
11311 int num_sst = 0;
11312 for (const auto &it : props) {
11313 std::vector<Rdb_index_stats> sst_stats;
11314 Rdb_tbl_prop_coll::read_stats_from_tbl_props(it.second, &sst_stats);
11315 /*
11316 sst_stats is a list of index statistics for indexes that have entries
11317 in the current SST file.
11318 */
11319 for (const auto &it1 : sst_stats) {
11320 /*
11321 Only update statistics for indexes that belong to this SQL table.
11322
11323 The reason is: We are walking through all SST files that have
11324 entries from this table (and so can compute good statistics). For
11325 other SQL tables, it can be that we're only seeing a small fraction
11326 of table's entries (and so we can't update statistics based on that).
11327 */
11328 if (ids_to_check.find(it1.m_gl_index_id) == ids_to_check.end())
11329 continue;
11330
11331 auto kd = ddl_manager.safe_find(it1.m_gl_index_id);
11332 DBUG_ASSERT(kd != nullptr);
11333 stats[it1.m_gl_index_id].merge(it1, true, kd->max_storage_fmt_length());
11334 }
11335 num_sst++;
11336 }
11337
11338 // calculate memtable cardinality
11339 Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct);
11340 auto read_opts = rocksdb::ReadOptions();
11341 read_opts.read_tier = rocksdb::ReadTier::kMemtableTier;
11342 for (uint i = 0; i < table_arg->s->keys; i++) {
11343 const Rdb_key_def &kd = *m_key_descr_arr[i];
11344 Rdb_index_stats &stat = stats[kd.get_gl_index_id()];
11345
11346 uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
11347 auto r = get_range(i, r_buf);
11348 uint64_t memtableCount;
11349 uint64_t memtableSize;
11350 rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memtableCount,
11351 &memtableSize);
11352 if (memtableCount < (uint64_t)stat.m_rows / 10) {
11353 // skip tables that already have enough stats from SST files to reduce
11354 // overhead and avoid degradation of big tables stats by sampling from
11355 // relatively tiny (less than 10% of full data set) memtable dataset
11356 continue;
11357 }
11358
11359 std::unique_ptr<rocksdb::Iterator> it = std::unique_ptr<rocksdb::Iterator>(
11360 rdb->NewIterator(read_opts, kd.get_cf()));
11361
11362 uchar *first_key;
11363 uint key_size;
11364 if (is_pk(i, table, m_tbl_def)) {
11365 first_key = m_pk_packed_tuple;
11366 } else {
11367 first_key = m_sk_packed_tuple;
11368 }
11369 kd.get_first_key(first_key, &key_size);
11370 rocksdb::Slice first_index_key((const char *)first_key, key_size);
11371
11372 cardinality_collector.Reset();
11373 for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) {
11374 const rocksdb::Slice key = it->key();
11375 if (!kd.covers_key(key)) {
11376 break; // end of this index
11377 }
11378 stat.m_rows++;
11379
11380 cardinality_collector.ProcessKey(key, &kd, &stat);
11381 }
11382 cardinality_collector.AdjustStats(&stat);
11383 }
11384
11385 // set and persist new stats
11386 ddl_manager.set_stats(stats);
11387 ddl_manager.persist_stats(true);
11388
11389 DBUG_RETURN(HA_EXIT_SUCCESS);
11390}
11391
11392/*
11393 @return
11394 HA_ADMIN_OK OK
11395 other HA_ADMIN error code
11396*/
11397int ha_rocksdb::analyze(THD *const thd, HA_CHECK_OPT *const check_opt) {
11398 DBUG_ENTER_FUNC();
11399
11400 if (table && calculate_stats(table, thd, check_opt) != HA_EXIT_SUCCESS) {
11401 DBUG_RETURN(HA_ADMIN_FAILED);
11402 }
11403
11404 DBUG_RETURN(HA_ADMIN_OK);
11405}
11406
/**
  Reserve the next auto-increment value for an INSERT.

  @param off                auto-increment offset (first value in the series)
  @param inc                auto-increment increment (step between values)
  @param nb_desired_values  SQL layer's estimate of how many values the
                            statement will need (deliberately ignored here)
  @param[out] first_value         the value reserved for the caller
  @param[out] nb_reserved_values  always 1 — we reserve one value at a time
*/
void ha_rocksdb::get_auto_increment(ulonglong off, ulonglong inc,
                                    ulonglong nb_desired_values,
                                    ulonglong *const first_value,
                                    ulonglong *const nb_reserved_values) {
  /*
    MySQL has a somewhat complicated way of handling the auto-increment value.
    The first time get_auto_increment is called for a statement,
    nb_desired_values is the estimate for how many values will be needed. The
    engine can then reserve some values, and those will be automatically used
    by MySQL, until a hard-coded value shows up in the insert statement, after
    which MySQL again calls this function to reset its starting value.

    For simplicity we will just ignore nb_desired_values - we aren't going to
    reserve any extra values for a multi-insert statement. Each row will
    simply acquire the next value as needed and we will always tell MySQL that
    we only reserved 1 value. Since we are using an atomic value for
    m_auto_incr_val this should be safe - if we had to grab a mutex, doing
    an actual reserve of some values might be a better solution.
  */
  DEBUG_SYNC(ha_thd(), "rocksdb.autoinc_vars");

  // An offset larger than the increment cannot produce a valid series;
  // treat it as offset 1.
  if (off > inc) {
    off = 1;
  }

  Field *field;
  ulonglong new_val, max_val;
  field = table->key_info[table->s->next_number_index].key_part[0].field;
  // Largest value the auto-increment column's type can hold.
  max_val = rdb_get_int_col_max_value(field);

  // Local variable reference to simplify code below
  auto &auto_incr = m_tbl_def->m_auto_incr_val;

  if (inc == 1) {
    DBUG_ASSERT(off == 1);
    // Optimization for the standard case where we are always simply
    // incrementing from the last position

    // Use CAS operation in a loop to make sure we atomically get the next
    // auto increment value while ensuring that we don't wrap around to a
    // negative number.
    //
    // We set auto_incr to the min of max_val and new_val + 1. This means that
    // if we're at the maximum, we should be returning the same value for
    // multiple rows, resulting in duplicate key errors (as expected).
    //
    // If we return values greater than the max, the SQL layer will "truncate"
    // the value anyway, but it means that we store invalid values into
    // auto_incr that will be visible in SHOW CREATE TABLE.
    new_val = auto_incr;
    while (new_val != std::numeric_limits<ulonglong>::max()) {
      if (auto_incr.compare_exchange_weak(new_val,
                                          std::min(new_val + 1, max_val))) {
        break;
      }
    }
  } else {
    // The next value can be more complicated if either 'inc' or 'off' is not 1
    ulonglong last_val = auto_incr;

    // Loop until we can correctly update the atomic value
    do {
      DBUG_ASSERT(last_val > 0);
      // Calculate the next value in the auto increment series: offset
      // + N * increment where N is 0, 1, 2, ...
      //
      // For further information please visit:
      // http://dev.mysql.com/doc/refman/5.7/en/replication-options-master.html
      //
      // The following is confusing so here is an explanation:
      // To get the next number in the sequence above you subtract out the
      // offset, calculate the next sequence (N * increment) and then add the
      // offset back in.
      //
      // The additions are rearranged to avoid overflow. The following is
      // equivalent to (last_val - 1 + inc - off) / inc. This uses the fact
      // that (a+b)/c = a/c + b/c + (a%c + b%c)/c. To show why:
      //
      // (a+b)/c
      // = (a - a%c + a%c + b - b%c + b%c) / c
      // = (a - a%c) / c + (b - b%c) / c + (a%c + b%c) / c
      // = a/c + b/c + (a%c + b%c) / c
      //
      // Now, substitute a = last_val - 1, b = inc - off, c = inc to get the
      // following statement.
      ulonglong n =
          (last_val - 1) / inc + ((last_val - 1) % inc + inc - off) / inc;

      // Check if n * inc + off will overflow. This can only happen if we have
      // an UNSIGNED BIGINT field.
      if (n > (std::numeric_limits<ulonglong>::max() - off) / inc) {
        DBUG_ASSERT(max_val == std::numeric_limits<ulonglong>::max());
        // The 'last_val' value is already equal to or larger than the largest
        // value in the sequence. Continuing would wrap around (technically
        // the behavior would be undefined). What should we do?
        // We could:
        // 1) set the new value to the last possible number in our sequence
        // as described above. The problem with this is that this
        // number could be smaller than a value in an existing row.
        // 2) set the new value to the largest possible number. This number
        // may not be in our sequence, but it is guaranteed to be equal
        // to or larger than any other value already inserted.
        //
        // For now I'm going to take option 2.
        //
        // Returning ULLONG_MAX from get_auto_increment will cause the SQL
        // layer to fail with ER_AUTOINC_READ_FAILED. This means that due to
        // the SE API for get_auto_increment, inserts will fail with
        // ER_AUTOINC_READ_FAILED if the column is UNSIGNED BIGINT, but
        // inserts will fail with ER_DUP_ENTRY for other types (or no failure
        // if the column is in a non-unique SK).
        new_val = std::numeric_limits<ulonglong>::max();
        auto_incr = new_val;  // Store the largest value into auto_incr
        break;
      }

      new_val = n * inc + off;

      // Attempt to store the new value (plus 1 since m_auto_incr_val contains
      // the next available value) into the atomic value. If the current
      // value no longer matches what we have in 'last_val' this will fail and
      // we will repeat the loop (`last_val` will automatically get updated
      // with the current value).
      //
      // See above explanation for inc == 1 for why we use std::min.
    } while (!auto_incr.compare_exchange_weak(last_val,
                                              std::min(new_val + 1, max_val)));
  }

  *first_value = new_val;
  *nb_reserved_values = 1;
}
11539
11540#ifndef DBUG_OFF
11541
11542/* Debugger help function */
11543static char dbug_item_print_buf[512];
11544
11545const char *dbug_print_item(Item *const item) {
11546 char *const buf = dbug_item_print_buf;
11547 String str(buf, sizeof(dbug_item_print_buf), &my_charset_bin);
11548 str.length(0);
11549 if (!item)
11550 return "(Item*)nullptr";
11551 item->print(&str, QT_ORDINARY);
11552 if (str.c_ptr() == buf)
11553 return buf;
11554 else
11555 return "Couldn't fit into buffer";
11556}
11557
11558#endif /*DBUG_OFF*/
11559
11560/**
11561 SQL layer calls this function to push an index condition.
11562
11563 @details
11564 The condition is for index keyno (only one condition can be pushed at a
11565 time).
11566 The caller guarantees that condition refers only to index fields; besides
11567 that, fields must have
11568
11569 $field->part_of_key.set_bit(keyno)
11570
11571 which means that
11572
11573 (handler->index_flags(keyno, $keypart, 0) & HA_KEYREAD_ONLY) == 1
11574
11575 which means that field value can be restored from the index tuple.
11576
11577 @return
11578 Part of condition we couldn't check (always nullptr).
11579*/
11580
class Item *ha_rocksdb::idx_cond_push(uint keyno, class Item *const idx_cond) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(keyno != MAX_KEY);
  DBUG_ASSERT(idx_cond != nullptr);

  // Remember the pushed condition and which index it applies to, so that
  // index reads can evaluate it against the index tuple.
  pushed_idx_cond = idx_cond;
  pushed_idx_cond_keyno = keyno;
  in_range_check_pushed_down = TRUE;

  /* We will check the whole condition */
  DBUG_RETURN(nullptr);
}
11594
11595/*
11596 Checks if inplace alter is supported for a given operation.
11597*/
11598
11599my_core::enum_alter_inplace_result ha_rocksdb::check_if_supported_inplace_alter(
11600 TABLE *altered_table, my_core::Alter_inplace_info *const ha_alter_info) {
11601 DBUG_ENTER_FUNC();
11602
11603 DBUG_ASSERT(ha_alter_info != nullptr);
11604
11605 if (ha_alter_info->handler_flags &
11606 ~(ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
11607 ALTER_DROP_UNIQUE_INDEX |
11608 ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
11609 ALTER_PARTITIONED |
11610 ALTER_ADD_UNIQUE_INDEX |
11611 ALTER_CHANGE_CREATE_OPTION)) {
11612 DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
11613 }
11614
11615 /* We don't support unique keys on table w/ no primary keys */
11616 if ((ha_alter_info->handler_flags &
11617 ALTER_ADD_UNIQUE_INDEX) &&
11618 has_hidden_pk(altered_table)) {
11619 DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
11620 }
11621
11622 /* We only support changing auto_increment for table options. */
11623 if ((ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) &&
11624 !(ha_alter_info->create_info->used_fields & HA_CREATE_USED_AUTO)) {
11625 DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED);
11626 }
11627
11628 /* FIXME: MDEV-16099 Use alter algorithm=nocopy or algorithm=instant
11629 for non-InnoDB engine */
11630 DBUG_RETURN(my_core::HA_ALTER_INPLACE_COPY_LOCK);
11631}
11632
11633/**
11634 Allows the storage engine to update internal structures with concurrent
11635 writes blocked. If check_if_supported_inplace_alter() returns
11636 HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_COPY_LOCK.
11637 this function is called with exclusive lock otherwise the same level of
11638 locking as for inplace_alter_table() will be used.
11639
11640 @note Storage engines are responsible for reporting any errors by
11641 calling my_error()/print_error()
11642
11643 @note If this function reports error, commit_inplace_alter_table()
11644 will be called with commit= false.
11645
11646 @note For partitioning, failing to prepare one partition, means that
11647 commit_inplace_alter_table() will be called to roll back changes for
11648 all partitions. This means that commit_inplace_alter_table() might be
11649 called without prepare_inplace_alter_table() having been called first
11650 for a given partition.
11651
11652 @param altered_table TABLE object for new version of table.
11653 @param ha_alter_info Structure describing changes to be done
11654 by ALTER TABLE and holding data used
11655 during in-place alter.
11656
11657 @retval true Error
11658 @retval false Success
11659*/
bool ha_rocksdb::prepare_inplace_alter_table(
    TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);

  // State that will be handed to the commit/rollback phase via
  // Rdb_inplace_alter_ctx below.
  Rdb_tbl_def *new_tdef = nullptr;
  std::shared_ptr<Rdb_key_def> *old_key_descr = nullptr;
  std::shared_ptr<Rdb_key_def> *new_key_descr = nullptr;
  uint old_n_keys = m_tbl_def->m_key_count;
  uint new_n_keys = altered_table->s->keys;
  std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes;
  std::unordered_set<GL_INDEX_ID> dropped_index_ids;
  uint n_dropped_keys = 0;
  uint n_added_keys = 0;
  ulonglong max_auto_incr = 0;

  if (ha_alter_info->handler_flags &
      (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_DROP_UNIQUE_INDEX |
       ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {

    // A table without an explicit PK carries a hidden PK key definition
    // which is not counted in altered_table->s->keys.
    if (has_hidden_pk(altered_table)) {
      new_n_keys += 1;
    }

    const TABLE *const old_table = table;
    old_key_descr = m_tbl_def->m_key_descr_arr;
    new_key_descr = new std::shared_ptr<Rdb_key_def>[new_n_keys];

    new_tdef = new Rdb_tbl_def(m_tbl_def->full_tablename());
    new_tdef->m_key_descr_arr = new_key_descr;
    new_tdef->m_key_count = new_n_keys;
    // Carry the current auto-increment / hidden-pk counters over to the
    // new table definition.
    new_tdef->m_auto_incr_val =
        m_tbl_def->m_auto_incr_val.load(std::memory_order_relaxed);
    new_tdef->m_hidden_pk_val =
        m_tbl_def->m_hidden_pk_val.load(std::memory_order_relaxed);

    if (create_key_defs(altered_table, new_tdef, table, m_tbl_def)) {
      /* Delete the new key descriptors */
      delete[] new_key_descr;

      /*
        Explicitly mark as nullptr so we don't accidentally remove entries
        from data dictionary on cleanup (or cause double delete[]).
      */
      new_tdef->m_key_descr_arr = nullptr;
      delete new_tdef;

      my_error(ER_KEY_CREATE_DURING_ALTER, MYF(0));
      // HA_EXIT_FAILURE (1) converts to 'true' = error for this bool API.
      DBUG_RETURN(HA_EXIT_FAILURE);
    }

    uint i;
    uint j;

    /* Determine which (if any) key definition(s) need to be dropped */
    for (i = 0; i < ha_alter_info->index_drop_count; i++) {
      const KEY *const dropped_key = ha_alter_info->index_drop_buffer[i];
      for (j = 0; j < old_n_keys; j++) {
        const KEY *const old_key =
            &old_table->key_info[old_key_descr[j]->get_keyno()];

        // compare_keys() returns 0 when the two keys match.
        if (!compare_keys(old_key, dropped_key)) {
          dropped_index_ids.insert(old_key_descr[j]->get_gl_index_id());
          break;
        }
      }
    }

    /* Determine which (if any) key definition(s) need to be added */
    int identical_indexes_found = 0;
    for (i = 0; i < ha_alter_info->index_add_count; i++) {
      const KEY *const added_key =
          &ha_alter_info->key_info_buffer[ha_alter_info->index_add_buffer[i]];
      for (j = 0; j < new_n_keys; j++) {
        const KEY *const new_key =
            &altered_table->key_info[new_key_descr[j]->get_keyno()];
        if (!compare_keys(new_key, added_key)) {
          /*
            Check for cases where an 'identical' index is being dropped and
            re-added in a single ALTER statement. Turn this into a no-op as the
            index has not changed.

            E.G. Unique index -> non-unique index requires no change

            Note that cases where the index name remains the same but the
            key-parts are changed is already handled in create_inplace_key_defs.
            In these cases the index needs to be rebuilt.
          */
          if (dropped_index_ids.count(new_key_descr[j]->get_gl_index_id())) {
            dropped_index_ids.erase(new_key_descr[j]->get_gl_index_id());
            identical_indexes_found++;
          } else {
            added_indexes.insert(new_key_descr[j]);
          }

          break;
        }
      }
    }

    n_dropped_keys = ha_alter_info->index_drop_count - identical_indexes_found;
    n_added_keys = ha_alter_info->index_add_count - identical_indexes_found;
    // Sanity: the bookkeeping above must stay internally consistent.
    DBUG_ASSERT(dropped_index_ids.size() == n_dropped_keys);
    DBUG_ASSERT(added_indexes.size() == n_added_keys);
    DBUG_ASSERT(new_n_keys == (old_n_keys - n_dropped_keys + n_added_keys));
  }
  if (ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) {
    // Auto-increment-only ALTER: reuse the current table definition if no
    // index changes created a new one above.
    if (!new_tdef) {
      new_tdef = m_tbl_def;
    }
    if (table->found_next_number_field) {
      max_auto_incr = load_auto_incr_value_from_index();
    }
  }

  // Ownership of new_tdef / new_key_descr passes to the context object,
  // which commit_inplace_alter_table() is responsible for cleaning up.
  ha_alter_info->handler_ctx = new Rdb_inplace_alter_ctx(
      new_tdef, old_key_descr, new_key_descr, old_n_keys, new_n_keys,
      added_indexes, dropped_index_ids, n_added_keys, n_dropped_keys,
      max_auto_incr);
  DBUG_RETURN(HA_EXIT_SUCCESS);
}
11786
11787/**
11788 Alter the table structure in-place with operations specified using
11789 HA_ALTER_FLAGS and Alter_inplace_info. The level of concurrency allowed
11790 during this operation depends on the return value from
11791 check_if_supported_inplace_alter().
11792
11793 @note Storage engines are responsible for reporting any errors by
11794 calling my_error()/print_error()
11795
11796 @note If this function reports error, commit_inplace_alter_table()
11797 will be called with commit= false.
11798
11799 @param altered_table TABLE object for new version of table.
11800 @param ha_alter_info Structure describing changes to be done
11801 by ALTER TABLE and holding data used
11802 during in-place alter.
11803
11804 @retval true Error
11805 @retval false Success
11806*/
bool ha_rocksdb::inplace_alter_table(
    TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);
  DBUG_ASSERT(ha_alter_info->handler_ctx != nullptr);

  // Context created earlier by prepare_inplace_alter_table().
  Rdb_inplace_alter_ctx *const ctx =
      static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);

  if (ha_alter_info->handler_flags &
      (ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {
    /*
      Buffers need to be set up again to account for new, possibly longer
      secondary keys.
    */
    free_key_buffers();

    DBUG_ASSERT(ctx != nullptr);

    /*
      If adding unique index, allocate special buffers for duplicate checking.
    */
    int err;
    if ((err = alloc_key_buffers(
             altered_table, ctx->m_new_tdef,
             ha_alter_info->handler_flags &
                 ALTER_ADD_UNIQUE_INDEX))) {
      my_error(ER_OUT_OF_RESOURCES, MYF(0));
      // Non-zero err converts to 'true' = error for this bool API.
      DBUG_RETURN(err);
    }

    /* Populate all new secondary keys by scanning the primary key. */
    if ((err = inplace_populate_sk(altered_table, ctx->m_added_indexes))) {
      my_error(ER_SK_POPULATE_DURING_ALTER, MYF(0));
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  }

  // Debug hook: force a failure to exercise the rollback path.
  DBUG_EXECUTE_IF("myrocks_simulate_index_create_rollback", {
    dbug_create_err_inplace_alter();
    DBUG_RETURN(HA_EXIT_FAILURE);
  };);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
11856
11857/**
11858 Scan the Primary Key index entries and populate the new secondary keys.
11859*/
11860int ha_rocksdb::inplace_populate_sk(
11861 TABLE *const new_table_arg,
11862 const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes) {
11863 DBUG_ENTER_FUNC();
11864 int res = HA_EXIT_SUCCESS;
11865 const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
11866 rocksdb::WriteBatch *const batch = wb.get();
11867
11868 /* Update the data dictionary */
11869 std::unordered_set<GL_INDEX_ID> create_index_ids;
11870 for (const auto &index : indexes) {
11871 create_index_ids.insert(index->get_gl_index_id());
11872 }
11873 dict_manager.add_create_index(create_index_ids, batch);
11874 res = dict_manager.commit(batch);
11875 if (res != HA_EXIT_SUCCESS) {
11876 return res;
11877 }
11878
11879 /*
11880 Add uncommitted key definitons to ddl_manager. We need to do this
11881 so that the property collector can find this keydef when it needs to
11882 update stats. The property collector looks for the keydef in the
11883 data dictionary, but it won't be there yet since this key definition
11884 is still in the creation process.
11885 */
11886 ddl_manager.add_uncommitted_keydefs(indexes);
11887
11888 const bool hidden_pk_exists = has_hidden_pk(table);
11889
11890 Rdb_transaction *tx = get_or_create_tx(table->in_use);
11891
11892 /*
11893 There is one specific scenario where m_sst_info may not be nullptr. This
11894 happens if the handler we're using happens to be the handler where the PK
11895 bulk load was done on. The sequence of events that lead to this is as
11896 follows (T1 is PK bulk load, T2 is SK alter table):
11897
11898 T1: Execute last INSERT statement
11899 T1: Return TABLE and handler object back to Table_cache_manager
11900 T1: Close connection
11901 T2: Execute ALTER statement
11902 T2: Take same TABLE/handler from Table_cache_manager
11903 T2: Call closefrm which will call finalize_bulk_load on every other open
11904 table/handler *except* the one it's on.
11905 T2: Acquire stale snapshot of PK
11906 T1: Call finalize_bulk_load
11907
11908 This is rare because usually, closefrm will call the destructor (and thus
11909 finalize_bulk_load) on the handler where PK bulk load is done. However, if
11910 the thread ids of the bulk load thread and the alter thread differ by a
11911 multiple of table_cache_instances (8 by default), then they hash to the
11912 same bucket in Table_cache_manager and the alter thread will not not call
11913 the destructor on the handler it is holding. Thus, its m_sst_info will not
11914 be nullptr.
11915
11916 At this point, it is safe to refresh the snapshot because we know all other
11917 open handlers have been closed at this point, and the one we're on is the
11918 only one left.
11919 */
11920 if (m_sst_info) {
11921 if ((res = finalize_bulk_load())) {
11922 DBUG_RETURN(res);
11923 }
11924 tx->commit();
11925 }
11926
11927 const ulonglong rdb_merge_buf_size = THDVAR(ha_thd(), merge_buf_size);
11928 const ulonglong rdb_merge_combine_read_size =
11929 THDVAR(ha_thd(), merge_combine_read_size);
11930 const ulonglong rdb_merge_tmp_file_removal_delay =
11931 THDVAR(ha_thd(), merge_tmp_file_removal_delay_ms);
11932
11933 for (const auto &index : indexes) {
11934 bool is_unique_index =
11935 new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME;
11936
11937 Rdb_index_merge rdb_merge(tx->get_rocksdb_tmpdir(), rdb_merge_buf_size,
11938 rdb_merge_combine_read_size,
11939 rdb_merge_tmp_file_removal_delay,
11940 index->get_cf());
11941
11942 if ((res = rdb_merge.init())) {
11943 DBUG_RETURN(res);
11944 }
11945
11946 /*
11947 Note: We pass in the currently existing table + tbl_def object here,
11948 as the pk index position may have changed in the case of hidden primary
11949 keys.
11950 */
11951 const uint pk = pk_index(table, m_tbl_def);
11952 ha_index_init(pk, true);
11953
11954 /* Scan each record in the primary key in order */
11955 for (res = index_first(table->record[0]); res == 0;
11956 res = index_next(table->record[0])) {
11957 longlong hidden_pk_id = 0;
11958 if (hidden_pk_exists &&
11959 (res = read_hidden_pk_id_from_rowkey(&hidden_pk_id))) {
11960 // NO_LINT_DEBUG
11961 sql_print_error("Error retrieving hidden pk id.");
11962 ha_index_end();
11963 DBUG_RETURN(res);
11964 }
11965
11966 /* Create new secondary index entry */
11967 const int new_packed_size = index->pack_record(
11968 new_table_arg, m_pack_buffer, table->record[0], m_sk_packed_tuple,
11969 &m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id, 0,
11970 nullptr, nullptr, m_ttl_bytes);
11971
11972 const rocksdb::Slice key = rocksdb::Slice(
11973 reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
11974 const rocksdb::Slice val =
11975 rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
11976 m_sk_tails.get_current_pos());
11977
11978 /*
11979 Add record to offset tree in preparation for writing out to
11980 disk in sorted chunks.
11981 */
11982 if ((res = rdb_merge.add(key, val))) {
11983 ha_index_end();
11984 DBUG_RETURN(res);
11985 }
11986 }
11987
11988 if (res != HA_ERR_END_OF_FILE) {
11989 // NO_LINT_DEBUG
11990 sql_print_error("Error retrieving index entry from primary key.");
11991 ha_index_end();
11992 DBUG_RETURN(res);
11993 }
11994
11995 ha_index_end();
11996
11997 /*
11998 Perform an n-way merge of n sorted buffers on disk, then writes all
11999 results to RocksDB via SSTFileWriter API.
12000 */
12001 rocksdb::Slice merge_key;
12002 rocksdb::Slice merge_val;
12003
12004 struct unique_sk_buf_info sk_info;
12005 sk_info.dup_sk_buf = m_dup_sk_packed_tuple;
12006 sk_info.dup_sk_buf_old = m_dup_sk_packed_tuple_old;
12007
12008 while ((res = rdb_merge.next(&merge_key, &merge_val)) == 0) {
12009 /* Perform uniqueness check if needed */
12010 if (is_unique_index) {
12011 if (check_duplicate_sk(new_table_arg, *index, &merge_key, &sk_info)) {
12012 /*
12013 Duplicate entry found when trying to create unique secondary key.
12014 We need to unpack the record into new_table_arg->record[0] as it
12015 is used inside print_keydup_error so that the error message shows
12016 the duplicate record.
12017 */
12018 if (index->unpack_record(new_table_arg, new_table_arg->record[0],
12019 &merge_key, nullptr,
12020 m_verify_row_debug_checksums)) {
12021 /* Should never reach here */
12022 DBUG_ASSERT(0);
12023 }
12024
12025 print_keydup_error(new_table_arg,
12026 &new_table_arg->key_info[index->get_keyno()],
12027 MYF(0));
12028 DBUG_RETURN(ER_DUP_ENTRY);
12029 }
12030 }
12031
12032 /*
12033 Insert key and slice to SST via SSTFileWriter API.
12034 */
12035 if ((res = bulk_load_key(tx, *index, merge_key, merge_val, false))) {
12036 break;
12037 }
12038 }
12039
12040 /*
12041 Here, res == -1 means that we are finished, while > 0 means an error
12042 occurred.
12043 */
12044 if (res > 0) {
12045 // NO_LINT_DEBUG
12046 sql_print_error("Error while bulk loading keys in external merge sort.");
12047 DBUG_RETURN(res);
12048 }
12049
12050 if ((res = tx->finish_bulk_load())) {
12051 // NO_LINT_DEBUG
12052 sql_print_error("Error finishing bulk load.");
12053 DBUG_RETURN(res);
12054 }
12055 }
12056
12057 /*
12058 Explicitly tell jemalloc to clean up any unused dirty pages at this point.
12059 See https://reviews.facebook.net/D63723 for more details.
12060 */
12061 purge_all_jemalloc_arenas();
12062
12063 DBUG_EXECUTE_IF("crash_during_online_index_creation", DBUG_SUICIDE(););
12064 DBUG_RETURN(res);
12065}
12066
12067/**
12068 Commit or rollback the changes made during prepare_inplace_alter_table()
12069 and inplace_alter_table() inside the storage engine.
12070 Note that in case of rollback the allowed level of concurrency during
12071 this operation will be the same as for inplace_alter_table() and thus
12072 might be higher than during prepare_inplace_alter_table(). (For example,
12073 concurrent writes were blocked during prepare, but might not be during
12074 rollback).
12075
12076 @note Storage engines are responsible for reporting any errors by
12077 calling my_error()/print_error()
12078
12079 @note If this function with commit= true reports error, it will be called
12080 again with commit= false.
12081
12082 @note In case of partitioning, this function might be called for rollback
12083 without prepare_inplace_alter_table() having been called first.
12084 Also partitioned tables sets ha_alter_info->group_commit_ctx to a NULL
12085 terminated array of the partitions handlers and if all of them are
12086 committed as one, then group_commit_ctx should be set to NULL to indicate
12087 to the partitioning handler that all partitions handlers are committed.
12088 @see prepare_inplace_alter_table().
12089
12090 @param altered_table TABLE object for new version of table.
12091 @param ha_alter_info Structure describing changes to be done
12092 by ALTER TABLE and holding data used
12093 during in-place alter.
12094 @param commit True => Commit, False => Rollback.
12095
12096 @retval true Error
12097 @retval false Success
12098*/
bool ha_rocksdb::commit_inplace_alter_table(
    my_core::TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info, bool commit) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);

  /*
    Per-ALTER context created during the earlier inplace-alter phases.
    May be nullptr on rollback if the alter failed before the context
    was set up.
  */
  Rdb_inplace_alter_ctx *const ctx0 =
      static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);

  DEBUG_SYNC(ha_thd(), "rocksdb.commit_in_place_alter_table");

  /*
    IMPORTANT: When rollback is requested, mysql will abort with
    an assertion failure. That means every failed commit during inplace alter
    table will result in a fatal error on the server. Indexes ongoing creation
    will be detected when the server restarts, and dropped.

    For partitioned tables, a rollback call to this function (commit == false)
    is done for each partition. A successful commit call only executes once
    for all partitions.
  */
  if (!commit) {
    /* If ctx has not been created yet, nothing to do here */
    if (!ctx0) {
      DBUG_RETURN(HA_EXIT_SUCCESS);
    }

    /*
      Cannot call destructor for Rdb_tbl_def directly because we don't want to
      erase the mappings inside the ddl_manager, as the old_key_descr is still
      using them.
    */
    if (ctx0->m_new_key_descr) {
      /* Delete the new key descriptors */
      for (uint i = 0; i < ctx0->m_new_tdef->m_key_count; i++) {
        ctx0->m_new_key_descr[i] = nullptr;
      }

      delete[] ctx0->m_new_key_descr;
      ctx0->m_new_key_descr = nullptr;
      /* Detach the array so ~Rdb_tbl_def does not free it a second time. */
      ctx0->m_new_tdef->m_key_descr_arr = nullptr;

      delete ctx0->m_new_tdef;
    }

    /* Remove uncommitted key definitons from ddl_manager */
    ddl_manager.remove_uncommitted_keydefs(ctx0->m_added_indexes);

    /* Rollback any partially created indexes */
    dict_manager.rollback_ongoing_index_creation();

    DBUG_RETURN(HA_EXIT_SUCCESS);
  }

  DBUG_ASSERT(ctx0);

  /*
    For partitioned tables, we need to commit all changes to all tables at
    once, unlike in the other inplace alter API methods.
  */
  inplace_alter_handler_ctx **ctx_array;
  inplace_alter_handler_ctx *ctx_single[2];

  if (ha_alter_info->group_commit_ctx) {
    DBUG_EXECUTE_IF("crash_during_index_creation_partition", DBUG_SUICIDE(););
    ctx_array = ha_alter_info->group_commit_ctx;
  } else {
    /* Non-partitioned table: commit just this handler's context. */
    ctx_single[0] = ctx0;
    ctx_single[1] = nullptr;
    ctx_array = ctx_single;
  }

  DBUG_ASSERT(ctx0 == ctx_array[0]);
  /* Clear it so the server does not hand the contexts to us again. */
  ha_alter_info->group_commit_ctx = nullptr;

  /* Persist index additions/drops atomically via one dictionary batch. */
  if (ha_alter_info->handler_flags &
      (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_DROP_UNIQUE_INDEX |
       ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {
    const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
    rocksdb::WriteBatch *const batch = wb.get();
    std::unordered_set<GL_INDEX_ID> create_index_ids;

    /* Switch this handler over to the new table definition. */
    m_tbl_def = ctx0->m_new_tdef;
    m_key_descr_arr = m_tbl_def->m_key_descr_arr;
    m_pk_descr = m_key_descr_arr[pk_index(altered_table, m_tbl_def)];

    dict_manager.lock();
    for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
      Rdb_inplace_alter_ctx *const ctx =
          static_cast<Rdb_inplace_alter_ctx *>(*pctx);

      /* Mark indexes to be dropped */
      dict_manager.add_drop_index(ctx->m_dropped_index_ids, batch);

      for (const auto &index : ctx->m_added_indexes) {
        create_index_ids.insert(index->get_gl_index_id());
      }

      if (ddl_manager.put_and_write(ctx->m_new_tdef, batch)) {
        /*
          Failed to write new entry into data dictionary, this should never
          happen.
        */
        DBUG_ASSERT(0);
      }

      /*
        Remove uncommitted key definitons from ddl_manager, as they are now
        committed into the data dictionary.
      */
      ddl_manager.remove_uncommitted_keydefs(ctx->m_added_indexes);
    }

    if (dict_manager.commit(batch)) {
      /*
        Should never reach here. We assume MyRocks will abort if commit fails.
      */
      DBUG_ASSERT(0);
    }

    dict_manager.unlock();

    /* Mark ongoing create indexes as finished/remove from data dictionary */
    dict_manager.finish_indexes_operation(
        create_index_ids, Rdb_key_def::DDL_CREATE_INDEX_ONGOING);

    /*
      We need to recalculate the index stats here manually.  The reason is that
      the secondary index does not exist inside
      m_index_num_to_keydef until it is committed to the data dictionary, which
      prevents us from updating the stats normally as the ddl_manager cannot
      find the proper gl_index_ids yet during adjust_stats calls.
    */
    if (calculate_stats(altered_table, nullptr, nullptr)) {
      /* Failed to update index statistics, should never happen */
      DBUG_ASSERT(0);
    }

    /* Wake the background thread that physically deletes dropped indexes. */
    rdb_drop_idx_thread.signal();
  }

  /* Persist the (possibly raised) auto-increment value. */
  if (ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) {
    const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
    rocksdb::WriteBatch *const batch = wb.get();
    std::unordered_set<GL_INDEX_ID> create_index_ids;

    ulonglong auto_incr_val = ha_alter_info->create_info->auto_increment_value;

    for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
      Rdb_inplace_alter_ctx *const ctx =
          static_cast<Rdb_inplace_alter_ctx *>(*pctx);
      /* Never lower the counter below what any partition already reached. */
      auto_incr_val = std::max(auto_incr_val, ctx->m_max_auto_incr);
      dict_manager.put_auto_incr_val(
          batch, ctx->m_new_tdef->get_autoincr_gl_index_id(), auto_incr_val,
          true /* overwrite */);
      ctx->m_new_tdef->m_auto_incr_val = auto_incr_val;
    }

    if (dict_manager.commit(batch)) {
      DBUG_ASSERT(0);
    }
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12268
/* Name of the SHOW_FUNC callback generated for a given RocksDB ticker. */
#define SHOW_FNAME(name) rocksdb_show_##name

/*
  Generates a SHOW_FUNC that copies the current value of the RocksDB ticker
  rocksdb::key into rocksdb_status_counters.name and exposes that field as a
  SHOW_LONGLONG status variable.
*/
#define DEF_SHOW_FUNC(name, key)                                           \
  static int SHOW_FNAME(name)(MYSQL_THD thd, SHOW_VAR * var, char *buff) { \
    rocksdb_status_counters.name =                                         \
        rocksdb_stats->getTickerCount(rocksdb::key);                       \
    var->type = SHOW_LONGLONG;                                             \
    var->value = (char *)&rocksdb_status_counters.name;                    \
    return HA_EXIT_SUCCESS;                                                \
  }

/* Status-variable entry backed by a DEF_SHOW_FUNC-generated callback. */
#define DEF_STATUS_VAR(name) \
  { "rocksdb_" #name, (char *)&SHOW_FNAME(name), SHOW_FUNC }

/* Status-variable entry that points directly at an existing counter. */
#define DEF_STATUS_VAR_PTR(name, ptr, option) \
  { "rocksdb_" name, (char *)ptr, option }

/* Status-variable entry with an explicit (un-prefixed) name. */
#define DEF_STATUS_VAR_FUNC(name, ptr, option) \
  { name, reinterpret_cast<char *>(ptr), option }
12288
/*
  Snapshot of RocksDB ticker values. Each field is refreshed from
  rocksdb_stats by the corresponding DEF_SHOW_FUNC-generated callback just
  before being exposed as a SHOW_LONGLONG status variable.
*/
struct rocksdb_status_counters_t {
  uint64_t block_cache_miss;
  uint64_t block_cache_hit;
  uint64_t block_cache_add;
  uint64_t block_cache_add_failures;
  uint64_t block_cache_index_miss;
  uint64_t block_cache_index_hit;
  uint64_t block_cache_index_add;
  uint64_t block_cache_index_bytes_insert;
  uint64_t block_cache_index_bytes_evict;
  uint64_t block_cache_filter_miss;
  uint64_t block_cache_filter_hit;
  uint64_t block_cache_filter_add;
  uint64_t block_cache_filter_bytes_insert;
  uint64_t block_cache_filter_bytes_evict;
  uint64_t block_cache_bytes_read;
  uint64_t block_cache_bytes_write;
  uint64_t block_cache_data_bytes_insert;
  uint64_t block_cache_data_miss;
  uint64_t block_cache_data_hit;
  uint64_t block_cache_data_add;
  uint64_t bloom_filter_useful;
  uint64_t memtable_hit;
  uint64_t memtable_miss;
  uint64_t get_hit_l0;
  uint64_t get_hit_l1;
  uint64_t get_hit_l2_and_up;
  uint64_t compaction_key_drop_new;
  uint64_t compaction_key_drop_obsolete;
  uint64_t compaction_key_drop_user;
  uint64_t number_keys_written;
  uint64_t number_keys_read;
  uint64_t number_keys_updated;
  uint64_t bytes_written;
  uint64_t bytes_read;
  uint64_t number_db_seek;
  uint64_t number_db_seek_found;
  uint64_t number_db_next;
  uint64_t number_db_next_found;
  uint64_t number_db_prev;
  uint64_t number_db_prev_found;
  uint64_t iter_bytes_read;
  uint64_t no_file_closes;
  uint64_t no_file_opens;
  uint64_t no_file_errors;
  uint64_t stall_micros;
  uint64_t num_iterators;
  uint64_t number_multiget_get;
  uint64_t number_multiget_keys_read;
  uint64_t number_multiget_bytes_read;
  uint64_t number_deletes_filtered;
  uint64_t number_merge_failures;
  uint64_t bloom_filter_prefix_checked;
  uint64_t bloom_filter_prefix_useful;
  uint64_t number_reseeks_iteration;
  uint64_t getupdatessince_calls;
  uint64_t block_cachecompressed_miss;
  uint64_t block_cachecompressed_hit;
  uint64_t wal_synced;
  uint64_t wal_bytes;
  uint64_t write_self;
  uint64_t write_other;
  uint64_t write_timedout;
  uint64_t write_wal;
  uint64_t flush_write_bytes;
  uint64_t compact_read_bytes;
  uint64_t compact_write_bytes;
  uint64_t number_superversion_acquires;
  uint64_t number_superversion_releases;
  uint64_t number_superversion_cleanups;
  uint64_t number_block_not_compressed;
};

/* Single global instance backing all the generated SHOW functions. */
static rocksdb_status_counters_t rocksdb_status_counters;
12363
/*
  One generated SHOW function per exported RocksDB ticker. The first macro
  argument is both the rocksdb_status_counters_t field and the suffix of the
  "rocksdb_*" status-variable name; the second is the rocksdb::Tickers enum
  member the value is read from.
*/
DEF_SHOW_FUNC(block_cache_miss, BLOCK_CACHE_MISS)
DEF_SHOW_FUNC(block_cache_hit, BLOCK_CACHE_HIT)
DEF_SHOW_FUNC(block_cache_add, BLOCK_CACHE_ADD)
DEF_SHOW_FUNC(block_cache_add_failures, BLOCK_CACHE_ADD_FAILURES)
DEF_SHOW_FUNC(block_cache_index_miss, BLOCK_CACHE_INDEX_MISS)
DEF_SHOW_FUNC(block_cache_index_hit, BLOCK_CACHE_INDEX_HIT)
DEF_SHOW_FUNC(block_cache_index_add, BLOCK_CACHE_INDEX_ADD)
DEF_SHOW_FUNC(block_cache_index_bytes_insert, BLOCK_CACHE_INDEX_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_index_bytes_evict, BLOCK_CACHE_INDEX_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_filter_miss, BLOCK_CACHE_FILTER_MISS)
DEF_SHOW_FUNC(block_cache_filter_hit, BLOCK_CACHE_FILTER_HIT)
DEF_SHOW_FUNC(block_cache_filter_add, BLOCK_CACHE_FILTER_ADD)
DEF_SHOW_FUNC(block_cache_filter_bytes_insert, BLOCK_CACHE_FILTER_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_filter_bytes_evict, BLOCK_CACHE_FILTER_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_bytes_read, BLOCK_CACHE_BYTES_READ)
DEF_SHOW_FUNC(block_cache_bytes_write, BLOCK_CACHE_BYTES_WRITE)
DEF_SHOW_FUNC(block_cache_data_bytes_insert, BLOCK_CACHE_DATA_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_data_miss, BLOCK_CACHE_DATA_MISS)
DEF_SHOW_FUNC(block_cache_data_hit, BLOCK_CACHE_DATA_HIT)
DEF_SHOW_FUNC(block_cache_data_add, BLOCK_CACHE_DATA_ADD)
DEF_SHOW_FUNC(bloom_filter_useful, BLOOM_FILTER_USEFUL)
DEF_SHOW_FUNC(memtable_hit, MEMTABLE_HIT)
DEF_SHOW_FUNC(memtable_miss, MEMTABLE_MISS)
DEF_SHOW_FUNC(get_hit_l0, GET_HIT_L0)
DEF_SHOW_FUNC(get_hit_l1, GET_HIT_L1)
DEF_SHOW_FUNC(get_hit_l2_and_up, GET_HIT_L2_AND_UP)
DEF_SHOW_FUNC(compaction_key_drop_new, COMPACTION_KEY_DROP_NEWER_ENTRY)
DEF_SHOW_FUNC(compaction_key_drop_obsolete, COMPACTION_KEY_DROP_OBSOLETE)
DEF_SHOW_FUNC(compaction_key_drop_user, COMPACTION_KEY_DROP_USER)
DEF_SHOW_FUNC(number_keys_written, NUMBER_KEYS_WRITTEN)
DEF_SHOW_FUNC(number_keys_read, NUMBER_KEYS_READ)
DEF_SHOW_FUNC(number_keys_updated, NUMBER_KEYS_UPDATED)
DEF_SHOW_FUNC(bytes_written, BYTES_WRITTEN)
DEF_SHOW_FUNC(bytes_read, BYTES_READ)
DEF_SHOW_FUNC(number_db_seek, NUMBER_DB_SEEK)
DEF_SHOW_FUNC(number_db_seek_found, NUMBER_DB_SEEK_FOUND)
DEF_SHOW_FUNC(number_db_next, NUMBER_DB_NEXT)
DEF_SHOW_FUNC(number_db_next_found, NUMBER_DB_NEXT_FOUND)
DEF_SHOW_FUNC(number_db_prev, NUMBER_DB_PREV)
DEF_SHOW_FUNC(number_db_prev_found, NUMBER_DB_PREV_FOUND)
DEF_SHOW_FUNC(iter_bytes_read, ITER_BYTES_READ)
DEF_SHOW_FUNC(no_file_closes, NO_FILE_CLOSES)
DEF_SHOW_FUNC(no_file_opens, NO_FILE_OPENS)
DEF_SHOW_FUNC(no_file_errors, NO_FILE_ERRORS)
DEF_SHOW_FUNC(stall_micros, STALL_MICROS)
DEF_SHOW_FUNC(num_iterators, NO_ITERATORS)
DEF_SHOW_FUNC(number_multiget_get, NUMBER_MULTIGET_CALLS)
DEF_SHOW_FUNC(number_multiget_keys_read, NUMBER_MULTIGET_KEYS_READ)
DEF_SHOW_FUNC(number_multiget_bytes_read, NUMBER_MULTIGET_BYTES_READ)
DEF_SHOW_FUNC(number_deletes_filtered, NUMBER_FILTERED_DELETES)
DEF_SHOW_FUNC(number_merge_failures, NUMBER_MERGE_FAILURES)
DEF_SHOW_FUNC(bloom_filter_prefix_checked, BLOOM_FILTER_PREFIX_CHECKED)
DEF_SHOW_FUNC(bloom_filter_prefix_useful, BLOOM_FILTER_PREFIX_USEFUL)
DEF_SHOW_FUNC(number_reseeks_iteration, NUMBER_OF_RESEEKS_IN_ITERATION)
DEF_SHOW_FUNC(getupdatessince_calls, GET_UPDATES_SINCE_CALLS)
DEF_SHOW_FUNC(block_cachecompressed_miss, BLOCK_CACHE_COMPRESSED_MISS)
DEF_SHOW_FUNC(block_cachecompressed_hit, BLOCK_CACHE_COMPRESSED_HIT)
DEF_SHOW_FUNC(wal_synced, WAL_FILE_SYNCED)
DEF_SHOW_FUNC(wal_bytes, WAL_FILE_BYTES)
DEF_SHOW_FUNC(write_self, WRITE_DONE_BY_SELF)
DEF_SHOW_FUNC(write_other, WRITE_DONE_BY_OTHER)
DEF_SHOW_FUNC(write_timedout, WRITE_TIMEDOUT)
DEF_SHOW_FUNC(write_wal, WRITE_WITH_WAL)
DEF_SHOW_FUNC(flush_write_bytes, FLUSH_WRITE_BYTES)
DEF_SHOW_FUNC(compact_read_bytes, COMPACT_READ_BYTES)
DEF_SHOW_FUNC(compact_write_bytes, COMPACT_WRITE_BYTES)
DEF_SHOW_FUNC(number_superversion_acquires, NUMBER_SUPERVERSION_ACQUIRES)
DEF_SHOW_FUNC(number_superversion_releases, NUMBER_SUPERVERSION_RELEASES)
DEF_SHOW_FUNC(number_superversion_cleanups, NUMBER_SUPERVERSION_CLEANUPS)
DEF_SHOW_FUNC(number_block_not_compressed, NUMBER_BLOCK_NOT_COMPRESSED)
12434
12435static void myrocks_update_status() {
12436 export_stats.rows_deleted = global_stats.rows[ROWS_DELETED];
12437 export_stats.rows_inserted = global_stats.rows[ROWS_INSERTED];
12438 export_stats.rows_read = global_stats.rows[ROWS_READ];
12439 export_stats.rows_updated = global_stats.rows[ROWS_UPDATED];
12440 export_stats.rows_deleted_blind = global_stats.rows[ROWS_DELETED_BLIND];
12441 export_stats.rows_expired = global_stats.rows[ROWS_EXPIRED];
12442 export_stats.rows_filtered = global_stats.rows[ROWS_FILTERED];
12443
12444 export_stats.system_rows_deleted = global_stats.system_rows[ROWS_DELETED];
12445 export_stats.system_rows_inserted = global_stats.system_rows[ROWS_INSERTED];
12446 export_stats.system_rows_read = global_stats.system_rows[ROWS_READ];
12447 export_stats.system_rows_updated = global_stats.system_rows[ROWS_UPDATED];
12448
12449 export_stats.queries_point = global_stats.queries[QUERIES_POINT];
12450 export_stats.queries_range = global_stats.queries[QUERIES_RANGE];
12451
12452 export_stats.covered_secondary_key_lookups =
12453 global_stats.covered_secondary_key_lookups;
12454}
12455
12456static void myrocks_update_memory_status() {
12457 std::vector<rocksdb::DB *> dbs;
12458 std::unordered_set<const rocksdb::Cache *> cache_set;
12459 dbs.push_back(rdb);
12460 std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type;
12461 rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set,
12462 &temp_usage_by_type);
12463 memory_stats.memtable_total =
12464 temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal];
12465 memory_stats.memtable_unflushed =
12466 temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed];
12467}
12468
/*
  Status variables exposed under the nested "rocksdb" SHOW_ARRAY (see
  show_myrocks_vars). The backing export_stats / memory_stats fields are
  refreshed immediately before this array is read.
*/
static SHOW_VAR myrocks_status_variables[] = {
    DEF_STATUS_VAR_FUNC("rows_deleted", &export_stats.rows_deleted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_inserted", &export_stats.rows_inserted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_read", &export_stats.rows_read, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_updated", &export_stats.rows_updated,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_deleted_blind", &export_stats.rows_deleted_blind,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_expired", &export_stats.rows_expired,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_filtered", &export_stats.rows_filtered,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_deleted",
                        &export_stats.system_rows_deleted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_inserted",
                        &export_stats.system_rows_inserted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_read", &export_stats.system_rows_read,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_updated",
                        &export_stats.system_rows_updated, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_total", &memory_stats.memtable_total,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_unflushed", &memory_stats.memtable_unflushed,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_point", &export_stats.queries_point,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_range", &export_stats.queries_range,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("covered_secondary_key_lookups",
                        &export_stats.covered_secondary_key_lookups,
                        SHOW_LONGLONG),

    /* end-of-array marker */
    {NullS, NullS, SHOW_LONG}};
12504
/*
  SHOW_FUNC handler for the "rocksdb" status-variable group: refreshes the
  cached counters and exposes myrocks_status_variables as a nested array.
*/
static void show_myrocks_vars(THD *thd, SHOW_VAR *var, char *buff) {
  myrocks_update_status();
  myrocks_update_memory_status();
  var->type = SHOW_ARRAY;
  var->value = reinterpret_cast<char *>(&myrocks_status_variables);
}
12511
12512static ulonglong
12513io_stall_prop_value(const std::map<std::string, std::string> &props,
12514 const std::string &key) {
12515 std::map<std::string, std::string>::const_iterator iter =
12516 props.find("io_stalls." + key);
12517 if (iter != props.end()) {
12518 return std::stoull(iter->second);
12519 } else {
12520 DBUG_PRINT("warning",
12521 ("RocksDB GetMapPropery hasn't returned key=%s", key.c_str()));
12522 DBUG_ASSERT(0);
12523 return 0;
12524 }
12525}
12526
12527static void update_rocksdb_stall_status() {
12528 st_io_stall_stats local_io_stall_stats;
12529 for (const auto &cf_name : cf_manager.get_cf_names()) {
12530 rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name);
12531 if (cfh == nullptr) {
12532 continue;
12533 }
12534
12535 std::map<std::string, std::string> props;
12536 if (!rdb->GetMapProperty(cfh, "rocksdb.cfstats", &props)) {
12537 continue;
12538 }
12539
12540 local_io_stall_stats.level0_slowdown +=
12541 io_stall_prop_value(props, "level0_slowdown");
12542 local_io_stall_stats.level0_slowdown_with_compaction +=
12543 io_stall_prop_value(props, "level0_slowdown_with_compaction");
12544 local_io_stall_stats.level0_numfiles +=
12545 io_stall_prop_value(props, "level0_numfiles");
12546 local_io_stall_stats.level0_numfiles_with_compaction +=
12547 io_stall_prop_value(props, "level0_numfiles_with_compaction");
12548 local_io_stall_stats.stop_for_pending_compaction_bytes +=
12549 io_stall_prop_value(props, "stop_for_pending_compaction_bytes");
12550 local_io_stall_stats.slowdown_for_pending_compaction_bytes +=
12551 io_stall_prop_value(props, "slowdown_for_pending_compaction_bytes");
12552 local_io_stall_stats.memtable_compaction +=
12553 io_stall_prop_value(props, "memtable_compaction");
12554 local_io_stall_stats.memtable_slowdown +=
12555 io_stall_prop_value(props, "memtable_slowdown");
12556 local_io_stall_stats.total_stop += io_stall_prop_value(props, "total_stop");
12557 local_io_stall_stats.total_slowdown +=
12558 io_stall_prop_value(props, "total_slowdown");
12559 }
12560 io_stall_stats = local_io_stall_stats;
12561}
12562
/*
  Stall counters exposed under the nested "rocksdb_stall" SHOW_ARRAY (see
  show_rocksdb_stall_vars). io_stall_stats is refreshed from RocksDB right
  before this array is read.
*/
static SHOW_VAR rocksdb_stall_status_variables[] = {
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_slowdowns",
                        &io_stall_stats.level0_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_slowdowns",
                        &io_stall_stats.level0_slowdown_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_stops",
                        &io_stall_stats.level0_numfiles, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_stops",
                        &io_stall_stats.level0_numfiles_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_stops",
                        &io_stall_stats.stop_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_slowdowns",
                        &io_stall_stats.slowdown_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_stops",
                        &io_stall_stats.memtable_compaction, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_slowdowns",
                        &io_stall_stats.memtable_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_stops", &io_stall_stats.total_stop,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_slowdowns", &io_stall_stats.total_slowdown,
                        SHOW_LONGLONG),
    // end of the array marker
    {NullS, NullS, SHOW_LONG}};
12590
/*
  SHOW_FUNC handler for the "rocksdb_stall" status-variable group:
  refreshes the stall counters and exposes them as a nested array.
*/
static void show_rocksdb_stall_vars(THD *thd, SHOW_VAR *var, char *buff) {
  update_rocksdb_stall_status();
  var->type = SHOW_ARRAY;
  var->value = reinterpret_cast<char *>(&rocksdb_stall_status_variables);
}
12596
/*
  Top-level status-variable array registered with the server. Ticker-backed
  entries use the DEF_SHOW_FUNC-generated callbacks; pointer entries expose
  MyRocks-internal counters directly.
*/
static SHOW_VAR rocksdb_status_vars[] = {
    DEF_STATUS_VAR(block_cache_miss),
    DEF_STATUS_VAR(block_cache_hit),
    DEF_STATUS_VAR(block_cache_add),
    DEF_STATUS_VAR(block_cache_add_failures),
    DEF_STATUS_VAR(block_cache_index_miss),
    DEF_STATUS_VAR(block_cache_index_hit),
    DEF_STATUS_VAR(block_cache_index_add),
    DEF_STATUS_VAR(block_cache_index_bytes_insert),
    DEF_STATUS_VAR(block_cache_index_bytes_evict),
    DEF_STATUS_VAR(block_cache_filter_miss),
    DEF_STATUS_VAR(block_cache_filter_hit),
    DEF_STATUS_VAR(block_cache_filter_add),
    DEF_STATUS_VAR(block_cache_filter_bytes_insert),
    DEF_STATUS_VAR(block_cache_filter_bytes_evict),
    DEF_STATUS_VAR(block_cache_bytes_read),
    DEF_STATUS_VAR(block_cache_bytes_write),
    DEF_STATUS_VAR(block_cache_data_bytes_insert),
    DEF_STATUS_VAR(block_cache_data_miss),
    DEF_STATUS_VAR(block_cache_data_hit),
    DEF_STATUS_VAR(block_cache_data_add),
    DEF_STATUS_VAR(bloom_filter_useful),
    DEF_STATUS_VAR(memtable_hit),
    DEF_STATUS_VAR(memtable_miss),
    DEF_STATUS_VAR(get_hit_l0),
    DEF_STATUS_VAR(get_hit_l1),
    DEF_STATUS_VAR(get_hit_l2_and_up),
    DEF_STATUS_VAR(compaction_key_drop_new),
    DEF_STATUS_VAR(compaction_key_drop_obsolete),
    DEF_STATUS_VAR(compaction_key_drop_user),
    DEF_STATUS_VAR(number_keys_written),
    DEF_STATUS_VAR(number_keys_read),
    DEF_STATUS_VAR(number_keys_updated),
    DEF_STATUS_VAR(bytes_written),
    DEF_STATUS_VAR(bytes_read),
    DEF_STATUS_VAR(number_db_seek),
    DEF_STATUS_VAR(number_db_seek_found),
    DEF_STATUS_VAR(number_db_next),
    DEF_STATUS_VAR(number_db_next_found),
    DEF_STATUS_VAR(number_db_prev),
    DEF_STATUS_VAR(number_db_prev_found),
    DEF_STATUS_VAR(iter_bytes_read),
    DEF_STATUS_VAR(no_file_closes),
    DEF_STATUS_VAR(no_file_opens),
    DEF_STATUS_VAR(no_file_errors),
    DEF_STATUS_VAR(stall_micros),
    DEF_STATUS_VAR(num_iterators),
    DEF_STATUS_VAR(number_multiget_get),
    DEF_STATUS_VAR(number_multiget_keys_read),
    DEF_STATUS_VAR(number_multiget_bytes_read),
    DEF_STATUS_VAR(number_deletes_filtered),
    DEF_STATUS_VAR(number_merge_failures),
    DEF_STATUS_VAR(bloom_filter_prefix_checked),
    DEF_STATUS_VAR(bloom_filter_prefix_useful),
    DEF_STATUS_VAR(number_reseeks_iteration),
    DEF_STATUS_VAR(getupdatessince_calls),
    DEF_STATUS_VAR(block_cachecompressed_miss),
    DEF_STATUS_VAR(block_cachecompressed_hit),
    DEF_STATUS_VAR(wal_synced),
    DEF_STATUS_VAR(wal_bytes),
    DEF_STATUS_VAR(write_self),
    DEF_STATUS_VAR(write_other),
    DEF_STATUS_VAR(write_timedout),
    DEF_STATUS_VAR(write_wal),
    DEF_STATUS_VAR(flush_write_bytes),
    DEF_STATUS_VAR(compact_read_bytes),
    DEF_STATUS_VAR(compact_write_bytes),
    DEF_STATUS_VAR(number_superversion_acquires),
    DEF_STATUS_VAR(number_superversion_releases),
    DEF_STATUS_VAR(number_superversion_cleanups),
    DEF_STATUS_VAR(number_block_not_compressed),
    DEF_STATUS_VAR_PTR("row_lock_deadlocks", &rocksdb_row_lock_deadlocks,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("row_lock_wait_timeouts",
                       &rocksdb_row_lock_wait_timeouts, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("snapshot_conflict_errors",
                       &rocksdb_snapshot_conflict_errors, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("wal_group_syncs", &rocksdb_wal_group_syncs,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_put", &rocksdb_num_sst_entry_put,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_delete", &rocksdb_num_sst_entry_delete,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_singledelete",
                       &rocksdb_num_sst_entry_singledelete, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_merge", &rocksdb_num_sst_entry_merge,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_other", &rocksdb_num_sst_entry_other,
                       SHOW_LONGLONG),
    // the variables generated by SHOW_FUNC are sorted only by prefix (first
    // arg in the tuple below), so make sure it is unique to make sorting
    // deterministic as quick sort is not stable
    {"rocksdb", reinterpret_cast<char *>(&show_myrocks_vars), SHOW_FUNC},
    {"rocksdb_stall", reinterpret_cast<char *>(&show_rocksdb_stall_vars),
     SHOW_FUNC},
    /* end-of-array marker */
    {NullS, NullS, SHOW_LONG}};
12693
12694/*
12695 Background thread's main logic
12696*/
12697
void Rdb_background_thread::run() {
  // How many seconds to wait till flushing the WAL next time.
  const int WAKE_UP_INTERVAL = 1;

  timespec ts_next_sync;
  set_timespec(ts_next_sync, WAKE_UP_INTERVAL);

  for (;;) {
    // Wait until the next timeout or until we receive a signal to stop the
    // thread. Request to stop the thread should only be triggered when the
    // storage engine is being unloaded.
    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
    const auto ret MY_ATTRIBUTE((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts_next_sync);

    // Check that we receive only the expected error codes.
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    // Snapshot the flags under the mutex, then release it before acting on
    // them so we never hold the lock while doing I/O.
    const bool local_stop = m_stop;
    const bool local_save_stats = m_save_stats;
    reset();
    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);

    if (local_stop) {
      // If we're here then that's because condition variable was signaled by
      // another thread and we're shutting down. Break out the loop to make
      // sure that shutdown thread can proceed.
      break;
    }

    // This path should be taken only when the timer expired.
    DBUG_ASSERT(ret == ETIMEDOUT);

    if (local_save_stats) {
      ddl_manager.persist_stats();
    }

    // Set the next timestamp for mysql_cond_timedwait() (which ends up calling
    // pthread_cond_timedwait()) to wait on.
    set_timespec(ts_next_sync, WAKE_UP_INTERVAL);

    // Flush the WAL. Sync it for both background and never modes to copy
    // InnoDB's behavior. For mode never, the wal file isn't even written,
    // whereas background writes to the wal file, but issues the syncs in a
    // background thread.
    if (rdb && (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_SYNC) &&
        !rocksdb_db_options->allow_mmap_writes) {
      const rocksdb::Status s = rdb->FlushWAL(true);
      if (!s.ok()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
      }
    }
  }

  // save remaining stats which might've left unsaved
  ddl_manager.persist_stats();
}
12754
12755bool ha_rocksdb::check_bloom_and_set_bounds(THD *thd, const Rdb_key_def &kd,
12756 const rocksdb::Slice &eq_cond,
12757 const bool use_all_keys,
12758 uchar *lower_bound_buf,
12759 uchar *upper_bound_buf,
12760 rocksdb::Slice *out_lower_bound,
12761 rocksdb::Slice *out_upper_bound) {
12762 bool can_use_bloom = can_use_bloom_filter(thd, kd, eq_cond, use_all_keys);
12763 if (!can_use_bloom) {
12764 setup_iterator_bounds(kd, eq_cond,
12765 lower_bound_buf, upper_bound_buf,
12766 out_lower_bound, out_upper_bound);
12767 }
12768 return can_use_bloom;
12769}
12770
12771/**
12772 Deciding if it is possible to use bloom filter or not.
12773
12774 @detail
12775 Even if bloom filter exists, it is not always possible
12776 to use bloom filter. If using bloom filter when you shouldn't,
12777 false negative may happen -- fewer rows than expected may be returned.
12778 It is users' responsibility to use bloom filter correctly.
12779
12780 If bloom filter does not exist, return value does not matter because
12781 RocksDB does not use bloom filter internally.
12782
12783 @param kd
12784 @param eq_cond Equal condition part of the key. This always includes
12785 system index id (4 bytes).
12786 @param use_all_keys True if all key parts are set with equal conditions.
12787 This is aware of extended keys.
12788*/
12789bool ha_rocksdb::can_use_bloom_filter(THD *thd, const Rdb_key_def &kd,
12790 const rocksdb::Slice &eq_cond,
12791 const bool use_all_keys) {
12792 bool can_use = false;
12793
12794 if (THDVAR(thd, skip_bloom_filter_on_read)) {
12795 return can_use;
12796 }
12797
12798 const rocksdb::SliceTransform *prefix_extractor = kd.get_extractor();
12799 if (prefix_extractor) {
12800 /*
12801 This is an optimized use case for CappedPrefixTransform.
12802 If eq_cond length >= prefix extractor length and if
12803 all keys are used for equal lookup, it is
12804 always possible to use bloom filter.
12805
12806 Prefix bloom filter can't be used on descending scan with
12807 prefix lookup (i.e. WHERE id1=1 ORDER BY id2 DESC), because of
12808 RocksDB's limitation. On ascending (or not sorting) scan,
12809 keys longer than the capped prefix length will be truncated down
12810 to the capped length and the resulting key is added to the bloom filter.
12811
12812 Keys shorter than the capped prefix length will be added to
12813 the bloom filter. When keys are looked up, key conditionals
12814 longer than the capped length can be used; key conditionals
12815 shorter require all parts of the key to be available
12816 for the short key match.
12817 */
12818 if ((use_all_keys && prefix_extractor->InRange(eq_cond))
12819 || prefix_extractor->SameResultWhenAppended(eq_cond))
12820 can_use = true;
12821 else
12822 can_use = false;
12823 } else {
12824 /*
12825 if prefix extractor is not defined, all key parts have to be
12826 used by eq_cond.
12827 */
12828 if (use_all_keys)
12829 can_use = true;
12830 else
12831 can_use = false;
12832 }
12833
12834 return can_use;
12835}
12836
/* For modules that need access to the global data structures */
rocksdb::TransactionDB *rdb_get_rocksdb_db() { return rdb; }

Rdb_cf_manager &rdb_get_cf_manager() { return cf_manager; }

const rocksdb::BlockBasedTableOptions &rdb_get_table_options() {
  return *rocksdb_tbl_options;
}

/* TTL feature toggles (controlled by system variables). */
bool rdb_is_ttl_enabled() { return rocksdb_enable_ttl; }
bool rdb_is_ttl_read_filtering_enabled() {
  return rocksdb_enable_ttl_read_filtering;
}
#ifndef NDEBUG
/* Debug-build-only TTL overrides used by the test suite. */
int rdb_dbug_set_ttl_rec_ts() { return rocksdb_debug_ttl_rec_ts; }
int rdb_dbug_set_ttl_snapshot_ts() { return rocksdb_debug_ttl_snapshot_ts; }
int rdb_dbug_set_ttl_read_filter_ts() {
  return rocksdb_debug_ttl_read_filter_ts;
}
bool rdb_dbug_set_ttl_ignore_pk() { return rocksdb_debug_ttl_ignore_pk; }
#endif
12858
12859void rdb_update_global_stats(const operation_type &type, uint count,
12860 bool is_system_table) {
12861 DBUG_ASSERT(type < ROWS_MAX);
12862
12863 if (count == 0) {
12864 return;
12865 }
12866
12867 if (is_system_table) {
12868 global_stats.system_rows[type].add(count);
12869 } else {
12870 global_stats.rows[type].add(count);
12871 }
12872}
12873
12874int rdb_get_table_perf_counters(const char *const tablename,
12875 Rdb_perf_counters *const counters) {
12876 DBUG_ASSERT(counters != nullptr);
12877 DBUG_ASSERT(tablename != nullptr);
12878
12879 Rdb_table_handler *table_handler;
12880 table_handler = rdb_open_tables.get_table_handler(tablename);
12881 if (table_handler == nullptr) {
12882 return HA_ERR_ROCKSDB_INVALID_TABLE;
12883 }
12884
12885 counters->load(table_handler->m_table_perf_context);
12886
12887 rdb_open_tables.release_table_handler(table_handler);
12888 return HA_EXIT_SUCCESS;
12889}
12890
12891const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type) {
12892 // If this assertion fails then this means that a member has been either added
12893 // to or removed from RDB_IO_ERROR_TYPE enum and this function needs to be
12894 // changed to return the appropriate value.
12895 static_assert(RDB_IO_ERROR_LAST == 4, "Please handle all the error types.");
12896
12897 switch (err_type) {
12898 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_TX_COMMIT:
12899 return "RDB_IO_ERROR_TX_COMMIT";
12900 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_DICT_COMMIT:
12901 return "RDB_IO_ERROR_DICT_COMMIT";
12902 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_BG_THREAD:
12903 return "RDB_IO_ERROR_BG_THREAD";
12904 case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_GENERAL:
12905 return "RDB_IO_ERROR_GENERAL";
12906 default:
12907 DBUG_ASSERT(false);
12908 return "(unknown)";
12909 }
12910}
12911
12912// In case of core dump generation we want this function NOT to be optimized
12913// so that we can capture as much data as possible to debug the root cause
12914// more efficiently.
12915#ifdef __GNUC__
12916#pragma GCC push_options
12917#pragma GCC optimize("O0")
12918#endif
12919
/*
  Central handler for RocksDB error statuses. I/O errors on commit paths and
  any detected corruption abort the server; background-thread I/O errors are
  only logged. A non-ok, non-IO, non-corruption status aborts only for
  dictionary commits.
*/
void rdb_handle_io_error(const rocksdb::Status status,
                         const RDB_IO_ERROR_TYPE err_type) {
  if (status.IsIOError()) {
    switch (err_type) {
      case RDB_IO_ERROR_TX_COMMIT:
      case RDB_IO_ERROR_DICT_COMMIT: {
        rdb_log_status_error(status, "failed to write to WAL");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on WAL write error.");
        abort();
        break;
      }
      case RDB_IO_ERROR_BG_THREAD: {
        // Background work is best-effort; log and carry on.
        rdb_log_status_error(status, "BG thread failed to write to RocksDB");
        break;
      }
      case RDB_IO_ERROR_GENERAL: {
        rdb_log_status_error(status, "failed on I/O");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on I/O error.");
        abort();
        break;
      }
      default:
        DBUG_ASSERT(0);
        break;
    }
  } else if (status.IsCorruption()) {
    rdb_log_status_error(status, "data corruption detected!");
    // Leave a marker so the corruption is reported again on restart.
    rdb_persist_corruption_marker();
    /* NO_LINT_DEBUG */
    sql_print_error("MyRocks: aborting because of data corruption.");
    abort();
  } else if (!status.ok()) {
    switch (err_type) {
      case RDB_IO_ERROR_DICT_COMMIT: {
        rdb_log_status_error(status, "Failed to write to WAL (dictionary)");
        /* NO_LINT_DEBUG */
        sql_print_error("MyRocks: aborting on WAL write error.");
        abort();
        break;
      }
      default:
        rdb_log_status_error(status, "Failed to read/write in RocksDB");
        break;
    }
  }
}
12968#ifdef __GNUC__
12969#pragma GCC pop_options
12970#endif
12971
/* Accessor for the file-global data dictionary manager singleton. */
Rdb_dict_manager *rdb_get_dict_manager(void) { return &dict_manager; }
12973
/* Accessor for the file-global DDL manager singleton. */
Rdb_ddl_manager *rdb_get_ddl_manager(void) { return &ddl_manager; }
12975
/* Accessor for the file-global binlog manager singleton. */
Rdb_binlog_manager *rdb_get_binlog_manager(void) { return &binlog_manager; }
12977
12978void rocksdb_set_compaction_options(
12979 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
12980 my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
12981 void *const var_ptr, const void *const save) {
12982 if (var_ptr && save) {
12983 *(uint64_t *)var_ptr = *(const uint64_t *)save;
12984 }
12985 const Rdb_compact_params params = {
12986 (uint64_t)rocksdb_compaction_sequential_deletes,
12987 (uint64_t)rocksdb_compaction_sequential_deletes_window,
12988 (uint64_t)rocksdb_compaction_sequential_deletes_file_size};
12989 if (properties_collector_factory) {
12990 properties_collector_factory->SetCompactionParams(params);
12991 }
12992}
12993
12994void rocksdb_set_table_stats_sampling_pct(
12995 my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
12996 my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
12997 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
12998 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
12999
13000 const uint32_t new_val = *static_cast<const uint32_t *>(save);
13001
13002 if (new_val != rocksdb_table_stats_sampling_pct) {
13003 rocksdb_table_stats_sampling_pct = new_val;
13004
13005 if (properties_collector_factory) {
13006 properties_collector_factory->SetTableStatsSamplingPct(
13007 rocksdb_table_stats_sampling_pct);
13008 }
13009 }
13010
13011 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
13012}
13013
13014/*
13015 This function allows setting the rate limiter's bytes per second value
13016 but only if the rate limiter is turned on which has to be done at startup.
13017 If the rate is already 0 (turned off) or we are changing it to 0 (trying
13018 to turn it off) this function will push a warning to the client and do
13019 nothing.
13020 This is similar to the code in innodb_doublewrite_update (found in
13021 storage/innobase/handler/ha_innodb.cc).
13022*/
13023void rocksdb_set_rate_limiter_bytes_per_sec(
13024 my_core::THD *const thd,
13025 my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
13026 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
13027 const uint64_t new_val = *static_cast<const uint64_t *>(save);
13028 if (new_val == 0 || rocksdb_rate_limiter_bytes_per_sec == 0) {
13029 /*
13030 If a rate_limiter was not enabled at startup we can't change it nor
13031 can we disable it if one was created at startup
13032 */
13033 push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS,
13034 "RocksDB: rocksdb_rate_limiter_bytes_per_sec cannot "
13035 "be dynamically changed to or from 0. Do a clean "
13036 "shutdown if you want to change it from or to 0.");
13037 } else if (new_val != rocksdb_rate_limiter_bytes_per_sec) {
13038 /* Apply the new value to the rate limiter and store it locally */
13039 DBUG_ASSERT(rocksdb_rate_limiter != nullptr);
13040 rocksdb_rate_limiter_bytes_per_sec = new_val;
13041 rocksdb_rate_limiter->SetBytesPerSecond(new_val);
13042 }
13043}
13044
13045void rocksdb_set_sst_mgr_rate_bytes_per_sec(
13046 my_core::THD *const thd,
13047 my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
13048 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
13049 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
13050
13051 const uint64_t new_val = *static_cast<const uint64_t *>(save);
13052
13053 if (new_val != rocksdb_sst_mgr_rate_bytes_per_sec) {
13054 rocksdb_sst_mgr_rate_bytes_per_sec = new_val;
13055
13056 rocksdb_db_options->sst_file_manager->SetDeleteRateBytesPerSecond(
13057 rocksdb_sst_mgr_rate_bytes_per_sec);
13058 }
13059
13060 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
13061}
13062
13063void rocksdb_set_delayed_write_rate(THD *thd, struct st_mysql_sys_var *var,
13064 void *var_ptr, const void *save) {
13065 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
13066 const uint64_t new_val = *static_cast<const uint64_t *>(save);
13067 if (rocksdb_delayed_write_rate != new_val) {
13068 rocksdb_delayed_write_rate = new_val;
13069 rocksdb::Status s =
13070 rdb->SetDBOptions({{"delayed_write_rate", std::to_string(new_val)}});
13071
13072 if (!s.ok()) {
13073 /* NO_LINT_DEBUG */
13074 sql_print_warning("MyRocks: failed to update delayed_write_rate. "
13075 "status code = %d, status = %s",
13076 s.code(), s.ToString().c_str());
13077 }
13078 }
13079 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
13080}
13081
13082void rocksdb_set_max_latest_deadlocks(THD *thd, struct st_mysql_sys_var *var,
13083 void *var_ptr, const void *save) {
13084 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
13085 const uint32_t new_val = *static_cast<const uint32_t *>(save);
13086 if (rocksdb_max_latest_deadlocks != new_val) {
13087 rocksdb_max_latest_deadlocks = new_val;
13088 rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
13089 }
13090 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
13091}
13092
13093void rdb_set_collation_exception_list(const char *const exception_list) {
13094 DBUG_ASSERT(rdb_collation_exceptions != nullptr);
13095
13096 if (!rdb_collation_exceptions->set_patterns(exception_list)) {
13097 my_core::warn_about_bad_patterns(rdb_collation_exceptions,
13098 "strict_collation_exceptions");
13099 }
13100}
13101
/*
  Sys-var update handler for rocksdb_strict_collation_exceptions: applies
  the new pattern list and stores a heap copy of the string in the sys-var
  slot.
*/
void rocksdb_set_collation_exception_list(THD *const thd,
                                          struct st_mysql_sys_var *const var,
                                          void *const var_ptr,
                                          const void *const save) {
  const char *const val = *static_cast<const char *const *>(save);

  // nullptr means "no exceptions"; normalize to an empty pattern string.
  rdb_set_collation_exception_list(val == nullptr ? "" : val);

  // Replace the previous stored string with our own my_strdup() copy.
  // NOTE(review): presumably needed because the sys-var framework expects
  // var_ptr to own my_free()-able storage (cf. the original "psergey-todo"
  // question here) -- confirm against plugin_var_memalloc_global_update
  // before changing this.
  const char *val_copy= val? my_strdup(val, MYF(0)): nullptr;
  my_free(*static_cast<char**>(var_ptr));
  *static_cast<const char**>(var_ptr) = val_copy;
}
13115
13116int mysql_value_to_bool(struct st_mysql_value *value, my_bool *return_value) {
13117 int new_value_type = value->value_type(value);
13118 if (new_value_type == MYSQL_VALUE_TYPE_STRING) {
13119 char buf[16];
13120 int len = sizeof(buf);
13121 const char *str = value->val_str(value, buf, &len);
13122 if (str && (my_strcasecmp(system_charset_info, "true", str) == 0 ||
13123 my_strcasecmp(system_charset_info, "on", str) == 0)) {
13124 *return_value = TRUE;
13125 } else if (str && (my_strcasecmp(system_charset_info, "false", str) == 0 ||
13126 my_strcasecmp(system_charset_info, "off", str) == 0)) {
13127 *return_value = FALSE;
13128 } else {
13129 return 1;
13130 }
13131 } else if (new_value_type == MYSQL_VALUE_TYPE_INT) {
13132 long long intbuf;
13133 value->val_int(value, &intbuf);
13134 if (intbuf > 1)
13135 return 1;
13136 *return_value = intbuf > 0 ? TRUE : FALSE;
13137 } else {
13138 return 1;
13139 }
13140
13141 return 0;
13142}
13143
13144int rocksdb_check_bulk_load(
13145 THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
13146 void *save, struct st_mysql_value *value) {
13147 my_bool new_value;
13148 if (mysql_value_to_bool(value, &new_value) != 0) {
13149 return 1;
13150 }
13151
13152 Rdb_transaction *&tx = get_tx_from_thd(thd);
13153 if (tx != nullptr) {
13154 const int rc = tx->finish_bulk_load();
13155 if (rc != 0) {
13156 // NO_LINT_DEBUG
13157 sql_print_error("RocksDB: Error %d finalizing last SST file while "
13158 "setting bulk loading variable",
13159 rc);
13160 THDVAR(thd, bulk_load) = 0;
13161 return 1;
13162 }
13163 }
13164
13165 *static_cast<bool *>(save) = new_value;
13166 return 0;
13167}
13168
13169int rocksdb_check_bulk_load_allow_unsorted(
13170 THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
13171 void *save, struct st_mysql_value *value) {
13172 my_bool new_value;
13173 if (mysql_value_to_bool(value, &new_value) != 0) {
13174 return 1;
13175 }
13176
13177 if (THDVAR(thd, bulk_load)) {
13178 my_error(ER_ERROR_WHEN_EXECUTING_COMMAND, MYF(0), "SET",
13179 "Cannot change this setting while bulk load is enabled");
13180
13181 return 1;
13182 }
13183
13184 *static_cast<bool *>(save) = new_value;
13185 return 0;
13186}
13187
13188static void rocksdb_set_max_background_jobs(THD *thd,
13189 struct st_mysql_sys_var *const var,
13190 void *const var_ptr,
13191 const void *const save) {
13192 DBUG_ASSERT(save != nullptr);
13193 DBUG_ASSERT(rocksdb_db_options != nullptr);
13194 DBUG_ASSERT(rocksdb_db_options->env != nullptr);
13195
13196 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
13197
13198 const int new_val = *static_cast<const int *>(save);
13199
13200 if (rocksdb_db_options->max_background_jobs != new_val) {
13201 rocksdb_db_options->max_background_jobs = new_val;
13202 rocksdb::Status s =
13203 rdb->SetDBOptions({{"max_background_jobs", std::to_string(new_val)}});
13204
13205 if (!s.ok()) {
13206 /* NO_LINT_DEBUG */
13207 sql_print_warning("MyRocks: failed to update max_background_jobs. "
13208 "Status code = %d, status = %s.",
13209 s.code(), s.ToString().c_str());
13210 }
13211 }
13212
13213 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
13214}
13215
13216static void rocksdb_set_bytes_per_sync(
13217 THD *thd MY_ATTRIBUTE((__unused__)),
13218 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
13219 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
13220 DBUG_ASSERT(save != nullptr);
13221 DBUG_ASSERT(rocksdb_db_options != nullptr);
13222 DBUG_ASSERT(rocksdb_db_options->env != nullptr);
13223
13224 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
13225
13226 const ulonglong new_val = *static_cast<const ulonglong *>(save);
13227
13228 if (rocksdb_db_options->bytes_per_sync != new_val) {
13229 rocksdb_db_options->bytes_per_sync = new_val;
13230 rocksdb::Status s =
13231 rdb->SetDBOptions({{"bytes_per_sync", std::to_string(new_val)}});
13232
13233 if (!s.ok()) {
13234 /* NO_LINT_DEBUG */
13235 sql_print_warning("MyRocks: failed to update max_background_jobs. "
13236 "Status code = %d, status = %s.",
13237 s.code(), s.ToString().c_str());
13238 }
13239 }
13240
13241 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
13242}
13243
13244static void rocksdb_set_wal_bytes_per_sync(
13245 THD *thd MY_ATTRIBUTE((__unused__)),
13246 struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
13247 void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
13248 DBUG_ASSERT(save != nullptr);
13249 DBUG_ASSERT(rocksdb_db_options != nullptr);
13250 DBUG_ASSERT(rocksdb_db_options->env != nullptr);
13251
13252 RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
13253
13254 const ulonglong new_val = *static_cast<const ulonglong *>(save);
13255
13256 if (rocksdb_db_options->wal_bytes_per_sync != new_val) {
13257 rocksdb_db_options->wal_bytes_per_sync = new_val;
13258 rocksdb::Status s =
13259 rdb->SetDBOptions({{"wal_bytes_per_sync", std::to_string(new_val)}});
13260
13261 if (!s.ok()) {
13262 /* NO_LINT_DEBUG */
13263 sql_print_warning("MyRocks: failed to update max_background_jobs. "
13264 "Status code = %d, status = %s.",
13265 s.code(), s.ToString().c_str());
13266 }
13267 }
13268
13269 RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
13270}
13271
/*
  Validate an incoming value for rocksdb_update_cf_options by parsing it
  into a per-CF option map.  Rejecting the value here guarantees that the
  update callback only ever sees parseable strings.

  @return HA_EXIT_SUCCESS if the value is nullptr or parses cleanly,
          HA_EXIT_FAILURE (with my_error raised) otherwise.
*/
static int
rocksdb_validate_update_cf_options(THD * /* unused */,
                                   struct st_mysql_sys_var * /*unused*/,
                                   void *save, struct st_mysql_value *value) {

  char buff[STRING_BUFFER_USUAL_SIZE];
  const char *str;
  int length;
  length = sizeof(buff);
  // NOTE(review): val_str() may return a pointer into the stack buffer
  // 'buff' above, and that pointer is stored into *save and read later by
  // the update callback.  Presumably the sys-var layer copies the value
  // before then -- confirm; upstream MyRocks later changed this to
  // duplicate the string explicitly.
  str = value->val_str(value, buff, &length);
  *(const char **)save = str;

  if (str == nullptr) {
    return HA_EXIT_SUCCESS;
  }

  Rdb_cf_options::Name_to_config_t option_map;

  // Basic sanity checking and parsing the options into a map. If this fails
  // then there's no point to proceed.
  if (!Rdb_cf_options::parse_cf_options(str, &option_map)) {
    my_error(ER_WRONG_VALUE_FOR_VAR, MYF(0), "rocksdb_update_cf_options", str);
    return HA_EXIT_FAILURE;
  }
  return HA_EXIT_SUCCESS;
}
13298
/*
  Sys-var update handler for rocksdb_update_cf_options.

  Applies per-column-family option overrides to the running DB.  The new
  value has already passed rocksdb_validate_update_cf_options(), so parsing
  is expected to succeed.  Each CF named in the option string is updated
  independently; failures for one CF are logged and do not stop the others.
*/
static void
rocksdb_set_update_cf_options(THD *const /* unused */,
                              struct st_mysql_sys_var *const /* unused */,
                              void *const var_ptr, const void *const save) {
  const char *const val = *static_cast<const char *const *>(save);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  // A null value clears the variable; nothing to apply.
  if (!val) {
    *reinterpret_cast<char **>(var_ptr) = nullptr;
    RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
    return;
  }

  DBUG_ASSERT(val != nullptr);

  // Reset the pointers regardless of how much success we had with updating
  // the CF options. This will results in consistent behavior and avoids
  // dealing with cases when only a subset of CF-s was successfully updated.
  *reinterpret_cast<char **>(var_ptr) = my_strdup(val, MYF(0));

  // Do the real work of applying the changes.
  Rdb_cf_options::Name_to_config_t option_map;

  // This should never fail, because of rocksdb_validate_update_cf_options
  if (!Rdb_cf_options::parse_cf_options(val, &option_map)) {
    my_free(*reinterpret_cast<char**>(var_ptr));
    RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
    return;
  }

  // For each CF we have, see if we need to update any settings.
  for (const auto &cf_name : cf_manager.get_cf_names()) {
    DBUG_ASSERT(!cf_name.empty());

    rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name);
    DBUG_ASSERT(cfh != nullptr);

    // Only CFs explicitly mentioned in the option string are touched.
    const auto it = option_map.find(cf_name);
    std::string per_cf_options = (it != option_map.end()) ? it->second : "";

    if (!per_cf_options.empty()) {
      Rdb_cf_options::Name_to_config_t opt_map;
      rocksdb::Status s = rocksdb::StringToMap(per_cf_options, &opt_map);

      if (s != rocksdb::Status::OK()) {
        // NO_LINT_DEBUG
        sql_print_warning("MyRocks: failed to convert the options for column "
                          "family '%s' to a map. %s", cf_name.c_str(),
                          s.ToString().c_str());
      } else {
        DBUG_ASSERT(rdb != nullptr);

        // Finally we can apply the options.
        s = rdb->SetOptions(cfh, opt_map);

        if (s != rocksdb::Status::OK()) {
          // NO_LINT_DEBUG
          sql_print_warning("MyRocks: failed to apply the options for column "
                            "family '%s'. %s", cf_name.c_str(),
                            s.ToString().c_str());
        } else {
          // NO_LINT_DEBUG
          sql_print_information("MyRocks: options for column family '%s' "
                                "have been successfully updated.",
                                cf_name.c_str());

          // Make sure that data is internally consistent as well and update
          // the CF options. This is necessary also to make sure that the CF
          // options will be correctly reflected in the relevant table:
          // ROCKSDB_CF_OPTIONS in INFORMATION_SCHEMA.
          rocksdb::ColumnFamilyOptions cf_options = rdb->GetOptions(cfh);
          std::string updated_options;

          s = rocksdb::GetStringFromColumnFamilyOptions(&updated_options,
                                                        cf_options);

          DBUG_ASSERT(s == rocksdb::Status::OK());
          DBUG_ASSERT(!updated_options.empty());

          cf_manager.update_options_map(cf_name, updated_options);
        }
      }
    }
  }

  // Our caller (`plugin_var_memalloc_global_update`) will call `my_free` to
  // free up resources used before.

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
13390
/* Ask the background thread to persist index statistics asynchronously. */
void rdb_queue_save_stats_request() { rdb_bg_thread.request_save_stats(); }
13392
13393#ifdef MARIAROCKS_NOT_YET // MDEV-10976
13394
/* Replication hook: mark that we are entering a Delete_rows event. */
void ha_rocksdb::rpl_before_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = true;

  DBUG_VOID_RETURN;
}
13402
/* Replication hook: mark that we have left a Delete_rows event. */
void ha_rocksdb::rpl_after_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = false;

  DBUG_VOID_RETURN;
}
13410
/* Replication hook: mark that we are entering an Update_rows event. */
void ha_rocksdb::rpl_before_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = true;

  DBUG_VOID_RETURN;
}
13418
/* Replication hook: mark that we have left an Update_rows event. */
void ha_rocksdb::rpl_after_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = false;

  DBUG_VOID_RETURN;
}
13426
13427/**
13428 @brief
13429 Read Free Replication can be used or not. Returning False means
13430 Read Free Replication can be used. Read Free Replication can be used
13431 on UPDATE or DELETE row events, and table must have user defined
13432 primary key.
13433*/
13434bool ha_rocksdb::use_read_free_rpl() {
13435 DBUG_ENTER_FUNC();
13436
13437 DBUG_RETURN((m_in_rpl_delete_rows || m_in_rpl_update_rows) &&
13438 !has_hidden_pk(table) && m_use_read_free_rpl);
13439}
13440#endif // MARIAROCKS_NOT_YET
13441
13442double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) {
13443 DBUG_ENTER_FUNC();
13444
13445 if (index != table->s->primary_key) {
13446 /* Non covering index range scan */
13447 DBUG_RETURN(handler::read_time(index, ranges, rows));
13448 }
13449
13450 DBUG_RETURN((rows / 20.0) + 1);
13451}
13452
13453std::string rdb_corruption_marker_file_name() {
13454 std::string ret(rocksdb_datadir);
13455 ret.append("/ROCKSDB_CORRUPTED");
13456 return ret;
13457}
13458
13459void sql_print_verbose_info(const char *format, ...)
13460{
13461 va_list args;
13462
13463 if (global_system_variables.log_warnings > 2) {
13464 va_start(args, format);
13465 sql_print_information_v(format, args);
13466 va_end(args);
13467 }
13468}
13469
13470} // namespace myrocks
13471
13472
13473/**
13474 Construct and emit duplicate key error message using information
13475 from table's record buffer.
13476
13477 @sa print_keydup_error(table, key, msg, errflag, thd, org_table_name).
13478*/
13479
13480void print_keydup_error(TABLE *table, KEY *key, myf errflag,
13481 const THD *thd, const char *org_table_name)
13482{
13483 print_keydup_error(table, key, ER(ER_DUP_ENTRY_WITH_KEY_NAME), errflag);
13484}
13485
13486/*
13487 Register the storage engine plugin outside of myrocks namespace
13488 so that mysql_declare_plugin does not get confused when it does
13489 its name generation.
13490*/
13491
13492
/* Storage-engine plugin descriptor required by the plugin API. */
struct st_mysql_storage_engine rocksdb_storage_engine = {
    MYSQL_HANDLERTON_INTERFACE_VERSION};
13495
maria_declare_plugin(rocksdb_se){
    MYSQL_STORAGE_ENGINE_PLUGIN, /* Plugin Type */
    &rocksdb_storage_engine, /* Plugin Descriptor */
    "ROCKSDB", /* Plugin Name */
    "Monty Program Ab", /* Plugin Author */
    "RocksDB storage engine", /* Plugin Description */
    PLUGIN_LICENSE_GPL, /* Plugin Licence */
    myrocks::rocksdb_init_func, /* Plugin Entry Point */
    myrocks::rocksdb_done_func, /* Plugin Deinitializer */
    0x0001, /* version number (0.1) */
    myrocks::rocksdb_status_vars, /* status variables */
    myrocks::rocksdb_system_variables, /* system variables */
    "1.0", /* string version */
    myrocks::MYROCKS_MARIADB_PLUGIN_MATURITY_LEVEL
},
    /* INFORMATION_SCHEMA table plugins provided alongside the engine: */
    myrocks::rdb_i_s_cfstats, myrocks::rdb_i_s_dbstats,
    myrocks::rdb_i_s_perf_context, myrocks::rdb_i_s_perf_context_global,
    myrocks::rdb_i_s_cfoptions, myrocks::rdb_i_s_compact_stats,
    myrocks::rdb_i_s_global_info, myrocks::rdb_i_s_ddl,
    myrocks::rdb_i_s_index_file_map, myrocks::rdb_i_s_lock_info,
    myrocks::rdb_i_s_trx_info,
    myrocks::rdb_i_s_deadlock_info
maria_declare_plugin_end;
13519