1 | /* |
2 | Copyright (c) 2012, Monty Program Ab |
3 | |
4 | This program is free software; you can redistribute it and/or modify |
5 | it under the terms of the GNU General Public License as published by |
6 | the Free Software Foundation; version 2 of the License. |
7 | |
8 | This program is distributed in the hope that it will be useful, |
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | GNU General Public License for more details. |
12 | |
13 | You should have received a copy of the GNU General Public License |
14 | along with this program; if not, write to the Free Software |
15 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ |
16 | |
17 | #ifdef USE_PRAGMA_IMPLEMENTATION |
18 | #pragma implementation // gcc: Class implementation |
19 | #endif |
20 | |
21 | #define MYSQL_SERVER 1 |
22 | |
23 | /* For use of 'PRIu64': */ |
24 | #define __STDC_FORMAT_MACROS |
25 | |
26 | #include <my_global.h> |
27 | |
28 | #include <inttypes.h> |
29 | |
30 | /* The C++ file's header */ |
31 | #include "./ha_rocksdb.h" |
32 | |
33 | /* C++ standard header files */ |
34 | #include <algorithm> |
35 | #include <limits> |
36 | #include <map> |
37 | #include <queue> |
38 | #include <set> |
39 | #include <string> |
40 | #include <vector> |
41 | |
42 | /* MySQL includes */ |
43 | #include "./debug_sync.h" |
44 | #include "./my_bit.h" |
45 | #include "./my_stacktrace.h" |
46 | #include "./my_sys.h" |
47 | #include "./sql_audit.h" |
48 | #include "./sql_table.h" |
49 | #include "./sql_hset.h" |
50 | #include <mysql/psi/mysql_table.h> |
51 | #ifdef MARIAROCKS_NOT_YET |
52 | #include <mysql/thread_pool_priv.h> |
53 | #endif |
54 | #include <mysys_err.h> |
55 | |
// Both MySQL and RocksDB define the same constant. To avoid compilation
// errors until we fix this in RocksDB, we temporarily undefine it here.
58 | #undef CACHE_LINE_SIZE |
59 | |
60 | /* RocksDB includes */ |
61 | #include "monitoring/histogram.h" |
62 | #include "rocksdb/compaction_filter.h" |
63 | #include "rocksdb/env.h" |
64 | #include "rocksdb/persistent_cache.h" |
65 | #include "rocksdb/rate_limiter.h" |
66 | #include "rocksdb/slice_transform.h" |
67 | #include "rocksdb/thread_status.h" |
68 | #include "rocksdb/utilities/checkpoint.h" |
69 | #include "rocksdb/utilities/convenience.h" |
70 | #include "rocksdb/utilities/memory_util.h" |
71 | #include "rocksdb/utilities/sim_cache.h" |
72 | #include "util/stop_watch.h" |
73 | #include "./rdb_source_revision.h" |
74 | |
75 | /* MyRocks includes */ |
76 | #include "./event_listener.h" |
77 | #include "./ha_rocksdb_proto.h" |
78 | #include "./logger.h" |
79 | #include "./rdb_cf_manager.h" |
80 | #include "./rdb_cf_options.h" |
81 | #include "./rdb_datadic.h" |
82 | #include "./rdb_i_s.h" |
83 | #include "./rdb_index_merge.h" |
84 | #include "./rdb_mutex_wrapper.h" |
85 | #include "./rdb_psi.h" |
86 | #include "./rdb_threads.h" |
87 | #include "./rdb_mariadb_server_port.h" |
88 | |
// Internal MySQL APIs not exposed in any header; these symbols are resolved
// against the server binary at link time.
extern "C" {
/**
  Mark transaction to rollback and mark error as fatal to a sub-statement.
  @param thd Thread handle
  @param all TRUE <=> rollback main transaction.
*/
void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);

/**
 * Get the user thread's binary logging format
 * @param thd user thread
 * @return Value to be used as index into the binlog_format_names array
 */
int thd_binlog_format(const MYSQL_THD thd);

/**
 * Check if binary logging is filtered for thread's current db.
 * @param thd Thread handle
 * @retval 1 the query is not filtered, 0 otherwise.
 */
bool thd_binlog_filter_ok(const MYSQL_THD thd);
}
112 | |
113 | MYSQL_PLUGIN_IMPORT bool my_disable_leak_check; |
114 | |
115 | // Needed in rocksdb_init_func |
116 | void ignore_db_dirs_append(const char *dirname_arg); |
117 | |
118 | |
119 | namespace myrocks { |
120 | |
121 | static st_global_stats global_stats; |
122 | static st_export_stats export_stats; |
123 | static st_memory_stats memory_stats; |
124 | static st_io_stall_stats io_stall_stats; |
125 | |
126 | const std::string DEFAULT_CF_NAME("default" ); |
127 | const std::string DEFAULT_SYSTEM_CF_NAME("__system__" ); |
128 | const std::string PER_INDEX_CF_NAME("$per_index_cf" ); |
129 | |
130 | /** |
131 | Updates row counters based on the table type and operation type. |
132 | */ |
133 | void ha_rocksdb::update_row_stats(const operation_type &type) { |
134 | DBUG_ASSERT(type < ROWS_MAX); |
135 | // Find if we are modifying system databases. |
136 | if (table->s && m_tbl_def->m_is_mysql_system_table) |
137 | global_stats.system_rows[type].inc(); |
138 | else |
139 | global_stats.rows[type].inc(); |
140 | } |
141 | |
142 | void dbug_dump_database(rocksdb::DB *db); |
143 | static handler *rocksdb_create_handler(my_core::handlerton *hton, |
144 | my_core::TABLE_SHARE *table_arg, |
145 | my_core::MEM_ROOT *mem_root); |
146 | |
147 | static rocksdb::CompactRangeOptions getCompactRangeOptions() { |
148 | rocksdb::CompactRangeOptions compact_range_options; |
149 | compact_range_options.bottommost_level_compaction = |
150 | rocksdb::BottommostLevelCompaction::kForce; |
151 | compact_range_options.exclusive_manual_compaction = false; |
152 | return compact_range_options; |
153 | } |
154 | |
155 | /////////////////////////////////////////////////////////// |
156 | // Parameters and settings |
157 | /////////////////////////////////////////////////////////// |
158 | static char *rocksdb_default_cf_options = nullptr; |
159 | static char *rocksdb_override_cf_options = nullptr; |
160 | static char *rocksdb_update_cf_options = nullptr; |
161 | |
162 | /////////////////////////////////////////////////////////// |
163 | // Globals |
164 | /////////////////////////////////////////////////////////// |
165 | handlerton *rocksdb_hton; |
166 | |
167 | rocksdb::TransactionDB *rdb = nullptr; |
168 | rocksdb::HistogramImpl *commit_latency_stats = nullptr; |
169 | |
170 | static std::shared_ptr<rocksdb::Statistics> rocksdb_stats; |
171 | static std::unique_ptr<rocksdb::Env> flashcache_aware_env; |
172 | static std::shared_ptr<Rdb_tbl_prop_coll_factory> properties_collector_factory; |
173 | |
174 | Rdb_dict_manager dict_manager; |
175 | Rdb_cf_manager cf_manager; |
176 | Rdb_ddl_manager ddl_manager; |
177 | Rdb_binlog_manager binlog_manager; |
178 | |
179 | #if !defined(_WIN32) && !defined(__APPLE__) |
180 | Rdb_io_watchdog *io_watchdog = nullptr; |
181 | #endif |
/**
  MyRocks background thread control.
  N.B. This is in addition to RocksDB's own background threads
  (@see rocksdb::CancelAllBackgroundWork()).
*/
187 | |
188 | static Rdb_background_thread rdb_bg_thread; |
189 | |
190 | // List of table names (using regex) that are exceptions to the strict |
191 | // collation check requirement. |
192 | Regex_list_handler *rdb_collation_exceptions; |
193 | |
194 | static const char **rdb_get_error_messages(int nr); |
195 | |
196 | static void rocksdb_flush_all_memtables() { |
197 | const Rdb_cf_manager &cf_manager = rdb_get_cf_manager(); |
198 | for (const auto &cf_handle : cf_manager.get_all_cf()) { |
199 | rdb->Flush(rocksdb::FlushOptions(), cf_handle); |
200 | } |
201 | } |
202 | |
/*
  No-op "update" callback for the compact-column-family sysvar: the actual
  compaction is triggered from the validate callback
  (rocksdb_compact_column_family), so there is nothing left to apply here.
*/
static void rocksdb_compact_column_family_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
206 | |
/*
  "Validate" callback for the compact-column-family sysvar: assigning a column
  family name triggers a manual compaction of that CF's entire key range.
  Always returns HA_EXIT_SUCCESS; an unknown CF name or an uninitialized DB is
  silently ignored.
*/
static int rocksdb_compact_column_family(THD *const thd,
                                         struct st_mysql_sys_var *const var,
                                         void *const var_ptr,
                                         struct st_mysql_value *const value) {
  char buff[STRING_BUFFER_USUAL_SIZE];
  int len = sizeof(buff);

  DBUG_ASSERT(value != nullptr);

  if (const char *const cf = value->val_str(value, buff, &len)) {
    auto cfh = cf_manager.get_cf(cf);
    if (cfh != nullptr && rdb != nullptr) {
      sql_print_verbose_info("RocksDB: Manual compaction of column family: %s\n",
                             cf);
      // nullptr begin/end keys == compact the whole key range of this CF.
      rdb->CompactRange(getCompactRangeOptions(), cfh, nullptr, nullptr);
    }
  }
  return HA_EXIT_SUCCESS;
}
226 | |
227 | /////////////////////////////////////////////////////////// |
228 | // Hash map: table name => open table handler |
229 | /////////////////////////////////////////////////////////// |
230 | |
namespace // anonymous namespace = not visible outside this source file
{

const ulong TABLE_HASH_SIZE = 32;
typedef Hash_set<Rdb_table_handler> Rdb_table_set;

/*
  Registry of open tables: table name => shared Rdb_table_handler.
  m_mutex guards all access to m_hash.
*/
struct Rdb_open_tables_map {
  /* Hash table used to track the handlers of open tables */
  Rdb_table_set m_hash;
  /* The mutex used to protect the hash table */
  mutable mysql_mutex_t m_mutex;

  /* Hash callback: extracts the key and its length from a table handler. */
  static uchar *get_hash_key(const Rdb_table_handler *const table_handler,
                             size_t *const length,
                             my_bool not_used MY_ATTRIBUTE((__unused__)));

  /* Returns the handler registered under table_name (defined elsewhere). */
  Rdb_table_handler *get_table_handler(const char *const table_name);
  /* Releases a reference obtained via get_table_handler. */
  void release_table_handler(Rdb_table_handler *const table_handler);

  Rdb_open_tables_map() : m_hash(get_hash_key, system_charset_info) { }

  /* Snapshot of the names of all currently tracked tables. */
  std::vector<std::string> get_table_names(void) const;
};

} // anonymous namespace
256 | |
257 | static Rdb_open_tables_map rdb_open_tables; |
258 | |
/*
  Strip all trailing '/' characters from a directory path, e.g.
  "/data/ckpt///" -> "/data/ckpt". An empty or all-slash input yields "".
*/
static std::string rdb_normalize_dir(std::string dir) {
  const std::string::size_type last_keep = dir.find_last_not_of('/');
  if (last_keep == std::string::npos) {
    dir.clear();
  } else {
    dir.erase(last_keep + 1);
  }
  return dir;
}
265 | |
/*
  "Validate" callback for the rocksdb_create_checkpoint sysvar: assigning a
  directory path creates a RocksDB checkpoint in that directory.

  @return HA_EXIT_SUCCESS when the checkpoint was created,
          HA_EXIT_FAILURE otherwise (NULL value, DB not open, or a RocksDB
          error — which is reported to the client via rdb_error_to_mysql).
*/
static int rocksdb_create_checkpoint(
    THD *const thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const save MY_ATTRIBUTE((__unused__)),
    struct st_mysql_value *const value) {
  char buf[FN_REFLEN];
  int len = sizeof(buf);
  const char *const checkpoint_dir_raw = value->val_str(value, buf, &len);
  if (checkpoint_dir_raw) {
    if (rdb != nullptr) {
      // Strip trailing '/' so RocksDB gets a canonical directory name.
      std::string checkpoint_dir = rdb_normalize_dir(checkpoint_dir_raw);
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: creating checkpoint in directory : %s\n",
                            checkpoint_dir.c_str());
      rocksdb::Checkpoint *checkpoint;
      auto status = rocksdb::Checkpoint::Create(rdb, &checkpoint);
      // We can only return HA_EXIT_FAILURE/HA_EXIT_SUCCESS here which is why
      // the return code is ignored, but by calling into rdb_error_to_mysql,
      // it will call my_error for us, which will propagate up to the client.
      int rc __attribute__((__unused__));
      if (status.ok()) {
        status = checkpoint->CreateCheckpoint(checkpoint_dir.c_str());
        delete checkpoint;
        if (status.ok()) {
          sql_print_information(
              "RocksDB: created checkpoint in directory : %s\n",
              checkpoint_dir.c_str());
          return HA_EXIT_SUCCESS;
        } else {
          rc = ha_rocksdb::rdb_error_to_mysql(status);
        }
      } else {
        rc = ha_rocksdb::rdb_error_to_mysql(status);
      }
    }
  }
  return HA_EXIT_FAILURE;
}
304 | |
/* This method is needed to indicate that the
   ROCKSDB_CREATE_CHECKPOINT command is not read-only; the checkpoint itself
   is created in the validate callback (rocksdb_create_checkpoint), so this
   "update" callback has nothing left to do. */
static void rocksdb_create_checkpoint_stub(THD *const thd,
                                           struct st_mysql_sys_var *const var,
                                           void *const var_ptr,
                                           const void *const save) {}
311 | |
/* No-op "update" callback: rocksdb_force_flush_memtable_now does all of its
   work in the validate callback, so nothing is left to apply here. */
static void rocksdb_force_flush_memtable_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
315 | |
/*
  "Validate" callback for rocksdb_force_flush_memtable_now: setting the
  variable flushes all memtables. Always returns success.
*/
static int rocksdb_force_flush_memtable_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  sql_print_information("RocksDB: Manual memtable flush.");
  rocksdb_flush_all_memtables();
  return HA_EXIT_SUCCESS;
}
323 | |
/* No-op "update" callback: rocksdb_force_flush_memtable_and_lzero_now does
   all of its work in the validate callback. */
static void rocksdb_force_flush_memtable_and_lzero_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
327 | |
/*
  "Validate" callback for rocksdb_force_flush_memtable_and_lzero_now: flushes
  all memtables, then compacts every L0 file of every column family into L1.

  @return HA_EXIT_SUCCESS, or HA_EXIT_FAILURE when CompactFiles fails with
          anything other than an Aborted status.
*/
static int rocksdb_force_flush_memtable_and_lzero_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  sql_print_information("RocksDB: Manual memtable and L0 flush.");
  rocksdb_flush_all_memtables();

  const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
  rocksdb::CompactionOptions c_options = rocksdb::CompactionOptions();
  rocksdb::ColumnFamilyMetaData metadata;
  rocksdb::ColumnFamilyDescriptor cf_descr;

  for (const auto &cf_handle : cf_manager.get_all_cf()) {
    rdb->GetColumnFamilyMetaData(cf_handle, &metadata);
    cf_handle->GetDescriptor(&cf_descr);
    // Size output files according to this CF's configured target file size.
    c_options.output_file_size_limit = cf_descr.options.target_file_size_base;

    DBUG_ASSERT(metadata.levels[0].level == 0);
    // Collect the full paths of all L0 files of this column family.
    std::vector<std::string> file_names;
    for (auto &file : metadata.levels[0].files) {
      file_names.emplace_back(file.db_path + file.name);
    }

    if (!file_names.empty()) {
      rocksdb::Status s;
      // Compact the collected L0 files down to level 1.
      s = rdb->CompactFiles(c_options, cf_handle, file_names, 1);

      // Aborted is tolerated (e.g. the files were presumably already picked
      // up by a concurrent compaction); any other error is reported.
      if (!s.ok() && !s.IsAborted()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
        return HA_EXIT_FAILURE;
      }
    }
  }

  return HA_EXIT_SUCCESS;
}
363 | |
364 | static void rocksdb_drop_index_wakeup_thread( |
365 | my_core::THD *const thd MY_ATTRIBUTE((__unused__)), |
366 | struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)), |
367 | void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save); |
368 | |
369 | static my_bool rocksdb_pause_background_work = 0; |
370 | static mysql_mutex_t rdb_sysvars_mutex; |
371 | |
/*
  "Update" callback for rocksdb_pause_background_work: pauses or resumes
  RocksDB's background work when the requested state differs from the current
  one. rdb_sysvars_mutex serializes concurrent sysvar updates.
*/
static void rocksdb_set_pause_background_work(
    my_core::THD *const,
    struct st_mysql_sys_var *const,
    void *const, const void *const save) {
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  const my_bool pause_requested = *static_cast<const my_bool *>(save);
  // Only toggle when the state actually changes: Pause/Continue calls are
  // counted by RocksDB and must stay balanced.
  if (rocksdb_pause_background_work != pause_requested) {
    if (pause_requested) {
      rdb->PauseBackgroundWork();
    } else {
      rdb->ContinueBackgroundWork();
    }
    rocksdb_pause_background_work = pause_requested;
  }
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
388 | |
389 | static void rocksdb_set_compaction_options(THD *thd, |
390 | struct st_mysql_sys_var *var, |
391 | void *var_ptr, const void *save); |
392 | |
393 | static void rocksdb_set_table_stats_sampling_pct(THD *thd, |
394 | struct st_mysql_sys_var *var, |
395 | void *var_ptr, |
396 | const void *save); |
397 | |
398 | static void rocksdb_set_rate_limiter_bytes_per_sec(THD *thd, |
399 | struct st_mysql_sys_var *var, |
400 | void *var_ptr, |
401 | const void *save); |
402 | |
403 | static void rocksdb_set_sst_mgr_rate_bytes_per_sec(THD *thd, |
404 | struct st_mysql_sys_var *var, |
405 | void *var_ptr, |
406 | const void *save); |
407 | |
408 | static void rocksdb_set_delayed_write_rate(THD *thd, |
409 | struct st_mysql_sys_var *var, |
410 | void *var_ptr, const void *save); |
411 | |
412 | static void rocksdb_set_max_latest_deadlocks(THD *thd, |
413 | struct st_mysql_sys_var *var, |
414 | void *var_ptr, const void *save); |
415 | |
416 | static void rdb_set_collation_exception_list(const char *exception_list); |
417 | static void rocksdb_set_collation_exception_list(THD *thd, |
418 | struct st_mysql_sys_var *var, |
419 | void *var_ptr, |
420 | const void *save); |
421 | |
422 | static int rocksdb_validate_update_cf_options(THD *thd, |
423 | struct st_mysql_sys_var *var, |
424 | void *save, |
425 | st_mysql_value *value); |
426 | |
427 | static void rocksdb_set_update_cf_options(THD *thd, |
428 | struct st_mysql_sys_var *var, |
429 | void *var_ptr, const void *save); |
430 | |
431 | static int rocksdb_check_bulk_load(THD *const thd, |
432 | struct st_mysql_sys_var *var |
433 | MY_ATTRIBUTE((__unused__)), |
434 | void *save, |
435 | struct st_mysql_value *value); |
436 | |
437 | static int rocksdb_check_bulk_load_allow_unsorted( |
438 | THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), |
439 | void *save, struct st_mysql_value *value); |
440 | |
441 | static void rocksdb_set_max_background_jobs(THD *thd, |
442 | struct st_mysql_sys_var *const var, |
443 | void *const var_ptr, |
444 | const void *const save); |
445 | static void rocksdb_set_bytes_per_sync(THD *thd, |
446 | struct st_mysql_sys_var *const var, |
447 | void *const var_ptr, |
448 | const void *const save); |
449 | static void rocksdb_set_wal_bytes_per_sync(THD *thd, |
450 | struct st_mysql_sys_var *const var, |
451 | void *const var_ptr, |
452 | const void *const save); |
453 | ////////////////////////////////////////////////////////////////////////////// |
454 | // Options definitions |
455 | ////////////////////////////////////////////////////////////////////////////// |
456 | static long long rocksdb_block_cache_size; |
457 | static long long rocksdb_sim_cache_size; |
458 | static my_bool rocksdb_use_clock_cache; |
459 | /* Use unsigned long long instead of uint64_t because of MySQL compatibility */ |
460 | static unsigned long long // NOLINT(runtime/int) |
461 | rocksdb_rate_limiter_bytes_per_sec; |
462 | static unsigned long long // NOLINT(runtime/int) |
463 | rocksdb_sst_mgr_rate_bytes_per_sec; |
464 | static unsigned long long rocksdb_delayed_write_rate; |
465 | static uint32_t rocksdb_max_latest_deadlocks; |
466 | static unsigned long // NOLINT(runtime/int) |
467 | rocksdb_persistent_cache_size_mb; |
468 | static ulong rocksdb_info_log_level; |
469 | static char *rocksdb_wal_dir; |
470 | static char *rocksdb_persistent_cache_path; |
471 | static ulong rocksdb_index_type; |
472 | static uint32_t rocksdb_flush_log_at_trx_commit; |
473 | static uint32_t rocksdb_debug_optimizer_n_rows; |
474 | static my_bool rocksdb_force_compute_memtable_stats; |
475 | static uint32_t rocksdb_force_compute_memtable_stats_cachetime; |
476 | static my_bool rocksdb_debug_optimizer_no_zero_cardinality; |
477 | static uint32_t rocksdb_wal_recovery_mode; |
478 | static uint32_t rocksdb_access_hint_on_compaction_start; |
479 | static char *rocksdb_compact_cf_name; |
480 | static char *rocksdb_checkpoint_name; |
481 | static my_bool rocksdb_signal_drop_index_thread; |
482 | static my_bool rocksdb_strict_collation_check = 1; |
483 | static my_bool rocksdb_ignore_unknown_options = 1; |
484 | static my_bool rocksdb_enable_2pc = 0; |
485 | static char *rocksdb_strict_collation_exceptions; |
486 | static my_bool rocksdb_collect_sst_properties = 1; |
487 | static my_bool rocksdb_force_flush_memtable_now_var = 0; |
488 | static my_bool rocksdb_force_flush_memtable_and_lzero_now_var = 0; |
489 | static my_bool rocksdb_enable_ttl = 1; |
490 | static my_bool rocksdb_enable_ttl_read_filtering = 1; |
491 | static int rocksdb_debug_ttl_rec_ts = 0; |
492 | static int rocksdb_debug_ttl_snapshot_ts = 0; |
493 | static int rocksdb_debug_ttl_read_filter_ts = 0; |
494 | static my_bool rocksdb_debug_ttl_ignore_pk = 0; |
495 | static my_bool rocksdb_reset_stats = 0; |
496 | static uint32_t rocksdb_io_write_timeout_secs = 0; |
497 | static uint32_t rocksdb_seconds_between_stat_computes = 3600; |
498 | static long long rocksdb_compaction_sequential_deletes = 0l; |
499 | static long long rocksdb_compaction_sequential_deletes_window = 0l; |
500 | static long long rocksdb_compaction_sequential_deletes_file_size = 0l; |
501 | static uint32_t rocksdb_validate_tables = 1; |
502 | static char *rocksdb_datadir; |
503 | static uint32_t rocksdb_table_stats_sampling_pct; |
504 | static my_bool rocksdb_enable_bulk_load_api = 1; |
505 | static my_bool rocksdb_print_snapshot_conflict_queries = 0; |
506 | static my_bool rocksdb_large_prefix = 0; |
507 | static my_bool rocksdb_allow_to_start_after_corruption = 0; |
508 | static char* rocksdb_git_hash; |
509 | |
510 | char *compression_types_val= |
511 | const_cast<char*>(get_rocksdb_supported_compression_types()); |
512 | |
513 | std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0); |
514 | std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0); |
515 | std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0); |
516 | std::atomic<uint64_t> rocksdb_wal_group_syncs(0); |
517 | |
/*
  Build the base rocksdb::DBOptions used when opening the database. Several
  of these fields are also exposed through the MYSQL_SYSVAR definitions below
  (create_if_missing, two_write_queues, manual_wal_flush, ...).
*/
static std::unique_ptr<rocksdb::DBOptions> rdb_init_rocksdb_db_options(void) {
  auto o = std::unique_ptr<rocksdb::DBOptions>(new rocksdb::DBOptions());

  o->create_if_missing = true;
  // Register our event listener, wired to the DDL manager.
  o->listeners.push_back(std::make_shared<Rdb_event_listener>(&ddl_manager));
  o->info_log_level = rocksdb::InfoLogLevel::INFO_LEVEL;
  o->max_subcompactions = DEFAULT_SUBCOMPACTIONS;
  o->max_open_files = -2; // auto-tune to 50% open_files_limit

  o->two_write_queues = true;
  o->manual_wal_flush = true;
  return o;
}
531 | |
532 | /* DBOptions contains Statistics and needs to be destructed last */ |
533 | static std::unique_ptr<rocksdb::BlockBasedTableOptions> rocksdb_tbl_options = |
534 | std::unique_ptr<rocksdb::BlockBasedTableOptions>( |
535 | new rocksdb::BlockBasedTableOptions()); |
536 | static std::unique_ptr<rocksdb::DBOptions> rocksdb_db_options = |
537 | rdb_init_rocksdb_db_options(); |
538 | |
539 | static std::shared_ptr<rocksdb::RateLimiter> rocksdb_rate_limiter; |
540 | |
541 | /* This enum needs to be kept up to date with rocksdb::InfoLogLevel */ |
542 | static const char *info_log_level_names[] = {"debug_level" , "info_level" , |
543 | "warn_level" , "error_level" , |
544 | "fatal_level" , NullS}; |
545 | |
546 | static TYPELIB info_log_level_typelib = { |
547 | array_elements(info_log_level_names) - 1, "info_log_level_typelib" , |
548 | info_log_level_names, nullptr}; |
549 | |
/*
  "Update" callback for rocksdb_info_log_level: stores the new level and
  applies it to the live RocksDB info log.
  NOTE(review): assumes rocksdb_db_options->info_log is non-null by the time
  this sysvar can be updated — confirm against the init order.
*/
static void rocksdb_set_rocksdb_info_log_level(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {
  DBUG_ASSERT(save != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  rocksdb_info_log_level = *static_cast<const uint64_t *>(save);
  rocksdb_db_options->info_log->SetInfoLogLevel(
      static_cast<const rocksdb::InfoLogLevel>(rocksdb_info_log_level));
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
561 | |
/*
  "Update" callback for rocksdb_reset_stats: when set to true, resets both
  the DB's internal stats and the shared rocksdb::Statistics object.
*/
static void rocksdb_set_reset_stats(
    my_core::THD *const /* unused */,
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr, const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rdb != nullptr);
  DBUG_ASSERT(rocksdb_stats != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  // var_ptr presumably aliases rocksdb_reset_stats, which is read below —
  // publish the new value first.
  *static_cast<bool *>(var_ptr) = *static_cast<const bool *>(save);

  if (rocksdb_reset_stats) {
    rocksdb::Status s = rdb->ResetStats();

    // RocksDB will always return success. Let's document this assumption here
    // as well so that we'll get immediately notified when contract changes.
    DBUG_ASSERT(s == rocksdb::Status::OK());

    s = rocksdb_stats->Reset();
    DBUG_ASSERT(s == rocksdb::Status::OK());
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
587 | |
/*
  "Update" callback for rocksdb_io_write_timeout: stores the new timeout and
  re-arms the I/O watchdog (the watchdog is not built on Windows/macOS).
*/
static void rocksdb_set_io_write_timeout(
    my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rdb != nullptr);
#if !defined(_WIN32) && !defined(__APPLE__)
  DBUG_ASSERT(io_watchdog != nullptr);
#endif

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  const uint32_t new_val = *static_cast<const uint32_t *>(save);

  rocksdb_io_write_timeout_secs = new_val;
#if !defined(_WIN32) && !defined(__APPLE__)
  io_watchdog->reset_timeout(rocksdb_io_write_timeout_secs);
#endif
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
608 | |
/* Possible values of rocksdb_flush_log_at_trx_commit (WAL sync policy). */
enum rocksdb_flush_log_at_trx_commit_type : unsigned int {
  FLUSH_LOG_NEVER = 0,  /* never sync the WAL eagerly */
  FLUSH_LOG_SYNC,       /* sync the WAL at commit */
  FLUSH_LOG_BACKGROUND, /* sync the WAL in the background */
  FLUSH_LOG_MAX /* must be last */
};
615 | |
616 | static int rocksdb_validate_flush_log_at_trx_commit( |
617 | THD *const thd, |
618 | struct st_mysql_sys_var *const var, /* in: pointer to system variable */ |
619 | void *var_ptr, /* out: immediate result for update function */ |
620 | struct st_mysql_value *const value /* in: incoming value */) { |
621 | long long new_value; |
622 | |
623 | /* value is NULL */ |
624 | if (value->val_int(value, &new_value)) { |
625 | return HA_EXIT_FAILURE; |
626 | } |
627 | |
628 | if (rocksdb_db_options->allow_mmap_writes && new_value != FLUSH_LOG_NEVER) { |
629 | return HA_EXIT_FAILURE; |
630 | } |
631 | |
632 | *static_cast<uint32_t *>(var_ptr) = static_cast<uint32_t>(new_value); |
633 | return HA_EXIT_SUCCESS; |
634 | } |
635 | |
636 | static const char *index_type_names[] = {"kBinarySearch" , "kHashSearch" , NullS}; |
637 | |
638 | static TYPELIB index_type_typelib = {array_elements(index_type_names) - 1, |
639 | "index_type_typelib" , index_type_names, |
640 | nullptr}; |
641 | |
/* Bounds and defaults used by the sysvar definitions below. */
const ulong RDB_MAX_LOCK_WAIT_SECONDS = 1024 * 1024 * 1024;
const ulong RDB_MAX_ROW_LOCKS = 1024 * 1024;
const ulong RDB_DEFAULT_BULK_LOAD_SIZE = 1000;
const ulong RDB_MAX_BULK_LOAD_SIZE = 1024 * 1024 * 1024;
const size_t RDB_DEFAULT_MERGE_BUF_SIZE = 64 * 1024 * 1024; /* 64MB */
const size_t RDB_MIN_MERGE_BUF_SIZE = 100;                  /* 100B */
const size_t RDB_DEFAULT_MERGE_COMBINE_READ_SIZE = 1024 * 1024 * 1024; /* 1GB */
const size_t RDB_MIN_MERGE_COMBINE_READ_SIZE = 100;
const size_t RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY = 0;
const size_t RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY = 0;
const int64 RDB_DEFAULT_BLOCK_CACHE_SIZE = 512 * 1024 * 1024; /* 512MB */
const int64 RDB_MIN_BLOCK_CACHE_SIZE = 1024;
const int RDB_MAX_CHECKSUMS_PCT = 100;
const ulong RDB_DEADLOCK_DETECT_DEPTH = 50;
656 | |
657 | // TODO: 0 means don't wait at all, and we don't support it yet? |
658 | static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG, |
659 | "Number of seconds to wait for lock" , nullptr, |
660 | nullptr, /*default*/ 1, /*min*/ 1, |
661 | /*max*/ RDB_MAX_LOCK_WAIT_SECONDS, 0); |
662 | |
663 | static MYSQL_THDVAR_BOOL(deadlock_detect, PLUGIN_VAR_RQCMDARG, |
664 | "Enables deadlock detection" , nullptr, nullptr, FALSE); |
665 | |
666 | static MYSQL_THDVAR_ULONG(deadlock_detect_depth, PLUGIN_VAR_RQCMDARG, |
667 | "Number of transactions deadlock detection will " |
668 | "traverse through before assuming deadlock" , |
669 | nullptr, nullptr, |
670 | /*default*/ RDB_DEADLOCK_DETECT_DEPTH, |
671 | /*min*/ 2, |
672 | /*max*/ ULONG_MAX, 0); |
673 | |
674 | static MYSQL_THDVAR_BOOL( |
675 | trace_sst_api, PLUGIN_VAR_RQCMDARG, |
676 | "Generate trace output in the log for each call to the SstFileWriter" , |
677 | nullptr, nullptr, FALSE); |
678 | |
679 | static MYSQL_THDVAR_BOOL( |
680 | bulk_load, PLUGIN_VAR_RQCMDARG, |
681 | "Use bulk-load mode for inserts. This disables " |
682 | "unique_checks and enables rocksdb_commit_in_the_middle." , |
683 | rocksdb_check_bulk_load, nullptr, FALSE); |
684 | |
685 | static MYSQL_THDVAR_BOOL(bulk_load_allow_unsorted, PLUGIN_VAR_RQCMDARG, |
686 | "Allow unsorted input during bulk-load. " |
687 | "Can be changed only when bulk load is disabled." , |
688 | rocksdb_check_bulk_load_allow_unsorted, nullptr, |
689 | FALSE); |
690 | |
691 | static MYSQL_SYSVAR_BOOL(enable_bulk_load_api, rocksdb_enable_bulk_load_api, |
692 | PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, |
693 | "Enables using SstFileWriter for bulk loading" , |
694 | nullptr, nullptr, rocksdb_enable_bulk_load_api); |
695 | |
696 | static MYSQL_SYSVAR_STR(git_hash, rocksdb_git_hash, |
697 | PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, |
698 | "Git revision of the RocksDB library used by MyRocks" , |
699 | nullptr, nullptr, ROCKSDB_GIT_HASH); |
700 | |
701 | static MYSQL_THDVAR_STR(tmpdir, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC, |
702 | "Directory for temporary files during DDL operations." , |
703 | nullptr, nullptr, "" ); |
704 | |
705 | static MYSQL_THDVAR_STR( |
706 | skip_unique_check_tables, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, |
707 | "Skip unique constraint checking for the specified tables" , nullptr, |
708 | nullptr, ".*" ); |
709 | |
710 | static MYSQL_THDVAR_BOOL( |
711 | commit_in_the_middle, PLUGIN_VAR_RQCMDARG, |
712 | "Commit rows implicitly every rocksdb_bulk_load_size, on bulk load/insert, " |
713 | "update and delete" , |
714 | nullptr, nullptr, FALSE); |
715 | |
716 | static MYSQL_THDVAR_BOOL( |
717 | blind_delete_primary_key, PLUGIN_VAR_RQCMDARG, |
718 | "Deleting rows by primary key lookup, without reading rows (Blind Deletes)." |
719 | " Blind delete is disabled if the table has secondary key" , |
720 | nullptr, nullptr, FALSE); |
721 | |
722 | static MYSQL_THDVAR_STR( |
723 | read_free_rpl_tables, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, |
724 | "List of tables that will use read-free replication on the slave " |
725 | "(i.e. not lookup a row during replication)" , |
726 | nullptr, nullptr, "" ); |
727 | |
728 | static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG, |
729 | "Skip using bloom filter for reads" , nullptr, nullptr, |
730 | FALSE); |
731 | |
/*
  Session-scoped (MYSQL_THDVAR_*) variables: each connection gets its own
  value, initialized from the default given in the macro. This group covers
  row locking, write-batch limits, and the buffers used by bulk load and
  inplace/fast index creation.
*/
static MYSQL_THDVAR_ULONG(max_row_locks, PLUGIN_VAR_RQCMDARG,
                          "Maximum number of locks a transaction can have" ,
                          nullptr, nullptr,
                          /*default*/ RDB_MAX_ROW_LOCKS,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_ROW_LOCKS, 0);

static MYSQL_THDVAR_ULONGLONG(
    write_batch_max_bytes, PLUGIN_VAR_RQCMDARG,
    "Maximum size of write batch in bytes. 0 means no limit." , nullptr, nullptr,
    /* default */ 0, /* min */ 0, /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_BOOL(
    lock_scanned_rows, PLUGIN_VAR_RQCMDARG,
    "Take and hold locks on rows that are scanned but not updated" , nullptr,
    nullptr, FALSE);

static MYSQL_THDVAR_ULONG(bulk_load_size, PLUGIN_VAR_RQCMDARG,
                          "Max #records in a batch for bulk-load mode" , nullptr,
                          nullptr,
                          /*default*/ RDB_DEFAULT_BULK_LOAD_SIZE,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_BULK_LOAD_SIZE, 0);

static MYSQL_THDVAR_ULONGLONG(
    merge_buf_size, PLUGIN_VAR_RQCMDARG,
    "Size to allocate for merge sort buffers written out to disk "
    "during inplace index creation." ,
    nullptr, nullptr,
    /* default (64MB) */ RDB_DEFAULT_MERGE_BUF_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_BUF_SIZE,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(
    merge_combine_read_size, PLUGIN_VAR_RQCMDARG,
    "Size that we have to work with during combine (reading from disk) phase "
    "of "
    "external sort during fast index creation." ,
    nullptr, nullptr,
    /* default (1GB) */ RDB_DEFAULT_MERGE_COMBINE_READ_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_COMBINE_READ_SIZE,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(
    merge_tmp_file_removal_delay_ms, PLUGIN_VAR_RQCMDARG,
    "Fast index creation creates a large tmp file on disk during index "
    "creation. Removing this large file all at once when index creation is "
    "complete can cause trim stalls on Flash. This variable specifies a "
    "duration to sleep (in milliseconds) between calling chsize() to truncate "
    "the file in chunks. The chunk size is the same as merge_buf_size." ,
    nullptr, nullptr,
    /* default (0ms) */ RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY,
    /* min (0ms) */ RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY,
    /* max */ SIZE_T_MAX, 1);
786 | |
/*
  Server-wide (MYSQL_SYSVAR_*) variables that expose rocksdb::DBOptions
  fields directly. The read-only ones store straight into the shared
  rocksdb_db_options object, which is then used when the DB is opened.

  NOTE(review): the *reinterpret_cast<my_bool *>(&<bool field>) pattern used
  throughout assumes my_bool and C++ bool have identical size and
  representation — presumably true on supported platforms, but worth
  confirming if a new platform is added.
*/
static MYSQL_SYSVAR_BOOL(
    create_if_missing,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->create_if_missing),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_if_missing for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->create_if_missing);

static MYSQL_SYSVAR_BOOL(
    two_write_queues,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->two_write_queues),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::two_write_queues for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->two_write_queues);

static MYSQL_SYSVAR_BOOL(
    manual_wal_flush,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->manual_wal_flush),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::manual_wal_flush for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->manual_wal_flush);

static MYSQL_SYSVAR_BOOL(
    create_missing_column_families,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->create_missing_column_families),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_missing_column_families for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->create_missing_column_families);

static MYSQL_SYSVAR_BOOL(
    error_if_exists,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->error_if_exists),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::error_if_exists for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->error_if_exists);

static MYSQL_SYSVAR_BOOL(
    paranoid_checks,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->paranoid_checks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::paranoid_checks for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->paranoid_checks);

/* Dynamically updatable: the update callback pushes the new value into the
   running DB's rate limiter. */
static MYSQL_SYSVAR_ULONGLONG(
    rate_limiter_bytes_per_sec, rocksdb_rate_limiter_bytes_per_sec,
    PLUGIN_VAR_RQCMDARG, "DBOptions::rate_limiter bytes_per_sec for RocksDB" ,
    nullptr, rocksdb_set_rate_limiter_bytes_per_sec, /* default */ 0L,
    /* min */ 0L, /* max */ MAX_RATE_LIMITER_BYTES_PER_SEC, 0);

static MYSQL_SYSVAR_ULONGLONG(
    sst_mgr_rate_bytes_per_sec, rocksdb_sst_mgr_rate_bytes_per_sec,
    PLUGIN_VAR_RQCMDARG,
    "DBOptions::sst_file_manager rate_bytes_per_sec for RocksDB" , nullptr,
    rocksdb_set_sst_mgr_rate_bytes_per_sec,
    /* default */ DEFAULT_SST_MGR_RATE_BYTES_PER_SEC,
    /* min */ 0L, /* max */ UINT64_MAX, 0);

static MYSQL_SYSVAR_ULONGLONG(delayed_write_rate, rocksdb_delayed_write_rate,
                              PLUGIN_VAR_RQCMDARG,
                              "DBOptions::delayed_write_rate" , nullptr,
                              rocksdb_set_delayed_write_rate,
                              rocksdb_db_options->delayed_write_rate, 0,
                              UINT64_MAX, 0);

static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks,
                         PLUGIN_VAR_RQCMDARG,
                         "Maximum number of recent "
                         "deadlocks to store" ,
                         nullptr, rocksdb_set_max_latest_deadlocks,
                         rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0);
857 | |
858 | static MYSQL_SYSVAR_ENUM( |
859 | info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG, |
860 | "Filter level for info logs to be written mysqld error log. " |
861 | "Valid values include 'debug_level', 'info_level', 'warn_level'" |
862 | "'error_level' and 'fatal_level'." , |
863 | nullptr, rocksdb_set_rocksdb_info_log_level, |
864 | rocksdb::InfoLogLevel::ERROR_LEVEL, &info_log_level_typelib); |
865 | |
/* Session variable: granularity of RocksDB's internal perf-context timers. */
static MYSQL_THDVAR_INT(
    perf_context_level, PLUGIN_VAR_RQCMDARG,
    "Perf Context Level for rocksdb internal timer stat collection" , nullptr,
    nullptr,
    /* default */ rocksdb::PerfLevel::kUninitialized,
    /* min */ rocksdb::PerfLevel::kUninitialized,
    /* max */ rocksdb::PerfLevel::kOutOfBounds - 1, 0);

static MYSQL_SYSVAR_UINT(
    wal_recovery_mode, rocksdb_wal_recovery_mode, PLUGIN_VAR_RQCMDARG,
    "DBOptions::wal_recovery_mode for RocksDB. Default is kAbsoluteConsistency" ,
    nullptr, nullptr,
    /* default */ (uint)rocksdb::WALRecoveryMode::kAbsoluteConsistency,
    /* min */ (uint)rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords,
    /* max */ (uint)rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords, 0);

static MYSQL_SYSVAR_SIZE_T(compaction_readahead_size,
                           rocksdb_db_options->compaction_readahead_size,
                           PLUGIN_VAR_RQCMDARG,
                           "DBOptions::compaction_readahead_size for RocksDB" ,
                           nullptr, nullptr,
                           rocksdb_db_options->compaction_readahead_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    new_table_reader_for_compaction_inputs,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->new_table_reader_for_compaction_inputs),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::new_table_reader_for_compaction_inputs for RocksDB" , nullptr,
    nullptr, rocksdb_db_options->new_table_reader_for_compaction_inputs);

static MYSQL_SYSVAR_UINT(
    access_hint_on_compaction_start, rocksdb_access_hint_on_compaction_start,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::access_hint_on_compaction_start for RocksDB" , nullptr, nullptr,
    /* default */ (uint)rocksdb::Options::AccessHint::NORMAL,
    /* min */ (uint)rocksdb::Options::AccessHint::NONE,
    /* max */ (uint)rocksdb::Options::AccessHint::WILLNEED, 0);

/* Default FALSE here overrides whatever rocksdb_db_options currently holds. */
static MYSQL_SYSVAR_BOOL(
    allow_concurrent_memtable_write,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->allow_concurrent_memtable_write),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_concurrent_memtable_write for RocksDB" , nullptr, nullptr,
    false);

static MYSQL_SYSVAR_BOOL(
    enable_write_thread_adaptive_yield,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->enable_write_thread_adaptive_yield),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::enable_write_thread_adaptive_yield for RocksDB" , nullptr,
    nullptr, false);

/* min is -2 (not -1): presumably a MyRocks-specific sentinel beyond RocksDB's
   "-1 = unlimited" — TODO confirm against the open() code that consumes it. */
static MYSQL_SYSVAR_INT(max_open_files, rocksdb_db_options->max_open_files,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::max_open_files for RocksDB" , nullptr,
                        nullptr, rocksdb_db_options->max_open_files,
                        /* min */ -2, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(max_total_wal_size,
                             rocksdb_db_options->max_total_wal_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::max_total_wal_size for RocksDB" , nullptr,
                             nullptr, rocksdb_db_options->max_total_wal_size,
                             /* min */ 0, /* max */ LONGLONG_MAX, 0);
934 | |
/* DBOptions mirrors for file sync, directory layout and log-file rotation. */
static MYSQL_SYSVAR_BOOL(
    use_fsync, *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_fsync),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_fsync for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->use_fsync);

static MYSQL_SYSVAR_STR(wal_dir, rocksdb_wal_dir,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::wal_dir for RocksDB" , nullptr, nullptr,
                        rocksdb_db_options->wal_dir.c_str());

static MYSQL_SYSVAR_STR(
    persistent_cache_path, rocksdb_persistent_cache_path,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Path for BlockBasedTableOptions::persistent_cache for RocksDB" , nullptr,
    nullptr, "" );

static MYSQL_SYSVAR_ULONG(
    persistent_cache_size_mb, rocksdb_persistent_cache_size_mb,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Size of cache in MB for BlockBasedTableOptions::persistent_cache "
    "for RocksDB" , nullptr, nullptr, rocksdb_persistent_cache_size_mb,
    /* min */ 0L, /* max */ ULONG_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(
    delete_obsolete_files_period_micros,
    rocksdb_db_options->delete_obsolete_files_period_micros,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::delete_obsolete_files_period_micros for RocksDB" , nullptr,
    nullptr, rocksdb_db_options->delete_obsolete_files_period_micros,
    /* min */ 0, /* max */ LONGLONG_MAX, 0);

/* Dynamically updatable via rocksdb_set_max_background_jobs. */
static MYSQL_SYSVAR_INT(max_background_jobs,
                        rocksdb_db_options->max_background_jobs,
                        PLUGIN_VAR_RQCMDARG,
                        "DBOptions::max_background_jobs for RocksDB" , nullptr,
                        rocksdb_set_max_background_jobs,
                        rocksdb_db_options->max_background_jobs,
                        /* min */ -1, /* max */ MAX_BACKGROUND_JOBS, 0);

static MYSQL_SYSVAR_UINT(max_subcompactions,
                         rocksdb_db_options->max_subcompactions,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::max_subcompactions for RocksDB" , nullptr,
                         nullptr, rocksdb_db_options->max_subcompactions,
                         /* min */ 1, /* max */ MAX_SUBCOMPACTIONS, 0);

static MYSQL_SYSVAR_SIZE_T(max_log_file_size,
                           rocksdb_db_options->max_log_file_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::max_log_file_size for RocksDB" , nullptr,
                           nullptr, rocksdb_db_options->max_log_file_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_SIZE_T(log_file_time_to_roll,
                           rocksdb_db_options->log_file_time_to_roll,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::log_file_time_to_roll for RocksDB" ,
                           nullptr, nullptr,
                           rocksdb_db_options->log_file_time_to_roll,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_SIZE_T(keep_log_file_num,
                           rocksdb_db_options->keep_log_file_num,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::keep_log_file_num for RocksDB" , nullptr,
                           nullptr, rocksdb_db_options->keep_log_file_num,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(max_manifest_file_size,
                             rocksdb_db_options->max_manifest_file_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::max_manifest_file_size for RocksDB" ,
                             nullptr, nullptr,
                             rocksdb_db_options->max_manifest_file_size,
                             /* min */ 0L, /* max */ ULONGLONG_MAX, 0);
1011 | |
/* DBOptions mirrors for table cache, WAL retention, direct/mmap I/O, sync
   throttling, and the block/simulated cache configuration. */
static MYSQL_SYSVAR_INT(table_cache_numshardbits,
                        rocksdb_db_options->table_cache_numshardbits,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::table_cache_numshardbits for RocksDB" ,
                        nullptr, nullptr,
                        rocksdb_db_options->table_cache_numshardbits,
                        /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(wal_ttl_seconds, rocksdb_db_options->WAL_ttl_seconds,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::WAL_ttl_seconds for RocksDB" , nullptr,
                             nullptr, rocksdb_db_options->WAL_ttl_seconds,
                             /* min */ 0L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(wal_size_limit_mb,
                             rocksdb_db_options->WAL_size_limit_MB,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::WAL_size_limit_MB for RocksDB" , nullptr,
                             nullptr, rocksdb_db_options->WAL_size_limit_MB,
                             /* min */ 0L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_SIZE_T(manifest_preallocation_size,
                           rocksdb_db_options->manifest_preallocation_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::manifest_preallocation_size for RocksDB" ,
                           nullptr, nullptr,
                           rocksdb_db_options->manifest_preallocation_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    use_direct_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_reads for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->use_direct_reads);

static MYSQL_SYSVAR_BOOL(
    use_direct_io_for_flush_and_compaction,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_io_for_flush_and_compaction),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_io_for_flush_and_compaction for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->use_direct_io_for_flush_and_compaction);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_reads for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->allow_mmap_reads);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_writes,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_writes),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_writes for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->allow_mmap_writes);

static MYSQL_SYSVAR_BOOL(
    is_fd_close_on_exec,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->is_fd_close_on_exec),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::is_fd_close_on_exec for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->is_fd_close_on_exec);

static MYSQL_SYSVAR_UINT(stats_dump_period_sec,
                         rocksdb_db_options->stats_dump_period_sec,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::stats_dump_period_sec for RocksDB" ,
                         nullptr, nullptr,
                         rocksdb_db_options->stats_dump_period_sec,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    advise_random_on_open,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->advise_random_on_open),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::advise_random_on_open for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->advise_random_on_open);

static MYSQL_SYSVAR_SIZE_T(db_write_buffer_size,
                           rocksdb_db_options->db_write_buffer_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::db_write_buffer_size for RocksDB" ,
                           nullptr, nullptr,
                           rocksdb_db_options->db_write_buffer_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    use_adaptive_mutex,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_adaptive_mutex),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_adaptive_mutex for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->use_adaptive_mutex);

/* bytes_per_sync / wal_bytes_per_sync are dynamically updatable via their
   set_* callbacks (not READONLY like most DBOptions mirrors above). */
static MYSQL_SYSVAR_UINT64_T(bytes_per_sync, rocksdb_db_options->bytes_per_sync,
                             PLUGIN_VAR_RQCMDARG,
                             "DBOptions::bytes_per_sync for RocksDB" , nullptr,
                             rocksdb_set_bytes_per_sync,
                             rocksdb_db_options->bytes_per_sync,
                             /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(wal_bytes_per_sync,
                             rocksdb_db_options->wal_bytes_per_sync,
                             PLUGIN_VAR_RQCMDARG,
                             "DBOptions::wal_bytes_per_sync for RocksDB" , nullptr,
                             rocksdb_set_wal_bytes_per_sync,
                             rocksdb_db_options->wal_bytes_per_sync,
                             /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    enable_thread_tracking,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->enable_thread_tracking),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::enable_thread_tracking for RocksDB" , nullptr, nullptr, true);

static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "block_cache size for RocksDB" , nullptr, nullptr,
                             /* default */ RDB_DEFAULT_BLOCK_CACHE_SIZE,
                             /* min */ RDB_MIN_BLOCK_CACHE_SIZE,
                             /* max */ LONGLONG_MAX,
                             /* Block size */ RDB_MIN_BLOCK_CACHE_SIZE);

static MYSQL_SYSVAR_LONGLONG(sim_cache_size, rocksdb_sim_cache_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "Simulated cache size for RocksDB" , nullptr,
                             nullptr,
                             /* default */ 0,
                             /* min */ 0,
                             /* max */ LONGLONG_MAX,
                             /* Block size */ 0);

static MYSQL_SYSVAR_BOOL(
    use_clock_cache,
    rocksdb_use_clock_cache,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Use ClockCache instead of default LRUCache for RocksDB" ,
    nullptr, nullptr, false);

static MYSQL_SYSVAR_BOOL(
    cache_index_and_filter_blocks,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->cache_index_and_filter_blocks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::cache_index_and_filter_blocks for RocksDB" ,
    nullptr, nullptr, true);
1158 | |
// When pin_l0_filter_and_index_blocks_in_cache is true, RocksDB will use the
// LRU cache, but will always keep the filter & index block's handle checked
// out (=won't call ShardedLRUCache::Release), plus the parsed-out objects
// will never be flushed out of the LRU cache, hence they're pinned.
//
// This fixes the mutex contention between ShardedLRUCache::Lookup and
// ShardedLRUCache::Release which reduced the QPS ratio (QPS using secondary
// index / QPS using PK).
/* See the rationale comment above: keeps L0 filter/index blocks pinned in the
   block cache to avoid LRU lookup/release mutex contention. */
static MYSQL_SYSVAR_BOOL(
    pin_l0_filter_and_index_blocks_in_cache,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->pin_l0_filter_and_index_blocks_in_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "pin_l0_filter_and_index_blocks_in_cache for RocksDB" , nullptr, nullptr,
    true);
1174 | |
/* BlockBasedTableOptions mirrors, column-family option strings, and
   session-level WriteOptions / optimizer-override variables. */
static MYSQL_SYSVAR_ENUM(index_type, rocksdb_index_type,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "BlockBasedTableOptions::index_type for RocksDB" ,
                         nullptr, nullptr,
                         (ulong)rocksdb_tbl_options->index_type,
                         &index_type_typelib);

static MYSQL_SYSVAR_BOOL(
    hash_index_allow_collision,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->hash_index_allow_collision),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::hash_index_allow_collision for RocksDB" , nullptr,
    nullptr, rocksdb_tbl_options->hash_index_allow_collision);

static MYSQL_SYSVAR_BOOL(
    no_block_cache,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->no_block_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::no_block_cache for RocksDB" , nullptr, nullptr,
    rocksdb_tbl_options->no_block_cache);

static MYSQL_SYSVAR_SIZE_T(block_size, rocksdb_tbl_options->block_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "BlockBasedTableOptions::block_size for RocksDB" ,
                           nullptr, nullptr, rocksdb_tbl_options->block_size,
                           /* min */ 1L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_size_deviation, rocksdb_tbl_options->block_size_deviation,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_size_deviation for RocksDB" , nullptr,
    nullptr, rocksdb_tbl_options->block_size_deviation,
    /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_restart_interval, rocksdb_tbl_options->block_restart_interval,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_restart_interval for RocksDB" , nullptr,
    nullptr, rocksdb_tbl_options->block_restart_interval,
    /* min */ 1, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    whole_key_filtering,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->whole_key_filtering),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::whole_key_filtering for RocksDB" , nullptr, nullptr,
    rocksdb_tbl_options->whole_key_filtering);

static MYSQL_SYSVAR_STR(default_cf_options, rocksdb_default_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "default cf options for RocksDB" , nullptr, nullptr, "" );

static MYSQL_SYSVAR_STR(override_cf_options, rocksdb_override_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "option overrides per cf for RocksDB" , nullptr, nullptr,
                        "" );

/* Dynamically updatable: validated by rocksdb_validate_update_cf_options,
   then applied by rocksdb_set_update_cf_options. */
static MYSQL_SYSVAR_STR(update_cf_options, rocksdb_update_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC
                        /* psergey-merge: need this? : PLUGIN_VAR_ALLOCATED*/,
                        "Option updates per column family for RocksDB" ,
                        rocksdb_validate_update_cf_options,
                        rocksdb_set_update_cf_options, nullptr);

static MYSQL_SYSVAR_UINT(flush_log_at_trx_commit,
                         rocksdb_flush_log_at_trx_commit, PLUGIN_VAR_RQCMDARG,
                         "Sync on transaction commit. Similar to "
                         "innodb_flush_log_at_trx_commit. 1: sync on commit, "
                         "0,2: not sync on commit" ,
                         rocksdb_validate_flush_log_at_trx_commit, nullptr,
                         /* default */ FLUSH_LOG_SYNC,
                         /* min */ FLUSH_LOG_NEVER,
                         /* max */ FLUSH_LOG_BACKGROUND, 0);

/* Session variables: per-statement WriteOptions and read behavior. */
static MYSQL_THDVAR_BOOL(write_disable_wal, PLUGIN_VAR_RQCMDARG,
                         "WriteOptions::disableWAL for RocksDB" , nullptr,
                         nullptr, rocksdb::WriteOptions().disableWAL);

static MYSQL_THDVAR_BOOL(
    write_ignore_missing_column_families, PLUGIN_VAR_RQCMDARG,
    "WriteOptions::ignore_missing_column_families for RocksDB" , nullptr,
    nullptr, rocksdb::WriteOptions().ignore_missing_column_families);

static MYSQL_THDVAR_BOOL(skip_fill_cache, PLUGIN_VAR_RQCMDARG,
                         "Skip filling block cache on read requests" , nullptr,
                         nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    unsafe_for_binlog, PLUGIN_VAR_RQCMDARG,
    "Allowing statement based binary logging which may break consistency" ,
    nullptr, nullptr, FALSE);

/* 0 (the default) means "do not override"; a positive value replaces the
   engine's records_in_range() estimate. */
static MYSQL_THDVAR_UINT(records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range(). "
                         "Set to a positive number to override" ,
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_THDVAR_UINT(force_index_records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range() "
                         "when FORCE INDEX is used." ,
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_UINT(
    debug_optimizer_n_rows, rocksdb_debug_optimizer_n_rows,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
    "Test only to override rocksdb estimates of table size in a memtable" ,
    nullptr, nullptr, 0, /* min */ 0, /* max */ INT_MAX, 0);
1285 | |
/* Optimizer-stats knobs, admin "action" variables (whose update callback
   performs the action, e.g. compaction or checkpoint), and TTL controls. */
static MYSQL_SYSVAR_BOOL(force_compute_memtable_stats,
                         rocksdb_force_compute_memtable_stats,
                         PLUGIN_VAR_RQCMDARG,
                         "Force to always compute memtable stats" ,
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_UINT(force_compute_memtable_stats_cachetime,
                         rocksdb_force_compute_memtable_stats_cachetime,
                         PLUGIN_VAR_RQCMDARG,
                         "Time in usecs to cache memtable estimates" , nullptr,
                         nullptr, /* default */ 60 * 1000 * 1000,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    debug_optimizer_no_zero_cardinality,
    rocksdb_debug_optimizer_no_zero_cardinality, PLUGIN_VAR_RQCMDARG,
    "In case if cardinality is zero, overrides it with some value" , nullptr,
    nullptr, TRUE);

/* Setting this variable triggers a manual compaction of the named column
   family (the check callback does the work; the update is a stub). */
static MYSQL_SYSVAR_STR(compact_cf, rocksdb_compact_cf_name,
                        PLUGIN_VAR_RQCMDARG, "Compact column family" ,
                        rocksdb_compact_column_family,
                        rocksdb_compact_column_family_stub, "" );

static MYSQL_SYSVAR_STR(create_checkpoint, rocksdb_checkpoint_name,
                        PLUGIN_VAR_RQCMDARG, "Checkpoint directory" ,
                        rocksdb_create_checkpoint,
                        rocksdb_create_checkpoint_stub, "" );

static MYSQL_SYSVAR_BOOL(signal_drop_index_thread,
                         rocksdb_signal_drop_index_thread, PLUGIN_VAR_RQCMDARG,
                         "Wake up drop index thread" , nullptr,
                         rocksdb_drop_index_wakeup_thread, FALSE);

static MYSQL_SYSVAR_BOOL(pause_background_work, rocksdb_pause_background_work,
                         PLUGIN_VAR_RQCMDARG,
                         "Disable all rocksdb background operations" , nullptr,
                         rocksdb_set_pause_background_work, FALSE);

static MYSQL_SYSVAR_BOOL(
    enable_ttl, rocksdb_enable_ttl, PLUGIN_VAR_RQCMDARG,
    "Enable expired TTL records to be dropped during compaction." , nullptr,
    nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(
    enable_ttl_read_filtering, rocksdb_enable_ttl_read_filtering,
    PLUGIN_VAR_RQCMDARG,
    "For tables with TTL, expired records are skipped/filtered out during "
    "processing and in query results. Disabling this will allow these records "
    "to be seen, but as a result rows may disappear in the middle of "
    "transactions as they are dropped during compaction. Use with caution." ,
    nullptr, nullptr, TRUE);

/* The three debug_ttl_* variables below shift TTL-related timestamps by up
   to +/- one hour for testing; all are no-ops in non-debug builds. */
static MYSQL_SYSVAR_INT(
    debug_ttl_rec_ts, rocksdb_debug_ttl_rec_ts, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Overrides the TTL of records to "
    "now() + debug_ttl_rec_ts. The value can be +/- to simulate "
    "a record inserted in the past vs a record inserted in the 'future'. "
    "A value of 0 denotes that the variable is not set. This variable is a "
    "no-op in non-debug builds." ,
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_INT(
    debug_ttl_snapshot_ts, rocksdb_debug_ttl_snapshot_ts, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Sets the snapshot during compaction to "
    "now() + debug_set_ttl_snapshot_ts. The value can be +/- to simulate "
    "a snapshot in the past vs a snapshot created in the 'future'. "
    "A value of 0 denotes that the variable is not set. This variable is a "
    "no-op in non-debug builds." ,
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_INT(
    debug_ttl_read_filter_ts, rocksdb_debug_ttl_read_filter_ts,
    PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Overrides the TTL read filtering time to "
    "time + debug_ttl_read_filter_ts. A value of 0 denotes that the variable "
    "is not set. This variable is a no-op in non-debug builds." ,
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_BOOL(
    debug_ttl_ignore_pk, rocksdb_debug_ttl_ignore_pk, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. If true, compaction filtering will not occur "
    "on PK TTL data. This variable is a no-op in non-debug builds." ,
    nullptr, nullptr, FALSE);

static MYSQL_SYSVAR_BOOL(
    reset_stats, rocksdb_reset_stats, PLUGIN_VAR_RQCMDARG,
    "Reset the RocksDB internal statistics without restarting the DB." , nullptr,
    rocksdb_set_reset_stats, FALSE);

static MYSQL_SYSVAR_UINT(io_write_timeout, rocksdb_io_write_timeout_secs,
                         PLUGIN_VAR_RQCMDARG,
                         "Timeout for experimental I/O watchdog." , nullptr,
                         rocksdb_set_io_write_timeout, /* default */ 0,
                         /* min */ 0L,
                         /* max */ UINT_MAX, 0);
1382 | |
/* Two-phase commit, collation enforcement, manual flush triggers, and the
   sequential-deletes compaction heuristics plus row-checksum debug knobs. */
static MYSQL_SYSVAR_BOOL(enable_2pc, rocksdb_enable_2pc, PLUGIN_VAR_RQCMDARG,
                         "Enable two phase commit for MyRocks" , nullptr,
                         nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(ignore_unknown_options, rocksdb_ignore_unknown_options,
                         PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                         "Enable ignoring unknown options passed to RocksDB" ,
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(strict_collation_check, rocksdb_strict_collation_check,
                         PLUGIN_VAR_RQCMDARG,
                         "Enforce case sensitive collation for MyRocks indexes" ,
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_STR(strict_collation_exceptions,
                        rocksdb_strict_collation_exceptions,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
                        "List of tables (using regex) that are excluded "
                        "from the case sensitive collation enforcement" ,
                        nullptr, rocksdb_set_collation_exception_list, "" );

static MYSQL_SYSVAR_BOOL(collect_sst_properties, rocksdb_collect_sst_properties,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Enables collecting SST file properties on each flush" ,
                         nullptr, nullptr, rocksdb_collect_sst_properties);

/* Action variables: the check callback performs the flush/compaction, the
   update callback is a stub. */
static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_now, rocksdb_force_flush_memtable_now_var,
    PLUGIN_VAR_RQCMDARG,
    "Forces memstore flush which may block all write requests so be careful" ,
    rocksdb_force_flush_memtable_now, rocksdb_force_flush_memtable_now_stub,
    FALSE);

static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_and_lzero_now,
    rocksdb_force_flush_memtable_and_lzero_now_var, PLUGIN_VAR_RQCMDARG,
    "Acts similar to force_flush_memtable_now, but also compacts all L0 files." ,
    rocksdb_force_flush_memtable_and_lzero_now,
    rocksdb_force_flush_memtable_and_lzero_now_stub, FALSE);

static MYSQL_SYSVAR_UINT(
    seconds_between_stat_computes, rocksdb_seconds_between_stat_computes,
    PLUGIN_VAR_RQCMDARG,
    "Sets a number of seconds to wait between optimizer stats recomputation. "
    "Only changed indexes will be refreshed." ,
    nullptr, nullptr, rocksdb_seconds_between_stat_computes,
    /* min */ 0L, /* max */ UINT_MAX, 0);

static MYSQL_SYSVAR_LONGLONG(compaction_sequential_deletes,
                             rocksdb_compaction_sequential_deletes,
                             PLUGIN_VAR_RQCMDARG,
                             "RocksDB will trigger compaction for the file if "
                             "it has more than this number sequential deletes "
                             "per window" ,
                             nullptr, rocksdb_set_compaction_options,
                             DEFAULT_COMPACTION_SEQUENTIAL_DELETES,
                             /* min */ 0L,
                             /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES, 0);

static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_window,
    rocksdb_compaction_sequential_deletes_window, PLUGIN_VAR_RQCMDARG,
    "Size of the window for counting rocksdb_compaction_sequential_deletes" ,
    nullptr, rocksdb_set_compaction_options,
    DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW,
    /* min */ 0L, /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW, 0);

static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_file_size,
    rocksdb_compaction_sequential_deletes_file_size, PLUGIN_VAR_RQCMDARG,
    "Minimum file size required for compaction_sequential_deletes" , nullptr,
    rocksdb_set_compaction_options, 0L,
    /* min */ -1L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    compaction_sequential_deletes_count_sd,
    rocksdb_compaction_sequential_deletes_count_sd, PLUGIN_VAR_RQCMDARG,
    "Counting SingleDelete as rocksdb_compaction_sequential_deletes" , nullptr,
    nullptr, rocksdb_compaction_sequential_deletes_count_sd);

static MYSQL_SYSVAR_BOOL(
    print_snapshot_conflict_queries, rocksdb_print_snapshot_conflict_queries,
    PLUGIN_VAR_RQCMDARG,
    "Logging queries that got snapshot conflict errors into *.err log" , nullptr,
    nullptr, rocksdb_print_snapshot_conflict_queries);

/* Session variables controlling per-row checksum writing/verification. */
static MYSQL_THDVAR_INT(checksums_pct, PLUGIN_VAR_RQCMDARG,
                        "How many percentages of rows to be checksummed" ,
                        nullptr, nullptr, RDB_MAX_CHECKSUMS_PCT,
                        /* min */ 0, /* max */ RDB_MAX_CHECKSUMS_PCT, 0);

static MYSQL_THDVAR_BOOL(store_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Include checksums when writing index/table records" ,
                         nullptr, nullptr, false /* default value */);

static MYSQL_THDVAR_BOOL(verify_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Verify checksums when reading index/table records" ,
                         nullptr, nullptr, false /* default value */);
1481 | |
1482 | static MYSQL_THDVAR_BOOL(master_skip_tx_api, PLUGIN_VAR_RQCMDARG, |
1483 | "Skipping holding any lock on row access. " |
1484 | "Not effective on slave." , |
1485 | nullptr, nullptr, false); |
1486 | |
1487 | static MYSQL_SYSVAR_UINT( |
1488 | validate_tables, rocksdb_validate_tables, |
1489 | PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, |
1490 | "Verify all .frm files match all RocksDB tables (0 means no verification, " |
1491 | "1 means verify and fail on error, and 2 means verify but continue" , |
1492 | nullptr, nullptr, 1 /* default value */, 0 /* min value */, |
1493 | 2 /* max value */, 0); |
1494 | |
1495 | static MYSQL_SYSVAR_STR(datadir, rocksdb_datadir, |
1496 | PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, |
1497 | "RocksDB data directory" , nullptr, nullptr, |
1498 | "./#rocksdb" ); |
1499 | |
1500 | static MYSQL_SYSVAR_STR(supported_compression_types, |
1501 | compression_types_val, |
1502 | PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY, |
1503 | "Compression algorithms supported by RocksDB" , |
1504 | nullptr, nullptr, |
1505 | compression_types_val); |
1506 | |
1507 | static MYSQL_SYSVAR_UINT( |
1508 | table_stats_sampling_pct, rocksdb_table_stats_sampling_pct, |
1509 | PLUGIN_VAR_RQCMDARG, |
1510 | "Percentage of entries to sample when collecting statistics about table " |
1511 | "properties. Specify either 0 to sample everything or percentage " |
1512 | "[" STRINGIFY_ARG(RDB_TBL_STATS_SAMPLE_PCT_MIN) ".." STRINGIFY_ARG( |
1513 | RDB_TBL_STATS_SAMPLE_PCT_MAX) "]. " |
1514 | "By default " STRINGIFY_ARG( |
1515 | RDB_DEFAULT_TBL_STATS_SAMPLE_PCT) "% " |
1516 | "of" |
1517 | " e" |
1518 | "nt" |
1519 | "ri" |
1520 | "es" |
1521 | " a" |
1522 | "re" |
1523 | " " |
1524 | "sa" |
1525 | "mp" |
1526 | "le" |
1527 | "d" |
1528 | "." , |
1529 | nullptr, rocksdb_set_table_stats_sampling_pct, /* default */ |
1530 | RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0, |
1531 | /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0); |
1532 | |
1533 | static MYSQL_SYSVAR_BOOL( |
1534 | large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG, |
1535 | "Support large index prefix length of 3072 bytes. If off, the maximum " |
1536 | "index prefix length is 767." , |
1537 | nullptr, nullptr, FALSE); |
1538 | |
1539 | static MYSQL_SYSVAR_BOOL( |
1540 | allow_to_start_after_corruption, rocksdb_allow_to_start_after_corruption, |
1541 | PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, |
1542 | "Allow server still to start successfully even if RocksDB corruption is " |
1543 | "detected." , |
1544 | nullptr, nullptr, FALSE); |
1545 | |
/* Assumed disk footprint (bytes) of one key/value pair — a rough constant;
   NOTE(review): consumers of this value are not visible in this chunk. */
static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100;

/*
  Registration table handed to the plugin framework.  Every MYSQL_SYSVAR /
  MYSQL_THDVAR defined in this file must be listed here to become visible
  to the server.  The array is nullptr-terminated.
*/
static struct st_mysql_sys_var *rocksdb_system_variables[] = {
    MYSQL_SYSVAR(lock_wait_timeout),
    MYSQL_SYSVAR(deadlock_detect),
    MYSQL_SYSVAR(deadlock_detect_depth),
    MYSQL_SYSVAR(max_row_locks),
    MYSQL_SYSVAR(write_batch_max_bytes),
    MYSQL_SYSVAR(lock_scanned_rows),
    MYSQL_SYSVAR(bulk_load),
    MYSQL_SYSVAR(bulk_load_allow_unsorted),
    MYSQL_SYSVAR(skip_unique_check_tables),
    MYSQL_SYSVAR(trace_sst_api),
    MYSQL_SYSVAR(commit_in_the_middle),
    MYSQL_SYSVAR(blind_delete_primary_key),
    MYSQL_SYSVAR(read_free_rpl_tables),
    MYSQL_SYSVAR(bulk_load_size),
    MYSQL_SYSVAR(merge_buf_size),
    MYSQL_SYSVAR(enable_bulk_load_api),
    MYSQL_SYSVAR(tmpdir),
    MYSQL_SYSVAR(merge_combine_read_size),
    MYSQL_SYSVAR(merge_tmp_file_removal_delay_ms),
    MYSQL_SYSVAR(skip_bloom_filter_on_read),

    /* RocksDB DBOptions-backed variables. */
    MYSQL_SYSVAR(create_if_missing),
    MYSQL_SYSVAR(two_write_queues),
    MYSQL_SYSVAR(manual_wal_flush),
    MYSQL_SYSVAR(create_missing_column_families),
    MYSQL_SYSVAR(error_if_exists),
    MYSQL_SYSVAR(paranoid_checks),
    MYSQL_SYSVAR(rate_limiter_bytes_per_sec),
    MYSQL_SYSVAR(sst_mgr_rate_bytes_per_sec),
    MYSQL_SYSVAR(delayed_write_rate),
    MYSQL_SYSVAR(max_latest_deadlocks),
    MYSQL_SYSVAR(info_log_level),
    MYSQL_SYSVAR(max_open_files),
    MYSQL_SYSVAR(max_total_wal_size),
    MYSQL_SYSVAR(use_fsync),
    MYSQL_SYSVAR(wal_dir),
    MYSQL_SYSVAR(persistent_cache_path),
    MYSQL_SYSVAR(persistent_cache_size_mb),
    MYSQL_SYSVAR(delete_obsolete_files_period_micros),
    MYSQL_SYSVAR(max_background_jobs),
    MYSQL_SYSVAR(max_log_file_size),
    MYSQL_SYSVAR(max_subcompactions),
    MYSQL_SYSVAR(log_file_time_to_roll),
    MYSQL_SYSVAR(keep_log_file_num),
    MYSQL_SYSVAR(max_manifest_file_size),
    MYSQL_SYSVAR(table_cache_numshardbits),
    MYSQL_SYSVAR(wal_ttl_seconds),
    MYSQL_SYSVAR(wal_size_limit_mb),
    MYSQL_SYSVAR(manifest_preallocation_size),
    MYSQL_SYSVAR(use_direct_reads),
    MYSQL_SYSVAR(use_direct_io_for_flush_and_compaction),
    MYSQL_SYSVAR(allow_mmap_reads),
    MYSQL_SYSVAR(allow_mmap_writes),
    MYSQL_SYSVAR(is_fd_close_on_exec),
    MYSQL_SYSVAR(stats_dump_period_sec),
    MYSQL_SYSVAR(advise_random_on_open),
    MYSQL_SYSVAR(db_write_buffer_size),
    MYSQL_SYSVAR(use_adaptive_mutex),
    MYSQL_SYSVAR(bytes_per_sync),
    MYSQL_SYSVAR(wal_bytes_per_sync),
    MYSQL_SYSVAR(enable_thread_tracking),
    MYSQL_SYSVAR(perf_context_level),
    MYSQL_SYSVAR(wal_recovery_mode),
    MYSQL_SYSVAR(access_hint_on_compaction_start),
    MYSQL_SYSVAR(new_table_reader_for_compaction_inputs),
    MYSQL_SYSVAR(compaction_readahead_size),
    MYSQL_SYSVAR(allow_concurrent_memtable_write),
    MYSQL_SYSVAR(enable_write_thread_adaptive_yield),

    /* Block-based table / block cache options. */
    MYSQL_SYSVAR(block_cache_size),
    MYSQL_SYSVAR(sim_cache_size),
    MYSQL_SYSVAR(use_clock_cache),
    MYSQL_SYSVAR(cache_index_and_filter_blocks),
    MYSQL_SYSVAR(pin_l0_filter_and_index_blocks_in_cache),
    MYSQL_SYSVAR(index_type),
    MYSQL_SYSVAR(hash_index_allow_collision),
    MYSQL_SYSVAR(no_block_cache),
    MYSQL_SYSVAR(block_size),
    MYSQL_SYSVAR(block_size_deviation),
    MYSQL_SYSVAR(block_restart_interval),
    MYSQL_SYSVAR(whole_key_filtering),

    /* Column-family option strings. */
    MYSQL_SYSVAR(default_cf_options),
    MYSQL_SYSVAR(override_cf_options),
    MYSQL_SYSVAR(update_cf_options),

    MYSQL_SYSVAR(flush_log_at_trx_commit),
    MYSQL_SYSVAR(write_disable_wal),
    MYSQL_SYSVAR(write_ignore_missing_column_families),

    MYSQL_SYSVAR(skip_fill_cache),
    MYSQL_SYSVAR(unsafe_for_binlog),

    /* Optimizer statistics knobs. */
    MYSQL_SYSVAR(records_in_range),
    MYSQL_SYSVAR(force_index_records_in_range),
    MYSQL_SYSVAR(debug_optimizer_n_rows),
    MYSQL_SYSVAR(force_compute_memtable_stats),
    MYSQL_SYSVAR(force_compute_memtable_stats_cachetime),
    MYSQL_SYSVAR(debug_optimizer_no_zero_cardinality),

    MYSQL_SYSVAR(compact_cf),
    MYSQL_SYSVAR(signal_drop_index_thread),
    MYSQL_SYSVAR(pause_background_work),
    MYSQL_SYSVAR(enable_2pc),
    MYSQL_SYSVAR(ignore_unknown_options),
    MYSQL_SYSVAR(strict_collation_check),
    MYSQL_SYSVAR(strict_collation_exceptions),
    MYSQL_SYSVAR(collect_sst_properties),
    MYSQL_SYSVAR(force_flush_memtable_now),
    MYSQL_SYSVAR(force_flush_memtable_and_lzero_now),
    MYSQL_SYSVAR(enable_ttl),
    MYSQL_SYSVAR(enable_ttl_read_filtering),
    MYSQL_SYSVAR(debug_ttl_rec_ts),
    MYSQL_SYSVAR(debug_ttl_snapshot_ts),
    MYSQL_SYSVAR(debug_ttl_read_filter_ts),
    MYSQL_SYSVAR(debug_ttl_ignore_pk),
    MYSQL_SYSVAR(reset_stats),
    MYSQL_SYSVAR(io_write_timeout),
    MYSQL_SYSVAR(seconds_between_stat_computes),

    MYSQL_SYSVAR(compaction_sequential_deletes),
    MYSQL_SYSVAR(compaction_sequential_deletes_window),
    MYSQL_SYSVAR(compaction_sequential_deletes_file_size),
    MYSQL_SYSVAR(compaction_sequential_deletes_count_sd),
    MYSQL_SYSVAR(print_snapshot_conflict_queries),

    MYSQL_SYSVAR(datadir),
    MYSQL_SYSVAR(supported_compression_types),
    MYSQL_SYSVAR(create_checkpoint),

    MYSQL_SYSVAR(checksums_pct),
    MYSQL_SYSVAR(store_row_debug_checksums),
    MYSQL_SYSVAR(verify_row_debug_checksums),
    MYSQL_SYSVAR(master_skip_tx_api),

    MYSQL_SYSVAR(validate_tables),
    MYSQL_SYSVAR(table_stats_sampling_pct),

    MYSQL_SYSVAR(large_prefix),
    MYSQL_SYSVAR(allow_to_start_after_corruption),
    MYSQL_SYSVAR(git_hash),
    nullptr};
1691 | |
1692 | static rocksdb::WriteOptions |
1693 | rdb_get_rocksdb_write_options(my_core::THD *const thd) { |
1694 | rocksdb::WriteOptions opt; |
1695 | |
1696 | opt.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC); |
1697 | opt.disableWAL = THDVAR(thd, write_disable_wal); |
1698 | opt.ignore_missing_column_families = |
1699 | THDVAR(thd, write_ignore_missing_column_families); |
1700 | |
1701 | return opt; |
1702 | } |
1703 | |
1704 | /////////////////////////////////////////////////////////////////////////////////////////// |
1705 | |
1706 | /** |
1707 | @brief |
1708 | Function we use in the creation of our hash to get key. |
1709 | */ |
1710 | |
1711 | uchar * |
1712 | Rdb_open_tables_map::get_hash_key(const Rdb_table_handler *const table_handler, |
1713 | size_t *const length, |
1714 | my_bool not_used MY_ATTRIBUTE((__unused__))) { |
1715 | *length = table_handler->m_table_name_length; |
1716 | return reinterpret_cast<uchar *>(table_handler->m_table_name); |
1717 | } |
1718 | |
1719 | /* |
1720 | Drop index thread's control |
1721 | */ |
1722 | |
1723 | static Rdb_drop_index_thread rdb_drop_idx_thread; |
1724 | |
1725 | static void rocksdb_drop_index_wakeup_thread( |
1726 | my_core::THD *const thd MY_ATTRIBUTE((__unused__)), |
1727 | struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)), |
1728 | void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) { |
1729 | if (*static_cast<const bool *>(save)) { |
1730 | rdb_drop_idx_thread.signal(); |
1731 | } |
1732 | } |
1733 | |
1734 | static inline uint32_t rocksdb_perf_context_level(THD *const thd) { |
1735 | DBUG_ASSERT(thd != nullptr); |
1736 | |
1737 | const int session_perf_context_level = THDVAR(thd, perf_context_level); |
1738 | if (session_perf_context_level > rocksdb::PerfLevel::kUninitialized) { |
1739 | return session_perf_context_level; |
1740 | } |
1741 | |
1742 | /* |
1743 | Fallback to global thdvar, if session specific one was not set to a valid |
1744 | value. |
1745 | */ |
1746 | |
1747 | const int global_perf_context_level = THDVAR(nullptr, perf_context_level); |
1748 | if (global_perf_context_level > rocksdb::PerfLevel::kUninitialized) { |
1749 | return global_perf_context_level; |
1750 | } |
1751 | |
1752 | return rocksdb::PerfLevel::kDisable; |
1753 | } |
1754 | |
1755 | /* |
1756 | Very short (functor-like) interface to be passed to |
1757 | Rdb_transaction::walk_tx_list() |
1758 | */ |
1759 | |
1760 | interface Rdb_tx_list_walker { |
1761 | virtual ~Rdb_tx_list_walker() {} |
1762 | virtual void process_tran(const Rdb_transaction *const) = 0; |
1763 | }; |
1764 | |
1765 | /* |
1766 | This is a helper class that is passed to RocksDB to get notifications when |
1767 | a snapshot gets created. |
1768 | */ |
1769 | |
1770 | class Rdb_snapshot_notifier : public rocksdb::TransactionNotifier { |
1771 | Rdb_transaction *m_owning_tx; |
1772 | |
1773 | void SnapshotCreated(const rocksdb::Snapshot *snapshot) override; |
1774 | |
1775 | public: |
1776 | Rdb_snapshot_notifier(const Rdb_snapshot_notifier &) = delete; |
1777 | Rdb_snapshot_notifier &operator=(const Rdb_snapshot_notifier &) = delete; |
1778 | |
1779 | explicit Rdb_snapshot_notifier(Rdb_transaction *const owning_tx) |
1780 | : m_owning_tx(owning_tx) {} |
1781 | |
1782 | // If the owning Rdb_transaction gets destructed we need to not reference |
1783 | // it anymore. |
1784 | void detach() { m_owning_tx = nullptr; } |
1785 | }; |
1786 | |
1787 | |
1788 | #ifdef MARIAROCKS_NOT_YET |
1789 | // ER_LOCK_WAIT_TIMEOUT error also has a reason in facebook/mysql-5.6 |
1790 | #endif |
1791 | String timeout_message(const char *command, const char *name1, |
1792 | const char *name2) |
1793 | { |
1794 | String msg; |
1795 | msg.append("Timeout on " ); |
1796 | msg.append(command); |
1797 | msg.append(": " ); |
1798 | msg.append(name1); |
1799 | if (name2 && name2[0]) |
1800 | { |
1801 | msg.append("." ); |
1802 | msg.append(name2); |
1803 | } |
1804 | return msg; |
1805 | } |
1806 | |
1807 | |
/* This is the base class for transactions when interacting with rocksdb.
*/
class Rdb_transaction {
 protected:
  /* Row-operation counters accumulated over the life of the transaction. */
  ulonglong m_write_count = 0;
  ulonglong m_insert_count = 0;
  ulonglong m_update_count = 0;
  ulonglong m_delete_count = 0;
  ulonglong m_lock_count = 0;
  /* Pending AUTO_INCREMENT maxima per index; merged into the data
     dictionary by merge_auto_incr_map(). */
  std::unordered_map<GL_INDEX_ID, ulonglong> m_auto_incr_map;

  bool m_is_delayed_snapshot = false;
  bool m_is_two_phase = false;

 private:
  /* Number of RockDB savepoints taken */
  int m_n_savepoints;
  /*
    Number of write operations this transaction had when we took the last
    savepoint (the idea is not to take another savepoint if we haven't made
    any changes)
  */
  ulonglong m_writes_at_last_savepoint;

 protected:
  THD *m_thd = nullptr;

  rocksdb::ReadOptions m_read_opts;

  /* Global registry of live transactions and its guarding mutex; used by
     walk_tx_list() and managed via init_mutex()/term_mutex(). */
  static std::multiset<Rdb_transaction *> s_tx_list;
  static mysql_mutex_t s_tx_list_mutex;

  /* Non-null while perf-context stats are being gathered for some table
     (see io_perf_start()/io_perf_end_and_record()). */
  Rdb_io_perf *m_tbl_io_perf;

  bool m_tx_read_only = false;

  int m_timeout_sec; /* Cached value of @@rocksdb_lock_wait_timeout */

  /* Maximum number of locks the transaction can have */
  ulonglong m_max_row_locks;

  bool m_is_tx_failed = false;
  bool m_rollback_only = false;

  std::shared_ptr<Rdb_snapshot_notifier> m_notifier;

  // This should be used only when updating binlog information.
  virtual rocksdb::WriteBatchBase *get_write_batch() = 0;
  virtual bool commit_no_binlog() = 0;
  virtual rocksdb::Iterator *
  get_iterator(const rocksdb::ReadOptions &options,
               rocksdb::ColumnFamilyHandle *column_family) = 0;

 protected:
  /*
    The following two are helper functions to be overloaded by child classes.
    They should provide RocksDB's savepoint semantics.
  */
  virtual void do_set_savepoint() = 0;
  virtual void do_rollback_to_savepoint() = 0;
1868 | |
1869 | /* |
1870 | @detail |
1871 | This function takes in the WriteBatch of the transaction to add |
1872 | all the AUTO_INCREMENT merges. It does so by iterating through |
1873 | m_auto_incr_map and then constructing key/value pairs to call merge upon. |
1874 | |
1875 | @param wb |
1876 | */ |
1877 | rocksdb::Status merge_auto_incr_map(rocksdb::WriteBatchBase *const wb) { |
1878 | DBUG_EXECUTE_IF("myrocks_autoinc_upgrade" , return rocksdb::Status::OK();); |
1879 | |
1880 | // Iterate through the merge map merging all keys into data dictionary. |
1881 | rocksdb::Status s; |
1882 | for (auto &it : m_auto_incr_map) { |
1883 | s = dict_manager.put_auto_incr_val(wb, it.first, it.second); |
1884 | if (!s.ok()) { |
1885 | return s; |
1886 | } |
1887 | } |
1888 | m_auto_incr_map.clear(); |
1889 | return s; |
1890 | } |
1891 | |
 public:
  /* Binlog position captured at commit time (used for backup/restore). */
  const char *m_mysql_log_file_name;
  my_off_t m_mysql_log_offset;
#ifdef MARIAROCKS_NOT_YET
  // TODO: MariaDB probably doesn't need these at all:
  const char *m_mysql_gtid;
  const char *m_mysql_max_gtid;
#endif
  /* Detailed text of the last lock timeout/deadlock, filled by
     set_status_error(). */
  String m_detailed_error;
  int64_t m_snapshot_timestamp = 0;
  bool m_ddl_transaction;

  /*
    Tracks the number of tables in use through external_lock.
    This should not be reset during start_tx().
  */
  int64_t m_n_mysql_tables_in_use = 0;

  /*
    MariaDB's group commit:
  */
  bool commit_ordered_done;
  bool commit_ordered_res;

  /*
    for distinction between rdb_transaction_impl and rdb_writebatch_impl
    when using walk tx list
  */
  virtual bool is_writebatch_trx() const = 0;
1921 | |
  /* Create the mutex that guards the global transaction list (s_tx_list). */
  static void init_mutex() {
    mysql_mutex_init(key_mutex_tx_list, &s_tx_list_mutex, MY_MUTEX_INIT_FAST);
  }

  /* Destroy the tx-list mutex; asserts that no transaction is still alive. */
  static void term_mutex() {
    DBUG_ASSERT(s_tx_list.size() == 0);
    mysql_mutex_destroy(&s_tx_list_mutex);
  }
1930 | |
1931 | static void walk_tx_list(Rdb_tx_list_walker *walker) { |
1932 | DBUG_ASSERT(walker != nullptr); |
1933 | |
1934 | RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex); |
1935 | |
1936 | for (auto it : s_tx_list) |
1937 | walker->process_tran(it); |
1938 | |
1939 | RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex); |
1940 | } |
1941 | |
  /*
    Translate a non-OK rocksdb::Status from a row operation into a MySQL
    handler error code, updating per-table and global lock counters and
    filling m_detailed_error for lock-related failures.
  */
  int set_status_error(THD *const thd, const rocksdb::Status &s,
                       const Rdb_key_def &kd, Rdb_tbl_def *const tbl_def,
                       Rdb_table_handler *const table_handler) {
    DBUG_ASSERT(!s.ok());
    DBUG_ASSERT(tbl_def != nullptr);

    if (s.IsTimedOut()) {
      /*
        SQL layer has weird expectations. If we return an error when
        doing a read in DELETE IGNORE, it will ignore the error ("because it's
        an IGNORE command!) but then will fail an assert, because "error code
        was returned, but no error happened".  Do what InnoDB's
        convert_error_code_to_mysql() does: force a statement
        rollback before returning HA_ERR_LOCK_WAIT_TIMEOUT:
      */
      my_core::thd_mark_transaction_to_rollback(thd, false /*just statement*/);
      m_detailed_error.copy(timeout_message(
          "index" , tbl_def->full_tablename().c_str(), kd.get_name().c_str()));
      table_handler->m_lock_wait_timeout_counter.inc();
      rocksdb_row_lock_wait_timeouts++;

      return HA_ERR_LOCK_WAIT_TIMEOUT;
    }

    if (s.IsDeadlock()) {
      my_core::thd_mark_transaction_to_rollback(thd,
                                                false /* just statement */);
      m_detailed_error = String();
      table_handler->m_deadlock_counter.inc();
      rocksdb_row_lock_deadlocks++;
      return HA_ERR_LOCK_DEADLOCK;
    } else if (s.IsBusy()) {
      /* Snapshot conflict: reported to the client as a deadlock so it
         retries the transaction. */
      rocksdb_snapshot_conflict_errors++;
      if (rocksdb_print_snapshot_conflict_queries) {
        char user_host_buff[MAX_USER_HOST_SIZE + 1];
        make_user_name(thd, user_host_buff);
        // NO_LINT_DEBUG
        sql_print_warning("Got snapshot conflict errors: User: %s "
                          "Query: %s" ,
                          user_host_buff, thd->query());
      }
      m_detailed_error = String(" (snapshot conflict)" , system_charset_info);
      table_handler->m_deadlock_counter.inc();
      return HA_ERR_LOCK_DEADLOCK;
    }

    if (s.IsIOError() || s.IsCorruption()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
    }

    /* Everything else goes through the generic status-to-mysql mapping. */
    return ha_rocksdb::rdb_error_to_mysql(s);
  }
1994 | |
  /* Session (THD) that owns this transaction. */
  THD *get_thd() const { return m_thd; }

  /* Used for tracking io_perf counters */
  void io_perf_start(Rdb_io_perf *const io_perf) {
    /*
      Since perf_context is tracked per thread, it is difficult and expensive
      to maintain perf_context on a per table basis. Therefore, roll all
      perf_context data into the first table used in a query. This works well
      for single table queries and is probably good enough for queries that hit
      multiple tables.

      perf_context stats gathering is started when the table lock is acquired
      or when ha_rocksdb::start_stmt is called in case of LOCK TABLES. They
      are recorded when the table lock is released, or when commit/rollback
      is called on the transaction, whichever comes first. Table lock release
      and commit/rollback can happen in different orders. In the case where
      the lock is released before commit/rollback is called, an extra step to
      gather stats during commit/rollback is needed.
    */
    if (m_tbl_io_perf == nullptr &&
        io_perf->start(rocksdb_perf_context_level(m_thd))) {
      m_tbl_io_perf = io_perf;
    }
  }

  /* Record the gathered stats and stop tracking, if tracking is active. */
  void io_perf_end_and_record(void) {
    if (m_tbl_io_perf != nullptr) {
      m_tbl_io_perf->end_and_record(rocksdb_perf_context_level(m_thd));
      m_tbl_io_perf = nullptr;
    }
  }

  /* Same as above, but only when 'io_perf' is the tracked counter set. */
  void io_perf_end_and_record(Rdb_io_perf *const io_perf) {
    if (m_tbl_io_perf == io_perf) {
      io_perf_end_and_record();
    }
  }

  /* Attribute written bytes to the tracked table's counters, if any. */
  void update_bytes_written(ulonglong bytes_written) {
    if (m_tbl_io_perf != nullptr) {
      m_tbl_io_perf->update_bytes_written(rocksdb_perf_context_level(m_thd),
                                          bytes_written);
    }
  }

  /* Cache the timeout/max-locks settings and push the lock timeout down to
     the underlying RocksDB transaction. */
  void set_params(int timeout_sec_arg, int max_row_locks_arg) {
    m_timeout_sec = timeout_sec_arg;
    m_max_row_locks = max_row_locks_arg;
    set_lock_timeout(timeout_sec_arg);
  }

  virtual void set_lock_timeout(int timeout_sec_arg) = 0;
2047 | |
  /* Accessors for the per-transaction row-operation counters. */
  ulonglong get_write_count() const { return m_write_count; }

  ulonglong get_insert_count() const { return m_insert_count; }

  ulonglong get_update_count() const { return m_update_count; }

  ulonglong get_delete_count() const { return m_delete_count; }

  void incr_insert_count() { ++m_insert_count; }

  void incr_update_count() { ++m_update_count; }

  void incr_delete_count() { ++m_delete_count; }

  int get_timeout_sec() const { return m_timeout_sec; }

  ulonglong get_lock_count() const { return m_lock_count; }

  virtual void set_sync(bool sync) = 0;

  virtual void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                            const std::string &rowkey) = 0;

  virtual bool prepare(const rocksdb::TransactionName &name) = 0;
2072 | |
2073 | bool commit_or_rollback() { |
2074 | bool res; |
2075 | if (m_is_tx_failed) { |
2076 | rollback(); |
2077 | res = false; |
2078 | } else |
2079 | res = commit(); |
2080 | return res; |
2081 | } |
2082 | |
  /*
    Commit the transaction; returns true on error.  Empty transactions are
    rolled back instead, and rollback-only transactions are rolled back with
    an error raised.
  */
  bool commit() {
    if (get_write_count() == 0) {
      // Nothing was written; a rollback is equivalent and cheaper.
      rollback();
      return false;
    } else if (m_rollback_only) {
      /*
        Transactions marked as rollback_only are expected to be rolled back at
        prepare(). But there are some exceptions like below that prepare() is
        never called and commit() is called instead.
        1. Binlog is disabled
        2. No modification exists in binlog cache for the transaction (#195)
        In both cases, rolling back transaction is safe. Nothing is written to
        binlog.
      */
      my_error(ER_ROLLBACK_ONLY, MYF(0));
      rollback();
      return true;
    } else {
#ifdef MARIAROCKS_NOT_YET
      /*
        Storing binlog position inside MyRocks is needed only for restoring
        MyRocks from backups. This feature is not supported yet.
      */
      mysql_bin_log_commit_pos(m_thd, &m_mysql_log_offset,
                               &m_mysql_log_file_name);
      binlog_manager.update(m_mysql_log_file_name, m_mysql_log_offset,
                            get_write_batch());
#endif
      return commit_no_binlog();
    }
  }
2114 | |
  virtual void rollback() = 0;

  /* Called (via Rdb_snapshot_notifier) when RocksDB creates a snapshot for
     this transaction; records the snapshot and its creation time. */
  void snapshot_created(const rocksdb::Snapshot *const snapshot) {
    DBUG_ASSERT(snapshot != nullptr);

    m_read_opts.snapshot = snapshot;
    rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
    m_is_delayed_snapshot = false;
  }

  virtual void acquire_snapshot(bool acquire_now) = 0;
  virtual void release_snapshot() = 0;

  bool has_snapshot() const { return m_read_opts.snapshot != nullptr; }

 private:
  // The Rdb_sst_info structures we are currently loading. In a partitioned
  // table this can have more than one entry
  std::vector<std::shared_ptr<Rdb_sst_info>> m_curr_bulk_load;
  std::string m_curr_bulk_load_tablename;

  /* External merge sorts for bulk load: key ID -> merge sort instance */
  std::unordered_map<GL_INDEX_ID, Rdb_index_merge> m_key_merge;
2138 | |
2139 | public: |
2140 | int get_key_merge(GL_INDEX_ID kd_gl_id, rocksdb::ColumnFamilyHandle *cf, |
2141 | Rdb_index_merge **key_merge) { |
2142 | int res; |
2143 | auto it = m_key_merge.find(kd_gl_id); |
2144 | if (it == m_key_merge.end()) { |
2145 | m_key_merge.emplace( |
2146 | std::piecewise_construct, std::make_tuple(kd_gl_id), |
2147 | std::make_tuple( |
2148 | get_rocksdb_tmpdir(), THDVAR(get_thd(), merge_buf_size), |
2149 | THDVAR(get_thd(), merge_combine_read_size), |
2150 | THDVAR(get_thd(), merge_tmp_file_removal_delay_ms), cf)); |
2151 | it = m_key_merge.find(kd_gl_id); |
2152 | if ((res = it->second.init()) != 0) { |
2153 | return res; |
2154 | } |
2155 | } |
2156 | *key_merge = &it->second; |
2157 | return HA_EXIT_SUCCESS; |
2158 | } |
2159 | |
  /*
    Finalize all outstanding bulk-load work: commit every open Rdb_sst_info,
    then drain the external index-merge sort buffers into fresh SST files.
    The first error code seen is the one returned.
  */
  int finish_bulk_load(int print_client_error = true) {
    int rc = 0, rc2;

    /* Commit every open SST writer; keep going on error so all get closed. */
    std::vector<std::shared_ptr<Rdb_sst_info>>::iterator it;
    for (it = m_curr_bulk_load.begin(); it != m_curr_bulk_load.end(); it++) {
      rc2 = (*it)->commit(print_client_error);
      if (rc2 != 0 && rc == 0) {
        rc = rc2;
      }
    }
    m_curr_bulk_load.clear();
    m_curr_bulk_load_tablename.clear();
    DBUG_ASSERT(m_curr_bulk_load.size() == 0);

    // Flush the index_merge sort buffers
    if (!m_key_merge.empty()) {
      rocksdb::Slice merge_key;
      rocksdb::Slice merge_val;
      for (auto it = m_key_merge.begin(); it != m_key_merge.end(); it++) {
        GL_INDEX_ID index_id = it->first;
        std::shared_ptr<const Rdb_key_def> keydef =
            ddl_manager.safe_find(index_id);
        std::string table_name = ddl_manager.safe_get_table_name(index_id);

        // Unable to find key definition or table name since the
        // table could have been dropped.
        // TODO(herman): there is a race here between dropping the table
        // and detecting a drop here. If the table is dropped while bulk
        // loading is finishing, these keys being added here may
        // be missed by the compaction filter and not be marked for
        // removal. It is unclear how to lock the sql table from the storage
        // engine to prevent modifications to it while bulk load is occurring.
        if (keydef == nullptr || table_name.empty()) {
          rc2 = HA_ERR_ROCKSDB_BULK_LOAD;
          break;
        }
        const std::string &index_name = keydef->get_name();
        Rdb_index_merge &rdb_merge = it->second;

        // Rdb_sst_info expects a denormalized table name in the form of
        // "./database/table"
        std::replace(table_name.begin(), table_name.end(), '.', '/');
        table_name = "./" + table_name;
        Rdb_sst_info sst_info(rdb, table_name, index_name, rdb_merge.get_cf(),
                              *rocksdb_db_options,
                              THDVAR(get_thd(), trace_sst_api));

        /* Stream the sorted (key, value) pairs into the SST writer. */
        while ((rc2 = rdb_merge.next(&merge_key, &merge_val)) == 0) {
          if ((rc2 = sst_info.put(merge_key, merge_val)) != 0) {
            break;
          }
        }

        // rc2 == -1 => finished ok; rc2 > 0 => error
        if (rc2 > 0 || (rc2 = sst_info.commit(print_client_error)) != 0) {
          if (rc == 0) {
            rc = rc2;
          }
          break;
        }
      }
      m_key_merge.clear();

      /*
        Explicitly tell jemalloc to clean up any unused dirty pages at this
        point.
        See https://reviews.facebook.net/D63723 for more details.
      */
      purge_all_jemalloc_arenas();
    }
    return rc;
  }
2232 | |
2233 | int start_bulk_load(ha_rocksdb *const bulk_load, |
2234 | std::shared_ptr<Rdb_sst_info> sst_info) { |
2235 | /* |
2236 | If we already have an open bulk load of a table and the name doesn't |
2237 | match the current one, close out the currently running one. This allows |
2238 | multiple bulk loads to occur on a partitioned table, but then closes |
2239 | them all out when we switch to another table. |
2240 | */ |
2241 | DBUG_ASSERT(bulk_load != nullptr); |
2242 | |
2243 | if (!m_curr_bulk_load.empty() && |
2244 | bulk_load->get_table_basename() != m_curr_bulk_load_tablename) { |
2245 | const auto res = finish_bulk_load(); |
2246 | if (res != HA_EXIT_SUCCESS) { |
2247 | m_curr_bulk_load.clear(); |
2248 | m_curr_bulk_load_tablename.clear(); |
2249 | return res; |
2250 | } |
2251 | } |
2252 | |
2253 | /* |
2254 | This used to track ha_rocksdb handler objects, but those can be |
2255 | freed by the table cache while this was referencing them. Instead |
2256 | of tracking ha_rocksdb handler objects, this now tracks the |
2257 | Rdb_sst_info allocated, and both the ha_rocksdb handler and the |
2258 | Rdb_transaction both have shared pointers to them. |
2259 | |
2260 | On transaction complete, it will commit each Rdb_sst_info structure found. |
2261 | If the ha_rocksdb object is freed, etc., it will also commit |
2262 | the Rdb_sst_info. The Rdb_sst_info commit path needs to be idempotent. |
2263 | */ |
2264 | m_curr_bulk_load.push_back(sst_info); |
2265 | m_curr_bulk_load_tablename = bulk_load->get_table_basename(); |
2266 | return HA_EXIT_SUCCESS; |
2267 | } |
2268 | |
2269 | int num_ongoing_bulk_load() const { return m_curr_bulk_load.size(); } |
2270 | |
2271 | const char *get_rocksdb_tmpdir() const { |
2272 | const char *tmp_dir = THDVAR(get_thd(), tmpdir); |
2273 | |
2274 | /* |
2275 | We want to treat an empty string as nullptr, in these cases DDL operations |
2276 | will use the default --tmpdir passed to mysql instead. |
2277 | */ |
2278 | if (tmp_dir != nullptr && *tmp_dir == '\0') { |
2279 | tmp_dir = nullptr; |
2280 | } |
2281 | return (tmp_dir); |
2282 | } |
2283 | |
2284 | /* |
2285 | Flush the data accumulated so far. This assumes we're doing a bulk insert. |
2286 | |
2287 | @detail |
2288 | This should work like transaction commit, except that we don't |
2289 | synchronize with the binlog (there is no API that would allow to have |
2290 | binlog flush the changes accumulated so far and return its current |
2291 | position) |
2292 | |
2293 | @todo |
2294 | Add test coverage for what happens when somebody attempts to do bulk |
2295 | inserts while inside a multi-statement transaction. |
2296 | */ |
2297 | bool flush_batch() { |
2298 | if (get_write_count() == 0) |
2299 | return false; |
2300 | |
2301 | /* Commit the current transaction */ |
2302 | if (commit_no_binlog()) |
2303 | return true; |
2304 | |
2305 | /* Start another one */ |
2306 | start_tx(); |
2307 | return false; |
2308 | } |
2309 | |
2310 | void set_auto_incr(const GL_INDEX_ID &gl_index_id, ulonglong curr_id) { |
2311 | m_auto_incr_map[gl_index_id] = |
2312 | std::max(m_auto_incr_map[gl_index_id], curr_id); |
2313 | } |
2314 | |
2315 | #ifndef NDEBUG |
2316 | ulonglong get_auto_incr(const GL_INDEX_ID &gl_index_id) { |
2317 | if (m_auto_incr_map.count(gl_index_id) > 0) { |
2318 | return m_auto_incr_map[gl_index_id]; |
2319 | } |
2320 | return 0; |
2321 | } |
2322 | #endif |
2323 | |
/* Write a key/value pair through the transaction. Implementations count
   the write against max_row_locks. */
virtual rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                            const rocksdb::Slice &key,
                            const rocksdb::Slice &value) = 0;
/* Delete a key through the transaction. */
virtual rocksdb::Status
delete_key(rocksdb::ColumnFamilyHandle *const column_family,
           const rocksdb::Slice &key) = 0;
/* SingleDelete variant - see RocksDB docs for its preconditions
   (key written at most once since the last delete). */
virtual rocksdb::Status
single_delete(rocksdb::ColumnFamilyHandle *const column_family,
              const rocksdb::Slice &key) = 0;

/* True if the transaction has any pending writes in its write batch. */
virtual bool has_modifications() const = 0;

/* WriteBatchWithIndex: writes skip locking but ARE visible to reads
   made through this transaction. */
virtual rocksdb::WriteBatchBase *get_indexed_write_batch() = 0;
2337 | /* |
2338 | Return a WriteBatch that one can write to. The writes will skip any |
2339 | transaction locking. The writes will NOT be visible to the transaction. |
2340 | */ |
2341 | rocksdb::WriteBatchBase *get_blind_write_batch() { |
2342 | return get_indexed_write_batch()->GetWriteBatch(); |
2343 | } |
2344 | |
/* Point lookup through the transaction (sees its own pending writes). */
virtual rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                            const rocksdb::Slice &key,
                            rocksdb::PinnableSlice *const value) = 0;
/* Locking read; `exclusive` selects write vs read lock where supported. */
virtual rocksdb::Status
get_for_update(rocksdb::ColumnFamilyHandle *const column_family,
               const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
               bool exclusive) = 0;
2352 | |
/*
  Build a ReadOptions for this transaction and open an iterator.

  @param skip_bloom_filter  when true, use total-order seek constrained by
                            the given equal-condition bounds; when false,
                            rely on the prefix bloom filter
  @param read_current       read the latest committed data (no snapshot)
  @param create_snapshot    acquire a snapshot before iterating

  NOTE(review): ReadOptions stores *pointers* to eq_cond_lower_bound /
  eq_cond_upper_bound; the caller must keep those slices alive for the
  whole lifetime of the returned iterator - confirm all call sites do.
*/
rocksdb::Iterator *
get_iterator(rocksdb::ColumnFamilyHandle *const column_family,
             bool skip_bloom_filter, bool fill_cache,
             const rocksdb::Slice &eq_cond_lower_bound,
             const rocksdb::Slice &eq_cond_upper_bound,
             bool read_current = false, bool create_snapshot = true) {
  // Make sure we are not doing both read_current (which implies we don't
  // want a snapshot) and create_snapshot which makes sure we create
  // a snapshot
  DBUG_ASSERT(column_family != nullptr);
  DBUG_ASSERT(!read_current || !create_snapshot);

  if (create_snapshot)
    acquire_snapshot(true);

  rocksdb::ReadOptions options = m_read_opts;

  if (skip_bloom_filter) {
    options.total_order_seek = true;
    options.iterate_lower_bound = &eq_cond_lower_bound;
    options.iterate_upper_bound = &eq_cond_upper_bound;
  } else {
    // With this option, Iterator::Valid() returns false if key
    // is outside of the prefix bloom filter range set at Seek().
    // Must not be set to true if not using bloom filter.
    options.prefix_same_as_start = true;
  }
  options.fill_cache = fill_cache;
  if (read_current) {
    // Bypass the snapshot: read the most recent committed state.
    options.snapshot = nullptr;
  }
  return get_iterator(options, column_family);
}
2386 | |
/* True when an underlying RocksDB transaction/batch is active. */
virtual bool is_tx_started() const = 0;
/* Begin a new underlying transaction (resets read options, savepoints). */
virtual void start_tx() = 0;
/* Hook called at the start of each statement within the transaction. */
virtual void start_stmt() = 0;
2390 | |
2391 | void set_initial_savepoint() { |
2392 | /* |
2393 | Set the initial savepoint. If the first statement in the transaction |
2394 | fails, we need something to roll back to, without rolling back the |
2395 | entire transaction. |
2396 | */ |
2397 | do_set_savepoint(); |
2398 | m_n_savepoints= 1; |
2399 | m_writes_at_last_savepoint= m_write_count; |
2400 | } |
2401 | |
2402 | /* |
2403 | Called when a "top-level" statement inside a transaction completes |
2404 | successfully and its changes become part of the transaction's changes. |
2405 | */ |
2406 | void make_stmt_savepoint_permanent() { |
2407 | |
2408 | // Take another RocksDB savepoint only if we had changes since the last |
2409 | // one. This is very important for long transactions doing lots of |
2410 | // SELECTs. |
2411 | if (m_writes_at_last_savepoint != m_write_count) |
2412 | { |
2413 | do_set_savepoint(); |
2414 | m_writes_at_last_savepoint= m_write_count; |
2415 | m_n_savepoints++; |
2416 | } |
2417 | } |
2418 | |
2419 | |
2420 | /* |
2421 | Rollback to the savepoint we've set before the last statement |
2422 | */ |
2423 | void rollback_to_stmt_savepoint() { |
2424 | if (m_writes_at_last_savepoint != m_write_count) { |
2425 | do_rollback_to_savepoint(); |
2426 | if (!--m_n_savepoints) { |
2427 | do_set_savepoint(); |
2428 | m_n_savepoints= 1; |
2429 | } |
2430 | m_writes_at_last_savepoint= m_write_count; |
2431 | } |
2432 | } |
2433 | |
/* Roll back the last statement only; the transaction itself continues. */
virtual void rollback_stmt() = 0;

/* Record whether the transaction has hit a failure. */
void set_tx_failed(bool failed_arg) { m_is_tx_failed = failed_arg; }
2437 | |
2438 | bool can_prepare() const { |
2439 | if (m_rollback_only) { |
2440 | my_error(ER_ROLLBACK_ONLY, MYF(0)); |
2441 | return false; |
2442 | } |
2443 | return true; |
2444 | } |
2445 | |
2446 | int rollback_to_savepoint(void *const savepoint) { |
2447 | if (has_modifications()) { |
2448 | my_error(ER_ROLLBACK_TO_SAVEPOINT, MYF(0)); |
2449 | m_rollback_only = true; |
2450 | return HA_EXIT_FAILURE; |
2451 | } |
2452 | return HA_EXIT_SUCCESS; |
2453 | } |
2454 | |
2455 | /* |
2456 | This is used by transactions started with "START TRANSACTION WITH " |
2457 | "CONSISTENT [ROCKSDB] SNAPSHOT". When tx_read_only is turned on, |
2458 | snapshot has to be created via DB::GetSnapshot(), not via Transaction |
2459 | API. |
2460 | */ |
2461 | bool is_tx_read_only() const { return m_tx_read_only; } |
2462 | |
2463 | bool is_two_phase() const { return m_is_two_phase; } |
2464 | |
2465 | void set_tx_read_only(bool val) { m_tx_read_only = val; } |
2466 | |
/* Register this transaction in the global s_tx_list under its mutex. */
explicit Rdb_transaction(THD *const thd)
    : m_thd(thd), m_tbl_io_perf(nullptr) {
  RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
  s_tx_list.insert(this);
  RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
}

/* Unregister from the global transaction list. */
virtual ~Rdb_transaction() {
  RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
  s_tx_list.erase(this);
  RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
}
2479 | }; |
2480 | |
2481 | /* |
2482 | This is a rocksdb transaction. Its members represent the current transaction, |
2483 | which consists of: |
2484 | - the snapshot |
2485 | - the changes we've made but are not seeing yet. |
2486 | |
2487 | The changes are made to individual tables, which store them here and then |
2488 | this object commits them on commit. |
2489 | */ |
class Rdb_transaction_impl : public Rdb_transaction {
  /* The active RocksDB transaction; nullptr when none is started. */
  rocksdb::Transaction *m_rocksdb_tx = nullptr;
  /* A finished transaction object preserved by release_tx() so that
     start_tx() can hand it back to BeginTransaction() for reuse instead
     of allocating a new one. */
  rocksdb::Transaction *m_rocksdb_reuse_tx = nullptr;

public:
  void set_lock_timeout(int timeout_sec_arg) override {
    if (m_rocksdb_tx)
      // NOTE(review): this applies m_timeout_sec rather than
      // timeout_sec_arg -- confirm the argument is intentionally unused
      // (m_timeout_sec is presumably set before this is called).
      m_rocksdb_tx->SetLockTimeout(rdb_convert_sec_to_ms(m_timeout_sec));
  }

  /* Toggle WAL sync-on-commit for this transaction's write options. */
  void set_sync(bool sync) override {
    if (m_rocksdb_tx)
      m_rocksdb_tx->GetWriteOptions()->sync = sync;
  }

  /* Release the row lock taken by a GetForUpdate on `rowkey`, unless the
     session asked to keep locks on scanned rows. */
  void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                    const std::string &rowkey) override {
    if (!THDVAR(m_thd, lock_scanned_rows)) {
      m_rocksdb_tx->UndoGetForUpdate(column_family, rocksdb::Slice(rowkey));
    }
  }

  virtual bool is_writebatch_trx() const override { return false; }

private:
  void release_tx(void) {
    // We are done with the current active transaction object. Preserve it
    // for later reuse.
    DBUG_ASSERT(m_rocksdb_reuse_tx == nullptr);
    m_rocksdb_reuse_tx = m_rocksdb_tx;
    m_rocksdb_tx = nullptr;
  }

  /*
    XA prepare: name the transaction, merge the tracked auto-increment
    values into its write batch, then Prepare(). Any failure is reported
    through rdb_handle_io_error() and returns false.
  */
  bool prepare(const rocksdb::TransactionName &name) override {
    rocksdb::Status s;
    s = m_rocksdb_tx->SetName(name);
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }

    s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }

    s = m_rocksdb_tx->Prepare();
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }
    return true;
  }

  /*
    Commit without binlog coordination. Returns true on error. Note that
    the per-transaction counters are reset and the transaction object is
    released for reuse on BOTH the success and the error path.
  */
  bool commit_no_binlog() override {
    bool res = false;
    rocksdb::Status s;

    s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    release_snapshot();
    s = m_rocksdb_tx->Commit();
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

  error:
    /* Save the transaction object to be reused */
    release_tx();

    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    set_tx_read_only(false);
    m_rollback_only = false;
    return res;
  }

public:
  void rollback() override {
    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    m_auto_incr_map.clear();
    m_ddl_transaction = false;
    if (m_rocksdb_tx) {
      release_snapshot();
      /* This will also release all of the locks: */
      m_rocksdb_tx->Rollback();

      /* Save the transaction object to be reused */
      release_tx();

      set_tx_read_only(false);
      m_rollback_only = false;
    }
  }

  /*
    Acquire a snapshot if there is none yet. Read-only transactions take
    a DB-level snapshot; otherwise either take one now (acquire_now) or
    arm delayed acquisition on the next operation via m_notifier.
  */
  void acquire_snapshot(bool acquire_now) override {
    if (m_read_opts.snapshot == nullptr) {
      if (is_tx_read_only()) {
        snapshot_created(rdb->GetSnapshot());
      } else if (acquire_now) {
        m_rocksdb_tx->SetSnapshot();
        snapshot_created(m_rocksdb_tx->GetSnapshot());
      } else if (!m_is_delayed_snapshot) {
        m_rocksdb_tx->SetSnapshotOnNextOperation(m_notifier);
        m_is_delayed_snapshot = true;
      }
    }
  }

  /*
    Release the current snapshot, if any. Read-only snapshots go back to
    the DB; transaction-owned ones are cleared on the transaction object.
  */
  void release_snapshot() override {
    bool need_clear = m_is_delayed_snapshot;

    if (m_read_opts.snapshot != nullptr) {
      m_snapshot_timestamp = 0;
      if (is_tx_read_only()) {
        rdb->ReleaseSnapshot(m_read_opts.snapshot);
        need_clear = false;
      } else {
        need_clear = true;
      }
      m_read_opts.snapshot = nullptr;
    }

    if (need_clear && m_rocksdb_tx != nullptr)
      m_rocksdb_tx->ClearSnapshot();
  }

  bool has_snapshot() { return m_read_opts.snapshot != nullptr; }

  /* Locking write; fails with kLockLimit once max_row_locks is exceeded. */
  rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      const rocksdb::Slice &value) override {
    ++m_write_count;
    ++m_lock_count;
    if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    return m_rocksdb_tx->Put(column_family, key, value);
  }

  /* Locking delete; same max_row_locks enforcement as put(). */
  rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
                             const rocksdb::Slice &key) override {
    ++m_write_count;
    ++m_lock_count;
    if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    return m_rocksdb_tx->Delete(column_family, key);
  }

  /* Locking SingleDelete; same max_row_locks enforcement as put(). */
  rocksdb::Status
  single_delete(rocksdb::ColumnFamilyHandle *const column_family,
                const rocksdb::Slice &key) override {
    ++m_write_count;
    ++m_lock_count;
    if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    return m_rocksdb_tx->SingleDelete(column_family, key);
  }

  bool has_modifications() const override {
    return m_rocksdb_tx->GetWriteBatch() &&
           m_rocksdb_tx->GetWriteBatch()->GetWriteBatch() &&
           m_rocksdb_tx->GetWriteBatch()->GetWriteBatch()->Count() > 0;
  }

  /* Under 2PC, writes placed here go into the commit-time batch instead
     of the regular transaction write batch. */
  rocksdb::WriteBatchBase *get_write_batch() override {
    if (is_two_phase()) {
      return m_rocksdb_tx->GetCommitTimeWriteBatch();
    }
    return m_rocksdb_tx->GetWriteBatch()->GetWriteBatch();
  }

  /*
    Return a WriteBatch that one can write to. The writes will skip any
    transaction locking. The writes WILL be visible to the transaction.
  */
  rocksdb::WriteBatchBase *get_indexed_write_batch() override {
    ++m_write_count;
    return m_rocksdb_tx->GetWriteBatch();
  }

  rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      rocksdb::PinnableSlice *const value) const override {
    // clean PinnableSlice right before Get() for multiple gets per statement
    // the resources after the last Get in a statement are cleared in
    // handler::reset call
    value->Reset();
    global_stats.queries[QUERIES_POINT].inc();
    return m_rocksdb_tx->Get(m_read_opts, column_family, key, value);
  }

  /* Locking read; counts against max_row_locks. `value` may be nullptr
     when only the lock (not the row image) is needed. */
  rocksdb::Status
  get_for_update(rocksdb::ColumnFamilyHandle *const column_family,
                 const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
                 bool exclusive) override {
    if (++m_lock_count > m_max_row_locks)
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);

    if (value != nullptr) {
      value->Reset();
    }
    return m_rocksdb_tx->GetForUpdate(m_read_opts, column_family, key, value,
                                      exclusive);
  }

  rocksdb::Iterator *
  get_iterator(const rocksdb::ReadOptions &options,
               rocksdb::ColumnFamilyHandle *const column_family) override {
    global_stats.queries[QUERIES_RANGE].inc();
    return m_rocksdb_tx->GetIterator(options, column_family);
  }

  const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; }

  bool is_tx_started() const override { return (m_rocksdb_tx != nullptr); }

  /*
    Begin a new RocksDB pessimistic transaction, configured from the
    session variables. Reuses the transaction object preserved by
    release_tx() when available.
  */
  void start_tx() override {
    rocksdb::TransactionOptions tx_opts;
    rocksdb::WriteOptions write_opts;
    tx_opts.set_snapshot = false;
    tx_opts.lock_timeout = rdb_convert_sec_to_ms(m_timeout_sec);
    tx_opts.deadlock_detect = THDVAR(m_thd, deadlock_detect);
    tx_opts.deadlock_detect_depth = THDVAR(m_thd, deadlock_detect_depth);
    tx_opts.max_write_batch_size = THDVAR(m_thd, write_batch_max_bytes);

    write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
    write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
    write_opts.ignore_missing_column_families =
        THDVAR(m_thd, write_ignore_missing_column_families);
    m_is_two_phase = rocksdb_enable_2pc;

    commit_ordered_done= false;

    /*
      If m_rocksdb_reuse_tx is null this will create a new transaction object.
      Otherwise it will reuse the existing one.
    */
    m_rocksdb_tx =
        rdb->BeginTransaction(write_opts, tx_opts, m_rocksdb_reuse_tx);
    m_rocksdb_reuse_tx = nullptr;

    m_read_opts = rocksdb::ReadOptions();

    set_initial_savepoint();

    m_ddl_transaction = false;
  }

  /* Implementations of do_*savepoint based on rocksdb::Transaction savepoints */
  void do_set_savepoint() override {
    m_rocksdb_tx->SetSavePoint();
  }

  void do_rollback_to_savepoint() override {
    m_rocksdb_tx->RollbackToSavePoint();
  }

  /*
    Start a statement inside a multi-statement transaction.

    @todo: are we sure this is called once (and not several times) per
    statement start?

    For hooking to start of statement that is its own transaction, see
    ha_rocksdb::external_lock().
  */
  void start_stmt() override {
    // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
    acquire_snapshot(false);
  }

  /*
    This must be called when last statement is rolled back, but the transaction
    continues
  */
  void rollback_stmt() override {
    /* TODO: here we must release the locks taken since the start_stmt() call */
    if (m_rocksdb_tx) {
      const rocksdb::Snapshot *const org_snapshot = m_rocksdb_tx->GetSnapshot();
      rollback_to_stmt_savepoint();

      // Rolling back the savepoint may have changed which snapshot the
      // transaction holds; resync m_read_opts and the timestamp.
      const rocksdb::Snapshot *const cur_snapshot = m_rocksdb_tx->GetSnapshot();
      if (org_snapshot != cur_snapshot) {
        if (org_snapshot != nullptr)
          m_snapshot_timestamp = 0;

        m_read_opts.snapshot = cur_snapshot;
        if (cur_snapshot != nullptr)
          rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
        else
          m_is_delayed_snapshot = true;
      }
    }
  }

  explicit Rdb_transaction_impl(THD *const thd)
      : Rdb_transaction(thd), m_rocksdb_tx(nullptr) {
    // Create a notifier that can be called when a snapshot gets generated.
    m_notifier = std::make_shared<Rdb_snapshot_notifier>(this);
  }

  virtual ~Rdb_transaction_impl() {
    rollback();

    // Theoretically the notifier could outlive the Rdb_transaction_impl
    // (because of the shared_ptr), so let it know it can't reference
    // the transaction anymore.
    m_notifier->detach();

    // Free any transaction memory that is still hanging around.
    delete m_rocksdb_reuse_tx;
    DBUG_ASSERT(m_rocksdb_tx == nullptr);
  }
};
2819 | |
/* This is a rocksdb write batch. This class doesn't hold or wait on any
   transaction locks (skips rocksdb transaction API) thus giving better
   performance. The commit is done through rdb->GetBaseDB()->Commit().

   Currently this is only used for replication threads which are guaranteed
   to be non-conflicting. Any further usage of this class should be thought
   through thoroughly.
*/
class Rdb_writebatch_impl : public Rdb_transaction {
  /* The indexed write batch holding all pending writes; allocated in the
     constructor and kept for the object's whole lifetime. */
  rocksdb::WriteBatchWithIndex *m_batch;
  rocksdb::WriteOptions write_opts;
  // Called after commit/rollback.
  void reset() {
    m_batch->Clear();
    m_read_opts = rocksdb::ReadOptions();
    m_ddl_transaction = false;
  }

private:
  /* No two-phase commit for write batches: prepare is a no-op success. */
  bool prepare(const rocksdb::TransactionName &name) override { return true; }

  /*
    Commit by writing the batch directly to the base DB (bypasses the
    transaction API). Returns true on error. Counters are reset on both
    the success and the error path.
  */
  bool commit_no_binlog() override {
    bool res = false;
    rocksdb::Status s;

    s = merge_auto_incr_map(m_batch->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    release_snapshot();

    s = rdb->GetBaseDB()->Write(write_opts, m_batch->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }
  error:
    reset();

    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    set_tx_read_only(false);
    m_rollback_only = false;
    return res;
  }

protected:
  /* Implementations of do_*savepoint based on rocksdb::WriteBatch savepoints */
  void do_set_savepoint() override {
    m_batch->SetSavePoint();
  }

  void do_rollback_to_savepoint() override {
    m_batch->RollbackToSavePoint();
  }

public:
  bool is_writebatch_trx() const override { return true; }

  void set_lock_timeout(int timeout_sec_arg) override {
    // Nothing to do here.
  }

  void set_sync(bool sync) override { write_opts.sync = sync; }

  void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                    const std::string &rowkey) override {
    // Nothing to do here since we don't hold any row locks.
  }

  void rollback() override {
    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    release_snapshot();

    reset();
    set_tx_read_only(false);
    m_rollback_only = false;
  }

  /* Snapshots always come straight from the DB (no transaction API). */
  void acquire_snapshot(bool acquire_now) override {
    if (m_read_opts.snapshot == nullptr)
      snapshot_created(rdb->GetSnapshot());
  }

  void release_snapshot() override {
    if (m_read_opts.snapshot != nullptr) {
      rdb->ReleaseSnapshot(m_read_opts.snapshot);
      m_read_opts.snapshot = nullptr;
    }
  }

  rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      const rocksdb::Slice &value) override {
    ++m_write_count;
    m_batch->Put(column_family, key, value);
    // Note Put/Delete in write batch doesn't return any error code. We simply
    // return OK here.
    return rocksdb::Status::OK();
  }

  rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
                             const rocksdb::Slice &key) override {
    ++m_write_count;
    m_batch->Delete(column_family, key);
    return rocksdb::Status::OK();
  }

  rocksdb::Status
  single_delete(rocksdb::ColumnFamilyHandle *const column_family,
                const rocksdb::Slice &key) override {
    ++m_write_count;
    m_batch->SingleDelete(column_family, key);
    return rocksdb::Status::OK();
  }

  bool has_modifications() const override {
    return m_batch->GetWriteBatch()->Count() > 0;
  }

  rocksdb::WriteBatchBase *get_write_batch() override { return m_batch; }

  rocksdb::WriteBatchBase *get_indexed_write_batch() override {
    ++m_write_count;
    return m_batch;
  }

  /* Read through the batch first, falling back to the DB. */
  rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      rocksdb::PinnableSlice *const value) const override {
    value->Reset();
    return m_batch->GetFromBatchAndDB(rdb, m_read_opts, column_family, key,
                                      value);
  }

  /* No locking in this implementation: this is a plain get(), and the
     `exclusive` flag is ignored. */
  rocksdb::Status
  get_for_update(rocksdb::ColumnFamilyHandle *const column_family,
                 const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
                 bool exclusive) override {
    return get(column_family, key, value);
  }

  /* Iterate over the DB overlaid with this batch's pending writes. */
  rocksdb::Iterator *
  get_iterator(const rocksdb::ReadOptions &options,
               rocksdb::ColumnFamilyHandle *const column_family) override {
    const auto it = rdb->NewIterator(options);
    return m_batch->NewIteratorWithBase(it);
  }

  bool is_tx_started() const override { return (m_batch != nullptr); }

  void start_tx() override {
    commit_ordered_done= false; // Do we need this here?
    reset();
    write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
    write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
    write_opts.ignore_missing_column_families =
        THDVAR(m_thd, write_ignore_missing_column_families);

    set_initial_savepoint();
  }

  void start_stmt() override {}

  void rollback_stmt() override {
    if (m_batch)
      rollback_to_stmt_savepoint();
  }

  explicit Rdb_writebatch_impl(THD *const thd)
      : Rdb_transaction(thd), m_batch(nullptr) {
    m_batch = new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0,
                                               true);
  }

  virtual ~Rdb_writebatch_impl() {
    rollback();
    delete m_batch;
  }
};
3010 | |
3011 | void Rdb_snapshot_notifier::SnapshotCreated( |
3012 | const rocksdb::Snapshot *const snapshot) { |
3013 | if (m_owning_tx != nullptr) { |
3014 | m_owning_tx->snapshot_created(snapshot); |
3015 | } |
3016 | } |
3017 | |
/* Global registry of all live transactions, guarded by s_tx_list_mutex. */
std::multiset<Rdb_transaction *> Rdb_transaction::s_tx_list;
mysql_mutex_t Rdb_transaction::s_tx_list_mutex;
3020 | |
3021 | static Rdb_transaction *&get_tx_from_thd(THD *const thd) { |
3022 | return *reinterpret_cast<Rdb_transaction **>( |
3023 | my_core::thd_ha_data(thd, rocksdb_hton)); |
3024 | } |
3025 | |
namespace {

/*
  RAII guard for perf-context recording. Two modes:
  - constructed with an Rdb_io_perf: starts recording at `level` and ends
    it in the destructor;
  - constructed with an Rdb_transaction: routes recording through the
    transaction's io_perf_start()/io_perf_end_and_record().
  Exactly one of m_io_perf_ptr / m_tx is active per instance.
*/
class Rdb_perf_context_guard {
  Rdb_io_perf m_io_perf;
  Rdb_io_perf *m_io_perf_ptr;
  Rdb_transaction *m_tx;
  uint m_level;

 public:
  Rdb_perf_context_guard(const Rdb_perf_context_guard &) = delete;
  Rdb_perf_context_guard &operator=(const Rdb_perf_context_guard &) = delete;

  explicit Rdb_perf_context_guard(Rdb_io_perf *io_perf, uint level)
      : m_io_perf_ptr(io_perf), m_tx(nullptr), m_level(level) {
    m_io_perf_ptr->start(m_level);
  }

  explicit Rdb_perf_context_guard(Rdb_transaction *tx, uint level)
      : m_io_perf_ptr(nullptr), m_tx(tx), m_level(level) {
    /*
      if perf_context information is already being recorded, this becomes a
      no-op
    */
    if (tx != nullptr) {
      tx->io_perf_start(&m_io_perf);
    }
  }

  ~Rdb_perf_context_guard() {
    if (m_tx != nullptr) {
      m_tx->io_perf_end_and_record();
    } else if (m_io_perf_ptr != nullptr) {
      m_io_perf_ptr->end_and_record(m_level);
    }
  }
};

}  // anonymous namespace
3064 | |
3065 | /* |
3066 | TODO: maybe, call this in external_lock() and store in ha_rocksdb.. |
3067 | */ |
3068 | |
/*
  Return the connection's transaction object, creating and starting one if
  needed. Session parameters (lock timeout, max row locks) are refreshed on
  every call.

  NOTE: the two branches cannot be unified into "create, then start if
  !is_tx_started()": a freshly constructed Rdb_writebatch_impl allocates
  its batch in the constructor, so is_tx_started() is already true before
  start_tx() has run -- the creation branch must call start_tx()
  unconditionally.
*/
static Rdb_transaction *get_or_create_tx(THD *const thd) {
  Rdb_transaction *&tx = get_tx_from_thd(thd);
  // TODO: this is called too many times.. O(#rows)
  if (tx == nullptr) {
    bool rpl_skip_tx_api= false; // MARIAROCKS_NOT_YET.
    // Replication threads may use the lock-free write-batch implementation;
    // everything else gets a full pessimistic transaction.
    if ((rpl_skip_tx_api && thd->rgi_slave) ||
        false /* MARIAROCKS_NOT_YET: THDVAR(thd, master_skip_tx_api) && !thd->rgi_slave)*/)
    {
      tx = new Rdb_writebatch_impl(thd);
    }
    else
    {
      tx = new Rdb_transaction_impl(thd);
    }
    tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
    tx->start_tx();
  } else {
    tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
    if (!tx->is_tx_started()) {
      tx->start_tx();
    }
  }

  return tx;
}
3094 | |
3095 | static int rocksdb_close_connection(handlerton *const hton, THD *const thd) { |
3096 | Rdb_transaction *&tx = get_tx_from_thd(thd); |
3097 | if (tx != nullptr) { |
3098 | int rc = tx->finish_bulk_load(false); |
3099 | if (rc != 0) { |
3100 | // NO_LINT_DEBUG |
3101 | sql_print_error("RocksDB: Error %d finalizing last SST file while " |
3102 | "disconnecting" , |
3103 | rc); |
3104 | } |
3105 | |
3106 | delete tx; |
3107 | tx = nullptr; |
3108 | } |
3109 | return HA_EXIT_SUCCESS; |
3110 | } |
3111 | |
3112 | /* |
3113 | * Serializes an xid to a string so that it can |
3114 | * be used as a rocksdb transaction name |
3115 | */ |
3116 | static std::string rdb_xid_to_string(const XID &src) { |
3117 | DBUG_ASSERT(src.gtrid_length >= 0 && src.gtrid_length <= MAXGTRIDSIZE); |
3118 | DBUG_ASSERT(src.bqual_length >= 0 && src.bqual_length <= MAXBQUALSIZE); |
3119 | |
3120 | std::string buf; |
3121 | buf.reserve(RDB_XIDHDR_LEN + src.gtrid_length + src.bqual_length); |
3122 | |
3123 | /* |
3124 | * expand formatID to fill 8 bytes if it doesn't already |
3125 | * then reinterpret bit pattern as unsigned and store in network order |
3126 | */ |
3127 | uchar fidbuf[RDB_FORMATID_SZ]; |
3128 | int64 signed_fid8 = src.formatID; |
3129 | const uint64 raw_fid8 = *reinterpret_cast<uint64 *>(&signed_fid8); |
3130 | rdb_netbuf_store_uint64(fidbuf, raw_fid8); |
3131 | buf.append(reinterpret_cast<const char *>(fidbuf), RDB_FORMATID_SZ); |
3132 | |
3133 | buf.push_back(src.gtrid_length); |
3134 | buf.push_back(src.bqual_length); |
3135 | buf.append(src.data, (src.gtrid_length) + (src.bqual_length)); |
3136 | return buf; |
3137 | } |
3138 | |
3139 | #if 0 |
3140 | // MARIAROCKS: MariaDB doesn't have flush_wal method |
3141 | /** |
3142 | Called by hton->flush_logs after MySQL group commit prepares a set of |
3143 | transactions. |
3144 | */ |
3145 | static bool rocksdb_flush_wal(handlerton* hton __attribute__((__unused__))) |
3146 | DBUG_ASSERT(rdb != nullptr); |
3147 | |
3148 | rocksdb::Status s; |
3149 | /* |
3150 | target_lsn is set to 0 when MySQL wants to sync the wal files |
3151 | */ |
3152 | if ((target_lsn == 0 && !rocksdb_db_options->allow_mmap_writes) || |
3153 | rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) { |
3154 | rocksdb_wal_group_syncs++; |
3155 | s = rdb->FlushWAL(target_lsn == 0 || |
3156 | rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC); |
3157 | } |
3158 | |
3159 | if (!s.ok()) { |
3160 | rdb_log_status_error(s); |
3161 | return HA_EXIT_FAILURE; |
3162 | } |
3163 | return HA_EXIT_SUCCESS; |
3164 | } |
3165 | #endif |
3166 | |
3167 | /** |
3168 | For a slave, prepare() updates the slave_gtid_info table which tracks the |
3169 | replication progress. |
3170 | */ |
3171 | static int rocksdb_prepare(handlerton* hton, THD* thd, bool prepare_tx) |
3172 | { |
3173 | bool async=false; // This is "ASYNC_COMMIT" feature which is only present in webscalesql |
3174 | |
3175 | Rdb_transaction *&tx = get_tx_from_thd(thd); |
3176 | if (!tx->can_prepare()) { |
3177 | return HA_EXIT_FAILURE; |
3178 | } |
3179 | if (prepare_tx || |
3180 | (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { |
3181 | /* We were instructed to prepare the whole transaction, or |
3182 | this is an SQL statement end and autocommit is on */ |
3183 | |
3184 | #ifdef MARIAROCKS_NOT_YET |
3185 | /* |
3186 | Storing binlog position inside MyRocks is needed only for restoring |
3187 | MyRocks from backups. This feature is not supported yet. |
3188 | */ |
3189 | std::vector<st_slave_gtid_info> slave_gtid_info; |
3190 | my_core::thd_slave_gtid_info(thd, &slave_gtid_info); |
3191 | for (const auto &it : slave_gtid_info) { |
3192 | rocksdb::WriteBatchBase *const write_batch = tx->get_blind_write_batch(); |
3193 | binlog_manager.update_slave_gtid_info(it.id, it.db, it.gtid, write_batch); |
3194 | } |
3195 | #endif |
3196 | |
3197 | if (tx->is_two_phase()) { |
3198 | |
3199 | /* |
3200 | MariaDB: the following branch is never taken. |
3201 | We always flush at Prepare and rely on RocksDB's internal Group Commit |
3202 | to do some grouping. |
3203 | */ |
3204 | if (thd->durability_property == HA_IGNORE_DURABILITY || async) { |
3205 | tx->set_sync(false); |
3206 | } |
3207 | |
3208 | /* |
3209 | MariaDB: do not flush logs if we are running in a non-crash-safe mode. |
3210 | */ |
3211 | if (!rocksdb_flush_log_at_trx_commit) |
3212 | tx->set_sync(false); |
3213 | |
3214 | XID xid; |
3215 | thd_get_xid(thd, reinterpret_cast<MYSQL_XID *>(&xid)); |
3216 | if (!tx->prepare(rdb_xid_to_string(xid))) { |
3217 | return HA_EXIT_FAILURE; |
3218 | } |
3219 | |
3220 | /* |
3221 | MariaDB: our Group Commit implementation does not use the |
3222 | hton->flush_logs call (at least currently) so the following is not |
3223 | needed (TODO: will we need this for binlog rotation?) |
3224 | */ |
3225 | #ifdef MARIAROCKS_NOT_YET |
3226 | if (thd->durability_property == HA_IGNORE_DURABILITY ) |
3227 | (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER)) |
3228 | && |
3229 | THDVAR(thd, flush_log_at_trx_commit)) |
3230 | #endif |
3231 | #ifdef MARIAROCKS_NOT_YET |
3232 | { |
3233 | // MariaRocks: disable the |
3234 | // "write/sync redo log before flushing binlog cache to file" |
3235 | // feature. See a869c56d361bb44f46c0efeb11a8f03561676247 |
3236 | /** |
3237 | we set the log sequence as '1' just to trigger hton->flush_logs |
3238 | */ |
3239 | thd_store_lsn(thd, 1, DB_TYPE_ROCKSDB); |
3240 | } |
3241 | #endif |
3242 | } |
3243 | |
3244 | DEBUG_SYNC(thd, "rocksdb.prepared" ); |
3245 | } |
3246 | else |
3247 | tx->make_stmt_savepoint_permanent(); |
3248 | return HA_EXIT_SUCCESS; |
3249 | } |
3250 | |
3251 | /** |
3252 | do nothing for prepare/commit by xid |
3253 | this is needed to avoid crashes in XA scenarios |
3254 | */ |
3255 | static int rocksdb_commit_by_xid(handlerton *const hton, XID *const xid) { |
3256 | DBUG_ENTER_FUNC(); |
3257 | |
3258 | DBUG_ASSERT(hton != nullptr); |
3259 | DBUG_ASSERT(xid != nullptr); |
3260 | DBUG_ASSERT(commit_latency_stats != nullptr); |
3261 | |
3262 | rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true); |
3263 | |
3264 | const auto name = rdb_xid_to_string(*xid); |
3265 | DBUG_ASSERT(!name.empty()); |
3266 | |
3267 | rocksdb::Transaction *const trx = rdb->GetTransactionByName(name); |
3268 | |
3269 | if (trx == nullptr) { |
3270 | DBUG_RETURN(HA_EXIT_FAILURE); |
3271 | } |
3272 | |
3273 | const rocksdb::Status s = trx->Commit(); |
3274 | |
3275 | if (!s.ok()) { |
3276 | rdb_log_status_error(s); |
3277 | DBUG_RETURN(HA_EXIT_FAILURE); |
3278 | } |
3279 | |
3280 | delete trx; |
3281 | |
3282 | // `Add()` is implemented in a thread-safe manner. |
3283 | commit_latency_stats->Add(timer.ElapsedNanos() / 1000); |
3284 | |
3285 | DBUG_RETURN(HA_EXIT_SUCCESS); |
3286 | } |
3287 | |
3288 | static int |
3289 | rocksdb_rollback_by_xid(handlerton *const hton MY_ATTRIBUTE((__unused__)), |
3290 | XID *const xid) { |
3291 | DBUG_ENTER_FUNC(); |
3292 | |
3293 | DBUG_ASSERT(hton != nullptr); |
3294 | DBUG_ASSERT(xid != nullptr); |
3295 | DBUG_ASSERT(rdb != nullptr); |
3296 | |
3297 | const auto name = rdb_xid_to_string(*xid); |
3298 | |
3299 | rocksdb::Transaction *const trx = rdb->GetTransactionByName(name); |
3300 | |
3301 | if (trx == nullptr) { |
3302 | DBUG_RETURN(HA_EXIT_FAILURE); |
3303 | } |
3304 | |
3305 | const rocksdb::Status s = trx->Rollback(); |
3306 | |
3307 | if (!s.ok()) { |
3308 | rdb_log_status_error(s); |
3309 | DBUG_RETURN(HA_EXIT_FAILURE); |
3310 | } |
3311 | |
3312 | delete trx; |
3313 | |
3314 | DBUG_RETURN(HA_EXIT_SUCCESS); |
3315 | } |
3316 | |
3317 | /** |
3318 | Rebuilds an XID from a serialized version stored in a string. |
3319 | */ |
3320 | static void rdb_xid_from_string(const std::string &src, XID *const dst) { |
3321 | DBUG_ASSERT(dst != nullptr); |
3322 | uint offset = 0; |
3323 | uint64 raw_fid8 = |
3324 | rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(src.data())); |
3325 | const int64 signed_fid8 = *reinterpret_cast<int64 *>(&raw_fid8); |
3326 | dst->formatID = signed_fid8; |
3327 | offset += RDB_FORMATID_SZ; |
3328 | dst->gtrid_length = src.at(offset); |
3329 | offset += RDB_GTRID_SZ; |
3330 | dst->bqual_length = src.at(offset); |
3331 | offset += RDB_BQUAL_SZ; |
3332 | |
3333 | DBUG_ASSERT(dst->gtrid_length >= 0 && dst->gtrid_length <= MAXGTRIDSIZE); |
3334 | DBUG_ASSERT(dst->bqual_length >= 0 && dst->bqual_length <= MAXBQUALSIZE); |
3335 | |
3336 | src.copy(dst->data, (dst->gtrid_length) + (dst->bqual_length), |
3337 | RDB_XIDHDR_LEN); |
3338 | } |
3339 | |
3340 | /** |
3341 | Reading last committed binary log info from RocksDB system row. |
3342 | The info is needed for crash safe slave/master to work. |
3343 | */ |
3344 | static int rocksdb_recover(handlerton* hton, XID* xid_list, uint len) |
3345 | #ifdef MARIAROCKS_NOT_YET |
3346 | char* const binlog_file, |
3347 | my_off_t *const binlog_pos, |
3348 | Gtid *const binlog_max_gtid) { |
3349 | #endif |
3350 | { |
3351 | #ifdef MARIAROCKS_NOT_YET |
3352 | if (binlog_file && binlog_pos) { |
3353 | char file_buf[FN_REFLEN + 1] = {0}; |
3354 | my_off_t pos; |
3355 | char gtid_buf[FN_REFLEN + 1] = {0}; |
3356 | if (binlog_manager.read(file_buf, &pos, gtid_buf)) { |
3357 | if (is_binlog_advanced(binlog_file, *binlog_pos, file_buf, pos)) { |
3358 | memcpy(binlog_file, file_buf, FN_REFLEN + 1); |
3359 | *binlog_pos = pos; |
3360 | fprintf(stderr, "RocksDB: Last binlog file position %llu," |
3361 | " file name %s\n" , |
3362 | pos, file_buf); |
3363 | if (*gtid_buf) { |
3364 | global_sid_lock->rdlock(); |
3365 | binlog_max_gtid->parse(global_sid_map, gtid_buf); |
3366 | global_sid_lock->unlock(); |
3367 | fprintf(stderr, "RocksDB: Last MySQL Gtid %s\n" , gtid_buf); |
3368 | } |
3369 | } |
3370 | } |
3371 | } |
3372 | #endif |
3373 | |
3374 | if (len == 0 || xid_list == nullptr) { |
3375 | return HA_EXIT_SUCCESS; |
3376 | } |
3377 | |
3378 | std::vector<rocksdb::Transaction *> trans_list; |
3379 | rdb->GetAllPreparedTransactions(&trans_list); |
3380 | |
3381 | uint count = 0; |
3382 | for (auto &trans : trans_list) { |
3383 | if (count >= len) { |
3384 | break; |
3385 | } |
3386 | auto name = trans->GetName(); |
3387 | rdb_xid_from_string(name, &xid_list[count]); |
3388 | count++; |
3389 | } |
3390 | return count; |
3391 | } |
3392 | |
3393 | |
3394 | /* |
3395 | Handle a commit checkpoint request from server layer. |
3396 | |
3397 | InnoDB does this: |
3398 | We put the request in a queue, so that we can notify upper layer about |
3399 | checkpoint complete when we have flushed the redo log. |
3400 | If we have already flushed all relevant redo log, we notify immediately. |
3401 | |
3402 | MariaRocks just flushes everything right away ATM |
3403 | */ |
3404 | |
3405 | static void rocksdb_checkpoint_request(handlerton *hton, |
3406 | void *cookie) |
3407 | { |
3408 | const rocksdb::Status s= rdb->SyncWAL(); |
3409 | //TODO: what to do on error? |
3410 | if (s.ok()) |
3411 | { |
3412 | rocksdb_wal_group_syncs++; |
3413 | commit_checkpoint_notify_ha(hton, cookie); |
3414 | } |
3415 | } |
3416 | |
3417 | /* |
3418 | @param all: TRUE - commit the transaction |
3419 | FALSE - SQL statement ended |
3420 | */ |
3421 | static void rocksdb_commit_ordered(handlerton *hton, THD* thd, bool all) |
3422 | { |
3423 | // Same assert as InnoDB has |
3424 | DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | |
3425 | OPTION_BEGIN))); |
3426 | Rdb_transaction *&tx = get_tx_from_thd(thd); |
3427 | if (!tx->is_two_phase()) { |
3428 | /* |
3429 | ordered_commit is supposedly slower as it is done sequentially |
3430 | in order to preserve commit order. |
3431 | |
3432 | if we are not required do 2-phase commit with the binlog, do not do |
3433 | anything here. |
3434 | */ |
3435 | return; |
3436 | } |
3437 | |
3438 | tx->set_sync(false); |
3439 | |
3440 | /* This will note the master position also */ |
3441 | tx->commit_ordered_res= tx->commit(); |
3442 | tx->commit_ordered_done= true; |
3443 | |
3444 | } |
3445 | |
3446 | |
/**
  Commit the transaction (or the current statement) for this connection.

  If rocksdb_commit_ordered() already committed (commit_ordered_done), only
  its stored result is reported here. Otherwise the commit is performed
  without sync to establish the order, and the WAL is flushed afterwards
  when rocksdb_flush_log_at_trx_commit asks for it.

  @param hton       the RocksDB handlerton
  @param thd        connection handle
  @param commit_tx  true  - commit the whole transaction
                    false - only the current SQL statement ended
*/
static int rocksdb_commit(handlerton* hton, THD* thd, bool commit_tx)
{
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(hton != nullptr);
  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(commit_latency_stats != nullptr);

  rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true);

  /* note: h->external_lock(F_UNLCK) is called after this function is called) */
  Rdb_transaction *&tx = get_tx_from_thd(thd);

  /* this will trigger saving of perf_context information */
  Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

  if (tx != nullptr) {
    if (commit_tx || (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT |
                                                 OPTION_BEGIN))) {
      /*
        This will not add anything to commit_latency_stats, and this is correct
        right?
      */
      if (tx->commit_ordered_done)
      {
        /* commit_ordered() already committed; just report its result. */
        thd_wakeup_subsequent_commits(thd, 0);
        DBUG_RETURN((tx->commit_ordered_res? HA_ERR_INTERNAL_ERROR: 0));
      }

      /*
        We get here
        - For a COMMIT statement that finishes a multi-statement transaction
        - For a statement that has its own transaction
      */

      // First, commit without syncing. This establishes the commit order
      tx->set_sync(false);
      if (tx->commit()) {
        DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
      }
      thd_wakeup_subsequent_commits(thd, 0);

      /* Durability: flush-and-sync the WAL only in the strictest mode. */
      if (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC)
      {
        rocksdb::Status s= rdb->FlushWAL(true);
        if (!s.ok())
          DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
      }
    } else {
      /*
        We get here when committing a statement within a transaction.
      */
      tx->make_stmt_savepoint_permanent();
    }

    if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
      // For READ_COMMITTED, we release any existing snapshot so that we will
      // see any changes that occurred since the last statement.
      tx->release_snapshot();
    }
  }

  // `Add()` is implemented in a thread-safe manner.
  commit_latency_stats->Add(timer.ElapsedNanos() / 1000);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
3514 | |
3515 | |
3516 | static int rocksdb_rollback(handlerton *const hton, THD *const thd, |
3517 | bool rollback_tx) { |
3518 | Rdb_transaction *&tx = get_tx_from_thd(thd); |
3519 | Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd)); |
3520 | |
3521 | if (tx != nullptr) { |
3522 | if (rollback_tx) { |
3523 | /* |
3524 | We get here, when |
3525 | - ROLLBACK statement is issued. |
3526 | |
3527 | Discard the changes made by the transaction |
3528 | */ |
3529 | tx->rollback(); |
3530 | } else { |
3531 | /* |
3532 | We get here when |
3533 | - a statement with AUTOCOMMIT=1 is being rolled back (because of some |
3534 | error) |
3535 | - a statement inside a transaction is rolled back |
3536 | */ |
3537 | |
3538 | tx->rollback_stmt(); |
3539 | tx->set_tx_failed(true); |
3540 | } |
3541 | |
3542 | if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) { |
3543 | // For READ_COMMITTED, we release any existing snapshot so that we will |
3544 | // see any changes that occurred since the last statement. |
3545 | tx->release_snapshot(); |
3546 | } |
3547 | } |
3548 | return HA_EXIT_SUCCESS; |
3549 | } |
3550 | |
3551 | static bool print_stats(THD *const thd, std::string const &type, |
3552 | std::string const &name, std::string const &status, |
3553 | stat_print_fn *stat_print) { |
3554 | return stat_print(thd, type.c_str(), type.size(), name.c_str(), name.size(), |
3555 | status.c_str(), status.size()); |
3556 | } |
3557 | |
3558 | static std::string format_string(const char *const format, ...) { |
3559 | std::string res; |
3560 | va_list args; |
3561 | va_list args_copy; |
3562 | char static_buff[256]; |
3563 | |
3564 | DBUG_ASSERT(format != nullptr); |
3565 | |
3566 | va_start(args, format); |
3567 | va_copy(args_copy, args); |
3568 | |
3569 | // Calculate how much space we will need |
3570 | int len = vsnprintf(nullptr, 0, format, args); |
3571 | va_end(args); |
3572 | |
3573 | if (len < 0) { |
3574 | res = std::string("<format error>" ); |
3575 | } else if (len == 0) { |
3576 | // Shortcut for an empty string |
3577 | res = std::string("" ); |
3578 | } else { |
3579 | // For short enough output use a static buffer |
3580 | char *buff = static_buff; |
3581 | std::unique_ptr<char[]> dynamic_buff = nullptr; |
3582 | |
3583 | len++; // Add one for null terminator |
3584 | |
3585 | // for longer output use an allocated buffer |
3586 | if (static_cast<uint>(len) > sizeof(static_buff)) { |
3587 | dynamic_buff.reset(new char[len]); |
3588 | buff = dynamic_buff.get(); |
3589 | } |
3590 | |
3591 | // Now re-do the vsnprintf with the buffer which is now large enough |
3592 | (void)vsnprintf(buff, len, format, args_copy); |
3593 | |
3594 | // Convert to a std::string. Note we could have created a std::string |
3595 | // large enough and then converted the buffer to a 'char*' and created |
3596 | // the output in place. This would probably work but feels like a hack. |
3597 | // Since this isn't code that needs to be super-performant we are going |
3598 | // with this 'safer' method. |
3599 | res = std::string(buff); |
3600 | } |
3601 | |
3602 | va_end(args_copy); |
3603 | |
3604 | return res; |
3605 | } |
3606 | |
3607 | class Rdb_snapshot_status : public Rdb_tx_list_walker { |
3608 | private: |
3609 | std::string m_data; |
3610 | |
3611 | static std::string current_timestamp(void) { |
3612 | static const char *const format = "%d-%02d-%02d %02d:%02d:%02d" ; |
3613 | time_t currtime; |
3614 | struct tm currtm; |
3615 | |
3616 | time(&currtime); |
3617 | |
3618 | localtime_r(&currtime, &currtm); |
3619 | |
3620 | return format_string(format, currtm.tm_year + 1900, currtm.tm_mon + 1, |
3621 | currtm.tm_mday, currtm.tm_hour, currtm.tm_min, |
3622 | currtm.tm_sec); |
3623 | } |
3624 | |
3625 | static std::string (void) { |
3626 | return "\n============================================================\n" + |
3627 | current_timestamp() + |
3628 | " ROCKSDB TRANSACTION MONITOR OUTPUT\n" |
3629 | "============================================================\n" |
3630 | "---------\n" |
3631 | "SNAPSHOTS\n" |
3632 | "---------\n" |
3633 | "LIST OF SNAPSHOTS FOR EACH SESSION:\n" ; |
3634 | } |
3635 | |
3636 | static std::string (void) { |
3637 | return "-----------------------------------------\n" |
3638 | "END OF ROCKSDB TRANSACTION MONITOR OUTPUT\n" |
3639 | "=========================================\n" ; |
3640 | } |
3641 | |
3642 | static Rdb_deadlock_info::Rdb_dl_trx_info |
3643 | get_dl_txn_info(const rocksdb::DeadlockInfo &txn, |
3644 | const GL_INDEX_ID &gl_index_id) { |
3645 | Rdb_deadlock_info::Rdb_dl_trx_info txn_data; |
3646 | |
3647 | txn_data.trx_id = txn.m_txn_id; |
3648 | |
3649 | txn_data.table_name = ddl_manager.safe_get_table_name(gl_index_id); |
3650 | if (txn_data.table_name.empty()) { |
3651 | txn_data.table_name = |
3652 | "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id); |
3653 | } |
3654 | |
3655 | auto kd = ddl_manager.safe_find(gl_index_id); |
3656 | txn_data.index_name = |
3657 | (kd) ? kd->get_name() |
3658 | : "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id); |
3659 | |
3660 | rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(txn.m_cf_id); |
3661 | txn_data.cf_name = cfh->GetName(); |
3662 | |
3663 | txn_data.waiting_key = |
3664 | rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length()); |
3665 | |
3666 | txn_data.exclusive_lock = txn.m_exclusive; |
3667 | |
3668 | return txn_data; |
3669 | } |
3670 | |
3671 | static Rdb_deadlock_info |
3672 | get_dl_path_trx_info(const rocksdb::DeadlockPath &path_entry) { |
3673 | Rdb_deadlock_info deadlock_info; |
3674 | |
3675 | for (auto it = path_entry.path.begin(); it != path_entry.path.end(); |
3676 | it++) { |
3677 | auto txn = *it; |
3678 | const GL_INDEX_ID gl_index_id = { |
3679 | txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>( |
3680 | txn.m_waiting_key.c_str()))}; |
3681 | deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id)); |
3682 | } |
3683 | DBUG_ASSERT_IFF(path_entry.limit_exceeded, path_entry.path.empty()); |
3684 | /* print the first txn in the path to display the full deadlock cycle */ |
3685 | if (!path_entry.path.empty() && !path_entry.limit_exceeded) { |
3686 | auto deadlocking_txn = *(path_entry.path.end() - 1); |
3687 | deadlock_info.victim_trx_id = deadlocking_txn.m_txn_id; |
3688 | } |
3689 | return deadlock_info; |
3690 | } |
3691 | |
3692 | public: |
3693 | Rdb_snapshot_status() : m_data(get_header()) {} |
3694 | |
3695 | std::string getResult() { return m_data + get_footer(); } |
3696 | |
3697 | /* Implement Rdb_transaction interface */ |
3698 | /* Create one row in the snapshot status table */ |
3699 | void process_tran(const Rdb_transaction *const tx) override { |
3700 | DBUG_ASSERT(tx != nullptr); |
3701 | |
3702 | /* Calculate the duration the snapshot has existed */ |
3703 | int64_t snapshot_timestamp = tx->m_snapshot_timestamp; |
3704 | if (snapshot_timestamp != 0) { |
3705 | int64_t curr_time; |
3706 | rdb->GetEnv()->GetCurrentTime(&curr_time); |
3707 | |
3708 | char buffer[1024]; |
3709 | #ifdef MARIAROCKS_NOT_YET |
3710 | thd_security_context(tx->get_thd(), buffer, sizeof buffer, 0); |
3711 | #endif |
3712 | m_data += format_string( |
3713 | "---SNAPSHOT, ACTIVE %lld sec\n" |
3714 | "%s\n" |
3715 | "lock count %llu, write count %llu\n" |
3716 | "insert count %llu, update count %llu, delete count %llu\n" , |
3717 | (longlong)(curr_time - snapshot_timestamp), buffer, tx->get_lock_count(), |
3718 | tx->get_write_count(), tx->get_insert_count(), tx->get_update_count(), |
3719 | tx->get_delete_count()); |
3720 | } |
3721 | } |
3722 | |
3723 | void populate_deadlock_buffer() { |
3724 | auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); |
3725 | m_data += "----------LATEST DETECTED DEADLOCKS----------\n" ; |
3726 | |
3727 | for (auto path_entry : dlock_buffer) { |
3728 | std::string path_data; |
3729 | if (path_entry.limit_exceeded) { |
3730 | path_data += "\n-------DEADLOCK EXCEEDED MAX DEPTH-------\n" ; |
3731 | } else { |
3732 | path_data += "\n*** DEADLOCK PATH\n" |
3733 | "=========================================\n" ; |
3734 | const auto dl_info = get_dl_path_trx_info(path_entry); |
3735 | for (auto it = dl_info.path.begin(); it != dl_info.path.end(); it++) { |
3736 | const auto trx_info = *it; |
3737 | path_data += format_string( |
3738 | "TRANSACTION ID: %u\n" |
3739 | "COLUMN FAMILY NAME: %s\n" |
3740 | "WAITING KEY: %s\n" |
3741 | "LOCK TYPE: %s\n" |
3742 | "INDEX NAME: %s\n" |
3743 | "TABLE NAME: %s\n" , |
3744 | trx_info.trx_id, trx_info.cf_name.c_str(), |
3745 | trx_info.waiting_key.c_str(), |
3746 | trx_info.exclusive_lock ? "EXCLUSIVE" : "SHARED" , |
3747 | trx_info.index_name.c_str(), trx_info.table_name.c_str()); |
3748 | if (it != dl_info.path.end() - 1) { |
3749 | path_data += "---------------WAITING FOR---------------\n" ; |
3750 | } |
3751 | } |
3752 | path_data += |
3753 | format_string("\n--------TRANSACTION ID: %u GOT DEADLOCK---------\n" , |
3754 | dl_info.victim_trx_id); |
3755 | } |
3756 | m_data += path_data; |
3757 | } |
3758 | } |
3759 | |
3760 | std::vector<Rdb_deadlock_info> get_deadlock_info() { |
3761 | std::vector<Rdb_deadlock_info> deadlock_info; |
3762 | auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); |
3763 | for (auto path_entry : dlock_buffer) { |
3764 | if (!path_entry.limit_exceeded) { |
3765 | deadlock_info.push_back(get_dl_path_trx_info(path_entry)); |
3766 | } |
3767 | } |
3768 | return deadlock_info; |
3769 | } |
3770 | }; |
3771 | |
3772 | /** |
3773 | * @brief |
3774 | * walks through all non-replication transactions and copies |
3775 | * out relevant information for information_schema.rocksdb_trx |
3776 | */ |
3777 | class Rdb_trx_info_aggregator : public Rdb_tx_list_walker { |
3778 | private: |
3779 | std::vector<Rdb_trx_info> *m_trx_info; |
3780 | |
3781 | public: |
3782 | explicit Rdb_trx_info_aggregator(std::vector<Rdb_trx_info> *const trx_info) |
3783 | : m_trx_info(trx_info) {} |
3784 | |
3785 | void process_tran(const Rdb_transaction *const tx) override { |
3786 | static const std::map<int, std::string> state_map = { |
3787 | {rocksdb::Transaction::STARTED, "STARTED" }, |
3788 | {rocksdb::Transaction::AWAITING_PREPARE, "AWAITING_PREPARE" }, |
3789 | {rocksdb::Transaction::PREPARED, "PREPARED" }, |
3790 | {rocksdb::Transaction::AWAITING_COMMIT, "AWAITING_COMMIT" }, |
3791 | {rocksdb::Transaction::COMMITED, "COMMITED" }, |
3792 | {rocksdb::Transaction::AWAITING_ROLLBACK, "AWAITING_ROLLBACK" }, |
3793 | {rocksdb::Transaction::ROLLEDBACK, "ROLLEDBACK" }, |
3794 | }; |
3795 | |
3796 | DBUG_ASSERT(tx != nullptr); |
3797 | |
3798 | THD *const thd = tx->get_thd(); |
3799 | ulong thread_id = thd_get_thread_id(thd); |
3800 | |
3801 | if (tx->is_writebatch_trx()) { |
3802 | const auto wb_impl = static_cast<const Rdb_writebatch_impl *>(tx); |
3803 | DBUG_ASSERT(wb_impl); |
3804 | m_trx_info->push_back( |
3805 | {"" , /* name */ |
3806 | 0, /* trx_id */ |
3807 | wb_impl->get_write_count(), 0, /* lock_count */ |
3808 | 0, /* timeout_sec */ |
3809 | "" , /* state */ |
3810 | "" , /* waiting_key */ |
3811 | 0, /* waiting_cf_id */ |
3812 | 1, /*is_replication */ |
3813 | 1, /* skip_trx_api */ |
3814 | wb_impl->is_tx_read_only(), 0, /* deadlock detection */ |
3815 | wb_impl->num_ongoing_bulk_load(), thread_id, "" /* query string */}); |
3816 | } else { |
3817 | const auto tx_impl = static_cast<const Rdb_transaction_impl *>(tx); |
3818 | DBUG_ASSERT(tx_impl); |
3819 | const rocksdb::Transaction *rdb_trx = tx_impl->get_rdb_trx(); |
3820 | |
3821 | if (rdb_trx == nullptr) { |
3822 | return; |
3823 | } |
3824 | |
3825 | char query_buf[NAME_LEN+1]; |
3826 | thd_query_safe(thd, query_buf, sizeof(query_buf)); |
3827 | std::string query_str(query_buf); |
3828 | |
3829 | const auto state_it = state_map.find(rdb_trx->GetState()); |
3830 | DBUG_ASSERT(state_it != state_map.end()); |
3831 | const int is_replication = (thd->rgi_slave != nullptr); |
3832 | uint32_t waiting_cf_id; |
3833 | std::string waiting_key; |
3834 | rdb_trx->GetWaitingTxns(&waiting_cf_id, &waiting_key), |
3835 | |
3836 | m_trx_info->push_back( |
3837 | {rdb_trx->GetName(), rdb_trx->GetID(), tx_impl->get_write_count(), |
3838 | tx_impl->get_lock_count(), tx_impl->get_timeout_sec(), |
3839 | state_it->second, waiting_key, waiting_cf_id, is_replication, |
3840 | 0, /* skip_trx_api */ |
3841 | tx_impl->is_tx_read_only(), rdb_trx->IsDeadlockDetect(), |
3842 | tx_impl->num_ongoing_bulk_load(), thread_id, query_str}); |
3843 | } |
3844 | } |
3845 | }; |
3846 | |
3847 | /* |
3848 | returns a vector of info for all non-replication threads |
3849 | for use by information_schema.rocksdb_trx |
3850 | */ |
3851 | std::vector<Rdb_trx_info> rdb_get_all_trx_info() { |
3852 | std::vector<Rdb_trx_info> trx_info; |
3853 | Rdb_trx_info_aggregator trx_info_agg(&trx_info); |
3854 | Rdb_transaction::walk_tx_list(&trx_info_agg); |
3855 | return trx_info; |
3856 | } |
3857 | |
3858 | |
3859 | /* |
3860 | returns a vector of info of recent deadlocks |
3861 | for use by information_schema.rocksdb_deadlock |
3862 | */ |
3863 | std::vector<Rdb_deadlock_info> rdb_get_deadlock_info() { |
3864 | Rdb_snapshot_status showStatus; |
3865 | Rdb_transaction::walk_tx_list(&showStatus); |
3866 | return showStatus.get_deadlock_info(); |
3867 | } |
3868 | |
3869 | #ifdef MARIAROCKS_NOT_YET |
3870 | /* Generate the snapshot status table */ |
3871 | static bool rocksdb_show_snapshot_status(handlerton *const hton, THD *const thd, |
3872 | stat_print_fn *const stat_print) { |
3873 | Rdb_snapshot_status showStatus; |
3874 | |
3875 | Rdb_transaction::walk_tx_list(&showStatus); |
3876 | showStatus.populate_deadlock_buffer(); |
3877 | |
3878 | /* Send the result data back to MySQL */ |
3879 | return print_stats(thd, "rocksdb" , "" , showStatus.getResult(), stat_print); |
3880 | } |
3881 | #endif |
3882 | |
3883 | /* |
3884 | This is called for SHOW ENGINE ROCKSDB STATUS | LOGS | etc. |
3885 | |
3886 | For now, produce info about live files (which gives an imprecise idea about |
3887 | what column families are there). |
3888 | */ |
3889 | static bool rocksdb_show_status(handlerton *const hton, THD *const thd, |
3890 | stat_print_fn *const stat_print, |
3891 | enum ha_stat_type stat_type) { |
3892 | DBUG_ASSERT(hton != nullptr); |
3893 | DBUG_ASSERT(thd != nullptr); |
3894 | DBUG_ASSERT(stat_print != nullptr); |
3895 | |
3896 | bool res = false; |
3897 | char buf[100] = {'\0'}; |
3898 | |
3899 | if (stat_type == HA_ENGINE_STATUS) { |
3900 | DBUG_ASSERT(rdb != nullptr); |
3901 | |
3902 | std::string str; |
3903 | |
3904 | /* Global DB Statistics */ |
3905 | if (rocksdb_stats) { |
3906 | str = rocksdb_stats->ToString(); |
3907 | |
3908 | // Use the same format as internal RocksDB statistics entries to make |
3909 | // sure that output will look unified. |
3910 | DBUG_ASSERT(commit_latency_stats != nullptr); |
3911 | |
3912 | snprintf(buf, sizeof(buf), "rocksdb.commit_latency statistics " |
3913 | "Percentiles :=> 50 : %.2f 95 : %.2f " |
3914 | "99 : %.2f 100 : %.2f\n" , |
3915 | commit_latency_stats->Percentile(50), |
3916 | commit_latency_stats->Percentile(95), |
3917 | commit_latency_stats->Percentile(99), |
3918 | commit_latency_stats->Percentile(100)); |
3919 | str.append(buf); |
3920 | |
3921 | uint64_t v = 0; |
3922 | |
3923 | // Retrieve additional stalling related numbers from RocksDB and append |
3924 | // them to the buffer meant for displaying detailed statistics. The intent |
3925 | // here is to avoid adding another row to the query output because of |
3926 | // just two numbers. |
3927 | // |
3928 | // NB! We're replacing hyphens with underscores in output to better match |
3929 | // the existing naming convention. |
3930 | if (rdb->GetIntProperty("rocksdb.is-write-stopped" , &v)) { |
3931 | snprintf(buf, sizeof(buf), "rocksdb.is_write_stopped COUNT : %llu\n" , (ulonglong)v); |
3932 | str.append(buf); |
3933 | } |
3934 | |
3935 | if (rdb->GetIntProperty("rocksdb.actual-delayed-write-rate" , &v)) { |
3936 | snprintf(buf, sizeof(buf), "rocksdb.actual_delayed_write_rate " |
3937 | "COUNT : %llu\n" , |
3938 | (ulonglong)v); |
3939 | str.append(buf); |
3940 | } |
3941 | |
3942 | res |= print_stats(thd, "STATISTICS" , "rocksdb" , str, stat_print); |
3943 | } |
3944 | |
3945 | /* Per DB stats */ |
3946 | if (rdb->GetProperty("rocksdb.dbstats" , &str)) { |
3947 | res |= print_stats(thd, "DBSTATS" , "rocksdb" , str, stat_print); |
3948 | } |
3949 | |
3950 | /* Per column family stats */ |
3951 | for (const auto &cf_name : cf_manager.get_cf_names()) { |
3952 | rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name); |
3953 | if (cfh == nullptr) { |
3954 | continue; |
3955 | } |
3956 | |
3957 | if (!rdb->GetProperty(cfh, "rocksdb.cfstats" , &str)) { |
3958 | continue; |
3959 | } |
3960 | |
3961 | res |= print_stats(thd, "CF_COMPACTION" , cf_name, str, stat_print); |
3962 | } |
3963 | |
3964 | /* Memory Statistics */ |
3965 | std::vector<rocksdb::DB *> dbs; |
3966 | std::unordered_set<const rocksdb::Cache *> cache_set; |
3967 | size_t internal_cache_count = 0; |
3968 | size_t kDefaultInternalCacheSize = 8 * 1024 * 1024; |
3969 | |
3970 | dbs.push_back(rdb); |
3971 | cache_set.insert(rocksdb_tbl_options->block_cache.get()); |
3972 | |
3973 | for (const auto &cf_handle : cf_manager.get_all_cf()) { |
3974 | rocksdb::ColumnFamilyDescriptor cf_desc; |
3975 | cf_handle->GetDescriptor(&cf_desc); |
3976 | auto *const table_factory = cf_desc.options.table_factory.get(); |
3977 | |
3978 | if (table_factory != nullptr) { |
3979 | std::string tf_name = table_factory->Name(); |
3980 | |
3981 | if (tf_name.find("BlockBasedTable" ) != std::string::npos) { |
3982 | const rocksdb::BlockBasedTableOptions *const bbt_opt = |
3983 | reinterpret_cast<rocksdb::BlockBasedTableOptions *>( |
3984 | table_factory->GetOptions()); |
3985 | |
3986 | if (bbt_opt != nullptr) { |
3987 | if (bbt_opt->block_cache.get() != nullptr) { |
3988 | cache_set.insert(bbt_opt->block_cache.get()); |
3989 | } else { |
3990 | internal_cache_count++; |
3991 | } |
3992 | cache_set.insert(bbt_opt->block_cache_compressed.get()); |
3993 | } |
3994 | } |
3995 | } |
3996 | } |
3997 | |
3998 | std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type; |
3999 | str.clear(); |
4000 | rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set, |
4001 | &temp_usage_by_type); |
4002 | snprintf(buf, sizeof(buf), "\nMemTable Total: %llu" , |
4003 | (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal]); |
4004 | str.append(buf); |
4005 | snprintf(buf, sizeof(buf), "\nMemTable Unflushed: %llu" , |
4006 | (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed]); |
4007 | str.append(buf); |
4008 | snprintf(buf, sizeof(buf), "\nTable Readers Total: %llu" , |
4009 | (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kTableReadersTotal]); |
4010 | str.append(buf); |
4011 | snprintf(buf, sizeof(buf), "\nCache Total: %llu" , |
4012 | (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kCacheTotal]); |
4013 | str.append(buf); |
4014 | snprintf(buf, sizeof(buf), "\nDefault Cache Capacity: %llu" , |
4015 | (ulonglong)internal_cache_count * kDefaultInternalCacheSize); |
4016 | str.append(buf); |
4017 | res |= print_stats(thd, "MEMORY_STATS" , "rocksdb" , str, stat_print); |
4018 | #ifdef MARIAROCKS_NOT_YET |
4019 | /* Show the background thread status */ |
4020 | std::vector<rocksdb::ThreadStatus> thread_list; |
4021 | rocksdb::Status s = rdb->GetEnv()->GetThreadList(&thread_list); |
4022 | |
4023 | if (!s.ok()) { |
4024 | sql_print_error("RocksDB: Returned error (%s) from GetThreadList.\n" , |
4025 | s.ToString().c_str()); |
4026 | res |= true; |
4027 | } else { |
4028 | /* For each background thread retrieved, print out its information */ |
4029 | for (auto &it : thread_list) { |
4030 | /* Only look at background threads. Ignore user threads, if any. */ |
4031 | if (it.thread_type > rocksdb::ThreadStatus::LOW_PRIORITY) { |
4032 | continue; |
4033 | } |
4034 | |
4035 | str = "\nthread_type: " + it.GetThreadTypeName(it.thread_type) + |
4036 | "\ncf_name: " + it.cf_name + |
4037 | "\noperation_type: " + it.GetOperationName(it.operation_type) + |
4038 | "\noperation_stage: " + |
4039 | it.GetOperationStageName(it.operation_stage) + |
4040 | "\nelapsed_time_ms: " + |
4041 | it.MicrosToString(it.op_elapsed_micros); |
4042 | |
4043 | for (auto &it_props : |
4044 | it.InterpretOperationProperties(it.operation_type, |
4045 | it.op_properties)) { |
4046 | str += "\n" + it_props.first + ": " + std::to_string(it_props.second); |
4047 | } |
4048 | |
4049 | str += "\nstate_type: " + it.GetStateName(it.state_type); |
4050 | |
4051 | res |= print_stats(thd, "BG_THREADS" , std::to_string(it.thread_id), |
4052 | str, stat_print); |
4053 | } |
4054 | } |
4055 | #endif |
4056 | |
4057 | #ifdef MARIAROCKS_NOT_YET |
4058 | } else if (stat_type == HA_ENGINE_TRX) { |
4059 | /* Handle the SHOW ENGINE ROCKSDB TRANSACTION STATUS command */ |
4060 | res |= rocksdb_show_snapshot_status(hton, thd, stat_print); |
4061 | #endif |
4062 | } |
4063 | return res; |
4064 | } |
4065 | |
4066 | static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd, |
4067 | Rdb_transaction *const tx) { |
4068 | DBUG_ASSERT(tx != nullptr); |
4069 | |
4070 | trans_register_ha(thd, FALSE, rocksdb_hton); |
4071 | if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { |
4072 | tx->start_stmt(); |
4073 | trans_register_ha(thd, TRUE, rocksdb_hton); |
4074 | } |
4075 | } |
4076 | |
/* List of file extensions owned by the engine; empty because MyRocks keeps
   no per-table auxiliary files. */
static const char *ha_rocksdb_exts[] = {NullS};
4078 | |
4079 | /* |
4080 | Supporting START TRANSACTION WITH CONSISTENT [ROCKSDB] SNAPSHOT |
4081 | |
4082 | Features: |
4083 | 1. Supporting START TRANSACTION WITH CONSISTENT SNAPSHOT |
4084 | 2. Getting current binlog position in addition to #1. |
4085 | |
4086 | The second feature is done by START TRANSACTION WITH |
4087 | CONSISTENT ROCKSDB SNAPSHOT. This is Facebook's extension, and |
4088 | it works like existing START TRANSACTION WITH CONSISTENT INNODB SNAPSHOT. |
4089 | |
4090 | - When not setting engine, START TRANSACTION WITH CONSISTENT SNAPSHOT |
4091 | takes both InnoDB and RocksDB snapshots, and both InnoDB and RocksDB |
4092 | participate in transaction. When executing COMMIT, both InnoDB and |
4093 | RocksDB modifications are committed. Remember that XA is not supported yet, |
4094 | so mixing engines is not recommended anyway. |
4095 | |
4096 | - When setting engine, START TRANSACTION WITH CONSISTENT.. takes |
4097 | snapshot for the specified engine only. But it starts both |
4098 | InnoDB and RocksDB transactions. |
4099 | */ |
4100 | static int rocksdb_start_tx_and_assign_read_view( |
4101 | handlerton *const hton, /*!< in: RocksDB handlerton */ |
4102 | THD* thd) /*!< in: MySQL thread handle of the |
4103 | user for whom the transaction should |
4104 | be committed */ |
4105 | { |
4106 | ulong const tx_isolation = my_core::thd_tx_isolation(thd); |
4107 | |
4108 | if (tx_isolation != ISO_REPEATABLE_READ) { |
4109 | my_error(ER_ISOLATION_LEVEL_WITH_CONSISTENT_SNAPSHOT, MYF(0)); |
4110 | return HA_EXIT_FAILURE; |
4111 | } |
4112 | /* |
4113 | MariaDB: there is no need to call mysql_bin_log_lock_commits and then |
4114 | unlock back. |
4115 | SQL layer calls start_consistent_snapshot() for all engines, including the |
4116 | binlog under LOCK_commit_ordered mutex. |
4117 | The mutex prevents binlog commits from happening (right?) while the storage |
4118 | engine(s) allocate read snapshots. That way, each storage engine is |
4119 | synchronized with current binlog position. |
4120 | */ |
4121 | mysql_mutex_assert_owner(&LOCK_commit_ordered); |
4122 | |
4123 | Rdb_transaction *const tx = get_or_create_tx(thd); |
4124 | Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd)); |
4125 | |
4126 | DBUG_ASSERT(!tx->has_snapshot()); |
4127 | tx->set_tx_read_only(true); |
4128 | rocksdb_register_tx(hton, thd, tx); |
4129 | tx->acquire_snapshot(true); |
4130 | |
4131 | return HA_EXIT_SUCCESS; |
4132 | } |
4133 | |
4134 | /* Dummy SAVEPOINT support. This is needed for long running transactions |
4135 | * like mysqldump (https://bugs.mysql.com/bug.php?id=71017). |
4136 | * Current SAVEPOINT does not correctly handle ROLLBACK and does not return |
4137 | * errors. This needs to be addressed in future versions (Issue#96). |
4138 | */ |
static int rocksdb_savepoint(handlerton *const hton, THD *const thd,
                             void *const savepoint) {
  // Intentionally a no-op; see the "Dummy SAVEPOINT support" note above.
  return HA_EXIT_SUCCESS;
}
4143 | |
/* Handlerton callback for ROLLBACK TO SAVEPOINT: delegate to the session's
   Rdb_transaction. */
static int rocksdb_rollback_to_savepoint(handlerton *const hton, THD *const thd,
                                         void *const savepoint) {
  // NOTE(review): assumes get_tx_from_thd() yields a live transaction here;
  // a null tx would crash on the call below -- confirm the SQL layer always
  // starts the transaction before a savepoint rollback reaches the engine.
  Rdb_transaction *&tx = get_tx_from_thd(thd);
  return tx->rollback_to_savepoint(savepoint);
}
4149 | |
/* Tell the server it may always release metadata locks after a savepoint
   rollback in this engine (unconditionally true for MyRocks). */
static bool
rocksdb_rollback_to_savepoint_can_release_mdl(handlerton *const hton,
                                              THD *const thd) {
  return true;
}
4155 | |
4156 | #ifdef MARIAROCKS_NOT_YET |
4157 | /* |
4158 | This is called for INFORMATION_SCHEMA |
4159 | */ |
/*
  Walk every open MyRocks table and report its accumulated I/O and lock
  statistics through the supplied callback (invoked once per table).
  Returns nothing; the interface offers no way to signal failure.
*/
static void rocksdb_update_table_stats(
    /* per-table stats callback */
    void (*cb)(const char *db, const char *tbl, bool is_partition,
               my_io_perf_t *r, my_io_perf_t *w, my_io_perf_t *r_blob,
               my_io_perf_t *r_primary, my_io_perf_t *r_secondary,
               page_stats_t *page_stats, comp_stats_t *comp_stats,
               int n_lock_wait, int n_lock_wait_timeout, int n_lock_deadlock,
               const char *engine)) {
  my_io_perf_t io_perf_read;
  my_io_perf_t io_perf_write;
  my_io_perf_t io_perf;
  page_stats_t page_stats;
  comp_stats_t comp_stats;
  uint lock_wait_timeout_stats;
  uint deadlock_stats;
  uint lock_wait_stats;
  std::vector<std::string> tablenames;

  /*
    Most of these are for innodb, so setting them to 0.
    TODO: possibly separate out primary vs. secondary index reads
  */
  memset(&io_perf, 0, sizeof(io_perf));
  memset(&page_stats, 0, sizeof(page_stats));
  memset(&comp_stats, 0, sizeof(comp_stats));
  memset(&io_perf_write, 0, sizeof(io_perf_write));
  // NOTE(review): io_perf_read is not zeroed; every field used below looks
  // explicitly assigned, but confirm my_io_perf_t has no extra members that
  // would reach the callback uninitialized.

  tablenames = rdb_open_tables.get_table_names();

  for (const auto &it : tablenames) {
    Rdb_table_handler *table_handler;
    std::string str, dbname, tablename, partname;
    char dbname_sys[NAME_LEN + 1];
    char tablename_sys[NAME_LEN + 1];
    bool is_partition;

    if (rdb_normalize_tablename(it, &str) != HA_EXIT_SUCCESS) {
      /* Function needs to return void because of the interface and we've
       * detected an error which shouldn't happen. There's no way to let
       * caller know that something failed.
       */
      SHIP_ASSERT(false);
      return;
    }

    if (rdb_split_normalized_tablename(str, &dbname, &tablename, &partname)) {
      continue;
    }

    // A non-empty partition suffix marks this entry as a partition.
    is_partition = (partname.size() != 0);

    // Skip tables whose handler is gone; counters live on the handler.
    table_handler = rdb_open_tables.get_table_handler(it.c_str());
    if (table_handler == nullptr) {
      continue;
    }

    io_perf_read.bytes = table_handler->m_io_perf_read.bytes.load();
    io_perf_read.requests = table_handler->m_io_perf_read.requests.load();
    io_perf_write.bytes = table_handler->m_io_perf_write.bytes.load();
    io_perf_write.requests = table_handler->m_io_perf_write.requests.load();
    lock_wait_timeout_stats = table_handler->m_lock_wait_timeout_counter.load();
    deadlock_stats = table_handler->m_deadlock_counter.load();
    lock_wait_stats =
        table_handler->m_table_perf_context.m_value[PC_KEY_LOCK_WAIT_COUNT]
            .load();

    /*
      Convert from rocksdb timer to mysql timer. RocksDB values are
      in nanoseconds, but table statistics expect the value to be
      in my_timer format.
    */
    io_perf_read.svc_time = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.svc_time.load() / 1000);
    io_perf_read.svc_time_max = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.svc_time_max.load() / 1000);
    io_perf_read.wait_time = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.wait_time.load() / 1000);
    io_perf_read.wait_time_max = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.wait_time_max.load() / 1000);
    io_perf_read.slow_ios = table_handler->m_io_perf_read.slow_ios.load();
    rdb_open_tables.release_table_handler(table_handler);

    /*
      Table stats expects our database and table name to be in system encoding,
      not filename format. Convert before calling callback.
    */
    my_core::filename_to_tablename(dbname.c_str(), dbname_sys,
                                   sizeof(dbname_sys));
    my_core::filename_to_tablename(tablename.c_str(), tablename_sys,
                                   sizeof(tablename_sys));
    (*cb)(dbname_sys, tablename_sys, is_partition, &io_perf_read,
          &io_perf_write, &io_perf, &io_perf, &io_perf, &page_stats,
          &comp_stats, lock_wait_stats, lock_wait_timeout_stats, deadlock_stats,
          rocksdb_hton_name);
  }
}
4256 | #endif |
4257 | static rocksdb::Status check_rocksdb_options_compatibility( |
4258 | const char *const dbpath, const rocksdb::Options &main_opts, |
4259 | const std::vector<rocksdb::ColumnFamilyDescriptor> &cf_descr) { |
4260 | DBUG_ASSERT(rocksdb_datadir != nullptr); |
4261 | |
4262 | rocksdb::DBOptions loaded_db_opt; |
4263 | std::vector<rocksdb::ColumnFamilyDescriptor> loaded_cf_descs; |
4264 | rocksdb::Status status = |
4265 | LoadLatestOptions(dbpath, rocksdb::Env::Default(), &loaded_db_opt, |
4266 | &loaded_cf_descs, rocksdb_ignore_unknown_options); |
4267 | |
4268 | // If we're starting from scratch and there are no options saved yet then this |
4269 | // is a valid case. Therefore we can't compare the current set of options to |
4270 | // anything. |
4271 | if (status.IsNotFound()) { |
4272 | return rocksdb::Status::OK(); |
4273 | } |
4274 | |
4275 | if (!status.ok()) { |
4276 | return status; |
4277 | } |
4278 | |
4279 | if (loaded_cf_descs.size() != cf_descr.size()) { |
4280 | return rocksdb::Status::NotSupported("Mismatched size of column family " |
4281 | "descriptors." ); |
4282 | } |
4283 | |
4284 | // Please see RocksDB documentation for more context about why we need to set |
4285 | // user-defined functions and pointer-typed options manually. |
4286 | for (size_t i = 0; i < loaded_cf_descs.size(); i++) { |
4287 | loaded_cf_descs[i].options.compaction_filter = |
4288 | cf_descr[i].options.compaction_filter; |
4289 | loaded_cf_descs[i].options.compaction_filter_factory = |
4290 | cf_descr[i].options.compaction_filter_factory; |
4291 | loaded_cf_descs[i].options.comparator = cf_descr[i].options.comparator; |
4292 | loaded_cf_descs[i].options.memtable_factory = |
4293 | cf_descr[i].options.memtable_factory; |
4294 | loaded_cf_descs[i].options.merge_operator = |
4295 | cf_descr[i].options.merge_operator; |
4296 | loaded_cf_descs[i].options.prefix_extractor = |
4297 | cf_descr[i].options.prefix_extractor; |
4298 | loaded_cf_descs[i].options.table_factory = |
4299 | cf_descr[i].options.table_factory; |
4300 | } |
4301 | |
4302 | // This is the essence of the function - determine if it's safe to open the |
4303 | // database or not. |
4304 | status = CheckOptionsCompatibility(dbpath, rocksdb::Env::Default(), main_opts, |
4305 | loaded_cf_descs, |
4306 | rocksdb_ignore_unknown_options); |
4307 | |
4308 | return status; |
4309 | } |
4310 | |
/* When true, rocksdb_init_func refuses to initialize: loading MyRocks again
   after it has been unloaded is not supported in the same server process. */
bool prevent_myrocks_loading= false;
4312 | |
4313 | |
4314 | /* |
4315 | Storage Engine initialization function, invoked when plugin is loaded. |
4316 | */ |
4317 | |
4318 | static int rocksdb_init_func(void *const p) { |
4319 | |
4320 | DBUG_ENTER_FUNC(); |
4321 | |
4322 | if (prevent_myrocks_loading) |
4323 | { |
4324 | my_error(ER_INTERNAL_ERROR, MYF(0), |
4325 | "Loading MyRocks plugin after it has been unloaded is not " |
4326 | "supported. Please restart mysqld" ); |
4327 | DBUG_RETURN(1); |
4328 | } |
4329 | |
4330 | if (rdb_check_rocksdb_corruption()) { |
4331 | sql_print_error("RocksDB: There was a corruption detected in RockDB files. " |
4332 | "Check error log emitted earlier for more details." ); |
4333 | if (rocksdb_allow_to_start_after_corruption) { |
4334 | sql_print_information( |
4335 | "RocksDB: Remove rocksdb_allow_to_start_after_corruption to prevent " |
4336 | "server operating if RocksDB corruption is detected." ); |
4337 | } else { |
4338 | sql_print_error("RocksDB: The server will exit normally and stop restart " |
4339 | "attempts. Remove %s file from data directory and " |
4340 | "start mysqld manually." , |
4341 | rdb_corruption_marker_file_name().c_str()); |
4342 | exit(0); |
4343 | } |
4344 | } |
4345 | |
4346 | // Validate the assumption about the size of ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN. |
4347 | static_assert(sizeof(longlong) == 8, "Assuming that longlong is 8 bytes." ); |
4348 | |
4349 | init_rocksdb_psi_keys(); |
4350 | |
4351 | rocksdb_hton = (handlerton *)p; |
4352 | mysql_mutex_init(rdb_psi_open_tbls_mutex_key, &rdb_open_tables.m_mutex, |
4353 | MY_MUTEX_INIT_FAST); |
4354 | #ifdef HAVE_PSI_INTERFACE |
4355 | rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key, rdb_signal_bg_psi_cond_key); |
4356 | rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key, |
4357 | rdb_signal_drop_idx_psi_cond_key); |
4358 | #else |
4359 | rdb_bg_thread.init(); |
4360 | rdb_drop_idx_thread.init(); |
4361 | #endif |
4362 | mysql_mutex_init(rdb_collation_data_mutex_key, &rdb_collation_data_mutex, |
4363 | MY_MUTEX_INIT_FAST); |
4364 | mysql_mutex_init(rdb_mem_cmp_space_mutex_key, &rdb_mem_cmp_space_mutex, |
4365 | MY_MUTEX_INIT_FAST); |
4366 | |
4367 | const char* initial_rocksdb_datadir_for_ignore_dirs= rocksdb_datadir; |
4368 | if (!strncmp(rocksdb_datadir, "./" , 2)) |
4369 | initial_rocksdb_datadir_for_ignore_dirs += 2; |
4370 | ignore_db_dirs_append(initial_rocksdb_datadir_for_ignore_dirs); |
4371 | |
4372 | #if defined(HAVE_PSI_INTERFACE) |
4373 | rdb_collation_exceptions = |
4374 | new Regex_list_handler(key_rwlock_collation_exception_list); |
4375 | #else |
4376 | rdb_collation_exceptions = new Regex_list_handler(); |
4377 | #endif |
4378 | |
4379 | mysql_mutex_init(rdb_sysvars_psi_mutex_key, &rdb_sysvars_mutex, |
4380 | MY_MUTEX_INIT_FAST); |
4381 | Rdb_transaction::init_mutex(); |
4382 | |
4383 | rocksdb_hton->state = SHOW_OPTION_YES; |
4384 | rocksdb_hton->create = rocksdb_create_handler; |
4385 | rocksdb_hton->close_connection = rocksdb_close_connection; |
4386 | |
4387 | rocksdb_hton->prepare = rocksdb_prepare; |
4388 | rocksdb_hton->prepare_ordered = NULL; // Do not need it |
4389 | |
4390 | rocksdb_hton->commit_by_xid = rocksdb_commit_by_xid; |
4391 | rocksdb_hton->rollback_by_xid = rocksdb_rollback_by_xid; |
4392 | rocksdb_hton->recover = rocksdb_recover; |
4393 | |
4394 | rocksdb_hton->commit_ordered= rocksdb_commit_ordered; |
4395 | rocksdb_hton->commit = rocksdb_commit; |
4396 | |
4397 | rocksdb_hton->commit_checkpoint_request= rocksdb_checkpoint_request; |
4398 | |
4399 | rocksdb_hton->rollback = rocksdb_rollback; |
4400 | rocksdb_hton->show_status = rocksdb_show_status; |
4401 | rocksdb_hton->start_consistent_snapshot = |
4402 | rocksdb_start_tx_and_assign_read_view; |
4403 | rocksdb_hton->savepoint_set = rocksdb_savepoint; |
4404 | rocksdb_hton->savepoint_rollback = rocksdb_rollback_to_savepoint; |
4405 | rocksdb_hton->savepoint_rollback_can_release_mdl = |
4406 | rocksdb_rollback_to_savepoint_can_release_mdl; |
4407 | #ifdef MARIAROCKS_NOT_YET |
4408 | rocksdb_hton->update_table_stats = rocksdb_update_table_stats; |
4409 | #endif // MARIAROCKS_NOT_YET |
4410 | |
4411 | /* |
4412 | Not needed in MariaDB: |
4413 | rocksdb_hton->flush_logs = rocksdb_flush_wal; |
4414 | */ |
4415 | |
4416 | rocksdb_hton->flags = HTON_TEMPORARY_NOT_SUPPORTED | |
4417 | HTON_SUPPORTS_EXTENDED_KEYS | HTON_CAN_RECREATE; |
4418 | |
4419 | rocksdb_hton->tablefile_extensions= ha_rocksdb_exts; |
4420 | DBUG_ASSERT(!mysqld_embedded); |
4421 | |
4422 | if (rocksdb_db_options->max_open_files > (long)open_files_limit) { |
4423 | sql_print_information("RocksDB: rocksdb_max_open_files should not be " |
4424 | "greater than the open_files_limit, effective value " |
4425 | "of rocksdb_max_open_files is being set to " |
4426 | "open_files_limit / 2." ); |
4427 | rocksdb_db_options->max_open_files = open_files_limit / 2; |
4428 | } else if (rocksdb_db_options->max_open_files == -2) { |
4429 | rocksdb_db_options->max_open_files = open_files_limit / 2; |
4430 | } |
4431 | |
4432 | rocksdb_stats = rocksdb::CreateDBStatistics(); |
4433 | rocksdb_db_options->statistics = rocksdb_stats; |
4434 | |
4435 | if (rocksdb_rate_limiter_bytes_per_sec != 0) { |
4436 | rocksdb_rate_limiter.reset( |
4437 | rocksdb::NewGenericRateLimiter(rocksdb_rate_limiter_bytes_per_sec)); |
4438 | rocksdb_db_options->rate_limiter = rocksdb_rate_limiter; |
4439 | } |
4440 | |
4441 | rocksdb_db_options->delayed_write_rate = rocksdb_delayed_write_rate; |
4442 | |
4443 | std::shared_ptr<Rdb_logger> myrocks_logger = std::make_shared<Rdb_logger>(); |
4444 | rocksdb::Status s = rocksdb::CreateLoggerFromOptions( |
4445 | rocksdb_datadir, *rocksdb_db_options, &rocksdb_db_options->info_log); |
4446 | if (s.ok()) { |
4447 | myrocks_logger->SetRocksDBLogger(rocksdb_db_options->info_log); |
4448 | } |
4449 | |
4450 | rocksdb_db_options->info_log = myrocks_logger; |
4451 | myrocks_logger->SetInfoLogLevel( |
4452 | static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level)); |
4453 | rocksdb_db_options->wal_dir = rocksdb_wal_dir; |
4454 | |
4455 | rocksdb_db_options->wal_recovery_mode = |
4456 | static_cast<rocksdb::WALRecoveryMode>(rocksdb_wal_recovery_mode); |
4457 | |
4458 | rocksdb_db_options->access_hint_on_compaction_start = |
4459 | static_cast<rocksdb::Options::AccessHint>( |
4460 | rocksdb_access_hint_on_compaction_start); |
4461 | |
4462 | if (rocksdb_db_options->allow_mmap_reads && |
4463 | rocksdb_db_options->use_direct_reads) { |
4464 | // allow_mmap_reads implies !use_direct_reads and RocksDB will not open if |
4465 | // mmap_reads and direct_reads are both on. (NO_LINT_DEBUG) |
4466 | sql_print_error("RocksDB: Can't enable both use_direct_reads " |
4467 | "and allow_mmap_reads\n" ); |
4468 | DBUG_RETURN(HA_EXIT_FAILURE); |
4469 | } |
4470 | |
4471 | if (rocksdb_db_options->allow_mmap_writes && |
4472 | rocksdb_db_options->use_direct_io_for_flush_and_compaction) { |
4473 | // See above comment for allow_mmap_reads. (NO_LINT_DEBUG) |
4474 | sql_print_error("RocksDB: Can't enable both " |
4475 | "use_direct_io_for_flush_and_compaction and " |
4476 | "allow_mmap_writes\n" ); |
4477 | DBUG_RETURN(HA_EXIT_FAILURE); |
4478 | } |
4479 | |
4480 | if (rocksdb_db_options->allow_mmap_writes && |
4481 | rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) { |
4482 | // NO_LINT_DEBUG |
4483 | sql_print_error("RocksDB: rocksdb_flush_log_at_trx_commit needs to be 0 " |
4484 | "to use allow_mmap_writes" ); |
4485 | DBUG_RETURN(HA_EXIT_FAILURE); |
4486 | } |
4487 | |
4488 | // sst_file_manager will move deleted rocksdb sst files to trash_dir |
4489 | // to be deleted in a background thread. |
4490 | std::string trash_dir = std::string(rocksdb_datadir) + "/trash" ; |
4491 | rocksdb_db_options->sst_file_manager.reset(NewSstFileManager( |
4492 | rocksdb_db_options->env, myrocks_logger, trash_dir, |
4493 | rocksdb_sst_mgr_rate_bytes_per_sec, true /* delete_existing_trash */)); |
4494 | |
4495 | std::vector<std::string> cf_names; |
4496 | rocksdb::Status status; |
4497 | status = rocksdb::DB::ListColumnFamilies(*rocksdb_db_options, rocksdb_datadir, |
4498 | &cf_names); |
4499 | if (!status.ok()) { |
4500 | /* |
4501 | When we start on an empty datadir, ListColumnFamilies returns IOError, |
4502 | and RocksDB doesn't provide any way to check what kind of error it was. |
4503 | Checking system errno happens to work right now. |
4504 | */ |
4505 | if (status.IsIOError() |
4506 | #ifndef _WIN32 |
4507 | && errno == ENOENT |
4508 | #endif |
4509 | ) { |
4510 | sql_print_information("RocksDB: Got ENOENT when listing column families" ); |
4511 | sql_print_information( |
4512 | "RocksDB: assuming that we're creating a new database" ); |
4513 | } else { |
4514 | rdb_log_status_error(status, "Error listing column families" ); |
4515 | DBUG_RETURN(HA_EXIT_FAILURE); |
4516 | } |
4517 | } else |
4518 | sql_print_information("RocksDB: %ld column families found" , |
4519 | cf_names.size()); |
4520 | |
4521 | std::vector<rocksdb::ColumnFamilyDescriptor> cf_descr; |
4522 | std::vector<rocksdb::ColumnFamilyHandle *> cf_handles; |
4523 | |
4524 | rocksdb_tbl_options->index_type = |
4525 | (rocksdb::BlockBasedTableOptions::IndexType)rocksdb_index_type; |
4526 | |
4527 | if (!rocksdb_tbl_options->no_block_cache) { |
4528 | std::shared_ptr<rocksdb::Cache> block_cache = rocksdb_use_clock_cache |
4529 | ? rocksdb::NewClockCache(rocksdb_block_cache_size) |
4530 | : rocksdb::NewLRUCache(rocksdb_block_cache_size); |
4531 | if (rocksdb_sim_cache_size > 0) { |
4532 | // Simulated cache enabled |
4533 | // Wrap block cache inside a simulated cache and pass it to RocksDB |
4534 | rocksdb_tbl_options->block_cache = |
4535 | rocksdb::NewSimCache(block_cache, rocksdb_sim_cache_size, 6); |
4536 | } else { |
4537 | // Pass block cache to RocksDB |
4538 | rocksdb_tbl_options->block_cache = block_cache; |
4539 | } |
4540 | } |
4541 | // Using newer BlockBasedTable format version for better compression |
4542 | // and better memory allocation. |
4543 | // See: |
4544 | // https://github.com/facebook/rocksdb/commit/9ab5adfc59a621d12357580c94451d9f7320c2dd |
4545 | rocksdb_tbl_options->format_version = 2; |
4546 | |
4547 | if (rocksdb_collect_sst_properties) { |
4548 | properties_collector_factory = |
4549 | std::make_shared<Rdb_tbl_prop_coll_factory>(&ddl_manager); |
4550 | |
4551 | rocksdb_set_compaction_options(nullptr, nullptr, nullptr, nullptr); |
4552 | |
4553 | RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex); |
4554 | |
4555 | DBUG_ASSERT(rocksdb_table_stats_sampling_pct <= |
4556 | RDB_TBL_STATS_SAMPLE_PCT_MAX); |
4557 | properties_collector_factory->SetTableStatsSamplingPct( |
4558 | rocksdb_table_stats_sampling_pct); |
4559 | |
4560 | RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); |
4561 | } |
4562 | |
4563 | if (rocksdb_persistent_cache_size_mb > 0) { |
4564 | std::shared_ptr<rocksdb::PersistentCache> pcache; |
4565 | uint64_t cache_size_bytes= rocksdb_persistent_cache_size_mb * 1024 * 1024; |
4566 | status = rocksdb::NewPersistentCache( |
4567 | rocksdb::Env::Default(), std::string(rocksdb_persistent_cache_path), |
4568 | cache_size_bytes, myrocks_logger, true, &pcache); |
4569 | if (!status.ok()) { |
4570 | // NO_LINT_DEBUG |
4571 | sql_print_error("RocksDB: Persistent cache returned error: (%s)" , |
4572 | status.getState()); |
4573 | DBUG_RETURN(HA_EXIT_FAILURE); |
4574 | } |
4575 | rocksdb_tbl_options->persistent_cache = pcache; |
4576 | } else if (strlen(rocksdb_persistent_cache_path)) { |
4577 | sql_print_error("RocksDB: Must specify rocksdb_persistent_cache_size_mb" ); |
4578 | DBUG_RETURN(HA_EXIT_FAILURE); |
4579 | } |
4580 | |
4581 | std::unique_ptr<Rdb_cf_options> cf_options_map(new Rdb_cf_options()); |
4582 | if (!cf_options_map->init(*rocksdb_tbl_options, properties_collector_factory, |
4583 | rocksdb_default_cf_options, |
4584 | rocksdb_override_cf_options)) { |
4585 | // NO_LINT_DEBUG |
4586 | sql_print_error("RocksDB: Failed to initialize CF options map." ); |
4587 | DBUG_RETURN(HA_EXIT_FAILURE); |
4588 | } |
4589 | |
4590 | /* |
4591 | If there are no column families, we're creating the new database. |
4592 | Create one column family named "default". |
4593 | */ |
4594 | if (cf_names.size() == 0) |
4595 | cf_names.push_back(DEFAULT_CF_NAME); |
4596 | |
4597 | std::vector<int> compaction_enabled_cf_indices; |
4598 | sql_print_information("RocksDB: Column Families at start:" ); |
4599 | for (size_t i = 0; i < cf_names.size(); ++i) { |
4600 | rocksdb::ColumnFamilyOptions opts; |
4601 | cf_options_map->get_cf_options(cf_names[i], &opts); |
4602 | |
4603 | sql_print_information(" cf=%s" , cf_names[i].c_str()); |
4604 | sql_print_information(" write_buffer_size=%ld" , opts.write_buffer_size); |
4605 | sql_print_information(" target_file_size_base=%" PRIu64, |
4606 | opts.target_file_size_base); |
4607 | |
4608 | /* |
4609 | Temporarily disable compactions to prevent a race condition where |
4610 | compaction starts before compaction filter is ready. |
4611 | */ |
4612 | if (!opts.disable_auto_compactions) { |
4613 | compaction_enabled_cf_indices.push_back(i); |
4614 | opts.disable_auto_compactions = true; |
4615 | } |
4616 | cf_descr.push_back(rocksdb::ColumnFamilyDescriptor(cf_names[i], opts)); |
4617 | } |
4618 | |
4619 | rocksdb::Options main_opts(*rocksdb_db_options, |
4620 | cf_options_map->get_defaults()); |
4621 | |
4622 | rocksdb::TransactionDBOptions tx_db_options; |
4623 | tx_db_options.transaction_lock_timeout = 2; // 2 seconds |
4624 | tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>(); |
4625 | |
4626 | status = |
4627 | check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr); |
4628 | |
4629 | // We won't start if we'll determine that there's a chance of data corruption |
4630 | // because of incompatible options. |
4631 | if (!status.ok()) { |
4632 | rdb_log_status_error( |
4633 | status, "Compatibility check against existing database options failed" ); |
4634 | DBUG_RETURN(HA_EXIT_FAILURE); |
4635 | } |
4636 | |
4637 | status = rocksdb::TransactionDB::Open( |
4638 | main_opts, tx_db_options, rocksdb_datadir, cf_descr, &cf_handles, &rdb); |
4639 | |
4640 | if (!status.ok()) { |
4641 | rdb_log_status_error(status, "Error opening instance" ); |
4642 | DBUG_RETURN(HA_EXIT_FAILURE); |
4643 | } |
4644 | cf_manager.init(std::move(cf_options_map), &cf_handles); |
4645 | |
4646 | if (dict_manager.init(rdb->GetBaseDB(), &cf_manager)) { |
4647 | // NO_LINT_DEBUG |
4648 | sql_print_error("RocksDB: Failed to initialize data dictionary." ); |
4649 | DBUG_RETURN(HA_EXIT_FAILURE); |
4650 | } |
4651 | |
4652 | if (binlog_manager.init(&dict_manager)) { |
4653 | // NO_LINT_DEBUG |
4654 | sql_print_error("RocksDB: Failed to initialize binlog manager." ); |
4655 | DBUG_RETURN(HA_EXIT_FAILURE); |
4656 | } |
4657 | |
4658 | if (ddl_manager.init(&dict_manager, &cf_manager, rocksdb_validate_tables)) { |
4659 | // NO_LINT_DEBUG |
4660 | sql_print_error("RocksDB: Failed to initialize DDL manager." ); |
4661 | DBUG_RETURN(HA_EXIT_FAILURE); |
4662 | } |
4663 | |
4664 | Rdb_sst_info::init(rdb); |
4665 | |
4666 | /* |
4667 | Enable auto compaction, things needed for compaction filter are finished |
4668 | initializing |
4669 | */ |
4670 | std::vector<rocksdb::ColumnFamilyHandle *> compaction_enabled_cf_handles; |
4671 | compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size()); |
4672 | for (const auto &index : compaction_enabled_cf_indices) { |
4673 | compaction_enabled_cf_handles.push_back(cf_handles[index]); |
4674 | } |
4675 | |
4676 | status = rdb->EnableAutoCompaction(compaction_enabled_cf_handles); |
4677 | |
4678 | if (!status.ok()) { |
4679 | rdb_log_status_error(status, "Error enabling compaction" ); |
4680 | DBUG_RETURN(HA_EXIT_FAILURE); |
4681 | } |
4682 | |
4683 | auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME |
4684 | #ifdef HAVE_PSI_INTERFACE |
4685 | , |
4686 | rdb_background_psi_thread_key |
4687 | #endif |
4688 | ); |
4689 | if (err != 0) { |
4690 | sql_print_error("RocksDB: Couldn't start the background thread: (errno=%d)" , |
4691 | err); |
4692 | DBUG_RETURN(HA_EXIT_FAILURE); |
4693 | } |
4694 | |
4695 | err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME |
4696 | #ifdef HAVE_PSI_INTERFACE |
4697 | , |
4698 | rdb_drop_idx_psi_thread_key |
4699 | #endif |
4700 | ); |
4701 | if (err != 0) { |
4702 | sql_print_error("RocksDB: Couldn't start the drop index thread: (errno=%d)" , |
4703 | err); |
4704 | DBUG_RETURN(HA_EXIT_FAILURE); |
4705 | } |
4706 | |
4707 | rdb_set_collation_exception_list(rocksdb_strict_collation_exceptions); |
4708 | |
4709 | if (rocksdb_pause_background_work) { |
4710 | rdb->PauseBackgroundWork(); |
4711 | } |
4712 | |
4713 | // NO_LINT_DEBUG |
4714 | sql_print_information("RocksDB: global statistics using %s indexer" , |
4715 | STRINGIFY_ARG(RDB_INDEXER)); |
4716 | #if defined(HAVE_SCHED_GETCPU) |
4717 | if (sched_getcpu() == -1) { |
4718 | // NO_LINT_DEBUG |
4719 | sql_print_information( |
4720 | "RocksDB: sched_getcpu() failed - " |
4721 | "global statistics will use thread_id_indexer_t instead" ); |
4722 | } |
4723 | #endif |
4724 | |
4725 | /** |
4726 | Rocksdb does not always shutdown its threads, when |
4727 | plugin is shut down. Disable server's leak check |
4728 | at exit to avoid crash. |
4729 | */ |
4730 | my_disable_leak_check = true; |
4731 | |
4732 | err = my_error_register(rdb_get_error_messages, HA_ERR_ROCKSDB_FIRST, |
4733 | HA_ERR_ROCKSDB_LAST); |
4734 | if (err != 0) { |
4735 | // NO_LINT_DEBUG |
4736 | sql_print_error("RocksDB: Couldn't initialize error messages" ); |
4737 | rdb_open_tables.m_hash.~Rdb_table_set(); |
4738 | DBUG_RETURN(HA_EXIT_FAILURE); |
4739 | } |
4740 | |
4741 | |
4742 | |
4743 | // Creating an instance of HistogramImpl should only happen after RocksDB |
4744 | // has been successfully initialized. |
4745 | commit_latency_stats = new rocksdb::HistogramImpl(); |
4746 | |
4747 | // Construct a list of directories which will be monitored by I/O watchdog |
4748 | // to make sure that we won't lose write access to them. |
4749 | std::vector<std::string> directories; |
4750 | |
4751 | // 1. Data directory. |
4752 | directories.push_back(mysql_real_data_home); |
4753 | |
4754 | // 2. Transaction logs. |
4755 | if (myrocks::rocksdb_wal_dir && *myrocks::rocksdb_wal_dir) { |
4756 | directories.push_back(myrocks::rocksdb_wal_dir); |
4757 | } |
4758 | |
4759 | #if !defined(_WIN32) && !defined(__APPLE__) |
4760 | io_watchdog = new Rdb_io_watchdog(directories); |
4761 | io_watchdog->reset_timeout(rocksdb_io_write_timeout_secs); |
4762 | #endif |
4763 | |
4764 | // NO_LINT_DEBUG |
4765 | sql_print_information("MyRocks storage engine plugin has been successfully " |
4766 | "initialized." ); |
4767 | |
4768 | DBUG_RETURN(HA_EXIT_SUCCESS); |
4769 | } |
4770 | |
4771 | /* |
4772 | Storage Engine deinitialization function, invoked when plugin is unloaded. |
4773 | */ |
4774 | |
static int rocksdb_done_func(void *const p) {
  DBUG_ENTER_FUNC();

  // 'p' is the plugin argument supplied by the plugin API; unused here.
  int error = 0;

  // signal the drop index thread to stop
  rdb_drop_idx_thread.signal(true);

  // Flush all memtables for not losing data, even if WAL is disabled.
  rocksdb_flush_all_memtables();

  // Stop all rocksdb background work
  CancelAllBackgroundWork(rdb->GetBaseDB(), true);

  // Signal the background thread to stop and to persist all stats collected
  // from background flushes and compactions. This will add more keys to a new
  // memtable, but since the memtables were just flushed, it should not trigger
  // a flush that can stall due to background threads being stopped. As long
  // as these keys are stored in a WAL file, they can be retrieved on restart.
  rdb_bg_thread.signal(true);

  // Wait for the background thread to finish.
  auto err = rdb_bg_thread.join();
  if (err != 0) {
    // We'll log the message and continue because we're shutting down and
    // continuation is the optimal strategy.
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the background thread: (errno=%d)" ,
                    err);
  }

  // Wait for the drop index thread to finish.
  err = rdb_drop_idx_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the index thread: (errno=%d)" , err);
  }

  if (rdb_open_tables.m_hash.size()) {
    // Looks like we are getting unloaded and yet we have some open tables
    // left behind. Report the problem to the caller via a non-zero return.
    error = 1;
  }

  /*
    MariaDB: When the plugin is unloaded with UNINSTALL SONAME command, some
    connections may still have Rdb_transaction objects.

    These objects are not genuine transactions (as SQL layer makes sure that
    a plugin that is being unloaded has no open tables), they are empty
    Rdb_transaction objects that were left there to save on object
    creation/deletion.

    Go through the list and delete them.
  */
  {
    // Walker that collects all leftover Rdb_transaction objects; they are
    // deleted after the walk completes (deleting during the walk would
    // mutate the list being traversed).
    class Rdb_trx_deleter: public Rdb_tx_list_walker {
    public:
      std::set<Rdb_transaction*> rdb_trxs;

      void process_tran(const Rdb_transaction *const tx) override {
        /*
          Check if the transaction is really empty. We only check
          non-WriteBatch-based transactions, because there is no easy way to
          check WriteBatch-based transactions.
        */
        if (!tx->is_writebatch_trx()) {
          const auto tx_impl = static_cast<const Rdb_transaction_impl *>(tx);
          DBUG_ASSERT(tx_impl);
          if (tx_impl->get_rdb_trx())
            DBUG_ASSERT(0);
        }
        rdb_trxs.insert((Rdb_transaction*)tx);
      };
    } deleter;

    Rdb_transaction::walk_tx_list(&deleter);

    for (std::set<Rdb_transaction*>::iterator it= deleter.rdb_trxs.begin();
         it != deleter.rdb_trxs.end();
         ++it)
    {
      // When a transaction is deleted, it removes itself from s_tx_list.
      delete *it;
    }
  }

  /*
    destructors for static objects can be called at _exit(),
    but we want to free the memory at dlclose()
  */
  rdb_open_tables.m_hash.~Rdb_table_set();
  mysql_mutex_destroy(&rdb_open_tables.m_mutex);
  mysql_mutex_destroy(&rdb_sysvars_mutex);


  delete rdb_collation_exceptions;

  mysql_mutex_destroy(&rdb_collation_data_mutex);
  mysql_mutex_destroy(&rdb_mem_cmp_space_mutex);

  Rdb_transaction::term_mutex();

  for (auto &it : rdb_collation_data) {
    delete it;
    it = nullptr;
  }

  ddl_manager.cleanup();
  binlog_manager.cleanup();
  dict_manager.cleanup();
  cf_manager.cleanup();

  delete rdb;
  rdb = nullptr;

  delete commit_latency_stats;
  commit_latency_stats = nullptr;

#if !defined(_WIN32) && !defined(__APPLE__)
  delete io_watchdog;
  io_watchdog = nullptr;
#endif

  // Disown the cache data since we're shutting down.
  // This results in memory leaks but it improved the shutdown time.
  // Don't disown when running under valgrind
#ifndef HAVE_purify
  if (rocksdb_tbl_options->block_cache) {
    rocksdb_tbl_options->block_cache->DisownData();
  }
#endif /* HAVE_purify */

  /*
    MariaDB: don't clear rocksdb_db_options and rocksdb_tbl_options.
    MyRocks' plugin variables refer to them.

    The plugin cannot be loaded again (see prevent_myrocks_loading) but plugin
    variables are processed before myrocks::rocksdb_init_func is invoked, so
    they must point to valid memory.
  */
  //rocksdb_db_options = nullptr;
  rocksdb_db_options->statistics = nullptr;
  //rocksdb_tbl_options = nullptr;
  rocksdb_stats = nullptr;

  my_error_unregister(HA_ERR_ROCKSDB_FIRST, HA_ERR_ROCKSDB_LAST);

  /*
    Prevent loading the plugin after it has been loaded and then unloaded. This
    doesn't work currently.
  */
  prevent_myrocks_loading= true;

  DBUG_RETURN(error);
}
4931 | |
4932 | static inline void rocksdb_smart_seek(bool seek_backward, |
4933 | rocksdb::Iterator *const iter, |
4934 | const rocksdb::Slice &key_slice) { |
4935 | if (seek_backward) { |
4936 | iter->SeekForPrev(key_slice); |
4937 | } else { |
4938 | iter->Seek(key_slice); |
4939 | } |
4940 | } |
4941 | |
4942 | static inline void rocksdb_smart_next(bool seek_backward, |
4943 | rocksdb::Iterator *const iter) { |
4944 | if (seek_backward) { |
4945 | iter->Prev(); |
4946 | } else { |
4947 | iter->Next(); |
4948 | } |
4949 | } |
4950 | |
#ifndef NDEBUG
// simulate that RocksDB has reported corrupted data
// (debug-only hook fired via the "rocksdb_return_status_corrupted"
// DBUG_EXECUTE_IF point)
static void dbug_change_status_to_corrupted(rocksdb::Status *status) {
  *status = rocksdb::Status::Corruption();
}
#endif
4957 | |
4958 | // If the iterator is not valid it might be because of EOF but might be due |
4959 | // to IOError or corruption. The good practice is always check it. |
4960 | // https://github.com/facebook/rocksdb/wiki/Iterator#error-handling |
4961 | static inline bool is_valid(rocksdb::Iterator *scan_it) { |
4962 | if (scan_it->Valid()) { |
4963 | return true; |
4964 | } else { |
4965 | rocksdb::Status s = scan_it->status(); |
4966 | DBUG_EXECUTE_IF("rocksdb_return_status_corrupted" , |
4967 | dbug_change_status_to_corrupted(&s);); |
4968 | if (s.IsIOError() || s.IsCorruption()) { |
4969 | if (s.IsCorruption()) { |
4970 | rdb_persist_corruption_marker(); |
4971 | } |
4972 | rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL); |
4973 | } |
4974 | return false; |
4975 | } |
4976 | } |
4977 | |
4978 | /** |
4979 | @brief |
4980 | Example of simple lock controls. The "table_handler" it creates is a |
4981 | structure we will pass to each ha_rocksdb handler. Do you have to have |
4982 | one of these? Well, you have pieces that are used for locking, and |
4983 | they are needed to function. |
4984 | */ |
4985 | |
Rdb_table_handler *
Rdb_open_tables_map::get_table_handler(const char *const table_name) {
  Rdb_table_handler *table_handler;
  uint length;
  char *tmp_name;

  DBUG_ASSERT(table_name != nullptr);
  length = (uint)strlen(table_name);

  // First, look up the table in the hash map.
  RDB_MUTEX_LOCK_CHECK(m_mutex);
  if (!m_hash.size() || !(table_handler = m_hash.find(table_name, length))) {
    // Since we did not find it in the hash map, attempt to create and add it
    // to the hash map.
    // my_multi_malloc carves the handler struct and the copy of the table
    // name out of one allocation, so a single my_free() releases both.
    if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(my_multi_malloc(
              MYF(MY_WME | MY_ZEROFILL), &table_handler, sizeof(*table_handler),
              &tmp_name, length + 1, NullS)))) {
      // Allocating a new Rdb_table_handler and a new table name failed.
      RDB_MUTEX_UNLOCK_CHECK(m_mutex);
      return nullptr;
    }

    table_handler->m_ref_count = 0;
    table_handler->m_table_name_length = length;
    table_handler->m_table_name = tmp_name;
    strmov(table_handler->m_table_name, table_name);

    if (m_hash.insert(table_handler)) {
      // Inserting into the hash map failed.
      RDB_MUTEX_UNLOCK_CHECK(m_mutex);
      my_free(table_handler);
      return nullptr;
    }

    thr_lock_init(&table_handler->m_thr_lock);
#ifdef MARIAROCKS_NOT_YET
    table_handler->m_io_perf_read.init();
    table_handler->m_io_perf_write.init();
#endif
  }
  // Each caller takes one reference; it is dropped in release_table_handler().
  DBUG_ASSERT(table_handler->m_ref_count >= 0);
  table_handler->m_ref_count++;

  RDB_MUTEX_UNLOCK_CHECK(m_mutex);

  return table_handler;
}
5033 | |
5034 | std::vector<std::string> rdb_get_open_table_names(void) { |
5035 | return rdb_open_tables.get_table_names(); |
5036 | } |
5037 | |
5038 | std::vector<std::string> Rdb_open_tables_map::get_table_names(void) const { |
5039 | size_t i; |
5040 | const Rdb_table_handler *table_handler; |
5041 | std::vector<std::string> names; |
5042 | |
5043 | RDB_MUTEX_LOCK_CHECK(m_mutex); |
5044 | for (i = 0; (table_handler = m_hash.at(i)); i++) { |
5045 | DBUG_ASSERT(table_handler != nullptr); |
5046 | names.push_back(table_handler->m_table_name); |
5047 | } |
5048 | DBUG_ASSERT(i == m_hash.size()); |
5049 | RDB_MUTEX_UNLOCK_CHECK(m_mutex); |
5050 | |
5051 | return names; |
5052 | } |
5053 | |
5054 | /* |
5055 | Inspired by innobase_get_int_col_max_value from InnoDB. This returns the |
5056 | maximum value a type can take on. |
5057 | */ |
5058 | static ulonglong rdb_get_int_col_max_value(const Field *field) { |
5059 | ulonglong max_value = 0; |
5060 | switch (field->key_type()) { |
5061 | case HA_KEYTYPE_BINARY: |
5062 | max_value = 0xFFULL; |
5063 | break; |
5064 | case HA_KEYTYPE_INT8: |
5065 | max_value = 0x7FULL; |
5066 | break; |
5067 | case HA_KEYTYPE_USHORT_INT: |
5068 | max_value = 0xFFFFULL; |
5069 | break; |
5070 | case HA_KEYTYPE_SHORT_INT: |
5071 | max_value = 0x7FFFULL; |
5072 | break; |
5073 | case HA_KEYTYPE_UINT24: |
5074 | max_value = 0xFFFFFFULL; |
5075 | break; |
5076 | case HA_KEYTYPE_INT24: |
5077 | max_value = 0x7FFFFFULL; |
5078 | break; |
5079 | case HA_KEYTYPE_ULONG_INT: |
5080 | max_value = 0xFFFFFFFFULL; |
5081 | break; |
5082 | case HA_KEYTYPE_LONG_INT: |
5083 | max_value = 0x7FFFFFFFULL; |
5084 | break; |
5085 | case HA_KEYTYPE_ULONGLONG: |
5086 | max_value = 0xFFFFFFFFFFFFFFFFULL; |
5087 | break; |
5088 | case HA_KEYTYPE_LONGLONG: |
5089 | max_value = 0x7FFFFFFFFFFFFFFFULL; |
5090 | break; |
5091 | case HA_KEYTYPE_FLOAT: |
5092 | max_value = 0x1000000ULL; |
5093 | break; |
5094 | case HA_KEYTYPE_DOUBLE: |
5095 | max_value = 0x20000000000000ULL; |
5096 | break; |
5097 | default: |
5098 | abort(); |
5099 | } |
5100 | |
5101 | return max_value; |
5102 | } |
5103 | |
// Initialize m_tbl_def->m_auto_incr_val from the data dictionary and/or the
// last row in the index, falling back to 1 for an empty table.
void ha_rocksdb::load_auto_incr_value() {
  ulonglong auto_incr = 0;
  bool validate_last = false, use_datadic = true;
#ifndef NDEBUG
  // In debug builds we always also read the index so the dictionary value
  // can be cross-checked; the dbug point simulates a pre-upgrade server
  // that has no persisted auto-increment value.
  DBUG_EXECUTE_IF("myrocks_autoinc_upgrade" , use_datadic = false;);
  validate_last = true;
#endif

  if (use_datadic && dict_manager.get_auto_incr_val(
                         m_tbl_def->get_autoincr_gl_index_id(), &auto_incr)) {
    update_auto_incr_val(auto_incr);
  }

  // If we find nothing in the data dictionary, or if we are in debug mode,
  // then call index_last to get the last value.
  //
  // This is needed when upgrading from a server that did not support
  // persistent auto_increment, or if the table is empty.
  //
  // For debug mode, we are just verifying that the data dictionary value is
  // greater than or equal to the maximum value in the table.
  if (auto_incr == 0 || validate_last) {
    auto_incr = load_auto_incr_value_from_index();
    update_auto_incr_val(auto_incr);
  }

  // If we failed to find anything from the data dictionary and index, then
  // initialize auto_increment to 1.
  if (m_tbl_def->m_auto_incr_val == 0) {
    update_auto_incr_val(1);
  }
}
5136 | |
// Read the last (largest) auto-increment value present in the table's
// auto-increment index and return it + 1 (capped at the column's max value).
// Returns 0 if the index is empty. Temporarily switches active_index and
// keyread mode; both are restored before returning.
ulonglong ha_rocksdb::load_auto_incr_value_from_index() {
  const int save_active_index = active_index;
  active_index = table->s->next_number_index;
  const uint8 save_table_status = table->status;
  ulonglong last_val = 0;

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();
  if (is_new_snapshot) {
    tx->acquire_snapshot(true);
  }

  // Do a lookup. We only need index column, so it should be index-only.
  // (another reason to make it index-only is that table->read_set is not set
  // appropriately and non-index-only lookup will not read the value)
  const bool save_keyread_only = m_keyread_only;
  m_keyread_only = true;
  m_key_requested = true;

  if (!index_last(table->record[0])) {
    Field *field =
        table->key_info[table->s->next_number_index].key_part[0].field;
    ulonglong max_val = rdb_get_int_col_max_value(field);
    // Temporarily mark all columns readable so val_int() passes the
    // debug-build read_set checks.
    my_bitmap_map *const old_map =
        dbug_tmp_use_all_columns(table, table->read_set);
    last_val = field->val_int();
    if (last_val != max_val) {
      last_val++;
    }
#ifndef NDEBUG
    // Debug cross-check: the persisted dictionary value must never be
    // behind the largest value actually stored in the index.
    ulonglong dd_val;
    if (last_val <= max_val) {
      const auto &gl_index_id = m_tbl_def->get_autoincr_gl_index_id();
      if (dict_manager.get_auto_incr_val(gl_index_id, &dd_val) &&
          tx->get_auto_incr(gl_index_id) == 0) {
        DBUG_ASSERT(dd_val >= last_val);
      }
    }
#endif
    dbug_tmp_restore_column_map(table->read_set, old_map);
  }

  m_keyread_only = save_keyread_only;
  if (is_new_snapshot) {
    tx->release_snapshot();
  }

  table->status = save_table_status;
  active_index = save_active_index;

  /*
    Do what ha_rocksdb::index_end() does.
    (Why don't we use index_init/index_end? class handler defines index_init
    as private, for some reason).
  */
  release_scan_iterator();

  return last_val;
}
5196 | |
// Lock-free monotonic update: raise m_tbl_def->m_auto_incr_val to at least
// 'val'. On CAS failure compare_exchange_weak reloads auto_incr_val with the
// current stored value, so each iteration re-checks against fresh data; the
// loop exits as soon as the stored value is already >= val.
void ha_rocksdb::update_auto_incr_val(ulonglong val) {
  ulonglong auto_incr_val = m_tbl_def->m_auto_incr_val;
  while (
      auto_incr_val < val &&
      !m_tbl_def->m_auto_incr_val.compare_exchange_weak(auto_incr_val, val)) {
    // Do nothing - just loop until auto_incr_val is >= val or we successfully
    // set it
  }
}
5206 | |
5207 | void ha_rocksdb::update_auto_incr_val_from_field() { |
5208 | Field *field; |
5209 | ulonglong new_val, max_val; |
5210 | field = table->key_info[table->s->next_number_index].key_part[0].field; |
5211 | max_val = rdb_get_int_col_max_value(field); |
5212 | |
5213 | my_bitmap_map *const old_map = |
5214 | dbug_tmp_use_all_columns(table, table->read_set); |
5215 | new_val = field->val_int(); |
5216 | // don't increment if we would wrap around |
5217 | if (new_val != max_val) { |
5218 | new_val++; |
5219 | } |
5220 | |
5221 | dbug_tmp_restore_column_map(table->read_set, old_map); |
5222 | |
5223 | // Only update if positive value was set for auto_incr column. |
5224 | if (new_val <= max_val) { |
5225 | Rdb_transaction *const tx = get_or_create_tx(table->in_use); |
5226 | tx->set_auto_incr(m_tbl_def->get_autoincr_gl_index_id(), new_val); |
5227 | |
5228 | // Update the in memory auto_incr value in m_tbl_def. |
5229 | update_auto_incr_val(new_val); |
5230 | } |
5231 | } |
5232 | |
// Initialize m_tbl_def->m_hidden_pk_val from the last hidden-PK value stored
// in the table (or 1 for an empty table). Uses a fresh transaction so the
// read sees the latest committed data rather than an older open snapshot.
// Returns HA_EXIT_SUCCESS or an error code from decoding the row key.
int ha_rocksdb::load_hidden_pk_value() {
  const int save_active_index = active_index;
  // The hidden PK is always the last key in m_tbl_def.
  active_index = m_tbl_def->m_key_count - 1;
  const uint8 save_table_status = table->status;

  /*
    We should read the latest committed value in the database.
    That is, if we have an open transaction with a snapshot, we should not use
    it as we may get old data. Start a new transaction to read the latest
    value.
  */
  Rdb_transaction *const temp_tx = new Rdb_transaction_impl(table->in_use);
  temp_tx->start_tx();
  // Swap the thread's transaction slot to the temporary transaction; the
  // original is restored (and temp_tx deleted) before returning.
  Rdb_transaction *&tx = get_tx_from_thd(table->in_use);
  Rdb_transaction *save_tx= tx;
  tx= temp_tx;

  longlong hidden_pk_id = 1;
  // Do a lookup.
  if (!index_last(table->record[0])) {
    /*
      Decode PK field from the key
    */
    auto err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
    if (err) {
      // Restore the original transaction before bailing out.
      delete tx;
      tx= save_tx;
      return err;
    }

    hidden_pk_id++;
  }

  // Lock-free monotonic update of m_hidden_pk_val (same CAS pattern as
  // update_auto_incr_val()).
  longlong old = m_tbl_def->m_hidden_pk_val;
  while (old < hidden_pk_id &&
         !m_tbl_def->m_hidden_pk_val.compare_exchange_weak(old, hidden_pk_id)) {
  }

  delete tx;
  tx= save_tx;

  table->status = save_table_status;
  active_index = save_active_index;

  release_scan_iterator();

  return HA_EXIT_SUCCESS;
}
5281 | |
5282 | /* Get PK value from m_tbl_def->m_hidden_pk_info. */ |
5283 | longlong ha_rocksdb::update_hidden_pk_val() { |
5284 | DBUG_ASSERT(has_hidden_pk(table)); |
5285 | const longlong new_val = m_tbl_def->m_hidden_pk_val++; |
5286 | return new_val; |
5287 | } |
5288 | |
5289 | /* Get the id of the hidden pk id from m_last_rowkey */ |
5290 | int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) { |
5291 | DBUG_ASSERT(hidden_pk_id != nullptr); |
5292 | DBUG_ASSERT(table != nullptr); |
5293 | DBUG_ASSERT(has_hidden_pk(table)); |
5294 | |
5295 | rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(), m_last_rowkey.length()); |
5296 | |
5297 | // Get hidden primary key from old key slice |
5298 | Rdb_string_reader reader(&rowkey_slice); |
5299 | if ((!reader.read(Rdb_key_def::INDEX_NUMBER_SIZE))) |
5300 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5301 | |
5302 | const int length= 8; /* was Field_longlong::PACK_LENGTH in FB MySQL tree */ |
5303 | const uchar *from = reinterpret_cast<const uchar *>(reader.read(length)); |
5304 | if (from == nullptr) { |
5305 | /* Mem-comparable image doesn't have enough bytes */ |
5306 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5307 | } |
5308 | |
5309 | *hidden_pk_id = rdb_netbuf_read_uint64(&from); |
5310 | return HA_EXIT_SUCCESS; |
5311 | } |
5312 | |
5313 | /** |
5314 | @brief |
5315 | Free lock controls. We call this whenever we close a table. If the table had |
5316 | the last reference to the table_handler, then we free the memory associated |
5317 | with it. |
5318 | */ |
5319 | |
5320 | void Rdb_open_tables_map::release_table_handler( |
5321 | Rdb_table_handler *const table_handler) { |
5322 | RDB_MUTEX_LOCK_CHECK(m_mutex); |
5323 | |
5324 | DBUG_ASSERT(table_handler != nullptr); |
5325 | DBUG_ASSERT(table_handler->m_ref_count > 0); |
5326 | if (!--table_handler->m_ref_count) { |
5327 | // Last reference was released. Tear down the hash entry. |
5328 | const auto ret MY_ATTRIBUTE((__unused__)) = m_hash.remove(table_handler); |
5329 | DBUG_ASSERT(!ret); // the hash entry must actually be found and deleted |
5330 | my_core::thr_lock_delete(&table_handler->m_thr_lock); |
5331 | my_free(table_handler); |
5332 | } |
5333 | |
5334 | RDB_MUTEX_UNLOCK_CHECK(m_mutex); |
5335 | } |
5336 | |
5337 | static handler *rocksdb_create_handler(my_core::handlerton *const hton, |
5338 | my_core::TABLE_SHARE *const table_arg, |
5339 | my_core::MEM_ROOT *const mem_root) { |
5340 | return new (mem_root) ha_rocksdb(hton, table_arg); |
5341 | } |
5342 | |
// Constructor: initialize all handler state to "not opened" defaults.
// All buffers/descriptors are null here; they are allocated when the table
// is opened.
ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
                       my_core::TABLE_SHARE *const table_arg)
    : handler(hton, table_arg), m_table_handler(nullptr), m_scan_it(nullptr),
      m_scan_it_skips_bloom(false), m_scan_it_snapshot(nullptr),
      m_tbl_def(nullptr), m_pk_descr(nullptr), m_key_descr_arr(nullptr),
      m_pk_can_be_decoded(false), m_maybe_unpack_info(false),
      m_pk_tuple(nullptr), m_pk_packed_tuple(nullptr),
      m_sk_packed_tuple(nullptr), m_end_key_packed_tuple(nullptr),
      m_sk_match_prefix(nullptr), m_sk_match_prefix_buf(nullptr),
      m_sk_packed_tuple_old(nullptr), m_dup_sk_packed_tuple(nullptr),
      m_dup_sk_packed_tuple_old(nullptr), m_eq_cond_lower_bound(nullptr),
      m_eq_cond_upper_bound(nullptr), m_pack_buffer(nullptr),
      m_lock_rows(RDB_LOCK_NONE), m_keyread_only(FALSE), m_encoder_arr(nullptr),
      m_row_checksums_checked(0), m_in_rpl_delete_rows(false),
      m_in_rpl_update_rows(false), m_force_skip_unique_check(false) {}
5358 | |
5359 | |
5360 | const std::string &ha_rocksdb::get_table_basename() const { |
5361 | return m_tbl_def->base_tablename(); |
5362 | } |
5363 | |
5364 | /** |
5365 | @return |
5366 | false OK |
  other Error unpacking the data
5368 | */ |
5369 | bool ha_rocksdb::init_with_fields() { |
5370 | DBUG_ENTER_FUNC(); |
5371 | |
5372 | const uint pk = table_share->primary_key; |
5373 | if (pk != MAX_KEY) { |
5374 | const uint key_parts = table_share->key_info[pk].user_defined_key_parts; |
5375 | check_keyread_allowed(pk /*PK*/, key_parts - 1, true); |
5376 | } else |
5377 | m_pk_can_be_decoded = false; |
5378 | |
5379 | cached_table_flags = table_flags(); |
5380 | |
5381 | DBUG_RETURN(false); /* Ok */ |
5382 | } |
5383 | |
5384 | /* |
5385 | If the key is a TTL key, we may need to filter it out. |
5386 | |
5387 | The purpose of read filtering for tables with TTL is to ensure that |
5388 | during a transaction a key which has expired already but not removed by |
5389 | compaction yet is not returned to the user. |
5390 | |
5391 | Without this the user might be hit with problems such as disappearing |
5392 | rows within a transaction, etc, because the compaction filter ignores |
5393 | snapshots when filtering keys. |
5394 | */ |
// Decide whether a TTL record should be hidden from the current read.
// @param kd           key definition (must have TTL enabled)
// @param ttl_rec_val  the record's value slice, which carries the 8-byte
//                     TTL timestamp at kd.m_ttl_rec_offset
// @param curr_ts      current/snapshot timestamp to compare against
// @return true if the record has expired and must be filtered out
bool ha_rocksdb::should_hide_ttl_rec(const Rdb_key_def &kd,
                                     const rocksdb::Slice &ttl_rec_val,
                                     const int64_t curr_ts) {
  DBUG_ASSERT(kd.has_ttl());
  DBUG_ASSERT(kd.m_ttl_rec_offset != UINT_MAX);

  /*
    Curr_ts can only be 0 if there are no snapshots open.
    should_hide_ttl_rec can only be called when there is >=1 snapshots, unless
    we are filtering on the write path (single INSERT/UPDATE) in which case
    we are passed in the current time as curr_ts.

    In the event curr_ts is 0, we always decide not to filter the record and
    increment a diagnostic counter.
  */
  if (curr_ts == 0) {
    update_row_stats(ROWS_HIDDEN_NO_SNAPSHOT);
    return false;
  }

  if (!rdb_is_ttl_read_filtering_enabled() || !rdb_is_ttl_enabled()) {
    return false;
  }

  Rdb_string_reader reader(&ttl_rec_val);

  /*
    Find where the 8-byte ttl is for each record in this index.
  */
  uint64 ts;
  if (!reader.read(kd.m_ttl_rec_offset) || reader.read_uint64(&ts)) {
    /*
      This condition should never be reached since all TTL records have an
      8 byte ttl field in front. Don't filter the record out, and log an error.
    */
    std::string buf;
    buf = rdb_hexdump(ttl_rec_val.data(), ttl_rec_val.size(),
                      RDB_MAX_HEXDUMP_LEN);
    const GL_INDEX_ID gl_index_id = kd.get_gl_index_id();
    // NO_LINT_DEBUG
    sql_print_error("Decoding ttl from PK value failed, "
                    "for index (%u,%u), val: %s" ,
                    gl_index_id.cf_id, gl_index_id.index_id, buf.c_str());
    DBUG_ASSERT(0);
    return false;
  }

  /* Hide record if it has expired before the current snapshot time. */
  uint64 read_filter_ts = 0;
#ifndef NDEBUG
  // Debug-only hook letting tests shift the filtering clock forward.
  read_filter_ts += rdb_dbug_set_ttl_read_filter_ts();
#endif
  bool is_hide_ttl =
      ts + kd.m_ttl_duration + read_filter_ts <= static_cast<uint64>(curr_ts);
  if (is_hide_ttl) {
    update_row_stats(ROWS_FILTERED);
  }
  return is_hide_ttl;
}
5454 | |
5455 | void ha_rocksdb::rocksdb_skip_expired_records(const Rdb_key_def &kd, |
5456 | rocksdb::Iterator *const iter, |
5457 | bool seek_backward) { |
5458 | if (kd.has_ttl()) { |
5459 | while (iter->Valid() && |
5460 | should_hide_ttl_rec( |
5461 | kd, iter->value(), |
5462 | get_or_create_tx(table->in_use)->m_snapshot_timestamp)) { |
5463 | rocksdb_smart_next(seek_backward, iter); |
5464 | } |
5465 | } |
5466 | } |
5467 | |
5468 | /** |
5469 | Convert record from table->record[0] form into a form that can be written |
5470 | into rocksdb. |
5471 | |
5472 | @param pk_packed_slice Packed PK tuple. We need it in order to compute |
5473 | and store its CRC. |
5474 | @param packed_rec OUT Data slice with record data. |
5475 | */ |
5476 | |
// Pack table->record[0] into m_storage_record and expose it via *packed_rec.
// Value layout: [8-byte TTL (only if the PK has TTL)] [null-bitmap bytes]
// [unpack_info (only if m_maybe_unpack_info)] [field data for STORE_ALL
// fields] [optional checksum tag + key/value CRC32s].
// Returns HA_EXIT_SUCCESS or HA_EXIT_FAILURE (TTL decode failure).
int ha_rocksdb::convert_record_to_storage_format(
    const struct update_row_info &row_info, rocksdb::Slice *const packed_rec) {
  DBUG_ASSERT_IMP(m_maybe_unpack_info, row_info.new_pk_unpack_info);
  DBUG_ASSERT(m_pk_descr != nullptr);

  const rocksdb::Slice &pk_packed_slice = row_info.new_pk_slice;
  Rdb_string_writer *const pk_unpack_info = row_info.new_pk_unpack_info;
  bool has_ttl = m_pk_descr->has_ttl();
  bool has_ttl_column = !m_pk_descr->m_ttl_column.empty();
  bool ttl_in_pk = has_ttl_column && (row_info.ttl_pk_offset != UINT_MAX);

  // m_storage_record is reused across calls; reset it (keeps capacity).
  m_storage_record.length(0);

  if (has_ttl) {
    /* If it's a TTL record, reserve space for 8 byte TTL value in front. */
    m_storage_record.fill(ROCKSDB_SIZEOF_TTL_RECORD + m_null_bytes_in_rec, 0);
    m_ttl_bytes_updated = false;

    /*
      If the TTL is contained within the key, we use the offset to find the
      TTL value and place it in the beginning of the value record.
    */
    if (ttl_in_pk) {
      Rdb_string_reader reader(&pk_packed_slice);
      const char *ts;
      if (!reader.read(row_info.ttl_pk_offset) ||
          !(ts = reader.read(ROCKSDB_SIZEOF_TTL_RECORD))) {
        std::string buf;
        buf = rdb_hexdump(pk_packed_slice.data(), pk_packed_slice.size(),
                          RDB_MAX_HEXDUMP_LEN);
        const GL_INDEX_ID gl_index_id = m_pk_descr->get_gl_index_id();
        // NO_LINT_DEBUG
        sql_print_error("Decoding ttl from PK failed during insert, "
                        "for index (%u,%u), key: %s" ,
                        gl_index_id.cf_id, gl_index_id.index_id, buf.c_str());
        return HA_EXIT_FAILURE;
      }

      char *const data = const_cast<char *>(m_storage_record.ptr());
      memcpy(data, ts, ROCKSDB_SIZEOF_TTL_RECORD);
#ifndef NDEBUG
      // Adjust for test case if needed
      rdb_netbuf_store_uint64(
          reinterpret_cast<uchar *>(data),
          rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(data)) +
              rdb_dbug_set_ttl_rec_ts());
#endif
      // Also store in m_ttl_bytes to propagate to update_sk
      memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
    } else if (!has_ttl_column) {
      /*
        For implicitly generated TTL records we need to copy over the old
        TTL value from the old record in the event of an update. It was stored
        in m_ttl_bytes.

        Otherwise, generate a timestamp using the current time.
      */
      if (!row_info.old_pk_slice.empty()) {
        char *const data = const_cast<char *>(m_storage_record.ptr());
        memcpy(data, m_ttl_bytes, sizeof(uint64));
      } else {
        uint64 ts = static_cast<uint64>(std::time(nullptr));
#ifndef NDEBUG
        ts += rdb_dbug_set_ttl_rec_ts();
#endif
        char *const data = const_cast<char *>(m_storage_record.ptr());
        rdb_netbuf_store_uint64(reinterpret_cast<uchar *>(data), ts);
        // Also store in m_ttl_bytes to propagate to update_sk
        memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
      }
    }
  } else {
    /* All NULL bits are initially 0 */
    m_storage_record.fill(m_null_bytes_in_rec, 0);
  }

  // If a primary key may have non-empty unpack_info for certain values,
  // (m_maybe_unpack_info=TRUE), we write the unpack_info block. The block
  // itself was prepared in Rdb_key_def::pack_record.
  if (m_maybe_unpack_info) {
    m_storage_record.append(reinterpret_cast<char *>(pk_unpack_info->ptr()),
                            pk_unpack_info->get_current_pos());
  }

  for (uint i = 0; i < table->s->fields; i++) {
    /* Don't pack decodable PK key parts */
    if (m_encoder_arr[i].m_storage_type != Rdb_field_encoder::STORE_ALL) {
      continue;
    }

    Field *const field = table->field[i];
    if (m_encoder_arr[i].maybe_null()) {
      char *data = const_cast<char *>(m_storage_record.ptr());
      if (has_ttl) {
        // Null-bitmap bytes sit after the TTL prefix.
        data += ROCKSDB_SIZEOF_TTL_RECORD;
      }

      if (field->is_null()) {
        data[m_encoder_arr[i].m_null_offset] |= m_encoder_arr[i].m_null_mask;
        /* Don't write anything for NULL values */
        continue;
      }
    }

    if (m_encoder_arr[i].m_field_type == MYSQL_TYPE_BLOB) {
      my_core::Field_blob *blob = (my_core::Field_blob *)field;
      /* Get the number of bytes needed to store length*/
      const uint length_bytes = blob->pack_length() - portable_sizeof_char_ptr;

      /* Store the length of the value */
      m_storage_record.append(reinterpret_cast<char *>(blob->ptr),
                              length_bytes);

      /* Store the blob value itself */
      char *data_ptr;
      memcpy(&data_ptr, blob->ptr + length_bytes, sizeof(uchar **));
      m_storage_record.append(data_ptr, blob->get_length());
    } else if (m_encoder_arr[i].m_field_type == MYSQL_TYPE_VARCHAR) {
      Field_varstring *const field_var = (Field_varstring *)field;
      uint data_len;
      /* field_var->length_bytes is 1 or 2 */
      if (field_var->length_bytes == 1) {
        data_len = field_var->ptr[0];
      } else {
        DBUG_ASSERT(field_var->length_bytes == 2);
        data_len = uint2korr(field_var->ptr);
      }
      m_storage_record.append(reinterpret_cast<char *>(field_var->ptr),
                              field_var->length_bytes + data_len);
    } else {
      /* Copy the field data */
      const uint len = field->pack_length_in_rec();
      m_storage_record.append(reinterpret_cast<char *>(field->ptr), len);

      /*
        Check if this is the TTL field within the table, if so store the TTL
        in the front of the record as well here.
      */
      if (has_ttl && has_ttl_column &&
          i == m_pk_descr->get_ttl_field_offset()) {
        DBUG_ASSERT(len == ROCKSDB_SIZEOF_TTL_RECORD);
        DBUG_ASSERT(field->real_type() == MYSQL_TYPE_LONGLONG);
        DBUG_ASSERT(m_pk_descr->get_ttl_field_offset() != UINT_MAX);

        char *const data = const_cast<char *>(m_storage_record.ptr());
        uint64 ts = uint8korr(field->ptr);
#ifndef NDEBUG
        ts += rdb_dbug_set_ttl_rec_ts();
#endif
        rdb_netbuf_store_uint64(reinterpret_cast<uchar *>(data), ts);

        // If this is an update and the timestamp has been updated, take note
        // so we can avoid updating SKs unnecessarily.
        if (!row_info.old_pk_slice.empty()) {
          m_ttl_bytes_updated =
              memcmp(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
        }
        // Store timestamp in m_ttl_bytes to propagate to update_sk
        memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
      }
    }
  }

  if (should_store_row_debug_checksums()) {
    // Append a checksum tag plus CRC32s of the packed key and of the value
    // accumulated so far, for debug-mode row verification on read.
    const uint32_t key_crc32 = my_core::crc32(
        0, rdb_slice_to_uchar_ptr(&pk_packed_slice), pk_packed_slice.size());
    const uint32_t val_crc32 =
        my_core::crc32(0, rdb_mysql_str_to_uchar_str(&m_storage_record),
                       m_storage_record.length());
    uchar key_crc_buf[RDB_CHECKSUM_SIZE];
    uchar val_crc_buf[RDB_CHECKSUM_SIZE];
    rdb_netbuf_store_uint32(key_crc_buf, key_crc32);
    rdb_netbuf_store_uint32(val_crc_buf, val_crc32);
    m_storage_record.append((const char *)&RDB_CHECKSUM_DATA_TAG, 1);
    m_storage_record.append((const char *)key_crc_buf, RDB_CHECKSUM_SIZE);
    m_storage_record.append((const char *)val_crc_buf, RDB_CHECKSUM_SIZE);
  }

  *packed_rec =
      rocksdb::Slice(m_storage_record.ptr(), m_storage_record.length());

  return HA_EXIT_SUCCESS;
}
5660 | |
5661 | /* |
5662 | @brief |
5663 | Setup which fields will be unpacked when reading rows |
5664 | |
5665 | @detail |
5666 | Three special cases when we still unpack all fields: |
5667 | - When this table is being updated (m_lock_rows==RDB_LOCK_WRITE). |
5668 | - When @@rocksdb_verify_row_debug_checksums is ON (In this mode, we need to |
5669 | read all fields to find whether there is a row checksum at the end. We could |
5670 | skip the fields instead of decoding them, but currently we do decoding.) |
5671 | - On index merge as bitmap is cleared during that operation |
5672 | |
5673 | @seealso |
5674 | ha_rocksdb::setup_field_converters() |
5675 | ha_rocksdb::convert_record_from_storage_format() |
5676 | */ |
5677 | void ha_rocksdb::setup_read_decoders() { |
5678 | m_decoders_vect.clear(); |
5679 | m_key_requested = false; |
5680 | |
5681 | int last_useful = 0; |
5682 | int skip_size = 0; |
5683 | |
5684 | for (uint i = 0; i < table->s->fields; i++) { |
5685 | // bitmap is cleared on index merge, but it still needs to decode columns |
5686 | const bool field_requested = |
5687 | m_lock_rows == RDB_LOCK_WRITE || m_verify_row_debug_checksums || |
5688 | bitmap_is_clear_all(table->read_set) || |
5689 | bitmap_is_set(table->read_set, table->field[i]->field_index); |
5690 | |
5691 | // We only need the decoder if the whole record is stored. |
5692 | if (m_encoder_arr[i].m_storage_type != Rdb_field_encoder::STORE_ALL) { |
5693 | // the field potentially needs unpacking |
5694 | if (field_requested) { |
5695 | // the field is in the read set |
5696 | m_key_requested = true; |
5697 | } |
5698 | continue; |
5699 | } |
5700 | |
5701 | if (field_requested) { |
5702 | // We will need to decode this field |
5703 | m_decoders_vect.push_back({&m_encoder_arr[i], true, skip_size}); |
5704 | last_useful = m_decoders_vect.size(); |
5705 | skip_size = 0; |
5706 | } else { |
5707 | if (m_encoder_arr[i].uses_variable_len_encoding() || |
5708 | m_encoder_arr[i].maybe_null()) { |
5709 | // For variable-length field, we need to read the data and skip it |
5710 | m_decoders_vect.push_back({&m_encoder_arr[i], false, skip_size}); |
5711 | skip_size = 0; |
5712 | } else { |
5713 | // Fixed-width field can be skipped without looking at it. |
5714 | // Add appropriate skip_size to the next field. |
5715 | skip_size += m_encoder_arr[i].m_pack_length_in_rec; |
5716 | } |
5717 | } |
5718 | } |
5719 | |
5720 | // It could be that the last few elements are varchars that just do |
5721 | // skipping. Remove them. |
5722 | m_decoders_vect.erase(m_decoders_vect.begin() + last_useful, |
5723 | m_decoders_vect.end()); |
5724 | } |
5725 | |
5726 | #ifndef NDEBUG |
5727 | void dbug_append_garbage_at_end(rocksdb::PinnableSlice *on_disk_rec) { |
5728 | std::string str(on_disk_rec->data(), on_disk_rec->size()); |
5729 | on_disk_rec->Reset(); |
5730 | str.append("abc" ); |
5731 | on_disk_rec->PinSelf(rocksdb::Slice(str)); |
5732 | } |
5733 | |
5734 | void dbug_truncate_record(rocksdb::PinnableSlice *on_disk_rec) { |
5735 | on_disk_rec->remove_suffix(on_disk_rec->size()); |
5736 | } |
5737 | |
// Debug-only fault injection: replace the record with a NULL-bitmap byte
// followed by a VARCHAR(12) value, to simulate reading data that no longer
// matches the table's declared VARCHAR(10) column.
void dbug_modify_rec_varchar12(rocksdb::PinnableSlice *on_disk_rec) {
  std::string res;
  // The record is NULL-byte followed by VARCHAR(10).
  // Put the NULL-byte
  res.append("\0" , 1);
  // Then, add a valid VARCHAR(12) value.
  // (0x0C is the one-byte length prefix; note the 12-byte append below reads
  // the 11-character literal plus its trailing '\0' as the 12th byte.)
  res.append("\xC" , 1);
  res.append("123456789ab" , 12);

  on_disk_rec->Reset();
  on_disk_rec->PinSelf(rocksdb::Slice(res));
}
5750 | |
5751 | void dbug_modify_key_varchar8(String &on_disk_rec) { |
5752 | std::string res; |
5753 | // The key starts with index number |
5754 | res.append(on_disk_rec.ptr(), Rdb_key_def::INDEX_NUMBER_SIZE); |
5755 | |
5756 | // Then, a mem-comparable form of a varchar(8) value. |
5757 | res.append("ABCDE\0\0\0\xFC" , 9); |
5758 | on_disk_rec.length(0); |
5759 | on_disk_rec.append(res.data(), res.size()); |
5760 | } |
5761 | |
// Debug-only helper: raise a generic error to simulate a failure happening
// during an inplace ALTER TABLE.
void dbug_create_err_inplace_alter() {
  my_printf_error(ER_UNKNOWN_ERROR,
                  "Intentional failure in inplace alter occurred." , MYF(0));
}
5766 | #endif |
5767 | |
/*
  Convenience overload: unpack this->m_retrieved_record (the last value read
  from RocksDB) into buf. Debug builds may first corrupt the record via the
  DBUG injection points so the corruption-detection paths can be tested.
*/
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, uchar *const buf) {

  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read1" ,
                  dbug_append_garbage_at_end(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read2" ,
                  dbug_truncate_record(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read3" ,
                  dbug_modify_rec_varchar12(&m_retrieved_record););

  return convert_record_from_storage_format(key, &m_retrieved_record, buf);
}
5780 | |
5781 | int ha_rocksdb::convert_blob_from_storage_format( |
5782 | my_core::Field_blob *const blob, |
5783 | Rdb_string_reader *const reader, |
5784 | bool decode) |
5785 | { |
5786 | /* Get the number of bytes needed to store length*/ |
5787 | const uint length_bytes = blob->pack_length() - portable_sizeof_char_ptr; |
5788 | |
5789 | const char *data_len_str; |
5790 | if (!(data_len_str = reader->read(length_bytes))) { |
5791 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5792 | } |
5793 | |
5794 | memcpy(blob->ptr, data_len_str, length_bytes); |
5795 | |
5796 | const uint32 data_len = blob->get_length( |
5797 | reinterpret_cast<const uchar*>(data_len_str), length_bytes); |
5798 | const char *blob_ptr; |
5799 | if (!(blob_ptr = reader->read(data_len))) { |
5800 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5801 | } |
5802 | |
5803 | if (decode) { |
5804 | // set 8-byte pointer to 0, like innodb does (relevant for 32-bit |
5805 | // platforms) |
5806 | memset(blob->ptr + length_bytes, 0, 8); |
5807 | memcpy(blob->ptr + length_bytes, &blob_ptr, sizeof(uchar **)); |
5808 | } |
5809 | |
5810 | return HA_EXIT_SUCCESS; |
5811 | } |
5812 | |
5813 | int ha_rocksdb::convert_varchar_from_storage_format( |
5814 | my_core::Field_varstring *const field_var, |
5815 | Rdb_string_reader *const reader, |
5816 | bool decode) |
5817 | { |
5818 | const char *data_len_str; |
5819 | if (!(data_len_str = reader->read(field_var->length_bytes))) |
5820 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5821 | |
5822 | uint data_len; |
5823 | /* field_var->length_bytes is 1 or 2 */ |
5824 | if (field_var->length_bytes == 1) { |
5825 | data_len = (uchar)data_len_str[0]; |
5826 | } else { |
5827 | DBUG_ASSERT(field_var->length_bytes == 2); |
5828 | data_len = uint2korr(data_len_str); |
5829 | } |
5830 | |
5831 | if (data_len > field_var->field_length) { |
5832 | /* The data on disk is longer than table DDL allows? */ |
5833 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5834 | } |
5835 | |
5836 | if (!reader->read(data_len)) { |
5837 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5838 | } |
5839 | |
5840 | if (decode) { |
5841 | memcpy(field_var->ptr, data_len_str, field_var->length_bytes + data_len); |
5842 | } |
5843 | |
5844 | return HA_EXIT_SUCCESS; |
5845 | } |
5846 | |
5847 | int ha_rocksdb::convert_field_from_storage_format( |
5848 | my_core::Field *const field, |
5849 | Rdb_string_reader *const reader, |
5850 | bool decode, |
5851 | uint len) |
5852 | { |
5853 | const char *data_bytes; |
5854 | if (len > 0) { |
5855 | if ((data_bytes = reader->read(len)) == nullptr) { |
5856 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5857 | } |
5858 | |
5859 | if (decode) |
5860 | memcpy(field->ptr, data_bytes, len); |
5861 | } |
5862 | |
5863 | return HA_EXIT_SUCCESS; |
5864 | } |
5865 | |
5866 | /* |
5867 | @brief |
5868 | Unpack the record in this->m_retrieved_record and this->m_last_rowkey from |
5869 | storage format into buf (which can be table->record[0] or table->record[1]). |
5870 | |
5871 | @param key Table record's key in mem-comparable form. |
5872 | @param buf Store record in table->record[0] format here |
5873 | |
5874 | @detail |
5875 | If the table has blobs, the unpacked data in buf may keep pointers to the |
5876 | data in this->m_retrieved_record. |
5877 | |
5878 | The key is only needed to check its checksum value (the checksum is in |
5879 | m_retrieved_record). |
5880 | |
5881 | @seealso |
5882 | ha_rocksdb::setup_read_decoders() Sets up data structures which tell which |
5883 | columns to decode. |
5884 | |
5885 | @return |
5886 | 0 OK |
    other Error unpacking the data
5888 | */ |
5889 | |
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, const rocksdb::Slice *const value,
    uchar *const buf) {
  DBUG_ASSERT(key != nullptr);
  DBUG_ASSERT(buf != nullptr);

  Rdb_string_reader reader(value);

  /*
    Decode PK fields from the key
  */
  DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_read1" ,
                  dbug_modify_key_varchar8(m_last_rowkey););

  const rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(),
                                    m_last_rowkey.length());
  const char *unpack_info = nullptr;
  uint16 unpack_info_len = 0;
  rocksdb::Slice unpack_slice;

  /* If it's a TTL record, skip the 8 byte TTL value */
  const char *ttl_bytes;
  if (m_pk_descr->has_ttl()) {
    if ((ttl_bytes = reader.read(ROCKSDB_SIZEOF_TTL_RECORD))) {
      // Keep a copy of the TTL timestamp; the write path compares against
      // m_ttl_bytes to decide whether secondary keys need updating.
      memcpy(m_ttl_bytes, ttl_bytes, ROCKSDB_SIZEOF_TTL_RECORD);
    } else {
      return HA_ERR_ROCKSDB_CORRUPT_DATA;
    }
  }

  /* Other fields are decoded from the value */
  const char *null_bytes = nullptr;
  if (m_null_bytes_in_rec && !(null_bytes = reader.read(m_null_bytes_in_rec))) {
    return HA_ERR_ROCKSDB_CORRUPT_DATA;
  }

  if (m_maybe_unpack_info) {
    // The value may carry unpack_info: a tag byte, a 2-byte length, then
    // the data needed to restore PK columns that lost information in their
    // mem-comparable key image.
    unpack_info = reader.get_current_ptr();
    if (!unpack_info || !Rdb_key_def::is_unpack_data_tag(unpack_info[0]) ||
        !reader.read(Rdb_key_def::get_unpack_header_size(unpack_info[0]))) {
      return HA_ERR_ROCKSDB_CORRUPT_DATA;
    }

    unpack_info_len =
        rdb_netbuf_to_uint16(reinterpret_cast<const uchar *>(unpack_info + 1));
    unpack_slice = rocksdb::Slice(unpack_info, unpack_info_len);

    // NOTE(review): the return value of this read is ignored; a corrupt
    // unpack_info_len larger than the remaining value would not be detected
    // here -- confirm whether unpack_record() below catches it.
    reader.read(unpack_info_len -
                Rdb_key_def::get_unpack_header_size(unpack_info[0]));
  }

  int err = HA_EXIT_SUCCESS;
  if (m_key_requested) {
    // At least one requested column lives (partly) in the key, so decode
    // the mem-comparable key image into buf.
    err = m_pk_descr->unpack_record(table, buf, &rowkey_slice,
                                    unpack_info ? &unpack_slice : nullptr,
                                    false /* verify_checksum */);
  }

  if (err != HA_EXIT_SUCCESS) {
    return err;
  }

  // Walk the decoder list built by setup_read_decoders(): each entry either
  // decodes a value-stored field into buf or just advances past it.
  for (auto it = m_decoders_vect.begin(); it != m_decoders_vect.end(); it++) {
    const Rdb_field_encoder *const field_dec = it->m_field_enc;
    const bool decode = it->m_decode;
    const bool isNull =
        field_dec->maybe_null() &&
        ((null_bytes[field_dec->m_null_offset] & field_dec->m_null_mask) != 0);

    Field *const field = table->field[field_dec->m_field_index];

    /* Skip the bytes we need to skip */
    if (it->m_skip && !reader.read(it->m_skip)) {
      return HA_ERR_ROCKSDB_CORRUPT_DATA;
    }

    // Field::ptr is relative to table->record[0], but buf may be
    // table->record[1]; temporarily re-point the Field into buf.
    uint field_offset = field->ptr - table->record[0];
    uint null_offset = field->null_offset();
    bool maybe_null = field->real_maybe_null();
    field->move_field(buf + field_offset,
                      maybe_null ? buf + null_offset : nullptr,
                      field->null_bit);
    // WARNING! - Don't return before restoring field->ptr and field->null_ptr!

    if (isNull) {
      if (decode) {
        /* This sets the NULL-bit of this record */
        field->set_null();
        /*
          Besides that, set the field value to default value. CHECKSUM TABLE
          depends on this.
        */
        memcpy(field->ptr, table->s->default_values + field_offset,
               field->pack_length());
      }
    } else {
      if (decode) {
        field->set_notnull();
      }

      // Even when not decoding, the converters must consume the field's
      // bytes so the reader stays positioned correctly.
      if (field_dec->m_field_type == MYSQL_TYPE_BLOB) {
        err = convert_blob_from_storage_format(
            (my_core::Field_blob *) field, &reader, decode);
      } else if (field_dec->m_field_type == MYSQL_TYPE_VARCHAR) {
        err = convert_varchar_from_storage_format(
            (my_core::Field_varstring *) field, &reader, decode);
      } else {
        err = convert_field_from_storage_format(
            field, &reader, decode, field_dec->m_pack_length_in_rec);
      }
    }

    // Restore field->ptr and field->null_ptr
    field->move_field(table->record[0] + field_offset,
                      maybe_null ? table->record[0] + null_offset : nullptr,
                      field->null_bit);

    if (err != HA_EXIT_SUCCESS) {
      return err;
    }
  }

  if (m_verify_row_debug_checksums) {
    // Optional trailing chunk: a tag byte plus CRC32s of the key and of the
    // value (excluding the checksum chunk itself).
    if (reader.remaining_bytes() == RDB_CHECKSUM_CHUNK_SIZE &&
        reader.read(1)[0] == RDB_CHECKSUM_DATA_TAG) {
      uint32_t stored_key_chksum =
          rdb_netbuf_to_uint32((const uchar *)reader.read(RDB_CHECKSUM_SIZE));
      uint32_t stored_val_chksum =
          rdb_netbuf_to_uint32((const uchar *)reader.read(RDB_CHECKSUM_SIZE));

      const uint32_t computed_key_chksum =
          my_core::crc32(0, rdb_slice_to_uchar_ptr(key), key->size());
      const uint32_t computed_val_chksum =
          my_core::crc32(0, rdb_slice_to_uchar_ptr(value),
                         value->size() - RDB_CHECKSUM_CHUNK_SIZE);

      DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_checksum1" ,
                      stored_key_chksum++;);

      if (stored_key_chksum != computed_key_chksum) {
        m_pk_descr->report_checksum_mismatch(true, key->data(), key->size());
        return HA_ERR_ROCKSDB_CHECKSUM_MISMATCH;
      }

      DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_checksum2" ,
                      stored_val_chksum++;);
      if (stored_val_chksum != computed_val_chksum) {
        m_pk_descr->report_checksum_mismatch(false, value->data(),
                                             value->size());
        return HA_ERR_ROCKSDB_CHECKSUM_MISMATCH;
      }

      m_row_checksums_checked++;
    }
    // Any leftover bytes at this point mean the value was malformed.
    if (reader.remaining_bytes())
      return HA_ERR_ROCKSDB_CORRUPT_DATA;
  }

  return HA_EXIT_SUCCESS;
}
6050 | |
6051 | void ha_rocksdb::get_storage_type(Rdb_field_encoder *const encoder, |
6052 | const uint &kp) { |
6053 | // STORE_SOME uses unpack_info. |
6054 | if (m_pk_descr->has_unpack_info(kp)) { |
6055 | DBUG_ASSERT(m_pk_descr->can_unpack(kp)); |
6056 | encoder->m_storage_type = Rdb_field_encoder::STORE_SOME; |
6057 | m_maybe_unpack_info = true; |
6058 | } else if (m_pk_descr->can_unpack(kp)) { |
6059 | encoder->m_storage_type = Rdb_field_encoder::STORE_NONE; |
6060 | } |
6061 | } |
6062 | |
6063 | /* |
6064 | Setup data needed to convert table->record[] to and from record storage |
6065 | format. |
6066 | |
6067 | @seealso |
6068 | ha_rocksdb::convert_record_to_storage_format, |
6069 | ha_rocksdb::convert_record_from_storage_format |
6070 | */ |
6071 | |
6072 | void ha_rocksdb::setup_field_converters() { |
6073 | uint i; |
6074 | uint null_bytes = 0; |
6075 | uchar cur_null_mask = 0x1; |
6076 | |
6077 | DBUG_ASSERT(m_encoder_arr == nullptr); |
6078 | m_encoder_arr = static_cast<Rdb_field_encoder *>( |
6079 | my_malloc(table->s->fields * sizeof(Rdb_field_encoder), MYF(0))); |
6080 | if (m_encoder_arr == nullptr) { |
6081 | return; |
6082 | } |
6083 | |
6084 | for (i = 0; i < table->s->fields; i++) { |
6085 | Field *const field = table->field[i]; |
6086 | m_encoder_arr[i].m_storage_type = Rdb_field_encoder::STORE_ALL; |
6087 | |
6088 | /* |
6089 | Check if this field is |
6090 | - a part of primary key, and |
6091 | - it can be decoded back from its key image. |
6092 | If both hold, we don't need to store this field in the value part of |
6093 | RocksDB's key-value pair. |
6094 | |
6095 | If hidden pk exists, we skip this check since the field will never be |
6096 | part of the hidden pk. |
6097 | */ |
6098 | if (!has_hidden_pk(table)) { |
6099 | KEY *const pk_info = &table->key_info[table->s->primary_key]; |
6100 | for (uint kp = 0; kp < pk_info->user_defined_key_parts; kp++) { |
6101 | /* key_part->fieldnr is counted from 1 */ |
6102 | if (field->field_index + 1 == pk_info->key_part[kp].fieldnr) { |
6103 | get_storage_type(&m_encoder_arr[i], kp); |
6104 | break; |
6105 | } |
6106 | } |
6107 | } |
6108 | |
6109 | m_encoder_arr[i].m_field_type = field->real_type(); |
6110 | m_encoder_arr[i].m_field_index = i; |
6111 | m_encoder_arr[i].m_pack_length_in_rec = field->pack_length_in_rec(); |
6112 | |
6113 | if (field->real_maybe_null()) { |
6114 | m_encoder_arr[i].m_null_mask = cur_null_mask; |
6115 | m_encoder_arr[i].m_null_offset = null_bytes; |
6116 | if (cur_null_mask == 0x80) { |
6117 | cur_null_mask = 0x1; |
6118 | null_bytes++; |
6119 | } else |
6120 | cur_null_mask = cur_null_mask << 1; |
6121 | } else { |
6122 | m_encoder_arr[i].m_null_mask = 0; |
6123 | } |
6124 | } |
6125 | |
6126 | /* Count the last, unfinished NULL-bits byte */ |
6127 | if (cur_null_mask != 0x1) |
6128 | null_bytes++; |
6129 | |
6130 | m_null_bytes_in_rec = null_bytes; |
6131 | } |
6132 | |
6133 | int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg, |
6134 | const Rdb_tbl_def *const tbl_def_arg, |
6135 | bool alloc_alter_buffers) { |
6136 | DBUG_ENTER_FUNC(); |
6137 | |
6138 | DBUG_ASSERT(m_pk_tuple == nullptr); |
6139 | DBUG_ASSERT(tbl_def_arg != nullptr); |
6140 | |
6141 | std::shared_ptr<Rdb_key_def> *const kd_arr = tbl_def_arg->m_key_descr_arr; |
6142 | |
6143 | uint key_len = 0; |
6144 | uint max_packed_sk_len = 0; |
6145 | uint pack_key_len = 0; |
6146 | |
6147 | m_pk_descr = kd_arr[pk_index(table_arg, tbl_def_arg)]; |
6148 | if (has_hidden_pk(table_arg)) { |
6149 | m_pk_key_parts = 1; |
6150 | } else { |
6151 | m_pk_key_parts = |
6152 | table->key_info[table->s->primary_key].user_defined_key_parts; |
6153 | key_len = table->key_info[table->s->primary_key].key_length; |
6154 | } |
6155 | |
6156 | // move this into get_table_handler() ?? |
6157 | m_pk_descr->setup(table_arg, tbl_def_arg); |
6158 | |
6159 | m_pk_tuple = reinterpret_cast<uchar *>(my_malloc(key_len, MYF(0))); |
6160 | |
6161 | pack_key_len = m_pk_descr->max_storage_fmt_length(); |
6162 | m_pk_packed_tuple = |
6163 | reinterpret_cast<uchar *>(my_malloc(pack_key_len, MYF(0))); |
6164 | |
6165 | /* Sometimes, we may use m_sk_packed_tuple for storing packed PK */ |
6166 | max_packed_sk_len = pack_key_len; |
6167 | for (uint i = 0; i < table_arg->s->keys; i++) { |
6168 | if (i == table_arg->s->primary_key) /* Primary key was processed above */ |
6169 | continue; |
6170 | |
6171 | // TODO: move this into get_table_handler() ?? |
6172 | kd_arr[i]->setup(table_arg, tbl_def_arg); |
6173 | |
6174 | const uint packed_len = kd_arr[i]->max_storage_fmt_length(); |
6175 | if (packed_len > max_packed_sk_len) { |
6176 | max_packed_sk_len = packed_len; |
6177 | } |
6178 | } |
6179 | |
6180 | m_sk_packed_tuple = |
6181 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6182 | m_sk_match_prefix_buf = |
6183 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6184 | m_sk_packed_tuple_old = |
6185 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6186 | m_end_key_packed_tuple = |
6187 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6188 | m_pack_buffer = |
6189 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6190 | |
6191 | m_eq_cond_upper_bound = |
6192 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6193 | m_eq_cond_lower_bound = |
6194 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6195 | |
6196 | /* |
6197 | If inplace alter is happening, allocate special buffers for unique |
6198 | secondary index duplicate checking. |
6199 | */ |
6200 | if (alloc_alter_buffers) { |
6201 | m_dup_sk_packed_tuple = |
6202 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6203 | m_dup_sk_packed_tuple_old = |
6204 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6205 | } |
6206 | |
6207 | if (m_pk_tuple == nullptr || m_pk_packed_tuple == nullptr || |
6208 | m_sk_packed_tuple == nullptr || m_sk_packed_tuple_old == nullptr || |
6209 | m_end_key_packed_tuple == nullptr || m_pack_buffer == nullptr || |
6210 | m_eq_cond_upper_bound == nullptr || m_eq_cond_lower_bound == nullptr || |
6211 | (alloc_alter_buffers && (m_dup_sk_packed_tuple == nullptr || |
6212 | m_dup_sk_packed_tuple_old == nullptr))) { |
6213 | // One or more of the above allocations failed. Clean up and exit |
6214 | free_key_buffers(); |
6215 | |
6216 | DBUG_RETURN(HA_ERR_OUT_OF_MEM); |
6217 | } |
6218 | |
6219 | DBUG_RETURN(HA_EXIT_SUCCESS); |
6220 | } |
6221 | |
6222 | void ha_rocksdb::free_key_buffers() { |
6223 | my_free(m_pk_tuple); |
6224 | m_pk_tuple = nullptr; |
6225 | |
6226 | my_free(m_pk_packed_tuple); |
6227 | m_pk_packed_tuple = nullptr; |
6228 | |
6229 | my_free(m_sk_packed_tuple); |
6230 | m_sk_packed_tuple = nullptr; |
6231 | |
6232 | my_free(m_sk_match_prefix_buf); |
6233 | m_sk_match_prefix_buf = nullptr; |
6234 | |
6235 | my_free(m_sk_packed_tuple_old); |
6236 | m_sk_packed_tuple_old = nullptr; |
6237 | |
6238 | my_free(m_end_key_packed_tuple); |
6239 | m_end_key_packed_tuple = nullptr; |
6240 | |
6241 | my_free(m_pack_buffer); |
6242 | m_pack_buffer = nullptr; |
6243 | |
6244 | my_free(m_dup_sk_packed_tuple); |
6245 | m_dup_sk_packed_tuple = nullptr; |
6246 | |
6247 | my_free(m_dup_sk_packed_tuple_old); |
6248 | m_dup_sk_packed_tuple_old = nullptr; |
6249 | |
6250 | my_free(m_eq_cond_upper_bound); |
6251 | m_eq_cond_upper_bound = nullptr; |
6252 | |
6253 | my_free(m_eq_cond_lower_bound); |
6254 | m_eq_cond_lower_bound = nullptr; |
6255 | } |
6256 | |
6257 | #ifdef MARIAROCKS_NOT_YET |
6258 | void ha_rocksdb::set_use_read_free_rpl(const char *const whitelist) { |
6259 | DBUG_ASSERT(whitelist != nullptr); |
6260 | |
6261 | #if defined(HAVE_PSI_INTERFACE) |
6262 | Regex_list_handler regex_handler(key_rwlock_read_free_rpl_tables); |
6263 | #else |
6264 | Regex_list_handler regex_handler; |
6265 | #endif |
6266 | |
6267 | if (!regex_handler.set_patterns(whitelist)) { |
6268 | warn_about_bad_patterns(®ex_handler, "read_free_rpl_tables" ); |
6269 | } |
6270 | |
6271 | m_use_read_free_rpl = regex_handler.matches(m_tbl_def->base_tablename()); |
6272 | } |
6273 | #endif |
6274 | |
6275 | void ha_rocksdb::set_skip_unique_check_tables(const char *const whitelist) { |
6276 | DBUG_ASSERT(whitelist != nullptr); |
6277 | |
6278 | #if defined(HAVE_PSI_INTERFACE) |
6279 | Regex_list_handler regex_handler(key_rwlock_skip_unique_check_tables); |
6280 | #else |
6281 | Regex_list_handler regex_handler; |
6282 | #endif |
6283 | |
6284 | if (!regex_handler.set_patterns(whitelist)) { |
6285 | warn_about_bad_patterns(®ex_handler, "skip_unique_check_tables" ); |
6286 | } |
6287 | |
6288 | m_skip_unique_check = regex_handler.matches(m_tbl_def->base_tablename()); |
6289 | } |
6290 | |
6291 | /** |
6292 | @return |
6293 | HA_EXIT_SUCCESS OK |
6294 | other HA_ERR error code (can be SE-specific) |
6295 | */ |
int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) {
  DBUG_ENTER_FUNC();

  // Release any state left over from a previous open of this handler.
  int err = close();
  if (err) {
    DBUG_RETURN(err);
  }

  m_table_handler = rdb_open_tables.get_table_handler(name);

  if (m_table_handler == nullptr) {
    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
  }

  my_core::thr_lock_data_init(&m_table_handler->m_thr_lock, &m_db_lock,
                              nullptr);
  m_io_perf.init(&m_table_handler->m_table_perf_context,
                 &m_table_handler->m_io_perf_read,
                 &m_table_handler->m_io_perf_write, &stats);
  Rdb_perf_context_guard guard(&m_io_perf,
                               rocksdb_perf_context_level(ha_thd()));

  std::string fullname;
  err = rdb_normalize_tablename(name, &fullname);
  if (err != HA_EXIT_SUCCESS) {
    DBUG_RETURN(err);
  }

  // Look up the MyRocks-side table definition in the data dictionary.
  m_tbl_def = ddl_manager.find(fullname);
  if (m_tbl_def == nullptr) {
    my_error(ER_INTERNAL_ERROR, MYF(0),
             "Attempt to open a table that is not present in RocksDB-SE data "
             "dictionary" );
    DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE);
  }
  m_lock_rows = RDB_LOCK_NONE;

  m_key_descr_arr = m_tbl_def->m_key_descr_arr;

  /*
    Full table scan actually uses primary key
    (UPDATE needs to know this, otherwise it will go into infinite loop on
    queries like "UPDATE tbl SET pk=pk+100")
  */
  key_used_on_scan = table->s->primary_key;

  // close() above has already called free_key_buffers(). No need to do it here.
  err = alloc_key_buffers(table, m_tbl_def);

  if (err) {
    DBUG_RETURN(err);
  }

  /*
    init_with_fields() is used to initialize table flags based on the field
    definitions in table->field[].
    It is called by open_binary_frm(), but that function calls the method for
    a temporary ha_rocksdb object which is later destroyed.

    If we are here in ::open(), then init_with_fields() has not been called
    for this object. Call it ourselves, we want all member variables to be
    properly initialized.
  */
  init_with_fields();

  setup_field_converters();

  /*
    MariaDB: adjust field->part_of_key for PK columns. We can only do it here
    because SE API is just relying on the HA_PRIMARY_KEY_IN_READ_INDEX which
    does not allow to distinguish between unpack'able and non-unpack'able
    columns.
    Upstream uses handler->init_with_fields() but we don't have that call.
  */
  {
    if (!has_hidden_pk(table)) {
      KEY *const pk_info = &table->key_info[table->s->primary_key];
      for (uint kp = 0; kp < pk_info->user_defined_key_parts; kp++) {
        if (!m_pk_descr->can_unpack(kp)) {
          // A PK column that cannot be restored from its key image only
          // "covers" the primary key itself.
          uint field_index= pk_info->key_part[kp].field->field_index;
          table->field[field_index]->part_of_key.clear_all();
          table->field[field_index]->part_of_key.set_bit(table->s->primary_key);
        }
      }
    }

    // For secondary keys, a column is part_of_key only if it can be
    // unpacked from that key's image.
    for (uint key= 0; key < table->s->keys; key++) {
      KEY *const key_info = &table->key_info[key];
      if (key == table->s->primary_key)
        continue;
      for (uint kp = 0; kp < key_info->usable_key_parts; kp++) {
        uint field_index= key_info->key_part[kp].field->field_index;
        if (m_key_descr_arr[key]->can_unpack(kp)) {
          table->field[field_index]->part_of_key.set_bit(key);
        } else {
          table->field[field_index]->part_of_key.clear_bit(key);
        }
      }
    }
  }

  info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);

  /*
    The following load_XXX code calls row decode functions, and they do
    that without having done ::external_lock() or index_init()/rnd_init().
    (Note: this also means we're doing a read when there was no
    setup_field_converters() call)

    Initialize the necessary variables for them:
  */
  m_verify_row_debug_checksums = false;

  /* Load auto_increment value only once on first use. */
  if (table->found_next_number_field && m_tbl_def->m_auto_incr_val == 0) {
    load_auto_incr_value();
  }

  /* Load hidden pk only once on first use. */
  if (has_hidden_pk(table) && m_tbl_def->m_hidden_pk_val == 0 &&
      (err = load_hidden_pk_value()) != HA_EXIT_SUCCESS) {
    free_key_buffers();
    DBUG_RETURN(err);
  }

  /* Index block size in MyRocks: used by MySQL in query optimization */
  stats.block_size = rocksdb_tbl_options->block_size;

#ifdef MARIAROCKS_NOT_YET // MDEV-10976
  /* Determine at open whether we can use Read Free Replication or not */
  set_use_read_free_rpl(THDVAR(ha_thd(), read_free_rpl_tables));
#endif

  /* Determine at open whether we should skip unique checks for this table */
  set_skip_unique_check_tables(THDVAR(ha_thd(), skip_unique_check_tables));

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
6435 | |
int ha_rocksdb::close(void) {
  DBUG_ENTER_FUNC();

  // Key descriptors are owned elsewhere (via m_tbl_def); just drop the refs.
  m_pk_descr = nullptr;
  m_key_descr_arr = nullptr;

  free_key_buffers();

  my_free(m_encoder_arr);
  m_encoder_arr = nullptr;

  if (m_table_handler != nullptr) {
    rdb_open_tables.release_table_handler(m_table_handler);
    m_table_handler = nullptr;
  }

  // These are needed to suppress valgrind errors in rocksdb.partition
  m_storage_record.free();
  m_last_rowkey.free();
  m_sk_tails.free();
  m_sk_tails_old.free();
  m_pk_unpack_info.free();

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
6461 | |
/*
  Messages for the storage-engine-specific handler error codes. The order
  must match the HA_ERR_ROCKSDB_* values: an entry is looked up as
  rdb_error_messages[error - HA_ERR_ROCKSDB_FIRST] (see get_error_message()
  below), and the static_assert enforces that the count matches the range.
*/
static const char *rdb_error_messages[] = {
    "Table must have a PRIMARY KEY." ,
    "Specifying DATA DIRECTORY for an individual table is not supported." ,
    "Specifying INDEX DIRECTORY for an individual table is not supported." ,
    "RocksDB commit failed." ,
    "Failure during bulk load operation." ,
    "Found data corruption." ,
    "CRC checksum mismatch." ,
    "Invalid table." ,
    "Could not access RocksDB properties." ,
    "File I/O error during merge/sort operation." ,
    "RocksDB status: not found." ,
    "RocksDB status: corruption." ,
    "RocksDB status: invalid argument." ,
    "RocksDB status: io error." ,
    "RocksDB status: no space." ,
    "RocksDB status: merge in progress." ,
    "RocksDB status: incomplete." ,
    "RocksDB status: shutdown in progress." ,
    "RocksDB status: timed out." ,
    "RocksDB status: aborted." ,
    "RocksDB status: lock limit reached." ,
    "RocksDB status: busy." ,
    "RocksDB status: deadlock." ,
    "RocksDB status: expired." ,
    "RocksDB status: try again." ,
};

static_assert((sizeof(rdb_error_messages) / sizeof(rdb_error_messages[0])) ==
                  ((HA_ERR_ROCKSDB_LAST - HA_ERR_ROCKSDB_FIRST) + 1),
              "Number of error messages doesn't match number of error codes" );
6493 | |
//psergey-merge: do we need this in MariaDB: we have get_error_messages
//below...
#if 0
static const char *rdb_get_error_message(int nr) {
  return rdb_error_messages[nr - HA_ERR_ROCKSDB_FIRST];
}
#endif

// Returns the whole message table; the 'nr' argument is unused here
// (presumably the caller indexes into the returned array itself -- this
// matches the handlerton get_error_messages callback shape).
static const char **rdb_get_error_messages(int nr) { return rdb_error_messages; }
6503 | |
/**
  Produce a human-readable message for a handler error code.

  @param error  handler error code; may also be a generic value < HA_ERR_FIRST
  @param buf    output String the message is appended to
  @return true  for lock-wait-timeout/deadlock errors, where the detailed
                per-transaction message is reported; false otherwise
*/
bool ha_rocksdb::get_error_message(const int error, String *const buf) {
  DBUG_ENTER_FUNC();

  static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST,
                "HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST" );
  static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_LAST,
                "HA_ERR_ROCKSDB_LAST > HA_ERR_LAST" );

  DBUG_ASSERT(buf != nullptr);

  if (error == HA_ERR_LOCK_WAIT_TIMEOUT || error == HA_ERR_LOCK_DEADLOCK) {
    /* Lock errors carry extra detail saved on the current transaction. */
    Rdb_transaction *const tx = get_tx_from_thd(ha_thd());
    DBUG_ASSERT(tx != nullptr);
    buf->append(tx->m_detailed_error);
    DBUG_RETURN(true);
  }

  /* MyRocks-specific codes map directly into the static message table. */
  if (error >= HA_ERR_ROCKSDB_FIRST && error <= HA_ERR_ROCKSDB_LAST) {
    buf->append(rdb_error_messages[error - HA_ERR_ROCKSDB_FIRST]);
  }

  // We can be called with the values which are < HA_ERR_FIRST because most
  // MySQL internal functions will just return HA_EXIT_FAILURE in case of
  // an error.

  DBUG_RETURN(false);
}
6531 | |
6532 | /* |
6533 | Generalized way to convert RocksDB status errors into MySQL error code, and |
6534 | print error message. |
6535 | |
6536 | Each error code below maps to a RocksDB status code found in: |
6537 | rocksdb/include/rocksdb/status.h |
6538 | */ |
6539 | int ha_rocksdb::rdb_error_to_mysql(const rocksdb::Status &s, |
6540 | const char *opt_msg) { |
6541 | DBUG_ASSERT(!s.ok()); |
6542 | |
6543 | int err; |
6544 | switch (s.code()) { |
6545 | case rocksdb::Status::Code::kOk: |
6546 | err = HA_EXIT_SUCCESS; |
6547 | break; |
6548 | case rocksdb::Status::Code::kNotFound: |
6549 | err = HA_ERR_ROCKSDB_STATUS_NOT_FOUND; |
6550 | break; |
6551 | case rocksdb::Status::Code::kCorruption: |
6552 | err = HA_ERR_ROCKSDB_STATUS_CORRUPTION; |
6553 | break; |
6554 | case rocksdb::Status::Code::kNotSupported: |
6555 | err = HA_ERR_ROCKSDB_STATUS_NOT_SUPPORTED; |
6556 | break; |
6557 | case rocksdb::Status::Code::kInvalidArgument: |
6558 | err = HA_ERR_ROCKSDB_STATUS_INVALID_ARGUMENT; |
6559 | break; |
6560 | case rocksdb::Status::Code::kIOError: |
6561 | err = (s.IsNoSpace()) ? HA_ERR_ROCKSDB_STATUS_NO_SPACE |
6562 | : HA_ERR_ROCKSDB_STATUS_IO_ERROR; |
6563 | break; |
6564 | case rocksdb::Status::Code::kMergeInProgress: |
6565 | err = HA_ERR_ROCKSDB_STATUS_MERGE_IN_PROGRESS; |
6566 | break; |
6567 | case rocksdb::Status::Code::kIncomplete: |
6568 | err = HA_ERR_ROCKSDB_STATUS_INCOMPLETE; |
6569 | break; |
6570 | case rocksdb::Status::Code::kShutdownInProgress: |
6571 | err = HA_ERR_ROCKSDB_STATUS_SHUTDOWN_IN_PROGRESS; |
6572 | break; |
6573 | case rocksdb::Status::Code::kTimedOut: |
6574 | err = HA_ERR_ROCKSDB_STATUS_TIMED_OUT; |
6575 | break; |
6576 | case rocksdb::Status::Code::kAborted: |
6577 | err = (s.IsLockLimit()) ? HA_ERR_ROCKSDB_STATUS_LOCK_LIMIT |
6578 | : HA_ERR_ROCKSDB_STATUS_ABORTED; |
6579 | break; |
6580 | case rocksdb::Status::Code::kBusy: |
6581 | err = (s.IsDeadlock()) ? HA_ERR_ROCKSDB_STATUS_DEADLOCK |
6582 | : HA_ERR_ROCKSDB_STATUS_BUSY; |
6583 | break; |
6584 | case rocksdb::Status::Code::kExpired: |
6585 | err = HA_ERR_ROCKSDB_STATUS_EXPIRED; |
6586 | break; |
6587 | case rocksdb::Status::Code::kTryAgain: |
6588 | err = HA_ERR_ROCKSDB_STATUS_TRY_AGAIN; |
6589 | break; |
6590 | default: |
6591 | DBUG_ASSERT(0); |
6592 | return -1; |
6593 | } |
6594 | |
6595 | if (opt_msg) { |
6596 | my_error(ER_RDB_STATUS_MSG, MYF(0), opt_msg, s.code(), |
6597 | s.ToString().c_str()); |
6598 | } else { |
6599 | my_error(ER_RDB_STATUS_GENERAL, MYF(0), s.code(), s.ToString().c_str()); |
6600 | } |
6601 | |
6602 | return err; |
6603 | } |
6604 | |
/* MyRocks supports only the following collations for indexed columns */
/* Consulted by rdb_is_index_collation_supported() and
   rdb_field_uses_nopad_collation() below. */
static const std::set<uint> RDB_INDEX_COLLATIONS = {
    COLLATION_BINARY, COLLATION_UTF8_BIN, COLLATION_LATIN1_BIN};
6608 | |
6609 | static bool |
6610 | rdb_is_index_collation_supported(const my_core::Field *const field) { |
6611 | const my_core::enum_field_types type = field->real_type(); |
6612 | /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */ |
6613 | if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING || |
6614 | type == MYSQL_TYPE_BLOB) { |
6615 | |
6616 | return (RDB_INDEX_COLLATIONS.find(field->charset()->number) != |
6617 | RDB_INDEX_COLLATIONS.end()) || |
6618 | rdb_is_collation_supported(field->charset()); |
6619 | } |
6620 | return true; |
6621 | } |
6622 | |
6623 | |
6624 | static bool |
6625 | rdb_field_uses_nopad_collation(const my_core::Field *const field) { |
6626 | const my_core::enum_field_types type = field->real_type(); |
6627 | /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */ |
6628 | if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING || |
6629 | type == MYSQL_TYPE_BLOB) { |
6630 | |
6631 | /* |
6632 | This is technically a NOPAD collation but it's a binary collation |
6633 | that we can handle. |
6634 | */ |
6635 | if (RDB_INDEX_COLLATIONS.find(field->charset()->number) != |
6636 | RDB_INDEX_COLLATIONS.end()) |
6637 | return false; |
6638 | |
6639 | return (field->charset()->state & MY_CS_NOPAD); |
6640 | } |
6641 | return false; |
6642 | } |
6643 | |
6644 | |
6645 | /* |
6646 | Create structures needed for storing data in rocksdb. This is called when the |
6647 | table is created. The structures will be shared by all TABLE* objects. |
6648 | |
6649 | @param |
6650 | table_arg Table with definition |
6651 | db_table "dbname.tablename" |
6652 | len strlen of the above |
6653 | tbl_def_arg tbl_def whose key_descr is being created/populated |
6654 | old_tbl_def_arg tbl_def from which keys are being copied over from |
6655 | (for use during inplace alter) |
6656 | |
6657 | @return |
6658 | 0 - Ok |
6659 | other - error, either given table ddl is not supported by rocksdb or OOM. |
6660 | */ |
6661 | int ha_rocksdb::create_key_defs( |
6662 | const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg, |
6663 | const TABLE *const old_table_arg /* = nullptr */, |
6664 | const Rdb_tbl_def *const old_tbl_def_arg |
6665 | /* = nullptr */) const { |
6666 | DBUG_ENTER_FUNC(); |
6667 | |
6668 | DBUG_ASSERT(table_arg != nullptr); |
6669 | DBUG_ASSERT(table_arg->s != nullptr); |
6670 | |
6671 | uint i; |
6672 | |
6673 | /* |
6674 | These need to be one greater than MAX_INDEXES since the user can create |
6675 | MAX_INDEXES secondary keys and no primary key which would cause us |
6676 | to generate a hidden one. |
6677 | */ |
6678 | std::array<key_def_cf_info, MAX_INDEXES + 1> cfs; |
6679 | |
6680 | /* |
6681 | NOTE: All new column families must be created before new index numbers are |
6682 | allocated to each key definition. See below for more details. |
6683 | http://github.com/MySQLOnRocksDB/mysql-5.6/issues/86#issuecomment-138515501 |
6684 | */ |
6685 | if (create_cfs(table_arg, tbl_def_arg, &cfs)) { |
6686 | DBUG_RETURN(HA_EXIT_FAILURE); |
6687 | } |
6688 | |
6689 | if (!old_tbl_def_arg) { |
6690 | /* |
6691 | old_tbl_def doesn't exist. this means we are in the process of creating |
6692 | a new table. |
6693 | |
6694 | Get the index numbers (this will update the next_index_number) |
6695 | and create Rdb_key_def structures. |
6696 | */ |
6697 | for (i = 0; i < tbl_def_arg->m_key_count; i++) { |
6698 | if (create_key_def(table_arg, i, tbl_def_arg, &m_key_descr_arr[i], |
6699 | cfs[i])) { |
6700 | DBUG_RETURN(HA_EXIT_FAILURE); |
6701 | } |
6702 | } |
6703 | } else { |
6704 | /* |
6705 | old_tbl_def exists. This means we are creating a new tbl_def as part of |
6706 | in-place alter table. Copy over existing keys from the old_tbl_def and |
6707 | generate the necessary new key definitions if any. |
6708 | */ |
6709 | if (create_inplace_key_defs(table_arg, tbl_def_arg, old_table_arg, |
6710 | old_tbl_def_arg, cfs)) { |
6711 | DBUG_RETURN(HA_EXIT_FAILURE); |
6712 | } |
6713 | } |
6714 | |
6715 | DBUG_RETURN(HA_EXIT_SUCCESS); |
6716 | } |
6717 | |
6718 | /* |
6719 | Checks index parameters and creates column families needed for storing data |
6720 | in rocksdb if necessary. |
6721 | |
6722 | @param in |
6723 | table_arg Table with definition |
6724 | db_table Table name |
6725 | tbl_def_arg Table def structure being populated |
6726 | |
6727 | @param out |
6728 | cfs CF info for each key definition in 'key_info' order |
6729 | |
6730 | @return |
6731 | 0 - Ok |
6732 | other - error |
6733 | */ |
6734 | int ha_rocksdb::create_cfs( |
6735 | const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg, |
6736 | std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs) const { |
6737 | DBUG_ENTER_FUNC(); |
6738 | |
6739 | DBUG_ASSERT(table_arg != nullptr); |
6740 | DBUG_ASSERT(table_arg->s != nullptr); |
6741 | DBUG_ASSERT(tbl_def_arg != nullptr); |
6742 | |
6743 | char tablename_sys[NAME_LEN + 1]; |
6744 | bool tsys_set= false; |
6745 | |
6746 | /* |
6747 | The first loop checks the index parameters and creates |
6748 | column families if necessary. |
6749 | */ |
6750 | for (uint i = 0; i < tbl_def_arg->m_key_count; i++) { |
6751 | rocksdb::ColumnFamilyHandle *cf_handle; |
6752 | |
6753 | if (!is_hidden_pk(i, table_arg, tbl_def_arg) && |
6754 | tbl_def_arg->base_tablename().find(tmp_file_prefix) != 0) { |
6755 | if (!tsys_set) |
6756 | { |
6757 | tsys_set= true; |
6758 | my_core::filename_to_tablename(tbl_def_arg->base_tablename().c_str(), |
6759 | tablename_sys, sizeof(tablename_sys)); |
6760 | } |
6761 | |
6762 | for (uint part = 0; part < table_arg->key_info[i].ext_key_parts; |
6763 | part++) |
6764 | { |
6765 | /* MariaDB: disallow NOPAD collations */ |
6766 | if (rdb_field_uses_nopad_collation( |
6767 | table_arg->key_info[i].key_part[part].field)) |
6768 | { |
6769 | my_error(ER_MYROCKS_CANT_NOPAD_COLLATION, MYF(0)); |
6770 | DBUG_RETURN(HA_EXIT_FAILURE); |
6771 | } |
6772 | |
6773 | if (rocksdb_strict_collation_check && |
6774 | !rdb_is_index_collation_supported( |
6775 | table_arg->key_info[i].key_part[part].field) && |
6776 | !rdb_collation_exceptions->matches(tablename_sys)) { |
6777 | |
6778 | char buf[1024]; |
6779 | my_snprintf(buf, sizeof(buf), |
6780 | "Indexed column %s.%s uses a collation that does not " |
6781 | "allow index-only access in secondary key and has " |
6782 | "reduced disk space efficiency in primary key." , |
6783 | tbl_def_arg->full_tablename().c_str(), |
6784 | table_arg->key_info[i].key_part[part].field->field_name.str); |
6785 | |
6786 | my_error(ER_INTERNAL_ERROR, MYF(ME_JUST_WARNING), buf); |
6787 | } |
6788 | } |
6789 | } |
6790 | |
6791 | // Internal consistency check to make sure that data in TABLE and |
6792 | // Rdb_tbl_def structures matches. Either both are missing or both are |
6793 | // specified. Yes, this is critical enough to make it into SHIP_ASSERT. |
6794 | SHIP_ASSERT(IF_PARTITIONING(!table_arg->part_info,true) == tbl_def_arg->base_partition().empty()); |
6795 | |
6796 | // Generate the name for the column family to use. |
6797 | bool per_part_match_found = false; |
6798 | std::string cf_name = generate_cf_name(i, table_arg, tbl_def_arg, |
6799 | &per_part_match_found); |
6800 | |
6801 | // Prevent create from using the system column family. |
6802 | if (cf_name == DEFAULT_SYSTEM_CF_NAME) { |
6803 | my_error(ER_WRONG_ARGUMENTS, MYF(0), |
6804 | "column family not valid for storing index data." ); |
6805 | DBUG_RETURN(HA_EXIT_FAILURE); |
6806 | } |
6807 | |
6808 | // Here's how `get_or_create_cf` will use the input parameters: |
6809 | // |
6810 | // `cf_name` - will be used as a CF name. |
6811 | cf_handle = cf_manager.get_or_create_cf(rdb, cf_name); |
6812 | |
6813 | if (!cf_handle) { |
6814 | DBUG_RETURN(HA_EXIT_FAILURE); |
6815 | } |
6816 | |
6817 | auto &cf = (*cfs)[i]; |
6818 | |
6819 | cf.cf_handle = cf_handle; |
6820 | cf.is_reverse_cf = Rdb_cf_manager::is_cf_name_reverse(cf_name.c_str()); |
6821 | cf.is_per_partition_cf = per_part_match_found; |
6822 | } |
6823 | |
6824 | DBUG_RETURN(HA_EXIT_SUCCESS); |
6825 | } |
6826 | |
6827 | /* |
6828 | Create key definition needed for storing data in rocksdb during ADD index |
6829 | inplace operations. |
6830 | |
6831 | @param in |
6832 | table_arg Table with definition |
6833 | tbl_def_arg New table def structure being populated |
6834 | old_tbl_def_arg Old(current) table def structure |
6835 | cfs Struct array which contains column family information |
6836 | |
6837 | @return |
6838 | 0 - Ok |
6839 | other - error, either given table ddl is not supported by rocksdb or OOM. |
6840 | */ |
6841 | int ha_rocksdb::create_inplace_key_defs( |
6842 | const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg, |
6843 | const TABLE *const old_table_arg, const Rdb_tbl_def *const old_tbl_def_arg, |
6844 | const std::array<key_def_cf_info, MAX_INDEXES + 1> &cfs) const { |
6845 | DBUG_ENTER_FUNC(); |
6846 | |
6847 | DBUG_ASSERT(table_arg != nullptr); |
6848 | DBUG_ASSERT(tbl_def_arg != nullptr); |
6849 | DBUG_ASSERT(old_tbl_def_arg != nullptr); |
6850 | |
6851 | std::shared_ptr<Rdb_key_def> *const old_key_descr = |
6852 | old_tbl_def_arg->m_key_descr_arr; |
6853 | std::shared_ptr<Rdb_key_def> *const new_key_descr = |
6854 | tbl_def_arg->m_key_descr_arr; |
6855 | const std::unordered_map<std::string, uint> old_key_pos = |
6856 | get_old_key_positions(table_arg, tbl_def_arg, old_table_arg, |
6857 | old_tbl_def_arg); |
6858 | |
6859 | uint i; |
6860 | for (i = 0; i < tbl_def_arg->m_key_count; i++) { |
6861 | const auto &it = old_key_pos.find(get_key_name(i, table_arg, tbl_def_arg)); |
6862 | |
6863 | if (it != old_key_pos.end()) { |
6864 | /* |
6865 | Found matching index in old table definition, so copy it over to the |
6866 | new one created. |
6867 | */ |
6868 | const Rdb_key_def &okd = *old_key_descr[it->second]; |
6869 | |
6870 | const GL_INDEX_ID gl_index_id = okd.get_gl_index_id(); |
6871 | struct Rdb_index_info index_info; |
6872 | if (!dict_manager.get_index_info(gl_index_id, &index_info)) { |
6873 | // NO_LINT_DEBUG |
6874 | sql_print_error("RocksDB: Could not get index information " |
6875 | "for Index Number (%u,%u), table %s" , |
6876 | gl_index_id.cf_id, gl_index_id.index_id, |
6877 | old_tbl_def_arg->full_tablename().c_str()); |
6878 | DBUG_RETURN(HA_EXIT_FAILURE); |
6879 | } |
6880 | |
6881 | uint32 ttl_rec_offset = |
6882 | Rdb_key_def::has_index_flag(index_info.m_index_flags, |
6883 | Rdb_key_def::TTL_FLAG) |
6884 | ? Rdb_key_def::calculate_index_flag_offset( |
6885 | index_info.m_index_flags, Rdb_key_def::TTL_FLAG) |
6886 | : UINT_MAX; |
6887 | |
6888 | /* |
6889 | We can't use the copy constructor because we need to update the |
6890 | keynr within the pack_info for each field and the keyno of the keydef |
6891 | itself. |
6892 | */ |
6893 | new_key_descr[i] = std::make_shared<Rdb_key_def>( |
6894 | okd.get_index_number(), i, okd.get_cf(), |
6895 | index_info.m_index_dict_version, index_info.m_index_type, |
6896 | index_info.m_kv_version, okd.m_is_reverse_cf, |
6897 | okd.m_is_per_partition_cf, okd.m_name.c_str(), |
6898 | dict_manager.get_stats(gl_index_id), index_info.m_index_flags, |
6899 | ttl_rec_offset, index_info.m_ttl_duration); |
6900 | } else if (create_key_def(table_arg, i, tbl_def_arg, &new_key_descr[i], |
6901 | cfs[i])) { |
6902 | DBUG_RETURN(HA_EXIT_FAILURE); |
6903 | } |
6904 | |
6905 | DBUG_ASSERT(new_key_descr[i] != nullptr); |
6906 | new_key_descr[i]->setup(table_arg, tbl_def_arg); |
6907 | } |
6908 | |
6909 | DBUG_RETURN(HA_EXIT_SUCCESS); |
6910 | } |
6911 | |
/*
  Build a map of key name -> position in the OLD table definition, containing
  only those old keys that can be carried over unchanged into the new
  definition: hidden PKs, and keys whose name, flags (modulo a
  unique -> non-unique change) and key parts all match the new table.
*/
std::unordered_map<std::string, uint> ha_rocksdb::get_old_key_positions(
    const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg,
    const Rdb_tbl_def *const old_tbl_def_arg) const {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table_arg != nullptr);
  DBUG_ASSERT(old_table_arg != nullptr);
  DBUG_ASSERT(tbl_def_arg != nullptr);
  DBUG_ASSERT(old_tbl_def_arg != nullptr);

  std::shared_ptr<Rdb_key_def> *const old_key_descr =
      old_tbl_def_arg->m_key_descr_arr;
  std::unordered_map<std::string, uint> old_key_pos;
  std::unordered_map<std::string, uint> new_key_pos;
  uint i;

  /* Index the new table's key names by position for the lookups below. */
  for (i = 0; i < tbl_def_arg->m_key_count; i++) {
    new_key_pos[get_key_name(i, table_arg, tbl_def_arg)] = i;
  }

  for (i = 0; i < old_tbl_def_arg->m_key_count; i++) {
    if (is_hidden_pk(i, old_table_arg, old_tbl_def_arg)) {
      /* A hidden PK is always carried over as-is. */
      old_key_pos[old_key_descr[i]->m_name] = i;
      continue;
    }

    /*
      In case of matching key name, need to check key parts of keys as well,
      in case a simultaneous drop + add is performed, where the key name is the
      same but the key parts are different.

      Example:
      CREATE TABLE t1 (a INT, b INT, KEY ka(a)) ENGINE=RocksDB;
      ALTER TABLE t1 DROP INDEX ka, ADD INDEX ka(b), ALGORITHM=INPLACE;
    */
    const KEY *const old_key = &old_table_arg->key_info[i];
    const auto &it = new_key_pos.find(old_key->name.str);
    if (it == new_key_pos.end()) {
      continue;
    }

    KEY *const new_key = &table_arg->key_info[it->second];

    /*
      Check that the key is identical between old and new tables.
      If not, we still need to create a new index.

      The exception is if there is an index changed from unique to non-unique,
      in these cases we don't need to rebuild as they are stored the same way in
      RocksDB.
    */
    bool unique_to_non_unique =
        ((old_key->flags ^ new_key->flags) == HA_NOSAME) &&
        (old_key->flags & HA_NOSAME);

    if (compare_keys(old_key, new_key) && !unique_to_non_unique) {
      continue;
    }

    /* Check to make sure key parts match. */
    if (compare_key_parts(old_key, new_key)) {
      continue;
    }

    old_key_pos[old_key->name.str] = i;
  }

  DBUG_RETURN(old_key_pos);
}
6982 | |
6983 | /* Check to see if two keys are identical. */ |
6984 | int ha_rocksdb::compare_keys(const KEY *const old_key, |
6985 | const KEY *const new_key) const { |
6986 | DBUG_ENTER_FUNC(); |
6987 | |
6988 | DBUG_ASSERT(old_key != nullptr); |
6989 | DBUG_ASSERT(new_key != nullptr); |
6990 | |
6991 | /* Check index name. */ |
6992 | if (strcmp(old_key->name.str, new_key->name.str) != 0) { |
6993 | DBUG_RETURN(HA_EXIT_FAILURE); |
6994 | } |
6995 | |
6996 | /* If index algorithms are different then keys are different. */ |
6997 | if (old_key->algorithm != new_key->algorithm) { |
6998 | DBUG_RETURN(HA_EXIT_FAILURE); |
6999 | } |
7000 | |
7001 | /* Check that the key is identical between old and new tables. */ |
7002 | if ((old_key->flags ^ new_key->flags) & HA_KEYFLAG_MASK) { |
7003 | DBUG_RETURN(HA_EXIT_FAILURE); |
7004 | } |
7005 | |
7006 | /* Check index comment. (for column family changes) */ |
7007 | std::string (old_key->comment.str, old_key->comment.length); |
7008 | std::string (new_key->comment.str, new_key->comment.length); |
7009 | if (old_comment.compare(new_comment) != 0) { |
7010 | DBUG_RETURN(HA_EXIT_FAILURE); |
7011 | } |
7012 | |
7013 | DBUG_RETURN(HA_EXIT_SUCCESS); |
7014 | } |
7015 | |
7016 | /* Check two keys to ensure that key parts within keys match */ |
7017 | int ha_rocksdb::compare_key_parts(const KEY *const old_key, |
7018 | const KEY *const new_key) const { |
7019 | DBUG_ENTER_FUNC(); |
7020 | |
7021 | DBUG_ASSERT(old_key != nullptr); |
7022 | DBUG_ASSERT(new_key != nullptr); |
7023 | |
7024 | /* Skip if key parts do not match, as it is a different key */ |
7025 | if (new_key->user_defined_key_parts != old_key->user_defined_key_parts) { |
7026 | DBUG_RETURN(HA_EXIT_FAILURE); |
7027 | } |
7028 | |
7029 | /* Check to see that key parts themselves match */ |
7030 | for (uint i = 0; i < old_key->user_defined_key_parts; i++) { |
7031 | if (strcmp(old_key->key_part[i].field->field_name.str, |
7032 | new_key->key_part[i].field->field_name.str) != 0) { |
7033 | DBUG_RETURN(HA_EXIT_FAILURE); |
7034 | } |
7035 | |
7036 | /* Check if prefix index key part length has changed */ |
7037 | if (old_key->key_part[i].length != new_key->key_part[i].length) { |
7038 | DBUG_RETURN(HA_EXIT_FAILURE); |
7039 | } |
7040 | } |
7041 | |
7042 | DBUG_RETURN(HA_EXIT_SUCCESS); |
7043 | } |
7044 | |
7045 | /* |
7046 | Create key definition needed for storing data in rocksdb. |
7047 | This can be called either during CREATE table or doing ADD index operations. |
7048 | |
7049 | @param in |
7050 | table_arg Table with definition |
7051 | i Position of index being created inside table_arg->key_info |
7052 | tbl_def_arg Table def structure being populated |
7053 | cf_info Struct which contains column family information |
7054 | |
7055 | @param out |
7056 | new_key_def Newly created index definition. |
7057 | |
7058 | @return |
7059 | 0 - Ok |
7060 | other - error, either given table ddl is not supported by rocksdb or OOM. |
7061 | */ |
7062 | int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint &i, |
7063 | const Rdb_tbl_def *const tbl_def_arg, |
7064 | std::shared_ptr<Rdb_key_def> *const new_key_def, |
7065 | const struct key_def_cf_info &cf_info) const { |
7066 | DBUG_ENTER_FUNC(); |
7067 | |
7068 | DBUG_ASSERT(new_key_def != nullptr); |
7069 | DBUG_ASSERT(*new_key_def == nullptr); |
7070 | |
7071 | uint64 ttl_duration = 0; |
7072 | std::string ttl_column; |
7073 | uint ttl_field_offset; |
7074 | |
7075 | uint err; |
7076 | if ((err = Rdb_key_def::extract_ttl_duration(table_arg, tbl_def_arg, |
7077 | &ttl_duration))) { |
7078 | DBUG_RETURN(err); |
7079 | } |
7080 | |
7081 | if ((err = Rdb_key_def::extract_ttl_col(table_arg, tbl_def_arg, &ttl_column, |
7082 | &ttl_field_offset))) { |
7083 | DBUG_RETURN(err); |
7084 | } |
7085 | |
7086 | /* We don't currently support TTL on tables with hidden primary keys. */ |
7087 | if (ttl_duration > 0 && is_hidden_pk(i, table_arg, tbl_def_arg)) { |
7088 | my_error(ER_RDB_TTL_UNSUPPORTED, MYF(0)); |
7089 | DBUG_RETURN(HA_EXIT_FAILURE); |
7090 | } |
7091 | |
7092 | /* |
7093 | If TTL duration is not specified but TTL column was specified, throw an |
7094 | error because TTL column requires duration. |
7095 | */ |
7096 | if (ttl_duration == 0 && !ttl_column.empty()) { |
7097 | my_error(ER_RDB_TTL_COL_FORMAT, MYF(0), ttl_column.c_str()); |
7098 | DBUG_RETURN(HA_EXIT_FAILURE); |
7099 | } |
7100 | |
7101 | const uint index_id = ddl_manager.get_and_update_next_number(&dict_manager); |
7102 | const uint16_t index_dict_version = Rdb_key_def::INDEX_INFO_VERSION_LATEST; |
7103 | uchar index_type; |
7104 | uint16_t kv_version; |
7105 | |
7106 | if (is_hidden_pk(i, table_arg, tbl_def_arg)) { |
7107 | index_type = Rdb_key_def::INDEX_TYPE_HIDDEN_PRIMARY; |
7108 | kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST; |
7109 | } else if (i == table_arg->s->primary_key) { |
7110 | index_type = Rdb_key_def::INDEX_TYPE_PRIMARY; |
7111 | uint16 pk_latest_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST; |
7112 | kv_version = pk_latest_version; |
7113 | } else { |
7114 | index_type = Rdb_key_def::INDEX_TYPE_SECONDARY; |
7115 | uint16 sk_latest_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_LATEST; |
7116 | kv_version = sk_latest_version; |
7117 | } |
7118 | |
7119 | // Use PRIMARY_FORMAT_VERSION_UPDATE1 here since it is the same value as |
7120 | // SECONDARY_FORMAT_VERSION_UPDATE1 so it doesn't matter if this is a |
7121 | // primary key or secondary key. |
7122 | DBUG_EXECUTE_IF("MYROCKS_LEGACY_VARBINARY_FORMAT" , { |
7123 | kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_UPDATE1; |
7124 | }); |
7125 | |
7126 | DBUG_EXECUTE_IF("MYROCKS_NO_COVERED_BITMAP_FORMAT" , { |
7127 | if (index_type == Rdb_key_def::INDEX_TYPE_SECONDARY) { |
7128 | kv_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_UPDATE2; |
7129 | } |
7130 | }); |
7131 | |
7132 | uint32 index_flags = (ttl_duration > 0 ? Rdb_key_def::TTL_FLAG : 0); |
7133 | |
7134 | uint32 ttl_rec_offset = |
7135 | Rdb_key_def::has_index_flag(index_flags, Rdb_key_def::TTL_FLAG) |
7136 | ? Rdb_key_def::calculate_index_flag_offset(index_flags, |
7137 | Rdb_key_def::TTL_FLAG) |
7138 | : UINT_MAX; |
7139 | |
7140 | const char *const key_name = get_key_name(i, table_arg, m_tbl_def); |
7141 | *new_key_def = std::make_shared<Rdb_key_def>( |
7142 | index_id, i, cf_info.cf_handle, index_dict_version, index_type, |
7143 | kv_version, cf_info.is_reverse_cf, cf_info.is_per_partition_cf, key_name, |
7144 | Rdb_index_stats(), index_flags, ttl_rec_offset, ttl_duration); |
7145 | |
7146 | if (!ttl_column.empty()) { |
7147 | (*new_key_def)->m_ttl_column = ttl_column; |
7148 | } |
7149 | |
7150 | DBUG_RETURN(HA_EXIT_SUCCESS); |
7151 | } |
7152 | |
7153 | int rdb_normalize_tablename(const std::string &tablename, |
7154 | std::string *const strbuf) { |
7155 | DBUG_ASSERT(strbuf != nullptr); |
7156 | |
7157 | if (tablename.size() < 2 || tablename[0] != '.' || |
7158 | (tablename[1] != FN_LIBCHAR && tablename[1] != FN_LIBCHAR2)) { |
7159 | DBUG_ASSERT(0); // We were not passed table name? |
7160 | return HA_ERR_ROCKSDB_INVALID_TABLE; |
7161 | } |
7162 | |
7163 | size_t pos = tablename.find_first_of(FN_LIBCHAR, 2); |
7164 | if (pos == std::string::npos) { |
7165 | pos = tablename.find_first_of(FN_LIBCHAR2, 2); |
7166 | } |
7167 | |
7168 | if (pos == std::string::npos) { |
7169 | DBUG_ASSERT(0); // We were not passed table name? |
7170 | return HA_ERR_ROCKSDB_INVALID_TABLE; |
7171 | } |
7172 | |
7173 | *strbuf = tablename.substr(2, pos - 2) + "." + tablename.substr(pos + 1); |
7174 | |
7175 | return HA_EXIT_SUCCESS; |
7176 | } |
7177 | |
7178 | /* |
7179 | Check to see if the user's original statement includes foreign key |
7180 | references |
7181 | */ |
7182 | bool ha_rocksdb::contains_foreign_key(THD *const thd) { |
7183 | bool success; |
7184 | const char *str = thd_query_string(thd)->str; |
7185 | |
7186 | DBUG_ASSERT(str != nullptr); |
7187 | |
7188 | while (*str != '\0') { |
7189 | // Scan from our current pos looking for 'FOREIGN' |
7190 | str = rdb_find_in_string(str, "FOREIGN" , &success); |
7191 | if (!success) { |
7192 | return false; |
7193 | } |
7194 | |
7195 | // Skip past the found "FOREIGN' |
7196 | str = rdb_check_next_token(&my_charset_bin, str, "FOREIGN" , &success); |
7197 | DBUG_ASSERT(success); |
7198 | |
7199 | if (!my_isspace(&my_charset_bin, *str)) { |
7200 | return false; |
7201 | } |
7202 | |
7203 | // See if the next token is 'KEY' |
7204 | str = rdb_check_next_token(&my_charset_bin, str, "KEY" , &success); |
7205 | if (!success) { |
7206 | continue; |
7207 | } |
7208 | |
7209 | // See if the next token is '(' |
7210 | str = rdb_check_next_token(&my_charset_bin, str, "(" , &success); |
7211 | if (!success) { |
7212 | // There is an optional index id after 'FOREIGN KEY', skip it |
7213 | str = rdb_skip_id(&my_charset_bin, str); |
7214 | |
7215 | // Now check for '(' again |
7216 | str = rdb_check_next_token(&my_charset_bin, str, "(" , &success); |
7217 | } |
7218 | |
7219 | // If we have found 'FOREIGN KEY [<word>] (' we can be confident we have |
7220 | // a foreign key clause. |
7221 | return success; |
7222 | } |
7223 | |
7224 | // We never found a valid foreign key clause |
7225 | return false; |
7226 | } |
7227 | |
7228 | /** |
7229 | @brief |
7230 | splits the normalized table name of <dbname>.<tablename>#P#<part_no> into |
7231 | the <dbname>, <tablename> and <part_no> components. |
7232 | |
7233 | @param dbbuf returns database name/table_schema |
7234 | @param tablebuf returns tablename |
7235 | @param partitionbuf returns partition suffix if there is one |
7236 | @return HA_EXIT_SUCCESS on success, non-zero on failure to split |
7237 | */ |
7238 | int rdb_split_normalized_tablename(const std::string &fullname, |
7239 | std::string *const db, |
7240 | std::string *const table, |
7241 | std::string *const partition) { |
7242 | DBUG_ASSERT(!fullname.empty()); |
7243 | |
7244 | #define RDB_PARTITION_STR "#P#" |
7245 | |
7246 | /* Normalize returns dbname.tablename. */ |
7247 | size_t dotpos = fullname.find('.'); |
7248 | |
7249 | /* Invalid table name? */ |
7250 | if (dotpos == std::string::npos) { |
7251 | return HA_ERR_ROCKSDB_INVALID_TABLE; |
7252 | } |
7253 | |
7254 | // Table must have a database name associated with it. |
7255 | DBUG_ASSERT(dotpos > 0); |
7256 | |
7257 | if (db != nullptr) { |
7258 | *db = fullname.substr(0, dotpos); |
7259 | } |
7260 | |
7261 | dotpos++; |
7262 | |
7263 | const size_t partpos = |
7264 | fullname.find(RDB_PARTITION_STR, dotpos, strlen(RDB_PARTITION_STR)); |
7265 | |
7266 | if (partpos != std::string::npos) { |
7267 | DBUG_ASSERT(partpos >= dotpos); |
7268 | |
7269 | if (table != nullptr) { |
7270 | *table = fullname.substr(dotpos, partpos - dotpos); |
7271 | } |
7272 | |
7273 | if (partition != nullptr) { |
7274 | *partition = fullname.substr(partpos + strlen(RDB_PARTITION_STR)); |
7275 | } |
7276 | } else if (table != nullptr) { |
7277 | *table = fullname.substr(dotpos); |
7278 | } |
7279 | |
7280 | return HA_EXIT_SUCCESS; |
7281 | } |
7282 | |
7283 | /** |
7284 | @brief |
7285 | create() is called to create a database. The variable name will have the name |
7286 | of the table. |
7287 | |
7288 | @details |
7289 | When create() is called you do not need to worry about |
7290 | opening the table. Also, the .frm file will have already been |
7291 | created so adjusting create_info is not necessary. You can overwrite |
7292 | the .frm file at this point if you wish to change the table |
7293 | definition, but there are no methods currently provided for doing |
7294 | so. |
7295 | |
7296 | Called from handle.cc by ha_create_table(). |
7297 | |
7298 | @return |
7299 | HA_EXIT_SUCCESS OK |
7300 | other HA_ERR error code (can be SE-specific) |
7301 | |
7302 | @see |
7303 | ha_create_table() in handle.cc |
7304 | */ |
7305 | |
7306 | int ha_rocksdb::create(const char *const name, TABLE *const table_arg, |
7307 | HA_CREATE_INFO *const create_info) { |
7308 | DBUG_ENTER_FUNC(); |
7309 | |
7310 | DBUG_ASSERT(table_arg != nullptr); |
7311 | DBUG_ASSERT(create_info != nullptr); |
7312 | |
7313 | if (create_info->data_file_name) { |
7314 | // DATA DIRECTORY is used to create tables under a specific location |
7315 | // outside the MySQL data directory. We don't support this for MyRocks. |
7316 | // The `rocksdb_datadir` setting should be used to configure RocksDB data |
7317 | // directory. |
7318 | print_error(HA_ERR_ROCKSDB_TABLE_DATA_DIRECTORY_NOT_SUPPORTED, |
7319 | MYF(ME_JUST_WARNING)); |
7320 | DBUG_RETURN(HA_WRONG_CREATE_OPTION); |
7321 | } |
7322 | |
7323 | if (create_info->index_file_name) { |
7324 | // Similar check for INDEX DIRECTORY as well. |
7325 | print_error(HA_ERR_ROCKSDB_TABLE_INDEX_DIRECTORY_NOT_SUPPORTED, |
7326 | MYF(ME_JUST_WARNING)); |
7327 | DBUG_RETURN(HA_WRONG_CREATE_OPTION); |
7328 | } |
7329 | |
7330 | std::string str; |
7331 | int err; |
7332 | |
7333 | THD *const thd = my_core::thd_get_current_thd(); |
7334 | |
7335 | if (get_table_if_exists(name)) { |
7336 | if (thd->lex->sql_command == SQLCOM_TRUNCATE) { |
7337 | err = delete_table(name); |
7338 | if (err != HA_EXIT_SUCCESS) { |
7339 | DBUG_RETURN(err); |
7340 | } |
7341 | } else { |
7342 | err = rdb_normalize_tablename(name, &str); |
7343 | if (err != HA_EXIT_SUCCESS) { |
7344 | DBUG_RETURN(err); |
7345 | } |
7346 | my_error(ER_METADATA_INCONSISTENCY, MYF(0), str.c_str(), name); |
7347 | DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA); |
7348 | } |
7349 | } |
7350 | |
7351 | /* |
7352 | Construct dbname.tablename ourselves, because parititioning |
7353 | passes strings like "./test/t14#P#p0" for individual partitions, |
7354 | while table_arg->s->table_name has none of that. |
7355 | */ |
7356 | err = rdb_normalize_tablename(name, &str); |
7357 | if (err != HA_EXIT_SUCCESS) { |
7358 | DBUG_RETURN(err); |
7359 | } |
7360 | |
7361 | if (contains_foreign_key(thd)) { |
7362 | my_error(ER_NOT_SUPPORTED_YET, MYF(0), |
7363 | "FOREIGN KEY for the RocksDB storage engine" ); |
7364 | DBUG_RETURN(HA_ERR_UNSUPPORTED); |
7365 | } |
7366 | |
7367 | const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin(); |
7368 | rocksdb::WriteBatch *const batch = wb.get(); |
7369 | |
7370 | /* Create table/key descriptions and put them into the data dictionary */ |
7371 | m_tbl_def = new Rdb_tbl_def(str); |
7372 | |
7373 | uint n_keys = table_arg->s->keys; |
7374 | |
7375 | /* |
7376 | If no primary key found, create a hidden PK and place it inside table |
7377 | definition |
7378 | */ |
7379 | if (has_hidden_pk(table_arg)) { |
7380 | n_keys += 1; |
7381 | } |
7382 | |
7383 | m_key_descr_arr = new std::shared_ptr<Rdb_key_def>[n_keys]; |
7384 | m_tbl_def->m_key_count = n_keys; |
7385 | m_tbl_def->m_key_descr_arr = m_key_descr_arr; |
7386 | |
7387 | err = create_key_defs(table_arg, m_tbl_def); |
7388 | if (err != HA_EXIT_SUCCESS) { |
7389 | goto error; |
7390 | } |
7391 | |
7392 | m_pk_descr = m_key_descr_arr[pk_index(table_arg, m_tbl_def)]; |
7393 | |
7394 | if (create_info->auto_increment_value) { |
7395 | bool autoinc_upgrade_test = false; |
7396 | m_tbl_def->m_auto_incr_val = create_info->auto_increment_value; |
7397 | DBUG_EXECUTE_IF("myrocks_autoinc_upgrade" , autoinc_upgrade_test = true;); |
7398 | if (!autoinc_upgrade_test) { |
7399 | auto s = dict_manager.put_auto_incr_val( |
7400 | batch, m_tbl_def->get_autoincr_gl_index_id(), |
7401 | m_tbl_def->m_auto_incr_val); |
7402 | if (!s.ok()) { |
7403 | goto error; |
7404 | } |
7405 | } |
7406 | } |
7407 | |
7408 | dict_manager.lock(); |
7409 | err = ddl_manager.put_and_write(m_tbl_def, batch); |
7410 | if (err != HA_EXIT_SUCCESS) { |
7411 | dict_manager.unlock(); |
7412 | goto error; |
7413 | } |
7414 | |
7415 | err = dict_manager.commit(batch); |
7416 | if (err != HA_EXIT_SUCCESS) { |
7417 | dict_manager.unlock(); |
7418 | goto error; |
7419 | } |
7420 | |
7421 | dict_manager.unlock(); |
7422 | |
7423 | DBUG_RETURN(HA_EXIT_SUCCESS); |
7424 | |
7425 | error: |
7426 | /* Delete what we have allocated so far */ |
7427 | delete m_tbl_def; |
7428 | m_tbl_def = nullptr; |
7429 | m_key_descr_arr = nullptr; |
7430 | |
7431 | DBUG_RETURN(err); |
7432 | } |
7433 | |
7434 | /** |
7435 | @note |
7436 | This function is used only when the table has not yet been opened, and |
7437 | keyread_allowed bitmap doesn't have the correct values yet. |
7438 | |
7439 | See comment in ha_rocksdb::index_flags() for details. |
7440 | */ |
7441 | |
7442 | bool ha_rocksdb::check_keyread_allowed(uint inx, uint part, |
7443 | bool all_parts) const { |
7444 | bool res = true; |
7445 | KEY *const key_info = &table_share->key_info[inx]; |
7446 | |
7447 | Rdb_field_packing dummy1; |
7448 | res = dummy1.setup(nullptr, key_info->key_part[part].field, inx, part, |
7449 | key_info->key_part[part].length); |
7450 | |
7451 | if (res && all_parts) { |
7452 | for (uint i = 0; i < part; i++) { |
7453 | Field *field; |
7454 | if ((field = key_info->key_part[i].field)) { |
7455 | Rdb_field_packing dummy; |
7456 | if (!dummy.setup(nullptr, field, inx, i, |
7457 | key_info->key_part[i].length)) { |
7458 | /* Cannot do index-only reads for this column */ |
7459 | res = false; |
7460 | break; |
7461 | } |
7462 | } |
7463 | } |
7464 | } |
7465 | |
7466 | const uint pk = table_share->primary_key; |
7467 | if (inx == pk && all_parts && |
7468 | part + 1 == table_share->key_info[pk].user_defined_key_parts) { |
7469 | m_pk_can_be_decoded = res; |
7470 | } |
7471 | |
7472 | return res; |
7473 | } |
7474 | |
7475 | int ha_rocksdb::read_key_exact(const Rdb_key_def &kd, |
7476 | rocksdb::Iterator *const iter, |
7477 | const bool &full_key_match, |
7478 | const rocksdb::Slice &key_slice, |
7479 | const int64_t ttl_filter_ts) { |
7480 | DBUG_ASSERT(iter != nullptr); |
7481 | |
7482 | /* |
7483 | We are looking for the first record such that |
7484 | index_tuple= lookup_tuple. |
7485 | lookup_tuple may be a prefix of the index. |
7486 | */ |
7487 | rocksdb_smart_seek(kd.m_is_reverse_cf, iter, key_slice); |
7488 | |
7489 | while (iter->Valid() && kd.value_matches_prefix(iter->key(), key_slice)) { |
7490 | /* |
7491 | If TTL is enabled we need to check if the given key has already expired |
7492 | from the POV of the current transaction. If it has, try going to the next |
7493 | key. |
7494 | */ |
7495 | if (kd.has_ttl() && should_hide_ttl_rec(kd, iter->value(), ttl_filter_ts)) { |
7496 | rocksdb_smart_next(kd.m_is_reverse_cf, iter); |
7497 | continue; |
7498 | } |
7499 | |
7500 | return HA_EXIT_SUCCESS; |
7501 | } |
7502 | |
7503 | /* |
7504 | Got a record that is not equal to the lookup value, or even a record |
7505 | from another table.index. |
7506 | */ |
7507 | return HA_ERR_KEY_NOT_FOUND; |
7508 | } |
7509 | |
7510 | int ha_rocksdb::read_before_key(const Rdb_key_def &kd, |
7511 | const bool &full_key_match, |
7512 | const rocksdb::Slice &key_slice, |
7513 | const int64_t ttl_filter_ts) { |
7514 | /* |
7515 | We are looking for record with the biggest t.key such that |
7516 | t.key < lookup_tuple. |
7517 | */ |
7518 | rocksdb_smart_seek(!kd.m_is_reverse_cf, m_scan_it, key_slice); |
7519 | |
7520 | while (is_valid(m_scan_it)) { |
7521 | /* |
7522 | We are using full key and we've hit an exact match, or... |
7523 | |
7524 | If TTL is enabled we need to check if the given key has already expired |
7525 | from the POV of the current transaction. If it has, try going to the next |
7526 | key. |
7527 | */ |
7528 | if ((full_key_match && |
7529 | kd.value_matches_prefix(m_scan_it->key(), key_slice)) || |
7530 | (kd.has_ttl() && |
7531 | should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts))) { |
7532 | rocksdb_smart_next(!kd.m_is_reverse_cf, m_scan_it); |
7533 | continue; |
7534 | } |
7535 | |
7536 | return HA_EXIT_SUCCESS; |
7537 | } |
7538 | |
7539 | return HA_ERR_KEY_NOT_FOUND; |
7540 | } |
7541 | |
7542 | int ha_rocksdb::read_after_key(const Rdb_key_def &kd, |
7543 | const rocksdb::Slice &key_slice, |
7544 | const int64_t ttl_filter_ts) { |
7545 | /* |
7546 | We are looking for the first record such that |
7547 | |
7548 | index_tuple $GT lookup_tuple |
7549 | |
7550 | with HA_READ_AFTER_KEY, $GT = '>', |
7551 | with HA_READ_KEY_OR_NEXT, $GT = '>=' |
7552 | */ |
7553 | rocksdb_smart_seek(kd.m_is_reverse_cf, m_scan_it, key_slice); |
7554 | |
7555 | /* |
7556 | If TTL is enabled we need to check if the given key has already expired |
7557 | from the POV of the current transaction. If it has, try going to the next |
7558 | key. |
7559 | */ |
7560 | while (is_valid(m_scan_it) && kd.has_ttl() && |
7561 | should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts)) { |
7562 | rocksdb_smart_next(kd.m_is_reverse_cf, m_scan_it); |
7563 | } |
7564 | |
7565 | return is_valid(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_KEY_NOT_FOUND; |
7566 | } |
7567 | |
/*
  Position m_scan_it according to the SQL-layer search mode.

  @param kd             Definition of the index being scanned
  @param find_flag      SQL-layer search mode (exact / before / after / ...)
  @param full_key_match TRUE <=> key_slice covers the whole key
  @param key            Lookup key in mysql format (used for PREFIX_LAST)
  @param keypart_map    Bitmap of the key parts present in 'key'
  @param key_slice      Packed (storage-format) lookup key
  @param move_forward   OUT: TRUE <=> subsequent reads move forward
  @param ttl_filter_ts  Timestamp for filtering out TTL-expired records

  @return HA_EXIT_SUCCESS, HA_ERR_KEY_NOT_FOUND, HA_ERR_UNSUPPORTED
*/
int ha_rocksdb::position_to_correct_key(
    const Rdb_key_def &kd, const enum ha_rkey_function &find_flag,
    const bool &full_key_match, const uchar *const key,
    const key_part_map &keypart_map, const rocksdb::Slice &key_slice,
    bool *const move_forward, const int64_t ttl_filter_ts) {
  int rc = 0;

  *move_forward = true;

  switch (find_flag) {
  case HA_READ_KEY_EXACT:
    rc =
        read_key_exact(kd, m_scan_it, full_key_match, key_slice, ttl_filter_ts);
    break;
  case HA_READ_BEFORE_KEY:
    *move_forward = false;
    rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
    if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
      /* The record we've got is not from this index */
      rc = HA_ERR_KEY_NOT_FOUND;
    }
    break;
  case HA_READ_AFTER_KEY:
  case HA_READ_KEY_OR_NEXT:
    rc = read_after_key(kd, key_slice, ttl_filter_ts);
    if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
      /* The record we've got is not from this index */
      rc = HA_ERR_KEY_NOT_FOUND;
    }
    break;
  case HA_READ_KEY_OR_PREV:
  case HA_READ_PREFIX:
    /* This flag is not used by the SQL layer, so we don't support it yet. */
    rc = HA_ERR_UNSUPPORTED;
    break;
  case HA_READ_PREFIX_LAST:
  case HA_READ_PREFIX_LAST_OR_PREV:
    *move_forward = false;
    /*
      Find the last record with the specified index prefix lookup.
      - HA_READ_PREFIX_LAST requires that the record has the
        prefix=lookup (if there are no such records,
        HA_ERR_KEY_NOT_FOUND should be returned).
      - HA_READ_PREFIX_LAST_OR_PREV has no such requirement. If there are no
        records with prefix=lookup, we should return the last record
        before that.
    */
    rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
    if (rc == 0) {
      const rocksdb::Slice &rkey = m_scan_it->key();
      if (!kd.covers_key(rkey)) {
        /* The record we've got is not from this index */
        rc = HA_ERR_KEY_NOT_FOUND;
      } else if (find_flag == HA_READ_PREFIX_LAST) {
        /* Re-pack the original lookup tuple (the caller mutated
           m_sk_packed_tuple via kd.successor() before the seek). */
        uint size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                        key, keypart_map);
        rocksdb::Slice lookup_tuple(reinterpret_cast<char *>(m_sk_packed_tuple),
                                    size);

        // We need to compare the key we've got with the original search prefix.
        if (!kd.value_matches_prefix(rkey, lookup_tuple)) {
          rc = HA_ERR_KEY_NOT_FOUND;
        }
      }
    }
    break;
  default:
    DBUG_ASSERT(0);
    break;
  }

  return rc;
}
7641 | |
7642 | int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd, |
7643 | const enum ha_rkey_function &find_flag, |
7644 | const rocksdb::Slice &slice, |
7645 | const int &bytes_changed_by_succ, |
7646 | const key_range *const end_key, |
7647 | uint *const end_key_packed_size) { |
7648 | if (find_flag == HA_READ_KEY_EXACT) |
7649 | return slice.size(); |
7650 | |
7651 | if (find_flag == HA_READ_PREFIX_LAST) { |
7652 | /* |
7653 | We have made the kd.successor(m_sk_packed_tuple) call above. |
7654 | |
7655 | The slice is at least Rdb_key_def::INDEX_NUMBER_SIZE bytes long. |
7656 | */ |
7657 | return slice.size() - bytes_changed_by_succ; |
7658 | } |
7659 | |
7660 | if (end_key) { |
7661 | *end_key_packed_size = |
7662 | kd.pack_index_tuple(table, m_pack_buffer, m_end_key_packed_tuple, |
7663 | end_key->key, end_key->keypart_map); |
7664 | |
7665 | /* |
7666 | Calculating length of the equal conditions here. 4 byte index id is |
7667 | included. |
7668 | Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3) |
7669 | WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16 |
7670 | WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12 |
7671 | Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2) |
7672 | WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes) |
7673 | */ |
7674 | rocksdb::Slice end_slice(reinterpret_cast<char *>(m_end_key_packed_tuple), |
7675 | *end_key_packed_size); |
7676 | return slice.difference_offset(end_slice); |
7677 | } |
7678 | |
7679 | /* |
7680 | On range scan without any end key condition, there is no |
7681 | eq cond, and eq cond length is the same as index_id size (4 bytes). |
7682 | Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3) |
7683 | WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4 |
7684 | */ |
7685 | return Rdb_key_def::INDEX_NUMBER_SIZE; |
7686 | } |
7687 | |
7688 | int ha_rocksdb::read_row_from_primary_key(uchar *const buf) { |
7689 | DBUG_ASSERT(buf != nullptr); |
7690 | |
7691 | int rc; |
7692 | const rocksdb::Slice &rkey = m_scan_it->key(); |
7693 | const uint pk_size = rkey.size(); |
7694 | const char *pk_data = rkey.data(); |
7695 | |
7696 | memcpy(m_pk_packed_tuple, pk_data, pk_size); |
7697 | m_last_rowkey.copy(pk_data, pk_size, &my_charset_bin); |
7698 | |
7699 | if (m_lock_rows != RDB_LOCK_NONE) { |
7700 | /* We need to put a lock and re-read */ |
7701 | rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size); |
7702 | } else { |
7703 | /* Unpack from the row we've read */ |
7704 | const rocksdb::Slice &value = m_scan_it->value(); |
7705 | rc = convert_record_from_storage_format(&rkey, &value, buf); |
7706 | } |
7707 | |
7708 | return rc; |
7709 | } |
7710 | |
/*
  Read the table row corresponding to the secondary-index entry that
  m_scan_it is positioned on.

  @param buf          OUT: the row, in table->record[0] format
  @param kd           The secondary index being scanned
  @param move_forward TRUE <=> the scan moves forward when skipping records

  If the index covers the lookup (or keyread-only mode is set) and no row
  locking is needed, the row is unpacked directly from the index entry;
  otherwise the primary key is extracted and the row fetched through it.

  @return HA_EXIT_SUCCESS, HA_ERR_ROCKSDB_CORRUPT_DATA, or other HA_ERR code
*/
int ha_rocksdb::read_row_from_secondary_key(uchar *const buf,
                                            const Rdb_key_def &kd,
                                            bool move_forward) {
  DBUG_ASSERT(buf != nullptr);

  int rc = 0;
  uint pk_size; // assigned on every path that leaves rc == 0

  /* Get the key columns and primary key value */
  const rocksdb::Slice &rkey = m_scan_it->key();
  const rocksdb::Slice &value = m_scan_it->value();

  /* Can we satisfy the read from the index entry alone? */
  bool covered_lookup =
      m_keyread_only || kd.covers_lookup(table, &value, &m_lookup_bitmap);
  if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
    pk_size =
        kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
    if (pk_size == RDB_INVALID_KEY_LEN) {
      rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
    } else {
      rc = kd.unpack_record(table, buf, &rkey, &value,
                            m_verify_row_debug_checksums);
      global_stats.covered_secondary_key_lookups.inc();
    }
  } else {
    /* In a reverse column family the scan direction is inverted. */
    if (kd.m_is_reverse_cf)
      move_forward = !move_forward;

    /* Advance to the first index record satisfying the pushed ICP, if any. */
    rc = find_icp_matching_index_rec(move_forward, buf);
    if (!rc) {
      const rocksdb::Slice &rkey = m_scan_it->key();
      pk_size = kd.get_primary_key_tuple(table, *m_pk_descr, &rkey,
                                         m_pk_packed_tuple);
      if (pk_size == RDB_INVALID_KEY_LEN) {
        rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
      } else {
        /* Not covered: fetch the full row through the primary key. */
        rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
      }
    }
  }

  if (!rc) {
    /* Remember the rowkey of the record we are returning. */
    m_last_rowkey.copy((const char *)m_pk_packed_tuple, pk_size,
                       &my_charset_bin);
  }

  return rc;
}
7759 | |
7760 | /** |
7761 | @note |
7762 | The problem with this function is that SQL layer calls it, when |
7763 | - the table has not been yet opened (no ::open() call done) |
7764 | - this->table_share already exists, but it is in the process of being |
7765 | filled, so some of fields are still NULL. |
7766 | - In particular, table_share->key_info[inx].key_part[] is filled only up |
7767 | to part #part. Subsequent key parts are not yet filled. |
7768 | |
7769 | To complicate things further, SQL layer will call index_flags() with |
7770 | all_parts=TRUE. Essentially, we're asked to provide flags for reading |
7771 | keyparts whose datatype is not yet known. |
7772 | |
  We work around this problem by using check_keyread_allowed(), which uses
  the table_share object and is careful not to step on uninitialized data.
7775 | |
7776 | When we get a call with all_parts=TRUE, we try to analyze all parts but |
7777 | ignore those that have key_part->field==nullptr (these are not initialized |
7778 | yet). |
7779 | */ |
7780 | |
7781 | ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const { |
7782 | DBUG_ENTER_FUNC(); |
7783 | |
7784 | ulong base_flags = HA_READ_NEXT | // doesn't seem to be used |
7785 | HA_READ_ORDER | HA_READ_RANGE | HA_READ_PREV; |
7786 | |
7787 | if (check_keyread_allowed(inx, part, all_parts)) |
7788 | base_flags |= HA_KEYREAD_ONLY; |
7789 | |
7790 | if (inx == table_share->primary_key) { |
7791 | /* |
7792 | Index-only reads on primary key are the same as table scan for us. Still, |
7793 | we need to explicitly "allow" them, otherwise SQL layer will miss some |
7794 | plans. |
7795 | */ |
7796 | base_flags |= HA_KEYREAD_ONLY | HA_CLUSTERED_INDEX; |
7797 | } else { |
7798 | /* |
7799 | We can Index Condition Pushdown any key except the primary. With primary |
7800 | key, we get (pk, record) pair immediately, there is no place to put the |
7801 | ICP check. |
7802 | */ |
7803 | base_flags |= HA_DO_INDEX_COND_PUSHDOWN; |
7804 | } |
7805 | |
7806 | DBUG_RETURN(base_flags); |
7807 | } |
7808 | |
7809 | /** |
7810 | @brief |
7811 | Read next index tuple through the secondary index. |
7812 | |
7813 | @details |
7814 | m_scan_it points at the index key-value pair that we should read the (pk,row) |
7815 | pair for. |
7816 | */ |
int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
  DBUG_ASSERT(buf != nullptr);
  DBUG_ASSERT(table != nullptr);
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  /* Use STATUS_NOT_FOUND when record not found or some error occurred */
  table->status = STATUS_NOT_FOUND;

  if (is_valid(m_scan_it)) {
    rocksdb::Slice key = m_scan_it->key();

    /* Check if we've run out of records of this index */
    if (m_key_descr_arr[keyno]->covers_key(key)) {
      int rc = 0;

      // TODO: We could check here if we have run out of the range we scan
      /* Extract the primary key from the secondary-index entry. */
      const uint size = m_key_descr_arr[keyno]->get_primary_key_tuple(
          table, *m_pk_descr, &key, m_pk_packed_tuple);
      if (size == RDB_INVALID_KEY_LEN) {
        return HA_ERR_ROCKSDB_CORRUPT_DATA;
      }

      /* Remember the rowkey of the record we are about to return. */
      m_last_rowkey.copy((const char *)m_pk_packed_tuple, size,
                         &my_charset_bin);

      rocksdb::Slice value = m_scan_it->value();
      /*
        If the index covers everything the lookup needs (or keyread-only is
        set) and no row lock is required, unpack directly from the index
        entry; otherwise fetch the full row through the primary key.
      */
      bool covered_lookup =
          m_keyread_only || m_key_descr_arr[keyno]->covers_lookup(
                                table, &value, &m_lookup_bitmap);
      if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
        rc = m_key_descr_arr[keyno]->unpack_record(
            table, buf, &key, &value, m_verify_row_debug_checksums);
        global_stats.covered_secondary_key_lookups.inc();
      } else {
        rc = get_row_by_rowid(buf, m_pk_packed_tuple, size);
      }

      if (!rc) {
        table->status = 0;
#ifdef MARIAROCKS_NOT_YET
        stats.rows_read++;
        stats.rows_index_next++;
#endif
        update_row_stats(ROWS_READ);
      }
      return rc;
    }
  }
  return HA_ERR_END_OF_FILE;
}
7868 | |
7869 | bool ha_rocksdb::is_using_full_key(key_part_map keypart_map, |
7870 | uint actual_key_parts) |
7871 | { |
7872 | return (keypart_map == HA_WHOLE_KEY) || |
7873 | (keypart_map == ((key_part_map(1) << actual_key_parts) |
7874 | - 1)); |
7875 | } |
7876 | |
7877 | /* |
7878 | ha_rocksdb::read_range_first overrides handler::read_range_first. |
7879 | The only difference from handler::read_range_first is that |
7880 | ha_rocksdb::read_range_first passes end_key to |
7881 | ha_rocksdb::index_read_map_impl function. |
7882 | |
7883 | @return |
7884 | HA_EXIT_SUCCESS OK |
7885 | other HA_ERR error code (can be SE-specific) |
7886 | */ |
int ha_rocksdb::read_range_first(const key_range *const start_key,
                                 const key_range *const end_key,
                                 bool eq_range_arg, bool sorted) {
  DBUG_ENTER_FUNC();

  int result;

  eq_range = eq_range_arg;

  /* MariaDB: Pass RANGE_SCAN_ASC when we support ICP on backwards scans */
  set_end_range(end_key);

  range_key_part = table->key_info[active_index].key_part;

  if (!start_key) // Read first record
    result = ha_index_first(table->record[0]);
  else {
#ifdef MARIAROCKS_NOT_YET
    if (is_using_prohibited_gap_locks(
            is_using_full_unique_key(active_index, start_key->keypart_map,
                                     start_key->flag))) {
      DBUG_RETURN(HA_ERR_LOCK_DEADLOCK);
    }
#endif
    increment_statistics(&SSV::ha_read_key_count);

    /* Unlike handler::read_range_first(), pass end_key down to the impl
       so it can compute the equal-condition length for bloom filters. */
    MYSQL_TABLE_IO_WAIT(m_psi, PSI_TABLE_FETCH_ROW, active_index, 0, {
      result =
          index_read_map_impl(table->record[0], start_key->key,
                              start_key->keypart_map, start_key->flag, end_key);
    })
  }
  if (result)
    DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND) ? HA_ERR_END_OF_FILE : result);

  /* compare_key() <= 0 means the row is still within the requested range. */
  if (compare_key(end_range) <= 0) {
    DBUG_RETURN(HA_EXIT_SUCCESS);
  } else {
    /*
      The last read row does not fall in the range. So request
      storage engine to release row lock if possible.
    */
    unlock_row();
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }
}
7933 | |
7934 | int ha_rocksdb::prepare_index_scan() |
7935 | { |
7936 | range_key_part= table->key_info[active_index].key_part; |
7937 | set_end_range(NULL); |
7938 | return 0; |
7939 | } |
7940 | |
7941 | |
7942 | int ha_rocksdb::prepare_range_scan(const key_range *start_key, |
7943 | const key_range *end_key) |
7944 | { |
7945 | range_key_part= table->key_info[active_index].key_part; |
7946 | |
7947 | if (start_key) |
7948 | { |
7949 | m_save_start_range= *start_key; |
7950 | m_start_range= &m_save_start_range; |
7951 | } |
7952 | else |
7953 | m_start_range= NULL; |
7954 | |
7955 | set_end_range(end_key); |
7956 | return 0; |
7957 | } |
7958 | |
7959 | |
7960 | /** |
7961 | @return |
7962 | HA_EXIT_SUCCESS OK |
7963 | other HA_ERR error code (can be SE-specific) |
7964 | */ |
7965 | |
7966 | int ha_rocksdb::index_read_map(uchar *const buf, const uchar *const key, |
7967 | key_part_map keypart_map, |
7968 | enum ha_rkey_function find_flag) { |
7969 | DBUG_ENTER_FUNC(); |
7970 | |
7971 | DBUG_RETURN(index_read_map_impl(buf, key, keypart_map, find_flag, nullptr)); |
7972 | } |
7973 | |
7974 | /* |
7975 | See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL |
7976 | index navigation commands are converted into RocksDB lookup commands. |
7977 | |
7978 | This function takes end_key as an argument, and it is set on range scan. |
7979 | MyRocks needs to decide whether prefix bloom filter can be used or not. |
7980 | To decide to use prefix bloom filter or not, calculating equal condition |
7981 | length |
7982 | is needed. On equal lookups (find_flag == HA_READ_KEY_EXACT), equal |
7983 | condition length is the same as rocksdb::Slice.size() of the start key. |
7984 | On range scan, equal condition length is MIN(start_key, end_key) of the |
7985 | rocksdb::Slice expression. |
7986 | |
7987 | @return |
7988 | HA_EXIT_SUCCESS OK |
7989 | other HA_ERR error code (can be SE-specific) |
7990 | */ |
int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key,
                                    key_part_map keypart_map,
                                    enum ha_rkey_function find_flag,
                                    const key_range *end_key) {
  DBUG_ENTER_FUNC();

  int rc = 0;

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  const uint actual_key_parts = kd.get_key_parts();
  bool using_full_key = is_using_full_key(keypart_map, actual_key_parts);

  /* If the caller passed no end key, fall back to the handler's end_range. */
  if (!end_key)
    end_key = end_range;

  /* By default, we don't need the retrieved records to match the prefix */
  m_sk_match_prefix = nullptr;
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  if (active_index == table->s->primary_key && find_flag == HA_READ_KEY_EXACT &&
      using_full_key) {
    /*
      Equality lookup over primary key, using full tuple.
      This is a special case, use DB::Get.
    */
    const uint size = kd.pack_index_tuple(table, m_pack_buffer,
                                          m_pk_packed_tuple, key, keypart_map);
    bool skip_lookup = is_blind_delete_enabled();

    rc = get_row_by_rowid(buf, m_pk_packed_tuple, size,
                          skip_lookup, false);

    if (!rc && !skip_lookup) {
#ifdef MARIAROCKS_NOT_YET
      stats.rows_read++;
      stats.rows_index_first++;
#endif
      update_row_stats(ROWS_READ);
    }
    DBUG_RETURN(rc);
  }

  /*
    Unique secondary index performs lookups without the extended key fields
  */
  uint packed_size;
  if (active_index != table->s->primary_key &&
      table->key_info[active_index].flags & HA_NOSAME &&
      find_flag == HA_READ_KEY_EXACT && using_full_key) {
    /* Pack only the user-defined key parts, not the extended PK tail. */
    key_part_map tmp_map = (key_part_map(1) << table->key_info[active_index]
                                                   .user_defined_key_parts) -
                           1;
    packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                      key, tmp_map);
    if (table->key_info[active_index].user_defined_key_parts !=
        kd.get_key_parts())
      using_full_key = false;
  } else {
    packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                      key, keypart_map);
  }

  if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
      (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST)) {
    /*
      We are doing a point index lookup, and ICP is enabled. It is possible
      that this call will be followed by ha_rocksdb->index_next_same() call.

      Do what InnoDB does: save the lookup tuple now. We will need it in
      index_next_same/find_icp_matching_index_rec in order to stop scanning
      as soon as index record doesn't match the lookup tuple.

      When not using ICP, handler::index_next_same() will make sure that rows
      that don't match the lookup prefix are not returned.
    */
    m_sk_match_prefix = m_sk_match_prefix_buf;
    m_sk_match_length = packed_size;
    memcpy(m_sk_match_prefix, m_sk_packed_tuple, packed_size);
  }

  int bytes_changed_by_succ = 0;
  if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
      find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
    /* The successor() call is compensated for in calc_eq_cond_len() below. */
    bytes_changed_by_succ = kd.successor(m_sk_packed_tuple, packed_size);
  }

  rocksdb::Slice slice(reinterpret_cast<const char *>(m_sk_packed_tuple),
                       packed_size);

  uint end_key_packed_size = 0;
  /* For HA_READ_PREFIX_LAST_OR_PREV (a backward read), the scan ends at the
     saved start range rather than at end_key. */
  const key_range *cur_end_key= end_key;
  if (find_flag == HA_READ_PREFIX_LAST_OR_PREV)
    cur_end_key= m_start_range;

  const uint eq_cond_len =
      calc_eq_cond_len(kd, find_flag, slice, bytes_changed_by_succ, cur_end_key,
                       &end_key_packed_size);

  /* An exact lookup using every key part can use the whole-key bloom filter. */
  bool use_all_keys = false;
  if (find_flag == HA_READ_KEY_EXACT &&
      my_count_bits(keypart_map) == kd.get_key_parts())
    use_all_keys = true;

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    /*
      This will open the iterator and position it at a record that's equal or
      greater than the lookup tuple.
    */
    setup_scan_iterator(kd, &slice, use_all_keys, eq_cond_len);

    /*
      Once positioned by the seek above, move to the position we really
      want: See storage/rocksdb/rocksdb-range-access.txt
    */
    bool move_forward;
    rc =
        position_to_correct_key(kd, find_flag, using_full_key, key, keypart_map,
                                slice, &move_forward, tx->m_snapshot_timestamp);

    if (rc) {
      /* This status is returned on any error */
      table->status = STATUS_NOT_FOUND;
      DBUG_RETURN(rc);
    }

    m_skip_scan_it_next_call = false;

    /*
      Now get the data for the row into 'buf'. If we were using a primary key
      then we have all the rows we need. For a secondary key we now need to
      lookup the primary key.
    */
    if (active_index == table->s->primary_key)
      rc = read_row_from_primary_key(buf);
    else
      rc = read_row_from_secondary_key(buf, kd, move_forward);

    if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
      break; /* Exit the loop */

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (rc) {
    /* the only possible error condition is record-not-found */
    table->status = STATUS_NOT_FOUND;
  } else {
    table->status = 0;
#ifdef MARIAROCKS_NOT_YET
    stats.rows_read++;
    stats.rows_index_first++;
#endif
    update_row_stats(ROWS_READ);
  }

  DBUG_RETURN(rc);
}
8157 | |
8158 | /* |
8159 | @brief |
8160 | Scan the secondary index until we find an index record that satisfies ICP |
8161 | |
8162 | @param move_forward TRUE <=> move m_scan_it forward |
8163 | FALSE <=> move m_scan_it backward |
8164 | @param buf Record buffer (must be the same buffer that |
8165 | pushed index condition points to, in practice |
8166 | it is table->record[0]) |
8167 | |
8168 | @detail |
8169 | Move the current iterator m_scan_it until we get an index tuple that |
8170 | satisfies the pushed Index Condition. |
8171 | (if there is no pushed index condition, return right away) |
8172 | |
8173 | @return |
8174 | 0 - Index tuple satisfies ICP, can do index read. |
8175 | other - error code |
8176 | */ |
8177 | |
int ha_rocksdb::find_icp_matching_index_rec(const bool &move_forward,
                                            uchar *const buf) {
  DBUG_ASSERT(buf != nullptr);

  /* Without a pushed index condition there is nothing to do. */
  if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) {
    const Rdb_key_def &kd = *m_key_descr_arr[active_index];

    while (1) {
      /* Step over TTL-expired records, if any. */
      rocksdb_skip_expired_records(kd, m_scan_it, !move_forward);

      if (!is_valid(m_scan_it)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      const rocksdb::Slice rkey = m_scan_it->key();

      /* Stop when we have walked out of this index entirely. */
      if (!kd.covers_key(rkey)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }

      /*
        If a point-lookup prefix was saved (see index_read_map_impl), stop
        as soon as the index record no longer matches it.
      */
      if (m_sk_match_prefix) {
        const rocksdb::Slice prefix((const char *)m_sk_match_prefix,
                                    m_sk_match_length);
        if (!kd.value_matches_prefix(rkey, prefix)) {
          table->status = STATUS_NOT_FOUND;
          return HA_ERR_END_OF_FILE;
        }
      }

      /* Unpack the index columns into 'buf' so the ICP check can see them. */
      const rocksdb::Slice value = m_scan_it->value();
      int err = kd.unpack_record(table, buf, &rkey, &value,
                                 m_verify_row_debug_checksums);
      if (err != HA_EXIT_SUCCESS) {
        return err;
      }

      const enum icp_result icp_status= handler_index_cond_check(this);
      if (icp_status == ICP_NO_MATCH) {
        rocksdb_smart_next(!move_forward, m_scan_it);
        continue; /* Get the next (or prev) index tuple */
      }
      else if (icp_status == ICP_OUT_OF_RANGE || icp_status == ICP_ABORTED_BY_USER) {
        /* We have walked out of range we are scanning */
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      else /* icp_status == ICP_MATCH */
      {
        /* Index Condition is satisfied. We have rc==0, proceed to fetch the
         * row. */
        break;
      }
      /*
        TODO: should we have this here, or RockDB handles this internally?
        if (my_core::thd_killed(current_thd))
        {
          rc= HA_ERR_INTERNAL_ERROR; // doesn't matter
          break;
        }
      */
    }
  }
  return HA_EXIT_SUCCESS;
}
8243 | |
8244 | /** |
8245 | @return |
8246 | HA_EXIT_SUCCESS OK |
8247 | other HA_ERR error code (can be SE-specific) |
8248 | */ |
8249 | |
8250 | /** |
8251 | @return |
8252 | HA_ADMIN_OK OK |
8253 | other HA_ADMIN error code |
8254 | */ |
int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(check_opt != nullptr);

  const uint pk = pk_index(table, m_tbl_def);
  String rowkey_copy;
  String sec_key_copy;
  const char *const table_name = table->s->table_name.str;

  // Only when debugging: don't use snapshot when reading
  // Rdb_transaction *tx= get_or_create_tx(table->in_use);
  // tx->snapshot= nullptr;

  // Force row checksum verification for the duration of the check; the
  // previous setting is restored on both the success and the error path.
  const bool save_verify_row_debug_checksums = m_verify_row_debug_checksums;
  m_verify_row_debug_checksums = true;

  /* For each secondary index, check that we can get a PK value from it */
  // NO_LINT_DEBUG
  sql_print_verbose_info("CHECKTABLE %s: Checking table %s", table_name,
                         table_name);
  // Checksum counter snapshot taken at the start of the first index scan;
  // used at the end to report how many PK records carried checksums.
  ha_rows UNINIT_VAR(row_checksums_at_start); // set/used iff first_index==true
  ha_rows row_checksums = ha_rows(-1);
  bool first_index = true;

  for (uint keyno = 0; keyno < table->s->keys; keyno++) {
    if (keyno != pk) {
      extra(HA_EXTRA_KEYREAD);
      ha_index_init(keyno, true);
      ha_rows rows = 0;
      ha_rows checksums = 0;
      if (first_index)
        row_checksums_at_start = m_row_checksums_checked;
      int res;
      // NO_LINT_DEBUG
      sql_print_verbose_info("CHECKTABLE %s: Checking index %s", table_name,
                             table->key_info[keyno].name);
      // Walk every entry of this secondary index.
      while (1) {
        if (!rows)
          res = index_first(table->record[0]);
        else
          res = index_next(table->record[0]);

        if (res == HA_ERR_END_OF_FILE)
          break;
        if (res) {
          // error
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: index scan error %d",
                          table_name, rows, res);
          goto error;
        }
        // Copy the index key and the PK ("rowid") it points to, because the
        // buffers they live in are reused by the PK lookup below.
        rocksdb::Slice key = m_scan_it->key();
        sec_key_copy.copy(key.data(), key.size(), &my_charset_bin);
        rowkey_copy.copy(m_last_rowkey.ptr(), m_last_rowkey.length(),
                         &my_charset_bin);

        if (m_key_descr_arr[keyno]->unpack_info_has_checksum(
                m_scan_it->value())) {
          checksums++;
        }

        // Every secondary index entry must point to an existing PK record.
        if ((res = get_row_by_rowid(table->record[0], rowkey_copy.ptr(),
                                    rowkey_copy.length()))) {
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: "
                          "failed to fetch row by rowid",
                          table_name, rows);
          goto error;
        }

        longlong hidden_pk_id = 0;
        if (has_hidden_pk(table) &&
            read_hidden_pk_id_from_rowkey(&hidden_pk_id))
          goto error;

        /* Check if we get the same PK value */
        // Re-pack the PK from the fetched row; it must byte-match the rowid
        // the index entry pointed to.
        uint packed_size = m_pk_descr->pack_record(
            table, m_pack_buffer, table->record[0], m_pk_packed_tuple, nullptr,
            false, hidden_pk_id);
        if (packed_size != rowkey_copy.length() ||
            memcmp(m_pk_packed_tuple, rowkey_copy.ptr(), packed_size)) {
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: PK value mismatch",
                          table_name, rows);
          goto print_and_error;
        }

        /* Check if we get the same secondary key value */
        packed_size = m_key_descr_arr[keyno]->pack_record(
            table, m_pack_buffer, table->record[0], m_sk_packed_tuple,
            &m_sk_tails, false, hidden_pk_id);
        if (packed_size != sec_key_copy.length() ||
            memcmp(m_sk_packed_tuple, sec_key_copy.ptr(), packed_size)) {
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: "
                          "secondary index value mismatch",
                          table_name, rows);
          goto print_and_error;
        }
        rows++;
        continue;

      // Dump the offending rowkey / record / index entry in hex before
      // bailing out with a corruption error.
      print_and_error : {
        std::string buf;
        buf = rdb_hexdump(rowkey_copy.ptr(), rowkey_copy.length(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s: rowkey: %s", table_name, buf.c_str());

        buf = rdb_hexdump(m_retrieved_record.data(), m_retrieved_record.size(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s: record: %s", table_name, buf.c_str());

        buf = rdb_hexdump(sec_key_copy.ptr(), sec_key_copy.length(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s: index: %s", table_name, buf.c_str());

        goto error;
      }
      }
      // NO_LINT_DEBUG
      sql_print_verbose_info("CHECKTABLE %s: ... %lld index entries checked "
                             "(%lld had checksums)",
                             table_name, rows, checksums);

      if (first_index) {
        row_checksums = m_row_checksums_checked - row_checksums_at_start;
        first_index = false;
      }
      ha_index_end();
    }
  }
  if (row_checksums != ha_rows(-1)) {
    // NO_LINT_DEBUG
    sql_print_verbose_info("CHECKTABLE %s: %lld table records had checksums",
                           table_name, row_checksums);
  }
  extra(HA_EXTRA_NO_KEYREAD);

  m_verify_row_debug_checksums = save_verify_row_debug_checksums;
  /*
    TODO: we should check also for PK records that are missing in the secondary
    indexes.
    For that, need to walk through the PK and check that every PK record has a
    proper counterpart in each secondary index.
  */
  DBUG_RETURN(HA_ADMIN_OK);
error:
  // Restore handler state before reporting corruption.
  m_verify_row_debug_checksums = save_verify_row_debug_checksums;
  ha_index_or_rnd_end();
  extra(HA_EXTRA_NO_KEYREAD);

  DBUG_RETURN(HA_ADMIN_CORRUPT);
}
8413 | |
8414 | static void dbug_dump_str(FILE *const out, const char *const str, int len) { |
8415 | fprintf(out, "\"" ); |
8416 | for (int i = 0; i < len; i++) { |
8417 | if (str[i] > 32) |
8418 | fprintf(out, "%c" , str[i]); |
8419 | else |
8420 | fprintf(out, "\\%d" , str[i]); |
8421 | } |
8422 | fprintf(out, "\"" ); |
8423 | } |
8424 | |
8425 | /* |
8426 | Debugging help: dump the whole database into a human-readable file. |
8427 | Usage: |
8428 | dbug_dump_database(rdb); |
8429 | */ |
8430 | |
8431 | void dbug_dump_database(rocksdb::DB *const db) { |
8432 | FILE *const out = fopen("/tmp/rocksdb.dump" , "wt" ); |
8433 | if (!out) |
8434 | return; |
8435 | |
8436 | rocksdb::Iterator *it = db->NewIterator(rocksdb::ReadOptions()); |
8437 | for (it->SeekToFirst(); it->Valid(); it->Next()) { |
8438 | rocksdb::Slice key = it->key(); |
8439 | rocksdb::Slice val = it->value(); |
8440 | dbug_dump_str(out, key.data(), key.size()); |
8441 | fprintf(out, " -> " ); |
8442 | dbug_dump_str(out, val.data(), val.size()); |
8443 | fprintf(out, "\n" ); |
8444 | } |
8445 | |
8446 | delete it; |
8447 | fclose(out); |
8448 | } |
8449 | |
8450 | rocksdb::Status ha_rocksdb::get_for_update( |
8451 | Rdb_transaction *const tx, rocksdb::ColumnFamilyHandle *const column_family, |
8452 | const rocksdb::Slice &key, rocksdb::PinnableSlice *const value) const { |
8453 | DBUG_ASSERT(m_lock_rows != RDB_LOCK_NONE); |
8454 | const bool exclusive = m_lock_rows != RDB_LOCK_READ; |
8455 | |
8456 | rocksdb::Status s = tx->get_for_update(column_family, key, value, exclusive); |
8457 | |
8458 | // If we have a lock conflict and we are running in READ COMMITTTED mode |
8459 | // release and reacquire the snapshot and then retry the get_for_update(). |
8460 | if (s.IsBusy() && !s.IsDeadlock() && |
8461 | my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED) { |
8462 | tx->release_snapshot(); |
8463 | tx->acquire_snapshot(false); |
8464 | |
8465 | s = tx->get_for_update(column_family, key, value, exclusive); |
8466 | } |
8467 | |
8468 | return s; |
8469 | } |
8470 | |
8471 | bool ha_rocksdb::is_blind_delete_enabled() |
8472 | { |
8473 | THD *thd = ha_thd(); |
8474 | return (THDVAR(thd, blind_delete_primary_key) && |
8475 | thd->lex->sql_command == SQLCOM_DELETE && |
8476 | thd->lex->table_count == 1 && |
8477 | table->s->keys == 1 && |
8478 | !has_hidden_pk(table) && |
8479 | !thd->rgi_slave); |
8480 | } |
8481 | |
8482 | /* |
8483 | Given a rowid (i.e. packed PK) as a parameter, get the record. |
8484 | |
8485 | @return |
8486 | HA_EXIT_SUCCESS OK |
8487 | other HA_ERR error code (can be SE-specific) |
8488 | */ |
8489 | |
int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
                                 const uint rowid_size, const bool skip_lookup,
                                 const bool skip_ttl_check) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);
  DBUG_ASSERT(rowid != nullptr);
  DBUG_ASSERT(table != nullptr);

  int rc;

  rocksdb::Slice key_slice(rowid, rowid_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  DEBUG_SYNC(ha_thd(), "rocksdb.get_row_by_rowid");
  DBUG_EXECUTE_IF("dbug.rocksdb.get_row_by_rowid", {
    THD *thd = ha_thd();
    const char act[] = "now signal Reached "
                       "wait_for signal.rocksdb.get_row_by_rowid_let_running";
    DBUG_ASSERT(opt_debug_sync_timeout > 0);
    DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
  };);

  bool found;
  rocksdb::Status s;

  /* Pretend row found without looking up */
  // Used by "blind deletes": the caller knows the row can be deleted
  // without reading it first, so only record the key and success stats.
  if (skip_lookup)
  {
#ifdef MARIAROCKS_NOT_YET
    stats.rows_deleted_blind++;
#endif
    update_row_stats(ROWS_DELETED_BLIND);
    m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
    table->status = 0;
    DBUG_RETURN(0);
  }

  // Non-locking reads go through the transaction snapshot; locking reads
  // take a row lock via get_for_update().
  if (m_lock_rows == RDB_LOCK_NONE) {
    tx->acquire_snapshot(true);
    s = tx->get(m_pk_descr->get_cf(), key_slice, &m_retrieved_record);
  } else {
    s = get_for_update(tx, m_pk_descr->get_cf(), key_slice,
                       &m_retrieved_record);
  }

  DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
                  dbug_change_status_to_corrupted(&s););

  // Any status other than OK / NotFound is a hard error.
  if (!s.IsNotFound() && !s.ok()) {
    DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                     m_table_handler));
  }
  found = !s.IsNotFound();

  table->status = STATUS_NOT_FOUND;
  if (found) {
    /* If we found the record, but it's expired, pretend we didn't find it. */
    if (!skip_ttl_check && m_pk_descr->has_ttl() &&
        should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
                            tx->m_snapshot_timestamp)) {
      DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
    }

    // Remember the key and unpack the stored record into the MySQL row
    // buffer.
    m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
    rc = convert_record_from_storage_format(&key_slice, buf);

    if (!rc) {
      table->status = 0;
    }
  } else {
    /*
      Note: we don't need to unlock the row. It is intentional that we keep
      locks on rows that don't exist.
    */
    rc = HA_ERR_KEY_NOT_FOUND;
  }

  DBUG_RETURN(rc);
}
8572 | |
8573 | /** |
8574 | @return |
8575 | HA_EXIT_SUCCESS OK |
8576 | other HA_ERR error code (can be SE-specific) |
8577 | */ |
8578 | int ha_rocksdb::index_next(uchar *const buf) { |
8579 | DBUG_ENTER_FUNC(); |
8580 | |
8581 | bool moves_forward = true; |
8582 | if (m_key_descr_arr[active_index]->m_is_reverse_cf) { |
8583 | moves_forward = false; |
8584 | } |
8585 | |
8586 | int rc = index_next_with_direction(buf, moves_forward); |
8587 | if (rc == HA_ERR_KEY_NOT_FOUND) |
8588 | rc = HA_ERR_END_OF_FILE; |
8589 | |
8590 | DBUG_RETURN(rc); |
8591 | } |
8592 | |
8593 | /** |
8594 | @return |
8595 | HA_EXIT_SUCCESS OK |
8596 | other HA_ERR error code (can be SE-specific) |
8597 | */ |
8598 | int ha_rocksdb::index_prev(uchar *const buf) { |
8599 | DBUG_ENTER_FUNC(); |
8600 | |
8601 | bool moves_forward = false; |
8602 | if (m_key_descr_arr[active_index]->m_is_reverse_cf) { |
8603 | moves_forward = true; |
8604 | } |
8605 | |
8606 | int rc = index_next_with_direction(buf, moves_forward); |
8607 | if (rc == HA_ERR_KEY_NOT_FOUND) |
8608 | rc = HA_ERR_END_OF_FILE; |
8609 | |
8610 | DBUG_RETURN(rc); |
8611 | } |
8612 | |
8613 | int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) { |
8614 | DBUG_ENTER_FUNC(); |
8615 | |
8616 | int rc; |
8617 | |
8618 | if (active_index == pk_index(table, m_tbl_def)) { |
8619 | rc = rnd_next_with_direction(buf, move_forward); |
8620 | } else { |
8621 | if (m_skip_scan_it_next_call) { |
8622 | m_skip_scan_it_next_call = false; |
8623 | } else { |
8624 | if (move_forward) |
8625 | m_scan_it->Next(); /* this call cannot fail */ |
8626 | else |
8627 | m_scan_it->Prev(); |
8628 | } |
8629 | rocksdb_skip_expired_records(*m_key_descr_arr[active_index], m_scan_it, |
8630 | !move_forward); |
8631 | rc = find_icp_matching_index_rec(move_forward, buf); |
8632 | if (!rc) |
8633 | rc = secondary_index_read(active_index, buf); |
8634 | } |
8635 | |
8636 | DBUG_RETURN(rc); |
8637 | } |
8638 | |
8639 | /** |
8640 | @return |
8641 | HA_EXIT_SUCCESS OK |
8642 | other HA_ERR error code (can be SE-specific) |
8643 | */ |
8644 | int ha_rocksdb::index_first(uchar *const buf) { |
8645 | DBUG_ENTER_FUNC(); |
8646 | |
8647 | m_sk_match_prefix = nullptr; |
8648 | int rc = m_key_descr_arr[active_index]->m_is_reverse_cf |
8649 | ? index_last_intern(buf) |
8650 | : index_first_intern(buf); |
8651 | if (rc == HA_ERR_KEY_NOT_FOUND) |
8652 | rc = HA_ERR_END_OF_FILE; |
8653 | |
8654 | DBUG_RETURN(rc); |
8655 | } |
8656 | |
8657 | /** |
8658 | @return |
8659 | HA_EXIT_SUCCESS OK |
8660 | other HA_ERR error code (can be SE-specific) |
8661 | */ |
8662 | int ha_rocksdb::index_last(uchar *const buf) { |
8663 | DBUG_ENTER_FUNC(); |
8664 | |
8665 | m_sk_match_prefix = nullptr; |
8666 | int rc = m_key_descr_arr[active_index]->m_is_reverse_cf |
8667 | ? index_first_intern(buf) |
8668 | : index_last_intern(buf); |
8669 | if (rc == HA_ERR_KEY_NOT_FOUND) |
8670 | rc = HA_ERR_END_OF_FILE; |
8671 | |
8672 | DBUG_RETURN(rc); |
8673 | } |
8674 | |
8675 | /* |
8676 | Start scanning from the "first" value. |
8677 | |
8678 | The 'first' here means "the first from start of the key space". |
8679 | For reverse-ordered key spaces, the first value will be the biggest, 'zzzz'. |
8680 | |
8681 | An picture of a forward-ordered keyspace (remember, the keys have form |
8682 | 'indexnr-keyval'. Suppose the index we are at has number n) |
8683 | |
8684 | (n-1) - ... |
8685 | ( n ) <--- 1. (n) doesn't exist in the db but it would be here. |
8686 | ( n ) - aaa <--- 2. Seek("n") will put us here on the first index |
8687 | ( n ) - bbb record. |
8688 | ( n ) - cc |
8689 | |
8690 | So, need to do: Seek(n); |
8691 | |
8692 | A backward-ordered keyspace: |
8693 | |
8694 | (n+1) - bbb |
8695 | (n+1) - aaa |
8696 | (n+1) <--- (n+1) doesn't exist in the db but would be here. |
8697 | ( n ) - ccc <--- 1. We need to be here. |
8698 | ( n ) - bbb |
8699 | ( n ) - aaa |
8700 | ( n ) |
8701 | |
8702 | So, need to: Seek(n+1); |
8703 | |
8704 | */ |
8705 | |
int ha_rocksdb::index_first_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);

  uchar *key;
  uint key_size;
  int rc;

  // Pick the packed-tuple buffer matching the active index type.
  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  // Build the seek target described in the comment above this function:
  // the smallest possible key carrying this index's number.
  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  int key_start_matching_bytes = kd.get_first_key(key, &key_size);

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    setup_scan_iterator(kd, &index_key, false, key_start_matching_bytes);
    m_scan_it->Seek(index_key);
    // Seek() already positioned us on the first candidate record, so the
    // index_next_with_direction() call below must not advance the iterator.
    m_skip_scan_it_next_call = true;

    rc = index_next_with_direction(buf, true);
    if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
      break; // exit the loop

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (!rc) {
    /*
      index_next is always incremented on success, so decrement if it is
      index_first instead
    */
#ifdef MARIAROCKS_NOT_YET
    stats.rows_index_first++;
    stats.rows_index_next--;
#endif
  }

  DBUG_RETURN(rc);
}
8761 | |
8762 | /** |
8763 | @details |
8764 | Start scanning from the "last" value |
8765 | |
8766 | The 'last' here means "the last from start of the key space". |
8767 | For reverse-ordered key spaces, we will actually read the smallest value. |
8768 | |
8769 | An picture of a forward-ordered keyspace (remember, the keys have form |
8770 | 'indexnr-keyval'. Suppose the we are at a key that has number n) |
8771 | |
8772 | (n-1)-something |
8773 | ( n )-aaa |
8774 | ( n )-bbb |
8775 | ( n )-ccc <----------- Need to seek to here. |
8776 | (n+1) <---- Doesn't exist, but would be here. |
8777 | (n+1)-smth, or no value at all |
8778 | |
8779 | RocksDB's Iterator::SeekForPrev($val) seeks to "at $val or last value that's |
8780 | smaller". We can't seek to "(n)-ccc" directly, because we don't know what |
8781 | is the value of 'ccc' (the biggest record with prefix (n)). Instead, we seek |
8782 | to "(n+1)", which is the least possible value that's greater than any value |
8783 | in index #n. |
8784 | |
8785 | So, need to: it->SeekForPrev(n+1) |
8786 | |
8787 | A backward-ordered keyspace: |
8788 | |
8789 | (n+1)-something |
8790 | ( n ) - ccc |
8791 | ( n ) - bbb |
8792 | ( n ) - aaa <---------------- (*) Need to seek here. |
8793 | ( n ) <--- Doesn't exist, but would be here. |
8794 | (n-1)-smth, or no value at all |
8795 | |
8796 | So, need to: it->SeekForPrev(n) |
8797 | */ |
8798 | |
int ha_rocksdb::index_last_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);

  uchar *key;
  uint key_size;
  int rc;

  // Pick the packed-tuple buffer matching the active index type.
  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  // Build the SeekForPrev() target described in the comment above this
  // function: for a reverse-ordered CF the index's infimum key, otherwise
  // its supremum key.
  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  if (kd.m_is_reverse_cf) {
    kd.get_infimum_key(key, &key_size);
  } else {
    kd.get_supremum_key(key, &key_size);
  }

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    setup_scan_iterator(kd, &index_key, false, Rdb_key_def::INDEX_NUMBER_SIZE);
    m_scan_it->SeekForPrev(index_key);
    m_skip_scan_it_next_call = false;

    if (is_pk(active_index, table, m_tbl_def)) {
      // SeekForPrev() already positioned the iterator on the last record,
      // so the table-scan read below must not advance it.
      m_skip_scan_it_next_call = true;
      rc = rnd_next_with_direction(buf, false);
    } else {
      rc = find_icp_matching_index_rec(false /*move_forward*/, buf);
      if (!rc)
        rc = secondary_index_read(active_index, buf);
    }

    if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
      break; /* exit the loop */

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (!rc) {
    /*
      index_next is always incremented on success, so decrement if it is
      index_first instead
    */
#ifdef MARIAROCKS_NOT_YET
    stats.rows_index_first++;
    stats.rows_index_next--;
#endif
  }

  DBUG_RETURN(rc);
}
8866 | |
8867 | void ha_rocksdb::unlock_row() { |
8868 | DBUG_ENTER_FUNC(); |
8869 | |
8870 | if (m_lock_rows != RDB_LOCK_NONE) { |
8871 | Rdb_transaction *const tx = get_or_create_tx(table->in_use); |
8872 | tx->release_lock(m_pk_descr->get_cf(), |
8873 | std::string(m_last_rowkey.ptr(), m_last_rowkey.length())); |
8874 | } |
8875 | |
8876 | DBUG_VOID_RETURN; |
8877 | } |
8878 | |
8879 | /* |
8880 | Returning true if SingleDelete can be used. |
8881 | - Secondary Indexes can always use SingleDelete. |
8882 | - If the index is PRIMARY KEY, and if all of the columns of the table |
8883 | are covered by the PRIMARY KEY, SingleDelete can be used. |
8884 | */ |
8885 | bool ha_rocksdb::can_use_single_delete(const uint &index) const { |
8886 | return (index != pk_index(table, m_tbl_def) || |
8887 | (!has_hidden_pk(table) && |
8888 | table->key_info[index].ext_key_parts == table->s->fields)); |
8889 | } |
8890 | |
8891 | bool ha_rocksdb::skip_unique_check() const { |
8892 | /* |
8893 | We want to skip unique checks if: |
8894 | 1) bulk_load is on |
8895 | 2) this table is in the whitelist of tables to skip and the replication |
8896 | lag has reached a large enough value (see unique_check_lag_threshold |
8897 | and unique_check_lage_reset_threshold) |
8898 | 3) the user set unique_checks option to 0, and the table does not have |
8899 | any indexes. If the table has secondary keys, then those might becomes |
8900 | inconsisted/corrupted |
8901 | */ |
8902 | return THDVAR(table->in_use, bulk_load) || |
8903 | (m_force_skip_unique_check && m_skip_unique_check) || |
8904 | (my_core::thd_test_options(table->in_use, |
8905 | OPTION_RELAXED_UNIQUE_CHECKS) && |
8906 | m_tbl_def->m_key_count == 1); |
8907 | } |
8908 | |
8909 | #ifdef MARIAROCKS_NOT_YET // MDEV-10975 |
// Remember whether unique-key checking should be forcibly skipped for
// subsequent writes through this handler (consulted by skip_unique_check()).
void ha_rocksdb::set_force_skip_unique_check(bool skip) {
  DBUG_ENTER_FUNC();

  m_force_skip_unique_check = skip;

  DBUG_VOID_RETURN;
}
8917 | #endif |
8918 | |
8919 | bool ha_rocksdb::commit_in_the_middle() { |
8920 | return THDVAR(table->in_use, bulk_load) || |
8921 | THDVAR(table->in_use, commit_in_the_middle); |
8922 | } |
8923 | |
8924 | /* |
8925 | Executing bulk commit if it should. |
8926 | @retval true if bulk commit failed |
8927 | @retval false if bulk commit was skipped or succeeded |
8928 | */ |
8929 | bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) { |
8930 | DBUG_ASSERT(tx != nullptr); |
8931 | return commit_in_the_middle() && |
8932 | tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) && |
8933 | tx->flush_batch(); |
8934 | } |
8935 | |
8936 | /* |
8937 | If table was created without primary key, SQL layer represents the primary |
8938 | key number as MAX_INDEXES. Hence, this function returns true if the table |
8939 | does not contain a primary key. (In which case we generate a hidden |
8940 | 'auto-incremented' pk.) |
8941 | */ |
8942 | bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const { |
8943 | DBUG_ASSERT(table != nullptr); |
8944 | return Rdb_key_def::table_has_hidden_pk(table); |
8945 | } |
8946 | |
8947 | /* |
8948 | Returns true if given index number is a hidden_pk. |
8949 | - This is used when a table is created with no primary key. |
8950 | */ |
8951 | bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg, |
8952 | const Rdb_tbl_def *const tbl_def_arg) { |
8953 | DBUG_ASSERT(table_arg != nullptr); |
8954 | DBUG_ASSERT(table_arg->s != nullptr); |
8955 | DBUG_ASSERT(tbl_def_arg != nullptr); |
8956 | |
8957 | return (table_arg->s->primary_key == MAX_INDEXES && |
8958 | index == tbl_def_arg->m_key_count - 1); |
8959 | } |
8960 | |
8961 | /* Returns index of primary key */ |
8962 | uint ha_rocksdb::pk_index(const TABLE *const table_arg, |
8963 | const Rdb_tbl_def *const tbl_def_arg) { |
8964 | DBUG_ASSERT(table_arg != nullptr); |
8965 | DBUG_ASSERT(table_arg->s != nullptr); |
8966 | DBUG_ASSERT(tbl_def_arg != nullptr); |
8967 | |
8968 | return table_arg->s->primary_key == MAX_INDEXES ? tbl_def_arg->m_key_count - 1 |
8969 | : table_arg->s->primary_key; |
8970 | } |
8971 | |
8972 | /* Returns true if given index number is a primary key */ |
8973 | bool ha_rocksdb::is_pk(const uint index, const TABLE *const table_arg, |
8974 | const Rdb_tbl_def *const tbl_def_arg) { |
8975 | DBUG_ASSERT(table_arg != nullptr); |
8976 | DBUG_ASSERT(table_arg->s != nullptr); |
8977 | DBUG_ASSERT(tbl_def_arg != nullptr); |
8978 | |
8979 | return index == table_arg->s->primary_key || |
8980 | is_hidden_pk(index, table_arg, tbl_def_arg); |
8981 | } |
8982 | |
8983 | uint ha_rocksdb::max_supported_key_part_length() const { |
8984 | DBUG_ENTER_FUNC(); |
8985 | DBUG_RETURN(rocksdb_large_prefix ? MAX_INDEX_COL_LEN_LARGE |
8986 | : MAX_INDEX_COL_LEN_SMALL); |
8987 | } |
8988 | |
8989 | const char *ha_rocksdb::get_key_name(const uint index, |
8990 | const TABLE *const table_arg, |
8991 | const Rdb_tbl_def *const tbl_def_arg) { |
8992 | DBUG_ASSERT(table_arg != nullptr); |
8993 | DBUG_ASSERT(tbl_def_arg != nullptr); |
8994 | |
8995 | if (is_hidden_pk(index, table_arg, tbl_def_arg)) { |
8996 | return HIDDEN_PK_NAME; |
8997 | } |
8998 | |
8999 | DBUG_ASSERT(table_arg->key_info != nullptr); |
9000 | DBUG_ASSERT(table_arg->key_info[index].name.str != nullptr); |
9001 | |
9002 | return table_arg->key_info[index].name.str; |
9003 | } |
9004 | |
9005 | const char *ha_rocksdb::(const uint index, |
9006 | const TABLE *const table_arg, |
9007 | const Rdb_tbl_def *const tbl_def_arg) { |
9008 | DBUG_ASSERT(table_arg != nullptr); |
9009 | DBUG_ASSERT(tbl_def_arg != nullptr); |
9010 | |
9011 | if (is_hidden_pk(index, table_arg, tbl_def_arg)) { |
9012 | return nullptr; |
9013 | } |
9014 | |
9015 | DBUG_ASSERT(table_arg->key_info != nullptr); |
9016 | |
9017 | return table_arg->key_info[index].comment.str; |
9018 | } |
9019 | |
9020 | const std::string ha_rocksdb::generate_cf_name(const uint index, |
9021 | const TABLE *const table_arg, |
9022 | const Rdb_tbl_def *const tbl_def_arg, |
9023 | bool *per_part_match_found) { |
9024 | DBUG_ASSERT(table_arg != nullptr); |
9025 | DBUG_ASSERT(tbl_def_arg != nullptr); |
9026 | DBUG_ASSERT(per_part_match_found != nullptr); |
9027 | |
9028 | // When creating CF-s the caller needs to know if there was a custom CF name |
9029 | // specified for a given paritition. |
9030 | *per_part_match_found = false; |
9031 | |
9032 | // Index comment is used to define the column family name specification(s). |
9033 | // If there was no comment, we get an emptry string, and it means "use the |
9034 | // default column family". |
9035 | const char *const = get_key_comment(index, table_arg, tbl_def_arg); |
9036 | |
9037 | // `get_key_comment` can return `nullptr`, that's why this. |
9038 | std::string = comment ? comment : "" ; |
9039 | |
9040 | std::string cf_name = Rdb_key_def::parse_comment_for_qualifier( |
9041 | key_comment, table_arg, tbl_def_arg, per_part_match_found, |
9042 | RDB_CF_NAME_QUALIFIER); |
9043 | |
9044 | if (IF_PARTITIONING(table_arg->part_info,nullptr) != nullptr && !*per_part_match_found) { |
9045 | // At this point we tried to search for a custom CF name for a partition, |
9046 | // but none was specified. Therefore default one will be used. |
9047 | return "" ; |
9048 | } |
9049 | |
9050 | // If we didn't find any partitioned/non-partitioned qualifiers, return the |
9051 | // comment itself. NOTE: this currently handles returning the cf name |
9052 | // specified in the index comment in the case of no partitions, which doesn't |
9053 | // use any qualifiers at the moment. (aka its a special case) |
9054 | if (cf_name.empty() && !key_comment.empty()) { |
9055 | return key_comment; |
9056 | } |
9057 | |
9058 | return cf_name; |
9059 | } |
9060 | |
9061 | const std::string ha_rocksdb::(const TABLE *const table_arg) { |
9062 | DBUG_ASSERT(table_arg != nullptr); |
9063 | DBUG_ASSERT(table_arg->s != nullptr); |
9064 | |
9065 | return table_arg->s->comment.str; |
9066 | } |
9067 | |
9068 | /** |
9069 | @return |
9070 | HA_EXIT_SUCCESS OK |
9071 | other HA_ERR error code (can be SE-specific) |
9072 | */ |
9073 | int ha_rocksdb::write_row(uchar *const buf) { |
9074 | DBUG_ENTER_FUNC(); |
9075 | |
9076 | DBUG_ASSERT(buf != nullptr); |
9077 | DBUG_ASSERT(buf == table->record[0]); |
9078 | DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE); |
9079 | |
9080 | /* |
9081 | Note: "buf == table->record[0]" is copied from innodb. I am not aware of |
9082 | any use cases where this condition is not true. |
9083 | */ |
9084 | if (table->next_number_field && buf == table->record[0]) { |
9085 | int err; |
9086 | if ((err = update_auto_increment())) { |
9087 | DBUG_RETURN(err); |
9088 | } |
9089 | } |
9090 | |
9091 | const int rv = update_write_row(nullptr, buf, skip_unique_check()); |
9092 | |
9093 | if (rv == 0) { |
9094 | #ifdef MARIAROCKS_NOT_YET |
9095 | stats.rows_inserted++; |
9096 | #endif |
9097 | update_row_stats(ROWS_INSERTED); |
9098 | } |
9099 | |
9100 | DBUG_RETURN(rv); |
9101 | } |
9102 | |
9103 | /** |
9104 | Constructing m_last_rowkey (MyRocks key expression) from |
9105 | before_update|delete image (MySQL row expression). |
9106 | m_last_rowkey is normally set during lookup phase, such as |
9107 | rnd_next_with_direction() and rnd_pos(). With Read Free Replication, |
9108 | these read functions are skipped and update_rows(), delete_rows() are |
9109 | called without setting m_last_rowkey. This function sets m_last_rowkey |
9110 | for Read Free Replication. |
9111 | */ |
void ha_rocksdb::set_last_rowkey(const uchar *const old_data) {
#ifdef MARIAROCKS_NOT_YET
  // With Read Free Replication the lookup phase is skipped, so rebuild
  // m_last_rowkey by packing the PK from the before-image of the row.
  if (old_data && use_read_free_rpl()) {
    const int old_pk_size = m_pk_descr->pack_record(
        table, m_pack_buffer, old_data, m_pk_packed_tuple, nullptr, false);
    m_last_rowkey.copy((const char *)m_pk_packed_tuple, old_pk_size,
                       &my_charset_bin);
  }
#endif
}
9122 | |
// Compute the packed primary key ("rowid") for the row being written, and
// publish it (plus the hidden-PK id, where applicable) in *row_info.
// Returns HA_EXIT_SUCCESS or an HA_ERR error code.
int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) {
  int size;

  /*
    Get new row key for any insert, and any update where the pk is not hidden.
    Row key for updates with hidden pk is handled below.
  */
  if (!has_hidden_pk(table)) {
    row_info->hidden_pk_id = 0;

    row_info->new_pk_unpack_info = &m_pk_unpack_info;

    // Pack the visible PK from the new row image into m_pk_packed_tuple.
    size =
        m_pk_descr->pack_record(table, m_pack_buffer, row_info->new_data,
                                m_pk_packed_tuple, row_info->new_pk_unpack_info,
                                false, 0, 0, nullptr, &row_info->ttl_pk_offset);
  } else if (row_info->old_data == nullptr) {
    // INSERT with a hidden PK: allocate the next hidden PK value.
    row_info->hidden_pk_id = update_hidden_pk_val();
    size =
        m_pk_descr->pack_hidden_pk(row_info->hidden_pk_id, m_pk_packed_tuple);
  } else {
    /*
      If hidden primary key, rowkey for new record will always be the same as
      before
    */
    size = row_info->old_pk_slice.size();
    memcpy(m_pk_packed_tuple, row_info->old_pk_slice.data(), size);
    int err = read_hidden_pk_id_from_rowkey(&row_info->hidden_pk_id);
    if (err) {
      return err;
    }
  }

  // Publish the packed key; the Slice points into m_pk_packed_tuple.
  row_info->new_pk_slice =
      rocksdb::Slice((const char *)m_pk_packed_tuple, size);

  return HA_EXIT_SUCCESS;
}
9161 | |
// Lock the target PK value and determine whether a duplicate PK record
// already exists (*found). *pk_changed reports whether an UPDATE moved the
// row to a different PK value. Returns HA_EXIT_SUCCESS or an HA_ERR code.
int ha_rocksdb::check_and_lock_unique_pk(const uint &key_id,
                                         const struct update_row_info &row_info,
                                         bool *const found,
                                         bool *const pk_changed) {
  DBUG_ASSERT(found != nullptr);
  DBUG_ASSERT(pk_changed != nullptr);

  *pk_changed = false;

  /*
    For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
    always require locking.
  */
  // A non-empty old_pk_slice means this is an UPDATE (an old row exists).
  if (row_info.old_pk_slice.size() > 0) {
    /*
      If the keys are the same, then no lock is needed
    */
    if (!Rdb_pk_comparator::bytewise_compare(row_info.new_pk_slice,
                                             row_info.old_pk_slice)) {
      *found = false;
      return HA_EXIT_SUCCESS;
    }

    *pk_changed = true;
  }

  /*
    Perform a read to determine if a duplicate entry exists. For primary
    keys, a point lookup will be sufficient.

    note: we intentionally don't set options.snapshot here. We want to read
    the latest committed data.
  */

  /*
    To prevent race conditions like below, it is necessary to
    take a lock for a target row. get_for_update() holds a gap lock if
    target key does not exist, so below conditions should never
    happen.

    1) T1 Get(empty) -> T2 Get(empty) -> T1 Put(insert) -> T1 commit
       -> T2 Put(overwrite) -> T2 commit
    2) T1 Get(empty) -> T1 Put(insert, not committed yet) -> T2 Get(empty)
       -> T2 Put(insert, blocked) -> T1 commit -> T2 commit(overwrite)
  */
  const rocksdb::Status s =
      get_for_update(row_info.tx, m_pk_descr->get_cf(), row_info.new_pk_slice,
                     &m_retrieved_record);
  // Any status other than OK / NotFound is a hard error.
  if (!s.ok() && !s.IsNotFound()) {
    return row_info.tx->set_status_error(
        table->in_use, s, *m_key_descr_arr[key_id], m_tbl_def, m_table_handler);
  }

  *found = !s.IsNotFound();
  return HA_EXIT_SUCCESS;
}
9218 | |
9219 | int ha_rocksdb::check_and_lock_sk(const uint &key_id, |
9220 | const struct update_row_info &row_info, |
9221 | bool *const found) { |
9222 | DBUG_ASSERT(found != nullptr); |
9223 | *found = false; |
9224 | |
9225 | /* |
9226 | Can skip checking this key if none of the key fields have changed. |
9227 | */ |
9228 | if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id)) { |
9229 | return HA_EXIT_SUCCESS; |
9230 | } |
9231 | |
9232 | KEY *key_info = nullptr; |
9233 | uint n_null_fields = 0; |
9234 | uint user_defined_key_parts = 1; |
9235 | |
9236 | key_info = &table->key_info[key_id]; |
9237 | user_defined_key_parts = key_info->user_defined_key_parts; |
9238 | /* |
9239 | If there are no uniqueness requirements, there's no need to obtain a |
9240 | lock for this key. |
9241 | */ |
9242 | if (!(key_info->flags & HA_NOSAME)) { |
9243 | return HA_EXIT_SUCCESS; |
9244 | } |
9245 | |
9246 | const Rdb_key_def &kd = *m_key_descr_arr[key_id]; |
9247 | |
9248 | /* |
9249 | Calculate the new key for obtaining the lock |
9250 | |
9251 | For unique secondary indexes, the key used for locking does not |
9252 | include the extended fields. |
9253 | */ |
9254 | int size = |
9255 | kd.pack_record(table, m_pack_buffer, row_info.new_data, m_sk_packed_tuple, |
9256 | nullptr, false, 0, user_defined_key_parts, &n_null_fields); |
9257 | if (n_null_fields > 0) { |
9258 | /* |
9259 | If any fields are marked as NULL this will never match another row as |
9260 | to NULL never matches anything else including another NULL. |
9261 | */ |
9262 | return HA_EXIT_SUCCESS; |
9263 | } |
9264 | |
9265 | const rocksdb::Slice new_slice = |
9266 | rocksdb::Slice((const char *)m_sk_packed_tuple, size); |
9267 | |
9268 | /* |
9269 | For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs |
9270 | always require locking. |
9271 | */ |
9272 | if (row_info.old_data != nullptr) { |
9273 | size = kd.pack_record(table, m_pack_buffer, row_info.old_data, |
9274 | m_sk_packed_tuple_old, nullptr, false, 0, |
9275 | user_defined_key_parts); |
9276 | const rocksdb::Slice old_slice = |
9277 | rocksdb::Slice((const char *)m_sk_packed_tuple_old, size); |
9278 | |
9279 | /* |
9280 | For updates, if the keys are the same, then no lock is needed |
9281 | |
9282 | Also check to see if the key has any fields set to NULL. If it does, then |
9283 | this key is unique since NULL is not equal to each other, so no lock is |
9284 | needed. |
9285 | */ |
9286 | if (!Rdb_pk_comparator::bytewise_compare(new_slice, old_slice)) { |
9287 | return HA_EXIT_SUCCESS; |
9288 | } |
9289 | } |
9290 | |
9291 | /* |
9292 | Perform a read to determine if a duplicate entry exists - since this is |
9293 | a secondary indexes a range scan is needed. |
9294 | |
9295 | note: we intentionally don't set options.snapshot here. We want to read |
9296 | the latest committed data. |
9297 | */ |
9298 | |
9299 | const bool all_parts_used = (user_defined_key_parts == kd.get_key_parts()); |
9300 | |
9301 | /* |
9302 | This iterator seems expensive since we need to allocate and free |
9303 | memory for each unique index. |
9304 | |
9305 | If this needs to be optimized, for keys without NULL fields, the |
9306 | extended primary key fields can be migrated to the value portion of the |
9307 | key. This enables using Get() instead of Seek() as in the primary key |
9308 | case. |
9309 | |
9310 | The bloom filter may need to be disabled for this lookup. |
9311 | */ |
9312 | uchar min_bound_buf[MAX_KEY_LENGTH]; |
9313 | uchar max_bound_buf[MAX_KEY_LENGTH]; |
9314 | rocksdb::Slice min_bound_slice; |
9315 | rocksdb::Slice max_bound_slice; |
9316 | const bool total_order_seek = !check_bloom_and_set_bounds( |
9317 | ha_thd(), kd, new_slice, all_parts_used, |
9318 | min_bound_buf, |
9319 | max_bound_buf, |
9320 | &min_bound_slice, |
9321 | &max_bound_slice); |
9322 | const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache); |
9323 | |
9324 | const rocksdb::Status s = |
9325 | get_for_update(row_info.tx, kd.get_cf(), new_slice, nullptr); |
9326 | if (!s.ok() && !s.IsNotFound()) { |
9327 | return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def, |
9328 | m_table_handler); |
9329 | } |
9330 | |
9331 | rocksdb::Iterator *const iter = row_info.tx->get_iterator( |
9332 | kd.get_cf(), total_order_seek, fill_cache, |
9333 | min_bound_slice, max_bound_slice, |
9334 | true /* read current data */, |
9335 | false /* acquire snapshot */); |
9336 | /* |
9337 | Need to scan the transaction to see if there is a duplicate key. |
9338 | Also need to scan RocksDB and verify the key has not been deleted |
9339 | in the transaction. |
9340 | */ |
9341 | iter->Seek(new_slice); |
9342 | *found = !read_key_exact(kd, iter, all_parts_used, new_slice, |
9343 | row_info.tx->m_snapshot_timestamp); |
9344 | delete iter; |
9345 | |
9346 | return HA_EXIT_SUCCESS; |
9347 | } |
9348 | |
9349 | int ha_rocksdb::check_uniqueness_and_lock( |
9350 | const struct update_row_info &row_info, bool *const pk_changed) { |
9351 | /* |
9352 | Go through each index and determine if the index has uniqueness |
9353 | requirements. If it does, then try to obtain a row lock on the new values. |
9354 | Once all locks have been obtained, then perform the changes needed to |
9355 | update/insert the row. |
9356 | */ |
9357 | for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) { |
9358 | bool found; |
9359 | int rc; |
9360 | |
9361 | if (is_pk(key_id, table, m_tbl_def)) { |
9362 | rc = check_and_lock_unique_pk(key_id, row_info, &found, pk_changed); |
9363 | } else { |
9364 | rc = check_and_lock_sk(key_id, row_info, &found); |
9365 | } |
9366 | |
9367 | if (rc != HA_EXIT_SUCCESS) { |
9368 | return rc; |
9369 | } |
9370 | |
9371 | /* |
9372 | If the pk key has ttl, we may need to pretend the row wasn't |
9373 | found if it is already expired. The pk record is read into |
9374 | m_retrieved_record by check_and_lock_unique_pk(). |
9375 | */ |
9376 | if (is_pk(key_id, table, m_tbl_def) && found && m_pk_descr->has_ttl() && |
9377 | should_hide_ttl_rec(*m_pk_descr, m_retrieved_record, |
9378 | (row_info.tx->m_snapshot_timestamp |
9379 | ? row_info.tx->m_snapshot_timestamp |
9380 | : static_cast<int64_t>(std::time(nullptr))))) { |
9381 | found = false; |
9382 | } |
9383 | |
9384 | if (found) { |
9385 | /* There is a row with this key already, so error out. */ |
9386 | errkey = key_id; |
9387 | m_dupp_errkey = errkey; |
9388 | return HA_ERR_FOUND_DUPP_KEY; |
9389 | } |
9390 | } |
9391 | |
9392 | return HA_EXIT_SUCCESS; |
9393 | } |
9394 | |
9395 | int ha_rocksdb::check_duplicate_sk(const TABLE *table_arg, |
9396 | const Rdb_key_def &index, |
9397 | const rocksdb::Slice *key, |
9398 | struct unique_sk_buf_info *sk_info) { |
9399 | uint n_null_fields = 0; |
9400 | const rocksdb::Comparator *index_comp = index.get_cf()->GetComparator(); |
9401 | |
9402 | /* Get proper SK buffer. */ |
9403 | uchar *sk_buf = sk_info->swap_and_get_sk_buf(); |
9404 | |
9405 | /* Get memcmp form of sk without extended pk tail */ |
9406 | uint sk_memcmp_size = |
9407 | index.get_memcmp_sk_parts(table_arg, *key, sk_buf, &n_null_fields); |
9408 | |
9409 | sk_info->sk_memcmp_key = |
9410 | rocksdb::Slice(reinterpret_cast<char *>(sk_buf), sk_memcmp_size); |
9411 | |
9412 | if (sk_info->sk_memcmp_key_old.size() > 0 && n_null_fields == 0 && |
9413 | index_comp->Compare(sk_info->sk_memcmp_key, sk_info->sk_memcmp_key_old) == |
9414 | 0) { |
9415 | return 1; |
9416 | } |
9417 | |
9418 | sk_info->sk_memcmp_key_old = sk_info->sk_memcmp_key; |
9419 | return 0; |
9420 | } |
9421 | |
/**
  Add one key/value pair to an ongoing bulk load for index @a kd.

  Lazily (re)creates m_sst_info and registers the bulk load with the
  transaction whenever there is no active SST writer or the previous one
  was already committed.

  @param tx     transaction tracking the bulk load
  @param kd     key definition the pair belongs to
  @param key    packed key
  @param value  packed value
  @param sort   true: input is unsorted, so route the pair through the
                transaction's Rdb_index_merge to be sorted first;
                false: append the pair directly to the SST file

  @return HA_EXIT_SUCCESS or an error code.
*/
int ha_rocksdb::bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd,
                              const rocksdb::Slice &key,
                              const rocksdb::Slice &value, bool sort) {
  DBUG_ENTER_FUNC();
  int res;
  rocksdb::ColumnFamilyHandle *cf = kd.get_cf();

  // In the case of unsorted inserts, m_sst_info allocated here is not
  // used to store the keys. It is still used to indicate when tables
  // are switched.
  if (m_sst_info == nullptr || m_sst_info->is_committed()) {
    m_sst_info.reset(new Rdb_sst_info(rdb, m_table_handler->m_table_name,
                                      kd.get_name(), cf, *rocksdb_db_options,
                                      THDVAR(ha_thd(), trace_sst_api)));
    res = tx->start_bulk_load(this, m_sst_info);
    if (res != HA_EXIT_SUCCESS) {
      DBUG_RETURN(res);
    }
  }
  DBUG_ASSERT(m_sst_info);

  if (sort) {
    Rdb_index_merge *key_merge;
    DBUG_ASSERT(cf != nullptr);

    res = tx->get_key_merge(kd.get_gl_index_id(), cf, &key_merge);
    if (res == HA_EXIT_SUCCESS) {
      res = key_merge->add(key, value);
    }
  } else {
    res = m_sst_info->put(key, value);
  }

  DBUG_RETURN(res);
}
9457 | |
9458 | int ha_rocksdb::finalize_bulk_load(bool print_client_error) { |
9459 | DBUG_ENTER_FUNC(); |
9460 | |
9461 | int res = HA_EXIT_SUCCESS; |
9462 | |
9463 | /* Skip if there are no possible ongoing bulk loads */ |
9464 | if (m_sst_info) { |
9465 | res = m_sst_info->commit(print_client_error); |
9466 | m_sst_info.reset(); |
9467 | } |
9468 | DBUG_RETURN(res); |
9469 | } |
9470 | |
/**
  Write the primary-key entry for an INSERT or UPDATE.

  First deletes the old PK entry when necessary (key changed, or the
  single-delete optimization requires removing it even for an unchanged
  key), then converts the row to storage format and writes it through one
  of three paths: bulk load via SstFileWriter, an unchecked write-batch
  Put (skip-unique-check / DDL), or a locking transactional Put.

  @param kd          the primary key definition (callers pass *m_pk_descr)
  @param row_info    row images, packed PK slices and the transaction
  @param pk_changed  true if an UPDATE modified the primary key

  @return HA_EXIT_SUCCESS, HA_ERR_FOUND_DUPP_KEY, or another error code.
*/
int ha_rocksdb::update_pk(const Rdb_key_def &kd,
                          const struct update_row_info &row_info,
                          const bool &pk_changed) {
  const uint key_id = kd.get_keyno();
  const bool hidden_pk = is_hidden_pk(key_id, table, m_tbl_def);
  ulonglong bytes_written = 0;

  /*
    If the PK has changed, or if this PK uses single deletes and this is an
    update, the old key needs to be deleted. In the single delete case, it
    might be possible to have this sequence of keys: PUT(X), PUT(X), SD(X),
    resulting in the first PUT(X) showing up.
  */
  if (!hidden_pk && (pk_changed || ((row_info.old_pk_slice.size() > 0) &&
                                    can_use_single_delete(key_id)))) {
    const rocksdb::Status s = delete_or_singledelete(
        key_id, row_info.tx, kd.get_cf(), row_info.old_pk_slice);
    if (!s.ok()) {
      return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
                                           m_table_handler);
    } else {
      bytes_written = row_info.old_pk_slice.size();
    }
  }

  /* Keep the table's auto-increment counter up to date. */
  if (table->next_number_field) {
    update_auto_incr_val_from_field();
  }

  int rc = HA_EXIT_SUCCESS;
  rocksdb::Slice value_slice;
  /* Prepare the new record to be written into RocksDB */
  if ((rc = convert_record_to_storage_format(row_info, &value_slice))) {
    return rc;
  }

  const auto cf = m_pk_descr->get_cf();
  if (rocksdb_enable_bulk_load_api && THDVAR(table->in_use, bulk_load) &&
      !hidden_pk) {
    /*
      Write the primary key directly to an SST file using an SstFileWriter
    */
    rc = bulk_load_key(row_info.tx, kd, row_info.new_pk_slice, value_slice,
                       THDVAR(table->in_use, bulk_load_allow_unsorted));
  } else if (row_info.skip_unique_check || row_info.tx->m_ddl_transaction) {
    /*
      It is responsibility of the user to make sure that the data being
      inserted doesn't violate any unique keys.
    */
    row_info.tx->get_indexed_write_batch()->Put(cf, row_info.new_pk_slice,
                                                value_slice);
  } else {
    const auto s = row_info.tx->put(cf, row_info.new_pk_slice, value_slice);
    if (!s.ok()) {
      /* IsBusy() means a duplicate PK value was hit during the Put. */
      if (s.IsBusy()) {
        errkey = table->s->primary_key;
        m_dupp_errkey = errkey;
        rc = HA_ERR_FOUND_DUPP_KEY;
      } else {
        rc = row_info.tx->set_status_error(table->in_use, s, *m_pk_descr,
                                           m_tbl_def, m_table_handler);
      }
    }
  }

  if (rc == HA_EXIT_SUCCESS) {
    row_info.tx->update_bytes_written(
        bytes_written + row_info.new_pk_slice.size() + value_slice.size());
  }
  return rc;
}
9542 | |
/**
  Update secondary index @a kd for the row described by @a row_info:
  SingleDelete() the old index entry (when the row existed and the entry
  actually changed) and Put() the new one.

  Secondary-index writes go through the indexed write batch — no locking —
  because uniqueness was already checked and locked by check_and_lock_sk().

  @param table_arg  table the index belongs to
  @param kd         secondary index definition
  @param row_info   old/new row images and the active transaction

  @return HA_EXIT_SUCCESS (this path reports no errors of its own).
*/
int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd,
                          const struct update_row_info &row_info) {
  int new_packed_size;
  int old_packed_size;

  rocksdb::Slice new_key_slice;
  rocksdb::Slice new_value_slice;
  rocksdb::Slice old_key_slice;

  const uint key_id = kd.get_keyno();

  ulonglong bytes_written = 0;

  /*
    Can skip updating this key if none of the key fields have changed and, if
    this table has TTL, the TTL timestamp has not changed.
  */
  if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id) &&
      (!kd.has_ttl() || !m_ttl_bytes_updated)) {
    return HA_EXIT_SUCCESS;
  }

  const bool store_row_debug_checksums = should_store_row_debug_checksums();

  /* Pack the new index entry; unpack info/checksums land in m_sk_tails. */
  new_packed_size =
      kd.pack_record(table_arg, m_pack_buffer, row_info.new_data,
                     m_sk_packed_tuple, &m_sk_tails, store_row_debug_checksums,
                     row_info.hidden_pk_id, 0, nullptr, nullptr, m_ttl_bytes);

  if (row_info.old_data != nullptr) {
    // The old value
    old_packed_size = kd.pack_record(
        table_arg, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old,
        &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0,
        nullptr, nullptr, m_ttl_bytes);

    /*
      Check if we are going to write the same value. This can happen when
      one does
        UPDATE tbl SET col='foo'
      and we are looking at the row that already has col='foo'.

      We also need to compare the unpack info. Suppose, the collation is
      case-insensitive, and unpack info contains information about whether
      the letters were uppercase and lowercase. Then, both 'foo' and 'FOO'
      will have the same key value, but different data in unpack_info.

      (note: anyone changing bytewise_compare should take this code into
      account)
    */
    if (old_packed_size == new_packed_size &&
        m_sk_tails_old.get_current_pos() == m_sk_tails.get_current_pos() &&
        !(kd.has_ttl() && m_ttl_bytes_updated) &&
        memcmp(m_sk_packed_tuple_old, m_sk_packed_tuple, old_packed_size) ==
            0 &&
        memcmp(m_sk_tails_old.ptr(), m_sk_tails.ptr(),
               m_sk_tails.get_current_pos()) == 0) {
      return HA_EXIT_SUCCESS;
    }

    /*
      Deleting entries from secondary index should skip locking, but
      be visible to the transaction.
      (also note that DDL statements do not delete rows, so this is not a DDL
      statement)
    */
    old_key_slice = rocksdb::Slice(
        reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size);

    row_info.tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
                                                         old_key_slice);

    bytes_written = old_key_slice.size();
  }

  new_key_slice = rocksdb::Slice(
      reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
  new_value_slice =
      rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
                     m_sk_tails.get_current_pos());

  row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice,
                                              new_value_slice);

  row_info.tx->update_bytes_written(bytes_written + new_key_slice.size() +
                                    new_value_slice.size());

  return HA_EXIT_SUCCESS;
}
9632 | |
9633 | int ha_rocksdb::update_indexes(const struct update_row_info &row_info, |
9634 | const bool &pk_changed) { |
9635 | int rc; |
9636 | |
9637 | // The PK must be updated first to pull out the TTL value. |
9638 | rc = update_pk(*m_pk_descr, row_info, pk_changed); |
9639 | if (rc != HA_EXIT_SUCCESS) { |
9640 | return rc; |
9641 | } |
9642 | |
9643 | // Update the remaining indexes. |
9644 | for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) { |
9645 | if (is_pk(key_id, table, m_tbl_def)) { |
9646 | continue; |
9647 | } |
9648 | |
9649 | rc = update_sk(table, *m_key_descr_arr[key_id], row_info); |
9650 | if (rc != HA_EXIT_SUCCESS) { |
9651 | return rc; |
9652 | } |
9653 | } |
9654 | |
9655 | return HA_EXIT_SUCCESS; |
9656 | } |
9657 | |
9658 | int ha_rocksdb::update_write_row(const uchar *const old_data, |
9659 | const uchar *const new_data, |
9660 | const bool skip_unique_check) { |
9661 | DBUG_ENTER_FUNC(); |
9662 | |
9663 | bool pk_changed = false; |
9664 | struct update_row_info row_info; |
9665 | |
9666 | row_info.old_data = old_data; |
9667 | row_info.new_data = new_data; |
9668 | row_info.skip_unique_check = skip_unique_check; |
9669 | row_info.new_pk_unpack_info = nullptr; |
9670 | |
9671 | set_last_rowkey(old_data); |
9672 | |
9673 | row_info.tx = get_or_create_tx(table->in_use); |
9674 | |
9675 | if (old_data != nullptr) { |
9676 | row_info.old_pk_slice = |
9677 | rocksdb::Slice(m_last_rowkey.ptr(), m_last_rowkey.length()); |
9678 | |
9679 | /* Determine which indexes need updating. */ |
9680 | calc_updated_indexes(); |
9681 | } |
9682 | |
9683 | /* |
9684 | Get the new row key into row_info.new_pk_slice |
9685 | */ |
9686 | int rc = get_pk_for_update(&row_info); |
9687 | if (rc != HA_EXIT_SUCCESS) { |
9688 | DBUG_RETURN(rc); |
9689 | } |
9690 | |
9691 | if (!skip_unique_check) { |
9692 | /* |
9693 | Check to see if we are going to have failures because of unique |
9694 | keys. Also lock the appropriate key values. |
9695 | */ |
9696 | rc = check_uniqueness_and_lock(row_info, &pk_changed); |
9697 | if (rc != HA_EXIT_SUCCESS) { |
9698 | DBUG_RETURN(rc); |
9699 | } |
9700 | } |
9701 | |
9702 | DEBUG_SYNC(ha_thd(), "rocksdb.update_write_row_after_unique_check" ); |
9703 | |
9704 | /* |
9705 | At this point, all locks have been obtained, and all checks for duplicate |
9706 | keys have been performed. No further errors can be allowed to occur from |
9707 | here because updates to the transaction will be made and those updates |
9708 | cannot be easily removed without rolling back the entire transaction. |
9709 | */ |
9710 | rc = update_indexes(row_info, pk_changed); |
9711 | if (rc != HA_EXIT_SUCCESS) { |
9712 | DBUG_RETURN(rc); |
9713 | } |
9714 | |
9715 | if (old_data != nullptr) { |
9716 | row_info.tx->incr_update_count(); |
9717 | } else { |
9718 | row_info.tx->incr_insert_count(); |
9719 | } |
9720 | |
9721 | if (do_bulk_commit(row_info.tx)) { |
9722 | DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD); |
9723 | } |
9724 | |
9725 | DBUG_RETURN(HA_EXIT_SUCCESS); |
9726 | } |
9727 | |
9728 | /* |
9729 | Setting iterator upper/lower bounds for Seek/SeekForPrev. |
9730 | This makes RocksDB to avoid scanning tombstones outside of |
9731 | the given key ranges, when prefix_same_as_start=true was not passed |
9732 | (when prefix bloom filter can not be used). |
9733 | Inversing upper/lower bound is necessary on reverse order CF. |
9734 | This covers HA_READ_PREFIX_LAST* case as well. For example, |
9735 | if given query eq condition was 12 bytes and condition was |
9736 | 0x0000b3eb003f65c5e78858b8, and if doing HA_READ_PREFIX_LAST, |
9737 | eq_cond_len was 11 (see calc_eq_cond_len() for details). |
9738 | If the index was reverse order, upper bound would be |
9739 | 0x0000b3eb003f65c5e78857, and lower bound would be |
9740 | 0x0000b3eb003f65c5e78859. These cover given eq condition range. |
9741 | |
  @param lower_bound_buf IN  Buffer backing the lower bound key
  @param upper_bound_buf IN  Buffer backing the upper bound key
  @param out_lower_bound OUT Lower bound slice (points into one of the buffers)
  @param out_upper_bound OUT Upper bound slice (points into one of the buffers)
9746 | */ |
9747 | void ha_rocksdb::setup_iterator_bounds(const Rdb_key_def &kd, |
9748 | const rocksdb::Slice &eq_cond, |
9749 | uchar *lower_bound_buf, |
9750 | uchar *upper_bound_buf, |
9751 | rocksdb::Slice *out_lower_bound, |
9752 | rocksdb::Slice *out_upper_bound) { |
9753 | uint eq_cond_len = eq_cond.size(); |
9754 | memcpy(upper_bound_buf, eq_cond.data(), eq_cond_len); |
9755 | kd.successor(upper_bound_buf, eq_cond_len); |
9756 | memcpy(lower_bound_buf, eq_cond.data(), eq_cond_len); |
9757 | kd.predecessor(lower_bound_buf, eq_cond_len); |
9758 | |
9759 | if (kd.m_is_reverse_cf) { |
9760 | *out_upper_bound = |
9761 | rocksdb::Slice((const char *)lower_bound_buf, eq_cond_len); |
9762 | *out_lower_bound = |
9763 | rocksdb::Slice((const char *)upper_bound_buf, eq_cond_len); |
9764 | } else { |
9765 | *out_upper_bound = |
9766 | rocksdb::Slice((const char *)upper_bound_buf, eq_cond_len); |
9767 | *out_lower_bound = |
9768 | rocksdb::Slice((const char *)lower_bound_buf, eq_cond_len); |
9769 | } |
9770 | } |
9771 | |
9772 | /* |
9773 | Open a cursor |
9774 | */ |
9775 | |
/**
  (Re)create m_scan_it, the iterator used for index/table scans, choosing
  whether the prefix bloom filter and iterator bounds can be used for the
  given equality-condition prefix.

  @param slice        full lookup key; its first eq_cond_len bytes form
                      the equality condition
  @param use_all_keys true when the whole key is an equality condition
  @param eq_cond_len  length of the equality-condition prefix
*/
void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd,
                                     rocksdb::Slice *const slice,
                                     const bool use_all_keys,
                                     const uint eq_cond_len) {
  DBUG_ASSERT(slice != nullptr);
  DBUG_ASSERT(slice->size() >= eq_cond_len);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);

  bool skip_bloom = true;

  const rocksdb::Slice eq_cond(slice->data(), eq_cond_len);
  /* Bounds are stored in members so they outlive the iterator. */
  if (check_bloom_and_set_bounds(ha_thd(), kd, eq_cond, use_all_keys,
                                 m_eq_cond_lower_bound,
                                 m_eq_cond_upper_bound,
                                 &m_eq_cond_lower_bound_slice,
                                 &m_eq_cond_upper_bound_slice)) {
    skip_bloom = false;
  }

  /*
    In some cases, setup_scan_iterator() is called multiple times from
    the same query but bloom filter can not always be used.
    Suppose the following query example. id2 is VARCHAR(30) and PRIMARY KEY
    (id1, id2).
    select count(*) from t2 WHERE id1=100 and id2 IN ('00000000000000000000',
    '100');
    In this case, setup_scan_iterator() is called twice, the first time is for
    (id1, id2)=(100, '00000000000000000000') and the second time is for (100,
    '100').
    If prefix bloom filter length is 24 bytes, prefix bloom filter can be used
    for the
    first condition but not for the second condition.
    If bloom filter condition is changed, currently it is necessary to destroy
    and
    re-create Iterator.
  */
  if (m_scan_it_skips_bloom != skip_bloom) {
    release_scan_iterator();
  }

  /*
    SQL layer can call rnd_init() multiple times in a row.
    In that case, re-use the iterator, but re-position it at the table start.
  */
  if (!m_scan_it) {
    const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
    if (commit_in_the_middle()) {
      /* With commit-in-the-middle, pin our own snapshot for the scan. */
      DBUG_ASSERT(m_scan_it_snapshot == nullptr);
      m_scan_it_snapshot = rdb->GetSnapshot();

      auto read_opts = rocksdb::ReadOptions();
      read_opts.total_order_seek = true;  // TODO: set based on WHERE conditions
      read_opts.snapshot = m_scan_it_snapshot;
      m_scan_it = rdb->NewIterator(read_opts, kd.get_cf());
    } else {
      m_scan_it = tx->get_iterator(kd.get_cf(), skip_bloom, fill_cache,
                                   m_eq_cond_lower_bound_slice,
                                   m_eq_cond_upper_bound_slice);
    }
    m_scan_it_skips_bloom = skip_bloom;
  }
}
9839 | |
9840 | void ha_rocksdb::release_scan_iterator() { |
9841 | delete m_scan_it; |
9842 | m_scan_it = nullptr; |
9843 | |
9844 | if (m_scan_it_snapshot) { |
9845 | rdb->ReleaseSnapshot(m_scan_it_snapshot); |
9846 | m_scan_it_snapshot = nullptr; |
9847 | } |
9848 | } |
9849 | |
9850 | void ha_rocksdb::setup_iterator_for_rnd_scan() { |
9851 | uint key_size; |
9852 | |
9853 | int key_start_matching_bytes = m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size); |
9854 | |
9855 | rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size); |
9856 | |
9857 | setup_scan_iterator(*m_pk_descr, &table_key, false, |
9858 | key_start_matching_bytes); |
9859 | m_scan_it->Seek(table_key); |
9860 | m_skip_scan_it_next_call = true; |
9861 | } |
9862 | |
9863 | /** |
9864 | @return |
9865 | HA_EXIT_SUCCESS OK |
9866 | other HA_ERR error code (can be SE-specific) |
9867 | */ |
9868 | int ha_rocksdb::rnd_init(bool scan) { |
9869 | DBUG_ENTER_FUNC(); |
9870 | |
9871 | Rdb_transaction *const tx = get_or_create_tx(table->in_use); |
9872 | setup_read_decoders(); |
9873 | |
9874 | if (scan) { |
9875 | m_rnd_scan_is_new_snapshot = !tx->has_snapshot(); |
9876 | setup_iterator_for_rnd_scan(); |
9877 | } else { |
9878 | /* We don't need any preparations for rnd_pos() calls. */ |
9879 | } |
9880 | |
9881 | // If m_lock_rows is on then we will be doing a get_for_update when accessing |
9882 | // the index, so don't acquire the snapshot right away. Otherwise acquire |
9883 | // the snapshot immediately. |
9884 | tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE); |
9885 | |
9886 | DBUG_RETURN(HA_EXIT_SUCCESS); |
9887 | } |
9888 | |
9889 | /** |
9890 | @return |
9891 | HA_EXIT_SUCCESS OK |
9892 | other HA_ERR error code (can be SE-specific) |
9893 | */ |
9894 | int ha_rocksdb::rnd_next(uchar *const buf) { |
9895 | DBUG_ENTER_FUNC(); |
9896 | |
9897 | int rc; |
9898 | for (;;) { |
9899 | rc = rnd_next_with_direction(buf, true); |
9900 | if (rc != HA_ERR_LOCK_DEADLOCK || !m_rnd_scan_is_new_snapshot) |
9901 | break; /* exit the loop */ |
9902 | |
9903 | // release the snapshot and iterator and then regenerate them |
9904 | Rdb_transaction *tx = get_or_create_tx(table->in_use); |
9905 | tx->release_snapshot(); |
9906 | release_scan_iterator(); |
9907 | setup_iterator_for_rnd_scan(); |
9908 | } |
9909 | |
9910 | m_rnd_scan_is_new_snapshot = false; |
9911 | |
9912 | if (rc == HA_ERR_KEY_NOT_FOUND) |
9913 | rc = HA_ERR_END_OF_FILE; |
9914 | |
9915 | DBUG_RETURN(rc); |
9916 | } |
9917 | |
9918 | /* |
9919 | See also secondary_index_read(). |
9920 | */ |
/**
  Advance the table-scan iterator one row forward or backward and read
  that row into @a buf.

  TTL-expired rows are skipped. When m_lock_rows is set, each row is
  locked via get_for_update() and the value from that call is used
  instead of the (possibly stale) iterator value.

  @param buf           out: record in table->record[] format
  @param move_forward  true for Next(), false for Prev()

  @return HA_EXIT_SUCCESS, HA_ERR_END_OF_FILE, or another error code.
*/
int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
  DBUG_ENTER_FUNC();

  int rc;

  table->status = STATUS_NOT_FOUND;
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  if (!m_scan_it || !is_valid(m_scan_it)) {
    /*
      We can get here when SQL layer has called

        h->index_init(PRIMARY);
        h->index_read_map(full index tuple, HA_READ_KEY_EXACT);

      In this case, we should return EOF.
    */
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }

  for (;;) {
    /* The iterator is already on the first row right after a Seek(). */
    if (m_skip_scan_it_next_call) {
      m_skip_scan_it_next_call = false;
    } else {
      if (move_forward)
        m_scan_it->Next(); /* this call cannot fail */
      else
        m_scan_it->Prev(); /* this call cannot fail */
    }

    if (!is_valid(m_scan_it)) {
      rc = HA_ERR_END_OF_FILE;
      break;
    }

    /* check if we're out of this table */
    const rocksdb::Slice key = m_scan_it->key();
    if (!m_pk_descr->covers_key(key)) {
      rc = HA_ERR_END_OF_FILE;
      break;
    }

    if (m_lock_rows != RDB_LOCK_NONE) {
      /*
        Lock the row we've just read.

        Now we call get_for_update which will 1) Take a lock and 2) Will fail
        if the row was deleted since the snapshot was taken.
      */
      Rdb_transaction *const tx = get_or_create_tx(table->in_use);
      DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete" );

      /* Skip rows whose TTL already expired. */
      if (m_pk_descr->has_ttl() &&
          should_hide_ttl_rec(*m_pk_descr, m_scan_it->value(),
                              tx->m_snapshot_timestamp)) {
        continue;
      }

      const rocksdb::Status s =
          get_for_update(tx, m_pk_descr->get_cf(), key, &m_retrieved_record);
      if (s.IsNotFound() &&
          my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED) {
        // This occurs if we accessed a row, tried to lock it, failed,
        // released and reacquired the snapshot (because of READ COMMITTED
        // mode) and the row was deleted by someone else in the meantime.
        // If so, we just want to move on to the next row.
        continue;
      }

      if (!s.ok()) {
        DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr,
                                         m_tbl_def, m_table_handler));
      }

      // If we called get_for_update() use the value from that call not from
      // the iterator as it may be stale since we don't have a snapshot
      // when m_lock_rows is not RDB_LOCK_NONE.
      m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
      rc = convert_record_from_storage_format(&key, buf);
    } else {
      // Use the value from the iterator
      rocksdb::Slice value = m_scan_it->value();

      /* Skip rows whose TTL already expired. */
      if (m_pk_descr->has_ttl() &&
          should_hide_ttl_rec(
              *m_pk_descr, value,
              get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
        continue;
      }

      m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
      rc = convert_record_from_storage_format(&key, &value, buf);
    }

    table->status = 0;
    break;
  }

  if (!rc) {
#ifdef MARIAROCKS_NOT_YET
    stats.rows_read++;
    stats.rows_index_next++;
#endif
    update_row_stats(ROWS_READ);
  }

  DBUG_RETURN(rc);
}
10030 | |
10031 | int ha_rocksdb::rnd_end() { |
10032 | DBUG_ENTER_FUNC(); |
10033 | |
10034 | release_scan_iterator(); |
10035 | |
10036 | DBUG_RETURN(HA_EXIT_SUCCESS); |
10037 | } |
10038 | |
10039 | /** |
10040 | @return |
10041 | HA_EXIT_SUCCESS OK |
10042 | other HA_ERR error code (can be SE-specific) |
10043 | */ |
10044 | int ha_rocksdb::index_init(uint idx, bool sorted) { |
10045 | DBUG_ENTER_FUNC(); |
10046 | |
10047 | Rdb_transaction *const tx = get_or_create_tx(table->in_use); |
10048 | DBUG_ASSERT(tx != nullptr); |
10049 | |
10050 | setup_read_decoders(); |
10051 | m_start_range= NULL; |
10052 | |
10053 | if (!m_keyread_only) { |
10054 | m_key_descr_arr[idx]->get_lookup_bitmap(table, &m_lookup_bitmap); |
10055 | } |
10056 | |
10057 | // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update |
10058 | // when accessing the index, so don't acquire the snapshot right away. |
10059 | // Otherwise acquire the snapshot immediately. |
10060 | tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE); |
10061 | |
10062 | active_index = idx; |
10063 | |
10064 | DBUG_RETURN(HA_EXIT_SUCCESS); |
10065 | } |
10066 | |
10067 | /** |
10068 | @return |
10069 | HA_EXIT_SUCCESS OK |
10070 | */ |
10071 | int ha_rocksdb::index_end() { |
10072 | DBUG_ENTER_FUNC(); |
10073 | |
10074 | release_scan_iterator(); |
10075 | |
10076 | bitmap_free(&m_lookup_bitmap); |
10077 | |
10078 | active_index = MAX_KEY; |
10079 | in_range_check_pushed_down = FALSE; |
10080 | m_start_range= NULL; |
10081 | |
10082 | DBUG_RETURN(HA_EXIT_SUCCESS); |
10083 | } |
10084 | |
10085 | /** |
10086 | @return |
10087 | HA_EXIT_SUCCESS OK |
10088 | other HA_ERR error code (can be SE-specific) |
10089 | */ |
10090 | int ha_rocksdb::truncate() { |
10091 | DBUG_ENTER_FUNC(); |
10092 | |
10093 | DBUG_ASSERT(m_tbl_def != nullptr); |
10094 | |
10095 | m_tbl_def->m_auto_incr_val = 1; |
10096 | m_tbl_def->m_hidden_pk_val = 1; |
10097 | |
10098 | DBUG_RETURN(remove_rows(m_tbl_def)); |
10099 | } |
10100 | |
10101 | /* |
10102 | Delete the row we've last read. The row is also passed as parameter. |
10103 | |
10104 | @detail |
10105 | The caller guarantees table buf points to the row that was just read. |
10106 | The row is either table->record[0] or table->record[1]. |
10107 | (Check out InnoDB: row_update_for_mysql() has "UT_NOT_USED(mysql_rec)" |
10108 | |
10109 | @return |
10110 | HA_EXIT_SUCCESS OK |
10111 | other HA_ERR error code (can be SE-specific) |
10112 | */ |
10113 | int ha_rocksdb::delete_row(const uchar *const buf) { |
10114 | DBUG_ENTER_FUNC(); |
10115 | |
10116 | DBUG_ASSERT(buf != nullptr); |
10117 | |
10118 | set_last_rowkey(buf); |
10119 | |
10120 | rocksdb::Slice key_slice(m_last_rowkey.ptr(), m_last_rowkey.length()); |
10121 | Rdb_transaction *const tx = get_or_create_tx(table->in_use); |
10122 | ulonglong bytes_written = 0; |
10123 | |
10124 | const uint index = pk_index(table, m_tbl_def); |
10125 | rocksdb::Status s = |
10126 | delete_or_singledelete(index, tx, m_pk_descr->get_cf(), key_slice); |
10127 | if (!s.ok()) { |
10128 | DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def, |
10129 | m_table_handler)); |
10130 | } else { |
10131 | bytes_written = key_slice.size(); |
10132 | } |
10133 | |
10134 | longlong hidden_pk_id = 0; |
10135 | if (m_tbl_def->m_key_count > 1 && has_hidden_pk(table)) { |
10136 | int err = read_hidden_pk_id_from_rowkey(&hidden_pk_id); |
10137 | if (err) { |
10138 | DBUG_RETURN(err); |
10139 | } |
10140 | } |
10141 | |
10142 | // Delete the record for every secondary index |
10143 | for (uint i = 0; i < m_tbl_def->m_key_count; i++) { |
10144 | if (!is_pk(i, table, m_tbl_def)) { |
10145 | int packed_size; |
10146 | const Rdb_key_def &kd = *m_key_descr_arr[i]; |
10147 | packed_size = kd.pack_record(table, m_pack_buffer, buf, m_sk_packed_tuple, |
10148 | nullptr, false, hidden_pk_id); |
10149 | rocksdb::Slice secondary_key_slice( |
10150 | reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size); |
10151 | /* Deleting on secondary key doesn't need any locks: */ |
10152 | tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(), |
10153 | secondary_key_slice); |
10154 | bytes_written += secondary_key_slice.size(); |
10155 | } |
10156 | } |
10157 | |
10158 | tx->incr_delete_count(); |
10159 | |
10160 | if (do_bulk_commit(tx)) { |
10161 | DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD); |
10162 | } |
10163 | #ifdef MARIAROCKS_NOT_YET |
10164 | stats.rows_deleted++; |
10165 | #endif |
10166 | update_row_stats(ROWS_DELETED); |
10167 | tx->update_bytes_written(bytes_written); |
10168 | |
10169 | DBUG_RETURN(HA_EXIT_SUCCESS); |
10170 | } |
10171 | |
10172 | rocksdb::Status ha_rocksdb::delete_or_singledelete( |
10173 | uint index, Rdb_transaction *const tx, |
10174 | rocksdb::ColumnFamilyHandle *const column_family, |
10175 | const rocksdb::Slice &key) { |
10176 | if (can_use_single_delete(index)) |
10177 | return tx->single_delete(column_family, key); |
10178 | return tx->delete_key(column_family, key); |
10179 | } |
10180 | |
10181 | void ha_rocksdb::update_stats(void) { |
10182 | DBUG_ENTER_FUNC(); |
10183 | |
10184 | stats.records = 0; |
10185 | stats.index_file_length = 0ul; |
10186 | stats.data_file_length = 0ul; |
10187 | stats.mean_rec_length = 0; |
10188 | |
10189 | for (uint i = 0; i < m_tbl_def->m_key_count; i++) { |
10190 | if (is_pk(i, table, m_tbl_def)) { |
10191 | stats.data_file_length = m_pk_descr->m_stats.m_actual_disk_size; |
10192 | stats.records = m_pk_descr->m_stats.m_rows; |
10193 | } else { |
10194 | stats.index_file_length += m_key_descr_arr[i]->m_stats.m_actual_disk_size; |
10195 | } |
10196 | } |
10197 | |
10198 | DBUG_VOID_RETURN; |
10199 | } |
10200 | |
10201 | /** |
10202 | @return |
10203 | HA_EXIT_SUCCESS OK |
10204 | HA_EXIT_FAILURE Error |
10205 | */ |
10206 | int ha_rocksdb::info(uint flag) { |
10207 | DBUG_ENTER_FUNC(); |
10208 | |
10209 | if (!table) |
10210 | DBUG_RETURN(HA_EXIT_FAILURE); |
10211 | |
10212 | if (flag & HA_STATUS_VARIABLE) { |
10213 | /* |
10214 | Test only to simulate corrupted stats |
10215 | */ |
10216 | DBUG_EXECUTE_IF("myrocks_simulate_negative_stats" , |
10217 | m_pk_descr->m_stats.m_actual_disk_size = |
10218 | -m_pk_descr->m_stats.m_actual_disk_size;); |
10219 | |
10220 | update_stats(); |
10221 | |
10222 | /* |
10223 | If any stats are negative due to bad cached stats, re-run analyze table |
10224 | and re-retrieve the stats. |
10225 | */ |
10226 | if (static_cast<longlong>(stats.data_file_length) < 0 || |
10227 | static_cast<longlong>(stats.index_file_length) < 0 || |
10228 | static_cast<longlong>(stats.records) < 0) { |
10229 | if (analyze(nullptr, nullptr)) { |
10230 | DBUG_RETURN(HA_EXIT_FAILURE); |
10231 | } |
10232 | |
10233 | update_stats(); |
10234 | } |
10235 | |
10236 | // if number of records is hardcoded, we do not want to force computation |
10237 | // of memtable cardinalities |
10238 | if (stats.records == 0 || |
10239 | (rocksdb_force_compute_memtable_stats && |
10240 | rocksdb_debug_optimizer_n_rows == 0)) |
10241 | { |
10242 | // First, compute SST files stats |
10243 | uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; |
10244 | auto r = get_range(pk_index(table, m_tbl_def), buf); |
10245 | uint64_t sz = 0; |
10246 | uint8_t include_flags = rocksdb::DB::INCLUDE_FILES; |
10247 | // recompute SST files stats only if records count is 0 |
10248 | if (stats.records == 0) { |
10249 | rdb->GetApproximateSizes(m_pk_descr->get_cf(), &r, 1, &sz, |
10250 | include_flags); |
10251 | stats.records+= sz/ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE; |
10252 | stats.data_file_length+= sz; |
10253 | } |
10254 | // Second, compute memtable stats. This call is expensive, so cache |
10255 | // values computed for some time. |
10256 | uint64_t cachetime = rocksdb_force_compute_memtable_stats_cachetime; |
10257 | uint64_t time = (cachetime == 0) ? 0 : my_interval_timer() / 1000; |
10258 | if (cachetime == 0 || |
10259 | time > m_table_handler->m_mtcache_last_update + cachetime) { |
10260 | uint64_t memtableCount; |
10261 | uint64_t memtableSize; |
10262 | |
10263 | // the stats below are calculated from skiplist wich is a probablistic |
10264 | // data structure, so the results vary between test runs |
10265 | // it also can return 0 for quite a large tables which means that |
10266 | // cardinality for memtable only indxes will be reported as 0 |
10267 | rdb->GetApproximateMemTableStats(m_pk_descr->get_cf(), r, |
10268 | &memtableCount, &memtableSize); |
10269 | |
10270 | // Atomically update all of these fields at the same time |
10271 | if (cachetime > 0) { |
10272 | if (m_table_handler->m_mtcache_lock.fetch_add( |
10273 | 1, std::memory_order_acquire) == 0) { |
10274 | m_table_handler->m_mtcache_count = memtableCount; |
10275 | m_table_handler->m_mtcache_size = memtableSize; |
10276 | m_table_handler->m_mtcache_last_update = time; |
10277 | } |
10278 | m_table_handler->m_mtcache_lock.fetch_sub(1, |
10279 | std::memory_order_release); |
10280 | } |
10281 | |
10282 | stats.records += memtableCount; |
10283 | stats.data_file_length += memtableSize; |
10284 | } else { |
10285 | // Cached data is still valid, so use it instead |
10286 | stats.records += m_table_handler->m_mtcache_count; |
10287 | stats.data_file_length += m_table_handler->m_mtcache_size; |
10288 | } |
10289 | |
10290 | if (rocksdb_debug_optimizer_n_rows > 0) |
10291 | stats.records = rocksdb_debug_optimizer_n_rows; |
10292 | } |
10293 | |
10294 | if (stats.records != 0) |
10295 | stats.mean_rec_length = stats.data_file_length / stats.records; |
10296 | } |
10297 | if (flag & HA_STATUS_CONST) { |
10298 | ref_length = m_pk_descr->max_storage_fmt_length(); |
10299 | |
10300 | for (uint i = 0; i < m_tbl_def->m_key_count; i++) { |
10301 | if (is_hidden_pk(i, table, m_tbl_def)) { |
10302 | continue; |
10303 | } |
10304 | KEY *const k = &table->key_info[i]; |
10305 | for (uint j = 0; j < k->ext_key_parts; j++) { |
10306 | const Rdb_index_stats &k_stats = m_key_descr_arr[i]->m_stats; |
10307 | uint x = k_stats.m_distinct_keys_per_prefix.size() > j && |
10308 | k_stats.m_distinct_keys_per_prefix[j] > 0 |
10309 | ? k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j] |
10310 | : 0; |
10311 | if (x > stats.records) |
10312 | x = stats.records; |
10313 | if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) || |
10314 | rocksdb_debug_optimizer_n_rows > 0) { |
10315 | // Fake cardinality implementation. For example, (idx1, idx2, idx3) |
10316 | // index |
10317 | /* |
10318 | Make MariaRocks behave the same way as MyRocks does: |
10319 | 1. SQL layer thinks that unique secondary indexes are not extended |
10320 | with PK columns (both in MySQL and MariaDB) |
10321 | 2. MariaDB also thinks that indexes with partially-covered columns |
10322 | are not extended with PK columns. Use the same number of |
10323 | keyparts that MyRocks would use. |
10324 | */ |
10325 | uint ext_key_parts2; |
10326 | if (k->flags & HA_NOSAME) |
10327 | ext_key_parts2= k->ext_key_parts; // This is #1 |
10328 | else |
10329 | ext_key_parts2= m_key_descr_arr[i]->get_key_parts(); // This is #2. |
10330 | |
10331 | // will have rec_per_key for (idx1)=4, (idx1,2)=2, and (idx1,2,3)=1. |
10332 | // rec_per_key for the whole index is 1, and multiplied by 2^n if |
10333 | // n suffix columns of the index are not used. |
10334 | x = 1 << (ext_key_parts2 - j - 1); |
10335 | } |
10336 | k->rec_per_key[j] = x; |
10337 | } |
10338 | } |
10339 | } |
10340 | |
10341 | if (flag & HA_STATUS_ERRKEY) { |
10342 | /* |
10343 | Currently we support only primary keys so we know which key had a |
10344 | uniqueness violation. |
10345 | */ |
10346 | errkey = m_dupp_errkey; |
10347 | dup_ref = m_pk_tuple; // TODO(?): this should store packed PK. |
10348 | } |
10349 | |
10350 | if (flag & HA_STATUS_AUTO) { |
10351 | stats.auto_increment_value = m_tbl_def->m_auto_incr_val; |
10352 | } |
10353 | |
10354 | DBUG_RETURN(HA_EXIT_SUCCESS); |
10355 | } |
10356 | |
/*
  Store the "position" of the given record into this->ref: the packed
  primary key, zero-padded to ref_length bytes.  rnd_pos() later uses this
  to re-read the row.
*/
void ha_rocksdb::position(const uchar *const record) {
  DBUG_ENTER_FUNC();

  longlong hidden_pk_id = 0;
  if (has_hidden_pk(table) && read_hidden_pk_id_from_rowkey(&hidden_pk_id))
    DBUG_ASSERT(false);  // should never reach here

  /*
    Get packed primary key value from the record.

    (Note: m_last_rowkey has the packed PK of last-read-row, which allows to
    handle most cases, but there is an exception to this: when slave applies
    RBR events, it fills the record and then calls position(); rnd_pos())

    Also note that we *can* rely on PK column values being available. This is
    because table_flags() includes HA_PRIMARY_KEY_REQUIRED_FOR_POSITION bit.
    When that is true, table->prepare_for_position() adds PK columns into the
    read set (this may potentially disable index-only access when PK column
    cannot be restored from its mem-comparable form in the secondary indexes).
  */
  const uint packed_size = m_pk_descr->pack_record(
      table, m_pack_buffer, record, ref, nullptr, false, hidden_pk_id);

  /*
    It could be that mem-comparable form of PK occupies less than ref_length
    bytes. Fill the remainder with zeros.
  */
  if (ref_length > packed_size)
    memset(ref + packed_size, 0, ref_length - packed_size);

  DBUG_VOID_RETURN;
}
10389 | |
10390 | /** |
10391 | @return |
10392 | HA_EXIT_SUCCESS OK |
10393 | other HA_ERR error code (can be SE-specific) |
10394 | */ |
10395 | int ha_rocksdb::rnd_pos(uchar *const buf, uchar *const pos) { |
10396 | DBUG_ENTER_FUNC(); |
10397 | |
10398 | int rc; |
10399 | size_t len; |
10400 | |
10401 | #ifdef MARIAROCKS_NOT_YET |
10402 | stats.rows_requested++; |
10403 | #endif |
10404 | len = m_pk_descr->key_length(table, |
10405 | rocksdb::Slice((const char *)pos, ref_length)); |
10406 | if (len == size_t(-1)) { |
10407 | DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA); /* Data corruption? */ |
10408 | } |
10409 | |
10410 | rc = get_row_by_rowid(buf, pos, len); |
10411 | |
10412 | if (!rc) { |
10413 | #ifdef MARIAROCKS_NOT_YET |
10414 | stats.rows_read++; |
10415 | #endif |
10416 | update_row_stats(ROWS_READ); |
10417 | } |
10418 | |
10419 | DBUG_RETURN(rc); |
10420 | } |
10421 | |
10422 | /* |
10423 | @brief |
10424 | Calculate (if needed) the bitmap of indexes that are modified by the |
10425 | current query |
10426 | |
10427 | @detail |
10428 | The calculation done by checking index definitions against the |
10429 | table->write_set |
10430 | column bitmap. |
10431 | */ |
10432 | |
10433 | void ha_rocksdb::calc_updated_indexes() { |
10434 | if (!m_update_scope_is_valid) { |
10435 | m_update_scope_is_valid = true; |
10436 | m_update_scope.clear_all(); |
10437 | |
10438 | for (uint keynr = 0; keynr < table->s->keys; keynr++) { |
10439 | const Rdb_key_def &kd = *m_key_descr_arr[keynr]; |
10440 | /* Walk over all key parts, including the "extended key" suffix */ |
10441 | const uint key_parts = kd.get_key_parts(); |
10442 | for (uint kp = 0; kp < key_parts; kp++) { |
10443 | if (has_hidden_pk(table) && kp + 1 == key_parts) |
10444 | break; |
10445 | |
10446 | Field *const field = kd.get_table_field_for_part_no(table, kp); |
10447 | if (bitmap_is_set(table->write_set, field->field_index)) { |
10448 | m_update_scope.set_bit(keynr); |
10449 | break; |
10450 | } |
10451 | } |
10452 | } |
10453 | } |
10454 | } |
10455 | |
10456 | /** |
10457 | @return |
10458 | HA_EXIT_SUCCESS OK |
10459 | other HA_ERR error code (can be SE-specific) |
10460 | */ |
10461 | int ha_rocksdb::update_row(const uchar *const old_data, |
10462 | const uchar *const new_data) { |
10463 | DBUG_ENTER_FUNC(); |
10464 | |
10465 | DBUG_ASSERT(old_data != nullptr); |
10466 | DBUG_ASSERT(new_data != nullptr); |
10467 | DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE); |
10468 | /* |
10469 | old_data points to record we're updating. It is the same as the record |
10470 | we've just read (for multi-table UPDATE, too, because SQL layer will make |
10471 | an rnd_pos() call to re-read the record before calling update_row()) |
10472 | */ |
10473 | DBUG_ASSERT(new_data == table->record[0]); |
10474 | |
10475 | const int rv = update_write_row(old_data, new_data, false); |
10476 | |
10477 | if (rv == 0) { |
10478 | #ifdef MARIAROCKS_NOT_YET |
10479 | stats.rows_updated++; |
10480 | #endif |
10481 | update_row_stats(ROWS_UPDATED); |
10482 | } |
10483 | |
10484 | DBUG_RETURN(rv); |
10485 | } |
10486 | |
10487 | /* |
10488 | MariaDB's temporary: MyRocks has this function in sql/handler.cc: |
10489 | */ |
10490 | |
10491 | bool can_hold_read_locks_on_select(THD *thd, thr_lock_type lock_type) |
10492 | { |
10493 | return (lock_type == TL_READ_WITH_SHARED_LOCKS |
10494 | || lock_type == TL_READ_NO_INSERT |
10495 | || (lock_type != TL_IGNORE |
10496 | && thd->lex->sql_command != SQLCOM_SELECT)); |
10497 | } |
10498 | |
10499 | |
/* The following function was copied from ha_blackhole::store_lock: */
/*
  Decide (a) MyRocks's internal row-locking mode (m_lock_rows) and
  (b) the table-level lock type to report back to the SQL layer.
*/
THR_LOCK_DATA **ha_rocksdb::store_lock(THD *const thd, THR_LOCK_DATA **to,
                                       enum thr_lock_type lock_type) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(to != nullptr);

  bool in_lock_tables = my_core::thd_in_lock_tables(thd);

  /* First, make a decision about MyRocks's internal locking */
  if (lock_type >= TL_WRITE_ALLOW_WRITE) {
    m_lock_rows = RDB_LOCK_WRITE;
  } else if (lock_type == TL_READ_WITH_SHARED_LOCKS) {
    m_lock_rows = RDB_LOCK_READ;
  } else {
    m_lock_rows = RDB_LOCK_NONE;
    if (THDVAR(thd, lock_scanned_rows)) {
      /*
        The following logic was copied directly from
        ha_innobase::store_lock_with_x_type() in
        storage/innobase/handler/ha_innodb.cc and causes MyRocks to leave
        locks in place on rows that are in a table that is not being updated.
      */
      const uint sql_command = my_core::thd_sql_command(thd);
      if ((lock_type == TL_READ && in_lock_tables) ||
          (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
          can_hold_read_locks_on_select(thd, lock_type)) {
        ulong tx_isolation = my_core::thd_tx_isolation(thd);
        if (sql_command != SQLCOM_CHECKSUM &&
            ((my_core::thd_test_options(thd, OPTION_BIN_LOG) &&
              tx_isolation > ISO_READ_COMMITTED) ||
             tx_isolation == ISO_SERIALIZABLE ||
             (lock_type != TL_READ && lock_type != TL_READ_NO_INSERT) ||
             (sql_command != SQLCOM_INSERT_SELECT &&
              sql_command != SQLCOM_REPLACE_SELECT &&
              sql_command != SQLCOM_UPDATE && sql_command != SQLCOM_DELETE &&
              sql_command != SQLCOM_CREATE_TABLE))) {
          m_lock_rows = RDB_LOCK_READ;
        }
      }
    }
  }

  /* Then, tell the SQL layer what kind of locking it should use: */
  if (lock_type != TL_IGNORE && m_db_lock.type == TL_UNLOCK) {
    /*
      Here is where we get into the guts of a row level lock.
      If TL_UNLOCK is set
      If we are not doing a LOCK TABLE or DISCARD/IMPORT
      TABLESPACE, then allow multiple writers
    */

    if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) &&
        !in_lock_tables && !my_core::thd_tablespace_op(thd)) {
      lock_type = TL_WRITE_ALLOW_WRITE;
    }

    /*
      In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
      MySQL would use the lock TL_READ_NO_INSERT on t2, and that
      would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
      to t2. Convert the lock to a normal read lock to allow
      concurrent inserts to t2.
    */

    if (lock_type == TL_READ_NO_INSERT && !in_lock_tables) {
      lock_type = TL_READ;
    }

    m_db_lock.type = lock_type;
  }

  *to++ = &m_db_lock;

  DBUG_RETURN(to);
}
10577 | |
/* Cache the checksum-related session variables in this handler object. */
void ha_rocksdb::read_thd_vars(THD *const thd) {
  m_store_row_debug_checksums = THDVAR(thd, store_row_debug_checksums);
  m_verify_row_debug_checksums = THDVAR(thd, verify_row_debug_checksums);
  m_checksums_pct = THDVAR(thd, checksums_pct);
}
10583 | |
10584 | |
10585 | /** |
10586 | @return |
10587 | HA_EXIT_SUCCESS OK |
10588 | other HA_ERR error code (cannot be SE-specific) |
10589 | */ |
10590 | int ha_rocksdb::external_lock(THD *const thd, int lock_type) { |
10591 | DBUG_ENTER_FUNC(); |
10592 | |
10593 | DBUG_ASSERT(thd != nullptr); |
10594 | |
10595 | int res = HA_EXIT_SUCCESS; |
10596 | int binlog_format = my_core::thd_binlog_format(thd); |
10597 | bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog); |
10598 | |
10599 | /* |
10600 | A note about (*) below: In fb/mysql, LOCK TABLE myrocks_table WRITE |
10601 | - produces an error when @@binlog_format='ROW'. |
10602 | - does not produce an error with @@binlog_format='STATEMENT' |
10603 | |
10604 | We cannot easily provide this behavior in MariaDB, because LOCK ROWS |
10605 | changes @@binlog_format for the duration of the statement. |
10606 | |
10607 | Instead we will make the check in ha_rocksdb::start_stmt and will produce |
10608 | an error in the first DML statement done under LOCK TABLE |
10609 | |
10610 | (**) - The same happens for other "DDL-type" commands, so allow them here |
10611 | as well. |
10612 | */ |
10613 | if (lock_type == F_WRLCK && !thd->rgi_slave && !unsafe_for_binlog && |
10614 | binlog_format != BINLOG_FORMAT_ROW && |
10615 | binlog_format != BINLOG_FORMAT_UNSPEC && |
10616 | thd->lex->sql_command != SQLCOM_LOCK_TABLES && // (*) |
10617 | thd->lex->sql_command != SQLCOM_ANALYZE && // (**) |
10618 | thd->lex->sql_command != SQLCOM_OPTIMIZE && // (**) |
10619 | thd->lex->sql_command != SQLCOM_FLUSH && // (**) |
10620 | my_core::thd_binlog_filter_ok(thd)) { |
10621 | my_error(ER_REQUIRE_ROW_BINLOG_FORMAT, MYF(0)); |
10622 | DBUG_RETURN(HA_ERR_UNSUPPORTED); |
10623 | } |
10624 | |
10625 | if (lock_type == F_UNLCK) { |
10626 | Rdb_transaction *const tx = get_tx_from_thd(thd); |
10627 | |
10628 | if (tx) { |
10629 | tx->io_perf_end_and_record(&m_io_perf); |
10630 | tx->m_n_mysql_tables_in_use--; |
10631 | if (tx->m_n_mysql_tables_in_use == 0 && |
10632 | !my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { |
10633 | /* |
10634 | Do like InnoDB: when we get here, it's time to commit a |
10635 | single-statement transaction. |
10636 | |
10637 | If the statement involved multiple tables, this code will be executed |
10638 | for each of them, but that's ok because non-first tx->commit() calls |
10639 | will be no-ops. |
10640 | */ |
10641 | if (tx->commit_or_rollback()) { |
10642 | res = HA_ERR_INTERNAL_ERROR; |
10643 | } |
10644 | } |
10645 | } |
10646 | } else { |
10647 | if (my_core::thd_tx_isolation(thd) < ISO_READ_COMMITTED || |
10648 | my_core::thd_tx_isolation(thd) > ISO_REPEATABLE_READ) { |
10649 | my_error(ER_ISOLATION_MODE_NOT_SUPPORTED, MYF(0), |
10650 | tx_isolation_names[my_core::thd_tx_isolation(thd)]); |
10651 | DBUG_RETURN(HA_ERR_UNSUPPORTED); |
10652 | } |
10653 | /* |
10654 | It's nice to do the following on start of every statement. The problem |
10655 | is, handler->start_stmt() is not called for INSERTs. |
10656 | So, we put this code here. |
10657 | */ |
10658 | Rdb_transaction *const tx = get_or_create_tx(thd); |
10659 | read_thd_vars(thd); |
10660 | |
10661 | m_update_scope_is_valid = false; |
10662 | |
10663 | if (skip_unique_check()) { |
10664 | if ((thd->lex->sql_command == SQLCOM_INSERT || |
10665 | thd->lex->sql_command == SQLCOM_LOAD || |
10666 | thd->lex->sql_command == SQLCOM_REPLACE) && |
10667 | (thd->lex->duplicates == DUP_REPLACE || |
10668 | thd->lex->duplicates == DUP_UPDATE)) { |
10669 | my_error(ER_ON_DUPLICATE_DISABLED, MYF(0), thd->query()); |
10670 | DBUG_RETURN(HA_ERR_UNSUPPORTED); |
10671 | } |
10672 | } |
10673 | |
10674 | if (lock_type == F_WRLCK) { |
10675 | if (tx->is_tx_read_only()) { |
10676 | my_error(ER_UPDATES_WITH_CONSISTENT_SNAPSHOT, MYF(0)); |
10677 | DBUG_RETURN(HA_ERR_UNSUPPORTED); |
10678 | } |
10679 | |
10680 | /* |
10681 | SQL layer signals us to take a write lock. It does so when starting DML |
10682 | statement. We should put locks on the rows we're reading. |
10683 | |
10684 | Note: sometimes, external_lock() can be called without a prior |
10685 | ::store_lock call. That's why we need to set lock_* members here, too. |
10686 | */ |
10687 | m_lock_rows = RDB_LOCK_WRITE; |
10688 | |
10689 | if (thd->lex->sql_command == SQLCOM_CREATE_INDEX || |
10690 | thd->lex->sql_command == SQLCOM_DROP_INDEX || |
10691 | thd->lex->sql_command == SQLCOM_ALTER_TABLE) { |
10692 | tx->m_ddl_transaction = true; |
10693 | } |
10694 | } |
10695 | tx->m_n_mysql_tables_in_use++; |
10696 | rocksdb_register_tx(rocksdb_hton, thd, tx); |
10697 | tx->io_perf_start(&m_io_perf); |
10698 | } |
10699 | |
10700 | DBUG_RETURN(res); |
10701 | } |
10702 | |
10703 | /** |
10704 | @note |
10705 | A quote from ha_innobase::start_stmt(): |
10706 | <quote> |
10707 | MySQL calls this function at the start of each SQL statement inside LOCK |
10708 | TABLES. Inside LOCK TABLES the ::external_lock method does not work to |
10709 | mark SQL statement borders. |
10710 | </quote> |
10711 | |
10712 | @return |
10713 | HA_EXIT_SUCCESS OK |
10714 | */ |
10715 | |
10716 | int ha_rocksdb::start_stmt(THD *const thd, thr_lock_type lock_type) { |
10717 | DBUG_ENTER_FUNC(); |
10718 | |
10719 | /* |
10720 | MariaDB: the following is a copy of the check in ha_rocksdb::external_lock: |
10721 | */ |
10722 | int binlog_format = my_core::thd_binlog_format(thd); |
10723 | bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog); |
10724 | if (lock_type >= TL_WRITE_ALLOW_WRITE && |
10725 | !thd->rgi_slave && !unsafe_for_binlog && |
10726 | binlog_format != BINLOG_FORMAT_ROW && |
10727 | binlog_format != BINLOG_FORMAT_UNSPEC && |
10728 | my_core::thd_binlog_filter_ok(thd)) { |
10729 | my_error(ER_REQUIRE_ROW_BINLOG_FORMAT, MYF(0)); |
10730 | DBUG_RETURN(HA_ERR_UNSUPPORTED); |
10731 | } |
10732 | |
10733 | DBUG_ASSERT(thd != nullptr); |
10734 | |
10735 | Rdb_transaction *const tx = get_or_create_tx(thd); |
10736 | read_thd_vars(thd); |
10737 | rocksdb_register_tx(ht, thd, tx); |
10738 | tx->io_perf_start(&m_io_perf); |
10739 | |
10740 | DBUG_RETURN(HA_EXIT_SUCCESS); |
10741 | } |
10742 | |
10743 | rocksdb::Range get_range(uint32_t i, |
10744 | uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2], |
10745 | int offset1, int offset2) { |
10746 | uchar *buf_begin = buf; |
10747 | uchar *buf_end = buf + Rdb_key_def::INDEX_NUMBER_SIZE; |
10748 | rdb_netbuf_store_index(buf_begin, i + offset1); |
10749 | rdb_netbuf_store_index(buf_end, i + offset2); |
10750 | |
10751 | return rocksdb::Range( |
10752 | rocksdb::Slice((const char *)buf_begin, Rdb_key_def::INDEX_NUMBER_SIZE), |
10753 | rocksdb::Slice((const char *)buf_end, Rdb_key_def::INDEX_NUMBER_SIZE)); |
10754 | } |
10755 | |
/* Convenience overload: build the range for a key definition's index number. */
static rocksdb::Range get_range(const Rdb_key_def &kd,
                                uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
                                int offset1, int offset2) {
  return get_range(kd.get_index_number(), buf, offset1, offset2);
}
10761 | |
10762 | rocksdb::Range get_range(const Rdb_key_def &kd, |
10763 | uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) { |
10764 | if (kd.m_is_reverse_cf) { |
10765 | return myrocks::get_range(kd, buf, 1, 0); |
10766 | } else { |
10767 | return myrocks::get_range(kd, buf, 0, 1); |
10768 | } |
10769 | } |
10770 | |
/* Build the range for this handler's i-th key definition. */
rocksdb::Range
ha_rocksdb::get_range(const int &i,
                      uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) const {
  return myrocks::get_range(*m_key_descr_arr[i], buf);
}
10776 | |
10777 | /* |
10778 | This function is called with total_order_seek=true, but |
10779 | upper/lower bound setting is not necessary. |
10780 | Boundary set is useful when there is no matching key, |
10781 | but in drop_index_thread's case, it means index is marked as removed, |
10782 | so no further seek will happen for the index id. |
10783 | */ |
10784 | static bool is_myrocks_index_empty( |
10785 | rocksdb::ColumnFamilyHandle *cfh, const bool is_reverse_cf, |
10786 | const rocksdb::ReadOptions &read_opts, |
10787 | const uint index_id) |
10788 | { |
10789 | bool index_removed = false; |
10790 | uchar key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {0}; |
10791 | rdb_netbuf_store_uint32(key_buf, index_id); |
10792 | const rocksdb::Slice key = |
10793 | rocksdb::Slice(reinterpret_cast<char *>(key_buf), sizeof(key_buf)); |
10794 | std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(read_opts, cfh)); |
10795 | rocksdb_smart_seek(is_reverse_cf, it.get(), key); |
10796 | if (!it->Valid()) { |
10797 | index_removed = true; |
10798 | } else { |
10799 | if (memcmp(it->key().data(), key_buf, |
10800 | Rdb_key_def::INDEX_NUMBER_SIZE)) { |
10801 | // Key does not have same prefix |
10802 | index_removed = true; |
10803 | } |
10804 | } |
10805 | return index_removed; |
10806 | } |
10807 | |
10808 | /* |
10809 | Drop index thread's main logic |
10810 | */ |
10811 | |
10812 | void Rdb_drop_index_thread::run() { |
10813 | RDB_MUTEX_LOCK_CHECK(m_signal_mutex); |
10814 | |
10815 | for (;;) { |
10816 | // The stop flag might be set by shutdown command |
10817 | // after drop_index_thread releases signal_mutex |
10818 | // (i.e. while executing expensive Seek()). To prevent drop_index_thread |
10819 | // from entering long cond_timedwait, checking if stop flag |
10820 | // is true or not is needed, with drop_index_interrupt_mutex held. |
10821 | if (m_stop) { |
10822 | break; |
10823 | } |
10824 | |
10825 | timespec ts; |
10826 | int sec= dict_manager.is_drop_index_empty() |
10827 | ? 24 * 60 * 60 // no filtering |
10828 | : 60; // filtering |
10829 | set_timespec(ts,sec); |
10830 | |
10831 | const auto ret MY_ATTRIBUTE((__unused__)) = |
10832 | mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts); |
10833 | if (m_stop) { |
10834 | break; |
10835 | } |
10836 | // make sure, no program error is returned |
10837 | DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT); |
10838 | RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex); |
10839 | |
10840 | std::unordered_set<GL_INDEX_ID> indices; |
10841 | dict_manager.get_ongoing_drop_indexes(&indices); |
10842 | if (!indices.empty()) { |
10843 | std::unordered_set<GL_INDEX_ID> finished; |
10844 | rocksdb::ReadOptions read_opts; |
10845 | read_opts.total_order_seek = true; // disable bloom filter |
10846 | |
10847 | for (const auto d : indices) { |
10848 | uint32 cf_flags = 0; |
10849 | if (!dict_manager.get_cf_flags(d.cf_id, &cf_flags)) { |
10850 | sql_print_error("RocksDB: Failed to get column family flags " |
10851 | "from cf id %u. MyRocks data dictionary may " |
10852 | "get corrupted." , |
10853 | d.cf_id); |
10854 | abort(); |
10855 | } |
10856 | rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(d.cf_id); |
10857 | DBUG_ASSERT(cfh); |
10858 | const bool is_reverse_cf = cf_flags & Rdb_key_def::REVERSE_CF_FLAG; |
10859 | |
10860 | if (is_myrocks_index_empty(cfh, is_reverse_cf, read_opts, d.index_id)) |
10861 | { |
10862 | finished.insert(d); |
10863 | continue; |
10864 | } |
10865 | uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; |
10866 | rocksdb::Range range = get_range(d.index_id, buf, is_reverse_cf ? 1 : 0, |
10867 | is_reverse_cf ? 0 : 1); |
10868 | rocksdb::Status status = DeleteFilesInRange(rdb->GetBaseDB(), cfh, |
10869 | &range.start, &range.limit); |
10870 | if (!status.ok()) { |
10871 | if (status.IsShutdownInProgress()) { |
10872 | break; |
10873 | } |
10874 | rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD); |
10875 | } |
10876 | status = rdb->CompactRange(getCompactRangeOptions(), cfh, &range.start, |
10877 | &range.limit); |
10878 | if (!status.ok()) { |
10879 | if (status.IsShutdownInProgress()) { |
10880 | break; |
10881 | } |
10882 | rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD); |
10883 | } |
10884 | if (is_myrocks_index_empty(cfh, is_reverse_cf, read_opts, d.index_id)) |
10885 | { |
10886 | finished.insert(d); |
10887 | } |
10888 | } |
10889 | |
10890 | if (!finished.empty()) { |
10891 | dict_manager.finish_drop_indexes(finished); |
10892 | } |
10893 | } |
10894 | RDB_MUTEX_LOCK_CHECK(m_signal_mutex); |
10895 | } |
10896 | |
10897 | RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex); |
10898 | } |
10899 | |
10900 | Rdb_tbl_def *ha_rocksdb::get_table_if_exists(const char *const tablename) { |
10901 | DBUG_ASSERT(tablename != nullptr); |
10902 | |
10903 | std::string str; |
10904 | if (rdb_normalize_tablename(tablename, &str) != HA_EXIT_SUCCESS) { |
10905 | // We were not passed table name? |
10906 | DBUG_ASSERT(0); |
10907 | return nullptr; |
10908 | } |
10909 | |
10910 | return ddl_manager.find(str); |
10911 | } |
10912 | |
10913 | /* |
10914 | Note: the following function is called when the table is not open. That is, |
10915 | this->table==nullptr, pk_key_descr==nullptr, etc. |
10916 | |
  tablename is a string of the form "./dbname/tablename".
10918 | |
10919 | @return |
10920 | HA_EXIT_SUCCESS OK |
10921 | other HA_ERR error code (can be SE-specific) |
10922 | */ |
10923 | |
10924 | int ha_rocksdb::delete_table(const char *const tablename) { |
10925 | DBUG_ENTER_FUNC(); |
10926 | |
10927 | DBUG_ASSERT(tablename != nullptr); |
10928 | |
10929 | const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin(); |
10930 | rocksdb::WriteBatch *const batch = wb.get(); |
10931 | |
10932 | /* Find the table in the hash */ |
10933 | Rdb_tbl_def *const tbl = get_table_if_exists(tablename); |
10934 | if (!tbl) |
10935 | DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); |
10936 | dict_manager.add_drop_table(tbl->m_key_descr_arr, tbl->m_key_count, batch); |
10937 | |
10938 | /* |
10939 | Remove the table entry in data dictionary (this will also remove it from |
10940 | the persistent data dictionary). |
10941 | */ |
10942 | ddl_manager.remove(tbl, batch, true); |
10943 | |
10944 | int err = dict_manager.commit(batch); |
10945 | if (err) { |
10946 | DBUG_RETURN(err); |
10947 | } |
10948 | |
10949 | rdb_drop_idx_thread.signal(); |
10950 | |
10951 | DBUG_RETURN(HA_EXIT_SUCCESS); |
10952 | } |
10953 | |
/*
  Delete every row of every index of the given table, one key at a time,
  by iterating each index's key range and issuing Delete/SingleDelete.

  @param tbl  table definition whose indexes are to be emptied
  @return HA_EXIT_SUCCESS, or the error produced by the transaction's
          status translation when a RocksDB write fails
*/
int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) {
  const rocksdb::WriteOptions wo =
      rdb_get_rocksdb_write_options(handler::ha_thd());

  rocksdb::ReadOptions opts;
  // We scan whole index ranges, so prefix bloom filters must be bypassed.
  opts.total_order_seek = true;
  Rdb_transaction *const tx = get_or_create_tx(table->in_use);

  char key_buf[MAX_KEY_LENGTH];
  uint key_len;
  ulonglong bytes_written = 0;
  /*
    Remove all records in each index.
    (This is not crash-safe, but it doesn't matter, because bulk row
    deletion will be handled on rocksdb side)
  */
  for (uint i = 0; i < tbl->m_key_count; i++) {
    const Rdb_key_def &kd = *tbl->m_key_descr_arr[i];
    // Build the smallest possible key for this index (its index-number
    // prefix) to seek to the start of the index.
    kd.get_infimum_key(reinterpret_cast<uchar *>(key_buf), &key_len);
    rocksdb::ColumnFamilyHandle *cf = kd.get_cf();
    const rocksdb::Slice table_key(key_buf, key_len);
    // Bound the iterator to this index's key range. The bound slices are
    // members so they remain valid for the iterator's whole lifetime.
    setup_iterator_bounds(kd, table_key,
                          m_eq_cond_lower_bound,
                          m_eq_cond_upper_bound,
                          &m_eq_cond_lower_bound_slice,
                          &m_eq_cond_upper_bound_slice);
    opts.iterate_lower_bound = &m_eq_cond_lower_bound_slice;
    opts.iterate_upper_bound = &m_eq_cond_upper_bound_slice;
    std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(opts, cf));

    it->Seek(table_key);
    while (it->Valid()) {
      const rocksdb::Slice key = it->key();
      // Stop once the key no longer belongs to this index.
      if (!kd.covers_key(key)) {
        break;
      }

      rocksdb::Status s;
      // SingleDelete is cheaper but only valid for keys written at most
      // once between deletions; can_use_single_delete() decides per index.
      if (can_use_single_delete(i)) {
        s = rdb->SingleDelete(wo, cf, key);
      } else {
        s = rdb->Delete(wo, cf, key);
      }

      if (!s.ok()) {
        return tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                    m_table_handler);
      }
      bytes_written += key.size();
      it->Next();
    }
  }

  // Account the deleted key bytes against the transaction's write stats.
  tx->update_bytes_written(bytes_written);

  return HA_EXIT_SUCCESS;
}
11011 | |
11012 | /** |
11013 | @return |
11014 | HA_EXIT_SUCCESS OK |
11015 | other HA_ERR error code (cannot be SE-specific) |
11016 | */ |
11017 | int ha_rocksdb::rename_table(const char *const from, const char *const to) { |
11018 | DBUG_ENTER_FUNC(); |
11019 | |
11020 | DBUG_ASSERT(from != nullptr); |
11021 | DBUG_ASSERT(to != nullptr); |
11022 | |
11023 | std::string from_str; |
11024 | std::string to_str; |
11025 | std::string from_db; |
11026 | std::string to_db; |
11027 | |
11028 | int rc = rdb_normalize_tablename(from, &from_str); |
11029 | if (rc != HA_EXIT_SUCCESS) { |
11030 | DBUG_RETURN(rc); |
11031 | } |
11032 | |
11033 | rc = rdb_split_normalized_tablename(from_str, &from_db); |
11034 | if (rc != HA_EXIT_SUCCESS) { |
11035 | DBUG_RETURN(rc); |
11036 | } |
11037 | |
11038 | rc = rdb_normalize_tablename(to, &to_str); |
11039 | if (rc != HA_EXIT_SUCCESS) { |
11040 | DBUG_RETURN(rc); |
11041 | } |
11042 | |
11043 | rc = rdb_split_normalized_tablename(to_str, &to_db); |
11044 | if (rc != HA_EXIT_SUCCESS) { |
11045 | DBUG_RETURN(rc); |
11046 | } |
11047 | |
11048 | // If the user changed the database part of the name then validate that the |
11049 | // 'to' database exists. |
11050 | if (from_db != to_db && !rdb_database_exists(to_db)) { |
11051 | // If we return a RocksDB specific error code here we get |
11052 | // "error: 206 - Unknown error 206". InnoDB gets |
11053 | // "error -1 - Unknown error -1" so let's match them. |
11054 | DBUG_RETURN(-1); |
11055 | } |
11056 | |
11057 | DBUG_EXECUTE_IF("gen_sql_table_name" , to_str = to_str + "#sql-test" ;); |
11058 | |
11059 | const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin(); |
11060 | rocksdb::WriteBatch *const batch = wb.get(); |
11061 | dict_manager.lock(); |
11062 | |
11063 | if (ddl_manager.rename(from_str, to_str, batch)) { |
11064 | rc = HA_ERR_NO_SUCH_TABLE; |
11065 | } else { |
11066 | rc = dict_manager.commit(batch); |
11067 | } |
11068 | dict_manager.unlock(); |
11069 | |
11070 | DBUG_RETURN(rc); |
11071 | } |
11072 | |
11073 | /** |
11074 | check_if_incompatible_data() called if ALTER TABLE can't detect otherwise |
11075 | if new and old definition are compatible |
11076 | |
  @details If there are no other explicit signs, like a changed number of
  fields, this function will be called by compare_tables()
  (sql/sql_table.cc) to decide whether we should rewrite the whole table or
  only the .frm file.
11081 | |
11082 | */ |
11083 | |
11084 | bool ha_rocksdb::check_if_incompatible_data(HA_CREATE_INFO *const info, |
11085 | uint table_changes) { |
11086 | DBUG_ENTER_FUNC(); |
11087 | |
11088 | DBUG_ASSERT(info != nullptr); |
11089 | |
11090 | // this function is needed only for online alter-table |
11091 | DBUG_RETURN(COMPATIBLE_DATA_NO); |
11092 | } |
11093 | |
11094 | /** |
11095 | @return |
11096 | HA_EXIT_SUCCESS OK |
11097 | */ |
11098 | int ha_rocksdb::(enum ha_extra_function operation) { |
11099 | DBUG_ENTER_FUNC(); |
11100 | |
11101 | switch (operation) { |
11102 | case HA_EXTRA_KEYREAD: |
11103 | m_keyread_only = true; |
11104 | break; |
11105 | case HA_EXTRA_NO_KEYREAD: |
11106 | m_keyread_only = false; |
11107 | break; |
11108 | case HA_EXTRA_FLUSH: |
11109 | /* |
11110 | If the table has blobs, then they are part of m_retrieved_record. |
11111 | This call invalidates them. |
11112 | */ |
11113 | m_retrieved_record.Reset(); |
11114 | break; |
11115 | default: |
11116 | break; |
11117 | } |
11118 | |
11119 | DBUG_RETURN(HA_EXIT_SUCCESS); |
11120 | } |
11121 | |
11122 | /* |
11123 | Given a starting key and an ending key, estimate the number of rows that |
11124 | will exist between the two keys. |
11125 | */ |
ha_rows ha_rocksdb::records_in_range(uint inx, key_range *const min_key,
                                     key_range *const max_key) {
  DBUG_ENTER_FUNC();

  // Debug/testing override: a fixed estimate from a session variable.
  ha_rows ret = THDVAR(ha_thd(), records_in_range);
  if (ret) {
    DBUG_RETURN(ret);
  }
  // Under FORCE INDEX a separate configured estimate may be returned to
  // bias the optimizer towards the forced index.
  if (table->force_index) {
    const ha_rows force_rows = THDVAR(ha_thd(), force_index_records_in_range);
    if (force_rows) {
      DBUG_RETURN(force_rows);
    }
  }

  const Rdb_key_def &kd = *m_key_descr_arr[inx];

  // Pack the lower endpoint into m_sk_packed_tuple, or use the index
  // infimum when no lower bound was supplied.
  uint size1 = 0;
  if (min_key) {
    size1 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                min_key->key, min_key->keypart_map);
    // For "after"-style flags the endpoint is exclusive, so advance the
    // packed key to its successor.
    if (min_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        min_key->flag == HA_READ_PREFIX_LAST ||
        min_key->flag == HA_READ_AFTER_KEY) {
      kd.successor(m_sk_packed_tuple, size1);
    }
  } else {
    kd.get_infimum_key(m_sk_packed_tuple, &size1);
  }

  // Pack the upper endpoint into m_sk_packed_tuple_old, or use the index
  // supremum when no upper bound was supplied.
  uint size2 = 0;
  if (max_key) {
    size2 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple_old,
                                max_key->key, max_key->keypart_map);
    if (max_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        max_key->flag == HA_READ_PREFIX_LAST ||
        max_key->flag == HA_READ_AFTER_KEY) {
      kd.successor(m_sk_packed_tuple_old, size2);
    }
    // pad the upper key with FFFFs to make sure it is more than the lower
    if (size1 > size2) {
      memset(m_sk_packed_tuple_old + size2, 0xff, size1 - size2);
      size2 = size1;
    }
  } else {
    kd.get_supremum_key(m_sk_packed_tuple_old, &size2);
  }

  const rocksdb::Slice slice1((const char *)m_sk_packed_tuple, size1);
  const rocksdb::Slice slice2((const char *)m_sk_packed_tuple_old, size2);

  // slice1 >= slice2 means no row will match
  if (slice1.compare(slice2) >= 0) {
    DBUG_RETURN(HA_EXIT_SUCCESS);
  }

  // For reverse column families the physical key order is inverted, so
  // swap the endpoints to form a valid ascending range.
  rocksdb::Range r(kd.m_is_reverse_cf ? slice2 : slice1,
                   kd.m_is_reverse_cf ? slice1 : slice2);

  uint64_t sz = 0;
  // Derive an average bytes-per-row from cached index statistics, falling
  // back to a fixed assumed size when no stats are available yet.
  auto disk_size = kd.m_stats.m_actual_disk_size;
  if (disk_size == 0)
    disk_size = kd.m_stats.m_data_size;
  auto rows = kd.m_stats.m_rows;
  if (rows == 0 || disk_size == 0) {
    rows = 1;
    disk_size = ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
  }

  // Getting statistics, including from Memtables
  uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;
  rdb->GetApproximateSizes(kd.get_cf(), &r, 1, &sz, include_flags);
  // Estimated rows = range size in bytes / average row size.
  ret = rows * sz / disk_size;
  uint64_t memTableCount;
  // Memtable entries are not covered by INCLUDE_FILES; add them separately.
  rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memTableCount, &sz);
  ret += memTableCount;

  /*
    GetApproximateSizes() gives estimates so ret might exceed stats.records.
    MySQL then decides to use full index scan rather than range scan, which
    is not efficient for most cases.
    To prevent this, changing estimated records slightly smaller than
    stats.records.
  */
  if (ret >= stats.records) {
    ret = stats.records * 0.99;
  }

  // Debug override for the optimizer; otherwise never report zero rows
  // for a non-empty range.
  if (rocksdb_debug_optimizer_n_rows > 0) {
    ret = rocksdb_debug_optimizer_n_rows;
  } else if (ret == 0) {
    ret = 1;
  }

  DBUG_RETURN(ret);
}
11222 | |
11223 | void ha_rocksdb::update_create_info(HA_CREATE_INFO *const create_info) { |
11224 | DBUG_ENTER_FUNC(); |
11225 | |
11226 | DBUG_ASSERT(create_info != nullptr); |
11227 | |
11228 | if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) { |
11229 | create_info->auto_increment_value = m_tbl_def->m_auto_incr_val; |
11230 | } |
11231 | |
11232 | DBUG_VOID_RETURN; |
11233 | } |
11234 | |
11235 | /** |
11236 | @brief |
11237 | Doing manual compaction on OPTIMIZE TABLE in RocksDB. |
11238 | Compaction itself is executed by background thread in RocksDB, but |
11239 | CompactRange() waits until compaction completes so this function |
11240 | may take a long time. |
11241 | Since RocksDB dataset is allocated per index id, OPTIMIZE TABLE |
11242 | triggers manual compaction for all indexes of the table. |
11243 | @details |
11244 | Compaction range is from the beginning of the index id to |
11245 | the first row of the next index id. When using reverse order |
11246 | column family, the first row of the next index id should be |
11247 | the last row of the previous index id. |
11248 | |
11249 | @return |
11250 | HA_ADMIN_OK OK |
11251 | other HA_ADMIN error code |
11252 | */ |
11253 | int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) { |
11254 | DBUG_ENTER_FUNC(); |
11255 | |
11256 | DBUG_ASSERT(thd != nullptr); |
11257 | DBUG_ASSERT(check_opt != nullptr); |
11258 | |
11259 | for (uint i = 0; i < table->s->keys; i++) { |
11260 | uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; |
11261 | auto range = get_range(i, buf); |
11262 | const rocksdb::Status s = rdb->CompactRange(getCompactRangeOptions(), |
11263 | m_key_descr_arr[i]->get_cf(), |
11264 | &range.start, &range.limit); |
11265 | if (!s.ok()) { |
11266 | DBUG_RETURN(rdb_error_to_mysql(s)); |
11267 | } |
11268 | } |
11269 | |
11270 | DBUG_RETURN(HA_EXIT_SUCCESS); |
11271 | } |
11272 | |
/*
  Recompute index statistics for every index of the table: row counts and
  sizes from SST file properties, plus per-prefix cardinality sampled from
  the memtables. The results are cached in the DDL manager and persisted.

  @param table_arg  table whose indexes are analyzed
  @param thd        current session (unused here beyond the interface)
  @param check_opt  ANALYZE options (unused here beyond the interface)
  @return HA_EXIT_SUCCESS or an error translated from RocksDB
*/
int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd,
                                HA_CHECK_OPT *const check_opt) {
  DBUG_ENTER_FUNC();

  // find per column family key ranges which need to be queried
  std::unordered_map<rocksdb::ColumnFamilyHandle *, std::vector<rocksdb::Range>>
      ranges;
  std::unordered_set<GL_INDEX_ID> ids_to_check;
  // One 2 * INDEX_NUMBER_SIZE scratch slot per key for range endpoints;
  // get_range() stores the endpoint bytes there, so `buf` must outlive
  // all uses of `ranges`.
  std::vector<uchar> buf(table_arg->s->keys * 2 *
                         Rdb_key_def::INDEX_NUMBER_SIZE);
  std::unordered_map<GL_INDEX_ID, Rdb_index_stats> stats;
  for (uint i = 0; i < table_arg->s->keys; i++) {
    const auto bufp = &buf[i * 2 * Rdb_key_def::INDEX_NUMBER_SIZE];
    const Rdb_key_def &kd = *m_key_descr_arr[i];
    const GL_INDEX_ID index_id = kd.get_gl_index_id();
    ranges[kd.get_cf()].push_back(get_range(i, bufp));

    ids_to_check.insert(index_id);
    // Initialize the stats to 0. If there are no files that contain
    // this gl_index_id, then 0 should be stored for the cached stats.
    stats[index_id] = Rdb_index_stats(index_id);
    DBUG_ASSERT(kd.get_key_parts() > 0);
    stats[index_id].m_distinct_keys_per_prefix.resize(kd.get_key_parts());
  }

  // get RocksDB table properties for these ranges
  rocksdb::TablePropertiesCollection props;
  for (auto it : ranges) {
    const auto old_size MY_ATTRIBUTE((__unused__)) = props.size();
    const auto status = rdb->GetPropertiesOfTablesInRange(
        it.first, &it.second[0], it.second.size(), &props);
    DBUG_ASSERT(props.size() >= old_size);
    if (!status.ok()) {
      DBUG_RETURN(
          rdb_error_to_mysql(status, "Could not access RocksDB properties" ));
    }
  }

  // Merge the per-SST index statistics into our per-index accumulators.
  int num_sst = 0;
  for (const auto &it : props) {
    std::vector<Rdb_index_stats> sst_stats;
    Rdb_tbl_prop_coll::read_stats_from_tbl_props(it.second, &sst_stats);
    /*
      sst_stats is a list of index statistics for indexes that have entries
      in the current SST file.
    */
    for (const auto &it1 : sst_stats) {
      /*
        Only update statistics for indexes that belong to this SQL table.

        The reason is: We are walking through all SST files that have
        entries from this table (and so can compute good statistics). For
        other SQL tables, it can be that we're only seeing a small fraction
        of table's entries (and so we can't update statistics based on that).
      */
      if (ids_to_check.find(it1.m_gl_index_id) == ids_to_check.end())
        continue;

      auto kd = ddl_manager.safe_find(it1.m_gl_index_id);
      DBUG_ASSERT(kd != nullptr);
      stats[it1.m_gl_index_id].merge(it1, true, kd->max_storage_fmt_length());
    }
    num_sst++;
  }

  // calculate memtable cardinality
  Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct);
  auto read_opts = rocksdb::ReadOptions();
  // Restrict the scan to memtables only; SST data was already covered by
  // the table-properties pass above.
  read_opts.read_tier = rocksdb::ReadTier::kMemtableTier;
  for (uint i = 0; i < table_arg->s->keys; i++) {
    const Rdb_key_def &kd = *m_key_descr_arr[i];
    Rdb_index_stats &stat = stats[kd.get_gl_index_id()];

    uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
    auto r = get_range(i, r_buf);
    uint64_t memtableCount;
    uint64_t memtableSize;
    rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memtableCount,
                                     &memtableSize);
    if (memtableCount < (uint64_t)stat.m_rows / 10) {
      // skip tables that already have enough stats from SST files to reduce
      // overhead and avoid degradation of big tables stats by sampling from
      // relatively tiny (less than 10% of full data set) memtable dataset
      continue;
    }

    std::unique_ptr<rocksdb::Iterator> it = std::unique_ptr<rocksdb::Iterator>(
        rdb->NewIterator(read_opts, kd.get_cf()));

    // Choose the member buffer matching the index type for the seek key.
    uchar *first_key;
    uint key_size;
    if (is_pk(i, table, m_tbl_def)) {
      first_key = m_pk_packed_tuple;
    } else {
      first_key = m_sk_packed_tuple;
    }
    kd.get_first_key(first_key, &key_size);
    rocksdb::Slice first_index_key((const char *)first_key, key_size);

    // Walk this index's memtable entries, counting rows and sampling keys
    // for per-prefix distinct-key estimates.
    cardinality_collector.Reset();
    for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) {
      const rocksdb::Slice key = it->key();
      if (!kd.covers_key(key)) {
        break; // end of this index
      }
      stat.m_rows++;

      cardinality_collector.ProcessKey(key, &kd, &stat);
    }
    cardinality_collector.AdjustStats(&stat);
  }

  // set and persist new stats
  ddl_manager.set_stats(stats);
  ddl_manager.persist_stats(true);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
11391 | |
11392 | /* |
11393 | @return |
11394 | HA_ADMIN_OK OK |
11395 | other HA_ADMIN error code |
11396 | */ |
11397 | int ha_rocksdb::analyze(THD *const thd, HA_CHECK_OPT *const check_opt) { |
11398 | DBUG_ENTER_FUNC(); |
11399 | |
11400 | if (table && calculate_stats(table, thd, check_opt) != HA_EXIT_SUCCESS) { |
11401 | DBUG_RETURN(HA_ADMIN_FAILED); |
11402 | } |
11403 | |
11404 | DBUG_RETURN(HA_ADMIN_OK); |
11405 | } |
11406 | |
/*
  Produce the next auto-increment value for an insert.

  @param off                auto_increment_offset for this session
  @param inc                auto_increment_increment for this session
  @param nb_desired_values  how many values MySQL estimates it needs
  @param[out] first_value   the value handed back to the SQL layer
  @param[out] nb_reserved_values  how many consecutive values were reserved
*/
void ha_rocksdb::get_auto_increment(ulonglong off, ulonglong inc,
                                    ulonglong nb_desired_values,
                                    ulonglong *const first_value,
                                    ulonglong *const nb_reserved_values) {
  /*
    MySQL has a somewhat complicated way of handling the auto-increment value.
    The first time get_auto_increment is called for a statement,
    nb_desired_values is the estimate for how many values will be needed. The
    engine can then reserve some values, and those will be automatically used
    by MySQL, until a hard-coded value shows up in the insert statement, after
    which MySQL again calls this function to reset its starting value.

    For simplicity we will just ignore nb_desired_values - we aren't going to
    reserve any extra values for a multi-insert statement. Each row will
    simply acquire the next value as needed and we will always tell MySQL that
    we only reserved 1 value. Since we are using an atomic value for
    m_auto_incr_val this should be safe - if we had to grab a mutex, doing
    an actual reserve of some values might be a better solution.
  */
  DEBUG_SYNC(ha_thd(), "rocksdb.autoinc_vars" );

  // An offset larger than the increment makes no sense; fall back to 1.
  if (off > inc) {
    off = 1;
  }

  Field *field;
  ulonglong new_val, max_val;
  field = table->key_info[table->s->next_number_index].key_part[0].field;
  max_val = rdb_get_int_col_max_value(field);

  // Local variable reference to simplify code below
  auto &auto_incr = m_tbl_def->m_auto_incr_val;

  if (inc == 1) {
    DBUG_ASSERT(off == 1);
    // Optimization for the standard case where we are always simply
    // incrementing from the last position

    // Use a CAS operation in a loop to make sure we atomically get the next
    // auto increment value while ensuring that we don't wrap around to a
    // negative number.
    //
    // We set auto_incr to the min of max_val and new_val + 1. This means that
    // if we're at the maximum, we should be returning the same value for
    // multiple rows, resulting in duplicate key errors (as expected).
    //
    // If we return values greater than the max, the SQL layer will "truncate"
    // the value anyway, but it means that we store invalid values into
    // auto_incr that will be visible in SHOW CREATE TABLE.
    new_val = auto_incr;
    while (new_val != std::numeric_limits<ulonglong>::max()) {
      if (auto_incr.compare_exchange_weak(new_val,
                                          std::min(new_val + 1, max_val))) {
        break;
      }
    }
  } else {
    // The next value can be more complicated if either 'inc' or 'off' is not 1
    ulonglong last_val = auto_incr;

    // Loop until we can correctly update the atomic value
    do {
      DBUG_ASSERT(last_val > 0);
      // Calculate the next value in the auto increment series: offset
      // + N * increment where N is 0, 1, 2, ...
      //
      // For further information please visit:
      // http://dev.mysql.com/doc/refman/5.7/en/replication-options-master.html
      //
      // The following is confusing so here is an explanation:
      // To get the next number in the sequence above you subtract out the
      // offset, calculate the next sequence (N * increment) and then add the
      // offset back in.
      //
      // The additions are rearranged to avoid overflow. The following is
      // equivalent to (last_val - 1 + inc - off) / inc. This uses the fact
      // that (a+b)/c = a/c + b/c + (a%c + b%c)/c. To show why:
      //
      // (a+b)/c
      // = (a - a%c + a%c + b - b%c + b%c) / c
      // = (a - a%c) / c + (b - b%c) / c + (a%c + b%c) / c
      // = a/c + b/c + (a%c + b%c) / c
      //
      // Now, substitute a = last_val - 1, b = inc - off, c = inc to get the
      // following statement.
      ulonglong n =
          (last_val - 1) / inc + ((last_val - 1) % inc + inc - off) / inc;

      // Check if n * inc + off will overflow. This can only happen if we have
      // an UNSIGNED BIGINT field.
      if (n > (std::numeric_limits<ulonglong>::max() - off) / inc) {
        DBUG_ASSERT(max_val == std::numeric_limits<ulonglong>::max());
        // The 'last_val' value is already equal to or larger than the largest
        // value in the sequence. Continuing would wrap around (technically
        // the behavior would be undefined). What should we do?
        // We could:
        // 1) set the new value to the last possible number in our sequence
        // as described above. The problem with this is that this
        // number could be smaller than a value in an existing row.
        // 2) set the new value to the largest possible number. This number
        // may not be in our sequence, but it is guaranteed to be equal
        // to or larger than any other value already inserted.
        //
        // For now I'm going to take option 2.
        //
        // Returning ULLONG_MAX from get_auto_increment will cause the SQL
        // layer to fail with ER_AUTOINC_READ_FAILED. This means that due to
        // the SE API for get_auto_increment, inserts will fail with
        // ER_AUTOINC_READ_FAILED if the column is UNSIGNED BIGINT, but
        // inserts will fail with ER_DUP_ENTRY for other types (or no failure
        // if the column is in a non-unique SK).
        new_val = std::numeric_limits<ulonglong>::max();
        auto_incr = new_val; // Store the largest value into auto_incr
        break;
      }

      new_val = n * inc + off;

      // Attempt to store the new value (plus 1 since m_auto_incr_val contains
      // the next available value) into the atomic value. If the current
      // value no longer matches what we have in 'last_val' this will fail and
      // we will repeat the loop (`last_val` will automatically get updated
      // with the current value).
      //
      // See above explanation for inc == 1 for why we use std::min.
    } while (!auto_incr.compare_exchange_weak(last_val,
                                              std::min(new_val + 1, max_val)));
  }

  *first_value = new_val;
  *nb_reserved_values = 1;
}
11539 | |
11540 | #ifndef DBUG_OFF |
11541 | |
11542 | /* Debugger help function */ |
11543 | static char dbug_item_print_buf[512]; |
11544 | |
11545 | const char *dbug_print_item(Item *const item) { |
11546 | char *const buf = dbug_item_print_buf; |
11547 | String str(buf, sizeof(dbug_item_print_buf), &my_charset_bin); |
11548 | str.length(0); |
11549 | if (!item) |
11550 | return "(Item*)nullptr" ; |
11551 | item->print(&str, QT_ORDINARY); |
11552 | if (str.c_ptr() == buf) |
11553 | return buf; |
11554 | else |
11555 | return "Couldn't fit into buffer" ; |
11556 | } |
11557 | |
11558 | #endif /*DBUG_OFF*/ |
11559 | |
11560 | /** |
11561 | SQL layer calls this function to push an index condition. |
11562 | |
11563 | @details |
11564 | The condition is for index keyno (only one condition can be pushed at a |
11565 | time). |
11566 | The caller guarantees that condition refers only to index fields; besides |
11567 | that, fields must have |
11568 | |
11569 | $field->part_of_key.set_bit(keyno) |
11570 | |
11571 | which means that |
11572 | |
11573 | (handler->index_flags(keyno, $keypart, 0) & HA_KEYREAD_ONLY) == 1 |
11574 | |
11575 | which means that field value can be restored from the index tuple. |
11576 | |
11577 | @return |
11578 | Part of condition we couldn't check (always nullptr). |
11579 | */ |
11580 | |
11581 | class Item *ha_rocksdb::idx_cond_push(uint keyno, class Item *const idx_cond) { |
11582 | DBUG_ENTER_FUNC(); |
11583 | |
11584 | DBUG_ASSERT(keyno != MAX_KEY); |
11585 | DBUG_ASSERT(idx_cond != nullptr); |
11586 | |
11587 | pushed_idx_cond = idx_cond; |
11588 | pushed_idx_cond_keyno = keyno; |
11589 | in_range_check_pushed_down = TRUE; |
11590 | |
11591 | /* We will check the whole condition */ |
11592 | DBUG_RETURN(nullptr); |
11593 | } |
11594 | |
11595 | /* |
11596 | Checks if inplace alter is supported for a given operation. |
11597 | */ |
11598 | |
11599 | my_core::enum_alter_inplace_result ha_rocksdb::check_if_supported_inplace_alter( |
11600 | TABLE *altered_table, my_core::Alter_inplace_info *const ha_alter_info) { |
11601 | DBUG_ENTER_FUNC(); |
11602 | |
11603 | DBUG_ASSERT(ha_alter_info != nullptr); |
11604 | |
11605 | if (ha_alter_info->handler_flags & |
11606 | ~(ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX | |
11607 | ALTER_DROP_UNIQUE_INDEX | |
11608 | ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX | |
11609 | ALTER_PARTITIONED | |
11610 | ALTER_ADD_UNIQUE_INDEX | |
11611 | ALTER_CHANGE_CREATE_OPTION)) { |
11612 | DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED); |
11613 | } |
11614 | |
11615 | /* We don't support unique keys on table w/ no primary keys */ |
11616 | if ((ha_alter_info->handler_flags & |
11617 | ALTER_ADD_UNIQUE_INDEX) && |
11618 | has_hidden_pk(altered_table)) { |
11619 | DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED); |
11620 | } |
11621 | |
11622 | /* We only support changing auto_increment for table options. */ |
11623 | if ((ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) && |
11624 | !(ha_alter_info->create_info->used_fields & HA_CREATE_USED_AUTO)) { |
11625 | DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED); |
11626 | } |
11627 | |
11628 | /* FIXME: MDEV-16099 Use alter algorithm=nocopy or algorithm=instant |
11629 | for non-InnoDB engine */ |
11630 | DBUG_RETURN(my_core::HA_ALTER_INPLACE_COPY_LOCK); |
11631 | } |
11632 | |
11633 | /** |
11634 | Allows the storage engine to update internal structures with concurrent |
11635 | writes blocked. If check_if_supported_inplace_alter() returns |
11636 | HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_COPY_LOCK. |
11637 | this function is called with exclusive lock otherwise the same level of |
11638 | locking as for inplace_alter_table() will be used. |
11639 | |
11640 | @note Storage engines are responsible for reporting any errors by |
11641 | calling my_error()/print_error() |
11642 | |
11643 | @note If this function reports error, commit_inplace_alter_table() |
11644 | will be called with commit= false. |
11645 | |
11646 | @note For partitioning, failing to prepare one partition, means that |
11647 | commit_inplace_alter_table() will be called to roll back changes for |
11648 | all partitions. This means that commit_inplace_alter_table() might be |
11649 | called without prepare_inplace_alter_table() having been called first |
11650 | for a given partition. |
11651 | |
11652 | @param altered_table TABLE object for new version of table. |
11653 | @param ha_alter_info Structure describing changes to be done |
11654 | by ALTER TABLE and holding data used |
11655 | during in-place alter. |
11656 | |
11657 | @retval true Error |
11658 | @retval false Success |
11659 | */ |
11660 | bool ha_rocksdb::prepare_inplace_alter_table( |
11661 | TABLE *const altered_table, |
11662 | my_core::Alter_inplace_info *const ha_alter_info) { |
11663 | DBUG_ENTER_FUNC(); |
11664 | |
11665 | DBUG_ASSERT(altered_table != nullptr); |
11666 | DBUG_ASSERT(ha_alter_info != nullptr); |
11667 | |
11668 | Rdb_tbl_def *new_tdef = nullptr; |
11669 | std::shared_ptr<Rdb_key_def> *old_key_descr = nullptr; |
11670 | std::shared_ptr<Rdb_key_def> *new_key_descr = nullptr; |
11671 | uint old_n_keys = m_tbl_def->m_key_count; |
11672 | uint new_n_keys = altered_table->s->keys; |
11673 | std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes; |
11674 | std::unordered_set<GL_INDEX_ID> dropped_index_ids; |
11675 | uint n_dropped_keys = 0; |
11676 | uint n_added_keys = 0; |
11677 | ulonglong max_auto_incr = 0; |
11678 | |
11679 | if (ha_alter_info->handler_flags & |
11680 | (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX | |
11681 | ALTER_DROP_UNIQUE_INDEX | |
11682 | ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX | |
11683 | ALTER_ADD_UNIQUE_INDEX)) { |
11684 | |
11685 | if (has_hidden_pk(altered_table)) { |
11686 | new_n_keys += 1; |
11687 | } |
11688 | |
11689 | const TABLE *const old_table = table; |
11690 | old_key_descr = m_tbl_def->m_key_descr_arr; |
11691 | new_key_descr = new std::shared_ptr<Rdb_key_def>[new_n_keys]; |
11692 | |
11693 | new_tdef = new Rdb_tbl_def(m_tbl_def->full_tablename()); |
11694 | new_tdef->m_key_descr_arr = new_key_descr; |
11695 | new_tdef->m_key_count = new_n_keys; |
11696 | new_tdef->m_auto_incr_val = |
11697 | m_tbl_def->m_auto_incr_val.load(std::memory_order_relaxed); |
11698 | new_tdef->m_hidden_pk_val = |
11699 | m_tbl_def->m_hidden_pk_val.load(std::memory_order_relaxed); |
11700 | |
11701 | if (create_key_defs(altered_table, new_tdef, table, m_tbl_def)) { |
11702 | /* Delete the new key descriptors */ |
11703 | delete[] new_key_descr; |
11704 | |
11705 | /* |
11706 | Explicitly mark as nullptr so we don't accidentally remove entries |
11707 | from data dictionary on cleanup (or cause double delete[]). |
11708 | */ |
11709 | new_tdef->m_key_descr_arr = nullptr; |
11710 | delete new_tdef; |
11711 | |
11712 | my_error(ER_KEY_CREATE_DURING_ALTER, MYF(0)); |
11713 | DBUG_RETURN(HA_EXIT_FAILURE); |
11714 | } |
11715 | |
11716 | uint i; |
11717 | uint j; |
11718 | |
11719 | /* Determine which(if any) key definition(s) need to be dropped */ |
11720 | for (i = 0; i < ha_alter_info->index_drop_count; i++) { |
11721 | const KEY *const dropped_key = ha_alter_info->index_drop_buffer[i]; |
11722 | for (j = 0; j < old_n_keys; j++) { |
11723 | const KEY *const old_key = |
11724 | &old_table->key_info[old_key_descr[j]->get_keyno()]; |
11725 | |
11726 | if (!compare_keys(old_key, dropped_key)) { |
11727 | dropped_index_ids.insert(old_key_descr[j]->get_gl_index_id()); |
11728 | break; |
11729 | } |
11730 | } |
11731 | } |
11732 | |
11733 | /* Determine which(if any) key definitions(s) need to be added */ |
11734 | int identical_indexes_found = 0; |
11735 | for (i = 0; i < ha_alter_info->index_add_count; i++) { |
11736 | const KEY *const added_key = |
11737 | &ha_alter_info->key_info_buffer[ha_alter_info->index_add_buffer[i]]; |
11738 | for (j = 0; j < new_n_keys; j++) { |
11739 | const KEY *const new_key = |
11740 | &altered_table->key_info[new_key_descr[j]->get_keyno()]; |
11741 | if (!compare_keys(new_key, added_key)) { |
11742 | /* |
11743 | Check for cases where an 'identical' index is being dropped and |
11744 | re-added in a single ALTER statement. Turn this into a no-op as the |
11745 | index has not changed. |
11746 | |
11747 | E.G. Unique index -> non-unique index requires no change |
11748 | |
11749 | Note that cases where the index name remains the same but the |
11750 | key-parts are changed is already handled in create_inplace_key_defs. |
11751 | In these cases the index needs to be rebuilt. |
11752 | */ |
11753 | if (dropped_index_ids.count(new_key_descr[j]->get_gl_index_id())) { |
11754 | dropped_index_ids.erase(new_key_descr[j]->get_gl_index_id()); |
11755 | identical_indexes_found++; |
11756 | } else { |
11757 | added_indexes.insert(new_key_descr[j]); |
11758 | } |
11759 | |
11760 | break; |
11761 | } |
11762 | } |
11763 | } |
11764 | |
11765 | n_dropped_keys = ha_alter_info->index_drop_count - identical_indexes_found; |
11766 | n_added_keys = ha_alter_info->index_add_count - identical_indexes_found; |
11767 | DBUG_ASSERT(dropped_index_ids.size() == n_dropped_keys); |
11768 | DBUG_ASSERT(added_indexes.size() == n_added_keys); |
11769 | DBUG_ASSERT(new_n_keys == (old_n_keys - n_dropped_keys + n_added_keys)); |
11770 | } |
11771 | if (ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) { |
11772 | if (!new_tdef) { |
11773 | new_tdef = m_tbl_def; |
11774 | } |
11775 | if (table->found_next_number_field) { |
11776 | max_auto_incr = load_auto_incr_value_from_index(); |
11777 | } |
11778 | } |
11779 | |
11780 | ha_alter_info->handler_ctx = new Rdb_inplace_alter_ctx( |
11781 | new_tdef, old_key_descr, new_key_descr, old_n_keys, new_n_keys, |
11782 | added_indexes, dropped_index_ids, n_added_keys, n_dropped_keys, |
11783 | max_auto_incr); |
11784 | DBUG_RETURN(HA_EXIT_SUCCESS); |
11785 | } |
11786 | |
11787 | /** |
11788 | Alter the table structure in-place with operations specified using |
11789 | HA_ALTER_FLAGS and Alter_inplace_info. The level of concurrency allowed |
11790 | during this operation depends on the return value from |
11791 | check_if_supported_inplace_alter(). |
11792 | |
11793 | @note Storage engines are responsible for reporting any errors by |
11794 | calling my_error()/print_error() |
11795 | |
11796 | @note If this function reports error, commit_inplace_alter_table() |
11797 | will be called with commit= false. |
11798 | |
11799 | @param altered_table TABLE object for new version of table. |
11800 | @param ha_alter_info Structure describing changes to be done |
11801 | by ALTER TABLE and holding data used |
11802 | during in-place alter. |
11803 | |
11804 | @retval true Error |
11805 | @retval false Success |
11806 | */ |
bool ha_rocksdb::inplace_alter_table(
    TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);
  DBUG_ASSERT(ha_alter_info->handler_ctx != nullptr);

  /* Context that was created earlier by prepare_inplace_alter_table(). */
  Rdb_inplace_alter_ctx *const ctx =
      static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);

  /* Only ADD INDEX variants require work here; drops are handled at commit. */
  if (ha_alter_info->handler_flags &
      (ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {
    /*
      Buffers need to be set up again to account for new, possibly longer
      secondary keys.
    */
    free_key_buffers();

    DBUG_ASSERT(ctx != nullptr);

    /*
      If adding unique index, allocate special buffers for duplicate checking.
    */
    int err;
    if ((err = alloc_key_buffers(
             altered_table, ctx->m_new_tdef,
             ha_alter_info->handler_flags &
                 ALTER_ADD_UNIQUE_INDEX))) {
      my_error(ER_OUT_OF_RESOURCES, MYF(0));
      /* Non-zero err converts to 'true' (failure) for this bool interface. */
      DBUG_RETURN(err);
    }

    /* Populate all new secondary keys by scanning the primary key. */
    if ((err = inplace_populate_sk(altered_table, ctx->m_added_indexes))) {
      my_error(ER_SK_POPULATE_DURING_ALTER, MYF(0));
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  }

  /* Debug hook: simulate index-creation failure to exercise rollback. */
  DBUG_EXECUTE_IF("myrocks_simulate_index_create_rollback" , {
    dbug_create_err_inplace_alter();
    DBUG_RETURN(HA_EXIT_FAILURE);
  };);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
11856 | |
11857 | /** |
11858 | Scan the Primary Key index entries and populate the new secondary keys. |
11859 | */ |
11860 | int ha_rocksdb::inplace_populate_sk( |
11861 | TABLE *const new_table_arg, |
11862 | const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes) { |
11863 | DBUG_ENTER_FUNC(); |
11864 | int res = HA_EXIT_SUCCESS; |
11865 | const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin(); |
11866 | rocksdb::WriteBatch *const batch = wb.get(); |
11867 | |
11868 | /* Update the data dictionary */ |
11869 | std::unordered_set<GL_INDEX_ID> create_index_ids; |
11870 | for (const auto &index : indexes) { |
11871 | create_index_ids.insert(index->get_gl_index_id()); |
11872 | } |
11873 | dict_manager.add_create_index(create_index_ids, batch); |
11874 | res = dict_manager.commit(batch); |
11875 | if (res != HA_EXIT_SUCCESS) { |
11876 | return res; |
11877 | } |
11878 | |
11879 | /* |
11880 | Add uncommitted key definitons to ddl_manager. We need to do this |
11881 | so that the property collector can find this keydef when it needs to |
11882 | update stats. The property collector looks for the keydef in the |
11883 | data dictionary, but it won't be there yet since this key definition |
11884 | is still in the creation process. |
11885 | */ |
11886 | ddl_manager.add_uncommitted_keydefs(indexes); |
11887 | |
11888 | const bool hidden_pk_exists = has_hidden_pk(table); |
11889 | |
11890 | Rdb_transaction *tx = get_or_create_tx(table->in_use); |
11891 | |
11892 | /* |
11893 | There is one specific scenario where m_sst_info may not be nullptr. This |
11894 | happens if the handler we're using happens to be the handler where the PK |
11895 | bulk load was done on. The sequence of events that lead to this is as |
11896 | follows (T1 is PK bulk load, T2 is SK alter table): |
11897 | |
11898 | T1: Execute last INSERT statement |
11899 | T1: Return TABLE and handler object back to Table_cache_manager |
11900 | T1: Close connection |
11901 | T2: Execute ALTER statement |
11902 | T2: Take same TABLE/handler from Table_cache_manager |
11903 | T2: Call closefrm which will call finalize_bulk_load on every other open |
11904 | table/handler *except* the one it's on. |
11905 | T2: Acquire stale snapshot of PK |
11906 | T1: Call finalize_bulk_load |
11907 | |
11908 | This is rare because usually, closefrm will call the destructor (and thus |
11909 | finalize_bulk_load) on the handler where PK bulk load is done. However, if |
11910 | the thread ids of the bulk load thread and the alter thread differ by a |
11911 | multiple of table_cache_instances (8 by default), then they hash to the |
11912 | same bucket in Table_cache_manager and the alter thread will not not call |
11913 | the destructor on the handler it is holding. Thus, its m_sst_info will not |
11914 | be nullptr. |
11915 | |
11916 | At this point, it is safe to refresh the snapshot because we know all other |
11917 | open handlers have been closed at this point, and the one we're on is the |
11918 | only one left. |
11919 | */ |
11920 | if (m_sst_info) { |
11921 | if ((res = finalize_bulk_load())) { |
11922 | DBUG_RETURN(res); |
11923 | } |
11924 | tx->commit(); |
11925 | } |
11926 | |
11927 | const ulonglong rdb_merge_buf_size = THDVAR(ha_thd(), merge_buf_size); |
11928 | const ulonglong rdb_merge_combine_read_size = |
11929 | THDVAR(ha_thd(), merge_combine_read_size); |
11930 | const ulonglong rdb_merge_tmp_file_removal_delay = |
11931 | THDVAR(ha_thd(), merge_tmp_file_removal_delay_ms); |
11932 | |
11933 | for (const auto &index : indexes) { |
11934 | bool is_unique_index = |
11935 | new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME; |
11936 | |
11937 | Rdb_index_merge rdb_merge(tx->get_rocksdb_tmpdir(), rdb_merge_buf_size, |
11938 | rdb_merge_combine_read_size, |
11939 | rdb_merge_tmp_file_removal_delay, |
11940 | index->get_cf()); |
11941 | |
11942 | if ((res = rdb_merge.init())) { |
11943 | DBUG_RETURN(res); |
11944 | } |
11945 | |
11946 | /* |
11947 | Note: We pass in the currently existing table + tbl_def object here, |
11948 | as the pk index position may have changed in the case of hidden primary |
11949 | keys. |
11950 | */ |
11951 | const uint pk = pk_index(table, m_tbl_def); |
11952 | ha_index_init(pk, true); |
11953 | |
11954 | /* Scan each record in the primary key in order */ |
11955 | for (res = index_first(table->record[0]); res == 0; |
11956 | res = index_next(table->record[0])) { |
11957 | longlong hidden_pk_id = 0; |
11958 | if (hidden_pk_exists && |
11959 | (res = read_hidden_pk_id_from_rowkey(&hidden_pk_id))) { |
11960 | // NO_LINT_DEBUG |
11961 | sql_print_error("Error retrieving hidden pk id." ); |
11962 | ha_index_end(); |
11963 | DBUG_RETURN(res); |
11964 | } |
11965 | |
11966 | /* Create new secondary index entry */ |
11967 | const int new_packed_size = index->pack_record( |
11968 | new_table_arg, m_pack_buffer, table->record[0], m_sk_packed_tuple, |
11969 | &m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id, 0, |
11970 | nullptr, nullptr, m_ttl_bytes); |
11971 | |
11972 | const rocksdb::Slice key = rocksdb::Slice( |
11973 | reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size); |
11974 | const rocksdb::Slice val = |
11975 | rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()), |
11976 | m_sk_tails.get_current_pos()); |
11977 | |
11978 | /* |
11979 | Add record to offset tree in preparation for writing out to |
11980 | disk in sorted chunks. |
11981 | */ |
11982 | if ((res = rdb_merge.add(key, val))) { |
11983 | ha_index_end(); |
11984 | DBUG_RETURN(res); |
11985 | } |
11986 | } |
11987 | |
11988 | if (res != HA_ERR_END_OF_FILE) { |
11989 | // NO_LINT_DEBUG |
11990 | sql_print_error("Error retrieving index entry from primary key." ); |
11991 | ha_index_end(); |
11992 | DBUG_RETURN(res); |
11993 | } |
11994 | |
11995 | ha_index_end(); |
11996 | |
11997 | /* |
11998 | Perform an n-way merge of n sorted buffers on disk, then writes all |
11999 | results to RocksDB via SSTFileWriter API. |
12000 | */ |
12001 | rocksdb::Slice merge_key; |
12002 | rocksdb::Slice merge_val; |
12003 | |
12004 | struct unique_sk_buf_info sk_info; |
12005 | sk_info.dup_sk_buf = m_dup_sk_packed_tuple; |
12006 | sk_info.dup_sk_buf_old = m_dup_sk_packed_tuple_old; |
12007 | |
12008 | while ((res = rdb_merge.next(&merge_key, &merge_val)) == 0) { |
12009 | /* Perform uniqueness check if needed */ |
12010 | if (is_unique_index) { |
12011 | if (check_duplicate_sk(new_table_arg, *index, &merge_key, &sk_info)) { |
12012 | /* |
12013 | Duplicate entry found when trying to create unique secondary key. |
12014 | We need to unpack the record into new_table_arg->record[0] as it |
12015 | is used inside print_keydup_error so that the error message shows |
12016 | the duplicate record. |
12017 | */ |
12018 | if (index->unpack_record(new_table_arg, new_table_arg->record[0], |
12019 | &merge_key, nullptr, |
12020 | m_verify_row_debug_checksums)) { |
12021 | /* Should never reach here */ |
12022 | DBUG_ASSERT(0); |
12023 | } |
12024 | |
12025 | print_keydup_error(new_table_arg, |
12026 | &new_table_arg->key_info[index->get_keyno()], |
12027 | MYF(0)); |
12028 | DBUG_RETURN(ER_DUP_ENTRY); |
12029 | } |
12030 | } |
12031 | |
12032 | /* |
12033 | Insert key and slice to SST via SSTFileWriter API. |
12034 | */ |
12035 | if ((res = bulk_load_key(tx, *index, merge_key, merge_val, false))) { |
12036 | break; |
12037 | } |
12038 | } |
12039 | |
12040 | /* |
12041 | Here, res == -1 means that we are finished, while > 0 means an error |
12042 | occurred. |
12043 | */ |
12044 | if (res > 0) { |
12045 | // NO_LINT_DEBUG |
12046 | sql_print_error("Error while bulk loading keys in external merge sort." ); |
12047 | DBUG_RETURN(res); |
12048 | } |
12049 | |
12050 | if ((res = tx->finish_bulk_load())) { |
12051 | // NO_LINT_DEBUG |
12052 | sql_print_error("Error finishing bulk load." ); |
12053 | DBUG_RETURN(res); |
12054 | } |
12055 | } |
12056 | |
12057 | /* |
12058 | Explicitly tell jemalloc to clean up any unused dirty pages at this point. |
12059 | See https://reviews.facebook.net/D63723 for more details. |
12060 | */ |
12061 | purge_all_jemalloc_arenas(); |
12062 | |
12063 | DBUG_EXECUTE_IF("crash_during_online_index_creation" , DBUG_SUICIDE();); |
12064 | DBUG_RETURN(res); |
12065 | } |
12066 | |
12067 | /** |
12068 | Commit or rollback the changes made during prepare_inplace_alter_table() |
12069 | and inplace_alter_table() inside the storage engine. |
12070 | Note that in case of rollback the allowed level of concurrency during |
12071 | this operation will be the same as for inplace_alter_table() and thus |
12072 | might be higher than during prepare_inplace_alter_table(). (For example, |
12073 | concurrent writes were blocked during prepare, but might not be during |
12074 | rollback). |
12075 | |
12076 | @note Storage engines are responsible for reporting any errors by |
12077 | calling my_error()/print_error() |
12078 | |
12079 | @note If this function with commit= true reports error, it will be called |
12080 | again with commit= false. |
12081 | |
12082 | @note In case of partitioning, this function might be called for rollback |
12083 | without prepare_inplace_alter_table() having been called first. |
  Also, partitioned tables set ha_alter_info->group_commit_ctx to a
  NULL-terminated array of the partition handlers. If all of them are
  committed as one, group_commit_ctx should be set to NULL to indicate
  to the partitioning handler that all partition handlers are committed.
12088 | @see prepare_inplace_alter_table(). |
12089 | |
12090 | @param altered_table TABLE object for new version of table. |
12091 | @param ha_alter_info Structure describing changes to be done |
12092 | by ALTER TABLE and holding data used |
12093 | during in-place alter. |
12094 | @param commit True => Commit, False => Rollback. |
12095 | |
12096 | @retval true Error |
12097 | @retval false Success |
12098 | */ |
bool ha_rocksdb::commit_inplace_alter_table(
    my_core::TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info, bool commit) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);

  /* Context created by prepare_inplace_alter_table(); may be null on
     rollback of a partition that never went through prepare. */
  Rdb_inplace_alter_ctx *const ctx0 =
      static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);

  DEBUG_SYNC(ha_thd(), "rocksdb.commit_in_place_alter_table" );

  /*
    IMPORTANT: When rollback is requested, mysql will abort with
    an assertion failure. That means every failed commit during inplace alter
    table will result in a fatal error on the server. Indexes ongoing creation
    will be detected when the server restarts, and dropped.

    For partitioned tables, a rollback call to this function (commit == false)
    is done for each partition. A successful commit call only executes once
    for all partitions.
  */
  if (!commit) {
    /* If ctx has not been created yet, nothing to do here */
    if (!ctx0) {
      DBUG_RETURN(HA_EXIT_SUCCESS);
    }

    /*
      Cannot call destructor for Rdb_tbl_def directly because we don't want to
      erase the mappings inside the ddl_manager, as the old_key_descr is still
      using them.
    */
    if (ctx0->m_new_key_descr) {
      /* Delete the new key descriptors */
      for (uint i = 0; i < ctx0->m_new_tdef->m_key_count; i++) {
        ctx0->m_new_key_descr[i] = nullptr;
      }

      delete[] ctx0->m_new_key_descr;
      ctx0->m_new_key_descr = nullptr;
      ctx0->m_new_tdef->m_key_descr_arr = nullptr;

      delete ctx0->m_new_tdef;
    }

    /* Remove uncommitted key definitions from ddl_manager */
    ddl_manager.remove_uncommitted_keydefs(ctx0->m_added_indexes);

    /* Rollback any partially created indexes */
    dict_manager.rollback_ongoing_index_creation();

    DBUG_RETURN(HA_EXIT_SUCCESS);
  }

  DBUG_ASSERT(ctx0);

  /*
    For partitioned tables, we need to commit all changes to all tables at
    once, unlike in the other inplace alter API methods.
  */
  inplace_alter_handler_ctx **ctx_array;
  inplace_alter_handler_ctx *ctx_single[2];

  if (ha_alter_info->group_commit_ctx) {
    DBUG_EXECUTE_IF("crash_during_index_creation_partition" , DBUG_SUICIDE(););
    ctx_array = ha_alter_info->group_commit_ctx;
  } else {
    /* Non-partitioned table: build a one-element, null-terminated array. */
    ctx_single[0] = ctx0;
    ctx_single[1] = nullptr;
    ctx_array = ctx_single;
  }

  DBUG_ASSERT(ctx0 == ctx_array[0]);
  /* Clear group_commit_ctx to signal the partitioning handler that all
     partitions are being committed here in one shot. */
  ha_alter_info->group_commit_ctx = nullptr;

  if (ha_alter_info->handler_flags &
      (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_DROP_UNIQUE_INDEX |
       ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {
    const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
    rocksdb::WriteBatch *const batch = wb.get();
    std::unordered_set<GL_INDEX_ID> create_index_ids;

    /* Switch this handler over to the new table definition. */
    m_tbl_def = ctx0->m_new_tdef;
    m_key_descr_arr = m_tbl_def->m_key_descr_arr;
    m_pk_descr = m_key_descr_arr[pk_index(altered_table, m_tbl_def)];

    dict_manager.lock();
    for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
      Rdb_inplace_alter_ctx *const ctx =
          static_cast<Rdb_inplace_alter_ctx *>(*pctx);

      /* Mark indexes to be dropped */
      dict_manager.add_drop_index(ctx->m_dropped_index_ids, batch);

      for (const auto &index : ctx->m_added_indexes) {
        create_index_ids.insert(index->get_gl_index_id());
      }

      if (ddl_manager.put_and_write(ctx->m_new_tdef, batch)) {
        /*
          Failed to write new entry into data dictionary, this should never
          happen.
        */
        DBUG_ASSERT(0);
      }

      /*
        Remove uncommitted key definitions from ddl_manager, as they are now
        committed into the data dictionary.
      */
      ddl_manager.remove_uncommitted_keydefs(ctx->m_added_indexes);
    }

    if (dict_manager.commit(batch)) {
      /*
        Should never reach here. We assume MyRocks will abort if commit fails.
      */
      DBUG_ASSERT(0);
    }

    dict_manager.unlock();

    /* Mark ongoing create indexes as finished/remove from data dictionary */
    dict_manager.finish_indexes_operation(
        create_index_ids, Rdb_key_def::DDL_CREATE_INDEX_ONGOING);

    /*
      We need to recalculate the index stats here manually. The reason is that
      the secondary index does not exist inside
      m_index_num_to_keydef until it is committed to the data dictionary, which
      prevents us from updating the stats normally as the ddl_manager cannot
      find the proper gl_index_ids yet during adjust_stats calls.
    */
    if (calculate_stats(altered_table, nullptr, nullptr)) {
      /* Failed to update index statistics, should never happen */
      DBUG_ASSERT(0);
    }

    /* Wake the background thread that physically drops old index data. */
    rdb_drop_idx_thread.signal();
  }

  if (ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) {
    const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
    rocksdb::WriteBatch *const batch = wb.get();
    std::unordered_set<GL_INDEX_ID> create_index_ids;

    ulonglong auto_incr_val = ha_alter_info->create_info->auto_increment_value;

    /* Persist the largest auto-increment value across all partitions. */
    for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
      Rdb_inplace_alter_ctx *const ctx =
          static_cast<Rdb_inplace_alter_ctx *>(*pctx);
      auto_incr_val = std::max(auto_incr_val, ctx->m_max_auto_incr);
      dict_manager.put_auto_incr_val(
          batch, ctx->m_new_tdef->get_autoincr_gl_index_id(), auto_incr_val,
          true /* overwrite */);
      ctx->m_new_tdef->m_auto_incr_val = auto_incr_val;
    }

    if (dict_manager.commit(batch)) {
      DBUG_ASSERT(0);
    }
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12268 | |
/* Name of the generated SHOW-status callback function for a given ticker. */
#define SHOW_FNAME(name) rocksdb_show_##name

/*
  Define a SHOW-status callback that copies the given RocksDB ticker value
  into rocksdb_status_counters.<name> and exposes it as a LONGLONG status
  variable.
*/
#define DEF_SHOW_FUNC(name, key)                                           \
  static int SHOW_FNAME(name)(MYSQL_THD thd, SHOW_VAR * var, char *buff) { \
    rocksdb_status_counters.name =                                         \
        rocksdb_stats->getTickerCount(rocksdb::key);                       \
    var->type = SHOW_LONGLONG;                                             \
    var->value = (char *)&rocksdb_status_counters.name;                    \
    return HA_EXIT_SUCCESS;                                                \
  }

/* Status variable backed by a SHOW-function defined via DEF_SHOW_FUNC. */
#define DEF_STATUS_VAR(name) \
  { "rocksdb_" #name, (char *)&SHOW_FNAME(name), SHOW_FUNC }

/* Status variable backed directly by a pointer; name gets "rocksdb_" prefix. */
#define DEF_STATUS_VAR_PTR(name, ptr, option) \
  { "rocksdb_" name, (char *)ptr, option }

/* Status variable backed by a pointer; name used verbatim (no prefix). */
#define DEF_STATUS_VAR_FUNC(name, ptr, option) \
  { name, reinterpret_cast<char *>(ptr), option }
12288 | |
/*
  Snapshot of RocksDB ticker statistics, refreshed by the DEF_SHOW_FUNC
  callbacks. Each field mirrors one rocksdb::Tickers counter and is exposed
  to SHOW STATUS as a LONGLONG value. Field names must match the 'name'
  argument used in the corresponding DEF_SHOW_FUNC invocation below.
*/
struct rocksdb_status_counters_t {
  uint64_t block_cache_miss;
  uint64_t block_cache_hit;
  uint64_t block_cache_add;
  uint64_t block_cache_add_failures;
  uint64_t block_cache_index_miss;
  uint64_t block_cache_index_hit;
  uint64_t block_cache_index_add;
  uint64_t block_cache_index_bytes_insert;
  uint64_t block_cache_index_bytes_evict;
  uint64_t block_cache_filter_miss;
  uint64_t block_cache_filter_hit;
  uint64_t block_cache_filter_add;
  uint64_t block_cache_filter_bytes_insert;
  uint64_t block_cache_filter_bytes_evict;
  uint64_t block_cache_bytes_read;
  uint64_t block_cache_bytes_write;
  uint64_t block_cache_data_bytes_insert;
  uint64_t block_cache_data_miss;
  uint64_t block_cache_data_hit;
  uint64_t block_cache_data_add;
  uint64_t bloom_filter_useful;
  uint64_t memtable_hit;
  uint64_t memtable_miss;
  uint64_t get_hit_l0;
  uint64_t get_hit_l1;
  uint64_t get_hit_l2_and_up;
  uint64_t compaction_key_drop_new;
  uint64_t compaction_key_drop_obsolete;
  uint64_t compaction_key_drop_user;
  uint64_t number_keys_written;
  uint64_t number_keys_read;
  uint64_t number_keys_updated;
  uint64_t bytes_written;
  uint64_t bytes_read;
  uint64_t number_db_seek;
  uint64_t number_db_seek_found;
  uint64_t number_db_next;
  uint64_t number_db_next_found;
  uint64_t number_db_prev;
  uint64_t number_db_prev_found;
  uint64_t iter_bytes_read;
  uint64_t no_file_closes;
  uint64_t no_file_opens;
  uint64_t no_file_errors;
  uint64_t stall_micros;
  uint64_t num_iterators;
  uint64_t number_multiget_get;
  uint64_t number_multiget_keys_read;
  uint64_t number_multiget_bytes_read;
  uint64_t number_deletes_filtered;
  uint64_t number_merge_failures;
  uint64_t bloom_filter_prefix_checked;
  uint64_t bloom_filter_prefix_useful;
  uint64_t number_reseeks_iteration;
  uint64_t getupdatessince_calls;
  uint64_t block_cachecompressed_miss;
  uint64_t block_cachecompressed_hit;
  uint64_t wal_synced;
  uint64_t wal_bytes;
  uint64_t write_self;
  uint64_t write_other;
  uint64_t write_timedout;
  uint64_t write_wal;
  uint64_t flush_write_bytes;
  uint64_t compact_read_bytes;
  uint64_t compact_write_bytes;
  uint64_t number_superversion_acquires;
  uint64_t number_superversion_releases;
  uint64_t number_superversion_cleanups;
  uint64_t number_block_not_compressed;
};

/* Single global instance written by the SHOW callbacks. */
static rocksdb_status_counters_t rocksdb_status_counters;
12363 | |
/*
  Instantiate one SHOW-status callback per exported RocksDB ticker.
  The first argument names the rocksdb_status_counters_t field (and the
  rocksdb_<name> status variable); the second is the rocksdb::Tickers enum.
*/
DEF_SHOW_FUNC(block_cache_miss, BLOCK_CACHE_MISS)
DEF_SHOW_FUNC(block_cache_hit, BLOCK_CACHE_HIT)
DEF_SHOW_FUNC(block_cache_add, BLOCK_CACHE_ADD)
DEF_SHOW_FUNC(block_cache_add_failures, BLOCK_CACHE_ADD_FAILURES)
DEF_SHOW_FUNC(block_cache_index_miss, BLOCK_CACHE_INDEX_MISS)
DEF_SHOW_FUNC(block_cache_index_hit, BLOCK_CACHE_INDEX_HIT)
DEF_SHOW_FUNC(block_cache_index_add, BLOCK_CACHE_INDEX_ADD)
DEF_SHOW_FUNC(block_cache_index_bytes_insert, BLOCK_CACHE_INDEX_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_index_bytes_evict, BLOCK_CACHE_INDEX_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_filter_miss, BLOCK_CACHE_FILTER_MISS)
DEF_SHOW_FUNC(block_cache_filter_hit, BLOCK_CACHE_FILTER_HIT)
DEF_SHOW_FUNC(block_cache_filter_add, BLOCK_CACHE_FILTER_ADD)
DEF_SHOW_FUNC(block_cache_filter_bytes_insert, BLOCK_CACHE_FILTER_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_filter_bytes_evict, BLOCK_CACHE_FILTER_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_bytes_read, BLOCK_CACHE_BYTES_READ)
DEF_SHOW_FUNC(block_cache_bytes_write, BLOCK_CACHE_BYTES_WRITE)
DEF_SHOW_FUNC(block_cache_data_bytes_insert, BLOCK_CACHE_DATA_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_data_miss, BLOCK_CACHE_DATA_MISS)
DEF_SHOW_FUNC(block_cache_data_hit, BLOCK_CACHE_DATA_HIT)
DEF_SHOW_FUNC(block_cache_data_add, BLOCK_CACHE_DATA_ADD)
DEF_SHOW_FUNC(bloom_filter_useful, BLOOM_FILTER_USEFUL)
DEF_SHOW_FUNC(memtable_hit, MEMTABLE_HIT)
DEF_SHOW_FUNC(memtable_miss, MEMTABLE_MISS)
DEF_SHOW_FUNC(get_hit_l0, GET_HIT_L0)
DEF_SHOW_FUNC(get_hit_l1, GET_HIT_L1)
DEF_SHOW_FUNC(get_hit_l2_and_up, GET_HIT_L2_AND_UP)
DEF_SHOW_FUNC(compaction_key_drop_new, COMPACTION_KEY_DROP_NEWER_ENTRY)
DEF_SHOW_FUNC(compaction_key_drop_obsolete, COMPACTION_KEY_DROP_OBSOLETE)
DEF_SHOW_FUNC(compaction_key_drop_user, COMPACTION_KEY_DROP_USER)
DEF_SHOW_FUNC(number_keys_written, NUMBER_KEYS_WRITTEN)
DEF_SHOW_FUNC(number_keys_read, NUMBER_KEYS_READ)
DEF_SHOW_FUNC(number_keys_updated, NUMBER_KEYS_UPDATED)
DEF_SHOW_FUNC(bytes_written, BYTES_WRITTEN)
DEF_SHOW_FUNC(bytes_read, BYTES_READ)
DEF_SHOW_FUNC(number_db_seek, NUMBER_DB_SEEK)
DEF_SHOW_FUNC(number_db_seek_found, NUMBER_DB_SEEK_FOUND)
DEF_SHOW_FUNC(number_db_next, NUMBER_DB_NEXT)
DEF_SHOW_FUNC(number_db_next_found, NUMBER_DB_NEXT_FOUND)
DEF_SHOW_FUNC(number_db_prev, NUMBER_DB_PREV)
DEF_SHOW_FUNC(number_db_prev_found, NUMBER_DB_PREV_FOUND)
DEF_SHOW_FUNC(iter_bytes_read, ITER_BYTES_READ)
DEF_SHOW_FUNC(no_file_closes, NO_FILE_CLOSES)
DEF_SHOW_FUNC(no_file_opens, NO_FILE_OPENS)
DEF_SHOW_FUNC(no_file_errors, NO_FILE_ERRORS)
DEF_SHOW_FUNC(stall_micros, STALL_MICROS)
DEF_SHOW_FUNC(num_iterators, NO_ITERATORS)
DEF_SHOW_FUNC(number_multiget_get, NUMBER_MULTIGET_CALLS)
DEF_SHOW_FUNC(number_multiget_keys_read, NUMBER_MULTIGET_KEYS_READ)
DEF_SHOW_FUNC(number_multiget_bytes_read, NUMBER_MULTIGET_BYTES_READ)
DEF_SHOW_FUNC(number_deletes_filtered, NUMBER_FILTERED_DELETES)
DEF_SHOW_FUNC(number_merge_failures, NUMBER_MERGE_FAILURES)
DEF_SHOW_FUNC(bloom_filter_prefix_checked, BLOOM_FILTER_PREFIX_CHECKED)
DEF_SHOW_FUNC(bloom_filter_prefix_useful, BLOOM_FILTER_PREFIX_USEFUL)
DEF_SHOW_FUNC(number_reseeks_iteration, NUMBER_OF_RESEEKS_IN_ITERATION)
DEF_SHOW_FUNC(getupdatessince_calls, GET_UPDATES_SINCE_CALLS)
DEF_SHOW_FUNC(block_cachecompressed_miss, BLOCK_CACHE_COMPRESSED_MISS)
DEF_SHOW_FUNC(block_cachecompressed_hit, BLOCK_CACHE_COMPRESSED_HIT)
DEF_SHOW_FUNC(wal_synced, WAL_FILE_SYNCED)
DEF_SHOW_FUNC(wal_bytes, WAL_FILE_BYTES)
DEF_SHOW_FUNC(write_self, WRITE_DONE_BY_SELF)
DEF_SHOW_FUNC(write_other, WRITE_DONE_BY_OTHER)
DEF_SHOW_FUNC(write_timedout, WRITE_TIMEDOUT)
DEF_SHOW_FUNC(write_wal, WRITE_WITH_WAL)
DEF_SHOW_FUNC(flush_write_bytes, FLUSH_WRITE_BYTES)
DEF_SHOW_FUNC(compact_read_bytes, COMPACT_READ_BYTES)
DEF_SHOW_FUNC(compact_write_bytes, COMPACT_WRITE_BYTES)
DEF_SHOW_FUNC(number_superversion_acquires, NUMBER_SUPERVERSION_ACQUIRES)
DEF_SHOW_FUNC(number_superversion_releases, NUMBER_SUPERVERSION_RELEASES)
DEF_SHOW_FUNC(number_superversion_cleanups, NUMBER_SUPERVERSION_CLEANUPS)
DEF_SHOW_FUNC(number_block_not_compressed, NUMBER_BLOCK_NOT_COMPRESSED)
12434 | |
// Snapshot the global MyRocks row/query counters into the plain fields of
// export_stats, which myrocks_status_variables[] below exposes through
// SHOW STATUS.
static void myrocks_update_status() {
  export_stats.rows_deleted = global_stats.rows[ROWS_DELETED];
  export_stats.rows_inserted = global_stats.rows[ROWS_INSERTED];
  export_stats.rows_read = global_stats.rows[ROWS_READ];
  export_stats.rows_updated = global_stats.rows[ROWS_UPDATED];
  export_stats.rows_deleted_blind = global_stats.rows[ROWS_DELETED_BLIND];
  export_stats.rows_expired = global_stats.rows[ROWS_EXPIRED];
  export_stats.rows_filtered = global_stats.rows[ROWS_FILTERED];

  // System (internal) tables are counted separately from user tables.
  export_stats.system_rows_deleted = global_stats.system_rows[ROWS_DELETED];
  export_stats.system_rows_inserted = global_stats.system_rows[ROWS_INSERTED];
  export_stats.system_rows_read = global_stats.system_rows[ROWS_READ];
  export_stats.system_rows_updated = global_stats.system_rows[ROWS_UPDATED];

  export_stats.queries_point = global_stats.queries[QUERIES_POINT];
  export_stats.queries_range = global_stats.queries[QUERIES_RANGE];

  export_stats.covered_secondary_key_lookups =
      global_stats.covered_secondary_key_lookups;
}
12455 | |
12456 | static void myrocks_update_memory_status() { |
12457 | std::vector<rocksdb::DB *> dbs; |
12458 | std::unordered_set<const rocksdb::Cache *> cache_set; |
12459 | dbs.push_back(rdb); |
12460 | std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type; |
12461 | rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set, |
12462 | &temp_usage_by_type); |
12463 | memory_stats.memtable_total = |
12464 | temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal]; |
12465 | memory_stats.memtable_unflushed = |
12466 | temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed]; |
12467 | } |
12468 | |
// Status variables published under the "rocksdb_" prefix via the
// show_myrocks_vars() SHOW_FUNC below. Values are refreshed by
// myrocks_update_status() / myrocks_update_memory_status() right before
// display.
static SHOW_VAR myrocks_status_variables[] = {
    DEF_STATUS_VAR_FUNC("rows_deleted" , &export_stats.rows_deleted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_inserted" , &export_stats.rows_inserted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_read" , &export_stats.rows_read, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_updated" , &export_stats.rows_updated,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_deleted_blind" , &export_stats.rows_deleted_blind,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_expired" , &export_stats.rows_expired,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_filtered" , &export_stats.rows_filtered,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_deleted" ,
                        &export_stats.system_rows_deleted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_inserted" ,
                        &export_stats.system_rows_inserted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_read" , &export_stats.system_rows_read,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_updated" ,
                        &export_stats.system_rows_updated, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_total" , &memory_stats.memtable_total,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_unflushed" , &memory_stats.memtable_unflushed,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_point" , &export_stats.queries_point,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_range" , &export_stats.queries_range,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("covered_secondary_key_lookups" ,
                        &export_stats.covered_secondary_key_lookups,
                        SHOW_LONGLONG),

    // End-of-array sentinel required by the SHOW_VAR protocol.
    {NullS, NullS, SHOW_LONG}};
12504 | |
12505 | static void show_myrocks_vars(THD *thd, SHOW_VAR *var, char *buff) { |
12506 | myrocks_update_status(); |
12507 | myrocks_update_memory_status(); |
12508 | var->type = SHOW_ARRAY; |
12509 | var->value = reinterpret_cast<char *>(&myrocks_status_variables); |
12510 | } |
12511 | |
12512 | static ulonglong |
12513 | io_stall_prop_value(const std::map<std::string, std::string> &props, |
12514 | const std::string &key) { |
12515 | std::map<std::string, std::string>::const_iterator iter = |
12516 | props.find("io_stalls." + key); |
12517 | if (iter != props.end()) { |
12518 | return std::stoull(iter->second); |
12519 | } else { |
12520 | DBUG_PRINT("warning" , |
12521 | ("RocksDB GetMapPropery hasn't returned key=%s" , key.c_str())); |
12522 | DBUG_ASSERT(0); |
12523 | return 0; |
12524 | } |
12525 | } |
12526 | |
12527 | static void update_rocksdb_stall_status() { |
12528 | st_io_stall_stats local_io_stall_stats; |
12529 | for (const auto &cf_name : cf_manager.get_cf_names()) { |
12530 | rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name); |
12531 | if (cfh == nullptr) { |
12532 | continue; |
12533 | } |
12534 | |
12535 | std::map<std::string, std::string> props; |
12536 | if (!rdb->GetMapProperty(cfh, "rocksdb.cfstats" , &props)) { |
12537 | continue; |
12538 | } |
12539 | |
12540 | local_io_stall_stats.level0_slowdown += |
12541 | io_stall_prop_value(props, "level0_slowdown" ); |
12542 | local_io_stall_stats.level0_slowdown_with_compaction += |
12543 | io_stall_prop_value(props, "level0_slowdown_with_compaction" ); |
12544 | local_io_stall_stats.level0_numfiles += |
12545 | io_stall_prop_value(props, "level0_numfiles" ); |
12546 | local_io_stall_stats.level0_numfiles_with_compaction += |
12547 | io_stall_prop_value(props, "level0_numfiles_with_compaction" ); |
12548 | local_io_stall_stats.stop_for_pending_compaction_bytes += |
12549 | io_stall_prop_value(props, "stop_for_pending_compaction_bytes" ); |
12550 | local_io_stall_stats.slowdown_for_pending_compaction_bytes += |
12551 | io_stall_prop_value(props, "slowdown_for_pending_compaction_bytes" ); |
12552 | local_io_stall_stats.memtable_compaction += |
12553 | io_stall_prop_value(props, "memtable_compaction" ); |
12554 | local_io_stall_stats.memtable_slowdown += |
12555 | io_stall_prop_value(props, "memtable_slowdown" ); |
12556 | local_io_stall_stats.total_stop += io_stall_prop_value(props, "total_stop" ); |
12557 | local_io_stall_stats.total_slowdown += |
12558 | io_stall_prop_value(props, "total_slowdown" ); |
12559 | } |
12560 | io_stall_stats = local_io_stall_stats; |
12561 | } |
12562 | |
// Write-stall status variables published under the "rocksdb_stall_" prefix
// via show_rocksdb_stall_vars() below; values come from io_stall_stats,
// refreshed by update_rocksdb_stall_status().
static SHOW_VAR rocksdb_stall_status_variables[] = {
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_slowdowns" ,
                        &io_stall_stats.level0_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_slowdowns" ,
                        &io_stall_stats.level0_slowdown_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_stops" ,
                        &io_stall_stats.level0_numfiles, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_stops" ,
                        &io_stall_stats.level0_numfiles_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_stops" ,
                        &io_stall_stats.stop_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_slowdowns" ,
                        &io_stall_stats.slowdown_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_stops" ,
                        &io_stall_stats.memtable_compaction, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_slowdowns" ,
                        &io_stall_stats.memtable_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_stops" , &io_stall_stats.total_stop,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_slowdowns" , &io_stall_stats.total_slowdown,
                        SHOW_LONGLONG),
    // end of the array marker
    {NullS, NullS, SHOW_LONG}};
12590 | |
12591 | static void show_rocksdb_stall_vars(THD *thd, SHOW_VAR *var, char *buff) { |
12592 | update_rocksdb_stall_status(); |
12593 | var->type = SHOW_ARRAY; |
12594 | var->value = reinterpret_cast<char *>(&rocksdb_stall_status_variables); |
12595 | } |
12596 | |
// Top-level status variable table registered with the server. Ticker-style
// entries (DEF_STATUS_VAR) read RocksDB statistics; pointer entries
// (DEF_STATUS_VAR_PTR) read MyRocks-maintained globals; the two SHOW_FUNC
// entries at the end expand into the nested arrays defined above.
static SHOW_VAR rocksdb_status_vars[] = {
    DEF_STATUS_VAR(block_cache_miss),
    DEF_STATUS_VAR(block_cache_hit),
    DEF_STATUS_VAR(block_cache_add),
    DEF_STATUS_VAR(block_cache_add_failures),
    DEF_STATUS_VAR(block_cache_index_miss),
    DEF_STATUS_VAR(block_cache_index_hit),
    DEF_STATUS_VAR(block_cache_index_add),
    DEF_STATUS_VAR(block_cache_index_bytes_insert),
    DEF_STATUS_VAR(block_cache_index_bytes_evict),
    DEF_STATUS_VAR(block_cache_filter_miss),
    DEF_STATUS_VAR(block_cache_filter_hit),
    DEF_STATUS_VAR(block_cache_filter_add),
    DEF_STATUS_VAR(block_cache_filter_bytes_insert),
    DEF_STATUS_VAR(block_cache_filter_bytes_evict),
    DEF_STATUS_VAR(block_cache_bytes_read),
    DEF_STATUS_VAR(block_cache_bytes_write),
    DEF_STATUS_VAR(block_cache_data_bytes_insert),
    DEF_STATUS_VAR(block_cache_data_miss),
    DEF_STATUS_VAR(block_cache_data_hit),
    DEF_STATUS_VAR(block_cache_data_add),
    DEF_STATUS_VAR(bloom_filter_useful),
    DEF_STATUS_VAR(memtable_hit),
    DEF_STATUS_VAR(memtable_miss),
    DEF_STATUS_VAR(get_hit_l0),
    DEF_STATUS_VAR(get_hit_l1),
    DEF_STATUS_VAR(get_hit_l2_and_up),
    DEF_STATUS_VAR(compaction_key_drop_new),
    DEF_STATUS_VAR(compaction_key_drop_obsolete),
    DEF_STATUS_VAR(compaction_key_drop_user),
    DEF_STATUS_VAR(number_keys_written),
    DEF_STATUS_VAR(number_keys_read),
    DEF_STATUS_VAR(number_keys_updated),
    DEF_STATUS_VAR(bytes_written),
    DEF_STATUS_VAR(bytes_read),
    DEF_STATUS_VAR(number_db_seek),
    DEF_STATUS_VAR(number_db_seek_found),
    DEF_STATUS_VAR(number_db_next),
    DEF_STATUS_VAR(number_db_next_found),
    DEF_STATUS_VAR(number_db_prev),
    DEF_STATUS_VAR(number_db_prev_found),
    DEF_STATUS_VAR(iter_bytes_read),
    DEF_STATUS_VAR(no_file_closes),
    DEF_STATUS_VAR(no_file_opens),
    DEF_STATUS_VAR(no_file_errors),
    DEF_STATUS_VAR(stall_micros),
    DEF_STATUS_VAR(num_iterators),
    DEF_STATUS_VAR(number_multiget_get),
    DEF_STATUS_VAR(number_multiget_keys_read),
    DEF_STATUS_VAR(number_multiget_bytes_read),
    DEF_STATUS_VAR(number_deletes_filtered),
    DEF_STATUS_VAR(number_merge_failures),
    DEF_STATUS_VAR(bloom_filter_prefix_checked),
    DEF_STATUS_VAR(bloom_filter_prefix_useful),
    DEF_STATUS_VAR(number_reseeks_iteration),
    DEF_STATUS_VAR(getupdatessince_calls),
    DEF_STATUS_VAR(block_cachecompressed_miss),
    DEF_STATUS_VAR(block_cachecompressed_hit),
    DEF_STATUS_VAR(wal_synced),
    DEF_STATUS_VAR(wal_bytes),
    DEF_STATUS_VAR(write_self),
    DEF_STATUS_VAR(write_other),
    DEF_STATUS_VAR(write_timedout),
    DEF_STATUS_VAR(write_wal),
    DEF_STATUS_VAR(flush_write_bytes),
    DEF_STATUS_VAR(compact_read_bytes),
    DEF_STATUS_VAR(compact_write_bytes),
    DEF_STATUS_VAR(number_superversion_acquires),
    DEF_STATUS_VAR(number_superversion_releases),
    DEF_STATUS_VAR(number_superversion_cleanups),
    DEF_STATUS_VAR(number_block_not_compressed),
    // Counters maintained directly by MyRocks (not RocksDB tickers).
    DEF_STATUS_VAR_PTR("row_lock_deadlocks" , &rocksdb_row_lock_deadlocks,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("row_lock_wait_timeouts" ,
                       &rocksdb_row_lock_wait_timeouts, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("snapshot_conflict_errors" ,
                       &rocksdb_snapshot_conflict_errors, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("wal_group_syncs" , &rocksdb_wal_group_syncs,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_put" , &rocksdb_num_sst_entry_put,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_delete" , &rocksdb_num_sst_entry_delete,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_singledelete" ,
                       &rocksdb_num_sst_entry_singledelete, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_merge" , &rocksdb_num_sst_entry_merge,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_other" , &rocksdb_num_sst_entry_other,
                       SHOW_LONGLONG),
    // the variables generated by SHOW_FUNC are sorted only by prefix (first
    // arg in the tuple below), so make sure it is unique to make sorting
    // deterministic as quick sort is not stable
    {"rocksdb" , reinterpret_cast<char *>(&show_myrocks_vars), SHOW_FUNC},
    {"rocksdb_stall" , reinterpret_cast<char *>(&show_rocksdb_stall_vars),
     SHOW_FUNC},
    {NullS, NullS, SHOW_LONG}};
12693 | |
12694 | /* |
12695 | Background thread's main logic |
12696 | */ |
12697 | |
// Main loop of the MyRocks background thread. Wakes up once per second to
// (a) persist table statistics when requested and (b) flush the RocksDB WAL.
// Exits when another thread signals m_stop via the condition variable.
void Rdb_background_thread::run() {
  // How many seconds to wait till flushing the WAL next time.
  const int WAKE_UP_INTERVAL = 1;

  timespec ts_next_sync;
  set_timespec(ts_next_sync, WAKE_UP_INTERVAL);

  for (;;) {
    // Wait until the next timeout or until we receive a signal to stop the
    // thread. Request to stop the thread should only be triggered when the
    // storage engine is being unloaded.
    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
    const auto ret MY_ATTRIBUTE((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts_next_sync);

    // Check that we receive only the expected error codes.
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    // Copy the flags while still holding the mutex, then reset them so the
    // next request is not lost.
    const bool local_stop = m_stop;
    const bool local_save_stats = m_save_stats;
    reset();
    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);

    if (local_stop) {
      // If we're here then that's because condition variable was signaled by
      // another thread and we're shutting down. Break out the loop to make
      // sure that shutdown thread can proceed.
      break;
    }

    // This path should be taken only when the timer expired.
    DBUG_ASSERT(ret == ETIMEDOUT);

    if (local_save_stats) {
      ddl_manager.persist_stats();
    }

    // Set the next timestamp for mysql_cond_timedwait() (which ends up calling
    // pthread_cond_timedwait()) to wait on.
    set_timespec(ts_next_sync, WAKE_UP_INTERVAL);

    // Flush the WAL. Sync it for both background and never modes to copy
    // InnoDB's behavior. For mode never, the wal file isn't even written,
    // whereas background writes to the wal file, but issues the syncs in a
    // background thread.
    if (rdb && (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_SYNC) &&
        !rocksdb_db_options->allow_mmap_writes) {
      const rocksdb::Status s = rdb->FlushWAL(true);
      if (!s.ok()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
      }
    }
  }

  // save remaining stats which might've left unsaved
  ddl_manager.persist_stats();
}
12754 | |
12755 | bool ha_rocksdb::check_bloom_and_set_bounds(THD *thd, const Rdb_key_def &kd, |
12756 | const rocksdb::Slice &eq_cond, |
12757 | const bool use_all_keys, |
12758 | uchar *lower_bound_buf, |
12759 | uchar *upper_bound_buf, |
12760 | rocksdb::Slice *out_lower_bound, |
12761 | rocksdb::Slice *out_upper_bound) { |
12762 | bool can_use_bloom = can_use_bloom_filter(thd, kd, eq_cond, use_all_keys); |
12763 | if (!can_use_bloom) { |
12764 | setup_iterator_bounds(kd, eq_cond, |
12765 | lower_bound_buf, upper_bound_buf, |
12766 | out_lower_bound, out_upper_bound); |
12767 | } |
12768 | return can_use_bloom; |
12769 | } |
12770 | |
12771 | /** |
12772 | Deciding if it is possible to use bloom filter or not. |
12773 | |
12774 | @detail |
12775 | Even if bloom filter exists, it is not always possible |
12776 | to use bloom filter. If using bloom filter when you shouldn't, |
12777 | false negative may happen -- fewer rows than expected may be returned. |
12778 | It is users' responsibility to use bloom filter correctly. |
12779 | |
12780 | If bloom filter does not exist, return value does not matter because |
12781 | RocksDB does not use bloom filter internally. |
12782 | |
12783 | @param kd |
12784 | @param eq_cond Equal condition part of the key. This always includes |
12785 | system index id (4 bytes). |
12786 | @param use_all_keys True if all key parts are set with equal conditions. |
12787 | This is aware of extended keys. |
12788 | */ |
12789 | bool ha_rocksdb::can_use_bloom_filter(THD *thd, const Rdb_key_def &kd, |
12790 | const rocksdb::Slice &eq_cond, |
12791 | const bool use_all_keys) { |
12792 | bool can_use = false; |
12793 | |
12794 | if (THDVAR(thd, skip_bloom_filter_on_read)) { |
12795 | return can_use; |
12796 | } |
12797 | |
12798 | const rocksdb::SliceTransform * = kd.get_extractor(); |
12799 | if (prefix_extractor) { |
12800 | /* |
12801 | This is an optimized use case for CappedPrefixTransform. |
12802 | If eq_cond length >= prefix extractor length and if |
12803 | all keys are used for equal lookup, it is |
12804 | always possible to use bloom filter. |
12805 | |
12806 | Prefix bloom filter can't be used on descending scan with |
12807 | prefix lookup (i.e. WHERE id1=1 ORDER BY id2 DESC), because of |
12808 | RocksDB's limitation. On ascending (or not sorting) scan, |
12809 | keys longer than the capped prefix length will be truncated down |
12810 | to the capped length and the resulting key is added to the bloom filter. |
12811 | |
12812 | Keys shorter than the capped prefix length will be added to |
12813 | the bloom filter. When keys are looked up, key conditionals |
12814 | longer than the capped length can be used; key conditionals |
12815 | shorter require all parts of the key to be available |
12816 | for the short key match. |
12817 | */ |
12818 | if ((use_all_keys && prefix_extractor->InRange(eq_cond)) |
12819 | || prefix_extractor->SameResultWhenAppended(eq_cond)) |
12820 | can_use = true; |
12821 | else |
12822 | can_use = false; |
12823 | } else { |
12824 | /* |
12825 | if prefix extractor is not defined, all key parts have to be |
12826 | used by eq_cond. |
12827 | */ |
12828 | if (use_all_keys) |
12829 | can_use = true; |
12830 | else |
12831 | can_use = false; |
12832 | } |
12833 | |
12834 | return can_use; |
12835 | } |
12836 | |
/* For modules that need access to the global data structures */
rocksdb::TransactionDB *rdb_get_rocksdb_db() { return rdb; }

Rdb_cf_manager &rdb_get_cf_manager() { return cf_manager; }

const rocksdb::BlockBasedTableOptions &rdb_get_table_options() {
  return *rocksdb_tbl_options;
}

// Accessors for the TTL-related system variables.
bool rdb_is_ttl_enabled() { return rocksdb_enable_ttl; }
bool rdb_is_ttl_read_filtering_enabled() {
  return rocksdb_enable_ttl_read_filtering;
}
// Debug-build-only hooks used by tests to override TTL timestamps/behavior.
#ifndef NDEBUG
int rdb_dbug_set_ttl_rec_ts() { return rocksdb_debug_ttl_rec_ts; }
int rdb_dbug_set_ttl_snapshot_ts() { return rocksdb_debug_ttl_snapshot_ts; }
int rdb_dbug_set_ttl_read_filter_ts() {
  return rocksdb_debug_ttl_read_filter_ts;
}
bool rdb_dbug_set_ttl_ignore_pk() { return rocksdb_debug_ttl_ignore_pk; }
#endif
12858 | |
12859 | void rdb_update_global_stats(const operation_type &type, uint count, |
12860 | bool is_system_table) { |
12861 | DBUG_ASSERT(type < ROWS_MAX); |
12862 | |
12863 | if (count == 0) { |
12864 | return; |
12865 | } |
12866 | |
12867 | if (is_system_table) { |
12868 | global_stats.system_rows[type].add(count); |
12869 | } else { |
12870 | global_stats.rows[type].add(count); |
12871 | } |
12872 | } |
12873 | |
12874 | int rdb_get_table_perf_counters(const char *const tablename, |
12875 | Rdb_perf_counters *const counters) { |
12876 | DBUG_ASSERT(counters != nullptr); |
12877 | DBUG_ASSERT(tablename != nullptr); |
12878 | |
12879 | Rdb_table_handler *table_handler; |
12880 | table_handler = rdb_open_tables.get_table_handler(tablename); |
12881 | if (table_handler == nullptr) { |
12882 | return HA_ERR_ROCKSDB_INVALID_TABLE; |
12883 | } |
12884 | |
12885 | counters->load(table_handler->m_table_perf_context); |
12886 | |
12887 | rdb_open_tables.release_table_handler(table_handler); |
12888 | return HA_EXIT_SUCCESS; |
12889 | } |
12890 | |
12891 | const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type) { |
12892 | // If this assertion fails then this means that a member has been either added |
12893 | // to or removed from RDB_IO_ERROR_TYPE enum and this function needs to be |
12894 | // changed to return the appropriate value. |
12895 | static_assert(RDB_IO_ERROR_LAST == 4, "Please handle all the error types." ); |
12896 | |
12897 | switch (err_type) { |
12898 | case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_TX_COMMIT: |
12899 | return "RDB_IO_ERROR_TX_COMMIT" ; |
12900 | case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_DICT_COMMIT: |
12901 | return "RDB_IO_ERROR_DICT_COMMIT" ; |
12902 | case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_BG_THREAD: |
12903 | return "RDB_IO_ERROR_BG_THREAD" ; |
12904 | case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_GENERAL: |
12905 | return "RDB_IO_ERROR_GENERAL" ; |
12906 | default: |
12907 | DBUG_ASSERT(false); |
12908 | return "(unknown)" ; |
12909 | } |
12910 | } |
12911 | |
12912 | // In case of core dump generation we want this function NOT to be optimized |
12913 | // so that we can capture as much data as possible to debug the root cause |
12914 | // more efficiently. |
12915 | #ifdef __GNUC__ |
12916 | #pragma GCC push_options |
12917 | #pragma GCC optimize("O0") |
12918 | #endif |
12919 | |
// Central error-handling policy for RocksDB status codes. WAL-commit I/O
// errors and any detected corruption abort the server (the data on disk can
// no longer be trusted); background-thread and generic failures are logged
// and tolerated.
void rdb_handle_io_error(const rocksdb::Status status,
                         const RDB_IO_ERROR_TYPE err_type) {
  if (status.IsIOError()) {
    switch (err_type) {
    case RDB_IO_ERROR_TX_COMMIT:
    case RDB_IO_ERROR_DICT_COMMIT: {
      rdb_log_status_error(status, "failed to write to WAL" );
      /* NO_LINT_DEBUG */
      sql_print_error("MyRocks: aborting on WAL write error." );
      // A failed WAL write means the transaction's durability guarantee is
      // broken; crash rather than continue with inconsistent state.
      abort();
      break;
    }
    case RDB_IO_ERROR_BG_THREAD: {
      // Background-thread WAL flushes are best-effort; log and keep running.
      rdb_log_status_error(status, "BG thread failed to write to RocksDB" );
      break;
    }
    case RDB_IO_ERROR_GENERAL: {
      rdb_log_status_error(status, "failed on I/O" );
      /* NO_LINT_DEBUG */
      sql_print_error("MyRocks: aborting on I/O error." );
      abort();
      break;
    }
    default:
      DBUG_ASSERT(0);
      break;
    }
  } else if (status.IsCorruption()) {
    rdb_log_status_error(status, "data corruption detected!" );
    // Persist a marker so the corruption is still known after restart.
    rdb_persist_corruption_marker();
    /* NO_LINT_DEBUG */
    sql_print_error("MyRocks: aborting because of data corruption." );
    abort();
  } else if (!status.ok()) {
    switch (err_type) {
    case RDB_IO_ERROR_DICT_COMMIT: {
      rdb_log_status_error(status, "Failed to write to WAL (dictionary)" );
      /* NO_LINT_DEBUG */
      sql_print_error("MyRocks: aborting on WAL write error." );
      abort();
      break;
    }
    default:
      rdb_log_status_error(status, "Failed to read/write in RocksDB" );
      break;
    }
  }
}
12968 | #ifdef __GNUC__ |
12969 | #pragma GCC pop_options |
12970 | #endif |
12971 | |
// Accessors exposing the global manager singletons to other modules.
Rdb_dict_manager *rdb_get_dict_manager(void) { return &dict_manager; }

Rdb_ddl_manager *rdb_get_ddl_manager(void) { return &ddl_manager; }

Rdb_binlog_manager *rdb_get_binlog_manager(void) { return &binlog_manager; }
12977 | |
// Sysvar update handler shared by the compaction_sequential_deletes* system
// variables: stores the new value, then pushes the combined current set of
// compaction parameters to the properties collector factory.
void rocksdb_set_compaction_options(
    my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr, const void *const save) {
  if (var_ptr && save) {
    *(uint64_t *)var_ptr = *(const uint64_t *)save;
  }
  // Rebuild the full parameter struct from the (now updated) globals.
  const Rdb_compact_params params = {
      (uint64_t)rocksdb_compaction_sequential_deletes,
      (uint64_t)rocksdb_compaction_sequential_deletes_window,
      (uint64_t)rocksdb_compaction_sequential_deletes_file_size};
  if (properties_collector_factory) {
    properties_collector_factory->SetCompactionParams(params);
  }
}
12993 | |
// Sysvar update handler for rocksdb_table_stats_sampling_pct: stores the new
// percentage and forwards it to the properties collector factory (if one was
// created at startup). Serialized by rdb_sysvars_mutex.
void rocksdb_set_table_stats_sampling_pct(
    my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  const uint32_t new_val = *static_cast<const uint32_t *>(save);

  if (new_val != rocksdb_table_stats_sampling_pct) {
    rocksdb_table_stats_sampling_pct = new_val;

    if (properties_collector_factory) {
      properties_collector_factory->SetTableStatsSamplingPct(
          rocksdb_table_stats_sampling_pct);
    }
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
13013 | |
13014 | /* |
13015 | This function allows setting the rate limiter's bytes per second value |
13016 | but only if the rate limiter is turned on which has to be done at startup. |
13017 | If the rate is already 0 (turned off) or we are changing it to 0 (trying |
13018 | to turn it off) this function will push a warning to the client and do |
13019 | nothing. |
13020 | This is similar to the code in innodb_doublewrite_update (found in |
13021 | storage/innobase/handler/ha_innodb.cc). |
13022 | */ |
13023 | void rocksdb_set_rate_limiter_bytes_per_sec( |
13024 | my_core::THD *const thd, |
13025 | my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)), |
13026 | void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) { |
13027 | const uint64_t new_val = *static_cast<const uint64_t *>(save); |
13028 | if (new_val == 0 || rocksdb_rate_limiter_bytes_per_sec == 0) { |
13029 | /* |
13030 | If a rate_limiter was not enabled at startup we can't change it nor |
13031 | can we disable it if one was created at startup |
13032 | */ |
13033 | push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS, |
13034 | "RocksDB: rocksdb_rate_limiter_bytes_per_sec cannot " |
13035 | "be dynamically changed to or from 0. Do a clean " |
13036 | "shutdown if you want to change it from or to 0." ); |
13037 | } else if (new_val != rocksdb_rate_limiter_bytes_per_sec) { |
13038 | /* Apply the new value to the rate limiter and store it locally */ |
13039 | DBUG_ASSERT(rocksdb_rate_limiter != nullptr); |
13040 | rocksdb_rate_limiter_bytes_per_sec = new_val; |
13041 | rocksdb_rate_limiter->SetBytesPerSecond(new_val); |
13042 | } |
13043 | } |
13044 | |
// Sysvar update handler for rocksdb_sst_mgr_rate_bytes_per_sec: stores the
// new value and applies it as the SST file manager's delete rate. Serialized
// by rdb_sysvars_mutex.
void rocksdb_set_sst_mgr_rate_bytes_per_sec(
    my_core::THD *const thd,
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  const uint64_t new_val = *static_cast<const uint64_t *>(save);

  if (new_val != rocksdb_sst_mgr_rate_bytes_per_sec) {
    rocksdb_sst_mgr_rate_bytes_per_sec = new_val;

    rocksdb_db_options->sst_file_manager->SetDeleteRateBytesPerSecond(
        rocksdb_sst_mgr_rate_bytes_per_sec);
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
13062 | |
// Sysvar update handler for rocksdb_delayed_write_rate: updates the cached
// global and pushes the new value into the running DB via SetDBOptions().
// A SetDBOptions() failure is logged but the cached value is kept.
void rocksdb_set_delayed_write_rate(THD *thd, struct st_mysql_sys_var *var,
                                    void *var_ptr, const void *save) {
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  const uint64_t new_val = *static_cast<const uint64_t *>(save);
  if (rocksdb_delayed_write_rate != new_val) {
    rocksdb_delayed_write_rate = new_val;
    rocksdb::Status s =
        rdb->SetDBOptions({{"delayed_write_rate" , std::to_string(new_val)}});

    if (!s.ok()) {
      /* NO_LINT_DEBUG */
      sql_print_warning("MyRocks: failed to update delayed_write_rate. "
                        "status code = %d, status = %s" ,
                        s.code(), s.ToString().c_str());
    }
  }
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
13081 | |
// Sysvar update handler for rocksdb_max_latest_deadlocks: resizes the
// TransactionDB's in-memory deadlock-info buffer.
void rocksdb_set_max_latest_deadlocks(THD *thd, struct st_mysql_sys_var *var,
                                      void *var_ptr, const void *save) {
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  const uint32_t new_val = *static_cast<const uint32_t *>(save);
  if (rocksdb_max_latest_deadlocks != new_val) {
    rocksdb_max_latest_deadlocks = new_val;
    rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
  }
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
13092 | |
// Parse `exception_list` into the global collation-exception pattern set;
// warn (but proceed) when some patterns fail to compile.
void rdb_set_collation_exception_list(const char *const exception_list) {
  DBUG_ASSERT(rdb_collation_exceptions != nullptr);

  if (!rdb_collation_exceptions->set_patterns(exception_list)) {
    my_core::warn_about_bad_patterns(rdb_collation_exceptions,
                                     "strict_collation_exceptions" );
  }
}
13101 | |
// Sysvar update handler for rocksdb_strict_collation_exceptions: recompiles
// the pattern list, then stores a private heap copy of the string in the
// sysvar storage.
void rocksdb_set_collation_exception_list(THD *const thd,
                                          struct st_mysql_sys_var *const var,
                                          void *const var_ptr,
                                          const void *const save) {
  const char *const val = *static_cast<const char *const *>(save);

  rdb_set_collation_exception_list(val == nullptr ? "" : val);

  // Free the previously stored string and replace it with our own heap copy,
  // so the plugin owns the memory backing the variable rather than depending
  // on the lifetime of `save`. NOTE(review): this matches the usual
  // PLUGIN_VAR_MEMALLOC string-sysvar pattern — confirm against the sysvar
  // declaration.
  const char *val_copy= val? my_strdup(val, MYF(0)): nullptr;
  my_free(*static_cast<char**>(var_ptr));
  *static_cast<const char**>(var_ptr) = val_copy;
}
13115 | |
13116 | int mysql_value_to_bool(struct st_mysql_value *value, my_bool *return_value) { |
13117 | int new_value_type = value->value_type(value); |
13118 | if (new_value_type == MYSQL_VALUE_TYPE_STRING) { |
13119 | char buf[16]; |
13120 | int len = sizeof(buf); |
13121 | const char *str = value->val_str(value, buf, &len); |
13122 | if (str && (my_strcasecmp(system_charset_info, "true" , str) == 0 || |
13123 | my_strcasecmp(system_charset_info, "on" , str) == 0)) { |
13124 | *return_value = TRUE; |
13125 | } else if (str && (my_strcasecmp(system_charset_info, "false" , str) == 0 || |
13126 | my_strcasecmp(system_charset_info, "off" , str) == 0)) { |
13127 | *return_value = FALSE; |
13128 | } else { |
13129 | return 1; |
13130 | } |
13131 | } else if (new_value_type == MYSQL_VALUE_TYPE_INT) { |
13132 | long long intbuf; |
13133 | value->val_int(value, &intbuf); |
13134 | if (intbuf > 1) |
13135 | return 1; |
13136 | *return_value = intbuf > 0 ? TRUE : FALSE; |
13137 | } else { |
13138 | return 1; |
13139 | } |
13140 | |
13141 | return 0; |
13142 | } |
13143 | |
// Sysvar check handler for rocksdb_bulk_load. Before the variable can be
// toggled, any in-flight bulk load on this connection must be finalized;
// a finalization failure rejects the SET and forces bulk_load off.
// Returns 0 to accept the new value, 1 to reject it.
int rocksdb_check_bulk_load(
    THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
    void *save, struct st_mysql_value *value) {
  my_bool new_value;
  if (mysql_value_to_bool(value, &new_value) != 0) {
    return 1;
  }

  Rdb_transaction *&tx = get_tx_from_thd(thd);
  if (tx != nullptr) {
    // Flush/close the SST files of the current bulk load, if any.
    const int rc = tx->finish_bulk_load();
    if (rc != 0) {
      // NO_LINT_DEBUG
      sql_print_error("RocksDB: Error %d finalizing last SST file while "
                      "setting bulk loading variable" ,
                      rc);
      THDVAR(thd, bulk_load) = 0;
      return 1;
    }
  }

  *static_cast<bool *>(save) = new_value;
  return 0;
}
13168 | |
// Sysvar check handler for rocksdb_bulk_load_allow_unsorted: the setting may
// only change while bulk load is NOT active on this connection.
// Returns 0 to accept the new value, 1 to reject it.
int rocksdb_check_bulk_load_allow_unsorted(
    THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
    void *save, struct st_mysql_value *value) {
  my_bool new_value;
  if (mysql_value_to_bool(value, &new_value) != 0) {
    return 1;
  }

  if (THDVAR(thd, bulk_load)) {
    my_error(ER_ERROR_WHEN_EXECUTING_COMMAND, MYF(0), "SET" ,
             "Cannot change this setting while bulk load is enabled" );

    return 1;
  }

  *static_cast<bool *>(save) = new_value;
  return 0;
}
13187 | |
// Sysvar update handler for rocksdb_max_background_jobs: updates the cached
// DB options and applies the new value to the running DB via SetDBOptions().
// A SetDBOptions() failure is logged but the cached value is kept.
static void rocksdb_set_max_background_jobs(THD *thd,
                                            struct st_mysql_sys_var *const var,
                                            void *const var_ptr,
                                            const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rocksdb_db_options != nullptr);
  DBUG_ASSERT(rocksdb_db_options->env != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  const int new_val = *static_cast<const int *>(save);

  if (rocksdb_db_options->max_background_jobs != new_val) {
    rocksdb_db_options->max_background_jobs = new_val;
    rocksdb::Status s =
        rdb->SetDBOptions({{"max_background_jobs" , std::to_string(new_val)}});

    if (!s.ok()) {
      /* NO_LINT_DEBUG */
      sql_print_warning("MyRocks: failed to update max_background_jobs. "
                        "Status code = %d, status = %s." ,
                        s.code(), s.ToString().c_str());
    }
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
13215 | |
13216 | static void rocksdb_set_bytes_per_sync( |
13217 | THD *thd MY_ATTRIBUTE((__unused__)), |
13218 | struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)), |
13219 | void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) { |
13220 | DBUG_ASSERT(save != nullptr); |
13221 | DBUG_ASSERT(rocksdb_db_options != nullptr); |
13222 | DBUG_ASSERT(rocksdb_db_options->env != nullptr); |
13223 | |
13224 | RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex); |
13225 | |
13226 | const ulonglong new_val = *static_cast<const ulonglong *>(save); |
13227 | |
13228 | if (rocksdb_db_options->bytes_per_sync != new_val) { |
13229 | rocksdb_db_options->bytes_per_sync = new_val; |
13230 | rocksdb::Status s = |
13231 | rdb->SetDBOptions({{"bytes_per_sync" , std::to_string(new_val)}}); |
13232 | |
13233 | if (!s.ok()) { |
13234 | /* NO_LINT_DEBUG */ |
13235 | sql_print_warning("MyRocks: failed to update max_background_jobs. " |
13236 | "Status code = %d, status = %s." , |
13237 | s.code(), s.ToString().c_str()); |
13238 | } |
13239 | } |
13240 | |
13241 | RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); |
13242 | } |
13243 | |
13244 | static void rocksdb_set_wal_bytes_per_sync( |
13245 | THD *thd MY_ATTRIBUTE((__unused__)), |
13246 | struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)), |
13247 | void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) { |
13248 | DBUG_ASSERT(save != nullptr); |
13249 | DBUG_ASSERT(rocksdb_db_options != nullptr); |
13250 | DBUG_ASSERT(rocksdb_db_options->env != nullptr); |
13251 | |
13252 | RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex); |
13253 | |
13254 | const ulonglong new_val = *static_cast<const ulonglong *>(save); |
13255 | |
13256 | if (rocksdb_db_options->wal_bytes_per_sync != new_val) { |
13257 | rocksdb_db_options->wal_bytes_per_sync = new_val; |
13258 | rocksdb::Status s = |
13259 | rdb->SetDBOptions({{"wal_bytes_per_sync" , std::to_string(new_val)}}); |
13260 | |
13261 | if (!s.ok()) { |
13262 | /* NO_LINT_DEBUG */ |
13263 | sql_print_warning("MyRocks: failed to update max_background_jobs. " |
13264 | "Status code = %d, status = %s." , |
13265 | s.code(), s.ToString().c_str()); |
13266 | } |
13267 | } |
13268 | |
13269 | RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); |
13270 | } |
13271 | |
13272 | static int |
13273 | rocksdb_validate_update_cf_options(THD * /* unused */, |
13274 | struct st_mysql_sys_var * /*unused*/, |
13275 | void *save, struct st_mysql_value *value) { |
13276 | |
13277 | char buff[STRING_BUFFER_USUAL_SIZE]; |
13278 | const char *str; |
13279 | int length; |
13280 | length = sizeof(buff); |
13281 | str = value->val_str(value, buff, &length); |
13282 | *(const char **)save = str; |
13283 | |
13284 | if (str == nullptr) { |
13285 | return HA_EXIT_SUCCESS; |
13286 | } |
13287 | |
13288 | Rdb_cf_options::Name_to_config_t option_map; |
13289 | |
13290 | // Basic sanity checking and parsing the options into a map. If this fails |
13291 | // then there's no point to proceed. |
13292 | if (!Rdb_cf_options::parse_cf_options(str, &option_map)) { |
13293 | my_error(ER_WRONG_VALUE_FOR_VAR, MYF(0), "rocksdb_update_cf_options" , str); |
13294 | return HA_EXIT_FAILURE; |
13295 | } |
13296 | return HA_EXIT_SUCCESS; |
13297 | } |
13298 | |
13299 | static void |
13300 | rocksdb_set_update_cf_options(THD *const /* unused */, |
13301 | struct st_mysql_sys_var *const /* unused */, |
13302 | void *const var_ptr, const void *const save) { |
13303 | const char *const val = *static_cast<const char *const *>(save); |
13304 | |
13305 | RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex); |
13306 | |
13307 | if (!val) { |
13308 | *reinterpret_cast<char **>(var_ptr) = nullptr; |
13309 | RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); |
13310 | return; |
13311 | } |
13312 | |
13313 | DBUG_ASSERT(val != nullptr); |
13314 | |
13315 | // Reset the pointers regardless of how much success we had with updating |
13316 | // the CF options. This will results in consistent behavior and avoids |
13317 | // dealing with cases when only a subset of CF-s was successfully updated. |
13318 | *reinterpret_cast<char **>(var_ptr) = my_strdup(val, MYF(0)); |
13319 | |
13320 | // Do the real work of applying the changes. |
13321 | Rdb_cf_options::Name_to_config_t option_map; |
13322 | |
13323 | // This should never fail, because of rocksdb_validate_update_cf_options |
13324 | if (!Rdb_cf_options::parse_cf_options(val, &option_map)) { |
13325 | my_free(*reinterpret_cast<char**>(var_ptr)); |
13326 | RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); |
13327 | return; |
13328 | } |
13329 | |
13330 | // For each CF we have, see if we need to update any settings. |
13331 | for (const auto &cf_name : cf_manager.get_cf_names()) { |
13332 | DBUG_ASSERT(!cf_name.empty()); |
13333 | |
13334 | rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name); |
13335 | DBUG_ASSERT(cfh != nullptr); |
13336 | |
13337 | const auto it = option_map.find(cf_name); |
13338 | std::string per_cf_options = (it != option_map.end()) ? it->second : "" ; |
13339 | |
13340 | if (!per_cf_options.empty()) { |
13341 | Rdb_cf_options::Name_to_config_t opt_map; |
13342 | rocksdb::Status s = rocksdb::StringToMap(per_cf_options, &opt_map); |
13343 | |
13344 | if (s != rocksdb::Status::OK()) { |
13345 | // NO_LINT_DEBUG |
13346 | sql_print_warning("MyRocks: failed to convert the options for column " |
13347 | "family '%s' to a map. %s" , cf_name.c_str(), |
13348 | s.ToString().c_str()); |
13349 | } else { |
13350 | DBUG_ASSERT(rdb != nullptr); |
13351 | |
13352 | // Finally we can apply the options. |
13353 | s = rdb->SetOptions(cfh, opt_map); |
13354 | |
13355 | if (s != rocksdb::Status::OK()) { |
13356 | // NO_LINT_DEBUG |
13357 | sql_print_warning("MyRocks: failed to apply the options for column " |
13358 | "family '%s'. %s" , cf_name.c_str(), |
13359 | s.ToString().c_str()); |
13360 | } else { |
13361 | // NO_LINT_DEBUG |
13362 | sql_print_information("MyRocks: options for column family '%s' " |
13363 | "have been successfully updated." , |
13364 | cf_name.c_str()); |
13365 | |
13366 | // Make sure that data is internally consistent as well and update |
13367 | // the CF options. This is necessary also to make sure that the CF |
13368 | // options will be correctly reflected in the relevant table: |
13369 | // ROCKSDB_CF_OPTIONS in INFORMATION_SCHEMA. |
13370 | rocksdb::ColumnFamilyOptions cf_options = rdb->GetOptions(cfh); |
13371 | std::string updated_options; |
13372 | |
13373 | s = rocksdb::GetStringFromColumnFamilyOptions(&updated_options, |
13374 | cf_options); |
13375 | |
13376 | DBUG_ASSERT(s == rocksdb::Status::OK()); |
13377 | DBUG_ASSERT(!updated_options.empty()); |
13378 | |
13379 | cf_manager.update_options_map(cf_name, updated_options); |
13380 | } |
13381 | } |
13382 | } |
13383 | } |
13384 | |
13385 | // Our caller (`plugin_var_memalloc_global_update`) will call `my_free` to |
13386 | // free up resources used before. |
13387 | |
13388 | RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); |
13389 | } |
13390 | |
13391 | void rdb_queue_save_stats_request() { rdb_bg_thread.request_save_stats(); } |
13392 | |
13393 | #ifdef MARIAROCKS_NOT_YET // MDEV-10976 |
13394 | |
/*
  Replication hook: mark that this handler is inside a Delete_rows event so
  later code paths (e.g. read-free replication checks) can detect it.
*/
void ha_rocksdb::rpl_before_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = true;

  DBUG_VOID_RETURN;
}
13402 | |
/*
  Replication hook: clear the Delete_rows-event flag set by
  rpl_before_delete_rows().
*/
void ha_rocksdb::rpl_after_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = false;

  DBUG_VOID_RETURN;
}
13410 | |
/*
  Replication hook: mark that this handler is inside an Update_rows event so
  later code paths (e.g. read-free replication checks) can detect it.
*/
void ha_rocksdb::rpl_before_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = true;

  DBUG_VOID_RETURN;
}
13418 | |
/*
  Replication hook: clear the Update_rows-event flag set by
  rpl_before_update_rows().
*/
void ha_rocksdb::rpl_after_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = false;

  DBUG_VOID_RETURN;
}
13426 | |
13427 | /** |
13428 | @brief |
13429 | Read Free Replication can be used or not. Returning False means |
13430 | Read Free Replication can be used. Read Free Replication can be used |
13431 | on UPDATE or DELETE row events, and table must have user defined |
13432 | primary key. |
13433 | */ |
13434 | bool ha_rocksdb::use_read_free_rpl() { |
13435 | DBUG_ENTER_FUNC(); |
13436 | |
13437 | DBUG_RETURN((m_in_rpl_delete_rows || m_in_rpl_update_rows) && |
13438 | !has_hidden_pk(table) && m_use_read_free_rpl); |
13439 | } |
13440 | #endif // MARIAROCKS_NOT_YET |
13441 | |
13442 | double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) { |
13443 | DBUG_ENTER_FUNC(); |
13444 | |
13445 | if (index != table->s->primary_key) { |
13446 | /* Non covering index range scan */ |
13447 | DBUG_RETURN(handler::read_time(index, ranges, rows)); |
13448 | } |
13449 | |
13450 | DBUG_RETURN((rows / 20.0) + 1); |
13451 | } |
13452 | |
13453 | std::string rdb_corruption_marker_file_name() { |
13454 | std::string ret(rocksdb_datadir); |
13455 | ret.append("/ROCKSDB_CORRUPTED" ); |
13456 | return ret; |
13457 | } |
13458 | |
13459 | void sql_print_verbose_info(const char *format, ...) |
13460 | { |
13461 | va_list args; |
13462 | |
13463 | if (global_system_variables.log_warnings > 2) { |
13464 | va_start(args, format); |
13465 | sql_print_information_v(format, args); |
13466 | va_end(args); |
13467 | } |
13468 | } |
13469 | |
13470 | } // namespace myrocks |
13471 | |
13472 | |
13473 | /** |
13474 | Construct and emit duplicate key error message using information |
13475 | from table's record buffer. |
13476 | |
13477 | @sa print_keydup_error(table, key, msg, errflag, thd, org_table_name). |
13478 | */ |
13479 | |
/*
  Thin overload that forwards to the message-taking print_keydup_error()
  with the standard ER_DUP_ENTRY_WITH_KEY_NAME format.
  The 'thd' and 'org_table_name' parameters are accepted for signature
  compatibility but are not used by this overload.
*/
void print_keydup_error(TABLE *table, KEY *key, myf errflag,
                        const THD *thd, const char *org_table_name)
{
  print_keydup_error(table, key, ER(ER_DUP_ENTRY_WITH_KEY_NAME), errflag);
}
13485 | |
13486 | /* |
13487 | Register the storage engine plugin outside of myrocks namespace |
13488 | so that mysql_declare_plugin does not get confused when it does |
13489 | its name generation. |
13490 | */ |
13491 | |
13492 | |
/* Storage engine descriptor handed to the plugin registration below. */
struct st_mysql_storage_engine rocksdb_storage_engine = {
    MYSQL_HANDLERTON_INTERFACE_VERSION};
13495 | |
/* Plugin registration: the storage engine itself, followed by the
   INFORMATION_SCHEMA plugins that expose MyRocks internals. */
maria_declare_plugin(rocksdb_se){
    MYSQL_STORAGE_ENGINE_PLUGIN,       /* Plugin Type */
    &rocksdb_storage_engine,           /* Plugin Descriptor */
    "ROCKSDB" ,                        /* Plugin Name */
    "Monty Program Ab" ,               /* Plugin Author */
    "RocksDB storage engine" ,         /* Plugin Description */
    PLUGIN_LICENSE_GPL,                /* Plugin Licence */
    myrocks::rocksdb_init_func,        /* Plugin Entry Point */
    myrocks::rocksdb_done_func,        /* Plugin Deinitializer */
    0x0001,                            /* version number (0.1) */
    myrocks::rocksdb_status_vars,      /* status variables */
    myrocks::rocksdb_system_variables, /* system variables */
    "1.0" ,                            /* string version */
    myrocks::MYROCKS_MARIADB_PLUGIN_MATURITY_LEVEL
},
    /* INFORMATION_SCHEMA table plugins (ROCKSDB_CFSTATS, ROCKSDB_DBSTATS,
       perf context, CF options, DDL map, lock/transaction info, etc.). */
    myrocks::rdb_i_s_cfstats, myrocks::rdb_i_s_dbstats,
    myrocks::rdb_i_s_perf_context, myrocks::rdb_i_s_perf_context_global,
    myrocks::rdb_i_s_cfoptions, myrocks::rdb_i_s_compact_stats,
    myrocks::rdb_i_s_global_info, myrocks::rdb_i_s_ddl,
    myrocks::rdb_i_s_index_file_map, myrocks::rdb_i_s_lock_info,
    myrocks::rdb_i_s_trx_info,
    myrocks::rdb_i_s_deadlock_info
maria_declare_plugin_end;
13519 | |