1 | /* |
2 | Copyright (c) 2012, Monty Program Ab |
3 | |
4 | This program is free software; you can redistribute it and/or modify |
5 | it under the terms of the GNU General Public License as published by |
6 | the Free Software Foundation; version 2 of the License. |
7 | |
8 | This program is distributed in the hope that it will be useful, |
9 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
10 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
11 | GNU General Public License for more details. |
12 | |
13 | You should have received a copy of the GNU General Public License |
14 | along with this program; if not, write to the Free Software |
15 | Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ |
16 | |
17 | #ifdef USE_PRAGMA_IMPLEMENTATION |
18 | #pragma implementation // gcc: Class implementation |
19 | #endif |
20 | |
21 | #define MYSQL_SERVER 1 |
22 | |
23 | /* For use of 'PRIu64': */ |
24 | #define __STDC_FORMAT_MACROS |
25 | |
26 | #include <my_global.h> |
27 | |
28 | #include <inttypes.h> |
29 | |
30 | /* The C++ file's header */ |
31 | #include "./ha_rocksdb.h" |
32 | |
33 | /* C++ standard header files */ |
34 | #include <algorithm> |
35 | #include <limits> |
36 | #include <map> |
37 | #include <queue> |
38 | #include <set> |
39 | #include <string> |
40 | #include <vector> |
41 | |
42 | /* MySQL includes */ |
43 | #include "./debug_sync.h" |
44 | #include "./my_bit.h" |
45 | #include "./my_stacktrace.h" |
46 | #include "./my_sys.h" |
47 | #include "./sql_audit.h" |
48 | #include "./sql_table.h" |
49 | #include "./sql_hset.h" |
50 | #include <mysql/psi/mysql_table.h> |
51 | #ifdef MARIAROCKS_NOT_YET |
52 | #include <mysql/thread_pool_priv.h> |
53 | #endif |
54 | #include <mysys_err.h> |
55 | |
// Both MySQL and RocksDB define the same constant. To avoid compilation
// errors until we fix this in RocksDB, we temporarily undefine it here.
58 | #undef CACHE_LINE_SIZE |
59 | |
60 | /* RocksDB includes */ |
61 | #include "monitoring/histogram.h" |
62 | #include "rocksdb/compaction_filter.h" |
63 | #include "rocksdb/env.h" |
64 | #include "rocksdb/persistent_cache.h" |
65 | #include "rocksdb/rate_limiter.h" |
66 | #include "rocksdb/slice_transform.h" |
67 | #include "rocksdb/thread_status.h" |
68 | #include "rocksdb/utilities/checkpoint.h" |
69 | #include "rocksdb/utilities/convenience.h" |
70 | #include "rocksdb/utilities/memory_util.h" |
71 | #include "rocksdb/utilities/sim_cache.h" |
72 | #include "util/stop_watch.h" |
73 | #include "./rdb_source_revision.h" |
74 | |
75 | /* MyRocks includes */ |
76 | #include "./event_listener.h" |
77 | #include "./ha_rocksdb_proto.h" |
78 | #include "./logger.h" |
79 | #include "./rdb_cf_manager.h" |
80 | #include "./rdb_cf_options.h" |
81 | #include "./rdb_datadic.h" |
82 | #include "./rdb_i_s.h" |
83 | #include "./rdb_index_merge.h" |
84 | #include "./rdb_mutex_wrapper.h" |
85 | #include "./rdb_psi.h" |
86 | #include "./rdb_threads.h" |
87 | #include "./rdb_mariadb_server_port.h" |
88 | |
// Internal MySQL APIs not exposed in any header; these symbols are resolved
// against the server binary at link time.
extern "C" {
/**
  Mark transaction to rollback and mark error as fatal to a sub-statement.
  @param thd Thread handle
  @param all TRUE <=> rollback main transaction.
*/
void thd_mark_transaction_to_rollback(MYSQL_THD thd, bool all);

/**
 * Get the user thread's binary logging format
 * @param thd user thread
 * @return Value to be used as index into the binlog_format_names array
 */
int thd_binlog_format(const MYSQL_THD thd);

/**
 * Check if binary logging is filtered for thread's current db.
 * @param thd Thread handle
 * @retval 1 the query is not filtered, 0 otherwise.
 */
bool thd_binlog_filter_ok(const MYSQL_THD thd);
}
112 | |
113 | MYSQL_PLUGIN_IMPORT bool my_disable_leak_check; |
114 | |
115 | // Needed in rocksdb_init_func |
116 | void ignore_db_dirs_append(const char *dirname_arg); |
117 | |
118 | |
119 | namespace myrocks { |
120 | |
121 | static st_global_stats global_stats; |
122 | static st_export_stats export_stats; |
123 | static st_memory_stats memory_stats; |
124 | static st_io_stall_stats io_stall_stats; |
125 | |
126 | const std::string DEFAULT_CF_NAME("default" ); |
127 | const std::string DEFAULT_SYSTEM_CF_NAME("__system__" ); |
128 | const std::string PER_INDEX_CF_NAME("$per_index_cf" ); |
129 | |
130 | /** |
131 | Updates row counters based on the table type and operation type. |
132 | */ |
133 | void ha_rocksdb::update_row_stats(const operation_type &type) { |
134 | DBUG_ASSERT(type < ROWS_MAX); |
135 | // Find if we are modifying system databases. |
136 | if (table->s && m_tbl_def->m_is_mysql_system_table) |
137 | global_stats.system_rows[type].inc(); |
138 | else |
139 | global_stats.rows[type].inc(); |
140 | } |
141 | |
142 | void dbug_dump_database(rocksdb::DB *db); |
143 | static handler *rocksdb_create_handler(my_core::handlerton *hton, |
144 | my_core::TABLE_SHARE *table_arg, |
145 | my_core::MEM_ROOT *mem_root); |
146 | |
147 | static rocksdb::CompactRangeOptions getCompactRangeOptions() { |
148 | rocksdb::CompactRangeOptions compact_range_options; |
149 | compact_range_options.bottommost_level_compaction = |
150 | rocksdb::BottommostLevelCompaction::kForce; |
151 | compact_range_options.exclusive_manual_compaction = false; |
152 | return compact_range_options; |
153 | } |
154 | |
155 | /////////////////////////////////////////////////////////// |
156 | // Parameters and settings |
157 | /////////////////////////////////////////////////////////// |
158 | static char *rocksdb_default_cf_options = nullptr; |
159 | static char *rocksdb_override_cf_options = nullptr; |
160 | static char *rocksdb_update_cf_options = nullptr; |
161 | |
162 | /////////////////////////////////////////////////////////// |
163 | // Globals |
164 | /////////////////////////////////////////////////////////// |
165 | handlerton *rocksdb_hton; |
166 | |
167 | rocksdb::TransactionDB *rdb = nullptr; |
168 | rocksdb::HistogramImpl *commit_latency_stats = nullptr; |
169 | |
170 | static std::shared_ptr<rocksdb::Statistics> rocksdb_stats; |
171 | static std::unique_ptr<rocksdb::Env> flashcache_aware_env; |
172 | static std::shared_ptr<Rdb_tbl_prop_coll_factory> properties_collector_factory; |
173 | |
174 | Rdb_dict_manager dict_manager; |
175 | Rdb_cf_manager cf_manager; |
176 | Rdb_ddl_manager ddl_manager; |
177 | Rdb_binlog_manager binlog_manager; |
178 | |
179 | #if !defined(_WIN32) && !defined(__APPLE__) |
180 | Rdb_io_watchdog *io_watchdog = nullptr; |
181 | #endif |
/**
  MyRocks background thread control.
  N.B. This is in addition to RocksDB's own background threads
  (@see rocksdb::CancelAllBackgroundWork()).
*/
187 | |
188 | static Rdb_background_thread rdb_bg_thread; |
189 | |
190 | // List of table names (using regex) that are exceptions to the strict |
191 | // collation check requirement. |
192 | Regex_list_handler *rdb_collation_exceptions; |
193 | |
194 | static const char **rdb_get_error_messages(int nr); |
195 | |
196 | static void rocksdb_flush_all_memtables() { |
197 | const Rdb_cf_manager &cf_manager = rdb_get_cf_manager(); |
198 | for (const auto &cf_handle : cf_manager.get_all_cf()) { |
199 | rdb->Flush(rocksdb::FlushOptions(), cf_handle); |
200 | } |
201 | } |
202 | |
/*
  No-op "update" callback for the compact-column-family sysvar: the actual
  compaction is triggered from the validate callback
  (rocksdb_compact_column_family), so there is nothing left to apply here.
*/
static void rocksdb_compact_column_family_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
206 | |
/*
  "Validate" callback for the compact-column-family sysvar: assigning a column
  family name triggers a manual compaction of that CF's entire key range.
  Always returns HA_EXIT_SUCCESS; an unknown CF name or an uninitialized DB is
  silently ignored.
*/
static int rocksdb_compact_column_family(THD *const thd,
                                         struct st_mysql_sys_var *const var,
                                         void *const var_ptr,
                                         struct st_mysql_value *const value) {
  char buff[STRING_BUFFER_USUAL_SIZE];
  int len = sizeof(buff);

  DBUG_ASSERT(value != nullptr);

  if (const char *const cf = value->val_str(value, buff, &len)) {
    auto cfh = cf_manager.get_cf(cf);
    if (cfh != nullptr && rdb != nullptr) {
      sql_print_verbose_info("RocksDB: Manual compaction of column family: %s\n",
                             cf);
      // nullptr begin/end keys == compact the whole key range of this CF.
      rdb->CompactRange(getCompactRangeOptions(), cfh, nullptr, nullptr);
    }
  }
  return HA_EXIT_SUCCESS;
}
226 | |
227 | /////////////////////////////////////////////////////////// |
228 | // Hash map: table name => open table handler |
229 | /////////////////////////////////////////////////////////// |
230 | |
namespace // anonymous namespace = not visible outside this source file
{

const ulong TABLE_HASH_SIZE = 32;
typedef Hash_set<Rdb_table_handler> Rdb_table_set;

/*
  Registry of open tables: table name => shared Rdb_table_handler.
  m_mutex guards all access to m_hash.
*/
struct Rdb_open_tables_map {
  /* Hash table used to track the handlers of open tables */
  Rdb_table_set m_hash;
  /* The mutex used to protect the hash table */
  mutable mysql_mutex_t m_mutex;

  /* Hash callback: extracts the key and its length from a table handler. */
  static uchar *get_hash_key(const Rdb_table_handler *const table_handler,
                             size_t *const length,
                             my_bool not_used MY_ATTRIBUTE((__unused__)));

  /* Returns the handler registered under table_name (defined elsewhere). */
  Rdb_table_handler *get_table_handler(const char *const table_name);
  /* Releases a reference obtained via get_table_handler. */
  void release_table_handler(Rdb_table_handler *const table_handler);

  Rdb_open_tables_map() : m_hash(get_hash_key, system_charset_info) { }

  /* Snapshot of the names of all currently tracked tables. */
  std::vector<std::string> get_table_names(void) const;
};

} // anonymous namespace
256 | |
257 | static Rdb_open_tables_map rdb_open_tables; |
258 | |
/*
  Strip all trailing '/' characters from a directory path, e.g.
  "/data/ckpt///" -> "/data/ckpt". An empty or all-slash input yields "".
*/
static std::string rdb_normalize_dir(std::string dir) {
  const std::string::size_type last_keep = dir.find_last_not_of('/');
  if (last_keep == std::string::npos) {
    dir.clear();
  } else {
    dir.erase(last_keep + 1);
  }
  return dir;
}
265 | |
/*
  "Validate" callback for the rocksdb_create_checkpoint sysvar: assigning a
  directory path creates a RocksDB checkpoint in that directory.

  @return HA_EXIT_SUCCESS when the checkpoint was created,
          HA_EXIT_FAILURE otherwise (NULL value, DB not open, or a RocksDB
          error — which is reported to the client via rdb_error_to_mysql).
*/
static int rocksdb_create_checkpoint(
    THD *const thd MY_ATTRIBUTE((__unused__)),
    struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const save MY_ATTRIBUTE((__unused__)),
    struct st_mysql_value *const value) {
  char buf[FN_REFLEN];
  int len = sizeof(buf);
  const char *const checkpoint_dir_raw = value->val_str(value, buf, &len);
  if (checkpoint_dir_raw) {
    if (rdb != nullptr) {
      // Strip trailing '/' so RocksDB gets a canonical directory name.
      std::string checkpoint_dir = rdb_normalize_dir(checkpoint_dir_raw);
      // NO_LINT_DEBUG
      sql_print_information("RocksDB: creating checkpoint in directory : %s\n",
                            checkpoint_dir.c_str());
      rocksdb::Checkpoint *checkpoint;
      auto status = rocksdb::Checkpoint::Create(rdb, &checkpoint);
      // We can only return HA_EXIT_FAILURE/HA_EXIT_SUCCESS here which is why
      // the return code is ignored, but by calling into rdb_error_to_mysql,
      // it will call my_error for us, which will propagate up to the client.
      int rc __attribute__((__unused__));
      if (status.ok()) {
        status = checkpoint->CreateCheckpoint(checkpoint_dir.c_str());
        delete checkpoint;
        if (status.ok()) {
          sql_print_information(
              "RocksDB: created checkpoint in directory : %s\n",
              checkpoint_dir.c_str());
          return HA_EXIT_SUCCESS;
        } else {
          rc = ha_rocksdb::rdb_error_to_mysql(status);
        }
      } else {
        rc = ha_rocksdb::rdb_error_to_mysql(status);
      }
    }
  }
  return HA_EXIT_FAILURE;
}
304 | |
/* This method is needed to indicate that the
   ROCKSDB_CREATE_CHECKPOINT command is not read-only; the checkpoint itself
   is created in the validate callback (rocksdb_create_checkpoint), so this
   "update" callback has nothing left to do. */
static void rocksdb_create_checkpoint_stub(THD *const thd,
                                           struct st_mysql_sys_var *const var,
                                           void *const var_ptr,
                                           const void *const save) {}
311 | |
/* No-op "update" callback: rocksdb_force_flush_memtable_now does all of its
   work in the validate callback, so nothing is left to apply here. */
static void rocksdb_force_flush_memtable_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
315 | |
/*
  "Validate" callback for rocksdb_force_flush_memtable_now: setting the
  variable flushes all memtables. Always returns success.
*/
static int rocksdb_force_flush_memtable_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  sql_print_information("RocksDB: Manual memtable flush.");
  rocksdb_flush_all_memtables();
  return HA_EXIT_SUCCESS;
}
323 | |
/* No-op "update" callback: rocksdb_force_flush_memtable_and_lzero_now does
   all of its work in the validate callback. */
static void rocksdb_force_flush_memtable_and_lzero_now_stub(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {}
327 | |
/*
  "Validate" callback for rocksdb_force_flush_memtable_and_lzero_now: flushes
  all memtables, then compacts every L0 file of every column family into L1.

  @return HA_EXIT_SUCCESS, or HA_EXIT_FAILURE when CompactFiles fails with
          anything other than an Aborted status.
*/
static int rocksdb_force_flush_memtable_and_lzero_now(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    struct st_mysql_value *const value) {
  sql_print_information("RocksDB: Manual memtable and L0 flush.");
  rocksdb_flush_all_memtables();

  const Rdb_cf_manager &cf_manager = rdb_get_cf_manager();
  rocksdb::CompactionOptions c_options = rocksdb::CompactionOptions();
  rocksdb::ColumnFamilyMetaData metadata;
  rocksdb::ColumnFamilyDescriptor cf_descr;

  for (const auto &cf_handle : cf_manager.get_all_cf()) {
    rdb->GetColumnFamilyMetaData(cf_handle, &metadata);
    cf_handle->GetDescriptor(&cf_descr);
    // Size output files according to this CF's configured target file size.
    c_options.output_file_size_limit = cf_descr.options.target_file_size_base;

    DBUG_ASSERT(metadata.levels[0].level == 0);
    // Collect the full paths of all L0 files of this column family.
    std::vector<std::string> file_names;
    for (auto &file : metadata.levels[0].files) {
      file_names.emplace_back(file.db_path + file.name);
    }

    if (!file_names.empty()) {
      rocksdb::Status s;
      // Compact the collected L0 files down to level 1.
      s = rdb->CompactFiles(c_options, cf_handle, file_names, 1);

      // Aborted is tolerated (e.g. the files were presumably already picked
      // up by a concurrent compaction); any other error is reported.
      if (!s.ok() && !s.IsAborted()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
        return HA_EXIT_FAILURE;
      }
    }
  }

  return HA_EXIT_SUCCESS;
}
363 | |
364 | static void rocksdb_drop_index_wakeup_thread( |
365 | my_core::THD *const thd MY_ATTRIBUTE((__unused__)), |
366 | struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)), |
367 | void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save); |
368 | |
369 | static my_bool rocksdb_pause_background_work = 0; |
370 | static mysql_mutex_t rdb_sysvars_mutex; |
371 | |
/*
  "Update" callback for rocksdb_pause_background_work: pauses or resumes
  RocksDB's background work when the requested state differs from the current
  one. rdb_sysvars_mutex serializes concurrent sysvar updates.
*/
static void rocksdb_set_pause_background_work(
    my_core::THD *const,
    struct st_mysql_sys_var *const,
    void *const, const void *const save) {
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  const my_bool pause_requested = *static_cast<const my_bool *>(save);
  // Only toggle when the state actually changes: Pause/Continue calls are
  // counted by RocksDB and must stay balanced.
  if (rocksdb_pause_background_work != pause_requested) {
    if (pause_requested) {
      rdb->PauseBackgroundWork();
    } else {
      rdb->ContinueBackgroundWork();
    }
    rocksdb_pause_background_work = pause_requested;
  }
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
388 | |
389 | static void rocksdb_set_compaction_options(THD *thd, |
390 | struct st_mysql_sys_var *var, |
391 | void *var_ptr, const void *save); |
392 | |
393 | static void rocksdb_set_table_stats_sampling_pct(THD *thd, |
394 | struct st_mysql_sys_var *var, |
395 | void *var_ptr, |
396 | const void *save); |
397 | |
398 | static void rocksdb_set_rate_limiter_bytes_per_sec(THD *thd, |
399 | struct st_mysql_sys_var *var, |
400 | void *var_ptr, |
401 | const void *save); |
402 | |
403 | static void rocksdb_set_sst_mgr_rate_bytes_per_sec(THD *thd, |
404 | struct st_mysql_sys_var *var, |
405 | void *var_ptr, |
406 | const void *save); |
407 | |
408 | static void rocksdb_set_delayed_write_rate(THD *thd, |
409 | struct st_mysql_sys_var *var, |
410 | void *var_ptr, const void *save); |
411 | |
412 | static void rocksdb_set_max_latest_deadlocks(THD *thd, |
413 | struct st_mysql_sys_var *var, |
414 | void *var_ptr, const void *save); |
415 | |
416 | static void rdb_set_collation_exception_list(const char *exception_list); |
417 | static void rocksdb_set_collation_exception_list(THD *thd, |
418 | struct st_mysql_sys_var *var, |
419 | void *var_ptr, |
420 | const void *save); |
421 | |
422 | static int rocksdb_validate_update_cf_options(THD *thd, |
423 | struct st_mysql_sys_var *var, |
424 | void *save, |
425 | st_mysql_value *value); |
426 | |
427 | static void rocksdb_set_update_cf_options(THD *thd, |
428 | struct st_mysql_sys_var *var, |
429 | void *var_ptr, const void *save); |
430 | |
431 | static int rocksdb_check_bulk_load(THD *const thd, |
432 | struct st_mysql_sys_var *var |
433 | MY_ATTRIBUTE((__unused__)), |
434 | void *save, |
435 | struct st_mysql_value *value); |
436 | |
437 | static int rocksdb_check_bulk_load_allow_unsorted( |
438 | THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)), |
439 | void *save, struct st_mysql_value *value); |
440 | |
441 | static void rocksdb_set_max_background_jobs(THD *thd, |
442 | struct st_mysql_sys_var *const var, |
443 | void *const var_ptr, |
444 | const void *const save); |
445 | static void rocksdb_set_bytes_per_sync(THD *thd, |
446 | struct st_mysql_sys_var *const var, |
447 | void *const var_ptr, |
448 | const void *const save); |
449 | static void rocksdb_set_wal_bytes_per_sync(THD *thd, |
450 | struct st_mysql_sys_var *const var, |
451 | void *const var_ptr, |
452 | const void *const save); |
453 | ////////////////////////////////////////////////////////////////////////////// |
454 | // Options definitions |
455 | ////////////////////////////////////////////////////////////////////////////// |
456 | static long long rocksdb_block_cache_size; |
457 | static long long rocksdb_sim_cache_size; |
458 | static my_bool rocksdb_use_clock_cache; |
459 | /* Use unsigned long long instead of uint64_t because of MySQL compatibility */ |
460 | static unsigned long long // NOLINT(runtime/int) |
461 | rocksdb_rate_limiter_bytes_per_sec; |
462 | static unsigned long long // NOLINT(runtime/int) |
463 | rocksdb_sst_mgr_rate_bytes_per_sec; |
464 | static unsigned long long rocksdb_delayed_write_rate; |
465 | static uint32_t rocksdb_max_latest_deadlocks; |
466 | static unsigned long // NOLINT(runtime/int) |
467 | rocksdb_persistent_cache_size_mb; |
468 | static ulong rocksdb_info_log_level; |
469 | static char *rocksdb_wal_dir; |
470 | static char *rocksdb_persistent_cache_path; |
471 | static ulong rocksdb_index_type; |
472 | static uint32_t rocksdb_flush_log_at_trx_commit; |
473 | static uint32_t rocksdb_debug_optimizer_n_rows; |
474 | static my_bool rocksdb_force_compute_memtable_stats; |
475 | static uint32_t rocksdb_force_compute_memtable_stats_cachetime; |
476 | static my_bool rocksdb_debug_optimizer_no_zero_cardinality; |
477 | static uint32_t rocksdb_wal_recovery_mode; |
478 | static uint32_t rocksdb_access_hint_on_compaction_start; |
479 | static char *rocksdb_compact_cf_name; |
480 | static char *rocksdb_checkpoint_name; |
481 | static my_bool rocksdb_signal_drop_index_thread; |
482 | static my_bool rocksdb_strict_collation_check = 1; |
483 | static my_bool rocksdb_ignore_unknown_options = 1; |
484 | static my_bool rocksdb_enable_2pc = 0; |
485 | static char *rocksdb_strict_collation_exceptions; |
486 | static my_bool rocksdb_collect_sst_properties = 1; |
487 | static my_bool rocksdb_force_flush_memtable_now_var = 0; |
488 | static my_bool rocksdb_force_flush_memtable_and_lzero_now_var = 0; |
489 | static my_bool rocksdb_enable_ttl = 1; |
490 | static my_bool rocksdb_enable_ttl_read_filtering = 1; |
491 | static int rocksdb_debug_ttl_rec_ts = 0; |
492 | static int rocksdb_debug_ttl_snapshot_ts = 0; |
493 | static int rocksdb_debug_ttl_read_filter_ts = 0; |
494 | static my_bool rocksdb_debug_ttl_ignore_pk = 0; |
495 | static my_bool rocksdb_reset_stats = 0; |
496 | static uint32_t rocksdb_io_write_timeout_secs = 0; |
497 | static uint32_t rocksdb_seconds_between_stat_computes = 3600; |
498 | static long long rocksdb_compaction_sequential_deletes = 0l; |
499 | static long long rocksdb_compaction_sequential_deletes_window = 0l; |
500 | static long long rocksdb_compaction_sequential_deletes_file_size = 0l; |
501 | static uint32_t rocksdb_validate_tables = 1; |
502 | static char *rocksdb_datadir; |
503 | static uint32_t rocksdb_table_stats_sampling_pct; |
504 | static my_bool rocksdb_enable_bulk_load_api = 1; |
505 | static my_bool rocksdb_print_snapshot_conflict_queries = 0; |
506 | static my_bool rocksdb_large_prefix = 0; |
507 | static my_bool rocksdb_allow_to_start_after_corruption = 0; |
508 | static char* rocksdb_git_hash; |
509 | |
510 | char *compression_types_val= |
511 | const_cast<char*>(get_rocksdb_supported_compression_types()); |
512 | |
513 | std::atomic<uint64_t> rocksdb_row_lock_deadlocks(0); |
514 | std::atomic<uint64_t> rocksdb_row_lock_wait_timeouts(0); |
515 | std::atomic<uint64_t> rocksdb_snapshot_conflict_errors(0); |
516 | std::atomic<uint64_t> rocksdb_wal_group_syncs(0); |
517 | |
/*
  Build the base rocksdb::DBOptions used when opening the database. Several
  of these fields are also exposed through the MYSQL_SYSVAR definitions below
  (create_if_missing, two_write_queues, manual_wal_flush, ...).
*/
static std::unique_ptr<rocksdb::DBOptions> rdb_init_rocksdb_db_options(void) {
  auto o = std::unique_ptr<rocksdb::DBOptions>(new rocksdb::DBOptions());

  o->create_if_missing = true;
  // Register our event listener, wired to the DDL manager.
  o->listeners.push_back(std::make_shared<Rdb_event_listener>(&ddl_manager));
  o->info_log_level = rocksdb::InfoLogLevel::INFO_LEVEL;
  o->max_subcompactions = DEFAULT_SUBCOMPACTIONS;
  o->max_open_files = -2; // auto-tune to 50% open_files_limit

  o->two_write_queues = true;
  o->manual_wal_flush = true;
  return o;
}
531 | |
532 | /* DBOptions contains Statistics and needs to be destructed last */ |
533 | static std::unique_ptr<rocksdb::BlockBasedTableOptions> rocksdb_tbl_options = |
534 | std::unique_ptr<rocksdb::BlockBasedTableOptions>( |
535 | new rocksdb::BlockBasedTableOptions()); |
536 | static std::unique_ptr<rocksdb::DBOptions> rocksdb_db_options = |
537 | rdb_init_rocksdb_db_options(); |
538 | |
539 | static std::shared_ptr<rocksdb::RateLimiter> rocksdb_rate_limiter; |
540 | |
541 | /* This enum needs to be kept up to date with rocksdb::InfoLogLevel */ |
542 | static const char *info_log_level_names[] = {"debug_level" , "info_level" , |
543 | "warn_level" , "error_level" , |
544 | "fatal_level" , NullS}; |
545 | |
546 | static TYPELIB info_log_level_typelib = { |
547 | array_elements(info_log_level_names) - 1, "info_log_level_typelib" , |
548 | info_log_level_names, nullptr}; |
549 | |
/*
  "Update" callback for rocksdb_info_log_level: stores the new level and
  applies it to the live RocksDB info log.
  NOTE(review): assumes rocksdb_db_options->info_log is non-null by the time
  this sysvar can be updated — confirm against the init order.
*/
static void rocksdb_set_rocksdb_info_log_level(
    THD *const thd, struct st_mysql_sys_var *const var, void *const var_ptr,
    const void *const save) {
  DBUG_ASSERT(save != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  rocksdb_info_log_level = *static_cast<const uint64_t *>(save);
  rocksdb_db_options->info_log->SetInfoLogLevel(
      static_cast<const rocksdb::InfoLogLevel>(rocksdb_info_log_level));
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
561 | |
/*
  "Update" callback for rocksdb_reset_stats: when set to true, resets both
  the DB's internal stats and the shared rocksdb::Statistics object.
*/
static void rocksdb_set_reset_stats(
    my_core::THD *const /* unused */,
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr, const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rdb != nullptr);
  DBUG_ASSERT(rocksdb_stats != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  // var_ptr presumably aliases rocksdb_reset_stats, which is read below —
  // publish the new value first.
  *static_cast<bool *>(var_ptr) = *static_cast<const bool *>(save);

  if (rocksdb_reset_stats) {
    rocksdb::Status s = rdb->ResetStats();

    // RocksDB will always return success. Let's document this assumption here
    // as well so that we'll get immediately notified when contract changes.
    DBUG_ASSERT(s == rocksdb::Status::OK());

    s = rocksdb_stats->Reset();
    DBUG_ASSERT(s == rocksdb::Status::OK());
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
587 | |
/*
  "Update" callback for rocksdb_io_write_timeout: stores the new timeout and
  re-arms the I/O watchdog (the watchdog is not built on Windows/macOS).
*/
static void rocksdb_set_io_write_timeout(
    my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rdb != nullptr);
#if !defined(_WIN32) && !defined(__APPLE__)
  DBUG_ASSERT(io_watchdog != nullptr);
#endif

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  const uint32_t new_val = *static_cast<const uint32_t *>(save);

  rocksdb_io_write_timeout_secs = new_val;
#if !defined(_WIN32) && !defined(__APPLE__)
  io_watchdog->reset_timeout(rocksdb_io_write_timeout_secs);
#endif
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
608 | |
/* Possible values of rocksdb_flush_log_at_trx_commit (WAL sync policy). */
enum rocksdb_flush_log_at_trx_commit_type : unsigned int {
  FLUSH_LOG_NEVER = 0,  /* never sync the WAL eagerly */
  FLUSH_LOG_SYNC,       /* sync the WAL at commit */
  FLUSH_LOG_BACKGROUND, /* sync the WAL in the background */
  FLUSH_LOG_MAX /* must be last */
};
615 | |
616 | static int rocksdb_validate_flush_log_at_trx_commit( |
617 | THD *const thd, |
618 | struct st_mysql_sys_var *const var, /* in: pointer to system variable */ |
619 | void *var_ptr, /* out: immediate result for update function */ |
620 | struct st_mysql_value *const value /* in: incoming value */) { |
621 | long long new_value; |
622 | |
623 | /* value is NULL */ |
624 | if (value->val_int(value, &new_value)) { |
625 | return HA_EXIT_FAILURE; |
626 | } |
627 | |
628 | if (rocksdb_db_options->allow_mmap_writes && new_value != FLUSH_LOG_NEVER) { |
629 | return HA_EXIT_FAILURE; |
630 | } |
631 | |
632 | *static_cast<uint32_t *>(var_ptr) = static_cast<uint32_t>(new_value); |
633 | return HA_EXIT_SUCCESS; |
634 | } |
635 | |
636 | static const char *index_type_names[] = {"kBinarySearch" , "kHashSearch" , NullS}; |
637 | |
638 | static TYPELIB index_type_typelib = {array_elements(index_type_names) - 1, |
639 | "index_type_typelib" , index_type_names, |
640 | nullptr}; |
641 | |
/* Bounds and defaults used by the sysvar definitions below. */
const ulong RDB_MAX_LOCK_WAIT_SECONDS = 1024 * 1024 * 1024;
const ulong RDB_MAX_ROW_LOCKS = 1024 * 1024;
const ulong RDB_DEFAULT_BULK_LOAD_SIZE = 1000;
const ulong RDB_MAX_BULK_LOAD_SIZE = 1024 * 1024 * 1024;
const size_t RDB_DEFAULT_MERGE_BUF_SIZE = 64 * 1024 * 1024; /* 64MB */
const size_t RDB_MIN_MERGE_BUF_SIZE = 100;                  /* 100B */
const size_t RDB_DEFAULT_MERGE_COMBINE_READ_SIZE = 1024 * 1024 * 1024; /* 1GB */
const size_t RDB_MIN_MERGE_COMBINE_READ_SIZE = 100;
const size_t RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY = 0;
const size_t RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY = 0;
const int64 RDB_DEFAULT_BLOCK_CACHE_SIZE = 512 * 1024 * 1024; /* 512MB */
const int64 RDB_MIN_BLOCK_CACHE_SIZE = 1024;
const int RDB_MAX_CHECKSUMS_PCT = 100;
const ulong RDB_DEADLOCK_DETECT_DEPTH = 50;
656 | |
657 | // TODO: 0 means don't wait at all, and we don't support it yet? |
658 | static MYSQL_THDVAR_ULONG(lock_wait_timeout, PLUGIN_VAR_RQCMDARG, |
659 | "Number of seconds to wait for lock" , nullptr, |
660 | nullptr, /*default*/ 1, /*min*/ 1, |
661 | /*max*/ RDB_MAX_LOCK_WAIT_SECONDS, 0); |
662 | |
663 | static MYSQL_THDVAR_BOOL(deadlock_detect, PLUGIN_VAR_RQCMDARG, |
664 | "Enables deadlock detection" , nullptr, nullptr, FALSE); |
665 | |
666 | static MYSQL_THDVAR_ULONG(deadlock_detect_depth, PLUGIN_VAR_RQCMDARG, |
667 | "Number of transactions deadlock detection will " |
668 | "traverse through before assuming deadlock" , |
669 | nullptr, nullptr, |
670 | /*default*/ RDB_DEADLOCK_DETECT_DEPTH, |
671 | /*min*/ 2, |
672 | /*max*/ ULONG_MAX, 0); |
673 | |
674 | static MYSQL_THDVAR_BOOL( |
675 | trace_sst_api, PLUGIN_VAR_RQCMDARG, |
676 | "Generate trace output in the log for each call to the SstFileWriter" , |
677 | nullptr, nullptr, FALSE); |
678 | |
679 | static MYSQL_THDVAR_BOOL( |
680 | bulk_load, PLUGIN_VAR_RQCMDARG, |
681 | "Use bulk-load mode for inserts. This disables " |
682 | "unique_checks and enables rocksdb_commit_in_the_middle." , |
683 | rocksdb_check_bulk_load, nullptr, FALSE); |
684 | |
685 | static MYSQL_THDVAR_BOOL(bulk_load_allow_unsorted, PLUGIN_VAR_RQCMDARG, |
686 | "Allow unsorted input during bulk-load. " |
687 | "Can be changed only when bulk load is disabled." , |
688 | rocksdb_check_bulk_load_allow_unsorted, nullptr, |
689 | FALSE); |
690 | |
691 | static MYSQL_SYSVAR_BOOL(enable_bulk_load_api, rocksdb_enable_bulk_load_api, |
692 | PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, |
693 | "Enables using SstFileWriter for bulk loading" , |
694 | nullptr, nullptr, rocksdb_enable_bulk_load_api); |
695 | |
696 | static MYSQL_SYSVAR_STR(git_hash, rocksdb_git_hash, |
697 | PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, |
698 | "Git revision of the RocksDB library used by MyRocks" , |
699 | nullptr, nullptr, ROCKSDB_GIT_HASH); |
700 | |
701 | static MYSQL_THDVAR_STR(tmpdir, PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_MEMALLOC, |
702 | "Directory for temporary files during DDL operations." , |
703 | nullptr, nullptr, "" ); |
704 | |
705 | static MYSQL_THDVAR_STR( |
706 | skip_unique_check_tables, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, |
707 | "Skip unique constraint checking for the specified tables" , nullptr, |
708 | nullptr, ".*" ); |
709 | |
710 | static MYSQL_THDVAR_BOOL( |
711 | commit_in_the_middle, PLUGIN_VAR_RQCMDARG, |
712 | "Commit rows implicitly every rocksdb_bulk_load_size, on bulk load/insert, " |
713 | "update and delete" , |
714 | nullptr, nullptr, FALSE); |
715 | |
716 | static MYSQL_THDVAR_BOOL( |
717 | blind_delete_primary_key, PLUGIN_VAR_RQCMDARG, |
718 | "Deleting rows by primary key lookup, without reading rows (Blind Deletes)." |
719 | " Blind delete is disabled if the table has secondary key" , |
720 | nullptr, nullptr, FALSE); |
721 | |
722 | static MYSQL_THDVAR_STR( |
723 | read_free_rpl_tables, PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC, |
724 | "List of tables that will use read-free replication on the slave " |
725 | "(i.e. not lookup a row during replication)" , |
726 | nullptr, nullptr, "" ); |
727 | |
728 | static MYSQL_THDVAR_BOOL(skip_bloom_filter_on_read, PLUGIN_VAR_RQCMDARG, |
729 | "Skip using bloom filter for reads" , nullptr, nullptr, |
730 | FALSE); |
731 | |
/*
  Session-scoped (MYSQL_THDVAR_*) variables: each connection gets its own
  value, initialized from the default given in the macro. This group covers
  row locking, write-batch limits, and the buffers used by bulk load and
  inplace/fast index creation.
*/
static MYSQL_THDVAR_ULONG(max_row_locks, PLUGIN_VAR_RQCMDARG,
                          "Maximum number of locks a transaction can have" ,
                          nullptr, nullptr,
                          /*default*/ RDB_MAX_ROW_LOCKS,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_ROW_LOCKS, 0);

static MYSQL_THDVAR_ULONGLONG(
    write_batch_max_bytes, PLUGIN_VAR_RQCMDARG,
    "Maximum size of write batch in bytes. 0 means no limit." , nullptr, nullptr,
    /* default */ 0, /* min */ 0, /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_BOOL(
    lock_scanned_rows, PLUGIN_VAR_RQCMDARG,
    "Take and hold locks on rows that are scanned but not updated" , nullptr,
    nullptr, FALSE);

static MYSQL_THDVAR_ULONG(bulk_load_size, PLUGIN_VAR_RQCMDARG,
                          "Max #records in a batch for bulk-load mode" , nullptr,
                          nullptr,
                          /*default*/ RDB_DEFAULT_BULK_LOAD_SIZE,
                          /*min*/ 1,
                          /*max*/ RDB_MAX_BULK_LOAD_SIZE, 0);

static MYSQL_THDVAR_ULONGLONG(
    merge_buf_size, PLUGIN_VAR_RQCMDARG,
    "Size to allocate for merge sort buffers written out to disk "
    "during inplace index creation." ,
    nullptr, nullptr,
    /* default (64MB) */ RDB_DEFAULT_MERGE_BUF_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_BUF_SIZE,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(
    merge_combine_read_size, PLUGIN_VAR_RQCMDARG,
    "Size that we have to work with during combine (reading from disk) phase "
    "of "
    "external sort during fast index creation." ,
    nullptr, nullptr,
    /* default (1GB) */ RDB_DEFAULT_MERGE_COMBINE_READ_SIZE,
    /* min (100B) */ RDB_MIN_MERGE_COMBINE_READ_SIZE,
    /* max */ SIZE_T_MAX, 1);

static MYSQL_THDVAR_ULONGLONG(
    merge_tmp_file_removal_delay_ms, PLUGIN_VAR_RQCMDARG,
    "Fast index creation creates a large tmp file on disk during index "
    "creation. Removing this large file all at once when index creation is "
    "complete can cause trim stalls on Flash. This variable specifies a "
    "duration to sleep (in milliseconds) between calling chsize() to truncate "
    "the file in chunks. The chunk size is the same as merge_buf_size." ,
    nullptr, nullptr,
    /* default (0ms) */ RDB_DEFAULT_MERGE_TMP_FILE_REMOVAL_DELAY,
    /* min (0ms) */ RDB_MIN_MERGE_TMP_FILE_REMOVAL_DELAY,
    /* max */ SIZE_T_MAX, 1);
786 | |
/*
  Server-wide (MYSQL_SYSVAR_*) variables that expose rocksdb::DBOptions
  fields directly. The read-only ones store straight into the shared
  rocksdb_db_options object, which is then used when the DB is opened.

  NOTE(review): the *reinterpret_cast<my_bool *>(&<bool field>) pattern used
  throughout assumes my_bool and C++ bool have identical size and
  representation — presumably true on supported platforms, but worth
  confirming if a new platform is added.
*/
static MYSQL_SYSVAR_BOOL(
    create_if_missing,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->create_if_missing),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_if_missing for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->create_if_missing);

static MYSQL_SYSVAR_BOOL(
    two_write_queues,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->two_write_queues),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::two_write_queues for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->two_write_queues);

static MYSQL_SYSVAR_BOOL(
    manual_wal_flush,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->manual_wal_flush),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::manual_wal_flush for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->manual_wal_flush);

static MYSQL_SYSVAR_BOOL(
    create_missing_column_families,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->create_missing_column_families),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::create_missing_column_families for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->create_missing_column_families);

static MYSQL_SYSVAR_BOOL(
    error_if_exists,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->error_if_exists),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::error_if_exists for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->error_if_exists);

static MYSQL_SYSVAR_BOOL(
    paranoid_checks,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->paranoid_checks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::paranoid_checks for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->paranoid_checks);

/* Dynamically updatable: the update callback pushes the new value into the
   running DB's rate limiter. */
static MYSQL_SYSVAR_ULONGLONG(
    rate_limiter_bytes_per_sec, rocksdb_rate_limiter_bytes_per_sec,
    PLUGIN_VAR_RQCMDARG, "DBOptions::rate_limiter bytes_per_sec for RocksDB" ,
    nullptr, rocksdb_set_rate_limiter_bytes_per_sec, /* default */ 0L,
    /* min */ 0L, /* max */ MAX_RATE_LIMITER_BYTES_PER_SEC, 0);

static MYSQL_SYSVAR_ULONGLONG(
    sst_mgr_rate_bytes_per_sec, rocksdb_sst_mgr_rate_bytes_per_sec,
    PLUGIN_VAR_RQCMDARG,
    "DBOptions::sst_file_manager rate_bytes_per_sec for RocksDB" , nullptr,
    rocksdb_set_sst_mgr_rate_bytes_per_sec,
    /* default */ DEFAULT_SST_MGR_RATE_BYTES_PER_SEC,
    /* min */ 0L, /* max */ UINT64_MAX, 0);

static MYSQL_SYSVAR_ULONGLONG(delayed_write_rate, rocksdb_delayed_write_rate,
                              PLUGIN_VAR_RQCMDARG,
                              "DBOptions::delayed_write_rate" , nullptr,
                              rocksdb_set_delayed_write_rate,
                              rocksdb_db_options->delayed_write_rate, 0,
                              UINT64_MAX, 0);

static MYSQL_SYSVAR_UINT(max_latest_deadlocks, rocksdb_max_latest_deadlocks,
                         PLUGIN_VAR_RQCMDARG,
                         "Maximum number of recent "
                         "deadlocks to store" ,
                         nullptr, rocksdb_set_max_latest_deadlocks,
                         rocksdb::kInitialMaxDeadlocks, 0, UINT32_MAX, 0);
857 | |
858 | static MYSQL_SYSVAR_ENUM( |
859 | info_log_level, rocksdb_info_log_level, PLUGIN_VAR_RQCMDARG, |
860 | "Filter level for info logs to be written mysqld error log. " |
861 | "Valid values include 'debug_level', 'info_level', 'warn_level'" |
862 | "'error_level' and 'fatal_level'." , |
863 | nullptr, rocksdb_set_rocksdb_info_log_level, |
864 | rocksdb::InfoLogLevel::ERROR_LEVEL, &info_log_level_typelib); |
865 | |
/* Session variable: granularity of RocksDB's internal perf-context timers. */
static MYSQL_THDVAR_INT(
    perf_context_level, PLUGIN_VAR_RQCMDARG,
    "Perf Context Level for rocksdb internal timer stat collection" , nullptr,
    nullptr,
    /* default */ rocksdb::PerfLevel::kUninitialized,
    /* min */ rocksdb::PerfLevel::kUninitialized,
    /* max */ rocksdb::PerfLevel::kOutOfBounds - 1, 0);

static MYSQL_SYSVAR_UINT(
    wal_recovery_mode, rocksdb_wal_recovery_mode, PLUGIN_VAR_RQCMDARG,
    "DBOptions::wal_recovery_mode for RocksDB. Default is kAbsoluteConsistency" ,
    nullptr, nullptr,
    /* default */ (uint)rocksdb::WALRecoveryMode::kAbsoluteConsistency,
    /* min */ (uint)rocksdb::WALRecoveryMode::kTolerateCorruptedTailRecords,
    /* max */ (uint)rocksdb::WALRecoveryMode::kSkipAnyCorruptedRecords, 0);

static MYSQL_SYSVAR_SIZE_T(compaction_readahead_size,
                           rocksdb_db_options->compaction_readahead_size,
                           PLUGIN_VAR_RQCMDARG,
                           "DBOptions::compaction_readahead_size for RocksDB" ,
                           nullptr, nullptr,
                           rocksdb_db_options->compaction_readahead_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    new_table_reader_for_compaction_inputs,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->new_table_reader_for_compaction_inputs),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::new_table_reader_for_compaction_inputs for RocksDB" , nullptr,
    nullptr, rocksdb_db_options->new_table_reader_for_compaction_inputs);

static MYSQL_SYSVAR_UINT(
    access_hint_on_compaction_start, rocksdb_access_hint_on_compaction_start,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::access_hint_on_compaction_start for RocksDB" , nullptr, nullptr,
    /* default */ (uint)rocksdb::Options::AccessHint::NORMAL,
    /* min */ (uint)rocksdb::Options::AccessHint::NONE,
    /* max */ (uint)rocksdb::Options::AccessHint::WILLNEED, 0);

/* Default FALSE here overrides whatever rocksdb_db_options currently holds. */
static MYSQL_SYSVAR_BOOL(
    allow_concurrent_memtable_write,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->allow_concurrent_memtable_write),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_concurrent_memtable_write for RocksDB" , nullptr, nullptr,
    false);

static MYSQL_SYSVAR_BOOL(
    enable_write_thread_adaptive_yield,
    *reinterpret_cast<my_bool *>(
        &rocksdb_db_options->enable_write_thread_adaptive_yield),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::enable_write_thread_adaptive_yield for RocksDB" , nullptr,
    nullptr, false);

/* min is -2 (not -1): presumably a MyRocks-specific sentinel beyond RocksDB's
   "-1 = unlimited" — TODO confirm against the open() code that consumes it. */
static MYSQL_SYSVAR_INT(max_open_files, rocksdb_db_options->max_open_files,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::max_open_files for RocksDB" , nullptr,
                        nullptr, rocksdb_db_options->max_open_files,
                        /* min */ -2, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(max_total_wal_size,
                             rocksdb_db_options->max_total_wal_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::max_total_wal_size for RocksDB" , nullptr,
                             nullptr, rocksdb_db_options->max_total_wal_size,
                             /* min */ 0, /* max */ LONGLONG_MAX, 0);
934 | |
/* DBOptions mirrors for file sync, directory layout and log-file rotation. */
static MYSQL_SYSVAR_BOOL(
    use_fsync, *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_fsync),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_fsync for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->use_fsync);

static MYSQL_SYSVAR_STR(wal_dir, rocksdb_wal_dir,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::wal_dir for RocksDB" , nullptr, nullptr,
                        rocksdb_db_options->wal_dir.c_str());

static MYSQL_SYSVAR_STR(
    persistent_cache_path, rocksdb_persistent_cache_path,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Path for BlockBasedTableOptions::persistent_cache for RocksDB" , nullptr,
    nullptr, "" );

static MYSQL_SYSVAR_ULONG(
    persistent_cache_size_mb, rocksdb_persistent_cache_size_mb,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Size of cache in MB for BlockBasedTableOptions::persistent_cache "
    "for RocksDB" , nullptr, nullptr, rocksdb_persistent_cache_size_mb,
    /* min */ 0L, /* max */ ULONG_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(
    delete_obsolete_files_period_micros,
    rocksdb_db_options->delete_obsolete_files_period_micros,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::delete_obsolete_files_period_micros for RocksDB" , nullptr,
    nullptr, rocksdb_db_options->delete_obsolete_files_period_micros,
    /* min */ 0, /* max */ LONGLONG_MAX, 0);

/* Dynamically updatable via rocksdb_set_max_background_jobs. */
static MYSQL_SYSVAR_INT(max_background_jobs,
                        rocksdb_db_options->max_background_jobs,
                        PLUGIN_VAR_RQCMDARG,
                        "DBOptions::max_background_jobs for RocksDB" , nullptr,
                        rocksdb_set_max_background_jobs,
                        rocksdb_db_options->max_background_jobs,
                        /* min */ -1, /* max */ MAX_BACKGROUND_JOBS, 0);

static MYSQL_SYSVAR_UINT(max_subcompactions,
                         rocksdb_db_options->max_subcompactions,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::max_subcompactions for RocksDB" , nullptr,
                         nullptr, rocksdb_db_options->max_subcompactions,
                         /* min */ 1, /* max */ MAX_SUBCOMPACTIONS, 0);

static MYSQL_SYSVAR_SIZE_T(max_log_file_size,
                           rocksdb_db_options->max_log_file_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::max_log_file_size for RocksDB" , nullptr,
                           nullptr, rocksdb_db_options->max_log_file_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_SIZE_T(log_file_time_to_roll,
                           rocksdb_db_options->log_file_time_to_roll,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::log_file_time_to_roll for RocksDB" ,
                           nullptr, nullptr,
                           rocksdb_db_options->log_file_time_to_roll,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_SIZE_T(keep_log_file_num,
                           rocksdb_db_options->keep_log_file_num,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::keep_log_file_num for RocksDB" , nullptr,
                           nullptr, rocksdb_db_options->keep_log_file_num,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(max_manifest_file_size,
                             rocksdb_db_options->max_manifest_file_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::max_manifest_file_size for RocksDB" ,
                             nullptr, nullptr,
                             rocksdb_db_options->max_manifest_file_size,
                             /* min */ 0L, /* max */ ULONGLONG_MAX, 0);
1011 | |
/* DBOptions mirrors for table cache, WAL retention, direct/mmap I/O, sync
   throttling, and the block/simulated cache configuration. */
static MYSQL_SYSVAR_INT(table_cache_numshardbits,
                        rocksdb_db_options->table_cache_numshardbits,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "DBOptions::table_cache_numshardbits for RocksDB" ,
                        nullptr, nullptr,
                        rocksdb_db_options->table_cache_numshardbits,
                        /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(wal_ttl_seconds, rocksdb_db_options->WAL_ttl_seconds,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::WAL_ttl_seconds for RocksDB" , nullptr,
                             nullptr, rocksdb_db_options->WAL_ttl_seconds,
                             /* min */ 0L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(wal_size_limit_mb,
                             rocksdb_db_options->WAL_size_limit_MB,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "DBOptions::WAL_size_limit_MB for RocksDB" , nullptr,
                             nullptr, rocksdb_db_options->WAL_size_limit_MB,
                             /* min */ 0L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_SIZE_T(manifest_preallocation_size,
                           rocksdb_db_options->manifest_preallocation_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::manifest_preallocation_size for RocksDB" ,
                           nullptr, nullptr,
                           rocksdb_db_options->manifest_preallocation_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    use_direct_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_reads for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->use_direct_reads);

static MYSQL_SYSVAR_BOOL(
    use_direct_io_for_flush_and_compaction,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_direct_io_for_flush_and_compaction),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_direct_io_for_flush_and_compaction for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->use_direct_io_for_flush_and_compaction);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_reads,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_reads),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_reads for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->allow_mmap_reads);

static MYSQL_SYSVAR_BOOL(
    allow_mmap_writes,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->allow_mmap_writes),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::allow_mmap_writes for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->allow_mmap_writes);

static MYSQL_SYSVAR_BOOL(
    is_fd_close_on_exec,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->is_fd_close_on_exec),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::is_fd_close_on_exec for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->is_fd_close_on_exec);

static MYSQL_SYSVAR_UINT(stats_dump_period_sec,
                         rocksdb_db_options->stats_dump_period_sec,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "DBOptions::stats_dump_period_sec for RocksDB" ,
                         nullptr, nullptr,
                         rocksdb_db_options->stats_dump_period_sec,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    advise_random_on_open,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->advise_random_on_open),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::advise_random_on_open for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->advise_random_on_open);

static MYSQL_SYSVAR_SIZE_T(db_write_buffer_size,
                           rocksdb_db_options->db_write_buffer_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "DBOptions::db_write_buffer_size for RocksDB" ,
                           nullptr, nullptr,
                           rocksdb_db_options->db_write_buffer_size,
                           /* min */ 0L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    use_adaptive_mutex,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->use_adaptive_mutex),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::use_adaptive_mutex for RocksDB" , nullptr, nullptr,
    rocksdb_db_options->use_adaptive_mutex);

/* bytes_per_sync / wal_bytes_per_sync are dynamically updatable via their
   set_* callbacks (not READONLY like most DBOptions mirrors above). */
static MYSQL_SYSVAR_UINT64_T(bytes_per_sync, rocksdb_db_options->bytes_per_sync,
                             PLUGIN_VAR_RQCMDARG,
                             "DBOptions::bytes_per_sync for RocksDB" , nullptr,
                             rocksdb_set_bytes_per_sync,
                             rocksdb_db_options->bytes_per_sync,
                             /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

static MYSQL_SYSVAR_UINT64_T(wal_bytes_per_sync,
                             rocksdb_db_options->wal_bytes_per_sync,
                             PLUGIN_VAR_RQCMDARG,
                             "DBOptions::wal_bytes_per_sync for RocksDB" , nullptr,
                             rocksdb_set_wal_bytes_per_sync,
                             rocksdb_db_options->wal_bytes_per_sync,
                             /* min */ 0L, /* max */ ULONGLONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    enable_thread_tracking,
    *reinterpret_cast<my_bool *>(&rocksdb_db_options->enable_thread_tracking),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "DBOptions::enable_thread_tracking for RocksDB" , nullptr, nullptr, true);

static MYSQL_SYSVAR_LONGLONG(block_cache_size, rocksdb_block_cache_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "block_cache size for RocksDB" , nullptr, nullptr,
                             /* default */ RDB_DEFAULT_BLOCK_CACHE_SIZE,
                             /* min */ RDB_MIN_BLOCK_CACHE_SIZE,
                             /* max */ LONGLONG_MAX,
                             /* Block size */ RDB_MIN_BLOCK_CACHE_SIZE);

static MYSQL_SYSVAR_LONGLONG(sim_cache_size, rocksdb_sim_cache_size,
                             PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                             "Simulated cache size for RocksDB" , nullptr,
                             nullptr,
                             /* default */ 0,
                             /* min */ 0,
                             /* max */ LONGLONG_MAX,
                             /* Block size */ 0);

static MYSQL_SYSVAR_BOOL(
    use_clock_cache,
    rocksdb_use_clock_cache,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "Use ClockCache instead of default LRUCache for RocksDB" ,
    nullptr, nullptr, false);

static MYSQL_SYSVAR_BOOL(
    cache_index_and_filter_blocks,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->cache_index_and_filter_blocks),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::cache_index_and_filter_blocks for RocksDB" ,
    nullptr, nullptr, true);
1158 | |
// When pin_l0_filter_and_index_blocks_in_cache is true, RocksDB will use the
// LRU cache, but will always keep the filter & index block's handle checked
// out (=won't call ShardedLRUCache::Release), plus the parsed-out objects
// will never be flushed out of the LRU cache, hence they're pinned.
//
// This fixes the mutex contention between ShardedLRUCache::Lookup and
// ShardedLRUCache::Release which reduced the QPS ratio (QPS using secondary
// index / QPS using PK).
/* See the rationale comment above: keeps L0 filter/index blocks pinned in the
   block cache to avoid LRU lookup/release mutex contention. */
static MYSQL_SYSVAR_BOOL(
    pin_l0_filter_and_index_blocks_in_cache,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->pin_l0_filter_and_index_blocks_in_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "pin_l0_filter_and_index_blocks_in_cache for RocksDB" , nullptr, nullptr,
    true);
1174 | |
/* BlockBasedTableOptions mirrors, column-family option strings, and
   session-level WriteOptions / optimizer-override variables. */
static MYSQL_SYSVAR_ENUM(index_type, rocksdb_index_type,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "BlockBasedTableOptions::index_type for RocksDB" ,
                         nullptr, nullptr,
                         (ulong)rocksdb_tbl_options->index_type,
                         &index_type_typelib);

static MYSQL_SYSVAR_BOOL(
    hash_index_allow_collision,
    *reinterpret_cast<my_bool *>(
        &rocksdb_tbl_options->hash_index_allow_collision),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::hash_index_allow_collision for RocksDB" , nullptr,
    nullptr, rocksdb_tbl_options->hash_index_allow_collision);

static MYSQL_SYSVAR_BOOL(
    no_block_cache,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->no_block_cache),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::no_block_cache for RocksDB" , nullptr, nullptr,
    rocksdb_tbl_options->no_block_cache);

static MYSQL_SYSVAR_SIZE_T(block_size, rocksdb_tbl_options->block_size,
                           PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                           "BlockBasedTableOptions::block_size for RocksDB" ,
                           nullptr, nullptr, rocksdb_tbl_options->block_size,
                           /* min */ 1L, /* max */ SIZE_T_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_size_deviation, rocksdb_tbl_options->block_size_deviation,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_size_deviation for RocksDB" , nullptr,
    nullptr, rocksdb_tbl_options->block_size_deviation,
    /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_INT(
    block_restart_interval, rocksdb_tbl_options->block_restart_interval,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::block_restart_interval for RocksDB" , nullptr,
    nullptr, rocksdb_tbl_options->block_restart_interval,
    /* min */ 1, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    whole_key_filtering,
    *reinterpret_cast<my_bool *>(&rocksdb_tbl_options->whole_key_filtering),
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
    "BlockBasedTableOptions::whole_key_filtering for RocksDB" , nullptr, nullptr,
    rocksdb_tbl_options->whole_key_filtering);

static MYSQL_SYSVAR_STR(default_cf_options, rocksdb_default_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "default cf options for RocksDB" , nullptr, nullptr, "" );

static MYSQL_SYSVAR_STR(override_cf_options, rocksdb_override_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                        "option overrides per cf for RocksDB" , nullptr, nullptr,
                        "" );

/* Dynamically updatable: validated by rocksdb_validate_update_cf_options,
   then applied by rocksdb_set_update_cf_options. */
static MYSQL_SYSVAR_STR(update_cf_options, rocksdb_update_cf_options,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC
                        /* psergey-merge: need this? : PLUGIN_VAR_ALLOCATED*/,
                        "Option updates per column family for RocksDB" ,
                        rocksdb_validate_update_cf_options,
                        rocksdb_set_update_cf_options, nullptr);

static MYSQL_SYSVAR_UINT(flush_log_at_trx_commit,
                         rocksdb_flush_log_at_trx_commit, PLUGIN_VAR_RQCMDARG,
                         "Sync on transaction commit. Similar to "
                         "innodb_flush_log_at_trx_commit. 1: sync on commit, "
                         "0,2: not sync on commit" ,
                         rocksdb_validate_flush_log_at_trx_commit, nullptr,
                         /* default */ FLUSH_LOG_SYNC,
                         /* min */ FLUSH_LOG_NEVER,
                         /* max */ FLUSH_LOG_BACKGROUND, 0);

/* Session variables: per-statement WriteOptions and read behavior. */
static MYSQL_THDVAR_BOOL(write_disable_wal, PLUGIN_VAR_RQCMDARG,
                         "WriteOptions::disableWAL for RocksDB" , nullptr,
                         nullptr, rocksdb::WriteOptions().disableWAL);

static MYSQL_THDVAR_BOOL(
    write_ignore_missing_column_families, PLUGIN_VAR_RQCMDARG,
    "WriteOptions::ignore_missing_column_families for RocksDB" , nullptr,
    nullptr, rocksdb::WriteOptions().ignore_missing_column_families);

static MYSQL_THDVAR_BOOL(skip_fill_cache, PLUGIN_VAR_RQCMDARG,
                         "Skip filling block cache on read requests" , nullptr,
                         nullptr, FALSE);

static MYSQL_THDVAR_BOOL(
    unsafe_for_binlog, PLUGIN_VAR_RQCMDARG,
    "Allowing statement based binary logging which may break consistency" ,
    nullptr, nullptr, FALSE);

/* 0 (the default) means "do not override"; a positive value replaces the
   engine's records_in_range() estimate. */
static MYSQL_THDVAR_UINT(records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range(). "
                         "Set to a positive number to override" ,
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_THDVAR_UINT(force_index_records_in_range, PLUGIN_VAR_RQCMDARG,
                         "Used to override the result of records_in_range() "
                         "when FORCE INDEX is used." ,
                         nullptr, nullptr, 0,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_UINT(
    debug_optimizer_n_rows, rocksdb_debug_optimizer_n_rows,
    PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY | PLUGIN_VAR_NOSYSVAR,
    "Test only to override rocksdb estimates of table size in a memtable" ,
    nullptr, nullptr, 0, /* min */ 0, /* max */ INT_MAX, 0);
1285 | |
/* Optimizer-stats knobs, admin "action" variables (whose update callback
   performs the action, e.g. compaction or checkpoint), and TTL controls. */
static MYSQL_SYSVAR_BOOL(force_compute_memtable_stats,
                         rocksdb_force_compute_memtable_stats,
                         PLUGIN_VAR_RQCMDARG,
                         "Force to always compute memtable stats" ,
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_UINT(force_compute_memtable_stats_cachetime,
                         rocksdb_force_compute_memtable_stats_cachetime,
                         PLUGIN_VAR_RQCMDARG,
                         "Time in usecs to cache memtable estimates" , nullptr,
                         nullptr, /* default */ 60 * 1000 * 1000,
                         /* min */ 0, /* max */ INT_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    debug_optimizer_no_zero_cardinality,
    rocksdb_debug_optimizer_no_zero_cardinality, PLUGIN_VAR_RQCMDARG,
    "In case if cardinality is zero, overrides it with some value" , nullptr,
    nullptr, TRUE);

/* Setting this variable triggers a manual compaction of the named column
   family (the check callback does the work; the update is a stub). */
static MYSQL_SYSVAR_STR(compact_cf, rocksdb_compact_cf_name,
                        PLUGIN_VAR_RQCMDARG, "Compact column family" ,
                        rocksdb_compact_column_family,
                        rocksdb_compact_column_family_stub, "" );

static MYSQL_SYSVAR_STR(create_checkpoint, rocksdb_checkpoint_name,
                        PLUGIN_VAR_RQCMDARG, "Checkpoint directory" ,
                        rocksdb_create_checkpoint,
                        rocksdb_create_checkpoint_stub, "" );

static MYSQL_SYSVAR_BOOL(signal_drop_index_thread,
                         rocksdb_signal_drop_index_thread, PLUGIN_VAR_RQCMDARG,
                         "Wake up drop index thread" , nullptr,
                         rocksdb_drop_index_wakeup_thread, FALSE);

static MYSQL_SYSVAR_BOOL(pause_background_work, rocksdb_pause_background_work,
                         PLUGIN_VAR_RQCMDARG,
                         "Disable all rocksdb background operations" , nullptr,
                         rocksdb_set_pause_background_work, FALSE);

static MYSQL_SYSVAR_BOOL(
    enable_ttl, rocksdb_enable_ttl, PLUGIN_VAR_RQCMDARG,
    "Enable expired TTL records to be dropped during compaction." , nullptr,
    nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(
    enable_ttl_read_filtering, rocksdb_enable_ttl_read_filtering,
    PLUGIN_VAR_RQCMDARG,
    "For tables with TTL, expired records are skipped/filtered out during "
    "processing and in query results. Disabling this will allow these records "
    "to be seen, but as a result rows may disappear in the middle of "
    "transactions as they are dropped during compaction. Use with caution." ,
    nullptr, nullptr, TRUE);

/* The three debug_ttl_* variables below shift TTL-related timestamps by up
   to +/- one hour for testing; all are no-ops in non-debug builds. */
static MYSQL_SYSVAR_INT(
    debug_ttl_rec_ts, rocksdb_debug_ttl_rec_ts, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Overrides the TTL of records to "
    "now() + debug_ttl_rec_ts. The value can be +/- to simulate "
    "a record inserted in the past vs a record inserted in the 'future'. "
    "A value of 0 denotes that the variable is not set. This variable is a "
    "no-op in non-debug builds." ,
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_INT(
    debug_ttl_snapshot_ts, rocksdb_debug_ttl_snapshot_ts, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Sets the snapshot during compaction to "
    "now() + debug_set_ttl_snapshot_ts. The value can be +/- to simulate "
    "a snapshot in the past vs a snapshot created in the 'future'. "
    "A value of 0 denotes that the variable is not set. This variable is a "
    "no-op in non-debug builds." ,
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_INT(
    debug_ttl_read_filter_ts, rocksdb_debug_ttl_read_filter_ts,
    PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. Overrides the TTL read filtering time to "
    "time + debug_ttl_read_filter_ts. A value of 0 denotes that the variable "
    "is not set. This variable is a no-op in non-debug builds." ,
    nullptr, nullptr, 0, /* min */ -3600, /* max */ 3600, 0);

static MYSQL_SYSVAR_BOOL(
    debug_ttl_ignore_pk, rocksdb_debug_ttl_ignore_pk, PLUGIN_VAR_RQCMDARG,
    "For debugging purposes only. If true, compaction filtering will not occur "
    "on PK TTL data. This variable is a no-op in non-debug builds." ,
    nullptr, nullptr, FALSE);

static MYSQL_SYSVAR_BOOL(
    reset_stats, rocksdb_reset_stats, PLUGIN_VAR_RQCMDARG,
    "Reset the RocksDB internal statistics without restarting the DB." , nullptr,
    rocksdb_set_reset_stats, FALSE);

static MYSQL_SYSVAR_UINT(io_write_timeout, rocksdb_io_write_timeout_secs,
                         PLUGIN_VAR_RQCMDARG,
                         "Timeout for experimental I/O watchdog." , nullptr,
                         rocksdb_set_io_write_timeout, /* default */ 0,
                         /* min */ 0L,
                         /* max */ UINT_MAX, 0);
1382 | |
/* Two-phase commit, collation enforcement, manual flush triggers, and the
   sequential-deletes compaction heuristics plus row-checksum debug knobs. */
static MYSQL_SYSVAR_BOOL(enable_2pc, rocksdb_enable_2pc, PLUGIN_VAR_RQCMDARG,
                         "Enable two phase commit for MyRocks" , nullptr,
                         nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(ignore_unknown_options, rocksdb_ignore_unknown_options,
                         PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY,
                         "Enable ignoring unknown options passed to RocksDB" ,
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_BOOL(strict_collation_check, rocksdb_strict_collation_check,
                         PLUGIN_VAR_RQCMDARG,
                         "Enforce case sensitive collation for MyRocks indexes" ,
                         nullptr, nullptr, TRUE);

static MYSQL_SYSVAR_STR(strict_collation_exceptions,
                        rocksdb_strict_collation_exceptions,
                        PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_MEMALLOC,
                        "List of tables (using regex) that are excluded "
                        "from the case sensitive collation enforcement" ,
                        nullptr, rocksdb_set_collation_exception_list, "" );

static MYSQL_SYSVAR_BOOL(collect_sst_properties, rocksdb_collect_sst_properties,
                         PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY,
                         "Enables collecting SST file properties on each flush" ,
                         nullptr, nullptr, rocksdb_collect_sst_properties);

/* Action variables: the check callback performs the flush/compaction, the
   update callback is a stub. */
static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_now, rocksdb_force_flush_memtable_now_var,
    PLUGIN_VAR_RQCMDARG,
    "Forces memstore flush which may block all write requests so be careful" ,
    rocksdb_force_flush_memtable_now, rocksdb_force_flush_memtable_now_stub,
    FALSE);

static MYSQL_SYSVAR_BOOL(
    force_flush_memtable_and_lzero_now,
    rocksdb_force_flush_memtable_and_lzero_now_var, PLUGIN_VAR_RQCMDARG,
    "Acts similar to force_flush_memtable_now, but also compacts all L0 files." ,
    rocksdb_force_flush_memtable_and_lzero_now,
    rocksdb_force_flush_memtable_and_lzero_now_stub, FALSE);

static MYSQL_SYSVAR_UINT(
    seconds_between_stat_computes, rocksdb_seconds_between_stat_computes,
    PLUGIN_VAR_RQCMDARG,
    "Sets a number of seconds to wait between optimizer stats recomputation. "
    "Only changed indexes will be refreshed." ,
    nullptr, nullptr, rocksdb_seconds_between_stat_computes,
    /* min */ 0L, /* max */ UINT_MAX, 0);

static MYSQL_SYSVAR_LONGLONG(compaction_sequential_deletes,
                             rocksdb_compaction_sequential_deletes,
                             PLUGIN_VAR_RQCMDARG,
                             "RocksDB will trigger compaction for the file if "
                             "it has more than this number sequential deletes "
                             "per window" ,
                             nullptr, rocksdb_set_compaction_options,
                             DEFAULT_COMPACTION_SEQUENTIAL_DELETES,
                             /* min */ 0L,
                             /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES, 0);

static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_window,
    rocksdb_compaction_sequential_deletes_window, PLUGIN_VAR_RQCMDARG,
    "Size of the window for counting rocksdb_compaction_sequential_deletes" ,
    nullptr, rocksdb_set_compaction_options,
    DEFAULT_COMPACTION_SEQUENTIAL_DELETES_WINDOW,
    /* min */ 0L, /* max */ MAX_COMPACTION_SEQUENTIAL_DELETES_WINDOW, 0);

static MYSQL_SYSVAR_LONGLONG(
    compaction_sequential_deletes_file_size,
    rocksdb_compaction_sequential_deletes_file_size, PLUGIN_VAR_RQCMDARG,
    "Minimum file size required for compaction_sequential_deletes" , nullptr,
    rocksdb_set_compaction_options, 0L,
    /* min */ -1L, /* max */ LONGLONG_MAX, 0);

static MYSQL_SYSVAR_BOOL(
    compaction_sequential_deletes_count_sd,
    rocksdb_compaction_sequential_deletes_count_sd, PLUGIN_VAR_RQCMDARG,
    "Counting SingleDelete as rocksdb_compaction_sequential_deletes" , nullptr,
    nullptr, rocksdb_compaction_sequential_deletes_count_sd);

static MYSQL_SYSVAR_BOOL(
    print_snapshot_conflict_queries, rocksdb_print_snapshot_conflict_queries,
    PLUGIN_VAR_RQCMDARG,
    "Logging queries that got snapshot conflict errors into *.err log" , nullptr,
    nullptr, rocksdb_print_snapshot_conflict_queries);

/* Session variables controlling per-row checksum writing/verification. */
static MYSQL_THDVAR_INT(checksums_pct, PLUGIN_VAR_RQCMDARG,
                        "How many percentages of rows to be checksummed" ,
                        nullptr, nullptr, RDB_MAX_CHECKSUMS_PCT,
                        /* min */ 0, /* max */ RDB_MAX_CHECKSUMS_PCT, 0);

static MYSQL_THDVAR_BOOL(store_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Include checksums when writing index/table records" ,
                         nullptr, nullptr, false /* default value */);

static MYSQL_THDVAR_BOOL(verify_row_debug_checksums, PLUGIN_VAR_RQCMDARG,
                         "Verify checksums when reading index/table records" ,
                         nullptr, nullptr, false /* default value */);
1481 | |
1482 | static MYSQL_THDVAR_BOOL(master_skip_tx_api, PLUGIN_VAR_RQCMDARG, |
1483 | "Skipping holding any lock on row access. " |
1484 | "Not effective on slave." , |
1485 | nullptr, nullptr, false); |
1486 | |
1487 | static MYSQL_SYSVAR_UINT( |
1488 | validate_tables, rocksdb_validate_tables, |
1489 | PLUGIN_VAR_RQCMDARG | PLUGIN_VAR_READONLY, |
1490 | "Verify all .frm files match all RocksDB tables (0 means no verification, " |
1491 | "1 means verify and fail on error, and 2 means verify but continue" , |
1492 | nullptr, nullptr, 1 /* default value */, 0 /* min value */, |
1493 | 2 /* max value */, 0); |
1494 | |
1495 | static MYSQL_SYSVAR_STR(datadir, rocksdb_datadir, |
1496 | PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, |
1497 | "RocksDB data directory" , nullptr, nullptr, |
1498 | "./#rocksdb" ); |
1499 | |
1500 | static MYSQL_SYSVAR_STR(supported_compression_types, |
1501 | compression_types_val, |
1502 | PLUGIN_VAR_NOCMDOPT | PLUGIN_VAR_READONLY, |
1503 | "Compression algorithms supported by RocksDB" , |
1504 | nullptr, nullptr, |
1505 | compression_types_val); |
1506 | |
1507 | static MYSQL_SYSVAR_UINT( |
1508 | table_stats_sampling_pct, rocksdb_table_stats_sampling_pct, |
1509 | PLUGIN_VAR_RQCMDARG, |
1510 | "Percentage of entries to sample when collecting statistics about table " |
1511 | "properties. Specify either 0 to sample everything or percentage " |
1512 | "[" STRINGIFY_ARG(RDB_TBL_STATS_SAMPLE_PCT_MIN) ".." STRINGIFY_ARG( |
1513 | RDB_TBL_STATS_SAMPLE_PCT_MAX) "]. " |
1514 | "By default " STRINGIFY_ARG( |
1515 | RDB_DEFAULT_TBL_STATS_SAMPLE_PCT) "% " |
1516 | "of" |
1517 | " e" |
1518 | "nt" |
1519 | "ri" |
1520 | "es" |
1521 | " a" |
1522 | "re" |
1523 | " " |
1524 | "sa" |
1525 | "mp" |
1526 | "le" |
1527 | "d" |
1528 | "." , |
1529 | nullptr, rocksdb_set_table_stats_sampling_pct, /* default */ |
1530 | RDB_DEFAULT_TBL_STATS_SAMPLE_PCT, /* everything */ 0, |
1531 | /* max */ RDB_TBL_STATS_SAMPLE_PCT_MAX, 0); |
1532 | |
1533 | static MYSQL_SYSVAR_BOOL( |
1534 | large_prefix, rocksdb_large_prefix, PLUGIN_VAR_RQCMDARG, |
1535 | "Support large index prefix length of 3072 bytes. If off, the maximum " |
1536 | "index prefix length is 767." , |
1537 | nullptr, nullptr, FALSE); |
1538 | |
1539 | static MYSQL_SYSVAR_BOOL( |
1540 | allow_to_start_after_corruption, rocksdb_allow_to_start_after_corruption, |
1541 | PLUGIN_VAR_OPCMDARG | PLUGIN_VAR_READONLY, |
1542 | "Allow server still to start successfully even if RocksDB corruption is " |
1543 | "detected." , |
1544 | nullptr, nullptr, FALSE); |
1545 | |
/* Assumed disk footprint (bytes) of one key/value pair — a rough constant;
   NOTE(review): consumers of this value are not visible in this chunk. */
static const int ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE = 100;

/*
  Registration table handed to the plugin framework.  Every MYSQL_SYSVAR /
  MYSQL_THDVAR defined in this file must be listed here to become visible
  to the server.  The array is nullptr-terminated.
*/
static struct st_mysql_sys_var *rocksdb_system_variables[] = {
    MYSQL_SYSVAR(lock_wait_timeout),
    MYSQL_SYSVAR(deadlock_detect),
    MYSQL_SYSVAR(deadlock_detect_depth),
    MYSQL_SYSVAR(max_row_locks),
    MYSQL_SYSVAR(write_batch_max_bytes),
    MYSQL_SYSVAR(lock_scanned_rows),
    MYSQL_SYSVAR(bulk_load),
    MYSQL_SYSVAR(bulk_load_allow_unsorted),
    MYSQL_SYSVAR(skip_unique_check_tables),
    MYSQL_SYSVAR(trace_sst_api),
    MYSQL_SYSVAR(commit_in_the_middle),
    MYSQL_SYSVAR(blind_delete_primary_key),
    MYSQL_SYSVAR(read_free_rpl_tables),
    MYSQL_SYSVAR(bulk_load_size),
    MYSQL_SYSVAR(merge_buf_size),
    MYSQL_SYSVAR(enable_bulk_load_api),
    MYSQL_SYSVAR(tmpdir),
    MYSQL_SYSVAR(merge_combine_read_size),
    MYSQL_SYSVAR(merge_tmp_file_removal_delay_ms),
    MYSQL_SYSVAR(skip_bloom_filter_on_read),

    /* RocksDB DBOptions-backed variables. */
    MYSQL_SYSVAR(create_if_missing),
    MYSQL_SYSVAR(two_write_queues),
    MYSQL_SYSVAR(manual_wal_flush),
    MYSQL_SYSVAR(create_missing_column_families),
    MYSQL_SYSVAR(error_if_exists),
    MYSQL_SYSVAR(paranoid_checks),
    MYSQL_SYSVAR(rate_limiter_bytes_per_sec),
    MYSQL_SYSVAR(sst_mgr_rate_bytes_per_sec),
    MYSQL_SYSVAR(delayed_write_rate),
    MYSQL_SYSVAR(max_latest_deadlocks),
    MYSQL_SYSVAR(info_log_level),
    MYSQL_SYSVAR(max_open_files),
    MYSQL_SYSVAR(max_total_wal_size),
    MYSQL_SYSVAR(use_fsync),
    MYSQL_SYSVAR(wal_dir),
    MYSQL_SYSVAR(persistent_cache_path),
    MYSQL_SYSVAR(persistent_cache_size_mb),
    MYSQL_SYSVAR(delete_obsolete_files_period_micros),
    MYSQL_SYSVAR(max_background_jobs),
    MYSQL_SYSVAR(max_log_file_size),
    MYSQL_SYSVAR(max_subcompactions),
    MYSQL_SYSVAR(log_file_time_to_roll),
    MYSQL_SYSVAR(keep_log_file_num),
    MYSQL_SYSVAR(max_manifest_file_size),
    MYSQL_SYSVAR(table_cache_numshardbits),
    MYSQL_SYSVAR(wal_ttl_seconds),
    MYSQL_SYSVAR(wal_size_limit_mb),
    MYSQL_SYSVAR(manifest_preallocation_size),
    MYSQL_SYSVAR(use_direct_reads),
    MYSQL_SYSVAR(use_direct_io_for_flush_and_compaction),
    MYSQL_SYSVAR(allow_mmap_reads),
    MYSQL_SYSVAR(allow_mmap_writes),
    MYSQL_SYSVAR(is_fd_close_on_exec),
    MYSQL_SYSVAR(stats_dump_period_sec),
    MYSQL_SYSVAR(advise_random_on_open),
    MYSQL_SYSVAR(db_write_buffer_size),
    MYSQL_SYSVAR(use_adaptive_mutex),
    MYSQL_SYSVAR(bytes_per_sync),
    MYSQL_SYSVAR(wal_bytes_per_sync),
    MYSQL_SYSVAR(enable_thread_tracking),
    MYSQL_SYSVAR(perf_context_level),
    MYSQL_SYSVAR(wal_recovery_mode),
    MYSQL_SYSVAR(access_hint_on_compaction_start),
    MYSQL_SYSVAR(new_table_reader_for_compaction_inputs),
    MYSQL_SYSVAR(compaction_readahead_size),
    MYSQL_SYSVAR(allow_concurrent_memtable_write),
    MYSQL_SYSVAR(enable_write_thread_adaptive_yield),

    /* Block-based table / block cache options. */
    MYSQL_SYSVAR(block_cache_size),
    MYSQL_SYSVAR(sim_cache_size),
    MYSQL_SYSVAR(use_clock_cache),
    MYSQL_SYSVAR(cache_index_and_filter_blocks),
    MYSQL_SYSVAR(pin_l0_filter_and_index_blocks_in_cache),
    MYSQL_SYSVAR(index_type),
    MYSQL_SYSVAR(hash_index_allow_collision),
    MYSQL_SYSVAR(no_block_cache),
    MYSQL_SYSVAR(block_size),
    MYSQL_SYSVAR(block_size_deviation),
    MYSQL_SYSVAR(block_restart_interval),
    MYSQL_SYSVAR(whole_key_filtering),

    /* Column-family option strings. */
    MYSQL_SYSVAR(default_cf_options),
    MYSQL_SYSVAR(override_cf_options),
    MYSQL_SYSVAR(update_cf_options),

    MYSQL_SYSVAR(flush_log_at_trx_commit),
    MYSQL_SYSVAR(write_disable_wal),
    MYSQL_SYSVAR(write_ignore_missing_column_families),

    MYSQL_SYSVAR(skip_fill_cache),
    MYSQL_SYSVAR(unsafe_for_binlog),

    /* Optimizer statistics knobs. */
    MYSQL_SYSVAR(records_in_range),
    MYSQL_SYSVAR(force_index_records_in_range),
    MYSQL_SYSVAR(debug_optimizer_n_rows),
    MYSQL_SYSVAR(force_compute_memtable_stats),
    MYSQL_SYSVAR(force_compute_memtable_stats_cachetime),
    MYSQL_SYSVAR(debug_optimizer_no_zero_cardinality),

    MYSQL_SYSVAR(compact_cf),
    MYSQL_SYSVAR(signal_drop_index_thread),
    MYSQL_SYSVAR(pause_background_work),
    MYSQL_SYSVAR(enable_2pc),
    MYSQL_SYSVAR(ignore_unknown_options),
    MYSQL_SYSVAR(strict_collation_check),
    MYSQL_SYSVAR(strict_collation_exceptions),
    MYSQL_SYSVAR(collect_sst_properties),
    MYSQL_SYSVAR(force_flush_memtable_now),
    MYSQL_SYSVAR(force_flush_memtable_and_lzero_now),
    MYSQL_SYSVAR(enable_ttl),
    MYSQL_SYSVAR(enable_ttl_read_filtering),
    MYSQL_SYSVAR(debug_ttl_rec_ts),
    MYSQL_SYSVAR(debug_ttl_snapshot_ts),
    MYSQL_SYSVAR(debug_ttl_read_filter_ts),
    MYSQL_SYSVAR(debug_ttl_ignore_pk),
    MYSQL_SYSVAR(reset_stats),
    MYSQL_SYSVAR(io_write_timeout),
    MYSQL_SYSVAR(seconds_between_stat_computes),

    MYSQL_SYSVAR(compaction_sequential_deletes),
    MYSQL_SYSVAR(compaction_sequential_deletes_window),
    MYSQL_SYSVAR(compaction_sequential_deletes_file_size),
    MYSQL_SYSVAR(compaction_sequential_deletes_count_sd),
    MYSQL_SYSVAR(print_snapshot_conflict_queries),

    MYSQL_SYSVAR(datadir),
    MYSQL_SYSVAR(supported_compression_types),
    MYSQL_SYSVAR(create_checkpoint),

    MYSQL_SYSVAR(checksums_pct),
    MYSQL_SYSVAR(store_row_debug_checksums),
    MYSQL_SYSVAR(verify_row_debug_checksums),
    MYSQL_SYSVAR(master_skip_tx_api),

    MYSQL_SYSVAR(validate_tables),
    MYSQL_SYSVAR(table_stats_sampling_pct),

    MYSQL_SYSVAR(large_prefix),
    MYSQL_SYSVAR(allow_to_start_after_corruption),
    MYSQL_SYSVAR(git_hash),
    nullptr};
1691 | |
1692 | static rocksdb::WriteOptions |
1693 | rdb_get_rocksdb_write_options(my_core::THD *const thd) { |
1694 | rocksdb::WriteOptions opt; |
1695 | |
1696 | opt.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC); |
1697 | opt.disableWAL = THDVAR(thd, write_disable_wal); |
1698 | opt.ignore_missing_column_families = |
1699 | THDVAR(thd, write_ignore_missing_column_families); |
1700 | |
1701 | return opt; |
1702 | } |
1703 | |
1704 | /////////////////////////////////////////////////////////////////////////////////////////// |
1705 | |
1706 | /** |
1707 | @brief |
1708 | Function we use in the creation of our hash to get key. |
1709 | */ |
1710 | |
1711 | uchar * |
1712 | Rdb_open_tables_map::get_hash_key(const Rdb_table_handler *const table_handler, |
1713 | size_t *const length, |
1714 | my_bool not_used MY_ATTRIBUTE((__unused__))) { |
1715 | *length = table_handler->m_table_name_length; |
1716 | return reinterpret_cast<uchar *>(table_handler->m_table_name); |
1717 | } |
1718 | |
1719 | /* |
1720 | Drop index thread's control |
1721 | */ |
1722 | |
1723 | static Rdb_drop_index_thread rdb_drop_idx_thread; |
1724 | |
1725 | static void rocksdb_drop_index_wakeup_thread( |
1726 | my_core::THD *const thd MY_ATTRIBUTE((__unused__)), |
1727 | struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)), |
1728 | void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) { |
1729 | if (*static_cast<const bool *>(save)) { |
1730 | rdb_drop_idx_thread.signal(); |
1731 | } |
1732 | } |
1733 | |
1734 | static inline uint32_t rocksdb_perf_context_level(THD *const thd) { |
1735 | DBUG_ASSERT(thd != nullptr); |
1736 | |
1737 | const int session_perf_context_level = THDVAR(thd, perf_context_level); |
1738 | if (session_perf_context_level > rocksdb::PerfLevel::kUninitialized) { |
1739 | return session_perf_context_level; |
1740 | } |
1741 | |
1742 | /* |
1743 | Fallback to global thdvar, if session specific one was not set to a valid |
1744 | value. |
1745 | */ |
1746 | |
1747 | const int global_perf_context_level = THDVAR(nullptr, perf_context_level); |
1748 | if (global_perf_context_level > rocksdb::PerfLevel::kUninitialized) { |
1749 | return global_perf_context_level; |
1750 | } |
1751 | |
1752 | return rocksdb::PerfLevel::kDisable; |
1753 | } |
1754 | |
1755 | /* |
1756 | Very short (functor-like) interface to be passed to |
1757 | Rdb_transaction::walk_tx_list() |
1758 | */ |
1759 | |
1760 | interface Rdb_tx_list_walker { |
1761 | virtual ~Rdb_tx_list_walker() {} |
1762 | virtual void process_tran(const Rdb_transaction *const) = 0; |
1763 | }; |
1764 | |
1765 | /* |
1766 | This is a helper class that is passed to RocksDB to get notifications when |
1767 | a snapshot gets created. |
1768 | */ |
1769 | |
1770 | class Rdb_snapshot_notifier : public rocksdb::TransactionNotifier { |
1771 | Rdb_transaction *m_owning_tx; |
1772 | |
1773 | void SnapshotCreated(const rocksdb::Snapshot *snapshot) override; |
1774 | |
1775 | public: |
1776 | Rdb_snapshot_notifier(const Rdb_snapshot_notifier &) = delete; |
1777 | Rdb_snapshot_notifier &operator=(const Rdb_snapshot_notifier &) = delete; |
1778 | |
1779 | explicit Rdb_snapshot_notifier(Rdb_transaction *const owning_tx) |
1780 | : m_owning_tx(owning_tx) {} |
1781 | |
1782 | // If the owning Rdb_transaction gets destructed we need to not reference |
1783 | // it anymore. |
1784 | void detach() { m_owning_tx = nullptr; } |
1785 | }; |
1786 | |
1787 | |
1788 | #ifdef MARIAROCKS_NOT_YET |
1789 | // ER_LOCK_WAIT_TIMEOUT error also has a reason in facebook/mysql-5.6 |
1790 | #endif |
1791 | String timeout_message(const char *command, const char *name1, |
1792 | const char *name2) |
1793 | { |
1794 | String msg; |
1795 | msg.append("Timeout on " ); |
1796 | msg.append(command); |
1797 | msg.append(": " ); |
1798 | msg.append(name1); |
1799 | if (name2 && name2[0]) |
1800 | { |
1801 | msg.append("." ); |
1802 | msg.append(name2); |
1803 | } |
1804 | return msg; |
1805 | } |
1806 | |
1807 | |
/* This is the base class for transactions when interacting with rocksdb.
*/
class Rdb_transaction {
 protected:
  /* Row-operation counters accumulated over the life of the transaction. */
  ulonglong m_write_count = 0;
  ulonglong m_insert_count = 0;
  ulonglong m_update_count = 0;
  ulonglong m_delete_count = 0;
  ulonglong m_lock_count = 0;
  /* Pending AUTO_INCREMENT maxima per index; merged into the data
     dictionary by merge_auto_incr_map(). */
  std::unordered_map<GL_INDEX_ID, ulonglong> m_auto_incr_map;

  bool m_is_delayed_snapshot = false;
  bool m_is_two_phase = false;

 private:
  /* Number of RockDB savepoints taken */
  int m_n_savepoints;
  /*
    Number of write operations this transaction had when we took the last
    savepoint (the idea is not to take another savepoint if we haven't made
    any changes)
  */
  ulonglong m_writes_at_last_savepoint;

 protected:
  THD *m_thd = nullptr;

  rocksdb::ReadOptions m_read_opts;

  /* Global registry of live transactions and its guarding mutex; used by
     walk_tx_list() and managed via init_mutex()/term_mutex(). */
  static std::multiset<Rdb_transaction *> s_tx_list;
  static mysql_mutex_t s_tx_list_mutex;

  /* Non-null while perf-context stats are being gathered for some table
     (see io_perf_start()/io_perf_end_and_record()). */
  Rdb_io_perf *m_tbl_io_perf;

  bool m_tx_read_only = false;

  int m_timeout_sec; /* Cached value of @@rocksdb_lock_wait_timeout */

  /* Maximum number of locks the transaction can have */
  ulonglong m_max_row_locks;

  bool m_is_tx_failed = false;
  bool m_rollback_only = false;

  std::shared_ptr<Rdb_snapshot_notifier> m_notifier;

  // This should be used only when updating binlog information.
  virtual rocksdb::WriteBatchBase *get_write_batch() = 0;
  virtual bool commit_no_binlog() = 0;
  virtual rocksdb::Iterator *
  get_iterator(const rocksdb::ReadOptions &options,
               rocksdb::ColumnFamilyHandle *column_family) = 0;

 protected:
  /*
    The following two are helper functions to be overloaded by child classes.
    They should provide RocksDB's savepoint semantics.
  */
  virtual void do_set_savepoint() = 0;
  virtual void do_rollback_to_savepoint() = 0;
1868 | |
1869 | /* |
1870 | @detail |
1871 | This function takes in the WriteBatch of the transaction to add |
1872 | all the AUTO_INCREMENT merges. It does so by iterating through |
1873 | m_auto_incr_map and then constructing key/value pairs to call merge upon. |
1874 | |
1875 | @param wb |
1876 | */ |
1877 | rocksdb::Status merge_auto_incr_map(rocksdb::WriteBatchBase *const wb) { |
1878 | DBUG_EXECUTE_IF("myrocks_autoinc_upgrade" , return rocksdb::Status::OK();); |
1879 | |
1880 | // Iterate through the merge map merging all keys into data dictionary. |
1881 | rocksdb::Status s; |
1882 | for (auto &it : m_auto_incr_map) { |
1883 | s = dict_manager.put_auto_incr_val(wb, it.first, it.second); |
1884 | if (!s.ok()) { |
1885 | return s; |
1886 | } |
1887 | } |
1888 | m_auto_incr_map.clear(); |
1889 | return s; |
1890 | } |
1891 | |
 public:
  /* Binlog position captured at commit time (used for backup/restore). */
  const char *m_mysql_log_file_name;
  my_off_t m_mysql_log_offset;
#ifdef MARIAROCKS_NOT_YET
  // TODO: MariaDB probably doesn't need these at all:
  const char *m_mysql_gtid;
  const char *m_mysql_max_gtid;
#endif
  /* Detailed text of the last lock timeout/deadlock, filled by
     set_status_error(). */
  String m_detailed_error;
  int64_t m_snapshot_timestamp = 0;
  bool m_ddl_transaction;

  /*
    Tracks the number of tables in use through external_lock.
    This should not be reset during start_tx().
  */
  int64_t m_n_mysql_tables_in_use = 0;

  /*
    MariaDB's group commit:
  */
  bool commit_ordered_done;
  bool commit_ordered_res;

  /*
    for distinction between rdb_transaction_impl and rdb_writebatch_impl
    when using walk tx list
  */
  virtual bool is_writebatch_trx() const = 0;
1921 | |
  /* Create the mutex that guards the global transaction list (s_tx_list). */
  static void init_mutex() {
    mysql_mutex_init(key_mutex_tx_list, &s_tx_list_mutex, MY_MUTEX_INIT_FAST);
  }

  /* Destroy the tx-list mutex; asserts that no transaction is still alive. */
  static void term_mutex() {
    DBUG_ASSERT(s_tx_list.size() == 0);
    mysql_mutex_destroy(&s_tx_list_mutex);
  }
1930 | |
1931 | static void walk_tx_list(Rdb_tx_list_walker *walker) { |
1932 | DBUG_ASSERT(walker != nullptr); |
1933 | |
1934 | RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex); |
1935 | |
1936 | for (auto it : s_tx_list) |
1937 | walker->process_tran(it); |
1938 | |
1939 | RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex); |
1940 | } |
1941 | |
  /*
    Translate a non-OK rocksdb::Status from a row operation into a MySQL
    handler error code, updating per-table and global lock counters and
    filling m_detailed_error for lock-related failures.
  */
  int set_status_error(THD *const thd, const rocksdb::Status &s,
                       const Rdb_key_def &kd, Rdb_tbl_def *const tbl_def,
                       Rdb_table_handler *const table_handler) {
    DBUG_ASSERT(!s.ok());
    DBUG_ASSERT(tbl_def != nullptr);

    if (s.IsTimedOut()) {
      /*
        SQL layer has weird expectations. If we return an error when
        doing a read in DELETE IGNORE, it will ignore the error ("because it's
        an IGNORE command!) but then will fail an assert, because "error code
        was returned, but no error happened".  Do what InnoDB's
        convert_error_code_to_mysql() does: force a statement
        rollback before returning HA_ERR_LOCK_WAIT_TIMEOUT:
      */
      my_core::thd_mark_transaction_to_rollback(thd, false /*just statement*/);
      m_detailed_error.copy(timeout_message(
          "index" , tbl_def->full_tablename().c_str(), kd.get_name().c_str()));
      table_handler->m_lock_wait_timeout_counter.inc();
      rocksdb_row_lock_wait_timeouts++;

      return HA_ERR_LOCK_WAIT_TIMEOUT;
    }

    if (s.IsDeadlock()) {
      my_core::thd_mark_transaction_to_rollback(thd,
                                                false /* just statement */);
      m_detailed_error = String();
      table_handler->m_deadlock_counter.inc();
      rocksdb_row_lock_deadlocks++;
      return HA_ERR_LOCK_DEADLOCK;
    } else if (s.IsBusy()) {
      /* Snapshot conflict: reported to the client as a deadlock so it
         retries the transaction. */
      rocksdb_snapshot_conflict_errors++;
      if (rocksdb_print_snapshot_conflict_queries) {
        char user_host_buff[MAX_USER_HOST_SIZE + 1];
        make_user_name(thd, user_host_buff);
        // NO_LINT_DEBUG
        sql_print_warning("Got snapshot conflict errors: User: %s "
                          "Query: %s" ,
                          user_host_buff, thd->query());
      }
      m_detailed_error = String(" (snapshot conflict)" , system_charset_info);
      table_handler->m_deadlock_counter.inc();
      return HA_ERR_LOCK_DEADLOCK;
    }

    if (s.IsIOError() || s.IsCorruption()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL);
    }

    /* Everything else goes through the generic status-to-mysql mapping. */
    return ha_rocksdb::rdb_error_to_mysql(s);
  }
1994 | |
  /* Session (THD) that owns this transaction. */
  THD *get_thd() const { return m_thd; }

  /* Used for tracking io_perf counters */
  void io_perf_start(Rdb_io_perf *const io_perf) {
    /*
      Since perf_context is tracked per thread, it is difficult and expensive
      to maintain perf_context on a per table basis. Therefore, roll all
      perf_context data into the first table used in a query. This works well
      for single table queries and is probably good enough for queries that hit
      multiple tables.

      perf_context stats gathering is started when the table lock is acquired
      or when ha_rocksdb::start_stmt is called in case of LOCK TABLES. They
      are recorded when the table lock is released, or when commit/rollback
      is called on the transaction, whichever comes first. Table lock release
      and commit/rollback can happen in different orders. In the case where
      the lock is released before commit/rollback is called, an extra step to
      gather stats during commit/rollback is needed.
    */
    if (m_tbl_io_perf == nullptr &&
        io_perf->start(rocksdb_perf_context_level(m_thd))) {
      m_tbl_io_perf = io_perf;
    }
  }

  /* Record the gathered stats and stop tracking, if tracking is active. */
  void io_perf_end_and_record(void) {
    if (m_tbl_io_perf != nullptr) {
      m_tbl_io_perf->end_and_record(rocksdb_perf_context_level(m_thd));
      m_tbl_io_perf = nullptr;
    }
  }

  /* Same as above, but only when 'io_perf' is the tracked counter set. */
  void io_perf_end_and_record(Rdb_io_perf *const io_perf) {
    if (m_tbl_io_perf == io_perf) {
      io_perf_end_and_record();
    }
  }

  /* Attribute written bytes to the tracked table's counters, if any. */
  void update_bytes_written(ulonglong bytes_written) {
    if (m_tbl_io_perf != nullptr) {
      m_tbl_io_perf->update_bytes_written(rocksdb_perf_context_level(m_thd),
                                          bytes_written);
    }
  }

  /* Cache the timeout/max-locks settings and push the lock timeout down to
     the underlying RocksDB transaction. */
  void set_params(int timeout_sec_arg, int max_row_locks_arg) {
    m_timeout_sec = timeout_sec_arg;
    m_max_row_locks = max_row_locks_arg;
    set_lock_timeout(timeout_sec_arg);
  }

  virtual void set_lock_timeout(int timeout_sec_arg) = 0;
2047 | |
  /* Accessors for the per-transaction row-operation counters. */
  ulonglong get_write_count() const { return m_write_count; }

  ulonglong get_insert_count() const { return m_insert_count; }

  ulonglong get_update_count() const { return m_update_count; }

  ulonglong get_delete_count() const { return m_delete_count; }

  void incr_insert_count() { ++m_insert_count; }

  void incr_update_count() { ++m_update_count; }

  void incr_delete_count() { ++m_delete_count; }

  int get_timeout_sec() const { return m_timeout_sec; }

  ulonglong get_lock_count() const { return m_lock_count; }

  virtual void set_sync(bool sync) = 0;

  virtual void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                            const std::string &rowkey) = 0;

  virtual bool prepare(const rocksdb::TransactionName &name) = 0;
2072 | |
2073 | bool commit_or_rollback() { |
2074 | bool res; |
2075 | if (m_is_tx_failed) { |
2076 | rollback(); |
2077 | res = false; |
2078 | } else |
2079 | res = commit(); |
2080 | return res; |
2081 | } |
2082 | |
  /*
    Commit the transaction; returns true on error.  Empty transactions are
    rolled back instead, and rollback-only transactions are rolled back with
    an error raised.
  */
  bool commit() {
    if (get_write_count() == 0) {
      // Nothing was written; a rollback is equivalent and cheaper.
      rollback();
      return false;
    } else if (m_rollback_only) {
      /*
        Transactions marked as rollback_only are expected to be rolled back at
        prepare(). But there are some exceptions like below that prepare() is
        never called and commit() is called instead.
        1. Binlog is disabled
        2. No modification exists in binlog cache for the transaction (#195)
        In both cases, rolling back transaction is safe. Nothing is written to
        binlog.
      */
      my_error(ER_ROLLBACK_ONLY, MYF(0));
      rollback();
      return true;
    } else {
#ifdef MARIAROCKS_NOT_YET
      /*
        Storing binlog position inside MyRocks is needed only for restoring
        MyRocks from backups. This feature is not supported yet.
      */
      mysql_bin_log_commit_pos(m_thd, &m_mysql_log_offset,
                               &m_mysql_log_file_name);
      binlog_manager.update(m_mysql_log_file_name, m_mysql_log_offset,
                            get_write_batch());
#endif
      return commit_no_binlog();
    }
  }
2114 | |
  virtual void rollback() = 0;

  /* Called (via Rdb_snapshot_notifier) when RocksDB creates a snapshot for
     this transaction; records the snapshot and its creation time. */
  void snapshot_created(const rocksdb::Snapshot *const snapshot) {
    DBUG_ASSERT(snapshot != nullptr);

    m_read_opts.snapshot = snapshot;
    rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
    m_is_delayed_snapshot = false;
  }

  virtual void acquire_snapshot(bool acquire_now) = 0;
  virtual void release_snapshot() = 0;

  bool has_snapshot() const { return m_read_opts.snapshot != nullptr; }

 private:
  // The Rdb_sst_info structures we are currently loading. In a partitioned
  // table this can have more than one entry
  std::vector<std::shared_ptr<Rdb_sst_info>> m_curr_bulk_load;
  std::string m_curr_bulk_load_tablename;

  /* External merge sorts for bulk load: key ID -> merge sort instance */
  std::unordered_map<GL_INDEX_ID, Rdb_index_merge> m_key_merge;
2138 | |
2139 | public: |
2140 | int get_key_merge(GL_INDEX_ID kd_gl_id, rocksdb::ColumnFamilyHandle *cf, |
2141 | Rdb_index_merge **key_merge) { |
2142 | int res; |
2143 | auto it = m_key_merge.find(kd_gl_id); |
2144 | if (it == m_key_merge.end()) { |
2145 | m_key_merge.emplace( |
2146 | std::piecewise_construct, std::make_tuple(kd_gl_id), |
2147 | std::make_tuple( |
2148 | get_rocksdb_tmpdir(), THDVAR(get_thd(), merge_buf_size), |
2149 | THDVAR(get_thd(), merge_combine_read_size), |
2150 | THDVAR(get_thd(), merge_tmp_file_removal_delay_ms), cf)); |
2151 | it = m_key_merge.find(kd_gl_id); |
2152 | if ((res = it->second.init()) != 0) { |
2153 | return res; |
2154 | } |
2155 | } |
2156 | *key_merge = &it->second; |
2157 | return HA_EXIT_SUCCESS; |
2158 | } |
2159 | |
  /*
    Finalize all outstanding bulk-load work: commit every open Rdb_sst_info,
    then drain the external index-merge sort buffers into fresh SST files.
    The first error code seen is the one returned.
  */
  int finish_bulk_load(int print_client_error = true) {
    int rc = 0, rc2;

    /* Commit every open SST writer; keep going on error so all get closed. */
    std::vector<std::shared_ptr<Rdb_sst_info>>::iterator it;
    for (it = m_curr_bulk_load.begin(); it != m_curr_bulk_load.end(); it++) {
      rc2 = (*it)->commit(print_client_error);
      if (rc2 != 0 && rc == 0) {
        rc = rc2;
      }
    }
    m_curr_bulk_load.clear();
    m_curr_bulk_load_tablename.clear();
    DBUG_ASSERT(m_curr_bulk_load.size() == 0);

    // Flush the index_merge sort buffers
    if (!m_key_merge.empty()) {
      rocksdb::Slice merge_key;
      rocksdb::Slice merge_val;
      for (auto it = m_key_merge.begin(); it != m_key_merge.end(); it++) {
        GL_INDEX_ID index_id = it->first;
        std::shared_ptr<const Rdb_key_def> keydef =
            ddl_manager.safe_find(index_id);
        std::string table_name = ddl_manager.safe_get_table_name(index_id);

        // Unable to find key definition or table name since the
        // table could have been dropped.
        // TODO(herman): there is a race here between dropping the table
        // and detecting a drop here. If the table is dropped while bulk
        // loading is finishing, these keys being added here may
        // be missed by the compaction filter and not be marked for
        // removal. It is unclear how to lock the sql table from the storage
        // engine to prevent modifications to it while bulk load is occurring.
        if (keydef == nullptr || table_name.empty()) {
          rc2 = HA_ERR_ROCKSDB_BULK_LOAD;
          break;
        }
        const std::string &index_name = keydef->get_name();
        Rdb_index_merge &rdb_merge = it->second;

        // Rdb_sst_info expects a denormalized table name in the form of
        // "./database/table"
        std::replace(table_name.begin(), table_name.end(), '.', '/');
        table_name = "./" + table_name;
        Rdb_sst_info sst_info(rdb, table_name, index_name, rdb_merge.get_cf(),
                              *rocksdb_db_options,
                              THDVAR(get_thd(), trace_sst_api));

        /* Stream the sorted (key, value) pairs into the SST writer. */
        while ((rc2 = rdb_merge.next(&merge_key, &merge_val)) == 0) {
          if ((rc2 = sst_info.put(merge_key, merge_val)) != 0) {
            break;
          }
        }

        // rc2 == -1 => finished ok; rc2 > 0 => error
        if (rc2 > 0 || (rc2 = sst_info.commit(print_client_error)) != 0) {
          if (rc == 0) {
            rc = rc2;
          }
          break;
        }
      }
      m_key_merge.clear();

      /*
        Explicitly tell jemalloc to clean up any unused dirty pages at this
        point.
        See https://reviews.facebook.net/D63723 for more details.
      */
      purge_all_jemalloc_arenas();
    }
    return rc;
  }
2232 | |
2233 | int start_bulk_load(ha_rocksdb *const bulk_load, |
2234 | std::shared_ptr<Rdb_sst_info> sst_info) { |
2235 | /* |
2236 | If we already have an open bulk load of a table and the name doesn't |
2237 | match the current one, close out the currently running one. This allows |
2238 | multiple bulk loads to occur on a partitioned table, but then closes |
2239 | them all out when we switch to another table. |
2240 | */ |
2241 | DBUG_ASSERT(bulk_load != nullptr); |
2242 | |
2243 | if (!m_curr_bulk_load.empty() && |
2244 | bulk_load->get_table_basename() != m_curr_bulk_load_tablename) { |
2245 | const auto res = finish_bulk_load(); |
2246 | if (res != HA_EXIT_SUCCESS) { |
2247 | m_curr_bulk_load.clear(); |
2248 | m_curr_bulk_load_tablename.clear(); |
2249 | return res; |
2250 | } |
2251 | } |
2252 | |
2253 | /* |
2254 | This used to track ha_rocksdb handler objects, but those can be |
2255 | freed by the table cache while this was referencing them. Instead |
2256 | of tracking ha_rocksdb handler objects, this now tracks the |
2257 | Rdb_sst_info allocated, and both the ha_rocksdb handler and the |
2258 | Rdb_transaction both have shared pointers to them. |
2259 | |
2260 | On transaction complete, it will commit each Rdb_sst_info structure found. |
2261 | If the ha_rocksdb object is freed, etc., it will also commit |
2262 | the Rdb_sst_info. The Rdb_sst_info commit path needs to be idempotent. |
2263 | */ |
2264 | m_curr_bulk_load.push_back(sst_info); |
2265 | m_curr_bulk_load_tablename = bulk_load->get_table_basename(); |
2266 | return HA_EXIT_SUCCESS; |
2267 | } |
2268 | |
2269 | int num_ongoing_bulk_load() const { return m_curr_bulk_load.size(); } |
2270 | |
2271 | const char *get_rocksdb_tmpdir() const { |
2272 | const char *tmp_dir = THDVAR(get_thd(), tmpdir); |
2273 | |
2274 | /* |
2275 | We want to treat an empty string as nullptr, in these cases DDL operations |
2276 | will use the default --tmpdir passed to mysql instead. |
2277 | */ |
2278 | if (tmp_dir != nullptr && *tmp_dir == '\0') { |
2279 | tmp_dir = nullptr; |
2280 | } |
2281 | return (tmp_dir); |
2282 | } |
2283 | |
2284 | /* |
2285 | Flush the data accumulated so far. This assumes we're doing a bulk insert. |
2286 | |
2287 | @detail |
2288 | This should work like transaction commit, except that we don't |
2289 | synchronize with the binlog (there is no API that would allow to have |
2290 | binlog flush the changes accumulated so far and return its current |
2291 | position) |
2292 | |
2293 | @todo |
2294 | Add test coverage for what happens when somebody attempts to do bulk |
2295 | inserts while inside a multi-statement transaction. |
2296 | */ |
2297 | bool flush_batch() { |
2298 | if (get_write_count() == 0) |
2299 | return false; |
2300 | |
2301 | /* Commit the current transaction */ |
2302 | if (commit_no_binlog()) |
2303 | return true; |
2304 | |
2305 | /* Start another one */ |
2306 | start_tx(); |
2307 | return false; |
2308 | } |
2309 | |
2310 | void set_auto_incr(const GL_INDEX_ID &gl_index_id, ulonglong curr_id) { |
2311 | m_auto_incr_map[gl_index_id] = |
2312 | std::max(m_auto_incr_map[gl_index_id], curr_id); |
2313 | } |
2314 | |
2315 | #ifndef NDEBUG |
2316 | ulonglong get_auto_incr(const GL_INDEX_ID &gl_index_id) { |
2317 | if (m_auto_incr_map.count(gl_index_id) > 0) { |
2318 | return m_auto_incr_map[gl_index_id]; |
2319 | } |
2320 | return 0; |
2321 | } |
2322 | #endif |
2323 | |
/* Write a key/value pair through the transaction. Implementations count
   the write against max_row_locks. */
virtual rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                            const rocksdb::Slice &key,
                            const rocksdb::Slice &value) = 0;
/* Delete a key through the transaction. */
virtual rocksdb::Status
delete_key(rocksdb::ColumnFamilyHandle *const column_family,
           const rocksdb::Slice &key) = 0;
/* SingleDelete variant - see RocksDB docs for its preconditions
   (key written at most once since the last delete). */
virtual rocksdb::Status
single_delete(rocksdb::ColumnFamilyHandle *const column_family,
              const rocksdb::Slice &key) = 0;

/* True if the transaction has any pending writes in its write batch. */
virtual bool has_modifications() const = 0;

/* WriteBatchWithIndex: writes skip locking but ARE visible to reads
   made through this transaction. */
virtual rocksdb::WriteBatchBase *get_indexed_write_batch() = 0;
2337 | /* |
2338 | Return a WriteBatch that one can write to. The writes will skip any |
2339 | transaction locking. The writes will NOT be visible to the transaction. |
2340 | */ |
2341 | rocksdb::WriteBatchBase *get_blind_write_batch() { |
2342 | return get_indexed_write_batch()->GetWriteBatch(); |
2343 | } |
2344 | |
/* Point lookup through the transaction (sees its own pending writes). */
virtual rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                            const rocksdb::Slice &key,
                            rocksdb::PinnableSlice *const value) = 0;
/* Locking read; `exclusive` selects write vs read lock where supported. */
virtual rocksdb::Status
get_for_update(rocksdb::ColumnFamilyHandle *const column_family,
               const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
               bool exclusive) = 0;
2352 | |
/*
  Build a ReadOptions for this transaction and open an iterator.

  @param skip_bloom_filter  when true, use total-order seek constrained by
                            the given equal-condition bounds; when false,
                            rely on the prefix bloom filter
  @param read_current       read the latest committed data (no snapshot)
  @param create_snapshot    acquire a snapshot before iterating

  NOTE(review): ReadOptions stores *pointers* to eq_cond_lower_bound /
  eq_cond_upper_bound; the caller must keep those slices alive for the
  whole lifetime of the returned iterator - confirm all call sites do.
*/
rocksdb::Iterator *
get_iterator(rocksdb::ColumnFamilyHandle *const column_family,
             bool skip_bloom_filter, bool fill_cache,
             const rocksdb::Slice &eq_cond_lower_bound,
             const rocksdb::Slice &eq_cond_upper_bound,
             bool read_current = false, bool create_snapshot = true) {
  // Make sure we are not doing both read_current (which implies we don't
  // want a snapshot) and create_snapshot which makes sure we create
  // a snapshot
  DBUG_ASSERT(column_family != nullptr);
  DBUG_ASSERT(!read_current || !create_snapshot);

  if (create_snapshot)
    acquire_snapshot(true);

  rocksdb::ReadOptions options = m_read_opts;

  if (skip_bloom_filter) {
    options.total_order_seek = true;
    options.iterate_lower_bound = &eq_cond_lower_bound;
    options.iterate_upper_bound = &eq_cond_upper_bound;
  } else {
    // With this option, Iterator::Valid() returns false if key
    // is outside of the prefix bloom filter range set at Seek().
    // Must not be set to true if not using bloom filter.
    options.prefix_same_as_start = true;
  }
  options.fill_cache = fill_cache;
  if (read_current) {
    // Bypass the snapshot: read the most recent committed state.
    options.snapshot = nullptr;
  }
  return get_iterator(options, column_family);
}
2386 | |
/* True when an underlying RocksDB transaction/batch is active. */
virtual bool is_tx_started() const = 0;
/* Begin a new underlying transaction (resets read options, savepoints). */
virtual void start_tx() = 0;
/* Hook called at the start of each statement within the transaction. */
virtual void start_stmt() = 0;
2390 | |
2391 | void set_initial_savepoint() { |
2392 | /* |
2393 | Set the initial savepoint. If the first statement in the transaction |
2394 | fails, we need something to roll back to, without rolling back the |
2395 | entire transaction. |
2396 | */ |
2397 | do_set_savepoint(); |
2398 | m_n_savepoints= 1; |
2399 | m_writes_at_last_savepoint= m_write_count; |
2400 | } |
2401 | |
2402 | /* |
2403 | Called when a "top-level" statement inside a transaction completes |
2404 | successfully and its changes become part of the transaction's changes. |
2405 | */ |
2406 | void make_stmt_savepoint_permanent() { |
2407 | |
2408 | // Take another RocksDB savepoint only if we had changes since the last |
2409 | // one. This is very important for long transactions doing lots of |
2410 | // SELECTs. |
2411 | if (m_writes_at_last_savepoint != m_write_count) |
2412 | { |
2413 | do_set_savepoint(); |
2414 | m_writes_at_last_savepoint= m_write_count; |
2415 | m_n_savepoints++; |
2416 | } |
2417 | } |
2418 | |
2419 | |
2420 | /* |
2421 | Rollback to the savepoint we've set before the last statement |
2422 | */ |
2423 | void rollback_to_stmt_savepoint() { |
2424 | if (m_writes_at_last_savepoint != m_write_count) { |
2425 | do_rollback_to_savepoint(); |
2426 | if (!--m_n_savepoints) { |
2427 | do_set_savepoint(); |
2428 | m_n_savepoints= 1; |
2429 | } |
2430 | m_writes_at_last_savepoint= m_write_count; |
2431 | } |
2432 | } |
2433 | |
/* Roll back the last statement only; the transaction itself continues. */
virtual void rollback_stmt() = 0;

/* Record whether the transaction has hit a failure. */
void set_tx_failed(bool failed_arg) { m_is_tx_failed = failed_arg; }
2437 | |
2438 | bool can_prepare() const { |
2439 | if (m_rollback_only) { |
2440 | my_error(ER_ROLLBACK_ONLY, MYF(0)); |
2441 | return false; |
2442 | } |
2443 | return true; |
2444 | } |
2445 | |
2446 | int rollback_to_savepoint(void *const savepoint) { |
2447 | if (has_modifications()) { |
2448 | my_error(ER_ROLLBACK_TO_SAVEPOINT, MYF(0)); |
2449 | m_rollback_only = true; |
2450 | return HA_EXIT_FAILURE; |
2451 | } |
2452 | return HA_EXIT_SUCCESS; |
2453 | } |
2454 | |
2455 | /* |
2456 | This is used by transactions started with "START TRANSACTION WITH " |
2457 | "CONSISTENT [ROCKSDB] SNAPSHOT". When tx_read_only is turned on, |
2458 | snapshot has to be created via DB::GetSnapshot(), not via Transaction |
2459 | API. |
2460 | */ |
2461 | bool is_tx_read_only() const { return m_tx_read_only; } |
2462 | |
2463 | bool is_two_phase() const { return m_is_two_phase; } |
2464 | |
2465 | void set_tx_read_only(bool val) { m_tx_read_only = val; } |
2466 | |
/* Register this transaction in the global s_tx_list under its mutex. */
explicit Rdb_transaction(THD *const thd)
    : m_thd(thd), m_tbl_io_perf(nullptr) {
  RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
  s_tx_list.insert(this);
  RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
}

/* Unregister from the global transaction list. */
virtual ~Rdb_transaction() {
  RDB_MUTEX_LOCK_CHECK(s_tx_list_mutex);
  s_tx_list.erase(this);
  RDB_MUTEX_UNLOCK_CHECK(s_tx_list_mutex);
}
2479 | }; |
2480 | |
2481 | /* |
2482 | This is a rocksdb transaction. Its members represent the current transaction, |
2483 | which consists of: |
2484 | - the snapshot |
2485 | - the changes we've made but are not seeing yet. |
2486 | |
2487 | The changes are made to individual tables, which store them here and then |
2488 | this object commits them on commit. |
2489 | */ |
class Rdb_transaction_impl : public Rdb_transaction {
  /* The active RocksDB transaction; nullptr when none is started. */
  rocksdb::Transaction *m_rocksdb_tx = nullptr;
  /* A finished transaction object preserved by release_tx() so that
     start_tx() can hand it back to BeginTransaction() for reuse instead
     of allocating a new one. */
  rocksdb::Transaction *m_rocksdb_reuse_tx = nullptr;

public:
  void set_lock_timeout(int timeout_sec_arg) override {
    if (m_rocksdb_tx)
      // NOTE(review): this applies m_timeout_sec rather than
      // timeout_sec_arg -- confirm the argument is intentionally unused
      // (m_timeout_sec is presumably set before this is called).
      m_rocksdb_tx->SetLockTimeout(rdb_convert_sec_to_ms(m_timeout_sec));
  }

  /* Toggle WAL sync-on-commit for this transaction's write options. */
  void set_sync(bool sync) override {
    if (m_rocksdb_tx)
      m_rocksdb_tx->GetWriteOptions()->sync = sync;
  }

  /* Release the row lock taken by a GetForUpdate on `rowkey`, unless the
     session asked to keep locks on scanned rows. */
  void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                    const std::string &rowkey) override {
    if (!THDVAR(m_thd, lock_scanned_rows)) {
      m_rocksdb_tx->UndoGetForUpdate(column_family, rocksdb::Slice(rowkey));
    }
  }

  virtual bool is_writebatch_trx() const override { return false; }

private:
  void release_tx(void) {
    // We are done with the current active transaction object. Preserve it
    // for later reuse.
    DBUG_ASSERT(m_rocksdb_reuse_tx == nullptr);
    m_rocksdb_reuse_tx = m_rocksdb_tx;
    m_rocksdb_tx = nullptr;
  }

  /*
    XA prepare: name the transaction, merge the tracked auto-increment
    values into its write batch, then Prepare(). Any failure is reported
    through rdb_handle_io_error() and returns false.
  */
  bool prepare(const rocksdb::TransactionName &name) override {
    rocksdb::Status s;
    s = m_rocksdb_tx->SetName(name);
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }

    s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }

    s = m_rocksdb_tx->Prepare();
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      return false;
    }
    return true;
  }

  /*
    Commit without binlog coordination. Returns true on error. Note that
    the per-transaction counters are reset and the transaction object is
    released for reuse on BOTH the success and the error path.
  */
  bool commit_no_binlog() override {
    bool res = false;
    rocksdb::Status s;

    s = merge_auto_incr_map(m_rocksdb_tx->GetWriteBatch()->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    release_snapshot();
    s = m_rocksdb_tx->Commit();
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

  error:
    /* Save the transaction object to be reused */
    release_tx();

    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    set_tx_read_only(false);
    m_rollback_only = false;
    return res;
  }

public:
  void rollback() override {
    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    m_auto_incr_map.clear();
    m_ddl_transaction = false;
    if (m_rocksdb_tx) {
      release_snapshot();
      /* This will also release all of the locks: */
      m_rocksdb_tx->Rollback();

      /* Save the transaction object to be reused */
      release_tx();

      set_tx_read_only(false);
      m_rollback_only = false;
    }
  }

  /*
    Acquire a snapshot if there is none yet. Read-only transactions take
    a DB-level snapshot; otherwise either take one now (acquire_now) or
    arm delayed acquisition on the next operation via m_notifier.
  */
  void acquire_snapshot(bool acquire_now) override {
    if (m_read_opts.snapshot == nullptr) {
      if (is_tx_read_only()) {
        snapshot_created(rdb->GetSnapshot());
      } else if (acquire_now) {
        m_rocksdb_tx->SetSnapshot();
        snapshot_created(m_rocksdb_tx->GetSnapshot());
      } else if (!m_is_delayed_snapshot) {
        m_rocksdb_tx->SetSnapshotOnNextOperation(m_notifier);
        m_is_delayed_snapshot = true;
      }
    }
  }

  /*
    Release the current snapshot, if any. Read-only snapshots go back to
    the DB; transaction-owned ones are cleared on the transaction object.
  */
  void release_snapshot() override {
    bool need_clear = m_is_delayed_snapshot;

    if (m_read_opts.snapshot != nullptr) {
      m_snapshot_timestamp = 0;
      if (is_tx_read_only()) {
        rdb->ReleaseSnapshot(m_read_opts.snapshot);
        need_clear = false;
      } else {
        need_clear = true;
      }
      m_read_opts.snapshot = nullptr;
    }

    if (need_clear && m_rocksdb_tx != nullptr)
      m_rocksdb_tx->ClearSnapshot();
  }

  bool has_snapshot() { return m_read_opts.snapshot != nullptr; }

  /* Locking write; fails with kLockLimit once max_row_locks is exceeded. */
  rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      const rocksdb::Slice &value) override {
    ++m_write_count;
    ++m_lock_count;
    if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    return m_rocksdb_tx->Put(column_family, key, value);
  }

  /* Locking delete; same max_row_locks enforcement as put(). */
  rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
                             const rocksdb::Slice &key) override {
    ++m_write_count;
    ++m_lock_count;
    if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    return m_rocksdb_tx->Delete(column_family, key);
  }

  /* Locking SingleDelete; same max_row_locks enforcement as put(). */
  rocksdb::Status
  single_delete(rocksdb::ColumnFamilyHandle *const column_family,
                const rocksdb::Slice &key) override {
    ++m_write_count;
    ++m_lock_count;
    if (m_write_count > m_max_row_locks || m_lock_count > m_max_row_locks)
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);
    return m_rocksdb_tx->SingleDelete(column_family, key);
  }

  bool has_modifications() const override {
    return m_rocksdb_tx->GetWriteBatch() &&
           m_rocksdb_tx->GetWriteBatch()->GetWriteBatch() &&
           m_rocksdb_tx->GetWriteBatch()->GetWriteBatch()->Count() > 0;
  }

  /* Under 2PC, writes placed here go into the commit-time batch instead
     of the regular transaction write batch. */
  rocksdb::WriteBatchBase *get_write_batch() override {
    if (is_two_phase()) {
      return m_rocksdb_tx->GetCommitTimeWriteBatch();
    }
    return m_rocksdb_tx->GetWriteBatch()->GetWriteBatch();
  }

  /*
    Return a WriteBatch that one can write to. The writes will skip any
    transaction locking. The writes WILL be visible to the transaction.
  */
  rocksdb::WriteBatchBase *get_indexed_write_batch() override {
    ++m_write_count;
    return m_rocksdb_tx->GetWriteBatch();
  }

  rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      rocksdb::PinnableSlice *const value) const override {
    // clean PinnableSlice right before Get() for multiple gets per statement
    // the resources after the last Get in a statement are cleared in
    // handler::reset call
    value->Reset();
    global_stats.queries[QUERIES_POINT].inc();
    return m_rocksdb_tx->Get(m_read_opts, column_family, key, value);
  }

  /* Locking read; counts against max_row_locks. `value` may be nullptr
     when only the lock (not the row image) is needed. */
  rocksdb::Status
  get_for_update(rocksdb::ColumnFamilyHandle *const column_family,
                 const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
                 bool exclusive) override {
    if (++m_lock_count > m_max_row_locks)
      return rocksdb::Status::Aborted(rocksdb::Status::kLockLimit);

    if (value != nullptr) {
      value->Reset();
    }
    return m_rocksdb_tx->GetForUpdate(m_read_opts, column_family, key, value,
                                      exclusive);
  }

  rocksdb::Iterator *
  get_iterator(const rocksdb::ReadOptions &options,
               rocksdb::ColumnFamilyHandle *const column_family) override {
    global_stats.queries[QUERIES_RANGE].inc();
    return m_rocksdb_tx->GetIterator(options, column_family);
  }

  const rocksdb::Transaction *get_rdb_trx() const { return m_rocksdb_tx; }

  bool is_tx_started() const override { return (m_rocksdb_tx != nullptr); }

  /*
    Begin a new RocksDB pessimistic transaction, configured from the
    session variables. Reuses the transaction object preserved by
    release_tx() when available.
  */
  void start_tx() override {
    rocksdb::TransactionOptions tx_opts;
    rocksdb::WriteOptions write_opts;
    tx_opts.set_snapshot = false;
    tx_opts.lock_timeout = rdb_convert_sec_to_ms(m_timeout_sec);
    tx_opts.deadlock_detect = THDVAR(m_thd, deadlock_detect);
    tx_opts.deadlock_detect_depth = THDVAR(m_thd, deadlock_detect_depth);
    tx_opts.max_write_batch_size = THDVAR(m_thd, write_batch_max_bytes);

    write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
    write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
    write_opts.ignore_missing_column_families =
        THDVAR(m_thd, write_ignore_missing_column_families);
    m_is_two_phase = rocksdb_enable_2pc;

    commit_ordered_done= false;

    /*
      If m_rocksdb_reuse_tx is null this will create a new transaction object.
      Otherwise it will reuse the existing one.
    */
    m_rocksdb_tx =
        rdb->BeginTransaction(write_opts, tx_opts, m_rocksdb_reuse_tx);
    m_rocksdb_reuse_tx = nullptr;

    m_read_opts = rocksdb::ReadOptions();

    set_initial_savepoint();

    m_ddl_transaction = false;
  }

  /* Implementations of do_*savepoint based on rocksdb::Transaction savepoints */
  void do_set_savepoint() override {
    m_rocksdb_tx->SetSavePoint();
  }

  void do_rollback_to_savepoint() override {
    m_rocksdb_tx->RollbackToSavePoint();
  }

  /*
    Start a statement inside a multi-statement transaction.

    @todo: are we sure this is called once (and not several times) per
    statement start?

    For hooking to start of statement that is its own transaction, see
    ha_rocksdb::external_lock().
  */
  void start_stmt() override {
    // Set the snapshot to delayed acquisition (SetSnapshotOnNextOperation)
    acquire_snapshot(false);
  }

  /*
    This must be called when last statement is rolled back, but the transaction
    continues
  */
  void rollback_stmt() override {
    /* TODO: here we must release the locks taken since the start_stmt() call */
    if (m_rocksdb_tx) {
      const rocksdb::Snapshot *const org_snapshot = m_rocksdb_tx->GetSnapshot();
      rollback_to_stmt_savepoint();

      // Rolling back the savepoint may have changed which snapshot the
      // transaction holds; resync m_read_opts and the timestamp.
      const rocksdb::Snapshot *const cur_snapshot = m_rocksdb_tx->GetSnapshot();
      if (org_snapshot != cur_snapshot) {
        if (org_snapshot != nullptr)
          m_snapshot_timestamp = 0;

        m_read_opts.snapshot = cur_snapshot;
        if (cur_snapshot != nullptr)
          rdb->GetEnv()->GetCurrentTime(&m_snapshot_timestamp);
        else
          m_is_delayed_snapshot = true;
      }
    }
  }

  explicit Rdb_transaction_impl(THD *const thd)
      : Rdb_transaction(thd), m_rocksdb_tx(nullptr) {
    // Create a notifier that can be called when a snapshot gets generated.
    m_notifier = std::make_shared<Rdb_snapshot_notifier>(this);
  }

  virtual ~Rdb_transaction_impl() {
    rollback();

    // Theoretically the notifier could outlive the Rdb_transaction_impl
    // (because of the shared_ptr), so let it know it can't reference
    // the transaction anymore.
    m_notifier->detach();

    // Free any transaction memory that is still hanging around.
    delete m_rocksdb_reuse_tx;
    DBUG_ASSERT(m_rocksdb_tx == nullptr);
  }
};
2819 | |
/* This is a rocksdb write batch. This class doesn't hold or wait on any
   transaction locks (skips rocksdb transaction API) thus giving better
   performance. The commit is done through rdb->GetBaseDB()->Commit().

   Currently this is only used for replication threads which are guaranteed
   to be non-conflicting. Any further usage of this class should be thought
   through thoroughly.
*/
class Rdb_writebatch_impl : public Rdb_transaction {
  /* The indexed write batch holding all pending writes; allocated in the
     constructor and kept for the object's whole lifetime. */
  rocksdb::WriteBatchWithIndex *m_batch;
  rocksdb::WriteOptions write_opts;
  // Called after commit/rollback.
  void reset() {
    m_batch->Clear();
    m_read_opts = rocksdb::ReadOptions();
    m_ddl_transaction = false;
  }

private:
  /* No two-phase commit for write batches: prepare is a no-op success. */
  bool prepare(const rocksdb::TransactionName &name) override { return true; }

  /*
    Commit by writing the batch directly to the base DB (bypasses the
    transaction API). Returns true on error. Counters are reset on both
    the success and the error path.
  */
  bool commit_no_binlog() override {
    bool res = false;
    rocksdb::Status s;

    s = merge_auto_incr_map(m_batch->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }

    release_snapshot();

    s = rdb->GetBaseDB()->Write(write_opts, m_batch->GetWriteBatch());
    if (!s.ok()) {
      rdb_handle_io_error(s, RDB_IO_ERROR_TX_COMMIT);
      res = true;
      goto error;
    }
  error:
    reset();

    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    set_tx_read_only(false);
    m_rollback_only = false;
    return res;
  }

protected:
  /* Implementations of do_*savepoint based on rocksdb::WriteBatch savepoints */
  void do_set_savepoint() override {
    m_batch->SetSavePoint();
  }

  void do_rollback_to_savepoint() override {
    m_batch->RollbackToSavePoint();
  }

public:
  bool is_writebatch_trx() const override { return true; }

  void set_lock_timeout(int timeout_sec_arg) override {
    // Nothing to do here.
  }

  void set_sync(bool sync) override { write_opts.sync = sync; }

  void release_lock(rocksdb::ColumnFamilyHandle *const column_family,
                    const std::string &rowkey) override {
    // Nothing to do here since we don't hold any row locks.
  }

  void rollback() override {
    m_write_count = 0;
    m_insert_count = 0;
    m_update_count = 0;
    m_delete_count = 0;
    m_lock_count = 0;
    release_snapshot();

    reset();
    set_tx_read_only(false);
    m_rollback_only = false;
  }

  /* Snapshots always come straight from the DB (no transaction API). */
  void acquire_snapshot(bool acquire_now) override {
    if (m_read_opts.snapshot == nullptr)
      snapshot_created(rdb->GetSnapshot());
  }

  void release_snapshot() override {
    if (m_read_opts.snapshot != nullptr) {
      rdb->ReleaseSnapshot(m_read_opts.snapshot);
      m_read_opts.snapshot = nullptr;
    }
  }

  rocksdb::Status put(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      const rocksdb::Slice &value) override {
    ++m_write_count;
    m_batch->Put(column_family, key, value);
    // Note Put/Delete in write batch doesn't return any error code. We simply
    // return OK here.
    return rocksdb::Status::OK();
  }

  rocksdb::Status delete_key(rocksdb::ColumnFamilyHandle *const column_family,
                             const rocksdb::Slice &key) override {
    ++m_write_count;
    m_batch->Delete(column_family, key);
    return rocksdb::Status::OK();
  }

  rocksdb::Status
  single_delete(rocksdb::ColumnFamilyHandle *const column_family,
                const rocksdb::Slice &key) override {
    ++m_write_count;
    m_batch->SingleDelete(column_family, key);
    return rocksdb::Status::OK();
  }

  bool has_modifications() const override {
    return m_batch->GetWriteBatch()->Count() > 0;
  }

  rocksdb::WriteBatchBase *get_write_batch() override { return m_batch; }

  rocksdb::WriteBatchBase *get_indexed_write_batch() override {
    ++m_write_count;
    return m_batch;
  }

  /* Read through the batch first, falling back to the DB. */
  rocksdb::Status get(rocksdb::ColumnFamilyHandle *const column_family,
                      const rocksdb::Slice &key,
                      rocksdb::PinnableSlice *const value) const override {
    value->Reset();
    return m_batch->GetFromBatchAndDB(rdb, m_read_opts, column_family, key,
                                      value);
  }

  /* No locking in this implementation: this is a plain get(), and the
     `exclusive` flag is ignored. */
  rocksdb::Status
  get_for_update(rocksdb::ColumnFamilyHandle *const column_family,
                 const rocksdb::Slice &key, rocksdb::PinnableSlice *const value,
                 bool exclusive) override {
    return get(column_family, key, value);
  }

  /* Iterate over the DB overlaid with this batch's pending writes. */
  rocksdb::Iterator *
  get_iterator(const rocksdb::ReadOptions &options,
               rocksdb::ColumnFamilyHandle *const column_family) override {
    const auto it = rdb->NewIterator(options);
    return m_batch->NewIteratorWithBase(it);
  }

  bool is_tx_started() const override { return (m_batch != nullptr); }

  void start_tx() override {
    commit_ordered_done= false; // Do we need this here?
    reset();
    write_opts.sync = (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC);
    write_opts.disableWAL = THDVAR(m_thd, write_disable_wal);
    write_opts.ignore_missing_column_families =
        THDVAR(m_thd, write_ignore_missing_column_families);

    set_initial_savepoint();
  }

  void start_stmt() override {}

  void rollback_stmt() override {
    if (m_batch)
      rollback_to_stmt_savepoint();
  }

  explicit Rdb_writebatch_impl(THD *const thd)
      : Rdb_transaction(thd), m_batch(nullptr) {
    m_batch = new rocksdb::WriteBatchWithIndex(rocksdb::BytewiseComparator(), 0,
                                               true);
  }

  virtual ~Rdb_writebatch_impl() {
    rollback();
    delete m_batch;
  }
};
3010 | |
3011 | void Rdb_snapshot_notifier::SnapshotCreated( |
3012 | const rocksdb::Snapshot *const snapshot) { |
3013 | if (m_owning_tx != nullptr) { |
3014 | m_owning_tx->snapshot_created(snapshot); |
3015 | } |
3016 | } |
3017 | |
/* Global registry of all live transactions, guarded by s_tx_list_mutex. */
std::multiset<Rdb_transaction *> Rdb_transaction::s_tx_list;
mysql_mutex_t Rdb_transaction::s_tx_list_mutex;
3020 | |
3021 | static Rdb_transaction *&get_tx_from_thd(THD *const thd) { |
3022 | return *reinterpret_cast<Rdb_transaction **>( |
3023 | my_core::thd_ha_data(thd, rocksdb_hton)); |
3024 | } |
3025 | |
namespace {

/*
  RAII guard for perf-context recording. Two modes:
  - constructed with an Rdb_io_perf: starts recording at `level` and ends
    it in the destructor;
  - constructed with an Rdb_transaction: routes recording through the
    transaction's io_perf_start()/io_perf_end_and_record().
  Exactly one of m_io_perf_ptr / m_tx is active per instance.
*/
class Rdb_perf_context_guard {
  Rdb_io_perf m_io_perf;
  Rdb_io_perf *m_io_perf_ptr;
  Rdb_transaction *m_tx;
  uint m_level;

 public:
  Rdb_perf_context_guard(const Rdb_perf_context_guard &) = delete;
  Rdb_perf_context_guard &operator=(const Rdb_perf_context_guard &) = delete;

  explicit Rdb_perf_context_guard(Rdb_io_perf *io_perf, uint level)
      : m_io_perf_ptr(io_perf), m_tx(nullptr), m_level(level) {
    m_io_perf_ptr->start(m_level);
  }

  explicit Rdb_perf_context_guard(Rdb_transaction *tx, uint level)
      : m_io_perf_ptr(nullptr), m_tx(tx), m_level(level) {
    /*
      if perf_context information is already being recorded, this becomes a
      no-op
    */
    if (tx != nullptr) {
      tx->io_perf_start(&m_io_perf);
    }
  }

  ~Rdb_perf_context_guard() {
    if (m_tx != nullptr) {
      m_tx->io_perf_end_and_record();
    } else if (m_io_perf_ptr != nullptr) {
      m_io_perf_ptr->end_and_record(m_level);
    }
  }
};

}  // anonymous namespace
3064 | |
3065 | /* |
3066 | TODO: maybe, call this in external_lock() and store in ha_rocksdb.. |
3067 | */ |
3068 | |
/*
  Return the connection's transaction object, creating and starting one if
  needed. Session parameters (lock timeout, max row locks) are refreshed on
  every call.

  NOTE: the two branches cannot be unified into "create, then start if
  !is_tx_started()": a freshly constructed Rdb_writebatch_impl allocates
  its batch in the constructor, so is_tx_started() is already true before
  start_tx() has run -- the creation branch must call start_tx()
  unconditionally.
*/
static Rdb_transaction *get_or_create_tx(THD *const thd) {
  Rdb_transaction *&tx = get_tx_from_thd(thd);
  // TODO: this is called too many times.. O(#rows)
  if (tx == nullptr) {
    bool rpl_skip_tx_api= false; // MARIAROCKS_NOT_YET.
    // Replication threads may use the lock-free write-batch implementation;
    // everything else gets a full pessimistic transaction.
    if ((rpl_skip_tx_api && thd->rgi_slave) ||
        false /* MARIAROCKS_NOT_YET: THDVAR(thd, master_skip_tx_api) && !thd->rgi_slave)*/)
    {
      tx = new Rdb_writebatch_impl(thd);
    }
    else
    {
      tx = new Rdb_transaction_impl(thd);
    }
    tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
    tx->start_tx();
  } else {
    tx->set_params(THDVAR(thd, lock_wait_timeout), THDVAR(thd, max_row_locks));
    if (!tx->is_tx_started()) {
      tx->start_tx();
    }
  }

  return tx;
}
3094 | |
3095 | static int rocksdb_close_connection(handlerton *const hton, THD *const thd) { |
3096 | Rdb_transaction *&tx = get_tx_from_thd(thd); |
3097 | if (tx != nullptr) { |
3098 | int rc = tx->finish_bulk_load(false); |
3099 | if (rc != 0) { |
3100 | // NO_LINT_DEBUG |
3101 | sql_print_error("RocksDB: Error %d finalizing last SST file while " |
3102 | "disconnecting" , |
3103 | rc); |
3104 | } |
3105 | |
3106 | delete tx; |
3107 | tx = nullptr; |
3108 | } |
3109 | return HA_EXIT_SUCCESS; |
3110 | } |
3111 | |
3112 | /* |
3113 | * Serializes an xid to a string so that it can |
3114 | * be used as a rocksdb transaction name |
3115 | */ |
3116 | static std::string rdb_xid_to_string(const XID &src) { |
3117 | DBUG_ASSERT(src.gtrid_length >= 0 && src.gtrid_length <= MAXGTRIDSIZE); |
3118 | DBUG_ASSERT(src.bqual_length >= 0 && src.bqual_length <= MAXBQUALSIZE); |
3119 | |
3120 | std::string buf; |
3121 | buf.reserve(RDB_XIDHDR_LEN + src.gtrid_length + src.bqual_length); |
3122 | |
3123 | /* |
3124 | * expand formatID to fill 8 bytes if it doesn't already |
3125 | * then reinterpret bit pattern as unsigned and store in network order |
3126 | */ |
3127 | uchar fidbuf[RDB_FORMATID_SZ]; |
3128 | int64 signed_fid8 = src.formatID; |
3129 | const uint64 raw_fid8 = *reinterpret_cast<uint64 *>(&signed_fid8); |
3130 | rdb_netbuf_store_uint64(fidbuf, raw_fid8); |
3131 | buf.append(reinterpret_cast<const char *>(fidbuf), RDB_FORMATID_SZ); |
3132 | |
3133 | buf.push_back(src.gtrid_length); |
3134 | buf.push_back(src.bqual_length); |
3135 | buf.append(src.data, (src.gtrid_length) + (src.bqual_length)); |
3136 | return buf; |
3137 | } |
3138 | |
3139 | #if 0 |
3140 | // MARIAROCKS: MariaDB doesn't have flush_wal method |
3141 | /** |
3142 | Called by hton->flush_logs after MySQL group commit prepares a set of |
3143 | transactions. |
3144 | */ |
3145 | static bool rocksdb_flush_wal(handlerton* hton __attribute__((__unused__))) |
3146 | DBUG_ASSERT(rdb != nullptr); |
3147 | |
3148 | rocksdb::Status s; |
3149 | /* |
3150 | target_lsn is set to 0 when MySQL wants to sync the wal files |
3151 | */ |
3152 | if ((target_lsn == 0 && !rocksdb_db_options->allow_mmap_writes) || |
3153 | rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) { |
3154 | rocksdb_wal_group_syncs++; |
3155 | s = rdb->FlushWAL(target_lsn == 0 || |
3156 | rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC); |
3157 | } |
3158 | |
3159 | if (!s.ok()) { |
3160 | rdb_log_status_error(s); |
3161 | return HA_EXIT_FAILURE; |
3162 | } |
3163 | return HA_EXIT_SUCCESS; |
3164 | } |
3165 | #endif |
3166 | |
3167 | /** |
3168 | For a slave, prepare() updates the slave_gtid_info table which tracks the |
3169 | replication progress. |
3170 | */ |
3171 | static int rocksdb_prepare(handlerton* hton, THD* thd, bool prepare_tx) |
3172 | { |
3173 | bool async=false; // This is "ASYNC_COMMIT" feature which is only present in webscalesql |
3174 | |
3175 | Rdb_transaction *&tx = get_tx_from_thd(thd); |
3176 | if (!tx->can_prepare()) { |
3177 | return HA_EXIT_FAILURE; |
3178 | } |
3179 | if (prepare_tx || |
3180 | (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) { |
3181 | /* We were instructed to prepare the whole transaction, or |
3182 | this is an SQL statement end and autocommit is on */ |
3183 | |
3184 | #ifdef MARIAROCKS_NOT_YET |
3185 | /* |
3186 | Storing binlog position inside MyRocks is needed only for restoring |
3187 | MyRocks from backups. This feature is not supported yet. |
3188 | */ |
3189 | std::vector<st_slave_gtid_info> slave_gtid_info; |
3190 | my_core::thd_slave_gtid_info(thd, &slave_gtid_info); |
3191 | for (const auto &it : slave_gtid_info) { |
3192 | rocksdb::WriteBatchBase *const write_batch = tx->get_blind_write_batch(); |
3193 | binlog_manager.update_slave_gtid_info(it.id, it.db, it.gtid, write_batch); |
3194 | } |
3195 | #endif |
3196 | |
3197 | if (tx->is_two_phase()) { |
3198 | |
3199 | /* |
3200 | MariaDB: the following branch is never taken. |
3201 | We always flush at Prepare and rely on RocksDB's internal Group Commit |
3202 | to do some grouping. |
3203 | */ |
3204 | if (thd->durability_property == HA_IGNORE_DURABILITY || async) { |
3205 | tx->set_sync(false); |
3206 | } |
3207 | |
3208 | /* |
3209 | MariaDB: do not flush logs if we are running in a non-crash-safe mode. |
3210 | */ |
3211 | if (!rocksdb_flush_log_at_trx_commit) |
3212 | tx->set_sync(false); |
3213 | |
3214 | XID xid; |
3215 | thd_get_xid(thd, reinterpret_cast<MYSQL_XID *>(&xid)); |
3216 | if (!tx->prepare(rdb_xid_to_string(xid))) { |
3217 | return HA_EXIT_FAILURE; |
3218 | } |
3219 | |
3220 | /* |
3221 | MariaDB: our Group Commit implementation does not use the |
3222 | hton->flush_logs call (at least currently) so the following is not |
3223 | needed (TODO: will we need this for binlog rotation?) |
3224 | */ |
3225 | #ifdef MARIAROCKS_NOT_YET |
3226 | if (thd->durability_property == HA_IGNORE_DURABILITY ) |
3227 | (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER)) |
3228 | && |
3229 | THDVAR(thd, flush_log_at_trx_commit)) |
3230 | #endif |
3231 | #ifdef MARIAROCKS_NOT_YET |
3232 | { |
3233 | // MariaRocks: disable the |
3234 | // "write/sync redo log before flushing binlog cache to file" |
3235 | // feature. See a869c56d361bb44f46c0efeb11a8f03561676247 |
3236 | /** |
3237 | we set the log sequence as '1' just to trigger hton->flush_logs |
3238 | */ |
3239 | thd_store_lsn(thd, 1, DB_TYPE_ROCKSDB); |
3240 | } |
3241 | #endif |
3242 | } |
3243 | |
3244 | DEBUG_SYNC(thd, "rocksdb.prepared" ); |
3245 | } |
3246 | else |
3247 | tx->make_stmt_savepoint_permanent(); |
3248 | return HA_EXIT_SUCCESS; |
3249 | } |
3250 | |
3251 | /** |
3252 | do nothing for prepare/commit by xid |
3253 | this is needed to avoid crashes in XA scenarios |
3254 | */ |
3255 | static int rocksdb_commit_by_xid(handlerton *const hton, XID *const xid) { |
3256 | DBUG_ENTER_FUNC(); |
3257 | |
3258 | DBUG_ASSERT(hton != nullptr); |
3259 | DBUG_ASSERT(xid != nullptr); |
3260 | DBUG_ASSERT(commit_latency_stats != nullptr); |
3261 | |
3262 | rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true); |
3263 | |
3264 | const auto name = rdb_xid_to_string(*xid); |
3265 | DBUG_ASSERT(!name.empty()); |
3266 | |
3267 | rocksdb::Transaction *const trx = rdb->GetTransactionByName(name); |
3268 | |
3269 | if (trx == nullptr) { |
3270 | DBUG_RETURN(HA_EXIT_FAILURE); |
3271 | } |
3272 | |
3273 | const rocksdb::Status s = trx->Commit(); |
3274 | |
3275 | if (!s.ok()) { |
3276 | rdb_log_status_error(s); |
3277 | DBUG_RETURN(HA_EXIT_FAILURE); |
3278 | } |
3279 | |
3280 | delete trx; |
3281 | |
3282 | // `Add()` is implemented in a thread-safe manner. |
3283 | commit_latency_stats->Add(timer.ElapsedNanos() / 1000); |
3284 | |
3285 | DBUG_RETURN(HA_EXIT_SUCCESS); |
3286 | } |
3287 | |
3288 | static int |
3289 | rocksdb_rollback_by_xid(handlerton *const hton MY_ATTRIBUTE((__unused__)), |
3290 | XID *const xid) { |
3291 | DBUG_ENTER_FUNC(); |
3292 | |
3293 | DBUG_ASSERT(hton != nullptr); |
3294 | DBUG_ASSERT(xid != nullptr); |
3295 | DBUG_ASSERT(rdb != nullptr); |
3296 | |
3297 | const auto name = rdb_xid_to_string(*xid); |
3298 | |
3299 | rocksdb::Transaction *const trx = rdb->GetTransactionByName(name); |
3300 | |
3301 | if (trx == nullptr) { |
3302 | DBUG_RETURN(HA_EXIT_FAILURE); |
3303 | } |
3304 | |
3305 | const rocksdb::Status s = trx->Rollback(); |
3306 | |
3307 | if (!s.ok()) { |
3308 | rdb_log_status_error(s); |
3309 | DBUG_RETURN(HA_EXIT_FAILURE); |
3310 | } |
3311 | |
3312 | delete trx; |
3313 | |
3314 | DBUG_RETURN(HA_EXIT_SUCCESS); |
3315 | } |
3316 | |
3317 | /** |
3318 | Rebuilds an XID from a serialized version stored in a string. |
3319 | */ |
3320 | static void rdb_xid_from_string(const std::string &src, XID *const dst) { |
3321 | DBUG_ASSERT(dst != nullptr); |
3322 | uint offset = 0; |
3323 | uint64 raw_fid8 = |
3324 | rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(src.data())); |
3325 | const int64 signed_fid8 = *reinterpret_cast<int64 *>(&raw_fid8); |
3326 | dst->formatID = signed_fid8; |
3327 | offset += RDB_FORMATID_SZ; |
3328 | dst->gtrid_length = src.at(offset); |
3329 | offset += RDB_GTRID_SZ; |
3330 | dst->bqual_length = src.at(offset); |
3331 | offset += RDB_BQUAL_SZ; |
3332 | |
3333 | DBUG_ASSERT(dst->gtrid_length >= 0 && dst->gtrid_length <= MAXGTRIDSIZE); |
3334 | DBUG_ASSERT(dst->bqual_length >= 0 && dst->bqual_length <= MAXBQUALSIZE); |
3335 | |
3336 | src.copy(dst->data, (dst->gtrid_length) + (dst->bqual_length), |
3337 | RDB_XIDHDR_LEN); |
3338 | } |
3339 | |
3340 | /** |
3341 | Reading last committed binary log info from RocksDB system row. |
3342 | The info is needed for crash safe slave/master to work. |
3343 | */ |
3344 | static int rocksdb_recover(handlerton* hton, XID* xid_list, uint len) |
3345 | #ifdef MARIAROCKS_NOT_YET |
3346 | char* const binlog_file, |
3347 | my_off_t *const binlog_pos, |
3348 | Gtid *const binlog_max_gtid) { |
3349 | #endif |
3350 | { |
3351 | #ifdef MARIAROCKS_NOT_YET |
3352 | if (binlog_file && binlog_pos) { |
3353 | char file_buf[FN_REFLEN + 1] = {0}; |
3354 | my_off_t pos; |
3355 | char gtid_buf[FN_REFLEN + 1] = {0}; |
3356 | if (binlog_manager.read(file_buf, &pos, gtid_buf)) { |
3357 | if (is_binlog_advanced(binlog_file, *binlog_pos, file_buf, pos)) { |
3358 | memcpy(binlog_file, file_buf, FN_REFLEN + 1); |
3359 | *binlog_pos = pos; |
3360 | fprintf(stderr, "RocksDB: Last binlog file position %llu," |
3361 | " file name %s\n" , |
3362 | pos, file_buf); |
3363 | if (*gtid_buf) { |
3364 | global_sid_lock->rdlock(); |
3365 | binlog_max_gtid->parse(global_sid_map, gtid_buf); |
3366 | global_sid_lock->unlock(); |
3367 | fprintf(stderr, "RocksDB: Last MySQL Gtid %s\n" , gtid_buf); |
3368 | } |
3369 | } |
3370 | } |
3371 | } |
3372 | #endif |
3373 | |
3374 | if (len == 0 || xid_list == nullptr) { |
3375 | return HA_EXIT_SUCCESS; |
3376 | } |
3377 | |
3378 | std::vector<rocksdb::Transaction *> trans_list; |
3379 | rdb->GetAllPreparedTransactions(&trans_list); |
3380 | |
3381 | uint count = 0; |
3382 | for (auto &trans : trans_list) { |
3383 | if (count >= len) { |
3384 | break; |
3385 | } |
3386 | auto name = trans->GetName(); |
3387 | rdb_xid_from_string(name, &xid_list[count]); |
3388 | count++; |
3389 | } |
3390 | return count; |
3391 | } |
3392 | |
3393 | |
3394 | /* |
3395 | Handle a commit checkpoint request from server layer. |
3396 | |
3397 | InnoDB does this: |
3398 | We put the request in a queue, so that we can notify upper layer about |
3399 | checkpoint complete when we have flushed the redo log. |
3400 | If we have already flushed all relevant redo log, we notify immediately. |
3401 | |
3402 | MariaRocks just flushes everything right away ATM |
3403 | */ |
3404 | |
3405 | static void rocksdb_checkpoint_request(handlerton *hton, |
3406 | void *cookie) |
3407 | { |
3408 | const rocksdb::Status s= rdb->SyncWAL(); |
3409 | //TODO: what to do on error? |
3410 | if (s.ok()) |
3411 | { |
3412 | rocksdb_wal_group_syncs++; |
3413 | commit_checkpoint_notify_ha(hton, cookie); |
3414 | } |
3415 | } |
3416 | |
3417 | /* |
3418 | @param all: TRUE - commit the transaction |
3419 | FALSE - SQL statement ended |
3420 | */ |
3421 | static void rocksdb_commit_ordered(handlerton *hton, THD* thd, bool all) |
3422 | { |
3423 | // Same assert as InnoDB has |
3424 | DBUG_ASSERT(all || (!thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | |
3425 | OPTION_BEGIN))); |
3426 | Rdb_transaction *&tx = get_tx_from_thd(thd); |
3427 | if (!tx->is_two_phase()) { |
3428 | /* |
3429 | ordered_commit is supposedly slower as it is done sequentially |
3430 | in order to preserve commit order. |
3431 | |
3432 | if we are not required do 2-phase commit with the binlog, do not do |
3433 | anything here. |
3434 | */ |
3435 | return; |
3436 | } |
3437 | |
3438 | tx->set_sync(false); |
3439 | |
3440 | /* This will note the master position also */ |
3441 | tx->commit_ordered_res= tx->commit(); |
3442 | tx->commit_ordered_done= true; |
3443 | |
3444 | } |
3445 | |
3446 | |
/**
  Commit the transaction (or the current statement) for this connection.

  If rocksdb_commit_ordered() already committed (commit_ordered_done), only
  its stored result is reported here. Otherwise the commit is performed
  without sync to establish the order, and the WAL is flushed afterwards
  when rocksdb_flush_log_at_trx_commit asks for it.

  @param hton       the RocksDB handlerton
  @param thd        connection handle
  @param commit_tx  true  - commit the whole transaction
                    false - only the current SQL statement ended
*/
static int rocksdb_commit(handlerton* hton, THD* thd, bool commit_tx)
{
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(hton != nullptr);
  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(commit_latency_stats != nullptr);

  rocksdb::StopWatchNano timer(rocksdb::Env::Default(), true);

  /* note: h->external_lock(F_UNLCK) is called after this function is called) */
  Rdb_transaction *&tx = get_tx_from_thd(thd);

  /* this will trigger saving of perf_context information */
  Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd));

  if (tx != nullptr) {
    if (commit_tx || (!my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT |
                                                 OPTION_BEGIN))) {
      /*
        This will not add anything to commit_latency_stats, and this is correct
        right?
      */
      if (tx->commit_ordered_done)
      {
        /* commit_ordered() already committed; just report its result. */
        thd_wakeup_subsequent_commits(thd, 0);
        DBUG_RETURN((tx->commit_ordered_res? HA_ERR_INTERNAL_ERROR: 0));
      }

      /*
        We get here
        - For a COMMIT statement that finishes a multi-statement transaction
        - For a statement that has its own transaction
      */

      // First, commit without syncing. This establishes the commit order
      tx->set_sync(false);
      if (tx->commit()) {
        DBUG_RETURN(HA_ERR_ROCKSDB_COMMIT_FAILED);
      }
      thd_wakeup_subsequent_commits(thd, 0);

      /* Durability: flush-and-sync the WAL only in the strictest mode. */
      if (rocksdb_flush_log_at_trx_commit == FLUSH_LOG_SYNC)
      {
        rocksdb::Status s= rdb->FlushWAL(true);
        if (!s.ok())
          DBUG_RETURN(HA_ERR_INTERNAL_ERROR);
      }
    } else {
      /*
        We get here when committing a statement within a transaction.
      */
      tx->make_stmt_savepoint_permanent();
    }

    if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) {
      // For READ_COMMITTED, we release any existing snapshot so that we will
      // see any changes that occurred since the last statement.
      tx->release_snapshot();
    }
  }

  // `Add()` is implemented in a thread-safe manner.
  commit_latency_stats->Add(timer.ElapsedNanos() / 1000);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
3514 | |
3515 | |
3516 | static int rocksdb_rollback(handlerton *const hton, THD *const thd, |
3517 | bool rollback_tx) { |
3518 | Rdb_transaction *&tx = get_tx_from_thd(thd); |
3519 | Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd)); |
3520 | |
3521 | if (tx != nullptr) { |
3522 | if (rollback_tx) { |
3523 | /* |
3524 | We get here, when |
3525 | - ROLLBACK statement is issued. |
3526 | |
3527 | Discard the changes made by the transaction |
3528 | */ |
3529 | tx->rollback(); |
3530 | } else { |
3531 | /* |
3532 | We get here when |
3533 | - a statement with AUTOCOMMIT=1 is being rolled back (because of some |
3534 | error) |
3535 | - a statement inside a transaction is rolled back |
3536 | */ |
3537 | |
3538 | tx->rollback_stmt(); |
3539 | tx->set_tx_failed(true); |
3540 | } |
3541 | |
3542 | if (my_core::thd_tx_isolation(thd) <= ISO_READ_COMMITTED) { |
3543 | // For READ_COMMITTED, we release any existing snapshot so that we will |
3544 | // see any changes that occurred since the last statement. |
3545 | tx->release_snapshot(); |
3546 | } |
3547 | } |
3548 | return HA_EXIT_SUCCESS; |
3549 | } |
3550 | |
3551 | static bool print_stats(THD *const thd, std::string const &type, |
3552 | std::string const &name, std::string const &status, |
3553 | stat_print_fn *stat_print) { |
3554 | return stat_print(thd, type.c_str(), type.size(), name.c_str(), name.size(), |
3555 | status.c_str(), status.size()); |
3556 | } |
3557 | |
3558 | static std::string format_string(const char *const format, ...) { |
3559 | std::string res; |
3560 | va_list args; |
3561 | va_list args_copy; |
3562 | char static_buff[256]; |
3563 | |
3564 | DBUG_ASSERT(format != nullptr); |
3565 | |
3566 | va_start(args, format); |
3567 | va_copy(args_copy, args); |
3568 | |
3569 | // Calculate how much space we will need |
3570 | int len = vsnprintf(nullptr, 0, format, args); |
3571 | va_end(args); |
3572 | |
3573 | if (len < 0) { |
3574 | res = std::string("<format error>" ); |
3575 | } else if (len == 0) { |
3576 | // Shortcut for an empty string |
3577 | res = std::string("" ); |
3578 | } else { |
3579 | // For short enough output use a static buffer |
3580 | char *buff = static_buff; |
3581 | std::unique_ptr<char[]> dynamic_buff = nullptr; |
3582 | |
3583 | len++; // Add one for null terminator |
3584 | |
3585 | // for longer output use an allocated buffer |
3586 | if (static_cast<uint>(len) > sizeof(static_buff)) { |
3587 | dynamic_buff.reset(new char[len]); |
3588 | buff = dynamic_buff.get(); |
3589 | } |
3590 | |
3591 | // Now re-do the vsnprintf with the buffer which is now large enough |
3592 | (void)vsnprintf(buff, len, format, args_copy); |
3593 | |
3594 | // Convert to a std::string. Note we could have created a std::string |
3595 | // large enough and then converted the buffer to a 'char*' and created |
3596 | // the output in place. This would probably work but feels like a hack. |
3597 | // Since this isn't code that needs to be super-performant we are going |
3598 | // with this 'safer' method. |
3599 | res = std::string(buff); |
3600 | } |
3601 | |
3602 | va_end(args_copy); |
3603 | |
3604 | return res; |
3605 | } |
3606 | |
3607 | class Rdb_snapshot_status : public Rdb_tx_list_walker { |
3608 | private: |
3609 | std::string m_data; |
3610 | |
3611 | static std::string current_timestamp(void) { |
3612 | static const char *const format = "%d-%02d-%02d %02d:%02d:%02d" ; |
3613 | time_t currtime; |
3614 | struct tm currtm; |
3615 | |
3616 | time(&currtime); |
3617 | |
3618 | localtime_r(&currtime, &currtm); |
3619 | |
3620 | return format_string(format, currtm.tm_year + 1900, currtm.tm_mon + 1, |
3621 | currtm.tm_mday, currtm.tm_hour, currtm.tm_min, |
3622 | currtm.tm_sec); |
3623 | } |
3624 | |
3625 | static std::string (void) { |
3626 | return "\n============================================================\n" + |
3627 | current_timestamp() + |
3628 | " ROCKSDB TRANSACTION MONITOR OUTPUT\n" |
3629 | "============================================================\n" |
3630 | "---------\n" |
3631 | "SNAPSHOTS\n" |
3632 | "---------\n" |
3633 | "LIST OF SNAPSHOTS FOR EACH SESSION:\n" ; |
3634 | } |
3635 | |
3636 | static std::string (void) { |
3637 | return "-----------------------------------------\n" |
3638 | "END OF ROCKSDB TRANSACTION MONITOR OUTPUT\n" |
3639 | "=========================================\n" ; |
3640 | } |
3641 | |
3642 | static Rdb_deadlock_info::Rdb_dl_trx_info |
3643 | get_dl_txn_info(const rocksdb::DeadlockInfo &txn, |
3644 | const GL_INDEX_ID &gl_index_id) { |
3645 | Rdb_deadlock_info::Rdb_dl_trx_info txn_data; |
3646 | |
3647 | txn_data.trx_id = txn.m_txn_id; |
3648 | |
3649 | txn_data.table_name = ddl_manager.safe_get_table_name(gl_index_id); |
3650 | if (txn_data.table_name.empty()) { |
3651 | txn_data.table_name = |
3652 | "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id); |
3653 | } |
3654 | |
3655 | auto kd = ddl_manager.safe_find(gl_index_id); |
3656 | txn_data.index_name = |
3657 | (kd) ? kd->get_name() |
3658 | : "NOT FOUND; INDEX_ID: " + std::to_string(gl_index_id.index_id); |
3659 | |
3660 | rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(txn.m_cf_id); |
3661 | txn_data.cf_name = cfh->GetName(); |
3662 | |
3663 | txn_data.waiting_key = |
3664 | rdb_hexdump(txn.m_waiting_key.c_str(), txn.m_waiting_key.length()); |
3665 | |
3666 | txn_data.exclusive_lock = txn.m_exclusive; |
3667 | |
3668 | return txn_data; |
3669 | } |
3670 | |
3671 | static Rdb_deadlock_info |
3672 | get_dl_path_trx_info(const rocksdb::DeadlockPath &path_entry) { |
3673 | Rdb_deadlock_info deadlock_info; |
3674 | |
3675 | for (auto it = path_entry.path.begin(); it != path_entry.path.end(); |
3676 | it++) { |
3677 | auto txn = *it; |
3678 | const GL_INDEX_ID gl_index_id = { |
3679 | txn.m_cf_id, rdb_netbuf_to_uint32(reinterpret_cast<const uchar *>( |
3680 | txn.m_waiting_key.c_str()))}; |
3681 | deadlock_info.path.push_back(get_dl_txn_info(txn, gl_index_id)); |
3682 | } |
3683 | DBUG_ASSERT_IFF(path_entry.limit_exceeded, path_entry.path.empty()); |
3684 | /* print the first txn in the path to display the full deadlock cycle */ |
3685 | if (!path_entry.path.empty() && !path_entry.limit_exceeded) { |
3686 | auto deadlocking_txn = *(path_entry.path.end() - 1); |
3687 | deadlock_info.victim_trx_id = deadlocking_txn.m_txn_id; |
3688 | } |
3689 | return deadlock_info; |
3690 | } |
3691 | |
3692 | public: |
3693 | Rdb_snapshot_status() : m_data(get_header()) {} |
3694 | |
3695 | std::string getResult() { return m_data + get_footer(); } |
3696 | |
3697 | /* Implement Rdb_transaction interface */ |
3698 | /* Create one row in the snapshot status table */ |
3699 | void process_tran(const Rdb_transaction *const tx) override { |
3700 | DBUG_ASSERT(tx != nullptr); |
3701 | |
3702 | /* Calculate the duration the snapshot has existed */ |
3703 | int64_t snapshot_timestamp = tx->m_snapshot_timestamp; |
3704 | if (snapshot_timestamp != 0) { |
3705 | int64_t curr_time; |
3706 | rdb->GetEnv()->GetCurrentTime(&curr_time); |
3707 | |
3708 | char buffer[1024]; |
3709 | #ifdef MARIAROCKS_NOT_YET |
3710 | thd_security_context(tx->get_thd(), buffer, sizeof buffer, 0); |
3711 | #endif |
3712 | m_data += format_string( |
3713 | "---SNAPSHOT, ACTIVE %lld sec\n" |
3714 | "%s\n" |
3715 | "lock count %llu, write count %llu\n" |
3716 | "insert count %llu, update count %llu, delete count %llu\n" , |
3717 | (longlong)(curr_time - snapshot_timestamp), buffer, tx->get_lock_count(), |
3718 | tx->get_write_count(), tx->get_insert_count(), tx->get_update_count(), |
3719 | tx->get_delete_count()); |
3720 | } |
3721 | } |
3722 | |
3723 | void populate_deadlock_buffer() { |
3724 | auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); |
3725 | m_data += "----------LATEST DETECTED DEADLOCKS----------\n" ; |
3726 | |
3727 | for (auto path_entry : dlock_buffer) { |
3728 | std::string path_data; |
3729 | if (path_entry.limit_exceeded) { |
3730 | path_data += "\n-------DEADLOCK EXCEEDED MAX DEPTH-------\n" ; |
3731 | } else { |
3732 | path_data += "\n*** DEADLOCK PATH\n" |
3733 | "=========================================\n" ; |
3734 | const auto dl_info = get_dl_path_trx_info(path_entry); |
3735 | for (auto it = dl_info.path.begin(); it != dl_info.path.end(); it++) { |
3736 | const auto trx_info = *it; |
3737 | path_data += format_string( |
3738 | "TRANSACTION ID: %u\n" |
3739 | "COLUMN FAMILY NAME: %s\n" |
3740 | "WAITING KEY: %s\n" |
3741 | "LOCK TYPE: %s\n" |
3742 | "INDEX NAME: %s\n" |
3743 | "TABLE NAME: %s\n" , |
3744 | trx_info.trx_id, trx_info.cf_name.c_str(), |
3745 | trx_info.waiting_key.c_str(), |
3746 | trx_info.exclusive_lock ? "EXCLUSIVE" : "SHARED" , |
3747 | trx_info.index_name.c_str(), trx_info.table_name.c_str()); |
3748 | if (it != dl_info.path.end() - 1) { |
3749 | path_data += "---------------WAITING FOR---------------\n" ; |
3750 | } |
3751 | } |
3752 | path_data += |
3753 | format_string("\n--------TRANSACTION ID: %u GOT DEADLOCK---------\n" , |
3754 | dl_info.victim_trx_id); |
3755 | } |
3756 | m_data += path_data; |
3757 | } |
3758 | } |
3759 | |
3760 | std::vector<Rdb_deadlock_info> get_deadlock_info() { |
3761 | std::vector<Rdb_deadlock_info> deadlock_info; |
3762 | auto dlock_buffer = rdb->GetDeadlockInfoBuffer(); |
3763 | for (auto path_entry : dlock_buffer) { |
3764 | if (!path_entry.limit_exceeded) { |
3765 | deadlock_info.push_back(get_dl_path_trx_info(path_entry)); |
3766 | } |
3767 | } |
3768 | return deadlock_info; |
3769 | } |
3770 | }; |
3771 | |
3772 | /** |
3773 | * @brief |
3774 | * walks through all non-replication transactions and copies |
3775 | * out relevant information for information_schema.rocksdb_trx |
3776 | */ |
3777 | class Rdb_trx_info_aggregator : public Rdb_tx_list_walker { |
3778 | private: |
3779 | std::vector<Rdb_trx_info> *m_trx_info; |
3780 | |
3781 | public: |
3782 | explicit Rdb_trx_info_aggregator(std::vector<Rdb_trx_info> *const trx_info) |
3783 | : m_trx_info(trx_info) {} |
3784 | |
3785 | void process_tran(const Rdb_transaction *const tx) override { |
3786 | static const std::map<int, std::string> state_map = { |
3787 | {rocksdb::Transaction::STARTED, "STARTED" }, |
3788 | {rocksdb::Transaction::AWAITING_PREPARE, "AWAITING_PREPARE" }, |
3789 | {rocksdb::Transaction::PREPARED, "PREPARED" }, |
3790 | {rocksdb::Transaction::AWAITING_COMMIT, "AWAITING_COMMIT" }, |
3791 | {rocksdb::Transaction::COMMITED, "COMMITED" }, |
3792 | {rocksdb::Transaction::AWAITING_ROLLBACK, "AWAITING_ROLLBACK" }, |
3793 | {rocksdb::Transaction::ROLLEDBACK, "ROLLEDBACK" }, |
3794 | }; |
3795 | |
3796 | DBUG_ASSERT(tx != nullptr); |
3797 | |
3798 | THD *const thd = tx->get_thd(); |
3799 | ulong thread_id = thd_get_thread_id(thd); |
3800 | |
3801 | if (tx->is_writebatch_trx()) { |
3802 | const auto wb_impl = static_cast<const Rdb_writebatch_impl *>(tx); |
3803 | DBUG_ASSERT(wb_impl); |
3804 | m_trx_info->push_back( |
3805 | {"" , /* name */ |
3806 | 0, /* trx_id */ |
3807 | wb_impl->get_write_count(), 0, /* lock_count */ |
3808 | 0, /* timeout_sec */ |
3809 | "" , /* state */ |
3810 | "" , /* waiting_key */ |
3811 | 0, /* waiting_cf_id */ |
3812 | 1, /*is_replication */ |
3813 | 1, /* skip_trx_api */ |
3814 | wb_impl->is_tx_read_only(), 0, /* deadlock detection */ |
3815 | wb_impl->num_ongoing_bulk_load(), thread_id, "" /* query string */}); |
3816 | } else { |
3817 | const auto tx_impl = static_cast<const Rdb_transaction_impl *>(tx); |
3818 | DBUG_ASSERT(tx_impl); |
3819 | const rocksdb::Transaction *rdb_trx = tx_impl->get_rdb_trx(); |
3820 | |
3821 | if (rdb_trx == nullptr) { |
3822 | return; |
3823 | } |
3824 | |
3825 | char query_buf[NAME_LEN+1]; |
3826 | thd_query_safe(thd, query_buf, sizeof(query_buf)); |
3827 | std::string query_str(query_buf); |
3828 | |
3829 | const auto state_it = state_map.find(rdb_trx->GetState()); |
3830 | DBUG_ASSERT(state_it != state_map.end()); |
3831 | const int is_replication = (thd->rgi_slave != nullptr); |
3832 | uint32_t waiting_cf_id; |
3833 | std::string waiting_key; |
3834 | rdb_trx->GetWaitingTxns(&waiting_cf_id, &waiting_key), |
3835 | |
3836 | m_trx_info->push_back( |
3837 | {rdb_trx->GetName(), rdb_trx->GetID(), tx_impl->get_write_count(), |
3838 | tx_impl->get_lock_count(), tx_impl->get_timeout_sec(), |
3839 | state_it->second, waiting_key, waiting_cf_id, is_replication, |
3840 | 0, /* skip_trx_api */ |
3841 | tx_impl->is_tx_read_only(), rdb_trx->IsDeadlockDetect(), |
3842 | tx_impl->num_ongoing_bulk_load(), thread_id, query_str}); |
3843 | } |
3844 | } |
3845 | }; |
3846 | |
3847 | /* |
3848 | returns a vector of info for all non-replication threads |
3849 | for use by information_schema.rocksdb_trx |
3850 | */ |
3851 | std::vector<Rdb_trx_info> rdb_get_all_trx_info() { |
3852 | std::vector<Rdb_trx_info> trx_info; |
3853 | Rdb_trx_info_aggregator trx_info_agg(&trx_info); |
3854 | Rdb_transaction::walk_tx_list(&trx_info_agg); |
3855 | return trx_info; |
3856 | } |
3857 | |
3858 | |
3859 | /* |
3860 | returns a vector of info of recent deadlocks |
3861 | for use by information_schema.rocksdb_deadlock |
3862 | */ |
3863 | std::vector<Rdb_deadlock_info> rdb_get_deadlock_info() { |
3864 | Rdb_snapshot_status showStatus; |
3865 | Rdb_transaction::walk_tx_list(&showStatus); |
3866 | return showStatus.get_deadlock_info(); |
3867 | } |
3868 | |
3869 | #ifdef MARIAROCKS_NOT_YET |
3870 | /* Generate the snapshot status table */ |
3871 | static bool rocksdb_show_snapshot_status(handlerton *const hton, THD *const thd, |
3872 | stat_print_fn *const stat_print) { |
3873 | Rdb_snapshot_status showStatus; |
3874 | |
3875 | Rdb_transaction::walk_tx_list(&showStatus); |
3876 | showStatus.populate_deadlock_buffer(); |
3877 | |
3878 | /* Send the result data back to MySQL */ |
3879 | return print_stats(thd, "rocksdb" , "" , showStatus.getResult(), stat_print); |
3880 | } |
3881 | #endif |
3882 | |
3883 | /* |
3884 | This is called for SHOW ENGINE ROCKSDB STATUS | LOGS | etc. |
3885 | |
3886 | For now, produce info about live files (which gives an imprecise idea about |
3887 | what column families are there). |
3888 | */ |
3889 | static bool rocksdb_show_status(handlerton *const hton, THD *const thd, |
3890 | stat_print_fn *const stat_print, |
3891 | enum ha_stat_type stat_type) { |
3892 | DBUG_ASSERT(hton != nullptr); |
3893 | DBUG_ASSERT(thd != nullptr); |
3894 | DBUG_ASSERT(stat_print != nullptr); |
3895 | |
3896 | bool res = false; |
3897 | char buf[100] = {'\0'}; |
3898 | |
3899 | if (stat_type == HA_ENGINE_STATUS) { |
3900 | DBUG_ASSERT(rdb != nullptr); |
3901 | |
3902 | std::string str; |
3903 | |
3904 | /* Global DB Statistics */ |
3905 | if (rocksdb_stats) { |
3906 | str = rocksdb_stats->ToString(); |
3907 | |
3908 | // Use the same format as internal RocksDB statistics entries to make |
3909 | // sure that output will look unified. |
3910 | DBUG_ASSERT(commit_latency_stats != nullptr); |
3911 | |
3912 | snprintf(buf, sizeof(buf), "rocksdb.commit_latency statistics " |
3913 | "Percentiles :=> 50 : %.2f 95 : %.2f " |
3914 | "99 : %.2f 100 : %.2f\n" , |
3915 | commit_latency_stats->Percentile(50), |
3916 | commit_latency_stats->Percentile(95), |
3917 | commit_latency_stats->Percentile(99), |
3918 | commit_latency_stats->Percentile(100)); |
3919 | str.append(buf); |
3920 | |
3921 | uint64_t v = 0; |
3922 | |
3923 | // Retrieve additional stalling related numbers from RocksDB and append |
3924 | // them to the buffer meant for displaying detailed statistics. The intent |
3925 | // here is to avoid adding another row to the query output because of |
3926 | // just two numbers. |
3927 | // |
3928 | // NB! We're replacing hyphens with underscores in output to better match |
3929 | // the existing naming convention. |
3930 | if (rdb->GetIntProperty("rocksdb.is-write-stopped" , &v)) { |
3931 | snprintf(buf, sizeof(buf), "rocksdb.is_write_stopped COUNT : %llu\n" , (ulonglong)v); |
3932 | str.append(buf); |
3933 | } |
3934 | |
3935 | if (rdb->GetIntProperty("rocksdb.actual-delayed-write-rate" , &v)) { |
3936 | snprintf(buf, sizeof(buf), "rocksdb.actual_delayed_write_rate " |
3937 | "COUNT : %llu\n" , |
3938 | (ulonglong)v); |
3939 | str.append(buf); |
3940 | } |
3941 | |
3942 | res |= print_stats(thd, "STATISTICS" , "rocksdb" , str, stat_print); |
3943 | } |
3944 | |
3945 | /* Per DB stats */ |
3946 | if (rdb->GetProperty("rocksdb.dbstats" , &str)) { |
3947 | res |= print_stats(thd, "DBSTATS" , "rocksdb" , str, stat_print); |
3948 | } |
3949 | |
3950 | /* Per column family stats */ |
3951 | for (const auto &cf_name : cf_manager.get_cf_names()) { |
3952 | rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name); |
3953 | if (cfh == nullptr) { |
3954 | continue; |
3955 | } |
3956 | |
3957 | if (!rdb->GetProperty(cfh, "rocksdb.cfstats" , &str)) { |
3958 | continue; |
3959 | } |
3960 | |
3961 | res |= print_stats(thd, "CF_COMPACTION" , cf_name, str, stat_print); |
3962 | } |
3963 | |
3964 | /* Memory Statistics */ |
3965 | std::vector<rocksdb::DB *> dbs; |
3966 | std::unordered_set<const rocksdb::Cache *> cache_set; |
3967 | size_t internal_cache_count = 0; |
3968 | size_t kDefaultInternalCacheSize = 8 * 1024 * 1024; |
3969 | |
3970 | dbs.push_back(rdb); |
3971 | cache_set.insert(rocksdb_tbl_options->block_cache.get()); |
3972 | |
3973 | for (const auto &cf_handle : cf_manager.get_all_cf()) { |
3974 | rocksdb::ColumnFamilyDescriptor cf_desc; |
3975 | cf_handle->GetDescriptor(&cf_desc); |
3976 | auto *const table_factory = cf_desc.options.table_factory.get(); |
3977 | |
3978 | if (table_factory != nullptr) { |
3979 | std::string tf_name = table_factory->Name(); |
3980 | |
3981 | if (tf_name.find("BlockBasedTable" ) != std::string::npos) { |
3982 | const rocksdb::BlockBasedTableOptions *const bbt_opt = |
3983 | reinterpret_cast<rocksdb::BlockBasedTableOptions *>( |
3984 | table_factory->GetOptions()); |
3985 | |
3986 | if (bbt_opt != nullptr) { |
3987 | if (bbt_opt->block_cache.get() != nullptr) { |
3988 | cache_set.insert(bbt_opt->block_cache.get()); |
3989 | } else { |
3990 | internal_cache_count++; |
3991 | } |
3992 | cache_set.insert(bbt_opt->block_cache_compressed.get()); |
3993 | } |
3994 | } |
3995 | } |
3996 | } |
3997 | |
3998 | std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type; |
3999 | str.clear(); |
4000 | rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set, |
4001 | &temp_usage_by_type); |
4002 | snprintf(buf, sizeof(buf), "\nMemTable Total: %llu" , |
4003 | (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal]); |
4004 | str.append(buf); |
4005 | snprintf(buf, sizeof(buf), "\nMemTable Unflushed: %llu" , |
4006 | (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed]); |
4007 | str.append(buf); |
4008 | snprintf(buf, sizeof(buf), "\nTable Readers Total: %llu" , |
4009 | (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kTableReadersTotal]); |
4010 | str.append(buf); |
4011 | snprintf(buf, sizeof(buf), "\nCache Total: %llu" , |
4012 | (ulonglong)temp_usage_by_type[rocksdb::MemoryUtil::kCacheTotal]); |
4013 | str.append(buf); |
4014 | snprintf(buf, sizeof(buf), "\nDefault Cache Capacity: %llu" , |
4015 | (ulonglong)internal_cache_count * kDefaultInternalCacheSize); |
4016 | str.append(buf); |
4017 | res |= print_stats(thd, "MEMORY_STATS" , "rocksdb" , str, stat_print); |
4018 | #ifdef MARIAROCKS_NOT_YET |
4019 | /* Show the background thread status */ |
4020 | std::vector<rocksdb::ThreadStatus> thread_list; |
4021 | rocksdb::Status s = rdb->GetEnv()->GetThreadList(&thread_list); |
4022 | |
4023 | if (!s.ok()) { |
4024 | sql_print_error("RocksDB: Returned error (%s) from GetThreadList.\n" , |
4025 | s.ToString().c_str()); |
4026 | res |= true; |
4027 | } else { |
4028 | /* For each background thread retrieved, print out its information */ |
4029 | for (auto &it : thread_list) { |
4030 | /* Only look at background threads. Ignore user threads, if any. */ |
4031 | if (it.thread_type > rocksdb::ThreadStatus::LOW_PRIORITY) { |
4032 | continue; |
4033 | } |
4034 | |
4035 | str = "\nthread_type: " + it.GetThreadTypeName(it.thread_type) + |
4036 | "\ncf_name: " + it.cf_name + |
4037 | "\noperation_type: " + it.GetOperationName(it.operation_type) + |
4038 | "\noperation_stage: " + |
4039 | it.GetOperationStageName(it.operation_stage) + |
4040 | "\nelapsed_time_ms: " + |
4041 | it.MicrosToString(it.op_elapsed_micros); |
4042 | |
4043 | for (auto &it_props : |
4044 | it.InterpretOperationProperties(it.operation_type, |
4045 | it.op_properties)) { |
4046 | str += "\n" + it_props.first + ": " + std::to_string(it_props.second); |
4047 | } |
4048 | |
4049 | str += "\nstate_type: " + it.GetStateName(it.state_type); |
4050 | |
4051 | res |= print_stats(thd, "BG_THREADS" , std::to_string(it.thread_id), |
4052 | str, stat_print); |
4053 | } |
4054 | } |
4055 | #endif |
4056 | |
4057 | #ifdef MARIAROCKS_NOT_YET |
4058 | } else if (stat_type == HA_ENGINE_TRX) { |
4059 | /* Handle the SHOW ENGINE ROCKSDB TRANSACTION STATUS command */ |
4060 | res |= rocksdb_show_snapshot_status(hton, thd, stat_print); |
4061 | #endif |
4062 | } |
4063 | return res; |
4064 | } |
4065 | |
4066 | static inline void rocksdb_register_tx(handlerton *const hton, THD *const thd, |
4067 | Rdb_transaction *const tx) { |
4068 | DBUG_ASSERT(tx != nullptr); |
4069 | |
4070 | trans_register_ha(thd, FALSE, rocksdb_hton); |
4071 | if (my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { |
4072 | tx->start_stmt(); |
4073 | trans_register_ha(thd, TRUE, rocksdb_hton); |
4074 | } |
4075 | } |
4076 | |
/* List of file extensions owned by the engine; empty because MyRocks keeps
   no per-table auxiliary files. */
static const char *ha_rocksdb_exts[] = {NullS};
4078 | |
4079 | /* |
4080 | Supporting START TRANSACTION WITH CONSISTENT [ROCKSDB] SNAPSHOT |
4081 | |
4082 | Features: |
4083 | 1. Supporting START TRANSACTION WITH CONSISTENT SNAPSHOT |
4084 | 2. Getting current binlog position in addition to #1. |
4085 | |
4086 | The second feature is done by START TRANSACTION WITH |
4087 | CONSISTENT ROCKSDB SNAPSHOT. This is Facebook's extension, and |
4088 | it works like existing START TRANSACTION WITH CONSISTENT INNODB SNAPSHOT. |
4089 | |
4090 | - When not setting engine, START TRANSACTION WITH CONSISTENT SNAPSHOT |
4091 | takes both InnoDB and RocksDB snapshots, and both InnoDB and RocksDB |
4092 | participate in transaction. When executing COMMIT, both InnoDB and |
4093 | RocksDB modifications are committed. Remember that XA is not supported yet, |
4094 | so mixing engines is not recommended anyway. |
4095 | |
4096 | - When setting engine, START TRANSACTION WITH CONSISTENT.. takes |
4097 | snapshot for the specified engine only. But it starts both |
4098 | InnoDB and RocksDB transactions. |
4099 | */ |
4100 | static int rocksdb_start_tx_and_assign_read_view( |
4101 | handlerton *const hton, /*!< in: RocksDB handlerton */ |
4102 | THD* thd) /*!< in: MySQL thread handle of the |
4103 | user for whom the transaction should |
4104 | be committed */ |
4105 | { |
4106 | ulong const tx_isolation = my_core::thd_tx_isolation(thd); |
4107 | |
4108 | if (tx_isolation != ISO_REPEATABLE_READ) { |
4109 | my_error(ER_ISOLATION_LEVEL_WITH_CONSISTENT_SNAPSHOT, MYF(0)); |
4110 | return HA_EXIT_FAILURE; |
4111 | } |
4112 | /* |
4113 | MariaDB: there is no need to call mysql_bin_log_lock_commits and then |
4114 | unlock back. |
4115 | SQL layer calls start_consistent_snapshot() for all engines, including the |
4116 | binlog under LOCK_commit_ordered mutex. |
4117 | The mutex prevents binlog commits from happening (right?) while the storage |
4118 | engine(s) allocate read snapshots. That way, each storage engine is |
4119 | synchronized with current binlog position. |
4120 | */ |
4121 | mysql_mutex_assert_owner(&LOCK_commit_ordered); |
4122 | |
4123 | Rdb_transaction *const tx = get_or_create_tx(thd); |
4124 | Rdb_perf_context_guard guard(tx, rocksdb_perf_context_level(thd)); |
4125 | |
4126 | DBUG_ASSERT(!tx->has_snapshot()); |
4127 | tx->set_tx_read_only(true); |
4128 | rocksdb_register_tx(hton, thd, tx); |
4129 | tx->acquire_snapshot(true); |
4130 | |
4131 | return HA_EXIT_SUCCESS; |
4132 | } |
4133 | |
4134 | /* Dummy SAVEPOINT support. This is needed for long running transactions |
4135 | * like mysqldump (https://bugs.mysql.com/bug.php?id=71017). |
4136 | * Current SAVEPOINT does not correctly handle ROLLBACK and does not return |
4137 | * errors. This needs to be addressed in future versions (Issue#96). |
4138 | */ |
static int rocksdb_savepoint(handlerton *const hton, THD *const thd,
                             void *const savepoint) {
  // Intentionally a no-op; see the "Dummy SAVEPOINT support" note above.
  return HA_EXIT_SUCCESS;
}
4143 | |
/* Handlerton callback for ROLLBACK TO SAVEPOINT: delegate to the session's
   Rdb_transaction. */
static int rocksdb_rollback_to_savepoint(handlerton *const hton, THD *const thd,
                                         void *const savepoint) {
  // NOTE(review): assumes get_tx_from_thd() yields a live transaction here;
  // a null tx would crash on the call below -- confirm the SQL layer always
  // starts the transaction before a savepoint rollback reaches the engine.
  Rdb_transaction *&tx = get_tx_from_thd(thd);
  return tx->rollback_to_savepoint(savepoint);
}
4149 | |
/* Tell the server it may always release metadata locks after a savepoint
   rollback in this engine (unconditionally true for MyRocks). */
static bool
rocksdb_rollback_to_savepoint_can_release_mdl(handlerton *const hton,
                                              THD *const thd) {
  return true;
}
4155 | |
4156 | #ifdef MARIAROCKS_NOT_YET |
4157 | /* |
4158 | This is called for INFORMATION_SCHEMA |
4159 | */ |
/*
  Walk every open MyRocks table and report its accumulated I/O and lock
  statistics through the supplied callback (invoked once per table).
  Returns nothing; the interface offers no way to signal failure.
*/
static void rocksdb_update_table_stats(
    /* per-table stats callback */
    void (*cb)(const char *db, const char *tbl, bool is_partition,
               my_io_perf_t *r, my_io_perf_t *w, my_io_perf_t *r_blob,
               my_io_perf_t *r_primary, my_io_perf_t *r_secondary,
               page_stats_t *page_stats, comp_stats_t *comp_stats,
               int n_lock_wait, int n_lock_wait_timeout, int n_lock_deadlock,
               const char *engine)) {
  my_io_perf_t io_perf_read;
  my_io_perf_t io_perf_write;
  my_io_perf_t io_perf;
  page_stats_t page_stats;
  comp_stats_t comp_stats;
  uint lock_wait_timeout_stats;
  uint deadlock_stats;
  uint lock_wait_stats;
  std::vector<std::string> tablenames;

  /*
    Most of these are for innodb, so setting them to 0.
    TODO: possibly separate out primary vs. secondary index reads
  */
  memset(&io_perf, 0, sizeof(io_perf));
  memset(&page_stats, 0, sizeof(page_stats));
  memset(&comp_stats, 0, sizeof(comp_stats));
  memset(&io_perf_write, 0, sizeof(io_perf_write));
  // NOTE(review): io_perf_read is not zeroed; every field used below looks
  // explicitly assigned, but confirm my_io_perf_t has no extra members that
  // would reach the callback uninitialized.

  tablenames = rdb_open_tables.get_table_names();

  for (const auto &it : tablenames) {
    Rdb_table_handler *table_handler;
    std::string str, dbname, tablename, partname;
    char dbname_sys[NAME_LEN + 1];
    char tablename_sys[NAME_LEN + 1];
    bool is_partition;

    if (rdb_normalize_tablename(it, &str) != HA_EXIT_SUCCESS) {
      /* Function needs to return void because of the interface and we've
       * detected an error which shouldn't happen. There's no way to let
       * caller know that something failed.
       */
      SHIP_ASSERT(false);
      return;
    }

    if (rdb_split_normalized_tablename(str, &dbname, &tablename, &partname)) {
      continue;
    }

    // A non-empty partition suffix marks this entry as a partition.
    is_partition = (partname.size() != 0);

    // Skip tables whose handler is gone; counters live on the handler.
    table_handler = rdb_open_tables.get_table_handler(it.c_str());
    if (table_handler == nullptr) {
      continue;
    }

    io_perf_read.bytes = table_handler->m_io_perf_read.bytes.load();
    io_perf_read.requests = table_handler->m_io_perf_read.requests.load();
    io_perf_write.bytes = table_handler->m_io_perf_write.bytes.load();
    io_perf_write.requests = table_handler->m_io_perf_write.requests.load();
    lock_wait_timeout_stats = table_handler->m_lock_wait_timeout_counter.load();
    deadlock_stats = table_handler->m_deadlock_counter.load();
    lock_wait_stats =
        table_handler->m_table_perf_context.m_value[PC_KEY_LOCK_WAIT_COUNT]
            .load();

    /*
      Convert from rocksdb timer to mysql timer. RocksDB values are
      in nanoseconds, but table statistics expect the value to be
      in my_timer format.
    */
    io_perf_read.svc_time = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.svc_time.load() / 1000);
    io_perf_read.svc_time_max = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.svc_time_max.load() / 1000);
    io_perf_read.wait_time = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.wait_time.load() / 1000);
    io_perf_read.wait_time_max = my_core::microseconds_to_my_timer(
        table_handler->m_io_perf_read.wait_time_max.load() / 1000);
    io_perf_read.slow_ios = table_handler->m_io_perf_read.slow_ios.load();
    rdb_open_tables.release_table_handler(table_handler);

    /*
      Table stats expects our database and table name to be in system encoding,
      not filename format. Convert before calling callback.
    */
    my_core::filename_to_tablename(dbname.c_str(), dbname_sys,
                                   sizeof(dbname_sys));
    my_core::filename_to_tablename(tablename.c_str(), tablename_sys,
                                   sizeof(tablename_sys));
    (*cb)(dbname_sys, tablename_sys, is_partition, &io_perf_read,
          &io_perf_write, &io_perf, &io_perf, &io_perf, &page_stats,
          &comp_stats, lock_wait_stats, lock_wait_timeout_stats, deadlock_stats,
          rocksdb_hton_name);
  }
}
4256 | #endif |
4257 | static rocksdb::Status check_rocksdb_options_compatibility( |
4258 | const char *const dbpath, const rocksdb::Options &main_opts, |
4259 | const std::vector<rocksdb::ColumnFamilyDescriptor> &cf_descr) { |
4260 | DBUG_ASSERT(rocksdb_datadir != nullptr); |
4261 | |
4262 | rocksdb::DBOptions loaded_db_opt; |
4263 | std::vector<rocksdb::ColumnFamilyDescriptor> loaded_cf_descs; |
4264 | rocksdb::Status status = |
4265 | LoadLatestOptions(dbpath, rocksdb::Env::Default(), &loaded_db_opt, |
4266 | &loaded_cf_descs, rocksdb_ignore_unknown_options); |
4267 | |
4268 | // If we're starting from scratch and there are no options saved yet then this |
4269 | // is a valid case. Therefore we can't compare the current set of options to |
4270 | // anything. |
4271 | if (status.IsNotFound()) { |
4272 | return rocksdb::Status::OK(); |
4273 | } |
4274 | |
4275 | if (!status.ok()) { |
4276 | return status; |
4277 | } |
4278 | |
4279 | if (loaded_cf_descs.size() != cf_descr.size()) { |
4280 | return rocksdb::Status::NotSupported("Mismatched size of column family " |
4281 | "descriptors." ); |
4282 | } |
4283 | |
4284 | // Please see RocksDB documentation for more context about why we need to set |
4285 | // user-defined functions and pointer-typed options manually. |
4286 | for (size_t i = 0; i < loaded_cf_descs.size(); i++) { |
4287 | loaded_cf_descs[i].options.compaction_filter = |
4288 | cf_descr[i].options.compaction_filter; |
4289 | loaded_cf_descs[i].options.compaction_filter_factory = |
4290 | cf_descr[i].options.compaction_filter_factory; |
4291 | loaded_cf_descs[i].options.comparator = cf_descr[i].options.comparator; |
4292 | loaded_cf_descs[i].options.memtable_factory = |
4293 | cf_descr[i].options.memtable_factory; |
4294 | loaded_cf_descs[i].options.merge_operator = |
4295 | cf_descr[i].options.merge_operator; |
4296 | loaded_cf_descs[i].options.prefix_extractor = |
4297 | cf_descr[i].options.prefix_extractor; |
4298 | loaded_cf_descs[i].options.table_factory = |
4299 | cf_descr[i].options.table_factory; |
4300 | } |
4301 | |
4302 | // This is the essence of the function - determine if it's safe to open the |
4303 | // database or not. |
4304 | status = CheckOptionsCompatibility(dbpath, rocksdb::Env::Default(), main_opts, |
4305 | loaded_cf_descs, |
4306 | rocksdb_ignore_unknown_options); |
4307 | |
4308 | return status; |
4309 | } |
4310 | |
/* When true, rocksdb_init_func refuses to initialize: loading MyRocks again
   after it has been unloaded is not supported in the same server process. */
bool prevent_myrocks_loading= false;
4312 | |
4313 | |
4314 | /* |
4315 | Storage Engine initialization function, invoked when plugin is loaded. |
4316 | */ |
4317 | |
4318 | static int rocksdb_init_func(void *const p) { |
4319 | |
4320 | DBUG_ENTER_FUNC(); |
4321 | |
4322 | if (prevent_myrocks_loading) |
4323 | { |
4324 | my_error(ER_INTERNAL_ERROR, MYF(0), |
4325 | "Loading MyRocks plugin after it has been unloaded is not " |
4326 | "supported. Please restart mysqld" ); |
4327 | DBUG_RETURN(1); |
4328 | } |
4329 | |
4330 | if (rdb_check_rocksdb_corruption()) { |
4331 | sql_print_error("RocksDB: There was a corruption detected in RockDB files. " |
4332 | "Check error log emitted earlier for more details." ); |
4333 | if (rocksdb_allow_to_start_after_corruption) { |
4334 | sql_print_information( |
4335 | "RocksDB: Remove rocksdb_allow_to_start_after_corruption to prevent " |
4336 | "server operating if RocksDB corruption is detected." ); |
4337 | } else { |
4338 | sql_print_error("RocksDB: The server will exit normally and stop restart " |
4339 | "attempts. Remove %s file from data directory and " |
4340 | "start mysqld manually." , |
4341 | rdb_corruption_marker_file_name().c_str()); |
4342 | exit(0); |
4343 | } |
4344 | } |
4345 | |
4346 | // Validate the assumption about the size of ROCKSDB_SIZEOF_HIDDEN_PK_COLUMN. |
4347 | static_assert(sizeof(longlong) == 8, "Assuming that longlong is 8 bytes." ); |
4348 | |
4349 | init_rocksdb_psi_keys(); |
4350 | |
4351 | rocksdb_hton = (handlerton *)p; |
4352 | mysql_mutex_init(rdb_psi_open_tbls_mutex_key, &rdb_open_tables.m_mutex, |
4353 | MY_MUTEX_INIT_FAST); |
4354 | #ifdef HAVE_PSI_INTERFACE |
4355 | rdb_bg_thread.init(rdb_signal_bg_psi_mutex_key, rdb_signal_bg_psi_cond_key); |
4356 | rdb_drop_idx_thread.init(rdb_signal_drop_idx_psi_mutex_key, |
4357 | rdb_signal_drop_idx_psi_cond_key); |
4358 | #else |
4359 | rdb_bg_thread.init(); |
4360 | rdb_drop_idx_thread.init(); |
4361 | #endif |
4362 | mysql_mutex_init(rdb_collation_data_mutex_key, &rdb_collation_data_mutex, |
4363 | MY_MUTEX_INIT_FAST); |
4364 | mysql_mutex_init(rdb_mem_cmp_space_mutex_key, &rdb_mem_cmp_space_mutex, |
4365 | MY_MUTEX_INIT_FAST); |
4366 | |
4367 | const char* initial_rocksdb_datadir_for_ignore_dirs= rocksdb_datadir; |
4368 | if (!strncmp(rocksdb_datadir, "./" , 2)) |
4369 | initial_rocksdb_datadir_for_ignore_dirs += 2; |
4370 | ignore_db_dirs_append(initial_rocksdb_datadir_for_ignore_dirs); |
4371 | |
4372 | #if defined(HAVE_PSI_INTERFACE) |
4373 | rdb_collation_exceptions = |
4374 | new Regex_list_handler(key_rwlock_collation_exception_list); |
4375 | #else |
4376 | rdb_collation_exceptions = new Regex_list_handler(); |
4377 | #endif |
4378 | |
4379 | mysql_mutex_init(rdb_sysvars_psi_mutex_key, &rdb_sysvars_mutex, |
4380 | MY_MUTEX_INIT_FAST); |
4381 | Rdb_transaction::init_mutex(); |
4382 | |
4383 | rocksdb_hton->state = SHOW_OPTION_YES; |
4384 | rocksdb_hton->create = rocksdb_create_handler; |
4385 | rocksdb_hton->close_connection = rocksdb_close_connection; |
4386 | |
4387 | rocksdb_hton->prepare = rocksdb_prepare; |
4388 | rocksdb_hton->prepare_ordered = NULL; // Do not need it |
4389 | |
4390 | rocksdb_hton->commit_by_xid = rocksdb_commit_by_xid; |
4391 | rocksdb_hton->rollback_by_xid = rocksdb_rollback_by_xid; |
4392 | rocksdb_hton->recover = rocksdb_recover; |
4393 | |
4394 | rocksdb_hton->commit_ordered= rocksdb_commit_ordered; |
4395 | rocksdb_hton->commit = rocksdb_commit; |
4396 | |
4397 | rocksdb_hton->commit_checkpoint_request= rocksdb_checkpoint_request; |
4398 | |
4399 | rocksdb_hton->rollback = rocksdb_rollback; |
4400 | rocksdb_hton->show_status = rocksdb_show_status; |
4401 | rocksdb_hton->start_consistent_snapshot = |
4402 | rocksdb_start_tx_and_assign_read_view; |
4403 | rocksdb_hton->savepoint_set = rocksdb_savepoint; |
4404 | rocksdb_hton->savepoint_rollback = rocksdb_rollback_to_savepoint; |
4405 | rocksdb_hton->savepoint_rollback_can_release_mdl = |
4406 | rocksdb_rollback_to_savepoint_can_release_mdl; |
4407 | #ifdef MARIAROCKS_NOT_YET |
4408 | rocksdb_hton->update_table_stats = rocksdb_update_table_stats; |
4409 | #endif // MARIAROCKS_NOT_YET |
4410 | |
4411 | /* |
4412 | Not needed in MariaDB: |
4413 | rocksdb_hton->flush_logs = rocksdb_flush_wal; |
4414 | */ |
4415 | |
4416 | rocksdb_hton->flags = HTON_TEMPORARY_NOT_SUPPORTED | |
4417 | HTON_SUPPORTS_EXTENDED_KEYS | HTON_CAN_RECREATE; |
4418 | |
4419 | rocksdb_hton->tablefile_extensions= ha_rocksdb_exts; |
4420 | DBUG_ASSERT(!mysqld_embedded); |
4421 | |
4422 | if (rocksdb_db_options->max_open_files > (long)open_files_limit) { |
4423 | sql_print_information("RocksDB: rocksdb_max_open_files should not be " |
4424 | "greater than the open_files_limit, effective value " |
4425 | "of rocksdb_max_open_files is being set to " |
4426 | "open_files_limit / 2." ); |
4427 | rocksdb_db_options->max_open_files = open_files_limit / 2; |
4428 | } else if (rocksdb_db_options->max_open_files == -2) { |
4429 | rocksdb_db_options->max_open_files = open_files_limit / 2; |
4430 | } |
4431 | |
4432 | rocksdb_stats = rocksdb::CreateDBStatistics(); |
4433 | rocksdb_db_options->statistics = rocksdb_stats; |
4434 | |
4435 | if (rocksdb_rate_limiter_bytes_per_sec != 0) { |
4436 | rocksdb_rate_limiter.reset( |
4437 | rocksdb::NewGenericRateLimiter(rocksdb_rate_limiter_bytes_per_sec)); |
4438 | rocksdb_db_options->rate_limiter = rocksdb_rate_limiter; |
4439 | } |
4440 | |
4441 | rocksdb_db_options->delayed_write_rate = rocksdb_delayed_write_rate; |
4442 | |
4443 | std::shared_ptr<Rdb_logger> myrocks_logger = std::make_shared<Rdb_logger>(); |
4444 | rocksdb::Status s = rocksdb::CreateLoggerFromOptions( |
4445 | rocksdb_datadir, *rocksdb_db_options, &rocksdb_db_options->info_log); |
4446 | if (s.ok()) { |
4447 | myrocks_logger->SetRocksDBLogger(rocksdb_db_options->info_log); |
4448 | } |
4449 | |
4450 | rocksdb_db_options->info_log = myrocks_logger; |
4451 | myrocks_logger->SetInfoLogLevel( |
4452 | static_cast<rocksdb::InfoLogLevel>(rocksdb_info_log_level)); |
4453 | rocksdb_db_options->wal_dir = rocksdb_wal_dir; |
4454 | |
4455 | rocksdb_db_options->wal_recovery_mode = |
4456 | static_cast<rocksdb::WALRecoveryMode>(rocksdb_wal_recovery_mode); |
4457 | |
4458 | rocksdb_db_options->access_hint_on_compaction_start = |
4459 | static_cast<rocksdb::Options::AccessHint>( |
4460 | rocksdb_access_hint_on_compaction_start); |
4461 | |
4462 | if (rocksdb_db_options->allow_mmap_reads && |
4463 | rocksdb_db_options->use_direct_reads) { |
4464 | // allow_mmap_reads implies !use_direct_reads and RocksDB will not open if |
4465 | // mmap_reads and direct_reads are both on. (NO_LINT_DEBUG) |
4466 | sql_print_error("RocksDB: Can't enable both use_direct_reads " |
4467 | "and allow_mmap_reads\n" ); |
4468 | DBUG_RETURN(HA_EXIT_FAILURE); |
4469 | } |
4470 | |
4471 | if (rocksdb_db_options->allow_mmap_writes && |
4472 | rocksdb_db_options->use_direct_io_for_flush_and_compaction) { |
4473 | // See above comment for allow_mmap_reads. (NO_LINT_DEBUG) |
4474 | sql_print_error("RocksDB: Can't enable both " |
4475 | "use_direct_io_for_flush_and_compaction and " |
4476 | "allow_mmap_writes\n" ); |
4477 | DBUG_RETURN(HA_EXIT_FAILURE); |
4478 | } |
4479 | |
4480 | if (rocksdb_db_options->allow_mmap_writes && |
4481 | rocksdb_flush_log_at_trx_commit != FLUSH_LOG_NEVER) { |
4482 | // NO_LINT_DEBUG |
4483 | sql_print_error("RocksDB: rocksdb_flush_log_at_trx_commit needs to be 0 " |
4484 | "to use allow_mmap_writes" ); |
4485 | DBUG_RETURN(HA_EXIT_FAILURE); |
4486 | } |
4487 | |
4488 | // sst_file_manager will move deleted rocksdb sst files to trash_dir |
4489 | // to be deleted in a background thread. |
4490 | std::string trash_dir = std::string(rocksdb_datadir) + "/trash" ; |
4491 | rocksdb_db_options->sst_file_manager.reset(NewSstFileManager( |
4492 | rocksdb_db_options->env, myrocks_logger, trash_dir, |
4493 | rocksdb_sst_mgr_rate_bytes_per_sec, true /* delete_existing_trash */)); |
4494 | |
4495 | std::vector<std::string> cf_names; |
4496 | rocksdb::Status status; |
4497 | status = rocksdb::DB::ListColumnFamilies(*rocksdb_db_options, rocksdb_datadir, |
4498 | &cf_names); |
4499 | if (!status.ok()) { |
4500 | /* |
4501 | When we start on an empty datadir, ListColumnFamilies returns IOError, |
4502 | and RocksDB doesn't provide any way to check what kind of error it was. |
4503 | Checking system errno happens to work right now. |
4504 | */ |
4505 | if (status.IsIOError() |
4506 | #ifndef _WIN32 |
4507 | && errno == ENOENT |
4508 | #endif |
4509 | ) { |
4510 | sql_print_information("RocksDB: Got ENOENT when listing column families" ); |
4511 | sql_print_information( |
4512 | "RocksDB: assuming that we're creating a new database" ); |
4513 | } else { |
4514 | rdb_log_status_error(status, "Error listing column families" ); |
4515 | DBUG_RETURN(HA_EXIT_FAILURE); |
4516 | } |
4517 | } else |
4518 | sql_print_information("RocksDB: %ld column families found" , |
4519 | cf_names.size()); |
4520 | |
4521 | std::vector<rocksdb::ColumnFamilyDescriptor> cf_descr; |
4522 | std::vector<rocksdb::ColumnFamilyHandle *> cf_handles; |
4523 | |
4524 | rocksdb_tbl_options->index_type = |
4525 | (rocksdb::BlockBasedTableOptions::IndexType)rocksdb_index_type; |
4526 | |
4527 | if (!rocksdb_tbl_options->no_block_cache) { |
4528 | std::shared_ptr<rocksdb::Cache> block_cache = rocksdb_use_clock_cache |
4529 | ? rocksdb::NewClockCache(rocksdb_block_cache_size) |
4530 | : rocksdb::NewLRUCache(rocksdb_block_cache_size); |
4531 | if (rocksdb_sim_cache_size > 0) { |
4532 | // Simulated cache enabled |
4533 | // Wrap block cache inside a simulated cache and pass it to RocksDB |
4534 | rocksdb_tbl_options->block_cache = |
4535 | rocksdb::NewSimCache(block_cache, rocksdb_sim_cache_size, 6); |
4536 | } else { |
4537 | // Pass block cache to RocksDB |
4538 | rocksdb_tbl_options->block_cache = block_cache; |
4539 | } |
4540 | } |
4541 | // Using newer BlockBasedTable format version for better compression |
4542 | // and better memory allocation. |
4543 | // See: |
4544 | // https://github.com/facebook/rocksdb/commit/9ab5adfc59a621d12357580c94451d9f7320c2dd |
4545 | rocksdb_tbl_options->format_version = 2; |
4546 | |
4547 | if (rocksdb_collect_sst_properties) { |
4548 | properties_collector_factory = |
4549 | std::make_shared<Rdb_tbl_prop_coll_factory>(&ddl_manager); |
4550 | |
4551 | rocksdb_set_compaction_options(nullptr, nullptr, nullptr, nullptr); |
4552 | |
4553 | RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex); |
4554 | |
4555 | DBUG_ASSERT(rocksdb_table_stats_sampling_pct <= |
4556 | RDB_TBL_STATS_SAMPLE_PCT_MAX); |
4557 | properties_collector_factory->SetTableStatsSamplingPct( |
4558 | rocksdb_table_stats_sampling_pct); |
4559 | |
4560 | RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); |
4561 | } |
4562 | |
4563 | if (rocksdb_persistent_cache_size_mb > 0) { |
4564 | std::shared_ptr<rocksdb::PersistentCache> pcache; |
4565 | uint64_t cache_size_bytes= rocksdb_persistent_cache_size_mb * 1024 * 1024; |
4566 | status = rocksdb::NewPersistentCache( |
4567 | rocksdb::Env::Default(), std::string(rocksdb_persistent_cache_path), |
4568 | cache_size_bytes, myrocks_logger, true, &pcache); |
4569 | if (!status.ok()) { |
4570 | // NO_LINT_DEBUG |
4571 | sql_print_error("RocksDB: Persistent cache returned error: (%s)" , |
4572 | status.getState()); |
4573 | DBUG_RETURN(HA_EXIT_FAILURE); |
4574 | } |
4575 | rocksdb_tbl_options->persistent_cache = pcache; |
4576 | } else if (strlen(rocksdb_persistent_cache_path)) { |
4577 | sql_print_error("RocksDB: Must specify rocksdb_persistent_cache_size_mb" ); |
4578 | DBUG_RETURN(HA_EXIT_FAILURE); |
4579 | } |
4580 | |
4581 | std::unique_ptr<Rdb_cf_options> cf_options_map(new Rdb_cf_options()); |
4582 | if (!cf_options_map->init(*rocksdb_tbl_options, properties_collector_factory, |
4583 | rocksdb_default_cf_options, |
4584 | rocksdb_override_cf_options)) { |
4585 | // NO_LINT_DEBUG |
4586 | sql_print_error("RocksDB: Failed to initialize CF options map." ); |
4587 | DBUG_RETURN(HA_EXIT_FAILURE); |
4588 | } |
4589 | |
4590 | /* |
4591 | If there are no column families, we're creating the new database. |
4592 | Create one column family named "default". |
4593 | */ |
4594 | if (cf_names.size() == 0) |
4595 | cf_names.push_back(DEFAULT_CF_NAME); |
4596 | |
4597 | std::vector<int> compaction_enabled_cf_indices; |
4598 | sql_print_information("RocksDB: Column Families at start:" ); |
4599 | for (size_t i = 0; i < cf_names.size(); ++i) { |
4600 | rocksdb::ColumnFamilyOptions opts; |
4601 | cf_options_map->get_cf_options(cf_names[i], &opts); |
4602 | |
4603 | sql_print_information(" cf=%s" , cf_names[i].c_str()); |
4604 | sql_print_information(" write_buffer_size=%ld" , opts.write_buffer_size); |
4605 | sql_print_information(" target_file_size_base=%" PRIu64, |
4606 | opts.target_file_size_base); |
4607 | |
4608 | /* |
4609 | Temporarily disable compactions to prevent a race condition where |
4610 | compaction starts before compaction filter is ready. |
4611 | */ |
4612 | if (!opts.disable_auto_compactions) { |
4613 | compaction_enabled_cf_indices.push_back(i); |
4614 | opts.disable_auto_compactions = true; |
4615 | } |
4616 | cf_descr.push_back(rocksdb::ColumnFamilyDescriptor(cf_names[i], opts)); |
4617 | } |
4618 | |
4619 | rocksdb::Options main_opts(*rocksdb_db_options, |
4620 | cf_options_map->get_defaults()); |
4621 | |
4622 | rocksdb::TransactionDBOptions tx_db_options; |
4623 | tx_db_options.transaction_lock_timeout = 2; // 2 seconds |
4624 | tx_db_options.custom_mutex_factory = std::make_shared<Rdb_mutex_factory>(); |
4625 | |
4626 | status = |
4627 | check_rocksdb_options_compatibility(rocksdb_datadir, main_opts, cf_descr); |
4628 | |
4629 | // We won't start if we'll determine that there's a chance of data corruption |
4630 | // because of incompatible options. |
4631 | if (!status.ok()) { |
4632 | rdb_log_status_error( |
4633 | status, "Compatibility check against existing database options failed" ); |
4634 | DBUG_RETURN(HA_EXIT_FAILURE); |
4635 | } |
4636 | |
4637 | status = rocksdb::TransactionDB::Open( |
4638 | main_opts, tx_db_options, rocksdb_datadir, cf_descr, &cf_handles, &rdb); |
4639 | |
4640 | if (!status.ok()) { |
4641 | rdb_log_status_error(status, "Error opening instance" ); |
4642 | DBUG_RETURN(HA_EXIT_FAILURE); |
4643 | } |
4644 | cf_manager.init(std::move(cf_options_map), &cf_handles); |
4645 | |
4646 | if (dict_manager.init(rdb->GetBaseDB(), &cf_manager)) { |
4647 | // NO_LINT_DEBUG |
4648 | sql_print_error("RocksDB: Failed to initialize data dictionary." ); |
4649 | DBUG_RETURN(HA_EXIT_FAILURE); |
4650 | } |
4651 | |
4652 | if (binlog_manager.init(&dict_manager)) { |
4653 | // NO_LINT_DEBUG |
4654 | sql_print_error("RocksDB: Failed to initialize binlog manager." ); |
4655 | DBUG_RETURN(HA_EXIT_FAILURE); |
4656 | } |
4657 | |
4658 | if (ddl_manager.init(&dict_manager, &cf_manager, rocksdb_validate_tables)) { |
4659 | // NO_LINT_DEBUG |
4660 | sql_print_error("RocksDB: Failed to initialize DDL manager." ); |
4661 | DBUG_RETURN(HA_EXIT_FAILURE); |
4662 | } |
4663 | |
4664 | Rdb_sst_info::init(rdb); |
4665 | |
4666 | /* |
4667 | Enable auto compaction, things needed for compaction filter are finished |
4668 | initializing |
4669 | */ |
4670 | std::vector<rocksdb::ColumnFamilyHandle *> compaction_enabled_cf_handles; |
4671 | compaction_enabled_cf_handles.reserve(compaction_enabled_cf_indices.size()); |
4672 | for (const auto &index : compaction_enabled_cf_indices) { |
4673 | compaction_enabled_cf_handles.push_back(cf_handles[index]); |
4674 | } |
4675 | |
4676 | status = rdb->EnableAutoCompaction(compaction_enabled_cf_handles); |
4677 | |
4678 | if (!status.ok()) { |
4679 | rdb_log_status_error(status, "Error enabling compaction" ); |
4680 | DBUG_RETURN(HA_EXIT_FAILURE); |
4681 | } |
4682 | |
4683 | auto err = rdb_bg_thread.create_thread(BG_THREAD_NAME |
4684 | #ifdef HAVE_PSI_INTERFACE |
4685 | , |
4686 | rdb_background_psi_thread_key |
4687 | #endif |
4688 | ); |
4689 | if (err != 0) { |
4690 | sql_print_error("RocksDB: Couldn't start the background thread: (errno=%d)" , |
4691 | err); |
4692 | DBUG_RETURN(HA_EXIT_FAILURE); |
4693 | } |
4694 | |
4695 | err = rdb_drop_idx_thread.create_thread(INDEX_THREAD_NAME |
4696 | #ifdef HAVE_PSI_INTERFACE |
4697 | , |
4698 | rdb_drop_idx_psi_thread_key |
4699 | #endif |
4700 | ); |
4701 | if (err != 0) { |
4702 | sql_print_error("RocksDB: Couldn't start the drop index thread: (errno=%d)" , |
4703 | err); |
4704 | DBUG_RETURN(HA_EXIT_FAILURE); |
4705 | } |
4706 | |
4707 | rdb_set_collation_exception_list(rocksdb_strict_collation_exceptions); |
4708 | |
4709 | if (rocksdb_pause_background_work) { |
4710 | rdb->PauseBackgroundWork(); |
4711 | } |
4712 | |
4713 | // NO_LINT_DEBUG |
4714 | sql_print_information("RocksDB: global statistics using %s indexer" , |
4715 | STRINGIFY_ARG(RDB_INDEXER)); |
4716 | #if defined(HAVE_SCHED_GETCPU) |
4717 | if (sched_getcpu() == -1) { |
4718 | // NO_LINT_DEBUG |
4719 | sql_print_information( |
4720 | "RocksDB: sched_getcpu() failed - " |
4721 | "global statistics will use thread_id_indexer_t instead" ); |
4722 | } |
4723 | #endif |
4724 | |
4725 | /** |
4726 | Rocksdb does not always shutdown its threads, when |
4727 | plugin is shut down. Disable server's leak check |
4728 | at exit to avoid crash. |
4729 | */ |
4730 | my_disable_leak_check = true; |
4731 | |
4732 | err = my_error_register(rdb_get_error_messages, HA_ERR_ROCKSDB_FIRST, |
4733 | HA_ERR_ROCKSDB_LAST); |
4734 | if (err != 0) { |
4735 | // NO_LINT_DEBUG |
4736 | sql_print_error("RocksDB: Couldn't initialize error messages" ); |
4737 | rdb_open_tables.m_hash.~Rdb_table_set(); |
4738 | DBUG_RETURN(HA_EXIT_FAILURE); |
4739 | } |
4740 | |
4741 | |
4742 | |
4743 | // Creating an instance of HistogramImpl should only happen after RocksDB |
4744 | // has been successfully initialized. |
4745 | commit_latency_stats = new rocksdb::HistogramImpl(); |
4746 | |
4747 | // Construct a list of directories which will be monitored by I/O watchdog |
4748 | // to make sure that we won't lose write access to them. |
4749 | std::vector<std::string> directories; |
4750 | |
4751 | // 1. Data directory. |
4752 | directories.push_back(mysql_real_data_home); |
4753 | |
4754 | // 2. Transaction logs. |
4755 | if (myrocks::rocksdb_wal_dir && *myrocks::rocksdb_wal_dir) { |
4756 | directories.push_back(myrocks::rocksdb_wal_dir); |
4757 | } |
4758 | |
4759 | #if !defined(_WIN32) && !defined(__APPLE__) |
4760 | io_watchdog = new Rdb_io_watchdog(directories); |
4761 | io_watchdog->reset_timeout(rocksdb_io_write_timeout_secs); |
4762 | #endif |
4763 | |
4764 | // NO_LINT_DEBUG |
4765 | sql_print_information("MyRocks storage engine plugin has been successfully " |
4766 | "initialized." ); |
4767 | |
4768 | DBUG_RETURN(HA_EXIT_SUCCESS); |
4769 | } |
4770 | |
4771 | /* |
4772 | Storage Engine deinitialization function, invoked when plugin is unloaded. |
4773 | */ |
4774 | |
static int rocksdb_done_func(void *const p) {
  DBUG_ENTER_FUNC();

  // 'p' is the plugin argument supplied by the plugin API; unused here.
  int error = 0;

  // signal the drop index thread to stop
  rdb_drop_idx_thread.signal(true);

  // Flush all memtables for not losing data, even if WAL is disabled.
  rocksdb_flush_all_memtables();

  // Stop all rocksdb background work
  CancelAllBackgroundWork(rdb->GetBaseDB(), true);

  // Signal the background thread to stop and to persist all stats collected
  // from background flushes and compactions. This will add more keys to a new
  // memtable, but since the memtables were just flushed, it should not trigger
  // a flush that can stall due to background threads being stopped. As long
  // as these keys are stored in a WAL file, they can be retrieved on restart.
  rdb_bg_thread.signal(true);

  // Wait for the background thread to finish.
  auto err = rdb_bg_thread.join();
  if (err != 0) {
    // We'll log the message and continue because we're shutting down and
    // continuation is the optimal strategy.
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the background thread: (errno=%d)" ,
                    err);
  }

  // Wait for the drop index thread to finish.
  err = rdb_drop_idx_thread.join();
  if (err != 0) {
    // NO_LINT_DEBUG
    sql_print_error("RocksDB: Couldn't stop the index thread: (errno=%d)" , err);
  }

  if (rdb_open_tables.m_hash.size()) {
    // Looks like we are getting unloaded and yet we have some open tables
    // left behind. Report the problem to the caller via a non-zero return.
    error = 1;
  }

  /*
    MariaDB: When the plugin is unloaded with UNINSTALL SONAME command, some
    connections may still have Rdb_transaction objects.

    These objects are not genuine transactions (as SQL layer makes sure that
    a plugin that is being unloaded has no open tables), they are empty
    Rdb_transaction objects that were left there to save on object
    creation/deletion.

    Go through the list and delete them.
  */
  {
    // Walker that collects all leftover Rdb_transaction objects; they are
    // deleted after the walk completes (deleting during the walk would
    // mutate the list being traversed).
    class Rdb_trx_deleter: public Rdb_tx_list_walker {
    public:
      std::set<Rdb_transaction*> rdb_trxs;

      void process_tran(const Rdb_transaction *const tx) override {
        /*
          Check if the transaction is really empty. We only check
          non-WriteBatch-based transactions, because there is no easy way to
          check WriteBatch-based transactions.
        */
        if (!tx->is_writebatch_trx()) {
          const auto tx_impl = static_cast<const Rdb_transaction_impl *>(tx);
          DBUG_ASSERT(tx_impl);
          if (tx_impl->get_rdb_trx())
            DBUG_ASSERT(0);
        }
        rdb_trxs.insert((Rdb_transaction*)tx);
      };
    } deleter;

    Rdb_transaction::walk_tx_list(&deleter);

    for (std::set<Rdb_transaction*>::iterator it= deleter.rdb_trxs.begin();
         it != deleter.rdb_trxs.end();
         ++it)
    {
      // When a transaction is deleted, it removes itself from s_tx_list.
      delete *it;
    }
  }

  /*
    destructors for static objects can be called at _exit(),
    but we want to free the memory at dlclose()
  */
  rdb_open_tables.m_hash.~Rdb_table_set();
  mysql_mutex_destroy(&rdb_open_tables.m_mutex);
  mysql_mutex_destroy(&rdb_sysvars_mutex);


  delete rdb_collation_exceptions;

  mysql_mutex_destroy(&rdb_collation_data_mutex);
  mysql_mutex_destroy(&rdb_mem_cmp_space_mutex);

  Rdb_transaction::term_mutex();

  for (auto &it : rdb_collation_data) {
    delete it;
    it = nullptr;
  }

  ddl_manager.cleanup();
  binlog_manager.cleanup();
  dict_manager.cleanup();
  cf_manager.cleanup();

  delete rdb;
  rdb = nullptr;

  delete commit_latency_stats;
  commit_latency_stats = nullptr;

#if !defined(_WIN32) && !defined(__APPLE__)
  delete io_watchdog;
  io_watchdog = nullptr;
#endif

  // Disown the cache data since we're shutting down.
  // This results in memory leaks but it improved the shutdown time.
  // Don't disown when running under valgrind
#ifndef HAVE_purify
  if (rocksdb_tbl_options->block_cache) {
    rocksdb_tbl_options->block_cache->DisownData();
  }
#endif /* HAVE_purify */

  /*
    MariaDB: don't clear rocksdb_db_options and rocksdb_tbl_options.
    MyRocks' plugin variables refer to them.

    The plugin cannot be loaded again (see prevent_myrocks_loading) but plugin
    variables are processed before myrocks::rocksdb_init_func is invoked, so
    they must point to valid memory.
  */
  //rocksdb_db_options = nullptr;
  rocksdb_db_options->statistics = nullptr;
  //rocksdb_tbl_options = nullptr;
  rocksdb_stats = nullptr;

  my_error_unregister(HA_ERR_ROCKSDB_FIRST, HA_ERR_ROCKSDB_LAST);

  /*
    Prevent loading the plugin after it has been loaded and then unloaded. This
    doesn't work currently.
  */
  prevent_myrocks_loading= true;

  DBUG_RETURN(error);
}
4931 | |
4932 | static inline void rocksdb_smart_seek(bool seek_backward, |
4933 | rocksdb::Iterator *const iter, |
4934 | const rocksdb::Slice &key_slice) { |
4935 | if (seek_backward) { |
4936 | iter->SeekForPrev(key_slice); |
4937 | } else { |
4938 | iter->Seek(key_slice); |
4939 | } |
4940 | } |
4941 | |
4942 | static inline void rocksdb_smart_next(bool seek_backward, |
4943 | rocksdb::Iterator *const iter) { |
4944 | if (seek_backward) { |
4945 | iter->Prev(); |
4946 | } else { |
4947 | iter->Next(); |
4948 | } |
4949 | } |
4950 | |
#ifndef NDEBUG
// simulate that RocksDB has reported corrupted data
// (debug-only hook fired via the "rocksdb_return_status_corrupted"
// DBUG_EXECUTE_IF point)
static void dbug_change_status_to_corrupted(rocksdb::Status *status) {
  *status = rocksdb::Status::Corruption();
}
#endif
4957 | |
4958 | // If the iterator is not valid it might be because of EOF but might be due |
4959 | // to IOError or corruption. The good practice is always check it. |
4960 | // https://github.com/facebook/rocksdb/wiki/Iterator#error-handling |
4961 | static inline bool is_valid(rocksdb::Iterator *scan_it) { |
4962 | if (scan_it->Valid()) { |
4963 | return true; |
4964 | } else { |
4965 | rocksdb::Status s = scan_it->status(); |
4966 | DBUG_EXECUTE_IF("rocksdb_return_status_corrupted" , |
4967 | dbug_change_status_to_corrupted(&s);); |
4968 | if (s.IsIOError() || s.IsCorruption()) { |
4969 | if (s.IsCorruption()) { |
4970 | rdb_persist_corruption_marker(); |
4971 | } |
4972 | rdb_handle_io_error(s, RDB_IO_ERROR_GENERAL); |
4973 | } |
4974 | return false; |
4975 | } |
4976 | } |
4977 | |
4978 | /** |
4979 | @brief |
4980 | Example of simple lock controls. The "table_handler" it creates is a |
4981 | structure we will pass to each ha_rocksdb handler. Do you have to have |
4982 | one of these? Well, you have pieces that are used for locking, and |
4983 | they are needed to function. |
4984 | */ |
4985 | |
Rdb_table_handler *
Rdb_open_tables_map::get_table_handler(const char *const table_name) {
  Rdb_table_handler *table_handler;
  uint length;
  char *tmp_name;

  DBUG_ASSERT(table_name != nullptr);
  length = (uint)strlen(table_name);

  // First, look up the table in the hash map.
  RDB_MUTEX_LOCK_CHECK(m_mutex);
  if (!m_hash.size() || !(table_handler = m_hash.find(table_name, length))) {
    // Since we did not find it in the hash map, attempt to create and add it
    // to the hash map.
    // my_multi_malloc carves the handler struct and the copy of the table
    // name out of one allocation, so a single my_free() releases both.
    if (!(table_handler = reinterpret_cast<Rdb_table_handler *>(my_multi_malloc(
              MYF(MY_WME | MY_ZEROFILL), &table_handler, sizeof(*table_handler),
              &tmp_name, length + 1, NullS)))) {
      // Allocating a new Rdb_table_handler and a new table name failed.
      RDB_MUTEX_UNLOCK_CHECK(m_mutex);
      return nullptr;
    }

    table_handler->m_ref_count = 0;
    table_handler->m_table_name_length = length;
    table_handler->m_table_name = tmp_name;
    strmov(table_handler->m_table_name, table_name);

    if (m_hash.insert(table_handler)) {
      // Inserting into the hash map failed.
      RDB_MUTEX_UNLOCK_CHECK(m_mutex);
      my_free(table_handler);
      return nullptr;
    }

    thr_lock_init(&table_handler->m_thr_lock);
#ifdef MARIAROCKS_NOT_YET
    table_handler->m_io_perf_read.init();
    table_handler->m_io_perf_write.init();
#endif
  }
  // Each caller takes one reference; it is dropped in release_table_handler().
  DBUG_ASSERT(table_handler->m_ref_count >= 0);
  table_handler->m_ref_count++;

  RDB_MUTEX_UNLOCK_CHECK(m_mutex);

  return table_handler;
}
5033 | |
5034 | std::vector<std::string> rdb_get_open_table_names(void) { |
5035 | return rdb_open_tables.get_table_names(); |
5036 | } |
5037 | |
5038 | std::vector<std::string> Rdb_open_tables_map::get_table_names(void) const { |
5039 | size_t i; |
5040 | const Rdb_table_handler *table_handler; |
5041 | std::vector<std::string> names; |
5042 | |
5043 | RDB_MUTEX_LOCK_CHECK(m_mutex); |
5044 | for (i = 0; (table_handler = m_hash.at(i)); i++) { |
5045 | DBUG_ASSERT(table_handler != nullptr); |
5046 | names.push_back(table_handler->m_table_name); |
5047 | } |
5048 | DBUG_ASSERT(i == m_hash.size()); |
5049 | RDB_MUTEX_UNLOCK_CHECK(m_mutex); |
5050 | |
5051 | return names; |
5052 | } |
5053 | |
5054 | /* |
5055 | Inspired by innobase_get_int_col_max_value from InnoDB. This returns the |
5056 | maximum value a type can take on. |
5057 | */ |
5058 | static ulonglong rdb_get_int_col_max_value(const Field *field) { |
5059 | ulonglong max_value = 0; |
5060 | switch (field->key_type()) { |
5061 | case HA_KEYTYPE_BINARY: |
5062 | max_value = 0xFFULL; |
5063 | break; |
5064 | case HA_KEYTYPE_INT8: |
5065 | max_value = 0x7FULL; |
5066 | break; |
5067 | case HA_KEYTYPE_USHORT_INT: |
5068 | max_value = 0xFFFFULL; |
5069 | break; |
5070 | case HA_KEYTYPE_SHORT_INT: |
5071 | max_value = 0x7FFFULL; |
5072 | break; |
5073 | case HA_KEYTYPE_UINT24: |
5074 | max_value = 0xFFFFFFULL; |
5075 | break; |
5076 | case HA_KEYTYPE_INT24: |
5077 | max_value = 0x7FFFFFULL; |
5078 | break; |
5079 | case HA_KEYTYPE_ULONG_INT: |
5080 | max_value = 0xFFFFFFFFULL; |
5081 | break; |
5082 | case HA_KEYTYPE_LONG_INT: |
5083 | max_value = 0x7FFFFFFFULL; |
5084 | break; |
5085 | case HA_KEYTYPE_ULONGLONG: |
5086 | max_value = 0xFFFFFFFFFFFFFFFFULL; |
5087 | break; |
5088 | case HA_KEYTYPE_LONGLONG: |
5089 | max_value = 0x7FFFFFFFFFFFFFFFULL; |
5090 | break; |
5091 | case HA_KEYTYPE_FLOAT: |
5092 | max_value = 0x1000000ULL; |
5093 | break; |
5094 | case HA_KEYTYPE_DOUBLE: |
5095 | max_value = 0x20000000000000ULL; |
5096 | break; |
5097 | default: |
5098 | abort(); |
5099 | } |
5100 | |
5101 | return max_value; |
5102 | } |
5103 | |
// Initialize m_tbl_def->m_auto_incr_val from the data dictionary and/or the
// last row in the index, falling back to 1 for an empty table.
void ha_rocksdb::load_auto_incr_value() {
  ulonglong auto_incr = 0;
  bool validate_last = false, use_datadic = true;
#ifndef NDEBUG
  // In debug builds we always also read the index so the dictionary value
  // can be cross-checked; the dbug point simulates a pre-upgrade server
  // that has no persisted auto-increment value.
  DBUG_EXECUTE_IF("myrocks_autoinc_upgrade" , use_datadic = false;);
  validate_last = true;
#endif

  if (use_datadic && dict_manager.get_auto_incr_val(
                         m_tbl_def->get_autoincr_gl_index_id(), &auto_incr)) {
    update_auto_incr_val(auto_incr);
  }

  // If we find nothing in the data dictionary, or if we are in debug mode,
  // then call index_last to get the last value.
  //
  // This is needed when upgrading from a server that did not support
  // persistent auto_increment, or if the table is empty.
  //
  // For debug mode, we are just verifying that the data dictionary value is
  // greater than or equal to the maximum value in the table.
  if (auto_incr == 0 || validate_last) {
    auto_incr = load_auto_incr_value_from_index();
    update_auto_incr_val(auto_incr);
  }

  // If we failed to find anything from the data dictionary and index, then
  // initialize auto_increment to 1.
  if (m_tbl_def->m_auto_incr_val == 0) {
    update_auto_incr_val(1);
  }
}
5136 | |
// Read the last (largest) auto-increment value present in the table's
// auto-increment index and return it + 1 (capped at the column's max value).
// Returns 0 if the index is empty. Temporarily switches active_index and
// keyread mode; both are restored before returning.
ulonglong ha_rocksdb::load_auto_incr_value_from_index() {
  const int save_active_index = active_index;
  active_index = table->s->next_number_index;
  const uint8 save_table_status = table->status;
  ulonglong last_val = 0;

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();
  if (is_new_snapshot) {
    tx->acquire_snapshot(true);
  }

  // Do a lookup. We only need index column, so it should be index-only.
  // (another reason to make it index-only is that table->read_set is not set
  // appropriately and non-index-only lookup will not read the value)
  const bool save_keyread_only = m_keyread_only;
  m_keyread_only = true;
  m_key_requested = true;

  if (!index_last(table->record[0])) {
    Field *field =
        table->key_info[table->s->next_number_index].key_part[0].field;
    ulonglong max_val = rdb_get_int_col_max_value(field);
    // Temporarily mark all columns readable so val_int() passes the
    // debug-build read_set checks.
    my_bitmap_map *const old_map =
        dbug_tmp_use_all_columns(table, table->read_set);
    last_val = field->val_int();
    if (last_val != max_val) {
      last_val++;
    }
#ifndef NDEBUG
    // Debug cross-check: the persisted dictionary value must never be
    // behind the largest value actually stored in the index.
    ulonglong dd_val;
    if (last_val <= max_val) {
      const auto &gl_index_id = m_tbl_def->get_autoincr_gl_index_id();
      if (dict_manager.get_auto_incr_val(gl_index_id, &dd_val) &&
          tx->get_auto_incr(gl_index_id) == 0) {
        DBUG_ASSERT(dd_val >= last_val);
      }
    }
#endif
    dbug_tmp_restore_column_map(table->read_set, old_map);
  }

  m_keyread_only = save_keyread_only;
  if (is_new_snapshot) {
    tx->release_snapshot();
  }

  table->status = save_table_status;
  active_index = save_active_index;

  /*
    Do what ha_rocksdb::index_end() does.
    (Why don't we use index_init/index_end? class handler defines index_init
    as private, for some reason).
  */
  release_scan_iterator();

  return last_val;
}
5196 | |
// Lock-free monotonic update: raise m_tbl_def->m_auto_incr_val to at least
// 'val'. On CAS failure compare_exchange_weak reloads auto_incr_val with the
// current stored value, so each iteration re-checks against fresh data; the
// loop exits as soon as the stored value is already >= val.
void ha_rocksdb::update_auto_incr_val(ulonglong val) {
  ulonglong auto_incr_val = m_tbl_def->m_auto_incr_val;
  while (
      auto_incr_val < val &&
      !m_tbl_def->m_auto_incr_val.compare_exchange_weak(auto_incr_val, val)) {
    // Do nothing - just loop until auto_incr_val is >= val or we successfully
    // set it
  }
}
5206 | |
5207 | void ha_rocksdb::update_auto_incr_val_from_field() { |
5208 | Field *field; |
5209 | ulonglong new_val, max_val; |
5210 | field = table->key_info[table->s->next_number_index].key_part[0].field; |
5211 | max_val = rdb_get_int_col_max_value(field); |
5212 | |
5213 | my_bitmap_map *const old_map = |
5214 | dbug_tmp_use_all_columns(table, table->read_set); |
5215 | new_val = field->val_int(); |
5216 | // don't increment if we would wrap around |
5217 | if (new_val != max_val) { |
5218 | new_val++; |
5219 | } |
5220 | |
5221 | dbug_tmp_restore_column_map(table->read_set, old_map); |
5222 | |
5223 | // Only update if positive value was set for auto_incr column. |
5224 | if (new_val <= max_val) { |
5225 | Rdb_transaction *const tx = get_or_create_tx(table->in_use); |
5226 | tx->set_auto_incr(m_tbl_def->get_autoincr_gl_index_id(), new_val); |
5227 | |
5228 | // Update the in memory auto_incr value in m_tbl_def. |
5229 | update_auto_incr_val(new_val); |
5230 | } |
5231 | } |
5232 | |
// Initialize m_tbl_def->m_hidden_pk_val from the last hidden-PK value stored
// in the table (or 1 for an empty table). Uses a fresh transaction so the
// read sees the latest committed data rather than an older open snapshot.
// Returns HA_EXIT_SUCCESS or an error code from decoding the row key.
int ha_rocksdb::load_hidden_pk_value() {
  const int save_active_index = active_index;
  // The hidden PK is always the last key in m_tbl_def.
  active_index = m_tbl_def->m_key_count - 1;
  const uint8 save_table_status = table->status;

  /*
    We should read the latest committed value in the database.
    That is, if we have an open transaction with a snapshot, we should not use
    it as we may get old data. Start a new transaction to read the latest
    value.
  */
  Rdb_transaction *const temp_tx = new Rdb_transaction_impl(table->in_use);
  temp_tx->start_tx();
  // Swap the thread's transaction slot to the temporary transaction; the
  // original is restored (and temp_tx deleted) before returning.
  Rdb_transaction *&tx = get_tx_from_thd(table->in_use);
  Rdb_transaction *save_tx= tx;
  tx= temp_tx;

  longlong hidden_pk_id = 1;
  // Do a lookup.
  if (!index_last(table->record[0])) {
    /*
      Decode PK field from the key
    */
    auto err = read_hidden_pk_id_from_rowkey(&hidden_pk_id);
    if (err) {
      // Restore the original transaction before bailing out.
      delete tx;
      tx= save_tx;
      return err;
    }

    hidden_pk_id++;
  }

  // Lock-free monotonic update of m_hidden_pk_val (same CAS pattern as
  // update_auto_incr_val()).
  longlong old = m_tbl_def->m_hidden_pk_val;
  while (old < hidden_pk_id &&
         !m_tbl_def->m_hidden_pk_val.compare_exchange_weak(old, hidden_pk_id)) {
  }

  delete tx;
  tx= save_tx;

  table->status = save_table_status;
  active_index = save_active_index;

  release_scan_iterator();

  return HA_EXIT_SUCCESS;
}
5281 | |
5282 | /* Get PK value from m_tbl_def->m_hidden_pk_info. */ |
5283 | longlong ha_rocksdb::update_hidden_pk_val() { |
5284 | DBUG_ASSERT(has_hidden_pk(table)); |
5285 | const longlong new_val = m_tbl_def->m_hidden_pk_val++; |
5286 | return new_val; |
5287 | } |
5288 | |
5289 | /* Get the id of the hidden pk id from m_last_rowkey */ |
5290 | int ha_rocksdb::read_hidden_pk_id_from_rowkey(longlong *const hidden_pk_id) { |
5291 | DBUG_ASSERT(hidden_pk_id != nullptr); |
5292 | DBUG_ASSERT(table != nullptr); |
5293 | DBUG_ASSERT(has_hidden_pk(table)); |
5294 | |
5295 | rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(), m_last_rowkey.length()); |
5296 | |
5297 | // Get hidden primary key from old key slice |
5298 | Rdb_string_reader reader(&rowkey_slice); |
5299 | if ((!reader.read(Rdb_key_def::INDEX_NUMBER_SIZE))) |
5300 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5301 | |
5302 | const int length= 8; /* was Field_longlong::PACK_LENGTH in FB MySQL tree */ |
5303 | const uchar *from = reinterpret_cast<const uchar *>(reader.read(length)); |
5304 | if (from == nullptr) { |
5305 | /* Mem-comparable image doesn't have enough bytes */ |
5306 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5307 | } |
5308 | |
5309 | *hidden_pk_id = rdb_netbuf_read_uint64(&from); |
5310 | return HA_EXIT_SUCCESS; |
5311 | } |
5312 | |
5313 | /** |
5314 | @brief |
5315 | Free lock controls. We call this whenever we close a table. If the table had |
5316 | the last reference to the table_handler, then we free the memory associated |
5317 | with it. |
5318 | */ |
5319 | |
5320 | void Rdb_open_tables_map::release_table_handler( |
5321 | Rdb_table_handler *const table_handler) { |
5322 | RDB_MUTEX_LOCK_CHECK(m_mutex); |
5323 | |
5324 | DBUG_ASSERT(table_handler != nullptr); |
5325 | DBUG_ASSERT(table_handler->m_ref_count > 0); |
5326 | if (!--table_handler->m_ref_count) { |
5327 | // Last reference was released. Tear down the hash entry. |
5328 | const auto ret MY_ATTRIBUTE((__unused__)) = m_hash.remove(table_handler); |
5329 | DBUG_ASSERT(!ret); // the hash entry must actually be found and deleted |
5330 | my_core::thr_lock_delete(&table_handler->m_thr_lock); |
5331 | my_free(table_handler); |
5332 | } |
5333 | |
5334 | RDB_MUTEX_UNLOCK_CHECK(m_mutex); |
5335 | } |
5336 | |
5337 | static handler *rocksdb_create_handler(my_core::handlerton *const hton, |
5338 | my_core::TABLE_SHARE *const table_arg, |
5339 | my_core::MEM_ROOT *const mem_root) { |
5340 | return new (mem_root) ha_rocksdb(hton, table_arg); |
5341 | } |
5342 | |
// Constructor: initialize all handler state to "not opened" defaults.
// All buffers/descriptors are null here; they are allocated when the table
// is opened.
ha_rocksdb::ha_rocksdb(my_core::handlerton *const hton,
                       my_core::TABLE_SHARE *const table_arg)
    : handler(hton, table_arg), m_table_handler(nullptr), m_scan_it(nullptr),
      m_scan_it_skips_bloom(false), m_scan_it_snapshot(nullptr),
      m_tbl_def(nullptr), m_pk_descr(nullptr), m_key_descr_arr(nullptr),
      m_pk_can_be_decoded(false), m_maybe_unpack_info(false),
      m_pk_tuple(nullptr), m_pk_packed_tuple(nullptr),
      m_sk_packed_tuple(nullptr), m_end_key_packed_tuple(nullptr),
      m_sk_match_prefix(nullptr), m_sk_match_prefix_buf(nullptr),
      m_sk_packed_tuple_old(nullptr), m_dup_sk_packed_tuple(nullptr),
      m_dup_sk_packed_tuple_old(nullptr), m_eq_cond_lower_bound(nullptr),
      m_eq_cond_upper_bound(nullptr), m_pack_buffer(nullptr),
      m_lock_rows(RDB_LOCK_NONE), m_keyread_only(FALSE), m_encoder_arr(nullptr),
      m_row_checksums_checked(0), m_in_rpl_delete_rows(false),
      m_in_rpl_update_rows(false), m_force_skip_unique_check(false) {}
5358 | |
5359 | |
5360 | const std::string &ha_rocksdb::get_table_basename() const { |
5361 | return m_tbl_def->base_tablename(); |
5362 | } |
5363 | |
5364 | /** |
5365 | @return |
5366 | false OK |
  other Error unpacking the data
5368 | */ |
5369 | bool ha_rocksdb::init_with_fields() { |
5370 | DBUG_ENTER_FUNC(); |
5371 | |
5372 | const uint pk = table_share->primary_key; |
5373 | if (pk != MAX_KEY) { |
5374 | const uint key_parts = table_share->key_info[pk].user_defined_key_parts; |
5375 | check_keyread_allowed(pk /*PK*/, key_parts - 1, true); |
5376 | } else |
5377 | m_pk_can_be_decoded = false; |
5378 | |
5379 | cached_table_flags = table_flags(); |
5380 | |
5381 | DBUG_RETURN(false); /* Ok */ |
5382 | } |
5383 | |
5384 | /* |
5385 | If the key is a TTL key, we may need to filter it out. |
5386 | |
5387 | The purpose of read filtering for tables with TTL is to ensure that |
5388 | during a transaction a key which has expired already but not removed by |
5389 | compaction yet is not returned to the user. |
5390 | |
5391 | Without this the user might be hit with problems such as disappearing |
5392 | rows within a transaction, etc, because the compaction filter ignores |
5393 | snapshots when filtering keys. |
5394 | */ |
// Decide whether a TTL record should be hidden from the current read.
// @param kd           key definition (must have TTL enabled)
// @param ttl_rec_val  the record's value slice, which carries the 8-byte
//                     TTL timestamp at kd.m_ttl_rec_offset
// @param curr_ts      current/snapshot timestamp to compare against
// @return true if the record has expired and must be filtered out
bool ha_rocksdb::should_hide_ttl_rec(const Rdb_key_def &kd,
                                     const rocksdb::Slice &ttl_rec_val,
                                     const int64_t curr_ts) {
  DBUG_ASSERT(kd.has_ttl());
  DBUG_ASSERT(kd.m_ttl_rec_offset != UINT_MAX);

  /*
    Curr_ts can only be 0 if there are no snapshots open.
    should_hide_ttl_rec can only be called when there is >=1 snapshots, unless
    we are filtering on the write path (single INSERT/UPDATE) in which case
    we are passed in the current time as curr_ts.

    In the event curr_ts is 0, we always decide not to filter the record and
    increment a diagnostic counter.
  */
  if (curr_ts == 0) {
    update_row_stats(ROWS_HIDDEN_NO_SNAPSHOT);
    return false;
  }

  if (!rdb_is_ttl_read_filtering_enabled() || !rdb_is_ttl_enabled()) {
    return false;
  }

  Rdb_string_reader reader(&ttl_rec_val);

  /*
    Find where the 8-byte ttl is for each record in this index.
  */
  uint64 ts;
  if (!reader.read(kd.m_ttl_rec_offset) || reader.read_uint64(&ts)) {
    /*
      This condition should never be reached since all TTL records have an
      8 byte ttl field in front. Don't filter the record out, and log an error.
    */
    std::string buf;
    buf = rdb_hexdump(ttl_rec_val.data(), ttl_rec_val.size(),
                      RDB_MAX_HEXDUMP_LEN);
    const GL_INDEX_ID gl_index_id = kd.get_gl_index_id();
    // NO_LINT_DEBUG
    sql_print_error("Decoding ttl from PK value failed, "
                    "for index (%u,%u), val: %s" ,
                    gl_index_id.cf_id, gl_index_id.index_id, buf.c_str());
    DBUG_ASSERT(0);
    return false;
  }

  /* Hide record if it has expired before the current snapshot time. */
  uint64 read_filter_ts = 0;
#ifndef NDEBUG
  // Debug-only hook letting tests shift the filtering clock forward.
  read_filter_ts += rdb_dbug_set_ttl_read_filter_ts();
#endif
  bool is_hide_ttl =
      ts + kd.m_ttl_duration + read_filter_ts <= static_cast<uint64>(curr_ts);
  if (is_hide_ttl) {
    update_row_stats(ROWS_FILTERED);
  }
  return is_hide_ttl;
}
5454 | |
5455 | void ha_rocksdb::rocksdb_skip_expired_records(const Rdb_key_def &kd, |
5456 | rocksdb::Iterator *const iter, |
5457 | bool seek_backward) { |
5458 | if (kd.has_ttl()) { |
5459 | while (iter->Valid() && |
5460 | should_hide_ttl_rec( |
5461 | kd, iter->value(), |
5462 | get_or_create_tx(table->in_use)->m_snapshot_timestamp)) { |
5463 | rocksdb_smart_next(seek_backward, iter); |
5464 | } |
5465 | } |
5466 | } |
5467 | |
5468 | /** |
5469 | Convert record from table->record[0] form into a form that can be written |
5470 | into rocksdb. |
5471 | |
5472 | @param pk_packed_slice Packed PK tuple. We need it in order to compute |
5473 | and store its CRC. |
5474 | @param packed_rec OUT Data slice with record data. |
5475 | */ |
5476 | |
// Pack table->record[0] into m_storage_record and expose it via *packed_rec.
// Value layout: [8-byte TTL (only if the PK has TTL)] [null-bitmap bytes]
// [unpack_info (only if m_maybe_unpack_info)] [field data for STORE_ALL
// fields] [optional checksum tag + key/value CRC32s].
// Returns HA_EXIT_SUCCESS or HA_EXIT_FAILURE (TTL decode failure).
int ha_rocksdb::convert_record_to_storage_format(
    const struct update_row_info &row_info, rocksdb::Slice *const packed_rec) {
  DBUG_ASSERT_IMP(m_maybe_unpack_info, row_info.new_pk_unpack_info);
  DBUG_ASSERT(m_pk_descr != nullptr);

  const rocksdb::Slice &pk_packed_slice = row_info.new_pk_slice;
  Rdb_string_writer *const pk_unpack_info = row_info.new_pk_unpack_info;
  bool has_ttl = m_pk_descr->has_ttl();
  bool has_ttl_column = !m_pk_descr->m_ttl_column.empty();
  bool ttl_in_pk = has_ttl_column && (row_info.ttl_pk_offset != UINT_MAX);

  // m_storage_record is reused across calls; reset it (keeps capacity).
  m_storage_record.length(0);

  if (has_ttl) {
    /* If it's a TTL record, reserve space for 8 byte TTL value in front. */
    m_storage_record.fill(ROCKSDB_SIZEOF_TTL_RECORD + m_null_bytes_in_rec, 0);
    m_ttl_bytes_updated = false;

    /*
      If the TTL is contained within the key, we use the offset to find the
      TTL value and place it in the beginning of the value record.
    */
    if (ttl_in_pk) {
      Rdb_string_reader reader(&pk_packed_slice);
      const char *ts;
      if (!reader.read(row_info.ttl_pk_offset) ||
          !(ts = reader.read(ROCKSDB_SIZEOF_TTL_RECORD))) {
        std::string buf;
        buf = rdb_hexdump(pk_packed_slice.data(), pk_packed_slice.size(),
                          RDB_MAX_HEXDUMP_LEN);
        const GL_INDEX_ID gl_index_id = m_pk_descr->get_gl_index_id();
        // NO_LINT_DEBUG
        sql_print_error("Decoding ttl from PK failed during insert, "
                        "for index (%u,%u), key: %s" ,
                        gl_index_id.cf_id, gl_index_id.index_id, buf.c_str());
        return HA_EXIT_FAILURE;
      }

      char *const data = const_cast<char *>(m_storage_record.ptr());
      memcpy(data, ts, ROCKSDB_SIZEOF_TTL_RECORD);
#ifndef NDEBUG
      // Adjust for test case if needed
      rdb_netbuf_store_uint64(
          reinterpret_cast<uchar *>(data),
          rdb_netbuf_to_uint64(reinterpret_cast<const uchar *>(data)) +
              rdb_dbug_set_ttl_rec_ts());
#endif
      // Also store in m_ttl_bytes to propagate to update_sk
      memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
    } else if (!has_ttl_column) {
      /*
        For implicitly generated TTL records we need to copy over the old
        TTL value from the old record in the event of an update. It was stored
        in m_ttl_bytes.

        Otherwise, generate a timestamp using the current time.
      */
      if (!row_info.old_pk_slice.empty()) {
        char *const data = const_cast<char *>(m_storage_record.ptr());
        memcpy(data, m_ttl_bytes, sizeof(uint64));
      } else {
        uint64 ts = static_cast<uint64>(std::time(nullptr));
#ifndef NDEBUG
        ts += rdb_dbug_set_ttl_rec_ts();
#endif
        char *const data = const_cast<char *>(m_storage_record.ptr());
        rdb_netbuf_store_uint64(reinterpret_cast<uchar *>(data), ts);
        // Also store in m_ttl_bytes to propagate to update_sk
        memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
      }
    }
  } else {
    /* All NULL bits are initially 0 */
    m_storage_record.fill(m_null_bytes_in_rec, 0);
  }

  // If a primary key may have non-empty unpack_info for certain values,
  // (m_maybe_unpack_info=TRUE), we write the unpack_info block. The block
  // itself was prepared in Rdb_key_def::pack_record.
  if (m_maybe_unpack_info) {
    m_storage_record.append(reinterpret_cast<char *>(pk_unpack_info->ptr()),
                            pk_unpack_info->get_current_pos());
  }

  for (uint i = 0; i < table->s->fields; i++) {
    /* Don't pack decodable PK key parts */
    if (m_encoder_arr[i].m_storage_type != Rdb_field_encoder::STORE_ALL) {
      continue;
    }

    Field *const field = table->field[i];
    if (m_encoder_arr[i].maybe_null()) {
      char *data = const_cast<char *>(m_storage_record.ptr());
      if (has_ttl) {
        // Null-bitmap bytes sit after the TTL prefix.
        data += ROCKSDB_SIZEOF_TTL_RECORD;
      }

      if (field->is_null()) {
        data[m_encoder_arr[i].m_null_offset] |= m_encoder_arr[i].m_null_mask;
        /* Don't write anything for NULL values */
        continue;
      }
    }

    if (m_encoder_arr[i].m_field_type == MYSQL_TYPE_BLOB) {
      my_core::Field_blob *blob = (my_core::Field_blob *)field;
      /* Get the number of bytes needed to store length*/
      const uint length_bytes = blob->pack_length() - portable_sizeof_char_ptr;

      /* Store the length of the value */
      m_storage_record.append(reinterpret_cast<char *>(blob->ptr),
                              length_bytes);

      /* Store the blob value itself */
      char *data_ptr;
      memcpy(&data_ptr, blob->ptr + length_bytes, sizeof(uchar **));
      m_storage_record.append(data_ptr, blob->get_length());
    } else if (m_encoder_arr[i].m_field_type == MYSQL_TYPE_VARCHAR) {
      Field_varstring *const field_var = (Field_varstring *)field;
      uint data_len;
      /* field_var->length_bytes is 1 or 2 */
      if (field_var->length_bytes == 1) {
        data_len = field_var->ptr[0];
      } else {
        DBUG_ASSERT(field_var->length_bytes == 2);
        data_len = uint2korr(field_var->ptr);
      }
      m_storage_record.append(reinterpret_cast<char *>(field_var->ptr),
                              field_var->length_bytes + data_len);
    } else {
      /* Copy the field data */
      const uint len = field->pack_length_in_rec();
      m_storage_record.append(reinterpret_cast<char *>(field->ptr), len);

      /*
        Check if this is the TTL field within the table, if so store the TTL
        in the front of the record as well here.
      */
      if (has_ttl && has_ttl_column &&
          i == m_pk_descr->get_ttl_field_offset()) {
        DBUG_ASSERT(len == ROCKSDB_SIZEOF_TTL_RECORD);
        DBUG_ASSERT(field->real_type() == MYSQL_TYPE_LONGLONG);
        DBUG_ASSERT(m_pk_descr->get_ttl_field_offset() != UINT_MAX);

        char *const data = const_cast<char *>(m_storage_record.ptr());
        uint64 ts = uint8korr(field->ptr);
#ifndef NDEBUG
        ts += rdb_dbug_set_ttl_rec_ts();
#endif
        rdb_netbuf_store_uint64(reinterpret_cast<uchar *>(data), ts);

        // If this is an update and the timestamp has been updated, take note
        // so we can avoid updating SKs unnecessarily.
        if (!row_info.old_pk_slice.empty()) {
          m_ttl_bytes_updated =
              memcmp(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
        }
        // Store timestamp in m_ttl_bytes to propagate to update_sk
        memcpy(m_ttl_bytes, data, ROCKSDB_SIZEOF_TTL_RECORD);
      }
    }
  }

  if (should_store_row_debug_checksums()) {
    // Append a checksum tag plus CRC32s of the packed key and of the value
    // accumulated so far, for debug-mode row verification on read.
    const uint32_t key_crc32 = my_core::crc32(
        0, rdb_slice_to_uchar_ptr(&pk_packed_slice), pk_packed_slice.size());
    const uint32_t val_crc32 =
        my_core::crc32(0, rdb_mysql_str_to_uchar_str(&m_storage_record),
                       m_storage_record.length());
    uchar key_crc_buf[RDB_CHECKSUM_SIZE];
    uchar val_crc_buf[RDB_CHECKSUM_SIZE];
    rdb_netbuf_store_uint32(key_crc_buf, key_crc32);
    rdb_netbuf_store_uint32(val_crc_buf, val_crc32);
    m_storage_record.append((const char *)&RDB_CHECKSUM_DATA_TAG, 1);
    m_storage_record.append((const char *)key_crc_buf, RDB_CHECKSUM_SIZE);
    m_storage_record.append((const char *)val_crc_buf, RDB_CHECKSUM_SIZE);
  }

  *packed_rec =
      rocksdb::Slice(m_storage_record.ptr(), m_storage_record.length());

  return HA_EXIT_SUCCESS;
}
5660 | |
5661 | /* |
5662 | @brief |
5663 | Setup which fields will be unpacked when reading rows |
5664 | |
5665 | @detail |
5666 | Three special cases when we still unpack all fields: |
5667 | - When this table is being updated (m_lock_rows==RDB_LOCK_WRITE). |
5668 | - When @@rocksdb_verify_row_debug_checksums is ON (In this mode, we need to |
5669 | read all fields to find whether there is a row checksum at the end. We could |
5670 | skip the fields instead of decoding them, but currently we do decoding.) |
5671 | - On index merge as bitmap is cleared during that operation |
5672 | |
5673 | @seealso |
5674 | ha_rocksdb::setup_field_converters() |
5675 | ha_rocksdb::convert_record_from_storage_format() |
5676 | */ |
5677 | void ha_rocksdb::setup_read_decoders() { |
5678 | m_decoders_vect.clear(); |
5679 | m_key_requested = false; |
5680 | |
5681 | int last_useful = 0; |
5682 | int skip_size = 0; |
5683 | |
5684 | for (uint i = 0; i < table->s->fields; i++) { |
5685 | // bitmap is cleared on index merge, but it still needs to decode columns |
5686 | const bool field_requested = |
5687 | m_lock_rows == RDB_LOCK_WRITE || m_verify_row_debug_checksums || |
5688 | bitmap_is_clear_all(table->read_set) || |
5689 | bitmap_is_set(table->read_set, table->field[i]->field_index); |
5690 | |
5691 | // We only need the decoder if the whole record is stored. |
5692 | if (m_encoder_arr[i].m_storage_type != Rdb_field_encoder::STORE_ALL) { |
5693 | // the field potentially needs unpacking |
5694 | if (field_requested) { |
5695 | // the field is in the read set |
5696 | m_key_requested = true; |
5697 | } |
5698 | continue; |
5699 | } |
5700 | |
5701 | if (field_requested) { |
5702 | // We will need to decode this field |
5703 | m_decoders_vect.push_back({&m_encoder_arr[i], true, skip_size}); |
5704 | last_useful = m_decoders_vect.size(); |
5705 | skip_size = 0; |
5706 | } else { |
5707 | if (m_encoder_arr[i].uses_variable_len_encoding() || |
5708 | m_encoder_arr[i].maybe_null()) { |
5709 | // For variable-length field, we need to read the data and skip it |
5710 | m_decoders_vect.push_back({&m_encoder_arr[i], false, skip_size}); |
5711 | skip_size = 0; |
5712 | } else { |
5713 | // Fixed-width field can be skipped without looking at it. |
5714 | // Add appropriate skip_size to the next field. |
5715 | skip_size += m_encoder_arr[i].m_pack_length_in_rec; |
5716 | } |
5717 | } |
5718 | } |
5719 | |
5720 | // It could be that the last few elements are varchars that just do |
5721 | // skipping. Remove them. |
5722 | m_decoders_vect.erase(m_decoders_vect.begin() + last_useful, |
5723 | m_decoders_vect.end()); |
5724 | } |
5725 | |
5726 | #ifndef NDEBUG |
5727 | void dbug_append_garbage_at_end(rocksdb::PinnableSlice *on_disk_rec) { |
5728 | std::string str(on_disk_rec->data(), on_disk_rec->size()); |
5729 | on_disk_rec->Reset(); |
5730 | str.append("abc" ); |
5731 | on_disk_rec->PinSelf(rocksdb::Slice(str)); |
5732 | } |
5733 | |
5734 | void dbug_truncate_record(rocksdb::PinnableSlice *on_disk_rec) { |
5735 | on_disk_rec->remove_suffix(on_disk_rec->size()); |
5736 | } |
5737 | |
// Debug-only fault injection: replace the record with a NULL-bitmap byte
// followed by a VARCHAR(12) value, to simulate reading data that no longer
// matches the table's declared VARCHAR(10) column.
void dbug_modify_rec_varchar12(rocksdb::PinnableSlice *on_disk_rec) {
  std::string res;
  // The record is NULL-byte followed by VARCHAR(10).
  // Put the NULL-byte
  res.append("\0" , 1);
  // Then, add a valid VARCHAR(12) value.
  // (0x0C is the one-byte length prefix; note the 12-byte append below reads
  // the 11-character literal plus its trailing '\0' as the 12th byte.)
  res.append("\xC" , 1);
  res.append("123456789ab" , 12);

  on_disk_rec->Reset();
  on_disk_rec->PinSelf(rocksdb::Slice(res));
}
5750 | |
5751 | void dbug_modify_key_varchar8(String &on_disk_rec) { |
5752 | std::string res; |
5753 | // The key starts with index number |
5754 | res.append(on_disk_rec.ptr(), Rdb_key_def::INDEX_NUMBER_SIZE); |
5755 | |
5756 | // Then, a mem-comparable form of a varchar(8) value. |
5757 | res.append("ABCDE\0\0\0\xFC" , 9); |
5758 | on_disk_rec.length(0); |
5759 | on_disk_rec.append(res.data(), res.size()); |
5760 | } |
5761 | |
// Debug-only helper: raise a generic error to simulate a failure happening
// during an inplace ALTER TABLE.
void dbug_create_err_inplace_alter() {
  my_printf_error(ER_UNKNOWN_ERROR,
                  "Intentional failure in inplace alter occurred." , MYF(0));
}
5766 | #endif |
5767 | |
/*
  Convenience overload: unpack this->m_retrieved_record (the last value read
  from RocksDB) into buf. Debug builds may first corrupt the record via the
  DBUG injection points so the corruption-detection paths can be tested.
*/
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, uchar *const buf) {

  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read1" ,
                  dbug_append_garbage_at_end(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read2" ,
                  dbug_truncate_record(&m_retrieved_record););
  DBUG_EXECUTE_IF("myrocks_simulate_bad_row_read3" ,
                  dbug_modify_rec_varchar12(&m_retrieved_record););

  return convert_record_from_storage_format(key, &m_retrieved_record, buf);
}
5780 | |
5781 | int ha_rocksdb::convert_blob_from_storage_format( |
5782 | my_core::Field_blob *const blob, |
5783 | Rdb_string_reader *const reader, |
5784 | bool decode) |
5785 | { |
5786 | /* Get the number of bytes needed to store length*/ |
5787 | const uint length_bytes = blob->pack_length() - portable_sizeof_char_ptr; |
5788 | |
5789 | const char *data_len_str; |
5790 | if (!(data_len_str = reader->read(length_bytes))) { |
5791 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5792 | } |
5793 | |
5794 | memcpy(blob->ptr, data_len_str, length_bytes); |
5795 | |
5796 | const uint32 data_len = blob->get_length( |
5797 | reinterpret_cast<const uchar*>(data_len_str), length_bytes); |
5798 | const char *blob_ptr; |
5799 | if (!(blob_ptr = reader->read(data_len))) { |
5800 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5801 | } |
5802 | |
5803 | if (decode) { |
5804 | // set 8-byte pointer to 0, like innodb does (relevant for 32-bit |
5805 | // platforms) |
5806 | memset(blob->ptr + length_bytes, 0, 8); |
5807 | memcpy(blob->ptr + length_bytes, &blob_ptr, sizeof(uchar **)); |
5808 | } |
5809 | |
5810 | return HA_EXIT_SUCCESS; |
5811 | } |
5812 | |
5813 | int ha_rocksdb::convert_varchar_from_storage_format( |
5814 | my_core::Field_varstring *const field_var, |
5815 | Rdb_string_reader *const reader, |
5816 | bool decode) |
5817 | { |
5818 | const char *data_len_str; |
5819 | if (!(data_len_str = reader->read(field_var->length_bytes))) |
5820 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5821 | |
5822 | uint data_len; |
5823 | /* field_var->length_bytes is 1 or 2 */ |
5824 | if (field_var->length_bytes == 1) { |
5825 | data_len = (uchar)data_len_str[0]; |
5826 | } else { |
5827 | DBUG_ASSERT(field_var->length_bytes == 2); |
5828 | data_len = uint2korr(data_len_str); |
5829 | } |
5830 | |
5831 | if (data_len > field_var->field_length) { |
5832 | /* The data on disk is longer than table DDL allows? */ |
5833 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5834 | } |
5835 | |
5836 | if (!reader->read(data_len)) { |
5837 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5838 | } |
5839 | |
5840 | if (decode) { |
5841 | memcpy(field_var->ptr, data_len_str, field_var->length_bytes + data_len); |
5842 | } |
5843 | |
5844 | return HA_EXIT_SUCCESS; |
5845 | } |
5846 | |
5847 | int ha_rocksdb::convert_field_from_storage_format( |
5848 | my_core::Field *const field, |
5849 | Rdb_string_reader *const reader, |
5850 | bool decode, |
5851 | uint len) |
5852 | { |
5853 | const char *data_bytes; |
5854 | if (len > 0) { |
5855 | if ((data_bytes = reader->read(len)) == nullptr) { |
5856 | return HA_ERR_ROCKSDB_CORRUPT_DATA; |
5857 | } |
5858 | |
5859 | if (decode) |
5860 | memcpy(field->ptr, data_bytes, len); |
5861 | } |
5862 | |
5863 | return HA_EXIT_SUCCESS; |
5864 | } |
5865 | |
5866 | /* |
5867 | @brief |
5868 | Unpack the record in this->m_retrieved_record and this->m_last_rowkey from |
5869 | storage format into buf (which can be table->record[0] or table->record[1]). |
5870 | |
5871 | @param key Table record's key in mem-comparable form. |
5872 | @param buf Store record in table->record[0] format here |
5873 | |
5874 | @detail |
5875 | If the table has blobs, the unpacked data in buf may keep pointers to the |
5876 | data in this->m_retrieved_record. |
5877 | |
5878 | The key is only needed to check its checksum value (the checksum is in |
5879 | m_retrieved_record). |
5880 | |
5881 | @seealso |
5882 | ha_rocksdb::setup_read_decoders() Sets up data structures which tell which |
5883 | columns to decode. |
5884 | |
5885 | @return |
5886 | 0 OK |
    other Error unpacking the data
5888 | */ |
5889 | |
int ha_rocksdb::convert_record_from_storage_format(
    const rocksdb::Slice *const key, const rocksdb::Slice *const value,
    uchar *const buf) {
  DBUG_ASSERT(key != nullptr);
  DBUG_ASSERT(buf != nullptr);

  Rdb_string_reader reader(value);

  /*
    Decode PK fields from the key
  */
  DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_read1" ,
                  dbug_modify_key_varchar8(m_last_rowkey););

  const rocksdb::Slice rowkey_slice(m_last_rowkey.ptr(),
                                    m_last_rowkey.length());
  const char *unpack_info = nullptr;
  uint16 unpack_info_len = 0;
  rocksdb::Slice unpack_slice;

  /* If it's a TTL record, skip the 8 byte TTL value */
  const char *ttl_bytes;
  if (m_pk_descr->has_ttl()) {
    if ((ttl_bytes = reader.read(ROCKSDB_SIZEOF_TTL_RECORD))) {
      // Keep a copy of the TTL timestamp; the write path compares against
      // m_ttl_bytes to decide whether secondary keys need updating.
      memcpy(m_ttl_bytes, ttl_bytes, ROCKSDB_SIZEOF_TTL_RECORD);
    } else {
      return HA_ERR_ROCKSDB_CORRUPT_DATA;
    }
  }

  /* Other fields are decoded from the value */
  const char *null_bytes = nullptr;
  if (m_null_bytes_in_rec && !(null_bytes = reader.read(m_null_bytes_in_rec))) {
    return HA_ERR_ROCKSDB_CORRUPT_DATA;
  }

  if (m_maybe_unpack_info) {
    // The value may carry unpack_info: a tag byte, a 2-byte length, then
    // the data needed to restore PK columns that lost information in their
    // mem-comparable key image.
    unpack_info = reader.get_current_ptr();
    if (!unpack_info || !Rdb_key_def::is_unpack_data_tag(unpack_info[0]) ||
        !reader.read(Rdb_key_def::get_unpack_header_size(unpack_info[0]))) {
      return HA_ERR_ROCKSDB_CORRUPT_DATA;
    }

    unpack_info_len =
        rdb_netbuf_to_uint16(reinterpret_cast<const uchar *>(unpack_info + 1));
    unpack_slice = rocksdb::Slice(unpack_info, unpack_info_len);

    // NOTE(review): the return value of this read is ignored; a corrupt
    // unpack_info_len larger than the remaining value would not be detected
    // here -- confirm whether unpack_record() below catches it.
    reader.read(unpack_info_len -
                Rdb_key_def::get_unpack_header_size(unpack_info[0]));
  }

  int err = HA_EXIT_SUCCESS;
  if (m_key_requested) {
    // At least one requested column lives (partly) in the key, so decode
    // the mem-comparable key image into buf.
    err = m_pk_descr->unpack_record(table, buf, &rowkey_slice,
                                    unpack_info ? &unpack_slice : nullptr,
                                    false /* verify_checksum */);
  }

  if (err != HA_EXIT_SUCCESS) {
    return err;
  }

  // Walk the decoder list built by setup_read_decoders(): each entry either
  // decodes a value-stored field into buf or just advances past it.
  for (auto it = m_decoders_vect.begin(); it != m_decoders_vect.end(); it++) {
    const Rdb_field_encoder *const field_dec = it->m_field_enc;
    const bool decode = it->m_decode;
    const bool isNull =
        field_dec->maybe_null() &&
        ((null_bytes[field_dec->m_null_offset] & field_dec->m_null_mask) != 0);

    Field *const field = table->field[field_dec->m_field_index];

    /* Skip the bytes we need to skip */
    if (it->m_skip && !reader.read(it->m_skip)) {
      return HA_ERR_ROCKSDB_CORRUPT_DATA;
    }

    // Field::ptr is relative to table->record[0], but buf may be
    // table->record[1]; temporarily re-point the Field into buf.
    uint field_offset = field->ptr - table->record[0];
    uint null_offset = field->null_offset();
    bool maybe_null = field->real_maybe_null();
    field->move_field(buf + field_offset,
                      maybe_null ? buf + null_offset : nullptr,
                      field->null_bit);
    // WARNING! - Don't return before restoring field->ptr and field->null_ptr!

    if (isNull) {
      if (decode) {
        /* This sets the NULL-bit of this record */
        field->set_null();
        /*
          Besides that, set the field value to default value. CHECKSUM TABLE
          depends on this.
        */
        memcpy(field->ptr, table->s->default_values + field_offset,
               field->pack_length());
      }
    } else {
      if (decode) {
        field->set_notnull();
      }

      // Even when not decoding, the converters must consume the field's
      // bytes so the reader stays positioned correctly.
      if (field_dec->m_field_type == MYSQL_TYPE_BLOB) {
        err = convert_blob_from_storage_format(
            (my_core::Field_blob *) field, &reader, decode);
      } else if (field_dec->m_field_type == MYSQL_TYPE_VARCHAR) {
        err = convert_varchar_from_storage_format(
            (my_core::Field_varstring *) field, &reader, decode);
      } else {
        err = convert_field_from_storage_format(
            field, &reader, decode, field_dec->m_pack_length_in_rec);
      }
    }

    // Restore field->ptr and field->null_ptr
    field->move_field(table->record[0] + field_offset,
                      maybe_null ? table->record[0] + null_offset : nullptr,
                      field->null_bit);

    if (err != HA_EXIT_SUCCESS) {
      return err;
    }
  }

  if (m_verify_row_debug_checksums) {
    // Optional trailing chunk: a tag byte plus CRC32s of the key and of the
    // value (excluding the checksum chunk itself).
    if (reader.remaining_bytes() == RDB_CHECKSUM_CHUNK_SIZE &&
        reader.read(1)[0] == RDB_CHECKSUM_DATA_TAG) {
      uint32_t stored_key_chksum =
          rdb_netbuf_to_uint32((const uchar *)reader.read(RDB_CHECKSUM_SIZE));
      uint32_t stored_val_chksum =
          rdb_netbuf_to_uint32((const uchar *)reader.read(RDB_CHECKSUM_SIZE));

      const uint32_t computed_key_chksum =
          my_core::crc32(0, rdb_slice_to_uchar_ptr(key), key->size());
      const uint32_t computed_val_chksum =
          my_core::crc32(0, rdb_slice_to_uchar_ptr(value),
                         value->size() - RDB_CHECKSUM_CHUNK_SIZE);

      DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_checksum1" ,
                      stored_key_chksum++;);

      if (stored_key_chksum != computed_key_chksum) {
        m_pk_descr->report_checksum_mismatch(true, key->data(), key->size());
        return HA_ERR_ROCKSDB_CHECKSUM_MISMATCH;
      }

      DBUG_EXECUTE_IF("myrocks_simulate_bad_pk_checksum2" ,
                      stored_val_chksum++;);
      if (stored_val_chksum != computed_val_chksum) {
        m_pk_descr->report_checksum_mismatch(false, value->data(),
                                             value->size());
        return HA_ERR_ROCKSDB_CHECKSUM_MISMATCH;
      }

      m_row_checksums_checked++;
    }
    // Any leftover bytes at this point mean the value was malformed.
    if (reader.remaining_bytes())
      return HA_ERR_ROCKSDB_CORRUPT_DATA;
  }

  return HA_EXIT_SUCCESS;
}
6050 | |
6051 | void ha_rocksdb::get_storage_type(Rdb_field_encoder *const encoder, |
6052 | const uint &kp) { |
6053 | // STORE_SOME uses unpack_info. |
6054 | if (m_pk_descr->has_unpack_info(kp)) { |
6055 | DBUG_ASSERT(m_pk_descr->can_unpack(kp)); |
6056 | encoder->m_storage_type = Rdb_field_encoder::STORE_SOME; |
6057 | m_maybe_unpack_info = true; |
6058 | } else if (m_pk_descr->can_unpack(kp)) { |
6059 | encoder->m_storage_type = Rdb_field_encoder::STORE_NONE; |
6060 | } |
6061 | } |
6062 | |
6063 | /* |
6064 | Setup data needed to convert table->record[] to and from record storage |
6065 | format. |
6066 | |
6067 | @seealso |
6068 | ha_rocksdb::convert_record_to_storage_format, |
6069 | ha_rocksdb::convert_record_from_storage_format |
6070 | */ |
6071 | |
6072 | void ha_rocksdb::setup_field_converters() { |
6073 | uint i; |
6074 | uint null_bytes = 0; |
6075 | uchar cur_null_mask = 0x1; |
6076 | |
6077 | DBUG_ASSERT(m_encoder_arr == nullptr); |
6078 | m_encoder_arr = static_cast<Rdb_field_encoder *>( |
6079 | my_malloc(table->s->fields * sizeof(Rdb_field_encoder), MYF(0))); |
6080 | if (m_encoder_arr == nullptr) { |
6081 | return; |
6082 | } |
6083 | |
6084 | for (i = 0; i < table->s->fields; i++) { |
6085 | Field *const field = table->field[i]; |
6086 | m_encoder_arr[i].m_storage_type = Rdb_field_encoder::STORE_ALL; |
6087 | |
6088 | /* |
6089 | Check if this field is |
6090 | - a part of primary key, and |
6091 | - it can be decoded back from its key image. |
6092 | If both hold, we don't need to store this field in the value part of |
6093 | RocksDB's key-value pair. |
6094 | |
6095 | If hidden pk exists, we skip this check since the field will never be |
6096 | part of the hidden pk. |
6097 | */ |
6098 | if (!has_hidden_pk(table)) { |
6099 | KEY *const pk_info = &table->key_info[table->s->primary_key]; |
6100 | for (uint kp = 0; kp < pk_info->user_defined_key_parts; kp++) { |
6101 | /* key_part->fieldnr is counted from 1 */ |
6102 | if (field->field_index + 1 == pk_info->key_part[kp].fieldnr) { |
6103 | get_storage_type(&m_encoder_arr[i], kp); |
6104 | break; |
6105 | } |
6106 | } |
6107 | } |
6108 | |
6109 | m_encoder_arr[i].m_field_type = field->real_type(); |
6110 | m_encoder_arr[i].m_field_index = i; |
6111 | m_encoder_arr[i].m_pack_length_in_rec = field->pack_length_in_rec(); |
6112 | |
6113 | if (field->real_maybe_null()) { |
6114 | m_encoder_arr[i].m_null_mask = cur_null_mask; |
6115 | m_encoder_arr[i].m_null_offset = null_bytes; |
6116 | if (cur_null_mask == 0x80) { |
6117 | cur_null_mask = 0x1; |
6118 | null_bytes++; |
6119 | } else |
6120 | cur_null_mask = cur_null_mask << 1; |
6121 | } else { |
6122 | m_encoder_arr[i].m_null_mask = 0; |
6123 | } |
6124 | } |
6125 | |
6126 | /* Count the last, unfinished NULL-bits byte */ |
6127 | if (cur_null_mask != 0x1) |
6128 | null_bytes++; |
6129 | |
6130 | m_null_bytes_in_rec = null_bytes; |
6131 | } |
6132 | |
6133 | int ha_rocksdb::alloc_key_buffers(const TABLE *const table_arg, |
6134 | const Rdb_tbl_def *const tbl_def_arg, |
6135 | bool alloc_alter_buffers) { |
6136 | DBUG_ENTER_FUNC(); |
6137 | |
6138 | DBUG_ASSERT(m_pk_tuple == nullptr); |
6139 | DBUG_ASSERT(tbl_def_arg != nullptr); |
6140 | |
6141 | std::shared_ptr<Rdb_key_def> *const kd_arr = tbl_def_arg->m_key_descr_arr; |
6142 | |
6143 | uint key_len = 0; |
6144 | uint max_packed_sk_len = 0; |
6145 | uint pack_key_len = 0; |
6146 | |
6147 | m_pk_descr = kd_arr[pk_index(table_arg, tbl_def_arg)]; |
6148 | if (has_hidden_pk(table_arg)) { |
6149 | m_pk_key_parts = 1; |
6150 | } else { |
6151 | m_pk_key_parts = |
6152 | table->key_info[table->s->primary_key].user_defined_key_parts; |
6153 | key_len = table->key_info[table->s->primary_key].key_length; |
6154 | } |
6155 | |
6156 | // move this into get_table_handler() ?? |
6157 | m_pk_descr->setup(table_arg, tbl_def_arg); |
6158 | |
6159 | m_pk_tuple = reinterpret_cast<uchar *>(my_malloc(key_len, MYF(0))); |
6160 | |
6161 | pack_key_len = m_pk_descr->max_storage_fmt_length(); |
6162 | m_pk_packed_tuple = |
6163 | reinterpret_cast<uchar *>(my_malloc(pack_key_len, MYF(0))); |
6164 | |
6165 | /* Sometimes, we may use m_sk_packed_tuple for storing packed PK */ |
6166 | max_packed_sk_len = pack_key_len; |
6167 | for (uint i = 0; i < table_arg->s->keys; i++) { |
6168 | if (i == table_arg->s->primary_key) /* Primary key was processed above */ |
6169 | continue; |
6170 | |
6171 | // TODO: move this into get_table_handler() ?? |
6172 | kd_arr[i]->setup(table_arg, tbl_def_arg); |
6173 | |
6174 | const uint packed_len = kd_arr[i]->max_storage_fmt_length(); |
6175 | if (packed_len > max_packed_sk_len) { |
6176 | max_packed_sk_len = packed_len; |
6177 | } |
6178 | } |
6179 | |
6180 | m_sk_packed_tuple = |
6181 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6182 | m_sk_match_prefix_buf = |
6183 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6184 | m_sk_packed_tuple_old = |
6185 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6186 | m_end_key_packed_tuple = |
6187 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6188 | m_pack_buffer = |
6189 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6190 | |
6191 | m_eq_cond_upper_bound = |
6192 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6193 | m_eq_cond_lower_bound = |
6194 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6195 | |
6196 | /* |
6197 | If inplace alter is happening, allocate special buffers for unique |
6198 | secondary index duplicate checking. |
6199 | */ |
6200 | if (alloc_alter_buffers) { |
6201 | m_dup_sk_packed_tuple = |
6202 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6203 | m_dup_sk_packed_tuple_old = |
6204 | reinterpret_cast<uchar *>(my_malloc(max_packed_sk_len, MYF(0))); |
6205 | } |
6206 | |
6207 | if (m_pk_tuple == nullptr || m_pk_packed_tuple == nullptr || |
6208 | m_sk_packed_tuple == nullptr || m_sk_packed_tuple_old == nullptr || |
6209 | m_end_key_packed_tuple == nullptr || m_pack_buffer == nullptr || |
6210 | m_eq_cond_upper_bound == nullptr || m_eq_cond_lower_bound == nullptr || |
6211 | (alloc_alter_buffers && (m_dup_sk_packed_tuple == nullptr || |
6212 | m_dup_sk_packed_tuple_old == nullptr))) { |
6213 | // One or more of the above allocations failed. Clean up and exit |
6214 | free_key_buffers(); |
6215 | |
6216 | DBUG_RETURN(HA_ERR_OUT_OF_MEM); |
6217 | } |
6218 | |
6219 | DBUG_RETURN(HA_EXIT_SUCCESS); |
6220 | } |
6221 | |
6222 | void ha_rocksdb::free_key_buffers() { |
6223 | my_free(m_pk_tuple); |
6224 | m_pk_tuple = nullptr; |
6225 | |
6226 | my_free(m_pk_packed_tuple); |
6227 | m_pk_packed_tuple = nullptr; |
6228 | |
6229 | my_free(m_sk_packed_tuple); |
6230 | m_sk_packed_tuple = nullptr; |
6231 | |
6232 | my_free(m_sk_match_prefix_buf); |
6233 | m_sk_match_prefix_buf = nullptr; |
6234 | |
6235 | my_free(m_sk_packed_tuple_old); |
6236 | m_sk_packed_tuple_old = nullptr; |
6237 | |
6238 | my_free(m_end_key_packed_tuple); |
6239 | m_end_key_packed_tuple = nullptr; |
6240 | |
6241 | my_free(m_pack_buffer); |
6242 | m_pack_buffer = nullptr; |
6243 | |
6244 | my_free(m_dup_sk_packed_tuple); |
6245 | m_dup_sk_packed_tuple = nullptr; |
6246 | |
6247 | my_free(m_dup_sk_packed_tuple_old); |
6248 | m_dup_sk_packed_tuple_old = nullptr; |
6249 | |
6250 | my_free(m_eq_cond_upper_bound); |
6251 | m_eq_cond_upper_bound = nullptr; |
6252 | |
6253 | my_free(m_eq_cond_lower_bound); |
6254 | m_eq_cond_lower_bound = nullptr; |
6255 | } |
6256 | |
6257 | #ifdef MARIAROCKS_NOT_YET |
6258 | void ha_rocksdb::set_use_read_free_rpl(const char *const whitelist) { |
6259 | DBUG_ASSERT(whitelist != nullptr); |
6260 | |
6261 | #if defined(HAVE_PSI_INTERFACE) |
6262 | Regex_list_handler regex_handler(key_rwlock_read_free_rpl_tables); |
6263 | #else |
6264 | Regex_list_handler regex_handler; |
6265 | #endif |
6266 | |
6267 | if (!regex_handler.set_patterns(whitelist)) { |
6268 | warn_about_bad_patterns(®ex_handler, "read_free_rpl_tables" ); |
6269 | } |
6270 | |
6271 | m_use_read_free_rpl = regex_handler.matches(m_tbl_def->base_tablename()); |
6272 | } |
6273 | #endif |
6274 | |
6275 | void ha_rocksdb::set_skip_unique_check_tables(const char *const whitelist) { |
6276 | DBUG_ASSERT(whitelist != nullptr); |
6277 | |
6278 | #if defined(HAVE_PSI_INTERFACE) |
6279 | Regex_list_handler regex_handler(key_rwlock_skip_unique_check_tables); |
6280 | #else |
6281 | Regex_list_handler regex_handler; |
6282 | #endif |
6283 | |
6284 | if (!regex_handler.set_patterns(whitelist)) { |
6285 | warn_about_bad_patterns(®ex_handler, "skip_unique_check_tables" ); |
6286 | } |
6287 | |
6288 | m_skip_unique_check = regex_handler.matches(m_tbl_def->base_tablename()); |
6289 | } |
6290 | |
6291 | /** |
6292 | @return |
6293 | HA_EXIT_SUCCESS OK |
6294 | other HA_ERR error code (can be SE-specific) |
6295 | */ |
int ha_rocksdb::open(const char *const name, int mode, uint test_if_locked) {
  DBUG_ENTER_FUNC();

  // Release any state left over from a previous open of this handler.
  int err = close();
  if (err) {
    DBUG_RETURN(err);
  }

  m_table_handler = rdb_open_tables.get_table_handler(name);

  if (m_table_handler == nullptr) {
    DBUG_RETURN(HA_ERR_OUT_OF_MEM);
  }

  my_core::thr_lock_data_init(&m_table_handler->m_thr_lock, &m_db_lock,
                              nullptr);
  m_io_perf.init(&m_table_handler->m_table_perf_context,
                 &m_table_handler->m_io_perf_read,
                 &m_table_handler->m_io_perf_write, &stats);
  Rdb_perf_context_guard guard(&m_io_perf,
                               rocksdb_perf_context_level(ha_thd()));

  std::string fullname;
  err = rdb_normalize_tablename(name, &fullname);
  if (err != HA_EXIT_SUCCESS) {
    DBUG_RETURN(err);
  }

  // Look up the MyRocks-side table definition in the data dictionary.
  m_tbl_def = ddl_manager.find(fullname);
  if (m_tbl_def == nullptr) {
    my_error(ER_INTERNAL_ERROR, MYF(0),
             "Attempt to open a table that is not present in RocksDB-SE data "
             "dictionary" );
    DBUG_RETURN(HA_ERR_ROCKSDB_INVALID_TABLE);
  }
  m_lock_rows = RDB_LOCK_NONE;

  m_key_descr_arr = m_tbl_def->m_key_descr_arr;

  /*
    Full table scan actually uses primary key
    (UPDATE needs to know this, otherwise it will go into infinite loop on
    queries like "UPDATE tbl SET pk=pk+100")
  */
  key_used_on_scan = table->s->primary_key;

  // close() above has already called free_key_buffers(). No need to do it here.
  err = alloc_key_buffers(table, m_tbl_def);

  if (err) {
    DBUG_RETURN(err);
  }

  /*
    init_with_fields() is used to initialize table flags based on the field
    definitions in table->field[].
    It is called by open_binary_frm(), but that function calls the method for
    a temporary ha_rocksdb object which is later destroyed.

    If we are here in ::open(), then init_with_fields() has not been called
    for this object. Call it ourselves, we want all member variables to be
    properly initialized.
  */
  init_with_fields();

  setup_field_converters();

  /*
    MariaDB: adjust field->part_of_key for PK columns. We can only do it here
    because SE API is just relying on the HA_PRIMARY_KEY_IN_READ_INDEX which
    does not allow to distinguish between unpack'able and non-unpack'able
    columns.
    Upstream uses handler->init_with_fields() but we don't have that call.
  */
  {
    if (!has_hidden_pk(table)) {
      KEY *const pk_info = &table->key_info[table->s->primary_key];
      for (uint kp = 0; kp < pk_info->user_defined_key_parts; kp++) {
        if (!m_pk_descr->can_unpack(kp)) {
          // A PK column that cannot be restored from its key image only
          // "covers" the primary key itself.
          uint field_index= pk_info->key_part[kp].field->field_index;
          table->field[field_index]->part_of_key.clear_all();
          table->field[field_index]->part_of_key.set_bit(table->s->primary_key);
        }
      }
    }

    // For secondary keys, a column is part_of_key only if it can be
    // unpacked from that key's image.
    for (uint key= 0; key < table->s->keys; key++) {
      KEY *const key_info = &table->key_info[key];
      if (key == table->s->primary_key)
        continue;
      for (uint kp = 0; kp < key_info->usable_key_parts; kp++) {
        uint field_index= key_info->key_part[kp].field->field_index;
        if (m_key_descr_arr[key]->can_unpack(kp)) {
          table->field[field_index]->part_of_key.set_bit(key);
        } else {
          table->field[field_index]->part_of_key.clear_bit(key);
        }
      }
    }
  }

  info(HA_STATUS_NO_LOCK | HA_STATUS_VARIABLE | HA_STATUS_CONST);

  /*
    The following load_XXX code calls row decode functions, and they do
    that without having done ::external_lock() or index_init()/rnd_init().
    (Note: this also means we're doing a read when there was no
    setup_field_converters() call)

    Initialize the necessary variables for them:
  */
  m_verify_row_debug_checksums = false;

  /* Load auto_increment value only once on first use. */
  if (table->found_next_number_field && m_tbl_def->m_auto_incr_val == 0) {
    load_auto_incr_value();
  }

  /* Load hidden pk only once on first use. */
  if (has_hidden_pk(table) && m_tbl_def->m_hidden_pk_val == 0 &&
      (err = load_hidden_pk_value()) != HA_EXIT_SUCCESS) {
    free_key_buffers();
    DBUG_RETURN(err);
  }

  /* Index block size in MyRocks: used by MySQL in query optimization */
  stats.block_size = rocksdb_tbl_options->block_size;

#ifdef MARIAROCKS_NOT_YET // MDEV-10976
  /* Determine at open whether we can use Read Free Replication or not */
  set_use_read_free_rpl(THDVAR(ha_thd(), read_free_rpl_tables));
#endif

  /* Determine at open whether we should skip unique checks for this table */
  set_skip_unique_check_tables(THDVAR(ha_thd(), skip_unique_check_tables));

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
6435 | |
int ha_rocksdb::close(void) {
  DBUG_ENTER_FUNC();

  // Key descriptors are owned elsewhere (via m_tbl_def); just drop the refs.
  m_pk_descr = nullptr;
  m_key_descr_arr = nullptr;

  free_key_buffers();

  my_free(m_encoder_arr);
  m_encoder_arr = nullptr;

  if (m_table_handler != nullptr) {
    rdb_open_tables.release_table_handler(m_table_handler);
    m_table_handler = nullptr;
  }

  // These are needed to suppress valgrind errors in rocksdb.partition
  m_storage_record.free();
  m_last_rowkey.free();
  m_sk_tails.free();
  m_sk_tails_old.free();
  m_pk_unpack_info.free();

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
6461 | |
/*
  Messages for the storage-engine-specific handler error codes. The order
  must match the HA_ERR_ROCKSDB_* values: an entry is looked up as
  rdb_error_messages[error - HA_ERR_ROCKSDB_FIRST] (see get_error_message()
  below), and the static_assert enforces that the count matches the range.
*/
static const char *rdb_error_messages[] = {
    "Table must have a PRIMARY KEY." ,
    "Specifying DATA DIRECTORY for an individual table is not supported." ,
    "Specifying INDEX DIRECTORY for an individual table is not supported." ,
    "RocksDB commit failed." ,
    "Failure during bulk load operation." ,
    "Found data corruption." ,
    "CRC checksum mismatch." ,
    "Invalid table." ,
    "Could not access RocksDB properties." ,
    "File I/O error during merge/sort operation." ,
    "RocksDB status: not found." ,
    "RocksDB status: corruption." ,
    "RocksDB status: invalid argument." ,
    "RocksDB status: io error." ,
    "RocksDB status: no space." ,
    "RocksDB status: merge in progress." ,
    "RocksDB status: incomplete." ,
    "RocksDB status: shutdown in progress." ,
    "RocksDB status: timed out." ,
    "RocksDB status: aborted." ,
    "RocksDB status: lock limit reached." ,
    "RocksDB status: busy." ,
    "RocksDB status: deadlock." ,
    "RocksDB status: expired." ,
    "RocksDB status: try again." ,
};

static_assert((sizeof(rdb_error_messages) / sizeof(rdb_error_messages[0])) ==
                  ((HA_ERR_ROCKSDB_LAST - HA_ERR_ROCKSDB_FIRST) + 1),
              "Number of error messages doesn't match number of error codes" );
6493 | |
//psergey-merge: do we need this in MariaDB: we have get_error_messages
//below...
#if 0
static const char *rdb_get_error_message(int nr) {
  return rdb_error_messages[nr - HA_ERR_ROCKSDB_FIRST];
}
#endif

// Returns the whole message table; the 'nr' argument is unused here
// (presumably the caller indexes into the returned array itself -- this
// matches the handlerton get_error_messages callback shape).
static const char **rdb_get_error_messages(int nr) { return rdb_error_messages; }
6503 | |
/**
  Produce a human-readable message for a handler error code.

  @param error  handler error code; may also be a generic value < HA_ERR_FIRST
  @param buf    output String the message is appended to
  @return true  for lock-wait-timeout/deadlock errors, where the detailed
                per-transaction message is reported; false otherwise
*/
bool ha_rocksdb::get_error_message(const int error, String *const buf) {
  DBUG_ENTER_FUNC();

  static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST,
                "HA_ERR_ROCKSDB_LAST > HA_ERR_FIRST" );
  static_assert(HA_ERR_ROCKSDB_LAST > HA_ERR_LAST,
                "HA_ERR_ROCKSDB_LAST > HA_ERR_LAST" );

  DBUG_ASSERT(buf != nullptr);

  if (error == HA_ERR_LOCK_WAIT_TIMEOUT || error == HA_ERR_LOCK_DEADLOCK) {
    /* Lock errors carry extra detail saved on the current transaction. */
    Rdb_transaction *const tx = get_tx_from_thd(ha_thd());
    DBUG_ASSERT(tx != nullptr);
    buf->append(tx->m_detailed_error);
    DBUG_RETURN(true);
  }

  /* MyRocks-specific codes map directly into the static message table. */
  if (error >= HA_ERR_ROCKSDB_FIRST && error <= HA_ERR_ROCKSDB_LAST) {
    buf->append(rdb_error_messages[error - HA_ERR_ROCKSDB_FIRST]);
  }

  // We can be called with the values which are < HA_ERR_FIRST because most
  // MySQL internal functions will just return HA_EXIT_FAILURE in case of
  // an error.

  DBUG_RETURN(false);
}
6531 | |
6532 | /* |
6533 | Generalized way to convert RocksDB status errors into MySQL error code, and |
6534 | print error message. |
6535 | |
6536 | Each error code below maps to a RocksDB status code found in: |
6537 | rocksdb/include/rocksdb/status.h |
6538 | */ |
6539 | int ha_rocksdb::rdb_error_to_mysql(const rocksdb::Status &s, |
6540 | const char *opt_msg) { |
6541 | DBUG_ASSERT(!s.ok()); |
6542 | |
6543 | int err; |
6544 | switch (s.code()) { |
6545 | case rocksdb::Status::Code::kOk: |
6546 | err = HA_EXIT_SUCCESS; |
6547 | break; |
6548 | case rocksdb::Status::Code::kNotFound: |
6549 | err = HA_ERR_ROCKSDB_STATUS_NOT_FOUND; |
6550 | break; |
6551 | case rocksdb::Status::Code::kCorruption: |
6552 | err = HA_ERR_ROCKSDB_STATUS_CORRUPTION; |
6553 | break; |
6554 | case rocksdb::Status::Code::kNotSupported: |
6555 | err = HA_ERR_ROCKSDB_STATUS_NOT_SUPPORTED; |
6556 | break; |
6557 | case rocksdb::Status::Code::kInvalidArgument: |
6558 | err = HA_ERR_ROCKSDB_STATUS_INVALID_ARGUMENT; |
6559 | break; |
6560 | case rocksdb::Status::Code::kIOError: |
6561 | err = (s.IsNoSpace()) ? HA_ERR_ROCKSDB_STATUS_NO_SPACE |
6562 | : HA_ERR_ROCKSDB_STATUS_IO_ERROR; |
6563 | break; |
6564 | case rocksdb::Status::Code::kMergeInProgress: |
6565 | err = HA_ERR_ROCKSDB_STATUS_MERGE_IN_PROGRESS; |
6566 | break; |
6567 | case rocksdb::Status::Code::kIncomplete: |
6568 | err = HA_ERR_ROCKSDB_STATUS_INCOMPLETE; |
6569 | break; |
6570 | case rocksdb::Status::Code::kShutdownInProgress: |
6571 | err = HA_ERR_ROCKSDB_STATUS_SHUTDOWN_IN_PROGRESS; |
6572 | break; |
6573 | case rocksdb::Status::Code::kTimedOut: |
6574 | err = HA_ERR_ROCKSDB_STATUS_TIMED_OUT; |
6575 | break; |
6576 | case rocksdb::Status::Code::kAborted: |
6577 | err = (s.IsLockLimit()) ? HA_ERR_ROCKSDB_STATUS_LOCK_LIMIT |
6578 | : HA_ERR_ROCKSDB_STATUS_ABORTED; |
6579 | break; |
6580 | case rocksdb::Status::Code::kBusy: |
6581 | err = (s.IsDeadlock()) ? HA_ERR_ROCKSDB_STATUS_DEADLOCK |
6582 | : HA_ERR_ROCKSDB_STATUS_BUSY; |
6583 | break; |
6584 | case rocksdb::Status::Code::kExpired: |
6585 | err = HA_ERR_ROCKSDB_STATUS_EXPIRED; |
6586 | break; |
6587 | case rocksdb::Status::Code::kTryAgain: |
6588 | err = HA_ERR_ROCKSDB_STATUS_TRY_AGAIN; |
6589 | break; |
6590 | default: |
6591 | DBUG_ASSERT(0); |
6592 | return -1; |
6593 | } |
6594 | |
6595 | if (opt_msg) { |
6596 | my_error(ER_RDB_STATUS_MSG, MYF(0), opt_msg, s.code(), |
6597 | s.ToString().c_str()); |
6598 | } else { |
6599 | my_error(ER_RDB_STATUS_GENERAL, MYF(0), s.code(), s.ToString().c_str()); |
6600 | } |
6601 | |
6602 | return err; |
6603 | } |
6604 | |
/* MyRocks supports only the following collations for indexed columns */
/* Consulted by rdb_is_index_collation_supported() and
   rdb_field_uses_nopad_collation() below. */
static const std::set<uint> RDB_INDEX_COLLATIONS = {
    COLLATION_BINARY, COLLATION_UTF8_BIN, COLLATION_LATIN1_BIN};
6608 | |
6609 | static bool |
6610 | rdb_is_index_collation_supported(const my_core::Field *const field) { |
6611 | const my_core::enum_field_types type = field->real_type(); |
6612 | /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */ |
6613 | if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING || |
6614 | type == MYSQL_TYPE_BLOB) { |
6615 | |
6616 | return (RDB_INDEX_COLLATIONS.find(field->charset()->number) != |
6617 | RDB_INDEX_COLLATIONS.end()) || |
6618 | rdb_is_collation_supported(field->charset()); |
6619 | } |
6620 | return true; |
6621 | } |
6622 | |
6623 | |
6624 | static bool |
6625 | rdb_field_uses_nopad_collation(const my_core::Field *const field) { |
6626 | const my_core::enum_field_types type = field->real_type(); |
6627 | /* Handle [VAR](CHAR|BINARY) or TEXT|BLOB */ |
6628 | if (type == MYSQL_TYPE_VARCHAR || type == MYSQL_TYPE_STRING || |
6629 | type == MYSQL_TYPE_BLOB) { |
6630 | |
6631 | /* |
6632 | This is technically a NOPAD collation but it's a binary collation |
6633 | that we can handle. |
6634 | */ |
6635 | if (RDB_INDEX_COLLATIONS.find(field->charset()->number) != |
6636 | RDB_INDEX_COLLATIONS.end()) |
6637 | return false; |
6638 | |
6639 | return (field->charset()->state & MY_CS_NOPAD); |
6640 | } |
6641 | return false; |
6642 | } |
6643 | |
6644 | |
6645 | /* |
6646 | Create structures needed for storing data in rocksdb. This is called when the |
6647 | table is created. The structures will be shared by all TABLE* objects. |
6648 | |
6649 | @param |
6650 | table_arg Table with definition |
6651 | db_table "dbname.tablename" |
6652 | len strlen of the above |
6653 | tbl_def_arg tbl_def whose key_descr is being created/populated |
6654 | old_tbl_def_arg tbl_def from which keys are being copied over from |
6655 | (for use during inplace alter) |
6656 | |
6657 | @return |
6658 | 0 - Ok |
6659 | other - error, either given table ddl is not supported by rocksdb or OOM. |
6660 | */ |
6661 | int ha_rocksdb::create_key_defs( |
6662 | const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg, |
6663 | const TABLE *const old_table_arg /* = nullptr */, |
6664 | const Rdb_tbl_def *const old_tbl_def_arg |
6665 | /* = nullptr */) const { |
6666 | DBUG_ENTER_FUNC(); |
6667 | |
6668 | DBUG_ASSERT(table_arg != nullptr); |
6669 | DBUG_ASSERT(table_arg->s != nullptr); |
6670 | |
6671 | uint i; |
6672 | |
6673 | /* |
6674 | These need to be one greater than MAX_INDEXES since the user can create |
6675 | MAX_INDEXES secondary keys and no primary key which would cause us |
6676 | to generate a hidden one. |
6677 | */ |
6678 | std::array<key_def_cf_info, MAX_INDEXES + 1> cfs; |
6679 | |
6680 | /* |
6681 | NOTE: All new column families must be created before new index numbers are |
6682 | allocated to each key definition. See below for more details. |
6683 | http://github.com/MySQLOnRocksDB/mysql-5.6/issues/86#issuecomment-138515501 |
6684 | */ |
6685 | if (create_cfs(table_arg, tbl_def_arg, &cfs)) { |
6686 | DBUG_RETURN(HA_EXIT_FAILURE); |
6687 | } |
6688 | |
6689 | if (!old_tbl_def_arg) { |
6690 | /* |
6691 | old_tbl_def doesn't exist. this means we are in the process of creating |
6692 | a new table. |
6693 | |
6694 | Get the index numbers (this will update the next_index_number) |
6695 | and create Rdb_key_def structures. |
6696 | */ |
6697 | for (i = 0; i < tbl_def_arg->m_key_count; i++) { |
6698 | if (create_key_def(table_arg, i, tbl_def_arg, &m_key_descr_arr[i], |
6699 | cfs[i])) { |
6700 | DBUG_RETURN(HA_EXIT_FAILURE); |
6701 | } |
6702 | } |
6703 | } else { |
6704 | /* |
6705 | old_tbl_def exists. This means we are creating a new tbl_def as part of |
6706 | in-place alter table. Copy over existing keys from the old_tbl_def and |
6707 | generate the necessary new key definitions if any. |
6708 | */ |
6709 | if (create_inplace_key_defs(table_arg, tbl_def_arg, old_table_arg, |
6710 | old_tbl_def_arg, cfs)) { |
6711 | DBUG_RETURN(HA_EXIT_FAILURE); |
6712 | } |
6713 | } |
6714 | |
6715 | DBUG_RETURN(HA_EXIT_SUCCESS); |
6716 | } |
6717 | |
6718 | /* |
6719 | Checks index parameters and creates column families needed for storing data |
6720 | in rocksdb if necessary. |
6721 | |
6722 | @param in |
6723 | table_arg Table with definition |
6724 | db_table Table name |
6725 | tbl_def_arg Table def structure being populated |
6726 | |
6727 | @param out |
6728 | cfs CF info for each key definition in 'key_info' order |
6729 | |
6730 | @return |
6731 | 0 - Ok |
6732 | other - error |
6733 | */ |
6734 | int ha_rocksdb::create_cfs( |
6735 | const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg, |
6736 | std::array<struct key_def_cf_info, MAX_INDEXES + 1> *const cfs) const { |
6737 | DBUG_ENTER_FUNC(); |
6738 | |
6739 | DBUG_ASSERT(table_arg != nullptr); |
6740 | DBUG_ASSERT(table_arg->s != nullptr); |
6741 | DBUG_ASSERT(tbl_def_arg != nullptr); |
6742 | |
6743 | char tablename_sys[NAME_LEN + 1]; |
6744 | bool tsys_set= false; |
6745 | |
6746 | /* |
6747 | The first loop checks the index parameters and creates |
6748 | column families if necessary. |
6749 | */ |
6750 | for (uint i = 0; i < tbl_def_arg->m_key_count; i++) { |
6751 | rocksdb::ColumnFamilyHandle *cf_handle; |
6752 | |
6753 | if (!is_hidden_pk(i, table_arg, tbl_def_arg) && |
6754 | tbl_def_arg->base_tablename().find(tmp_file_prefix) != 0) { |
6755 | if (!tsys_set) |
6756 | { |
6757 | tsys_set= true; |
6758 | my_core::filename_to_tablename(tbl_def_arg->base_tablename().c_str(), |
6759 | tablename_sys, sizeof(tablename_sys)); |
6760 | } |
6761 | |
6762 | for (uint part = 0; part < table_arg->key_info[i].ext_key_parts; |
6763 | part++) |
6764 | { |
6765 | /* MariaDB: disallow NOPAD collations */ |
6766 | if (rdb_field_uses_nopad_collation( |
6767 | table_arg->key_info[i].key_part[part].field)) |
6768 | { |
6769 | my_error(ER_MYROCKS_CANT_NOPAD_COLLATION, MYF(0)); |
6770 | DBUG_RETURN(HA_EXIT_FAILURE); |
6771 | } |
6772 | |
6773 | if (rocksdb_strict_collation_check && |
6774 | !rdb_is_index_collation_supported( |
6775 | table_arg->key_info[i].key_part[part].field) && |
6776 | !rdb_collation_exceptions->matches(tablename_sys)) { |
6777 | |
6778 | char buf[1024]; |
6779 | my_snprintf(buf, sizeof(buf), |
6780 | "Indexed column %s.%s uses a collation that does not " |
6781 | "allow index-only access in secondary key and has " |
6782 | "reduced disk space efficiency in primary key." , |
6783 | tbl_def_arg->full_tablename().c_str(), |
6784 | table_arg->key_info[i].key_part[part].field->field_name.str); |
6785 | |
6786 | my_error(ER_INTERNAL_ERROR, MYF(ME_JUST_WARNING), buf); |
6787 | } |
6788 | } |
6789 | } |
6790 | |
6791 | // Internal consistency check to make sure that data in TABLE and |
6792 | // Rdb_tbl_def structures matches. Either both are missing or both are |
6793 | // specified. Yes, this is critical enough to make it into SHIP_ASSERT. |
6794 | SHIP_ASSERT(IF_PARTITIONING(!table_arg->part_info,true) == tbl_def_arg->base_partition().empty()); |
6795 | |
6796 | // Generate the name for the column family to use. |
6797 | bool per_part_match_found = false; |
6798 | std::string cf_name = generate_cf_name(i, table_arg, tbl_def_arg, |
6799 | &per_part_match_found); |
6800 | |
6801 | // Prevent create from using the system column family. |
6802 | if (cf_name == DEFAULT_SYSTEM_CF_NAME) { |
6803 | my_error(ER_WRONG_ARGUMENTS, MYF(0), |
6804 | "column family not valid for storing index data." ); |
6805 | DBUG_RETURN(HA_EXIT_FAILURE); |
6806 | } |
6807 | |
6808 | // Here's how `get_or_create_cf` will use the input parameters: |
6809 | // |
6810 | // `cf_name` - will be used as a CF name. |
6811 | cf_handle = cf_manager.get_or_create_cf(rdb, cf_name); |
6812 | |
6813 | if (!cf_handle) { |
6814 | DBUG_RETURN(HA_EXIT_FAILURE); |
6815 | } |
6816 | |
6817 | auto &cf = (*cfs)[i]; |
6818 | |
6819 | cf.cf_handle = cf_handle; |
6820 | cf.is_reverse_cf = Rdb_cf_manager::is_cf_name_reverse(cf_name.c_str()); |
6821 | cf.is_per_partition_cf = per_part_match_found; |
6822 | } |
6823 | |
6824 | DBUG_RETURN(HA_EXIT_SUCCESS); |
6825 | } |
6826 | |
6827 | /* |
6828 | Create key definition needed for storing data in rocksdb during ADD index |
6829 | inplace operations. |
6830 | |
6831 | @param in |
6832 | table_arg Table with definition |
6833 | tbl_def_arg New table def structure being populated |
6834 | old_tbl_def_arg Old(current) table def structure |
6835 | cfs Struct array which contains column family information |
6836 | |
6837 | @return |
6838 | 0 - Ok |
6839 | other - error, either given table ddl is not supported by rocksdb or OOM. |
6840 | */ |
6841 | int ha_rocksdb::create_inplace_key_defs( |
6842 | const TABLE *const table_arg, Rdb_tbl_def *const tbl_def_arg, |
6843 | const TABLE *const old_table_arg, const Rdb_tbl_def *const old_tbl_def_arg, |
6844 | const std::array<key_def_cf_info, MAX_INDEXES + 1> &cfs) const { |
6845 | DBUG_ENTER_FUNC(); |
6846 | |
6847 | DBUG_ASSERT(table_arg != nullptr); |
6848 | DBUG_ASSERT(tbl_def_arg != nullptr); |
6849 | DBUG_ASSERT(old_tbl_def_arg != nullptr); |
6850 | |
6851 | std::shared_ptr<Rdb_key_def> *const old_key_descr = |
6852 | old_tbl_def_arg->m_key_descr_arr; |
6853 | std::shared_ptr<Rdb_key_def> *const new_key_descr = |
6854 | tbl_def_arg->m_key_descr_arr; |
6855 | const std::unordered_map<std::string, uint> old_key_pos = |
6856 | get_old_key_positions(table_arg, tbl_def_arg, old_table_arg, |
6857 | old_tbl_def_arg); |
6858 | |
6859 | uint i; |
6860 | for (i = 0; i < tbl_def_arg->m_key_count; i++) { |
6861 | const auto &it = old_key_pos.find(get_key_name(i, table_arg, tbl_def_arg)); |
6862 | |
6863 | if (it != old_key_pos.end()) { |
6864 | /* |
6865 | Found matching index in old table definition, so copy it over to the |
6866 | new one created. |
6867 | */ |
6868 | const Rdb_key_def &okd = *old_key_descr[it->second]; |
6869 | |
6870 | const GL_INDEX_ID gl_index_id = okd.get_gl_index_id(); |
6871 | struct Rdb_index_info index_info; |
6872 | if (!dict_manager.get_index_info(gl_index_id, &index_info)) { |
6873 | // NO_LINT_DEBUG |
6874 | sql_print_error("RocksDB: Could not get index information " |
6875 | "for Index Number (%u,%u), table %s" , |
6876 | gl_index_id.cf_id, gl_index_id.index_id, |
6877 | old_tbl_def_arg->full_tablename().c_str()); |
6878 | DBUG_RETURN(HA_EXIT_FAILURE); |
6879 | } |
6880 | |
6881 | uint32 ttl_rec_offset = |
6882 | Rdb_key_def::has_index_flag(index_info.m_index_flags, |
6883 | Rdb_key_def::TTL_FLAG) |
6884 | ? Rdb_key_def::calculate_index_flag_offset( |
6885 | index_info.m_index_flags, Rdb_key_def::TTL_FLAG) |
6886 | : UINT_MAX; |
6887 | |
6888 | /* |
6889 | We can't use the copy constructor because we need to update the |
6890 | keynr within the pack_info for each field and the keyno of the keydef |
6891 | itself. |
6892 | */ |
6893 | new_key_descr[i] = std::make_shared<Rdb_key_def>( |
6894 | okd.get_index_number(), i, okd.get_cf(), |
6895 | index_info.m_index_dict_version, index_info.m_index_type, |
6896 | index_info.m_kv_version, okd.m_is_reverse_cf, |
6897 | okd.m_is_per_partition_cf, okd.m_name.c_str(), |
6898 | dict_manager.get_stats(gl_index_id), index_info.m_index_flags, |
6899 | ttl_rec_offset, index_info.m_ttl_duration); |
6900 | } else if (create_key_def(table_arg, i, tbl_def_arg, &new_key_descr[i], |
6901 | cfs[i])) { |
6902 | DBUG_RETURN(HA_EXIT_FAILURE); |
6903 | } |
6904 | |
6905 | DBUG_ASSERT(new_key_descr[i] != nullptr); |
6906 | new_key_descr[i]->setup(table_arg, tbl_def_arg); |
6907 | } |
6908 | |
6909 | DBUG_RETURN(HA_EXIT_SUCCESS); |
6910 | } |
6911 | |
/*
  Build a map of key name -> position in the OLD table definition, containing
  only those old keys that can be carried over unchanged into the new
  definition: hidden PKs, and keys whose name, flags (modulo a
  unique -> non-unique change) and key parts all match the new table.
*/
std::unordered_map<std::string, uint> ha_rocksdb::get_old_key_positions(
    const TABLE *const table_arg, const Rdb_tbl_def *const tbl_def_arg,
    const TABLE *const old_table_arg,
    const Rdb_tbl_def *const old_tbl_def_arg) const {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(table_arg != nullptr);
  DBUG_ASSERT(old_table_arg != nullptr);
  DBUG_ASSERT(tbl_def_arg != nullptr);
  DBUG_ASSERT(old_tbl_def_arg != nullptr);

  std::shared_ptr<Rdb_key_def> *const old_key_descr =
      old_tbl_def_arg->m_key_descr_arr;
  std::unordered_map<std::string, uint> old_key_pos;
  std::unordered_map<std::string, uint> new_key_pos;
  uint i;

  /* Index the new table's key names by position for the lookups below. */
  for (i = 0; i < tbl_def_arg->m_key_count; i++) {
    new_key_pos[get_key_name(i, table_arg, tbl_def_arg)] = i;
  }

  for (i = 0; i < old_tbl_def_arg->m_key_count; i++) {
    if (is_hidden_pk(i, old_table_arg, old_tbl_def_arg)) {
      /* A hidden PK is always carried over as-is. */
      old_key_pos[old_key_descr[i]->m_name] = i;
      continue;
    }

    /*
      In case of matching key name, need to check key parts of keys as well,
      in case a simultaneous drop + add is performed, where the key name is the
      same but the key parts are different.

      Example:
      CREATE TABLE t1 (a INT, b INT, KEY ka(a)) ENGINE=RocksDB;
      ALTER TABLE t1 DROP INDEX ka, ADD INDEX ka(b), ALGORITHM=INPLACE;
    */
    const KEY *const old_key = &old_table_arg->key_info[i];
    const auto &it = new_key_pos.find(old_key->name.str);
    if (it == new_key_pos.end()) {
      continue;
    }

    KEY *const new_key = &table_arg->key_info[it->second];

    /*
      Check that the key is identical between old and new tables.
      If not, we still need to create a new index.

      The exception is if there is an index changed from unique to non-unique,
      in these cases we don't need to rebuild as they are stored the same way in
      RocksDB.
    */
    bool unique_to_non_unique =
        ((old_key->flags ^ new_key->flags) == HA_NOSAME) &&
        (old_key->flags & HA_NOSAME);

    if (compare_keys(old_key, new_key) && !unique_to_non_unique) {
      continue;
    }

    /* Check to make sure key parts match. */
    if (compare_key_parts(old_key, new_key)) {
      continue;
    }

    old_key_pos[old_key->name.str] = i;
  }

  DBUG_RETURN(old_key_pos);
}
6982 | |
6983 | /* Check to see if two keys are identical. */ |
6984 | int ha_rocksdb::compare_keys(const KEY *const old_key, |
6985 | const KEY *const new_key) const { |
6986 | DBUG_ENTER_FUNC(); |
6987 | |
6988 | DBUG_ASSERT(old_key != nullptr); |
6989 | DBUG_ASSERT(new_key != nullptr); |
6990 | |
6991 | /* Check index name. */ |
6992 | if (strcmp(old_key->name.str, new_key->name.str) != 0) { |
6993 | DBUG_RETURN(HA_EXIT_FAILURE); |
6994 | } |
6995 | |
6996 | /* If index algorithms are different then keys are different. */ |
6997 | if (old_key->algorithm != new_key->algorithm) { |
6998 | DBUG_RETURN(HA_EXIT_FAILURE); |
6999 | } |
7000 | |
7001 | /* Check that the key is identical between old and new tables. */ |
7002 | if ((old_key->flags ^ new_key->flags) & HA_KEYFLAG_MASK) { |
7003 | DBUG_RETURN(HA_EXIT_FAILURE); |
7004 | } |
7005 | |
7006 | /* Check index comment. (for column family changes) */ |
7007 | std::string (old_key->comment.str, old_key->comment.length); |
7008 | std::string (new_key->comment.str, new_key->comment.length); |
7009 | if (old_comment.compare(new_comment) != 0) { |
7010 | DBUG_RETURN(HA_EXIT_FAILURE); |
7011 | } |
7012 | |
7013 | DBUG_RETURN(HA_EXIT_SUCCESS); |
7014 | } |
7015 | |
7016 | /* Check two keys to ensure that key parts within keys match */ |
7017 | int ha_rocksdb::compare_key_parts(const KEY *const old_key, |
7018 | const KEY *const new_key) const { |
7019 | DBUG_ENTER_FUNC(); |
7020 | |
7021 | DBUG_ASSERT(old_key != nullptr); |
7022 | DBUG_ASSERT(new_key != nullptr); |
7023 | |
7024 | /* Skip if key parts do not match, as it is a different key */ |
7025 | if (new_key->user_defined_key_parts != old_key->user_defined_key_parts) { |
7026 | DBUG_RETURN(HA_EXIT_FAILURE); |
7027 | } |
7028 | |
7029 | /* Check to see that key parts themselves match */ |
7030 | for (uint i = 0; i < old_key->user_defined_key_parts; i++) { |
7031 | if (strcmp(old_key->key_part[i].field->field_name.str, |
7032 | new_key->key_part[i].field->field_name.str) != 0) { |
7033 | DBUG_RETURN(HA_EXIT_FAILURE); |
7034 | } |
7035 | |
7036 | /* Check if prefix index key part length has changed */ |
7037 | if (old_key->key_part[i].length != new_key->key_part[i].length) { |
7038 | DBUG_RETURN(HA_EXIT_FAILURE); |
7039 | } |
7040 | } |
7041 | |
7042 | DBUG_RETURN(HA_EXIT_SUCCESS); |
7043 | } |
7044 | |
7045 | /* |
7046 | Create key definition needed for storing data in rocksdb. |
7047 | This can be called either during CREATE table or doing ADD index operations. |
7048 | |
7049 | @param in |
7050 | table_arg Table with definition |
7051 | i Position of index being created inside table_arg->key_info |
7052 | tbl_def_arg Table def structure being populated |
7053 | cf_info Struct which contains column family information |
7054 | |
7055 | @param out |
7056 | new_key_def Newly created index definition. |
7057 | |
7058 | @return |
7059 | 0 - Ok |
7060 | other - error, either given table ddl is not supported by rocksdb or OOM. |
7061 | */ |
7062 | int ha_rocksdb::create_key_def(const TABLE *const table_arg, const uint &i, |
7063 | const Rdb_tbl_def *const tbl_def_arg, |
7064 | std::shared_ptr<Rdb_key_def> *const new_key_def, |
7065 | const struct key_def_cf_info &cf_info) const { |
7066 | DBUG_ENTER_FUNC(); |
7067 | |
7068 | DBUG_ASSERT(new_key_def != nullptr); |
7069 | DBUG_ASSERT(*new_key_def == nullptr); |
7070 | |
7071 | uint64 ttl_duration = 0; |
7072 | std::string ttl_column; |
7073 | uint ttl_field_offset; |
7074 | |
7075 | uint err; |
7076 | if ((err = Rdb_key_def::extract_ttl_duration(table_arg, tbl_def_arg, |
7077 | &ttl_duration))) { |
7078 | DBUG_RETURN(err); |
7079 | } |
7080 | |
7081 | if ((err = Rdb_key_def::extract_ttl_col(table_arg, tbl_def_arg, &ttl_column, |
7082 | &ttl_field_offset))) { |
7083 | DBUG_RETURN(err); |
7084 | } |
7085 | |
7086 | /* We don't currently support TTL on tables with hidden primary keys. */ |
7087 | if (ttl_duration > 0 && is_hidden_pk(i, table_arg, tbl_def_arg)) { |
7088 | my_error(ER_RDB_TTL_UNSUPPORTED, MYF(0)); |
7089 | DBUG_RETURN(HA_EXIT_FAILURE); |
7090 | } |
7091 | |
7092 | /* |
7093 | If TTL duration is not specified but TTL column was specified, throw an |
7094 | error because TTL column requires duration. |
7095 | */ |
7096 | if (ttl_duration == 0 && !ttl_column.empty()) { |
7097 | my_error(ER_RDB_TTL_COL_FORMAT, MYF(0), ttl_column.c_str()); |
7098 | DBUG_RETURN(HA_EXIT_FAILURE); |
7099 | } |
7100 | |
7101 | const uint index_id = ddl_manager.get_and_update_next_number(&dict_manager); |
7102 | const uint16_t index_dict_version = Rdb_key_def::INDEX_INFO_VERSION_LATEST; |
7103 | uchar index_type; |
7104 | uint16_t kv_version; |
7105 | |
7106 | if (is_hidden_pk(i, table_arg, tbl_def_arg)) { |
7107 | index_type = Rdb_key_def::INDEX_TYPE_HIDDEN_PRIMARY; |
7108 | kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST; |
7109 | } else if (i == table_arg->s->primary_key) { |
7110 | index_type = Rdb_key_def::INDEX_TYPE_PRIMARY; |
7111 | uint16 pk_latest_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_LATEST; |
7112 | kv_version = pk_latest_version; |
7113 | } else { |
7114 | index_type = Rdb_key_def::INDEX_TYPE_SECONDARY; |
7115 | uint16 sk_latest_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_LATEST; |
7116 | kv_version = sk_latest_version; |
7117 | } |
7118 | |
7119 | // Use PRIMARY_FORMAT_VERSION_UPDATE1 here since it is the same value as |
7120 | // SECONDARY_FORMAT_VERSION_UPDATE1 so it doesn't matter if this is a |
7121 | // primary key or secondary key. |
7122 | DBUG_EXECUTE_IF("MYROCKS_LEGACY_VARBINARY_FORMAT" , { |
7123 | kv_version = Rdb_key_def::PRIMARY_FORMAT_VERSION_UPDATE1; |
7124 | }); |
7125 | |
7126 | DBUG_EXECUTE_IF("MYROCKS_NO_COVERED_BITMAP_FORMAT" , { |
7127 | if (index_type == Rdb_key_def::INDEX_TYPE_SECONDARY) { |
7128 | kv_version = Rdb_key_def::SECONDARY_FORMAT_VERSION_UPDATE2; |
7129 | } |
7130 | }); |
7131 | |
7132 | uint32 index_flags = (ttl_duration > 0 ? Rdb_key_def::TTL_FLAG : 0); |
7133 | |
7134 | uint32 ttl_rec_offset = |
7135 | Rdb_key_def::has_index_flag(index_flags, Rdb_key_def::TTL_FLAG) |
7136 | ? Rdb_key_def::calculate_index_flag_offset(index_flags, |
7137 | Rdb_key_def::TTL_FLAG) |
7138 | : UINT_MAX; |
7139 | |
7140 | const char *const key_name = get_key_name(i, table_arg, m_tbl_def); |
7141 | *new_key_def = std::make_shared<Rdb_key_def>( |
7142 | index_id, i, cf_info.cf_handle, index_dict_version, index_type, |
7143 | kv_version, cf_info.is_reverse_cf, cf_info.is_per_partition_cf, key_name, |
7144 | Rdb_index_stats(), index_flags, ttl_rec_offset, ttl_duration); |
7145 | |
7146 | if (!ttl_column.empty()) { |
7147 | (*new_key_def)->m_ttl_column = ttl_column; |
7148 | } |
7149 | |
7150 | DBUG_RETURN(HA_EXIT_SUCCESS); |
7151 | } |
7152 | |
7153 | int rdb_normalize_tablename(const std::string &tablename, |
7154 | std::string *const strbuf) { |
7155 | DBUG_ASSERT(strbuf != nullptr); |
7156 | |
7157 | if (tablename.size() < 2 || tablename[0] != '.' || |
7158 | (tablename[1] != FN_LIBCHAR && tablename[1] != FN_LIBCHAR2)) { |
7159 | DBUG_ASSERT(0); // We were not passed table name? |
7160 | return HA_ERR_ROCKSDB_INVALID_TABLE; |
7161 | } |
7162 | |
7163 | size_t pos = tablename.find_first_of(FN_LIBCHAR, 2); |
7164 | if (pos == std::string::npos) { |
7165 | pos = tablename.find_first_of(FN_LIBCHAR2, 2); |
7166 | } |
7167 | |
7168 | if (pos == std::string::npos) { |
7169 | DBUG_ASSERT(0); // We were not passed table name? |
7170 | return HA_ERR_ROCKSDB_INVALID_TABLE; |
7171 | } |
7172 | |
7173 | *strbuf = tablename.substr(2, pos - 2) + "." + tablename.substr(pos + 1); |
7174 | |
7175 | return HA_EXIT_SUCCESS; |
7176 | } |
7177 | |
7178 | /* |
7179 | Check to see if the user's original statement includes foreign key |
7180 | references |
7181 | */ |
7182 | bool ha_rocksdb::contains_foreign_key(THD *const thd) { |
7183 | bool success; |
7184 | const char *str = thd_query_string(thd)->str; |
7185 | |
7186 | DBUG_ASSERT(str != nullptr); |
7187 | |
7188 | while (*str != '\0') { |
7189 | // Scan from our current pos looking for 'FOREIGN' |
7190 | str = rdb_find_in_string(str, "FOREIGN" , &success); |
7191 | if (!success) { |
7192 | return false; |
7193 | } |
7194 | |
7195 | // Skip past the found "FOREIGN' |
7196 | str = rdb_check_next_token(&my_charset_bin, str, "FOREIGN" , &success); |
7197 | DBUG_ASSERT(success); |
7198 | |
7199 | if (!my_isspace(&my_charset_bin, *str)) { |
7200 | return false; |
7201 | } |
7202 | |
7203 | // See if the next token is 'KEY' |
7204 | str = rdb_check_next_token(&my_charset_bin, str, "KEY" , &success); |
7205 | if (!success) { |
7206 | continue; |
7207 | } |
7208 | |
7209 | // See if the next token is '(' |
7210 | str = rdb_check_next_token(&my_charset_bin, str, "(" , &success); |
7211 | if (!success) { |
7212 | // There is an optional index id after 'FOREIGN KEY', skip it |
7213 | str = rdb_skip_id(&my_charset_bin, str); |
7214 | |
7215 | // Now check for '(' again |
7216 | str = rdb_check_next_token(&my_charset_bin, str, "(" , &success); |
7217 | } |
7218 | |
7219 | // If we have found 'FOREIGN KEY [<word>] (' we can be confident we have |
7220 | // a foreign key clause. |
7221 | return success; |
7222 | } |
7223 | |
7224 | // We never found a valid foreign key clause |
7225 | return false; |
7226 | } |
7227 | |
7228 | /** |
7229 | @brief |
7230 | splits the normalized table name of <dbname>.<tablename>#P#<part_no> into |
7231 | the <dbname>, <tablename> and <part_no> components. |
7232 | |
7233 | @param dbbuf returns database name/table_schema |
7234 | @param tablebuf returns tablename |
7235 | @param partitionbuf returns partition suffix if there is one |
7236 | @return HA_EXIT_SUCCESS on success, non-zero on failure to split |
7237 | */ |
7238 | int rdb_split_normalized_tablename(const std::string &fullname, |
7239 | std::string *const db, |
7240 | std::string *const table, |
7241 | std::string *const partition) { |
7242 | DBUG_ASSERT(!fullname.empty()); |
7243 | |
7244 | #define RDB_PARTITION_STR "#P#" |
7245 | |
7246 | /* Normalize returns dbname.tablename. */ |
7247 | size_t dotpos = fullname.find('.'); |
7248 | |
7249 | /* Invalid table name? */ |
7250 | if (dotpos == std::string::npos) { |
7251 | return HA_ERR_ROCKSDB_INVALID_TABLE; |
7252 | } |
7253 | |
7254 | // Table must have a database name associated with it. |
7255 | DBUG_ASSERT(dotpos > 0); |
7256 | |
7257 | if (db != nullptr) { |
7258 | *db = fullname.substr(0, dotpos); |
7259 | } |
7260 | |
7261 | dotpos++; |
7262 | |
7263 | const size_t partpos = |
7264 | fullname.find(RDB_PARTITION_STR, dotpos, strlen(RDB_PARTITION_STR)); |
7265 | |
7266 | if (partpos != std::string::npos) { |
7267 | DBUG_ASSERT(partpos >= dotpos); |
7268 | |
7269 | if (table != nullptr) { |
7270 | *table = fullname.substr(dotpos, partpos - dotpos); |
7271 | } |
7272 | |
7273 | if (partition != nullptr) { |
7274 | *partition = fullname.substr(partpos + strlen(RDB_PARTITION_STR)); |
7275 | } |
7276 | } else if (table != nullptr) { |
7277 | *table = fullname.substr(dotpos); |
7278 | } |
7279 | |
7280 | return HA_EXIT_SUCCESS; |
7281 | } |
7282 | |
7283 | /** |
7284 | @brief |
7285 | create() is called to create a database. The variable name will have the name |
7286 | of the table. |
7287 | |
7288 | @details |
7289 | When create() is called you do not need to worry about |
7290 | opening the table. Also, the .frm file will have already been |
7291 | created so adjusting create_info is not necessary. You can overwrite |
7292 | the .frm file at this point if you wish to change the table |
7293 | definition, but there are no methods currently provided for doing |
7294 | so. |
7295 | |
7296 | Called from handle.cc by ha_create_table(). |
7297 | |
7298 | @return |
7299 | HA_EXIT_SUCCESS OK |
7300 | other HA_ERR error code (can be SE-specific) |
7301 | |
7302 | @see |
7303 | ha_create_table() in handle.cc |
7304 | */ |
7305 | |
7306 | int ha_rocksdb::create(const char *const name, TABLE *const table_arg, |
7307 | HA_CREATE_INFO *const create_info) { |
7308 | DBUG_ENTER_FUNC(); |
7309 | |
7310 | DBUG_ASSERT(table_arg != nullptr); |
7311 | DBUG_ASSERT(create_info != nullptr); |
7312 | |
7313 | if (create_info->data_file_name) { |
7314 | // DATA DIRECTORY is used to create tables under a specific location |
7315 | // outside the MySQL data directory. We don't support this for MyRocks. |
7316 | // The `rocksdb_datadir` setting should be used to configure RocksDB data |
7317 | // directory. |
7318 | print_error(HA_ERR_ROCKSDB_TABLE_DATA_DIRECTORY_NOT_SUPPORTED, |
7319 | MYF(ME_JUST_WARNING)); |
7320 | DBUG_RETURN(HA_WRONG_CREATE_OPTION); |
7321 | } |
7322 | |
7323 | if (create_info->index_file_name) { |
7324 | // Similar check for INDEX DIRECTORY as well. |
7325 | print_error(HA_ERR_ROCKSDB_TABLE_INDEX_DIRECTORY_NOT_SUPPORTED, |
7326 | MYF(ME_JUST_WARNING)); |
7327 | DBUG_RETURN(HA_WRONG_CREATE_OPTION); |
7328 | } |
7329 | |
7330 | std::string str; |
7331 | int err; |
7332 | |
7333 | THD *const thd = my_core::thd_get_current_thd(); |
7334 | |
7335 | if (get_table_if_exists(name)) { |
7336 | if (thd->lex->sql_command == SQLCOM_TRUNCATE) { |
7337 | err = delete_table(name); |
7338 | if (err != HA_EXIT_SUCCESS) { |
7339 | DBUG_RETURN(err); |
7340 | } |
7341 | } else { |
7342 | err = rdb_normalize_tablename(name, &str); |
7343 | if (err != HA_EXIT_SUCCESS) { |
7344 | DBUG_RETURN(err); |
7345 | } |
7346 | my_error(ER_METADATA_INCONSISTENCY, MYF(0), str.c_str(), name); |
7347 | DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA); |
7348 | } |
7349 | } |
7350 | |
7351 | /* |
7352 | Construct dbname.tablename ourselves, because parititioning |
7353 | passes strings like "./test/t14#P#p0" for individual partitions, |
7354 | while table_arg->s->table_name has none of that. |
7355 | */ |
7356 | err = rdb_normalize_tablename(name, &str); |
7357 | if (err != HA_EXIT_SUCCESS) { |
7358 | DBUG_RETURN(err); |
7359 | } |
7360 | |
7361 | if (contains_foreign_key(thd)) { |
7362 | my_error(ER_NOT_SUPPORTED_YET, MYF(0), |
7363 | "FOREIGN KEY for the RocksDB storage engine" ); |
7364 | DBUG_RETURN(HA_ERR_UNSUPPORTED); |
7365 | } |
7366 | |
7367 | const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin(); |
7368 | rocksdb::WriteBatch *const batch = wb.get(); |
7369 | |
7370 | /* Create table/key descriptions and put them into the data dictionary */ |
7371 | m_tbl_def = new Rdb_tbl_def(str); |
7372 | |
7373 | uint n_keys = table_arg->s->keys; |
7374 | |
7375 | /* |
7376 | If no primary key found, create a hidden PK and place it inside table |
7377 | definition |
7378 | */ |
7379 | if (has_hidden_pk(table_arg)) { |
7380 | n_keys += 1; |
7381 | } |
7382 | |
7383 | m_key_descr_arr = new std::shared_ptr<Rdb_key_def>[n_keys]; |
7384 | m_tbl_def->m_key_count = n_keys; |
7385 | m_tbl_def->m_key_descr_arr = m_key_descr_arr; |
7386 | |
7387 | err = create_key_defs(table_arg, m_tbl_def); |
7388 | if (err != HA_EXIT_SUCCESS) { |
7389 | goto error; |
7390 | } |
7391 | |
7392 | m_pk_descr = m_key_descr_arr[pk_index(table_arg, m_tbl_def)]; |
7393 | |
7394 | if (create_info->auto_increment_value) { |
7395 | bool autoinc_upgrade_test = false; |
7396 | m_tbl_def->m_auto_incr_val = create_info->auto_increment_value; |
7397 | DBUG_EXECUTE_IF("myrocks_autoinc_upgrade" , autoinc_upgrade_test = true;); |
7398 | if (!autoinc_upgrade_test) { |
7399 | auto s = dict_manager.put_auto_incr_val( |
7400 | batch, m_tbl_def->get_autoincr_gl_index_id(), |
7401 | m_tbl_def->m_auto_incr_val); |
7402 | if (!s.ok()) { |
7403 | goto error; |
7404 | } |
7405 | } |
7406 | } |
7407 | |
7408 | dict_manager.lock(); |
7409 | err = ddl_manager.put_and_write(m_tbl_def, batch); |
7410 | if (err != HA_EXIT_SUCCESS) { |
7411 | dict_manager.unlock(); |
7412 | goto error; |
7413 | } |
7414 | |
7415 | err = dict_manager.commit(batch); |
7416 | if (err != HA_EXIT_SUCCESS) { |
7417 | dict_manager.unlock(); |
7418 | goto error; |
7419 | } |
7420 | |
7421 | dict_manager.unlock(); |
7422 | |
7423 | DBUG_RETURN(HA_EXIT_SUCCESS); |
7424 | |
7425 | error: |
7426 | /* Delete what we have allocated so far */ |
7427 | delete m_tbl_def; |
7428 | m_tbl_def = nullptr; |
7429 | m_key_descr_arr = nullptr; |
7430 | |
7431 | DBUG_RETURN(err); |
7432 | } |
7433 | |
7434 | /** |
7435 | @note |
7436 | This function is used only when the table has not yet been opened, and |
7437 | keyread_allowed bitmap doesn't have the correct values yet. |
7438 | |
7439 | See comment in ha_rocksdb::index_flags() for details. |
7440 | */ |
7441 | |
7442 | bool ha_rocksdb::check_keyread_allowed(uint inx, uint part, |
7443 | bool all_parts) const { |
7444 | bool res = true; |
7445 | KEY *const key_info = &table_share->key_info[inx]; |
7446 | |
7447 | Rdb_field_packing dummy1; |
7448 | res = dummy1.setup(nullptr, key_info->key_part[part].field, inx, part, |
7449 | key_info->key_part[part].length); |
7450 | |
7451 | if (res && all_parts) { |
7452 | for (uint i = 0; i < part; i++) { |
7453 | Field *field; |
7454 | if ((field = key_info->key_part[i].field)) { |
7455 | Rdb_field_packing dummy; |
7456 | if (!dummy.setup(nullptr, field, inx, i, |
7457 | key_info->key_part[i].length)) { |
7458 | /* Cannot do index-only reads for this column */ |
7459 | res = false; |
7460 | break; |
7461 | } |
7462 | } |
7463 | } |
7464 | } |
7465 | |
7466 | const uint pk = table_share->primary_key; |
7467 | if (inx == pk && all_parts && |
7468 | part + 1 == table_share->key_info[pk].user_defined_key_parts) { |
7469 | m_pk_can_be_decoded = res; |
7470 | } |
7471 | |
7472 | return res; |
7473 | } |
7474 | |
7475 | int ha_rocksdb::read_key_exact(const Rdb_key_def &kd, |
7476 | rocksdb::Iterator *const iter, |
7477 | const bool &full_key_match, |
7478 | const rocksdb::Slice &key_slice, |
7479 | const int64_t ttl_filter_ts) { |
7480 | DBUG_ASSERT(iter != nullptr); |
7481 | |
7482 | /* |
7483 | We are looking for the first record such that |
7484 | index_tuple= lookup_tuple. |
7485 | lookup_tuple may be a prefix of the index. |
7486 | */ |
7487 | rocksdb_smart_seek(kd.m_is_reverse_cf, iter, key_slice); |
7488 | |
7489 | while (iter->Valid() && kd.value_matches_prefix(iter->key(), key_slice)) { |
7490 | /* |
7491 | If TTL is enabled we need to check if the given key has already expired |
7492 | from the POV of the current transaction. If it has, try going to the next |
7493 | key. |
7494 | */ |
7495 | if (kd.has_ttl() && should_hide_ttl_rec(kd, iter->value(), ttl_filter_ts)) { |
7496 | rocksdb_smart_next(kd.m_is_reverse_cf, iter); |
7497 | continue; |
7498 | } |
7499 | |
7500 | return HA_EXIT_SUCCESS; |
7501 | } |
7502 | |
7503 | /* |
7504 | Got a record that is not equal to the lookup value, or even a record |
7505 | from another table.index. |
7506 | */ |
7507 | return HA_ERR_KEY_NOT_FOUND; |
7508 | } |
7509 | |
7510 | int ha_rocksdb::read_before_key(const Rdb_key_def &kd, |
7511 | const bool &full_key_match, |
7512 | const rocksdb::Slice &key_slice, |
7513 | const int64_t ttl_filter_ts) { |
7514 | /* |
7515 | We are looking for record with the biggest t.key such that |
7516 | t.key < lookup_tuple. |
7517 | */ |
7518 | rocksdb_smart_seek(!kd.m_is_reverse_cf, m_scan_it, key_slice); |
7519 | |
7520 | while (is_valid(m_scan_it)) { |
7521 | /* |
7522 | We are using full key and we've hit an exact match, or... |
7523 | |
7524 | If TTL is enabled we need to check if the given key has already expired |
7525 | from the POV of the current transaction. If it has, try going to the next |
7526 | key. |
7527 | */ |
7528 | if ((full_key_match && |
7529 | kd.value_matches_prefix(m_scan_it->key(), key_slice)) || |
7530 | (kd.has_ttl() && |
7531 | should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts))) { |
7532 | rocksdb_smart_next(!kd.m_is_reverse_cf, m_scan_it); |
7533 | continue; |
7534 | } |
7535 | |
7536 | return HA_EXIT_SUCCESS; |
7537 | } |
7538 | |
7539 | return HA_ERR_KEY_NOT_FOUND; |
7540 | } |
7541 | |
7542 | int ha_rocksdb::read_after_key(const Rdb_key_def &kd, |
7543 | const rocksdb::Slice &key_slice, |
7544 | const int64_t ttl_filter_ts) { |
7545 | /* |
7546 | We are looking for the first record such that |
7547 | |
7548 | index_tuple $GT lookup_tuple |
7549 | |
7550 | with HA_READ_AFTER_KEY, $GT = '>', |
7551 | with HA_READ_KEY_OR_NEXT, $GT = '>=' |
7552 | */ |
7553 | rocksdb_smart_seek(kd.m_is_reverse_cf, m_scan_it, key_slice); |
7554 | |
7555 | /* |
7556 | If TTL is enabled we need to check if the given key has already expired |
7557 | from the POV of the current transaction. If it has, try going to the next |
7558 | key. |
7559 | */ |
7560 | while (is_valid(m_scan_it) && kd.has_ttl() && |
7561 | should_hide_ttl_rec(kd, m_scan_it->value(), ttl_filter_ts)) { |
7562 | rocksdb_smart_next(kd.m_is_reverse_cf, m_scan_it); |
7563 | } |
7564 | |
7565 | return is_valid(m_scan_it) ? HA_EXIT_SUCCESS : HA_ERR_KEY_NOT_FOUND; |
7566 | } |
7567 | |
/*
  Position m_scan_it according to the SQL-layer search mode.

  @param kd             Definition of the index being scanned
  @param find_flag      SQL-layer search mode (exact / before / after / ...)
  @param full_key_match TRUE <=> key_slice covers the whole key
  @param key            Lookup key in mysql format (used for PREFIX_LAST)
  @param keypart_map    Bitmap of the key parts present in 'key'
  @param key_slice      Packed (storage-format) lookup key
  @param move_forward   OUT: TRUE <=> subsequent reads move forward
  @param ttl_filter_ts  Timestamp for filtering out TTL-expired records

  @return HA_EXIT_SUCCESS, HA_ERR_KEY_NOT_FOUND, HA_ERR_UNSUPPORTED
*/
int ha_rocksdb::position_to_correct_key(
    const Rdb_key_def &kd, const enum ha_rkey_function &find_flag,
    const bool &full_key_match, const uchar *const key,
    const key_part_map &keypart_map, const rocksdb::Slice &key_slice,
    bool *const move_forward, const int64_t ttl_filter_ts) {
  int rc = 0;

  *move_forward = true;

  switch (find_flag) {
  case HA_READ_KEY_EXACT:
    rc =
        read_key_exact(kd, m_scan_it, full_key_match, key_slice, ttl_filter_ts);
    break;
  case HA_READ_BEFORE_KEY:
    *move_forward = false;
    rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
    if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
      /* The record we've got is not from this index */
      rc = HA_ERR_KEY_NOT_FOUND;
    }
    break;
  case HA_READ_AFTER_KEY:
  case HA_READ_KEY_OR_NEXT:
    rc = read_after_key(kd, key_slice, ttl_filter_ts);
    if (rc == 0 && !kd.covers_key(m_scan_it->key())) {
      /* The record we've got is not from this index */
      rc = HA_ERR_KEY_NOT_FOUND;
    }
    break;
  case HA_READ_KEY_OR_PREV:
  case HA_READ_PREFIX:
    /* This flag is not used by the SQL layer, so we don't support it yet. */
    rc = HA_ERR_UNSUPPORTED;
    break;
  case HA_READ_PREFIX_LAST:
  case HA_READ_PREFIX_LAST_OR_PREV:
    *move_forward = false;
    /*
      Find the last record with the specified index prefix lookup.
      - HA_READ_PREFIX_LAST requires that the record has the
        prefix=lookup (if there are no such records,
        HA_ERR_KEY_NOT_FOUND should be returned).
      - HA_READ_PREFIX_LAST_OR_PREV has no such requirement. If there are no
        records with prefix=lookup, we should return the last record
        before that.
    */
    rc = read_before_key(kd, full_key_match, key_slice, ttl_filter_ts);
    if (rc == 0) {
      const rocksdb::Slice &rkey = m_scan_it->key();
      if (!kd.covers_key(rkey)) {
        /* The record we've got is not from this index */
        rc = HA_ERR_KEY_NOT_FOUND;
      } else if (find_flag == HA_READ_PREFIX_LAST) {
        /* Re-pack the original lookup tuple (the caller mutated
           m_sk_packed_tuple via kd.successor() before the seek). */
        uint size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                        key, keypart_map);
        rocksdb::Slice lookup_tuple(reinterpret_cast<char *>(m_sk_packed_tuple),
                                    size);

        // We need to compare the key we've got with the original search prefix.
        if (!kd.value_matches_prefix(rkey, lookup_tuple)) {
          rc = HA_ERR_KEY_NOT_FOUND;
        }
      }
    }
    break;
  default:
    DBUG_ASSERT(0);
    break;
  }

  return rc;
}
7641 | |
7642 | int ha_rocksdb::calc_eq_cond_len(const Rdb_key_def &kd, |
7643 | const enum ha_rkey_function &find_flag, |
7644 | const rocksdb::Slice &slice, |
7645 | const int &bytes_changed_by_succ, |
7646 | const key_range *const end_key, |
7647 | uint *const end_key_packed_size) { |
7648 | if (find_flag == HA_READ_KEY_EXACT) |
7649 | return slice.size(); |
7650 | |
7651 | if (find_flag == HA_READ_PREFIX_LAST) { |
7652 | /* |
7653 | We have made the kd.successor(m_sk_packed_tuple) call above. |
7654 | |
7655 | The slice is at least Rdb_key_def::INDEX_NUMBER_SIZE bytes long. |
7656 | */ |
7657 | return slice.size() - bytes_changed_by_succ; |
7658 | } |
7659 | |
7660 | if (end_key) { |
7661 | *end_key_packed_size = |
7662 | kd.pack_index_tuple(table, m_pack_buffer, m_end_key_packed_tuple, |
7663 | end_key->key, end_key->keypart_map); |
7664 | |
7665 | /* |
7666 | Calculating length of the equal conditions here. 4 byte index id is |
7667 | included. |
7668 | Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3) |
7669 | WHERE id1=1 AND id2=1 AND id3>=2 => eq_cond_len= 4+8+4= 16 |
7670 | WHERE id1=1 AND id2>=1 AND id3>=2 => eq_cond_len= 4+8= 12 |
7671 | Example2: id1 VARCHAR(30), id2 INT, PRIMARY KEY (id1, id2) |
7672 | WHERE id1 = 'AAA' and id2 < 3; => eq_cond_len=13 (varchar used 9 bytes) |
7673 | */ |
7674 | rocksdb::Slice end_slice(reinterpret_cast<char *>(m_end_key_packed_tuple), |
7675 | *end_key_packed_size); |
7676 | return slice.difference_offset(end_slice); |
7677 | } |
7678 | |
7679 | /* |
7680 | On range scan without any end key condition, there is no |
7681 | eq cond, and eq cond length is the same as index_id size (4 bytes). |
7682 | Example1: id1 BIGINT, id2 INT, id3 BIGINT, PRIMARY KEY (id1, id2, id3) |
7683 | WHERE id1>=1 AND id2 >= 2 and id2 <= 5 => eq_cond_len= 4 |
7684 | */ |
7685 | return Rdb_key_def::INDEX_NUMBER_SIZE; |
7686 | } |
7687 | |
7688 | int ha_rocksdb::read_row_from_primary_key(uchar *const buf) { |
7689 | DBUG_ASSERT(buf != nullptr); |
7690 | |
7691 | int rc; |
7692 | const rocksdb::Slice &rkey = m_scan_it->key(); |
7693 | const uint pk_size = rkey.size(); |
7694 | const char *pk_data = rkey.data(); |
7695 | |
7696 | memcpy(m_pk_packed_tuple, pk_data, pk_size); |
7697 | m_last_rowkey.copy(pk_data, pk_size, &my_charset_bin); |
7698 | |
7699 | if (m_lock_rows != RDB_LOCK_NONE) { |
7700 | /* We need to put a lock and re-read */ |
7701 | rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size); |
7702 | } else { |
7703 | /* Unpack from the row we've read */ |
7704 | const rocksdb::Slice &value = m_scan_it->value(); |
7705 | rc = convert_record_from_storage_format(&rkey, &value, buf); |
7706 | } |
7707 | |
7708 | return rc; |
7709 | } |
7710 | |
/*
  Read the table row corresponding to the secondary-index entry that
  m_scan_it is positioned on.

  @param buf          OUT: the row, in table->record[0] format
  @param kd           The secondary index being scanned
  @param move_forward TRUE <=> the scan moves forward when skipping records

  If the index covers the lookup (or keyread-only mode is set) and no row
  locking is needed, the row is unpacked directly from the index entry;
  otherwise the primary key is extracted and the row fetched through it.

  @return HA_EXIT_SUCCESS, HA_ERR_ROCKSDB_CORRUPT_DATA, or other HA_ERR code
*/
int ha_rocksdb::read_row_from_secondary_key(uchar *const buf,
                                            const Rdb_key_def &kd,
                                            bool move_forward) {
  DBUG_ASSERT(buf != nullptr);

  int rc = 0;
  uint pk_size; // assigned on every path that leaves rc == 0

  /* Get the key columns and primary key value */
  const rocksdb::Slice &rkey = m_scan_it->key();
  const rocksdb::Slice &value = m_scan_it->value();

  /* Can we satisfy the read from the index entry alone? */
  bool covered_lookup =
      m_keyread_only || kd.covers_lookup(table, &value, &m_lookup_bitmap);
  if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
    pk_size =
        kd.get_primary_key_tuple(table, *m_pk_descr, &rkey, m_pk_packed_tuple);
    if (pk_size == RDB_INVALID_KEY_LEN) {
      rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
    } else {
      rc = kd.unpack_record(table, buf, &rkey, &value,
                            m_verify_row_debug_checksums);
      global_stats.covered_secondary_key_lookups.inc();
    }
  } else {
    /* In a reverse column family the scan direction is inverted. */
    if (kd.m_is_reverse_cf)
      move_forward = !move_forward;

    /* Advance to the first index record satisfying the pushed ICP, if any. */
    rc = find_icp_matching_index_rec(move_forward, buf);
    if (!rc) {
      const rocksdb::Slice &rkey = m_scan_it->key();
      pk_size = kd.get_primary_key_tuple(table, *m_pk_descr, &rkey,
                                         m_pk_packed_tuple);
      if (pk_size == RDB_INVALID_KEY_LEN) {
        rc = HA_ERR_ROCKSDB_CORRUPT_DATA;
      } else {
        /* Not covered: fetch the full row through the primary key. */
        rc = get_row_by_rowid(buf, m_pk_packed_tuple, pk_size);
      }
    }
  }

  if (!rc) {
    /* Remember the rowkey of the record we are returning. */
    m_last_rowkey.copy((const char *)m_pk_packed_tuple, pk_size,
                       &my_charset_bin);
  }

  return rc;
}
7759 | |
7760 | /** |
7761 | @note |
7762 | The problem with this function is that SQL layer calls it, when |
7763 | - the table has not been yet opened (no ::open() call done) |
7764 | - this->table_share already exists, but it is in the process of being |
7765 | filled, so some of fields are still NULL. |
7766 | - In particular, table_share->key_info[inx].key_part[] is filled only up |
7767 | to part #part. Subsequent key parts are not yet filled. |
7768 | |
7769 | To complicate things further, SQL layer will call index_flags() with |
7770 | all_parts=TRUE. Essentially, we're asked to provide flags for reading |
7771 | keyparts whose datatype is not yet known. |
7772 | |
  We work around this problem by using check_keyread_allowed(), which uses
  the table_share object and is careful not to step on uninitialized data.
7775 | |
7776 | When we get a call with all_parts=TRUE, we try to analyze all parts but |
7777 | ignore those that have key_part->field==nullptr (these are not initialized |
7778 | yet). |
7779 | */ |
7780 | |
7781 | ulong ha_rocksdb::index_flags(uint inx, uint part, bool all_parts) const { |
7782 | DBUG_ENTER_FUNC(); |
7783 | |
7784 | ulong base_flags = HA_READ_NEXT | // doesn't seem to be used |
7785 | HA_READ_ORDER | HA_READ_RANGE | HA_READ_PREV; |
7786 | |
7787 | if (check_keyread_allowed(inx, part, all_parts)) |
7788 | base_flags |= HA_KEYREAD_ONLY; |
7789 | |
7790 | if (inx == table_share->primary_key) { |
7791 | /* |
7792 | Index-only reads on primary key are the same as table scan for us. Still, |
7793 | we need to explicitly "allow" them, otherwise SQL layer will miss some |
7794 | plans. |
7795 | */ |
7796 | base_flags |= HA_KEYREAD_ONLY | HA_CLUSTERED_INDEX; |
7797 | } else { |
7798 | /* |
7799 | We can Index Condition Pushdown any key except the primary. With primary |
7800 | key, we get (pk, record) pair immediately, there is no place to put the |
7801 | ICP check. |
7802 | */ |
7803 | base_flags |= HA_DO_INDEX_COND_PUSHDOWN; |
7804 | } |
7805 | |
7806 | DBUG_RETURN(base_flags); |
7807 | } |
7808 | |
7809 | /** |
7810 | @brief |
7811 | Read next index tuple through the secondary index. |
7812 | |
7813 | @details |
7814 | m_scan_it points at the index key-value pair that we should read the (pk,row) |
7815 | pair for. |
7816 | */ |
int ha_rocksdb::secondary_index_read(const int keyno, uchar *const buf) {
  DBUG_ASSERT(buf != nullptr);
  DBUG_ASSERT(table != nullptr);
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  /* Use STATUS_NOT_FOUND when record not found or some error occurred */
  table->status = STATUS_NOT_FOUND;

  if (is_valid(m_scan_it)) {
    rocksdb::Slice key = m_scan_it->key();

    /* Check if we've run out of records of this index */
    if (m_key_descr_arr[keyno]->covers_key(key)) {
      int rc = 0;

      // TODO: We could check here if we have run out of the range we scan
      /* Extract the primary key from the secondary-index entry. */
      const uint size = m_key_descr_arr[keyno]->get_primary_key_tuple(
          table, *m_pk_descr, &key, m_pk_packed_tuple);
      if (size == RDB_INVALID_KEY_LEN) {
        return HA_ERR_ROCKSDB_CORRUPT_DATA;
      }

      /* Remember the rowkey of the record we are about to return. */
      m_last_rowkey.copy((const char *)m_pk_packed_tuple, size,
                         &my_charset_bin);

      rocksdb::Slice value = m_scan_it->value();
      /*
        If the index covers everything the lookup needs (or keyread-only is
        set) and no row lock is required, unpack directly from the index
        entry; otherwise fetch the full row through the primary key.
      */
      bool covered_lookup =
          m_keyread_only || m_key_descr_arr[keyno]->covers_lookup(
                                table, &value, &m_lookup_bitmap);
      if (covered_lookup && m_lock_rows == RDB_LOCK_NONE) {
        rc = m_key_descr_arr[keyno]->unpack_record(
            table, buf, &key, &value, m_verify_row_debug_checksums);
        global_stats.covered_secondary_key_lookups.inc();
      } else {
        rc = get_row_by_rowid(buf, m_pk_packed_tuple, size);
      }

      if (!rc) {
        table->status = 0;
#ifdef MARIAROCKS_NOT_YET
        stats.rows_read++;
        stats.rows_index_next++;
#endif
        update_row_stats(ROWS_READ);
      }
      return rc;
    }
  }
  return HA_ERR_END_OF_FILE;
}
7868 | |
7869 | bool ha_rocksdb::is_using_full_key(key_part_map keypart_map, |
7870 | uint actual_key_parts) |
7871 | { |
7872 | return (keypart_map == HA_WHOLE_KEY) || |
7873 | (keypart_map == ((key_part_map(1) << actual_key_parts) |
7874 | - 1)); |
7875 | } |
7876 | |
7877 | /* |
7878 | ha_rocksdb::read_range_first overrides handler::read_range_first. |
7879 | The only difference from handler::read_range_first is that |
7880 | ha_rocksdb::read_range_first passes end_key to |
7881 | ha_rocksdb::index_read_map_impl function. |
7882 | |
7883 | @return |
7884 | HA_EXIT_SUCCESS OK |
7885 | other HA_ERR error code (can be SE-specific) |
7886 | */ |
int ha_rocksdb::read_range_first(const key_range *const start_key,
                                 const key_range *const end_key,
                                 bool eq_range_arg, bool sorted) {
  DBUG_ENTER_FUNC();

  int result;

  eq_range = eq_range_arg;

  /* MariaDB: Pass RANGE_SCAN_ASC when we support ICP on backwards scans */
  set_end_range(end_key);

  range_key_part = table->key_info[active_index].key_part;

  if (!start_key) // Read first record
    result = ha_index_first(table->record[0]);
  else {
#ifdef MARIAROCKS_NOT_YET
    if (is_using_prohibited_gap_locks(
            is_using_full_unique_key(active_index, start_key->keypart_map,
                                     start_key->flag))) {
      DBUG_RETURN(HA_ERR_LOCK_DEADLOCK);
    }
#endif
    increment_statistics(&SSV::ha_read_key_count);

    /* Unlike handler::read_range_first(), pass end_key down to the impl
       so it can compute the equal-condition length for bloom filters. */
    MYSQL_TABLE_IO_WAIT(m_psi, PSI_TABLE_FETCH_ROW, active_index, 0, {
      result =
          index_read_map_impl(table->record[0], start_key->key,
                              start_key->keypart_map, start_key->flag, end_key);
    })
  }
  if (result)
    DBUG_RETURN((result == HA_ERR_KEY_NOT_FOUND) ? HA_ERR_END_OF_FILE : result);

  /* compare_key() <= 0 means the row is still within the requested range. */
  if (compare_key(end_range) <= 0) {
    DBUG_RETURN(HA_EXIT_SUCCESS);
  } else {
    /*
      The last read row does not fall in the range. So request
      storage engine to release row lock if possible.
    */
    unlock_row();
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }
}
7933 | |
7934 | int ha_rocksdb::prepare_index_scan() |
7935 | { |
7936 | range_key_part= table->key_info[active_index].key_part; |
7937 | set_end_range(NULL); |
7938 | return 0; |
7939 | } |
7940 | |
7941 | |
7942 | int ha_rocksdb::prepare_range_scan(const key_range *start_key, |
7943 | const key_range *end_key) |
7944 | { |
7945 | range_key_part= table->key_info[active_index].key_part; |
7946 | |
7947 | if (start_key) |
7948 | { |
7949 | m_save_start_range= *start_key; |
7950 | m_start_range= &m_save_start_range; |
7951 | } |
7952 | else |
7953 | m_start_range= NULL; |
7954 | |
7955 | set_end_range(end_key); |
7956 | return 0; |
7957 | } |
7958 | |
7959 | |
7960 | /** |
7961 | @return |
7962 | HA_EXIT_SUCCESS OK |
7963 | other HA_ERR error code (can be SE-specific) |
7964 | */ |
7965 | |
7966 | int ha_rocksdb::index_read_map(uchar *const buf, const uchar *const key, |
7967 | key_part_map keypart_map, |
7968 | enum ha_rkey_function find_flag) { |
7969 | DBUG_ENTER_FUNC(); |
7970 | |
7971 | DBUG_RETURN(index_read_map_impl(buf, key, keypart_map, find_flag, nullptr)); |
7972 | } |
7973 | |
7974 | /* |
7975 | See storage/rocksdb/rocksdb-range-access.txt for description of how MySQL |
7976 | index navigation commands are converted into RocksDB lookup commands. |
7977 | |
7978 | This function takes end_key as an argument, and it is set on range scan. |
7979 | MyRocks needs to decide whether prefix bloom filter can be used or not. |
7980 | To decide to use prefix bloom filter or not, calculating equal condition |
7981 | length |
7982 | is needed. On equal lookups (find_flag == HA_READ_KEY_EXACT), equal |
7983 | condition length is the same as rocksdb::Slice.size() of the start key. |
7984 | On range scan, equal condition length is MIN(start_key, end_key) of the |
7985 | rocksdb::Slice expression. |
7986 | |
7987 | @return |
7988 | HA_EXIT_SUCCESS OK |
7989 | other HA_ERR error code (can be SE-specific) |
7990 | */ |
int ha_rocksdb::index_read_map_impl(uchar *const buf, const uchar *const key,
                                    key_part_map keypart_map,
                                    enum ha_rkey_function find_flag,
                                    const key_range *end_key) {
  DBUG_ENTER_FUNC();

  int rc = 0;

  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  const uint actual_key_parts = kd.get_key_parts();
  bool using_full_key = is_using_full_key(keypart_map, actual_key_parts);

  /* If the caller passed no end key, fall back to the handler's end_range. */
  if (!end_key)
    end_key = end_range;

  /* By default, we don't need the retrieved records to match the prefix */
  m_sk_match_prefix = nullptr;
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  if (active_index == table->s->primary_key && find_flag == HA_READ_KEY_EXACT &&
      using_full_key) {
    /*
      Equality lookup over primary key, using full tuple.
      This is a special case, use DB::Get.
    */
    const uint size = kd.pack_index_tuple(table, m_pack_buffer,
                                          m_pk_packed_tuple, key, keypart_map);
    bool skip_lookup = is_blind_delete_enabled();

    rc = get_row_by_rowid(buf, m_pk_packed_tuple, size,
                          skip_lookup, false);

    if (!rc && !skip_lookup) {
#ifdef MARIAROCKS_NOT_YET
      stats.rows_read++;
      stats.rows_index_first++;
#endif
      update_row_stats(ROWS_READ);
    }
    DBUG_RETURN(rc);
  }

  /*
    Unique secondary index performs lookups without the extended key fields
  */
  uint packed_size;
  if (active_index != table->s->primary_key &&
      table->key_info[active_index].flags & HA_NOSAME &&
      find_flag == HA_READ_KEY_EXACT && using_full_key) {
    /* Pack only the user-defined key parts, not the extended PK tail. */
    key_part_map tmp_map = (key_part_map(1) << table->key_info[active_index]
                                                   .user_defined_key_parts) -
                           1;
    packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                      key, tmp_map);
    if (table->key_info[active_index].user_defined_key_parts !=
        kd.get_key_parts())
      using_full_key = false;
  } else {
    packed_size = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                      key, keypart_map);
  }

  if ((pushed_idx_cond && pushed_idx_cond_keyno == active_index) &&
      (find_flag == HA_READ_KEY_EXACT || find_flag == HA_READ_PREFIX_LAST)) {
    /*
      We are doing a point index lookup, and ICP is enabled. It is possible
      that this call will be followed by ha_rocksdb->index_next_same() call.

      Do what InnoDB does: save the lookup tuple now. We will need it in
      index_next_same/find_icp_matching_index_rec in order to stop scanning
      as soon as index record doesn't match the lookup tuple.

      When not using ICP, handler::index_next_same() will make sure that rows
      that don't match the lookup prefix are not returned.
    */
    m_sk_match_prefix = m_sk_match_prefix_buf;
    m_sk_match_length = packed_size;
    memcpy(m_sk_match_prefix, m_sk_packed_tuple, packed_size);
  }

  int bytes_changed_by_succ = 0;
  if (find_flag == HA_READ_PREFIX_LAST_OR_PREV ||
      find_flag == HA_READ_PREFIX_LAST || find_flag == HA_READ_AFTER_KEY) {
    /* The successor() call is compensated for in calc_eq_cond_len() below. */
    bytes_changed_by_succ = kd.successor(m_sk_packed_tuple, packed_size);
  }

  rocksdb::Slice slice(reinterpret_cast<const char *>(m_sk_packed_tuple),
                       packed_size);

  uint end_key_packed_size = 0;
  /* For HA_READ_PREFIX_LAST_OR_PREV (a backward read), the scan ends at the
     saved start range rather than at end_key. */
  const key_range *cur_end_key= end_key;
  if (find_flag == HA_READ_PREFIX_LAST_OR_PREV)
    cur_end_key= m_start_range;

  const uint eq_cond_len =
      calc_eq_cond_len(kd, find_flag, slice, bytes_changed_by_succ, cur_end_key,
                       &end_key_packed_size);

  /* An exact lookup using every key part can use the whole-key bloom filter. */
  bool use_all_keys = false;
  if (find_flag == HA_READ_KEY_EXACT &&
      my_count_bits(keypart_map) == kd.get_key_parts())
    use_all_keys = true;

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    /*
      This will open the iterator and position it at a record that's equal or
      greater than the lookup tuple.
    */
    setup_scan_iterator(kd, &slice, use_all_keys, eq_cond_len);

    /*
      Once positioned by the seek above, move to the position we really
      want: See storage/rocksdb/rocksdb-range-access.txt
    */
    bool move_forward;
    rc =
        position_to_correct_key(kd, find_flag, using_full_key, key, keypart_map,
                                slice, &move_forward, tx->m_snapshot_timestamp);

    if (rc) {
      /* This status is returned on any error */
      table->status = STATUS_NOT_FOUND;
      DBUG_RETURN(rc);
    }

    m_skip_scan_it_next_call = false;

    /*
      Now get the data for the row into 'buf'. If we were using a primary key
      then we have all the rows we need. For a secondary key we now need to
      lookup the primary key.
    */
    if (active_index == table->s->primary_key)
      rc = read_row_from_primary_key(buf);
    else
      rc = read_row_from_secondary_key(buf, kd, move_forward);

    if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
      break; /* Exit the loop */

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (rc) {
    /* the only possible error condition is record-not-found */
    table->status = STATUS_NOT_FOUND;
  } else {
    table->status = 0;
#ifdef MARIAROCKS_NOT_YET
    stats.rows_read++;
    stats.rows_index_first++;
#endif
    update_row_stats(ROWS_READ);
  }

  DBUG_RETURN(rc);
}
8157 | |
8158 | /* |
8159 | @brief |
8160 | Scan the secondary index until we find an index record that satisfies ICP |
8161 | |
8162 | @param move_forward TRUE <=> move m_scan_it forward |
8163 | FALSE <=> move m_scan_it backward |
8164 | @param buf Record buffer (must be the same buffer that |
8165 | pushed index condition points to, in practice |
8166 | it is table->record[0]) |
8167 | |
8168 | @detail |
8169 | Move the current iterator m_scan_it until we get an index tuple that |
8170 | satisfies the pushed Index Condition. |
8171 | (if there is no pushed index condition, return right away) |
8172 | |
8173 | @return |
8174 | 0 - Index tuple satisfies ICP, can do index read. |
8175 | other - error code |
8176 | */ |
8177 | |
int ha_rocksdb::find_icp_matching_index_rec(const bool &move_forward,
                                            uchar *const buf) {
  DBUG_ASSERT(buf != nullptr);

  /* Without a pushed index condition there is nothing to do. */
  if (pushed_idx_cond && pushed_idx_cond_keyno == active_index) {
    const Rdb_key_def &kd = *m_key_descr_arr[active_index];

    while (1) {
      /* Step over TTL-expired records, if any. */
      rocksdb_skip_expired_records(kd, m_scan_it, !move_forward);

      if (!is_valid(m_scan_it)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      const rocksdb::Slice rkey = m_scan_it->key();

      /* Stop when we have walked out of this index entirely. */
      if (!kd.covers_key(rkey)) {
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }

      /*
        If a point-lookup prefix was saved (see index_read_map_impl), stop
        as soon as the index record no longer matches it.
      */
      if (m_sk_match_prefix) {
        const rocksdb::Slice prefix((const char *)m_sk_match_prefix,
                                    m_sk_match_length);
        if (!kd.value_matches_prefix(rkey, prefix)) {
          table->status = STATUS_NOT_FOUND;
          return HA_ERR_END_OF_FILE;
        }
      }

      /* Unpack the index columns into 'buf' so the ICP check can see them. */
      const rocksdb::Slice value = m_scan_it->value();
      int err = kd.unpack_record(table, buf, &rkey, &value,
                                 m_verify_row_debug_checksums);
      if (err != HA_EXIT_SUCCESS) {
        return err;
      }

      const enum icp_result icp_status= handler_index_cond_check(this);
      if (icp_status == ICP_NO_MATCH) {
        rocksdb_smart_next(!move_forward, m_scan_it);
        continue; /* Get the next (or prev) index tuple */
      }
      else if (icp_status == ICP_OUT_OF_RANGE || icp_status == ICP_ABORTED_BY_USER) {
        /* We have walked out of range we are scanning */
        table->status = STATUS_NOT_FOUND;
        return HA_ERR_END_OF_FILE;
      }
      else /* icp_status == ICP_MATCH */
      {
        /* Index Condition is satisfied. We have rc==0, proceed to fetch the
         * row. */
        break;
      }
      /*
        TODO: should we have this here, or RockDB handles this internally?
        if (my_core::thd_killed(current_thd))
        {
          rc= HA_ERR_INTERNAL_ERROR; // doesn't matter
          break;
        }
      */
    }
  }
  return HA_EXIT_SUCCESS;
}
8243 | |
8244 | /** |
8245 | @return |
8246 | HA_EXIT_SUCCESS OK |
8247 | other HA_ERR error code (can be SE-specific) |
8248 | */ |
8249 | |
8250 | /** |
8251 | @return |
8252 | HA_ADMIN_OK OK |
8253 | other HA_ADMIN error code |
8254 | */ |
int ha_rocksdb::check(THD *const thd, HA_CHECK_OPT *const check_opt) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(check_opt != nullptr);

  const uint pk = pk_index(table, m_tbl_def);
  String rowkey_copy;
  String sec_key_copy;
  const char *const table_name = table->s->table_name.str;

  // Only when debugging: don't use snapshot when reading
  // Rdb_transaction *tx= get_or_create_tx(table->in_use);
  // tx->snapshot= nullptr;

  // Force row checksum verification for the duration of the check; the
  // previous setting is restored on both the success and the error path.
  const bool save_verify_row_debug_checksums = m_verify_row_debug_checksums;
  m_verify_row_debug_checksums = true;

  /* For each secondary index, check that we can get a PK value from it */
  // NO_LINT_DEBUG
  sql_print_verbose_info("CHECKTABLE %s: Checking table %s", table_name,
                         table_name);
  // Checksum counter snapshot taken at the start of the first index scan;
  // used at the end to report how many PK records carried checksums.
  ha_rows UNINIT_VAR(row_checksums_at_start); // set/used iff first_index==true
  ha_rows row_checksums = ha_rows(-1);
  bool first_index = true;

  for (uint keyno = 0; keyno < table->s->keys; keyno++) {
    if (keyno != pk) {
      extra(HA_EXTRA_KEYREAD);
      ha_index_init(keyno, true);
      ha_rows rows = 0;
      ha_rows checksums = 0;
      if (first_index)
        row_checksums_at_start = m_row_checksums_checked;
      int res;
      // NO_LINT_DEBUG
      sql_print_verbose_info("CHECKTABLE %s: Checking index %s", table_name,
                             table->key_info[keyno].name);
      // Walk every entry of this secondary index.
      while (1) {
        if (!rows)
          res = index_first(table->record[0]);
        else
          res = index_next(table->record[0]);

        if (res == HA_ERR_END_OF_FILE)
          break;
        if (res) {
          // error
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: index scan error %d",
                          table_name, rows, res);
          goto error;
        }
        // Copy the index key and the PK ("rowid") it points to, because the
        // buffers they live in are reused by the PK lookup below.
        rocksdb::Slice key = m_scan_it->key();
        sec_key_copy.copy(key.data(), key.size(), &my_charset_bin);
        rowkey_copy.copy(m_last_rowkey.ptr(), m_last_rowkey.length(),
                         &my_charset_bin);

        if (m_key_descr_arr[keyno]->unpack_info_has_checksum(
                m_scan_it->value())) {
          checksums++;
        }

        // Every secondary index entry must point to an existing PK record.
        if ((res = get_row_by_rowid(table->record[0], rowkey_copy.ptr(),
                                    rowkey_copy.length()))) {
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: "
                          "failed to fetch row by rowid",
                          table_name, rows);
          goto error;
        }

        longlong hidden_pk_id = 0;
        if (has_hidden_pk(table) &&
            read_hidden_pk_id_from_rowkey(&hidden_pk_id))
          goto error;

        /* Check if we get the same PK value */
        // Re-pack the PK from the fetched row; it must byte-match the rowid
        // the index entry pointed to.
        uint packed_size = m_pk_descr->pack_record(
            table, m_pack_buffer, table->record[0], m_pk_packed_tuple, nullptr,
            false, hidden_pk_id);
        if (packed_size != rowkey_copy.length() ||
            memcmp(m_pk_packed_tuple, rowkey_copy.ptr(), packed_size)) {
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: PK value mismatch",
                          table_name, rows);
          goto print_and_error;
        }

        /* Check if we get the same secondary key value */
        packed_size = m_key_descr_arr[keyno]->pack_record(
            table, m_pack_buffer, table->record[0], m_sk_packed_tuple,
            &m_sk_tails, false, hidden_pk_id);
        if (packed_size != sec_key_copy.length() ||
            memcmp(m_sk_packed_tuple, sec_key_copy.ptr(), packed_size)) {
          // NO_LINT_DEBUG
          sql_print_error("CHECKTABLE %s: .. row %lld: "
                          "secondary index value mismatch",
                          table_name, rows);
          goto print_and_error;
        }
        rows++;
        continue;

      // Dump the offending rowkey / record / index entry in hex before
      // bailing out with a corruption error.
      print_and_error : {
        std::string buf;
        buf = rdb_hexdump(rowkey_copy.ptr(), rowkey_copy.length(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s: rowkey: %s", table_name, buf.c_str());

        buf = rdb_hexdump(m_retrieved_record.data(), m_retrieved_record.size(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s: record: %s", table_name, buf.c_str());

        buf = rdb_hexdump(sec_key_copy.ptr(), sec_key_copy.length(),
                          RDB_MAX_HEXDUMP_LEN);
        // NO_LINT_DEBUG
        sql_print_error("CHECKTABLE %s: index: %s", table_name, buf.c_str());

        goto error;
      }
      }
      // NO_LINT_DEBUG
      sql_print_verbose_info("CHECKTABLE %s: ... %lld index entries checked "
                             "(%lld had checksums)",
                             table_name, rows, checksums);

      if (first_index) {
        row_checksums = m_row_checksums_checked - row_checksums_at_start;
        first_index = false;
      }
      ha_index_end();
    }
  }
  if (row_checksums != ha_rows(-1)) {
    // NO_LINT_DEBUG
    sql_print_verbose_info("CHECKTABLE %s: %lld table records had checksums",
                           table_name, row_checksums);
  }
  extra(HA_EXTRA_NO_KEYREAD);

  m_verify_row_debug_checksums = save_verify_row_debug_checksums;
  /*
    TODO: we should check also for PK records that are missing in the secondary
    indexes.
    For that, need to walk through the PK and check that every PK record has a
    proper counterpart in each secondary index.
  */
  DBUG_RETURN(HA_ADMIN_OK);
error:
  // Restore handler state before reporting corruption.
  m_verify_row_debug_checksums = save_verify_row_debug_checksums;
  ha_index_or_rnd_end();
  extra(HA_EXTRA_NO_KEYREAD);

  DBUG_RETURN(HA_ADMIN_CORRUPT);
}
8413 | |
8414 | static void dbug_dump_str(FILE *const out, const char *const str, int len) { |
8415 | fprintf(out, "\"" ); |
8416 | for (int i = 0; i < len; i++) { |
8417 | if (str[i] > 32) |
8418 | fprintf(out, "%c" , str[i]); |
8419 | else |
8420 | fprintf(out, "\\%d" , str[i]); |
8421 | } |
8422 | fprintf(out, "\"" ); |
8423 | } |
8424 | |
8425 | /* |
8426 | Debugging help: dump the whole database into a human-readable file. |
8427 | Usage: |
8428 | dbug_dump_database(rdb); |
8429 | */ |
8430 | |
8431 | void dbug_dump_database(rocksdb::DB *const db) { |
8432 | FILE *const out = fopen("/tmp/rocksdb.dump" , "wt" ); |
8433 | if (!out) |
8434 | return; |
8435 | |
8436 | rocksdb::Iterator *it = db->NewIterator(rocksdb::ReadOptions()); |
8437 | for (it->SeekToFirst(); it->Valid(); it->Next()) { |
8438 | rocksdb::Slice key = it->key(); |
8439 | rocksdb::Slice val = it->value(); |
8440 | dbug_dump_str(out, key.data(), key.size()); |
8441 | fprintf(out, " -> " ); |
8442 | dbug_dump_str(out, val.data(), val.size()); |
8443 | fprintf(out, "\n" ); |
8444 | } |
8445 | |
8446 | delete it; |
8447 | fclose(out); |
8448 | } |
8449 | |
8450 | rocksdb::Status ha_rocksdb::get_for_update( |
8451 | Rdb_transaction *const tx, rocksdb::ColumnFamilyHandle *const column_family, |
8452 | const rocksdb::Slice &key, rocksdb::PinnableSlice *const value) const { |
8453 | DBUG_ASSERT(m_lock_rows != RDB_LOCK_NONE); |
8454 | const bool exclusive = m_lock_rows != RDB_LOCK_READ; |
8455 | |
8456 | rocksdb::Status s = tx->get_for_update(column_family, key, value, exclusive); |
8457 | |
8458 | // If we have a lock conflict and we are running in READ COMMITTTED mode |
8459 | // release and reacquire the snapshot and then retry the get_for_update(). |
8460 | if (s.IsBusy() && !s.IsDeadlock() && |
8461 | my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED) { |
8462 | tx->release_snapshot(); |
8463 | tx->acquire_snapshot(false); |
8464 | |
8465 | s = tx->get_for_update(column_family, key, value, exclusive); |
8466 | } |
8467 | |
8468 | return s; |
8469 | } |
8470 | |
8471 | bool ha_rocksdb::is_blind_delete_enabled() |
8472 | { |
8473 | THD *thd = ha_thd(); |
8474 | return (THDVAR(thd, blind_delete_primary_key) && |
8475 | thd->lex->sql_command == SQLCOM_DELETE && |
8476 | thd->lex->table_count == 1 && |
8477 | table->s->keys == 1 && |
8478 | !has_hidden_pk(table) && |
8479 | !thd->rgi_slave); |
8480 | } |
8481 | |
8482 | /* |
8483 | Given a rowid (i.e. packed PK) as a parameter, get the record. |
8484 | |
8485 | @return |
8486 | HA_EXIT_SUCCESS OK |
8487 | other HA_ERR error code (can be SE-specific) |
8488 | */ |
8489 | |
int ha_rocksdb::get_row_by_rowid(uchar *const buf, const char *const rowid,
                                 const uint rowid_size, const bool skip_lookup,
                                 const bool skip_ttl_check) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);
  DBUG_ASSERT(rowid != nullptr);
  DBUG_ASSERT(table != nullptr);

  int rc;

  rocksdb::Slice key_slice(rowid, rowid_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  DEBUG_SYNC(ha_thd(), "rocksdb.get_row_by_rowid");
  DBUG_EXECUTE_IF("dbug.rocksdb.get_row_by_rowid", {
    THD *thd = ha_thd();
    const char act[] = "now signal Reached "
                       "wait_for signal.rocksdb.get_row_by_rowid_let_running";
    DBUG_ASSERT(opt_debug_sync_timeout > 0);
    DBUG_ASSERT(!debug_sync_set_action(thd, STRING_WITH_LEN(act)));
  };);

  bool found;
  rocksdb::Status s;

  /* Pretend row found without looking up */
  // Used by "blind deletes": the caller knows the row can be deleted
  // without reading it first, so only record the key and success stats.
  if (skip_lookup)
  {
#ifdef MARIAROCKS_NOT_YET
    stats.rows_deleted_blind++;
#endif
    update_row_stats(ROWS_DELETED_BLIND);
    m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
    table->status = 0;
    DBUG_RETURN(0);
  }

  // Non-locking reads go through the transaction snapshot; locking reads
  // take a row lock via get_for_update().
  if (m_lock_rows == RDB_LOCK_NONE) {
    tx->acquire_snapshot(true);
    s = tx->get(m_pk_descr->get_cf(), key_slice, &m_retrieved_record);
  } else {
    s = get_for_update(tx, m_pk_descr->get_cf(), key_slice,
                       &m_retrieved_record);
  }

  DBUG_EXECUTE_IF("rocksdb_return_status_corrupted",
                  dbug_change_status_to_corrupted(&s););

  // Any status other than OK / NotFound is a hard error.
  if (!s.IsNotFound() && !s.ok()) {
    DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                     m_table_handler));
  }
  found = !s.IsNotFound();

  table->status = STATUS_NOT_FOUND;
  if (found) {
    /* If we found the record, but it's expired, pretend we didn't find it. */
    if (!skip_ttl_check && m_pk_descr->has_ttl() &&
        should_hide_ttl_rec(*m_pk_descr, m_retrieved_record,
                            tx->m_snapshot_timestamp)) {
      DBUG_RETURN(HA_ERR_KEY_NOT_FOUND);
    }

    // Remember the key and unpack the stored record into the MySQL row
    // buffer.
    m_last_rowkey.copy((const char *)rowid, rowid_size, &my_charset_bin);
    rc = convert_record_from_storage_format(&key_slice, buf);

    if (!rc) {
      table->status = 0;
    }
  } else {
    /*
      Note: we don't need to unlock the row. It is intentional that we keep
      locks on rows that don't exist.
    */
    rc = HA_ERR_KEY_NOT_FOUND;
  }

  DBUG_RETURN(rc);
}
8572 | |
8573 | /** |
8574 | @return |
8575 | HA_EXIT_SUCCESS OK |
8576 | other HA_ERR error code (can be SE-specific) |
8577 | */ |
8578 | int ha_rocksdb::index_next(uchar *const buf) { |
8579 | DBUG_ENTER_FUNC(); |
8580 | |
8581 | bool moves_forward = true; |
8582 | if (m_key_descr_arr[active_index]->m_is_reverse_cf) { |
8583 | moves_forward = false; |
8584 | } |
8585 | |
8586 | int rc = index_next_with_direction(buf, moves_forward); |
8587 | if (rc == HA_ERR_KEY_NOT_FOUND) |
8588 | rc = HA_ERR_END_OF_FILE; |
8589 | |
8590 | DBUG_RETURN(rc); |
8591 | } |
8592 | |
8593 | /** |
8594 | @return |
8595 | HA_EXIT_SUCCESS OK |
8596 | other HA_ERR error code (can be SE-specific) |
8597 | */ |
8598 | int ha_rocksdb::index_prev(uchar *const buf) { |
8599 | DBUG_ENTER_FUNC(); |
8600 | |
8601 | bool moves_forward = false; |
8602 | if (m_key_descr_arr[active_index]->m_is_reverse_cf) { |
8603 | moves_forward = true; |
8604 | } |
8605 | |
8606 | int rc = index_next_with_direction(buf, moves_forward); |
8607 | if (rc == HA_ERR_KEY_NOT_FOUND) |
8608 | rc = HA_ERR_END_OF_FILE; |
8609 | |
8610 | DBUG_RETURN(rc); |
8611 | } |
8612 | |
8613 | int ha_rocksdb::index_next_with_direction(uchar *const buf, bool move_forward) { |
8614 | DBUG_ENTER_FUNC(); |
8615 | |
8616 | int rc; |
8617 | |
8618 | if (active_index == pk_index(table, m_tbl_def)) { |
8619 | rc = rnd_next_with_direction(buf, move_forward); |
8620 | } else { |
8621 | if (m_skip_scan_it_next_call) { |
8622 | m_skip_scan_it_next_call = false; |
8623 | } else { |
8624 | if (move_forward) |
8625 | m_scan_it->Next(); /* this call cannot fail */ |
8626 | else |
8627 | m_scan_it->Prev(); |
8628 | } |
8629 | rocksdb_skip_expired_records(*m_key_descr_arr[active_index], m_scan_it, |
8630 | !move_forward); |
8631 | rc = find_icp_matching_index_rec(move_forward, buf); |
8632 | if (!rc) |
8633 | rc = secondary_index_read(active_index, buf); |
8634 | } |
8635 | |
8636 | DBUG_RETURN(rc); |
8637 | } |
8638 | |
8639 | /** |
8640 | @return |
8641 | HA_EXIT_SUCCESS OK |
8642 | other HA_ERR error code (can be SE-specific) |
8643 | */ |
8644 | int ha_rocksdb::index_first(uchar *const buf) { |
8645 | DBUG_ENTER_FUNC(); |
8646 | |
8647 | m_sk_match_prefix = nullptr; |
8648 | int rc = m_key_descr_arr[active_index]->m_is_reverse_cf |
8649 | ? index_last_intern(buf) |
8650 | : index_first_intern(buf); |
8651 | if (rc == HA_ERR_KEY_NOT_FOUND) |
8652 | rc = HA_ERR_END_OF_FILE; |
8653 | |
8654 | DBUG_RETURN(rc); |
8655 | } |
8656 | |
8657 | /** |
8658 | @return |
8659 | HA_EXIT_SUCCESS OK |
8660 | other HA_ERR error code (can be SE-specific) |
8661 | */ |
8662 | int ha_rocksdb::index_last(uchar *const buf) { |
8663 | DBUG_ENTER_FUNC(); |
8664 | |
8665 | m_sk_match_prefix = nullptr; |
8666 | int rc = m_key_descr_arr[active_index]->m_is_reverse_cf |
8667 | ? index_first_intern(buf) |
8668 | : index_last_intern(buf); |
8669 | if (rc == HA_ERR_KEY_NOT_FOUND) |
8670 | rc = HA_ERR_END_OF_FILE; |
8671 | |
8672 | DBUG_RETURN(rc); |
8673 | } |
8674 | |
8675 | /* |
8676 | Start scanning from the "first" value. |
8677 | |
8678 | The 'first' here means "the first from start of the key space". |
8679 | For reverse-ordered key spaces, the first value will be the biggest, 'zzzz'. |
8680 | |
8681 | An picture of a forward-ordered keyspace (remember, the keys have form |
8682 | 'indexnr-keyval'. Suppose the index we are at has number n) |
8683 | |
8684 | (n-1) - ... |
8685 | ( n ) <--- 1. (n) doesn't exist in the db but it would be here. |
8686 | ( n ) - aaa <--- 2. Seek("n") will put us here on the first index |
8687 | ( n ) - bbb record. |
8688 | ( n ) - cc |
8689 | |
8690 | So, need to do: Seek(n); |
8691 | |
8692 | A backward-ordered keyspace: |
8693 | |
8694 | (n+1) - bbb |
8695 | (n+1) - aaa |
8696 | (n+1) <--- (n+1) doesn't exist in the db but would be here. |
8697 | ( n ) - ccc <--- 1. We need to be here. |
8698 | ( n ) - bbb |
8699 | ( n ) - aaa |
8700 | ( n ) |
8701 | |
8702 | So, need to: Seek(n+1); |
8703 | |
8704 | */ |
8705 | |
int ha_rocksdb::index_first_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);

  uchar *key;
  uint key_size;
  int rc;

  // Pick the packed-tuple buffer matching the active index type.
  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  // Build the seek target described in the comment above this function:
  // the smallest possible key carrying this index's number.
  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  int key_start_matching_bytes = kd.get_first_key(key, &key_size);

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  const bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    setup_scan_iterator(kd, &index_key, false, key_start_matching_bytes);
    m_scan_it->Seek(index_key);
    // Seek() already positioned us on the first candidate record, so the
    // index_next_with_direction() call below must not advance the iterator.
    m_skip_scan_it_next_call = true;

    rc = index_next_with_direction(buf, true);
    if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
      break; // exit the loop

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (!rc) {
    /*
      index_next is always incremented on success, so decrement if it is
      index_first instead
    */
#ifdef MARIAROCKS_NOT_YET
    stats.rows_index_first++;
    stats.rows_index_next--;
#endif
  }

  DBUG_RETURN(rc);
}
8761 | |
8762 | /** |
8763 | @details |
8764 | Start scanning from the "last" value |
8765 | |
8766 | The 'last' here means "the last from start of the key space". |
8767 | For reverse-ordered key spaces, we will actually read the smallest value. |
8768 | |
8769 | An picture of a forward-ordered keyspace (remember, the keys have form |
8770 | 'indexnr-keyval'. Suppose the we are at a key that has number n) |
8771 | |
8772 | (n-1)-something |
8773 | ( n )-aaa |
8774 | ( n )-bbb |
8775 | ( n )-ccc <----------- Need to seek to here. |
8776 | (n+1) <---- Doesn't exist, but would be here. |
8777 | (n+1)-smth, or no value at all |
8778 | |
8779 | RocksDB's Iterator::SeekForPrev($val) seeks to "at $val or last value that's |
8780 | smaller". We can't seek to "(n)-ccc" directly, because we don't know what |
8781 | is the value of 'ccc' (the biggest record with prefix (n)). Instead, we seek |
8782 | to "(n+1)", which is the least possible value that's greater than any value |
8783 | in index #n. |
8784 | |
8785 | So, need to: it->SeekForPrev(n+1) |
8786 | |
8787 | A backward-ordered keyspace: |
8788 | |
8789 | (n+1)-something |
8790 | ( n ) - ccc |
8791 | ( n ) - bbb |
8792 | ( n ) - aaa <---------------- (*) Need to seek here. |
8793 | ( n ) <--- Doesn't exist, but would be here. |
8794 | (n-1)-smth, or no value at all |
8795 | |
8796 | So, need to: it->SeekForPrev(n) |
8797 | */ |
8798 | |
int ha_rocksdb::index_last_intern(uchar *const buf) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(buf != nullptr);

  uchar *key;
  uint key_size;
  int rc;

  // Pick the packed-tuple buffer matching the active index type.
  if (is_pk(active_index, table, m_tbl_def)) {
    key = m_pk_packed_tuple;
  } else {
    key = m_sk_packed_tuple;
  }

  DBUG_ASSERT(key != nullptr);

  // Build the SeekForPrev() target described in the comment above this
  // function: for a reverse-ordered CF the index's infimum key, otherwise
  // its supremum key.
  const Rdb_key_def &kd = *m_key_descr_arr[active_index];
  if (kd.m_is_reverse_cf) {
    kd.get_infimum_key(key, &key_size);
  } else {
    kd.get_supremum_key(key, &key_size);
  }

  rocksdb::Slice index_key((const char *)key, key_size);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);
  DBUG_ASSERT(tx != nullptr);

  bool is_new_snapshot = !tx->has_snapshot();
  // Loop as long as we get a deadlock error AND we end up creating the
  // snapshot here (i.e. it did not exist prior to this)
  for (;;) {
    setup_scan_iterator(kd, &index_key, false, Rdb_key_def::INDEX_NUMBER_SIZE);
    m_scan_it->SeekForPrev(index_key);
    m_skip_scan_it_next_call = false;

    if (is_pk(active_index, table, m_tbl_def)) {
      // SeekForPrev() already positioned the iterator on the last record,
      // so the table-scan read below must not advance it.
      m_skip_scan_it_next_call = true;
      rc = rnd_next_with_direction(buf, false);
    } else {
      rc = find_icp_matching_index_rec(false /*move_forward*/, buf);
      if (!rc)
        rc = secondary_index_read(active_index, buf);
    }

    if (rc != HA_ERR_LOCK_DEADLOCK || !is_new_snapshot)
      break; /* exit the loop */

    // release the snapshot and iterator so they will be regenerated
    tx->release_snapshot();
    release_scan_iterator();
  }

  if (!rc) {
    /*
      index_next is always incremented on success, so decrement if it is
      index_first instead
    */
#ifdef MARIAROCKS_NOT_YET
    stats.rows_index_first++;
    stats.rows_index_next--;
#endif
  }

  DBUG_RETURN(rc);
}
8866 | |
8867 | void ha_rocksdb::unlock_row() { |
8868 | DBUG_ENTER_FUNC(); |
8869 | |
8870 | if (m_lock_rows != RDB_LOCK_NONE) { |
8871 | Rdb_transaction *const tx = get_or_create_tx(table->in_use); |
8872 | tx->release_lock(m_pk_descr->get_cf(), |
8873 | std::string(m_last_rowkey.ptr(), m_last_rowkey.length())); |
8874 | } |
8875 | |
8876 | DBUG_VOID_RETURN; |
8877 | } |
8878 | |
8879 | /* |
8880 | Returning true if SingleDelete can be used. |
8881 | - Secondary Indexes can always use SingleDelete. |
8882 | - If the index is PRIMARY KEY, and if all of the columns of the table |
8883 | are covered by the PRIMARY KEY, SingleDelete can be used. |
8884 | */ |
8885 | bool ha_rocksdb::can_use_single_delete(const uint &index) const { |
8886 | return (index != pk_index(table, m_tbl_def) || |
8887 | (!has_hidden_pk(table) && |
8888 | table->key_info[index].ext_key_parts == table->s->fields)); |
8889 | } |
8890 | |
8891 | bool ha_rocksdb::skip_unique_check() const { |
8892 | /* |
8893 | We want to skip unique checks if: |
8894 | 1) bulk_load is on |
8895 | 2) this table is in the whitelist of tables to skip and the replication |
8896 | lag has reached a large enough value (see unique_check_lag_threshold |
8897 | and unique_check_lage_reset_threshold) |
8898 | 3) the user set unique_checks option to 0, and the table does not have |
8899 | any indexes. If the table has secondary keys, then those might becomes |
8900 | inconsisted/corrupted |
8901 | */ |
8902 | return THDVAR(table->in_use, bulk_load) || |
8903 | (m_force_skip_unique_check && m_skip_unique_check) || |
8904 | (my_core::thd_test_options(table->in_use, |
8905 | OPTION_RELAXED_UNIQUE_CHECKS) && |
8906 | m_tbl_def->m_key_count == 1); |
8907 | } |
8908 | |
8909 | #ifdef MARIAROCKS_NOT_YET // MDEV-10975 |
// Remember whether unique-key checking should be forcibly skipped for
// subsequent writes through this handler (consulted by skip_unique_check()).
void ha_rocksdb::set_force_skip_unique_check(bool skip) {
  DBUG_ENTER_FUNC();

  m_force_skip_unique_check = skip;

  DBUG_VOID_RETURN;
}
8917 | #endif |
8918 | |
8919 | bool ha_rocksdb::commit_in_the_middle() { |
8920 | return THDVAR(table->in_use, bulk_load) || |
8921 | THDVAR(table->in_use, commit_in_the_middle); |
8922 | } |
8923 | |
8924 | /* |
8925 | Executing bulk commit if it should. |
8926 | @retval true if bulk commit failed |
8927 | @retval false if bulk commit was skipped or succeeded |
8928 | */ |
8929 | bool ha_rocksdb::do_bulk_commit(Rdb_transaction *const tx) { |
8930 | DBUG_ASSERT(tx != nullptr); |
8931 | return commit_in_the_middle() && |
8932 | tx->get_write_count() >= THDVAR(table->in_use, bulk_load_size) && |
8933 | tx->flush_batch(); |
8934 | } |
8935 | |
8936 | /* |
8937 | If table was created without primary key, SQL layer represents the primary |
8938 | key number as MAX_INDEXES. Hence, this function returns true if the table |
8939 | does not contain a primary key. (In which case we generate a hidden |
8940 | 'auto-incremented' pk.) |
8941 | */ |
8942 | bool ha_rocksdb::has_hidden_pk(const TABLE *const table) const { |
8943 | DBUG_ASSERT(table != nullptr); |
8944 | return Rdb_key_def::table_has_hidden_pk(table); |
8945 | } |
8946 | |
8947 | /* |
8948 | Returns true if given index number is a hidden_pk. |
8949 | - This is used when a table is created with no primary key. |
8950 | */ |
8951 | bool ha_rocksdb::is_hidden_pk(const uint index, const TABLE *const table_arg, |
8952 | const Rdb_tbl_def *const tbl_def_arg) { |
8953 | DBUG_ASSERT(table_arg != nullptr); |
8954 | DBUG_ASSERT(table_arg->s != nullptr); |
8955 | DBUG_ASSERT(tbl_def_arg != nullptr); |
8956 | |
8957 | return (table_arg->s->primary_key == MAX_INDEXES && |
8958 | index == tbl_def_arg->m_key_count - 1); |
8959 | } |
8960 | |
8961 | /* Returns index of primary key */ |
8962 | uint ha_rocksdb::pk_index(const TABLE *const table_arg, |
8963 | const Rdb_tbl_def *const tbl_def_arg) { |
8964 | DBUG_ASSERT(table_arg != nullptr); |
8965 | DBUG_ASSERT(table_arg->s != nullptr); |
8966 | DBUG_ASSERT(tbl_def_arg != nullptr); |
8967 | |
8968 | return table_arg->s->primary_key == MAX_INDEXES ? tbl_def_arg->m_key_count - 1 |
8969 | : table_arg->s->primary_key; |
8970 | } |
8971 | |
8972 | /* Returns true if given index number is a primary key */ |
8973 | bool ha_rocksdb::is_pk(const uint index, const TABLE *const table_arg, |
8974 | const Rdb_tbl_def *const tbl_def_arg) { |
8975 | DBUG_ASSERT(table_arg != nullptr); |
8976 | DBUG_ASSERT(table_arg->s != nullptr); |
8977 | DBUG_ASSERT(tbl_def_arg != nullptr); |
8978 | |
8979 | return index == table_arg->s->primary_key || |
8980 | is_hidden_pk(index, table_arg, tbl_def_arg); |
8981 | } |
8982 | |
8983 | uint ha_rocksdb::max_supported_key_part_length() const { |
8984 | DBUG_ENTER_FUNC(); |
8985 | DBUG_RETURN(rocksdb_large_prefix ? MAX_INDEX_COL_LEN_LARGE |
8986 | : MAX_INDEX_COL_LEN_SMALL); |
8987 | } |
8988 | |
8989 | const char *ha_rocksdb::get_key_name(const uint index, |
8990 | const TABLE *const table_arg, |
8991 | const Rdb_tbl_def *const tbl_def_arg) { |
8992 | DBUG_ASSERT(table_arg != nullptr); |
8993 | DBUG_ASSERT(tbl_def_arg != nullptr); |
8994 | |
8995 | if (is_hidden_pk(index, table_arg, tbl_def_arg)) { |
8996 | return HIDDEN_PK_NAME; |
8997 | } |
8998 | |
8999 | DBUG_ASSERT(table_arg->key_info != nullptr); |
9000 | DBUG_ASSERT(table_arg->key_info[index].name.str != nullptr); |
9001 | |
9002 | return table_arg->key_info[index].name.str; |
9003 | } |
9004 | |
9005 | const char *ha_rocksdb::(const uint index, |
9006 | const TABLE *const table_arg, |
9007 | const Rdb_tbl_def *const tbl_def_arg) { |
9008 | DBUG_ASSERT(table_arg != nullptr); |
9009 | DBUG_ASSERT(tbl_def_arg != nullptr); |
9010 | |
9011 | if (is_hidden_pk(index, table_arg, tbl_def_arg)) { |
9012 | return nullptr; |
9013 | } |
9014 | |
9015 | DBUG_ASSERT(table_arg->key_info != nullptr); |
9016 | |
9017 | return table_arg->key_info[index].comment.str; |
9018 | } |
9019 | |
9020 | const std::string ha_rocksdb::generate_cf_name(const uint index, |
9021 | const TABLE *const table_arg, |
9022 | const Rdb_tbl_def *const tbl_def_arg, |
9023 | bool *per_part_match_found) { |
9024 | DBUG_ASSERT(table_arg != nullptr); |
9025 | DBUG_ASSERT(tbl_def_arg != nullptr); |
9026 | DBUG_ASSERT(per_part_match_found != nullptr); |
9027 | |
9028 | // When creating CF-s the caller needs to know if there was a custom CF name |
9029 | // specified for a given paritition. |
9030 | *per_part_match_found = false; |
9031 | |
9032 | // Index comment is used to define the column family name specification(s). |
9033 | // If there was no comment, we get an emptry string, and it means "use the |
9034 | // default column family". |
9035 | const char *const = get_key_comment(index, table_arg, tbl_def_arg); |
9036 | |
9037 | // `get_key_comment` can return `nullptr`, that's why this. |
9038 | std::string = comment ? comment : "" ; |
9039 | |
9040 | std::string cf_name = Rdb_key_def::parse_comment_for_qualifier( |
9041 | key_comment, table_arg, tbl_def_arg, per_part_match_found, |
9042 | RDB_CF_NAME_QUALIFIER); |
9043 | |
9044 | if (IF_PARTITIONING(table_arg->part_info,nullptr) != nullptr && !*per_part_match_found) { |
9045 | // At this point we tried to search for a custom CF name for a partition, |
9046 | // but none was specified. Therefore default one will be used. |
9047 | return "" ; |
9048 | } |
9049 | |
9050 | // If we didn't find any partitioned/non-partitioned qualifiers, return the |
9051 | // comment itself. NOTE: this currently handles returning the cf name |
9052 | // specified in the index comment in the case of no partitions, which doesn't |
9053 | // use any qualifiers at the moment. (aka its a special case) |
9054 | if (cf_name.empty() && !key_comment.empty()) { |
9055 | return key_comment; |
9056 | } |
9057 | |
9058 | return cf_name; |
9059 | } |
9060 | |
9061 | const std::string ha_rocksdb::(const TABLE *const table_arg) { |
9062 | DBUG_ASSERT(table_arg != nullptr); |
9063 | DBUG_ASSERT(table_arg->s != nullptr); |
9064 | |
9065 | return table_arg->s->comment.str; |
9066 | } |
9067 | |
9068 | /** |
9069 | @return |
9070 | HA_EXIT_SUCCESS OK |
9071 | other HA_ERR error code (can be SE-specific) |
9072 | */ |
9073 | int ha_rocksdb::write_row(uchar *const buf) { |
9074 | DBUG_ENTER_FUNC(); |
9075 | |
9076 | DBUG_ASSERT(buf != nullptr); |
9077 | DBUG_ASSERT(buf == table->record[0]); |
9078 | DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE); |
9079 | |
9080 | /* |
9081 | Note: "buf == table->record[0]" is copied from innodb. I am not aware of |
9082 | any use cases where this condition is not true. |
9083 | */ |
9084 | if (table->next_number_field && buf == table->record[0]) { |
9085 | int err; |
9086 | if ((err = update_auto_increment())) { |
9087 | DBUG_RETURN(err); |
9088 | } |
9089 | } |
9090 | |
9091 | const int rv = update_write_row(nullptr, buf, skip_unique_check()); |
9092 | |
9093 | if (rv == 0) { |
9094 | #ifdef MARIAROCKS_NOT_YET |
9095 | stats.rows_inserted++; |
9096 | #endif |
9097 | update_row_stats(ROWS_INSERTED); |
9098 | } |
9099 | |
9100 | DBUG_RETURN(rv); |
9101 | } |
9102 | |
9103 | /** |
9104 | Constructing m_last_rowkey (MyRocks key expression) from |
9105 | before_update|delete image (MySQL row expression). |
9106 | m_last_rowkey is normally set during lookup phase, such as |
9107 | rnd_next_with_direction() and rnd_pos(). With Read Free Replication, |
9108 | these read functions are skipped and update_rows(), delete_rows() are |
9109 | called without setting m_last_rowkey. This function sets m_last_rowkey |
9110 | for Read Free Replication. |
9111 | */ |
void ha_rocksdb::set_last_rowkey(const uchar *const old_data) {
#ifdef MARIAROCKS_NOT_YET
  // With Read Free Replication the lookup phase is skipped, so rebuild
  // m_last_rowkey by packing the PK from the before-image of the row.
  if (old_data && use_read_free_rpl()) {
    const int old_pk_size = m_pk_descr->pack_record(
        table, m_pack_buffer, old_data, m_pk_packed_tuple, nullptr, false);
    m_last_rowkey.copy((const char *)m_pk_packed_tuple, old_pk_size,
                       &my_charset_bin);
  }
#endif
}
9122 | |
// Compute the packed primary key ("rowid") for the row being written, and
// publish it (plus the hidden-PK id, where applicable) in *row_info.
// Returns HA_EXIT_SUCCESS or an HA_ERR error code.
int ha_rocksdb::get_pk_for_update(struct update_row_info *const row_info) {
  int size;

  /*
    Get new row key for any insert, and any update where the pk is not hidden.
    Row key for updates with hidden pk is handled below.
  */
  if (!has_hidden_pk(table)) {
    row_info->hidden_pk_id = 0;

    row_info->new_pk_unpack_info = &m_pk_unpack_info;

    // Pack the visible PK from the new row image into m_pk_packed_tuple.
    size =
        m_pk_descr->pack_record(table, m_pack_buffer, row_info->new_data,
                                m_pk_packed_tuple, row_info->new_pk_unpack_info,
                                false, 0, 0, nullptr, &row_info->ttl_pk_offset);
  } else if (row_info->old_data == nullptr) {
    // INSERT with a hidden PK: allocate the next hidden PK value.
    row_info->hidden_pk_id = update_hidden_pk_val();
    size =
        m_pk_descr->pack_hidden_pk(row_info->hidden_pk_id, m_pk_packed_tuple);
  } else {
    /*
      If hidden primary key, rowkey for new record will always be the same as
      before
    */
    size = row_info->old_pk_slice.size();
    memcpy(m_pk_packed_tuple, row_info->old_pk_slice.data(), size);
    int err = read_hidden_pk_id_from_rowkey(&row_info->hidden_pk_id);
    if (err) {
      return err;
    }
  }

  // Publish the packed key; the Slice points into m_pk_packed_tuple.
  row_info->new_pk_slice =
      rocksdb::Slice((const char *)m_pk_packed_tuple, size);

  return HA_EXIT_SUCCESS;
}
9161 | |
// Lock the target PK value and determine whether a duplicate PK record
// already exists (*found). *pk_changed reports whether an UPDATE moved the
// row to a different PK value. Returns HA_EXIT_SUCCESS or an HA_ERR code.
int ha_rocksdb::check_and_lock_unique_pk(const uint &key_id,
                                         const struct update_row_info &row_info,
                                         bool *const found,
                                         bool *const pk_changed) {
  DBUG_ASSERT(found != nullptr);
  DBUG_ASSERT(pk_changed != nullptr);

  *pk_changed = false;

  /*
    For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs
    always require locking.
  */
  // A non-empty old_pk_slice means this is an UPDATE (an old row exists).
  if (row_info.old_pk_slice.size() > 0) {
    /*
      If the keys are the same, then no lock is needed
    */
    if (!Rdb_pk_comparator::bytewise_compare(row_info.new_pk_slice,
                                             row_info.old_pk_slice)) {
      *found = false;
      return HA_EXIT_SUCCESS;
    }

    *pk_changed = true;
  }

  /*
    Perform a read to determine if a duplicate entry exists. For primary
    keys, a point lookup will be sufficient.

    note: we intentionally don't set options.snapshot here. We want to read
    the latest committed data.
  */

  /*
    To prevent race conditions like below, it is necessary to
    take a lock for a target row. get_for_update() holds a gap lock if
    target key does not exist, so below conditions should never
    happen.

    1) T1 Get(empty) -> T2 Get(empty) -> T1 Put(insert) -> T1 commit
       -> T2 Put(overwrite) -> T2 commit
    2) T1 Get(empty) -> T1 Put(insert, not committed yet) -> T2 Get(empty)
       -> T2 Put(insert, blocked) -> T1 commit -> T2 commit(overwrite)
  */
  const rocksdb::Status s =
      get_for_update(row_info.tx, m_pk_descr->get_cf(), row_info.new_pk_slice,
                     &m_retrieved_record);
  // Any status other than OK / NotFound is a hard error.
  if (!s.ok() && !s.IsNotFound()) {
    return row_info.tx->set_status_error(
        table->in_use, s, *m_key_descr_arr[key_id], m_tbl_def, m_table_handler);
  }

  *found = !s.IsNotFound();
  return HA_EXIT_SUCCESS;
}
9218 | |
9219 | int ha_rocksdb::check_and_lock_sk(const uint &key_id, |
9220 | const struct update_row_info &row_info, |
9221 | bool *const found) { |
9222 | DBUG_ASSERT(found != nullptr); |
9223 | *found = false; |
9224 | |
9225 | /* |
9226 | Can skip checking this key if none of the key fields have changed. |
9227 | */ |
9228 | if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id)) { |
9229 | return HA_EXIT_SUCCESS; |
9230 | } |
9231 | |
9232 | KEY *key_info = nullptr; |
9233 | uint n_null_fields = 0; |
9234 | uint user_defined_key_parts = 1; |
9235 | |
9236 | key_info = &table->key_info[key_id]; |
9237 | user_defined_key_parts = key_info->user_defined_key_parts; |
9238 | /* |
9239 | If there are no uniqueness requirements, there's no need to obtain a |
9240 | lock for this key. |
9241 | */ |
9242 | if (!(key_info->flags & HA_NOSAME)) { |
9243 | return HA_EXIT_SUCCESS; |
9244 | } |
9245 | |
9246 | const Rdb_key_def &kd = *m_key_descr_arr[key_id]; |
9247 | |
9248 | /* |
9249 | Calculate the new key for obtaining the lock |
9250 | |
9251 | For unique secondary indexes, the key used for locking does not |
9252 | include the extended fields. |
9253 | */ |
9254 | int size = |
9255 | kd.pack_record(table, m_pack_buffer, row_info.new_data, m_sk_packed_tuple, |
9256 | nullptr, false, 0, user_defined_key_parts, &n_null_fields); |
9257 | if (n_null_fields > 0) { |
9258 | /* |
9259 | If any fields are marked as NULL this will never match another row as |
9260 | to NULL never matches anything else including another NULL. |
9261 | */ |
9262 | return HA_EXIT_SUCCESS; |
9263 | } |
9264 | |
9265 | const rocksdb::Slice new_slice = |
9266 | rocksdb::Slice((const char *)m_sk_packed_tuple, size); |
9267 | |
9268 | /* |
9269 | For UPDATEs, if the key has changed, we need to obtain a lock. INSERTs |
9270 | always require locking. |
9271 | */ |
9272 | if (row_info.old_data != nullptr) { |
9273 | size = kd.pack_record(table, m_pack_buffer, row_info.old_data, |
9274 | m_sk_packed_tuple_old, nullptr, false, 0, |
9275 | user_defined_key_parts); |
9276 | const rocksdb::Slice old_slice = |
9277 | rocksdb::Slice((const char *)m_sk_packed_tuple_old, size); |
9278 | |
9279 | /* |
9280 | For updates, if the keys are the same, then no lock is needed |
9281 | |
9282 | Also check to see if the key has any fields set to NULL. If it does, then |
9283 | this key is unique since NULL is not equal to each other, so no lock is |
9284 | needed. |
9285 | */ |
9286 | if (!Rdb_pk_comparator::bytewise_compare(new_slice, old_slice)) { |
9287 | return HA_EXIT_SUCCESS; |
9288 | } |
9289 | } |
9290 | |
9291 | /* |
9292 | Perform a read to determine if a duplicate entry exists - since this is |
9293 | a secondary indexes a range scan is needed. |
9294 | |
9295 | note: we intentionally don't set options.snapshot here. We want to read |
9296 | the latest committed data. |
9297 | */ |
9298 | |
9299 | const bool all_parts_used = (user_defined_key_parts == kd.get_key_parts()); |
9300 | |
9301 | /* |
9302 | This iterator seems expensive since we need to allocate and free |
9303 | memory for each unique index. |
9304 | |
9305 | If this needs to be optimized, for keys without NULL fields, the |
9306 | extended primary key fields can be migrated to the value portion of the |
9307 | key. This enables using Get() instead of Seek() as in the primary key |
9308 | case. |
9309 | |
9310 | The bloom filter may need to be disabled for this lookup. |
9311 | */ |
9312 | uchar min_bound_buf[MAX_KEY_LENGTH]; |
9313 | uchar max_bound_buf[MAX_KEY_LENGTH]; |
9314 | rocksdb::Slice min_bound_slice; |
9315 | rocksdb::Slice max_bound_slice; |
9316 | const bool total_order_seek = !check_bloom_and_set_bounds( |
9317 | ha_thd(), kd, new_slice, all_parts_used, |
9318 | min_bound_buf, |
9319 | max_bound_buf, |
9320 | &min_bound_slice, |
9321 | &max_bound_slice); |
9322 | const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache); |
9323 | |
9324 | const rocksdb::Status s = |
9325 | get_for_update(row_info.tx, kd.get_cf(), new_slice, nullptr); |
9326 | if (!s.ok() && !s.IsNotFound()) { |
9327 | return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def, |
9328 | m_table_handler); |
9329 | } |
9330 | |
9331 | rocksdb::Iterator *const iter = row_info.tx->get_iterator( |
9332 | kd.get_cf(), total_order_seek, fill_cache, |
9333 | min_bound_slice, max_bound_slice, |
9334 | true /* read current data */, |
9335 | false /* acquire snapshot */); |
9336 | /* |
9337 | Need to scan the transaction to see if there is a duplicate key. |
9338 | Also need to scan RocksDB and verify the key has not been deleted |
9339 | in the transaction. |
9340 | */ |
9341 | iter->Seek(new_slice); |
9342 | *found = !read_key_exact(kd, iter, all_parts_used, new_slice, |
9343 | row_info.tx->m_snapshot_timestamp); |
9344 | delete iter; |
9345 | |
9346 | return HA_EXIT_SUCCESS; |
9347 | } |
9348 | |
9349 | int ha_rocksdb::check_uniqueness_and_lock( |
9350 | const struct update_row_info &row_info, bool *const pk_changed) { |
9351 | /* |
9352 | Go through each index and determine if the index has uniqueness |
9353 | requirements. If it does, then try to obtain a row lock on the new values. |
9354 | Once all locks have been obtained, then perform the changes needed to |
9355 | update/insert the row. |
9356 | */ |
9357 | for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) { |
9358 | bool found; |
9359 | int rc; |
9360 | |
9361 | if (is_pk(key_id, table, m_tbl_def)) { |
9362 | rc = check_and_lock_unique_pk(key_id, row_info, &found, pk_changed); |
9363 | } else { |
9364 | rc = check_and_lock_sk(key_id, row_info, &found); |
9365 | } |
9366 | |
9367 | if (rc != HA_EXIT_SUCCESS) { |
9368 | return rc; |
9369 | } |
9370 | |
9371 | /* |
9372 | If the pk key has ttl, we may need to pretend the row wasn't |
9373 | found if it is already expired. The pk record is read into |
9374 | m_retrieved_record by check_and_lock_unique_pk(). |
9375 | */ |
9376 | if (is_pk(key_id, table, m_tbl_def) && found && m_pk_descr->has_ttl() && |
9377 | should_hide_ttl_rec(*m_pk_descr, m_retrieved_record, |
9378 | (row_info.tx->m_snapshot_timestamp |
9379 | ? row_info.tx->m_snapshot_timestamp |
9380 | : static_cast<int64_t>(std::time(nullptr))))) { |
9381 | found = false; |
9382 | } |
9383 | |
9384 | if (found) { |
9385 | /* There is a row with this key already, so error out. */ |
9386 | errkey = key_id; |
9387 | m_dupp_errkey = errkey; |
9388 | return HA_ERR_FOUND_DUPP_KEY; |
9389 | } |
9390 | } |
9391 | |
9392 | return HA_EXIT_SUCCESS; |
9393 | } |
9394 | |
9395 | int ha_rocksdb::check_duplicate_sk(const TABLE *table_arg, |
9396 | const Rdb_key_def &index, |
9397 | const rocksdb::Slice *key, |
9398 | struct unique_sk_buf_info *sk_info) { |
9399 | uint n_null_fields = 0; |
9400 | const rocksdb::Comparator *index_comp = index.get_cf()->GetComparator(); |
9401 | |
9402 | /* Get proper SK buffer. */ |
9403 | uchar *sk_buf = sk_info->swap_and_get_sk_buf(); |
9404 | |
9405 | /* Get memcmp form of sk without extended pk tail */ |
9406 | uint sk_memcmp_size = |
9407 | index.get_memcmp_sk_parts(table_arg, *key, sk_buf, &n_null_fields); |
9408 | |
9409 | sk_info->sk_memcmp_key = |
9410 | rocksdb::Slice(reinterpret_cast<char *>(sk_buf), sk_memcmp_size); |
9411 | |
9412 | if (sk_info->sk_memcmp_key_old.size() > 0 && n_null_fields == 0 && |
9413 | index_comp->Compare(sk_info->sk_memcmp_key, sk_info->sk_memcmp_key_old) == |
9414 | 0) { |
9415 | return 1; |
9416 | } |
9417 | |
9418 | sk_info->sk_memcmp_key_old = sk_info->sk_memcmp_key; |
9419 | return 0; |
9420 | } |
9421 | |
/**
  Add one key/value pair to an ongoing bulk load for index @a kd.

  Lazily (re)creates m_sst_info and registers the bulk load with the
  transaction whenever there is no active SST writer or the previous one
  was already committed.

  @param tx     transaction tracking the bulk load
  @param kd     key definition the pair belongs to
  @param key    packed key
  @param value  packed value
  @param sort   true: input is unsorted, so route the pair through the
                transaction's Rdb_index_merge to be sorted first;
                false: append the pair directly to the SST file

  @return HA_EXIT_SUCCESS or an error code.
*/
int ha_rocksdb::bulk_load_key(Rdb_transaction *const tx, const Rdb_key_def &kd,
                              const rocksdb::Slice &key,
                              const rocksdb::Slice &value, bool sort) {
  DBUG_ENTER_FUNC();
  int res;
  rocksdb::ColumnFamilyHandle *cf = kd.get_cf();

  // In the case of unsorted inserts, m_sst_info allocated here is not
  // used to store the keys. It is still used to indicate when tables
  // are switched.
  if (m_sst_info == nullptr || m_sst_info->is_committed()) {
    m_sst_info.reset(new Rdb_sst_info(rdb, m_table_handler->m_table_name,
                                      kd.get_name(), cf, *rocksdb_db_options,
                                      THDVAR(ha_thd(), trace_sst_api)));
    res = tx->start_bulk_load(this, m_sst_info);
    if (res != HA_EXIT_SUCCESS) {
      DBUG_RETURN(res);
    }
  }
  DBUG_ASSERT(m_sst_info);

  if (sort) {
    Rdb_index_merge *key_merge;
    DBUG_ASSERT(cf != nullptr);

    res = tx->get_key_merge(kd.get_gl_index_id(), cf, &key_merge);
    if (res == HA_EXIT_SUCCESS) {
      res = key_merge->add(key, value);
    }
  } else {
    res = m_sst_info->put(key, value);
  }

  DBUG_RETURN(res);
}
9457 | |
9458 | int ha_rocksdb::finalize_bulk_load(bool print_client_error) { |
9459 | DBUG_ENTER_FUNC(); |
9460 | |
9461 | int res = HA_EXIT_SUCCESS; |
9462 | |
9463 | /* Skip if there are no possible ongoing bulk loads */ |
9464 | if (m_sst_info) { |
9465 | res = m_sst_info->commit(print_client_error); |
9466 | m_sst_info.reset(); |
9467 | } |
9468 | DBUG_RETURN(res); |
9469 | } |
9470 | |
/**
  Write the primary-key entry for an INSERT or UPDATE.

  First deletes the old PK entry when necessary (key changed, or the
  single-delete optimization requires removing it even for an unchanged
  key), then converts the row to storage format and writes it through one
  of three paths: bulk load via SstFileWriter, an unchecked write-batch
  Put (skip-unique-check / DDL), or a locking transactional Put.

  @param kd          the primary key definition (callers pass *m_pk_descr)
  @param row_info    row images, packed PK slices and the transaction
  @param pk_changed  true if an UPDATE modified the primary key

  @return HA_EXIT_SUCCESS, HA_ERR_FOUND_DUPP_KEY, or another error code.
*/
int ha_rocksdb::update_pk(const Rdb_key_def &kd,
                          const struct update_row_info &row_info,
                          const bool &pk_changed) {
  const uint key_id = kd.get_keyno();
  const bool hidden_pk = is_hidden_pk(key_id, table, m_tbl_def);
  ulonglong bytes_written = 0;

  /*
    If the PK has changed, or if this PK uses single deletes and this is an
    update, the old key needs to be deleted. In the single delete case, it
    might be possible to have this sequence of keys: PUT(X), PUT(X), SD(X),
    resulting in the first PUT(X) showing up.
  */
  if (!hidden_pk && (pk_changed || ((row_info.old_pk_slice.size() > 0) &&
                                    can_use_single_delete(key_id)))) {
    const rocksdb::Status s = delete_or_singledelete(
        key_id, row_info.tx, kd.get_cf(), row_info.old_pk_slice);
    if (!s.ok()) {
      return row_info.tx->set_status_error(table->in_use, s, kd, m_tbl_def,
                                           m_table_handler);
    } else {
      bytes_written = row_info.old_pk_slice.size();
    }
  }

  /* Keep the table's auto-increment counter up to date. */
  if (table->next_number_field) {
    update_auto_incr_val_from_field();
  }

  int rc = HA_EXIT_SUCCESS;
  rocksdb::Slice value_slice;
  /* Prepare the new record to be written into RocksDB */
  if ((rc = convert_record_to_storage_format(row_info, &value_slice))) {
    return rc;
  }

  const auto cf = m_pk_descr->get_cf();
  if (rocksdb_enable_bulk_load_api && THDVAR(table->in_use, bulk_load) &&
      !hidden_pk) {
    /*
      Write the primary key directly to an SST file using an SstFileWriter
    */
    rc = bulk_load_key(row_info.tx, kd, row_info.new_pk_slice, value_slice,
                       THDVAR(table->in_use, bulk_load_allow_unsorted));
  } else if (row_info.skip_unique_check || row_info.tx->m_ddl_transaction) {
    /*
      It is responsibility of the user to make sure that the data being
      inserted doesn't violate any unique keys.
    */
    row_info.tx->get_indexed_write_batch()->Put(cf, row_info.new_pk_slice,
                                                value_slice);
  } else {
    const auto s = row_info.tx->put(cf, row_info.new_pk_slice, value_slice);
    if (!s.ok()) {
      /* IsBusy() means a duplicate PK value was hit during the Put. */
      if (s.IsBusy()) {
        errkey = table->s->primary_key;
        m_dupp_errkey = errkey;
        rc = HA_ERR_FOUND_DUPP_KEY;
      } else {
        rc = row_info.tx->set_status_error(table->in_use, s, *m_pk_descr,
                                           m_tbl_def, m_table_handler);
      }
    }
  }

  if (rc == HA_EXIT_SUCCESS) {
    row_info.tx->update_bytes_written(
        bytes_written + row_info.new_pk_slice.size() + value_slice.size());
  }
  return rc;
}
9542 | |
/**
  Update secondary index @a kd for the row described by @a row_info:
  SingleDelete() the old index entry (when the row existed and the entry
  actually changed) and Put() the new one.

  Secondary-index writes go through the indexed write batch — no locking —
  because uniqueness was already checked and locked by check_and_lock_sk().

  @param table_arg  table the index belongs to
  @param kd         secondary index definition
  @param row_info   old/new row images and the active transaction

  @return HA_EXIT_SUCCESS (this path reports no errors of its own).
*/
int ha_rocksdb::update_sk(const TABLE *const table_arg, const Rdb_key_def &kd,
                          const struct update_row_info &row_info) {
  int new_packed_size;
  int old_packed_size;

  rocksdb::Slice new_key_slice;
  rocksdb::Slice new_value_slice;
  rocksdb::Slice old_key_slice;

  const uint key_id = kd.get_keyno();

  ulonglong bytes_written = 0;

  /*
    Can skip updating this key if none of the key fields have changed and, if
    this table has TTL, the TTL timestamp has not changed.
  */
  if (row_info.old_data != nullptr && !m_update_scope.is_set(key_id) &&
      (!kd.has_ttl() || !m_ttl_bytes_updated)) {
    return HA_EXIT_SUCCESS;
  }

  const bool store_row_debug_checksums = should_store_row_debug_checksums();

  /* Pack the new index entry; unpack info/checksums land in m_sk_tails. */
  new_packed_size =
      kd.pack_record(table_arg, m_pack_buffer, row_info.new_data,
                     m_sk_packed_tuple, &m_sk_tails, store_row_debug_checksums,
                     row_info.hidden_pk_id, 0, nullptr, nullptr, m_ttl_bytes);

  if (row_info.old_data != nullptr) {
    // The old value
    old_packed_size = kd.pack_record(
        table_arg, m_pack_buffer, row_info.old_data, m_sk_packed_tuple_old,
        &m_sk_tails_old, store_row_debug_checksums, row_info.hidden_pk_id, 0,
        nullptr, nullptr, m_ttl_bytes);

    /*
      Check if we are going to write the same value. This can happen when
      one does
        UPDATE tbl SET col='foo'
      and we are looking at the row that already has col='foo'.

      We also need to compare the unpack info. Suppose, the collation is
      case-insensitive, and unpack info contains information about whether
      the letters were uppercase and lowercase. Then, both 'foo' and 'FOO'
      will have the same key value, but different data in unpack_info.

      (note: anyone changing bytewise_compare should take this code into
      account)
    */
    if (old_packed_size == new_packed_size &&
        m_sk_tails_old.get_current_pos() == m_sk_tails.get_current_pos() &&
        !(kd.has_ttl() && m_ttl_bytes_updated) &&
        memcmp(m_sk_packed_tuple_old, m_sk_packed_tuple, old_packed_size) ==
            0 &&
        memcmp(m_sk_tails_old.ptr(), m_sk_tails.ptr(),
               m_sk_tails.get_current_pos()) == 0) {
      return HA_EXIT_SUCCESS;
    }

    /*
      Deleting entries from secondary index should skip locking, but
      be visible to the transaction.
      (also note that DDL statements do not delete rows, so this is not a DDL
      statement)
    */
    old_key_slice = rocksdb::Slice(
        reinterpret_cast<const char *>(m_sk_packed_tuple_old), old_packed_size);

    row_info.tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(),
                                                         old_key_slice);

    bytes_written = old_key_slice.size();
  }

  new_key_slice = rocksdb::Slice(
      reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size);
  new_value_slice =
      rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()),
                     m_sk_tails.get_current_pos());

  row_info.tx->get_indexed_write_batch()->Put(kd.get_cf(), new_key_slice,
                                              new_value_slice);

  row_info.tx->update_bytes_written(bytes_written + new_key_slice.size() +
                                    new_value_slice.size());

  return HA_EXIT_SUCCESS;
}
9632 | |
9633 | int ha_rocksdb::update_indexes(const struct update_row_info &row_info, |
9634 | const bool &pk_changed) { |
9635 | int rc; |
9636 | |
9637 | // The PK must be updated first to pull out the TTL value. |
9638 | rc = update_pk(*m_pk_descr, row_info, pk_changed); |
9639 | if (rc != HA_EXIT_SUCCESS) { |
9640 | return rc; |
9641 | } |
9642 | |
9643 | // Update the remaining indexes. |
9644 | for (uint key_id = 0; key_id < m_tbl_def->m_key_count; key_id++) { |
9645 | if (is_pk(key_id, table, m_tbl_def)) { |
9646 | continue; |
9647 | } |
9648 | |
9649 | rc = update_sk(table, *m_key_descr_arr[key_id], row_info); |
9650 | if (rc != HA_EXIT_SUCCESS) { |
9651 | return rc; |
9652 | } |
9653 | } |
9654 | |
9655 | return HA_EXIT_SUCCESS; |
9656 | } |
9657 | |
9658 | int ha_rocksdb::update_write_row(const uchar *const old_data, |
9659 | const uchar *const new_data, |
9660 | const bool skip_unique_check) { |
9661 | DBUG_ENTER_FUNC(); |
9662 | |
9663 | bool pk_changed = false; |
9664 | struct update_row_info row_info; |
9665 | |
9666 | row_info.old_data = old_data; |
9667 | row_info.new_data = new_data; |
9668 | row_info.skip_unique_check = skip_unique_check; |
9669 | row_info.new_pk_unpack_info = nullptr; |
9670 | |
9671 | set_last_rowkey(old_data); |
9672 | |
9673 | row_info.tx = get_or_create_tx(table->in_use); |
9674 | |
9675 | if (old_data != nullptr) { |
9676 | row_info.old_pk_slice = |
9677 | rocksdb::Slice(m_last_rowkey.ptr(), m_last_rowkey.length()); |
9678 | |
9679 | /* Determine which indexes need updating. */ |
9680 | calc_updated_indexes(); |
9681 | } |
9682 | |
9683 | /* |
9684 | Get the new row key into row_info.new_pk_slice |
9685 | */ |
9686 | int rc = get_pk_for_update(&row_info); |
9687 | if (rc != HA_EXIT_SUCCESS) { |
9688 | DBUG_RETURN(rc); |
9689 | } |
9690 | |
9691 | if (!skip_unique_check) { |
9692 | /* |
9693 | Check to see if we are going to have failures because of unique |
9694 | keys. Also lock the appropriate key values. |
9695 | */ |
9696 | rc = check_uniqueness_and_lock(row_info, &pk_changed); |
9697 | if (rc != HA_EXIT_SUCCESS) { |
9698 | DBUG_RETURN(rc); |
9699 | } |
9700 | } |
9701 | |
9702 | DEBUG_SYNC(ha_thd(), "rocksdb.update_write_row_after_unique_check" ); |
9703 | |
9704 | /* |
9705 | At this point, all locks have been obtained, and all checks for duplicate |
9706 | keys have been performed. No further errors can be allowed to occur from |
9707 | here because updates to the transaction will be made and those updates |
9708 | cannot be easily removed without rolling back the entire transaction. |
9709 | */ |
9710 | rc = update_indexes(row_info, pk_changed); |
9711 | if (rc != HA_EXIT_SUCCESS) { |
9712 | DBUG_RETURN(rc); |
9713 | } |
9714 | |
9715 | if (old_data != nullptr) { |
9716 | row_info.tx->incr_update_count(); |
9717 | } else { |
9718 | row_info.tx->incr_insert_count(); |
9719 | } |
9720 | |
9721 | if (do_bulk_commit(row_info.tx)) { |
9722 | DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD); |
9723 | } |
9724 | |
9725 | DBUG_RETURN(HA_EXIT_SUCCESS); |
9726 | } |
9727 | |
9728 | /* |
9729 | Setting iterator upper/lower bounds for Seek/SeekForPrev. |
9730 | This makes RocksDB to avoid scanning tombstones outside of |
9731 | the given key ranges, when prefix_same_as_start=true was not passed |
9732 | (when prefix bloom filter can not be used). |
9733 | Inversing upper/lower bound is necessary on reverse order CF. |
9734 | This covers HA_READ_PREFIX_LAST* case as well. For example, |
9735 | if given query eq condition was 12 bytes and condition was |
9736 | 0x0000b3eb003f65c5e78858b8, and if doing HA_READ_PREFIX_LAST, |
9737 | eq_cond_len was 11 (see calc_eq_cond_len() for details). |
9738 | If the index was reverse order, upper bound would be |
9739 | 0x0000b3eb003f65c5e78857, and lower bound would be |
9740 | 0x0000b3eb003f65c5e78859. These cover given eq condition range. |
9741 | |
  @param lower_bound_buf IN  Buffer backing the lower bound key
  @param upper_bound_buf IN  Buffer backing the upper bound key
  @param out_lower_bound OUT Lower bound slice (points into one of the buffers)
  @param out_upper_bound OUT Upper bound slice (points into one of the buffers)
9746 | */ |
9747 | void ha_rocksdb::setup_iterator_bounds(const Rdb_key_def &kd, |
9748 | const rocksdb::Slice &eq_cond, |
9749 | uchar *lower_bound_buf, |
9750 | uchar *upper_bound_buf, |
9751 | rocksdb::Slice *out_lower_bound, |
9752 | rocksdb::Slice *out_upper_bound) { |
9753 | uint eq_cond_len = eq_cond.size(); |
9754 | memcpy(upper_bound_buf, eq_cond.data(), eq_cond_len); |
9755 | kd.successor(upper_bound_buf, eq_cond_len); |
9756 | memcpy(lower_bound_buf, eq_cond.data(), eq_cond_len); |
9757 | kd.predecessor(lower_bound_buf, eq_cond_len); |
9758 | |
9759 | if (kd.m_is_reverse_cf) { |
9760 | *out_upper_bound = |
9761 | rocksdb::Slice((const char *)lower_bound_buf, eq_cond_len); |
9762 | *out_lower_bound = |
9763 | rocksdb::Slice((const char *)upper_bound_buf, eq_cond_len); |
9764 | } else { |
9765 | *out_upper_bound = |
9766 | rocksdb::Slice((const char *)upper_bound_buf, eq_cond_len); |
9767 | *out_lower_bound = |
9768 | rocksdb::Slice((const char *)lower_bound_buf, eq_cond_len); |
9769 | } |
9770 | } |
9771 | |
9772 | /* |
9773 | Open a cursor |
9774 | */ |
9775 | |
/**
  (Re)create m_scan_it, the iterator used for index/table scans, choosing
  whether the prefix bloom filter and iterator bounds can be used for the
  given equality-condition prefix.

  @param slice        full lookup key; its first eq_cond_len bytes form
                      the equality condition
  @param use_all_keys true when the whole key is an equality condition
  @param eq_cond_len  length of the equality-condition prefix
*/
void ha_rocksdb::setup_scan_iterator(const Rdb_key_def &kd,
                                     rocksdb::Slice *const slice,
                                     const bool use_all_keys,
                                     const uint eq_cond_len) {
  DBUG_ASSERT(slice != nullptr);
  DBUG_ASSERT(slice->size() >= eq_cond_len);

  Rdb_transaction *const tx = get_or_create_tx(table->in_use);

  bool skip_bloom = true;

  const rocksdb::Slice eq_cond(slice->data(), eq_cond_len);
  /* Bounds are stored in members so they outlive the iterator. */
  if (check_bloom_and_set_bounds(ha_thd(), kd, eq_cond, use_all_keys,
                                 m_eq_cond_lower_bound,
                                 m_eq_cond_upper_bound,
                                 &m_eq_cond_lower_bound_slice,
                                 &m_eq_cond_upper_bound_slice)) {
    skip_bloom = false;
  }

  /*
    In some cases, setup_scan_iterator() is called multiple times from
    the same query but bloom filter can not always be used.
    Suppose the following query example. id2 is VARCHAR(30) and PRIMARY KEY
    (id1, id2).
    select count(*) from t2 WHERE id1=100 and id2 IN ('00000000000000000000',
    '100');
    In this case, setup_scan_iterator() is called twice, the first time is for
    (id1, id2)=(100, '00000000000000000000') and the second time is for (100,
    '100').
    If prefix bloom filter length is 24 bytes, prefix bloom filter can be used
    for the
    first condition but not for the second condition.
    If bloom filter condition is changed, currently it is necessary to destroy
    and
    re-create Iterator.
  */
  if (m_scan_it_skips_bloom != skip_bloom) {
    release_scan_iterator();
  }

  /*
    SQL layer can call rnd_init() multiple times in a row.
    In that case, re-use the iterator, but re-position it at the table start.
  */
  if (!m_scan_it) {
    const bool fill_cache = !THDVAR(ha_thd(), skip_fill_cache);
    if (commit_in_the_middle()) {
      /* With commit-in-the-middle, pin our own snapshot for the scan. */
      DBUG_ASSERT(m_scan_it_snapshot == nullptr);
      m_scan_it_snapshot = rdb->GetSnapshot();

      auto read_opts = rocksdb::ReadOptions();
      read_opts.total_order_seek = true;  // TODO: set based on WHERE conditions
      read_opts.snapshot = m_scan_it_snapshot;
      m_scan_it = rdb->NewIterator(read_opts, kd.get_cf());
    } else {
      m_scan_it = tx->get_iterator(kd.get_cf(), skip_bloom, fill_cache,
                                   m_eq_cond_lower_bound_slice,
                                   m_eq_cond_upper_bound_slice);
    }
    m_scan_it_skips_bloom = skip_bloom;
  }
}
9839 | |
9840 | void ha_rocksdb::release_scan_iterator() { |
9841 | delete m_scan_it; |
9842 | m_scan_it = nullptr; |
9843 | |
9844 | if (m_scan_it_snapshot) { |
9845 | rdb->ReleaseSnapshot(m_scan_it_snapshot); |
9846 | m_scan_it_snapshot = nullptr; |
9847 | } |
9848 | } |
9849 | |
9850 | void ha_rocksdb::setup_iterator_for_rnd_scan() { |
9851 | uint key_size; |
9852 | |
9853 | int key_start_matching_bytes = m_pk_descr->get_first_key(m_pk_packed_tuple, &key_size); |
9854 | |
9855 | rocksdb::Slice table_key((const char *)m_pk_packed_tuple, key_size); |
9856 | |
9857 | setup_scan_iterator(*m_pk_descr, &table_key, false, |
9858 | key_start_matching_bytes); |
9859 | m_scan_it->Seek(table_key); |
9860 | m_skip_scan_it_next_call = true; |
9861 | } |
9862 | |
9863 | /** |
9864 | @return |
9865 | HA_EXIT_SUCCESS OK |
9866 | other HA_ERR error code (can be SE-specific) |
9867 | */ |
9868 | int ha_rocksdb::rnd_init(bool scan) { |
9869 | DBUG_ENTER_FUNC(); |
9870 | |
9871 | Rdb_transaction *const tx = get_or_create_tx(table->in_use); |
9872 | setup_read_decoders(); |
9873 | |
9874 | if (scan) { |
9875 | m_rnd_scan_is_new_snapshot = !tx->has_snapshot(); |
9876 | setup_iterator_for_rnd_scan(); |
9877 | } else { |
9878 | /* We don't need any preparations for rnd_pos() calls. */ |
9879 | } |
9880 | |
9881 | // If m_lock_rows is on then we will be doing a get_for_update when accessing |
9882 | // the index, so don't acquire the snapshot right away. Otherwise acquire |
9883 | // the snapshot immediately. |
9884 | tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE); |
9885 | |
9886 | DBUG_RETURN(HA_EXIT_SUCCESS); |
9887 | } |
9888 | |
9889 | /** |
9890 | @return |
9891 | HA_EXIT_SUCCESS OK |
9892 | other HA_ERR error code (can be SE-specific) |
9893 | */ |
9894 | int ha_rocksdb::rnd_next(uchar *const buf) { |
9895 | DBUG_ENTER_FUNC(); |
9896 | |
9897 | int rc; |
9898 | for (;;) { |
9899 | rc = rnd_next_with_direction(buf, true); |
9900 | if (rc != HA_ERR_LOCK_DEADLOCK || !m_rnd_scan_is_new_snapshot) |
9901 | break; /* exit the loop */ |
9902 | |
9903 | // release the snapshot and iterator and then regenerate them |
9904 | Rdb_transaction *tx = get_or_create_tx(table->in_use); |
9905 | tx->release_snapshot(); |
9906 | release_scan_iterator(); |
9907 | setup_iterator_for_rnd_scan(); |
9908 | } |
9909 | |
9910 | m_rnd_scan_is_new_snapshot = false; |
9911 | |
9912 | if (rc == HA_ERR_KEY_NOT_FOUND) |
9913 | rc = HA_ERR_END_OF_FILE; |
9914 | |
9915 | DBUG_RETURN(rc); |
9916 | } |
9917 | |
9918 | /* |
9919 | See also secondary_index_read(). |
9920 | */ |
/**
  Advance the table-scan iterator one row forward or backward and read
  that row into @a buf.

  TTL-expired rows are skipped. When m_lock_rows is set, each row is
  locked via get_for_update() and the value from that call is used
  instead of the (possibly stale) iterator value.

  @param buf           out: record in table->record[] format
  @param move_forward  true for Next(), false for Prev()

  @return HA_EXIT_SUCCESS, HA_ERR_END_OF_FILE, or another error code.
*/
int ha_rocksdb::rnd_next_with_direction(uchar *const buf, bool move_forward) {
  DBUG_ENTER_FUNC();

  int rc;

  table->status = STATUS_NOT_FOUND;
#ifdef MARIAROCKS_NOT_YET
  stats.rows_requested++;
#endif
  if (!m_scan_it || !is_valid(m_scan_it)) {
    /*
      We can get here when SQL layer has called

        h->index_init(PRIMARY);
        h->index_read_map(full index tuple, HA_READ_KEY_EXACT);

      In this case, we should return EOF.
    */
    DBUG_RETURN(HA_ERR_END_OF_FILE);
  }

  for (;;) {
    /* The iterator is already on the first row right after a Seek(). */
    if (m_skip_scan_it_next_call) {
      m_skip_scan_it_next_call = false;
    } else {
      if (move_forward)
        m_scan_it->Next(); /* this call cannot fail */
      else
        m_scan_it->Prev(); /* this call cannot fail */
    }

    if (!is_valid(m_scan_it)) {
      rc = HA_ERR_END_OF_FILE;
      break;
    }

    /* check if we're out of this table */
    const rocksdb::Slice key = m_scan_it->key();
    if (!m_pk_descr->covers_key(key)) {
      rc = HA_ERR_END_OF_FILE;
      break;
    }

    if (m_lock_rows != RDB_LOCK_NONE) {
      /*
        Lock the row we've just read.

        Now we call get_for_update which will 1) Take a lock and 2) Will fail
        if the row was deleted since the snapshot was taken.
      */
      Rdb_transaction *const tx = get_or_create_tx(table->in_use);
      DEBUG_SYNC(ha_thd(), "rocksdb_concurrent_delete" );

      /* Skip rows whose TTL already expired. */
      if (m_pk_descr->has_ttl() &&
          should_hide_ttl_rec(*m_pk_descr, m_scan_it->value(),
                              tx->m_snapshot_timestamp)) {
        continue;
      }

      const rocksdb::Status s =
          get_for_update(tx, m_pk_descr->get_cf(), key, &m_retrieved_record);
      if (s.IsNotFound() &&
          my_core::thd_tx_isolation(ha_thd()) == ISO_READ_COMMITTED) {
        // This occurs if we accessed a row, tried to lock it, failed,
        // released and reacquired the snapshot (because of READ COMMITTED
        // mode) and the row was deleted by someone else in the meantime.
        // If so, we just want to move on to the next row.
        continue;
      }

      if (!s.ok()) {
        DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr,
                                         m_tbl_def, m_table_handler));
      }

      // If we called get_for_update() use the value from that call not from
      // the iterator as it may be stale since we don't have a snapshot
      // when m_lock_rows is not RDB_LOCK_NONE.
      m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
      rc = convert_record_from_storage_format(&key, buf);
    } else {
      // Use the value from the iterator
      rocksdb::Slice value = m_scan_it->value();

      /* Skip rows whose TTL already expired. */
      if (m_pk_descr->has_ttl() &&
          should_hide_ttl_rec(
              *m_pk_descr, value,
              get_or_create_tx(table->in_use)->m_snapshot_timestamp)) {
        continue;
      }

      m_last_rowkey.copy(key.data(), key.size(), &my_charset_bin);
      rc = convert_record_from_storage_format(&key, &value, buf);
    }

    table->status = 0;
    break;
  }

  if (!rc) {
#ifdef MARIAROCKS_NOT_YET
    stats.rows_read++;
    stats.rows_index_next++;
#endif
    update_row_stats(ROWS_READ);
  }

  DBUG_RETURN(rc);
}
10030 | |
10031 | int ha_rocksdb::rnd_end() { |
10032 | DBUG_ENTER_FUNC(); |
10033 | |
10034 | release_scan_iterator(); |
10035 | |
10036 | DBUG_RETURN(HA_EXIT_SUCCESS); |
10037 | } |
10038 | |
10039 | /** |
10040 | @return |
10041 | HA_EXIT_SUCCESS OK |
10042 | other HA_ERR error code (can be SE-specific) |
10043 | */ |
10044 | int ha_rocksdb::index_init(uint idx, bool sorted) { |
10045 | DBUG_ENTER_FUNC(); |
10046 | |
10047 | Rdb_transaction *const tx = get_or_create_tx(table->in_use); |
10048 | DBUG_ASSERT(tx != nullptr); |
10049 | |
10050 | setup_read_decoders(); |
10051 | m_start_range= NULL; |
10052 | |
10053 | if (!m_keyread_only) { |
10054 | m_key_descr_arr[idx]->get_lookup_bitmap(table, &m_lookup_bitmap); |
10055 | } |
10056 | |
10057 | // If m_lock_rows is not RDB_LOCK_NONE then we will be doing a get_for_update |
10058 | // when accessing the index, so don't acquire the snapshot right away. |
10059 | // Otherwise acquire the snapshot immediately. |
10060 | tx->acquire_snapshot(m_lock_rows == RDB_LOCK_NONE); |
10061 | |
10062 | active_index = idx; |
10063 | |
10064 | DBUG_RETURN(HA_EXIT_SUCCESS); |
10065 | } |
10066 | |
10067 | /** |
10068 | @return |
10069 | HA_EXIT_SUCCESS OK |
10070 | */ |
10071 | int ha_rocksdb::index_end() { |
10072 | DBUG_ENTER_FUNC(); |
10073 | |
10074 | release_scan_iterator(); |
10075 | |
10076 | bitmap_free(&m_lookup_bitmap); |
10077 | |
10078 | active_index = MAX_KEY; |
10079 | in_range_check_pushed_down = FALSE; |
10080 | m_start_range= NULL; |
10081 | |
10082 | DBUG_RETURN(HA_EXIT_SUCCESS); |
10083 | } |
10084 | |
10085 | /** |
10086 | @return |
10087 | HA_EXIT_SUCCESS OK |
10088 | other HA_ERR error code (can be SE-specific) |
10089 | */ |
10090 | int ha_rocksdb::truncate() { |
10091 | DBUG_ENTER_FUNC(); |
10092 | |
10093 | DBUG_ASSERT(m_tbl_def != nullptr); |
10094 | |
10095 | m_tbl_def->m_auto_incr_val = 1; |
10096 | m_tbl_def->m_hidden_pk_val = 1; |
10097 | |
10098 | DBUG_RETURN(remove_rows(m_tbl_def)); |
10099 | } |
10100 | |
10101 | /* |
10102 | Delete the row we've last read. The row is also passed as parameter. |
10103 | |
10104 | @detail |
10105 | The caller guarantees table buf points to the row that was just read. |
10106 | The row is either table->record[0] or table->record[1]. |
10107 | (Check out InnoDB: row_update_for_mysql() has "UT_NOT_USED(mysql_rec)" |
10108 | |
10109 | @return |
10110 | HA_EXIT_SUCCESS OK |
10111 | other HA_ERR error code (can be SE-specific) |
10112 | */ |
10113 | int ha_rocksdb::delete_row(const uchar *const buf) { |
10114 | DBUG_ENTER_FUNC(); |
10115 | |
10116 | DBUG_ASSERT(buf != nullptr); |
10117 | |
10118 | set_last_rowkey(buf); |
10119 | |
10120 | rocksdb::Slice key_slice(m_last_rowkey.ptr(), m_last_rowkey.length()); |
10121 | Rdb_transaction *const tx = get_or_create_tx(table->in_use); |
10122 | ulonglong bytes_written = 0; |
10123 | |
10124 | const uint index = pk_index(table, m_tbl_def); |
10125 | rocksdb::Status s = |
10126 | delete_or_singledelete(index, tx, m_pk_descr->get_cf(), key_slice); |
10127 | if (!s.ok()) { |
10128 | DBUG_RETURN(tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def, |
10129 | m_table_handler)); |
10130 | } else { |
10131 | bytes_written = key_slice.size(); |
10132 | } |
10133 | |
10134 | longlong hidden_pk_id = 0; |
10135 | if (m_tbl_def->m_key_count > 1 && has_hidden_pk(table)) { |
10136 | int err = read_hidden_pk_id_from_rowkey(&hidden_pk_id); |
10137 | if (err) { |
10138 | DBUG_RETURN(err); |
10139 | } |
10140 | } |
10141 | |
10142 | // Delete the record for every secondary index |
10143 | for (uint i = 0; i < m_tbl_def->m_key_count; i++) { |
10144 | if (!is_pk(i, table, m_tbl_def)) { |
10145 | int packed_size; |
10146 | const Rdb_key_def &kd = *m_key_descr_arr[i]; |
10147 | packed_size = kd.pack_record(table, m_pack_buffer, buf, m_sk_packed_tuple, |
10148 | nullptr, false, hidden_pk_id); |
10149 | rocksdb::Slice secondary_key_slice( |
10150 | reinterpret_cast<const char *>(m_sk_packed_tuple), packed_size); |
10151 | /* Deleting on secondary key doesn't need any locks: */ |
10152 | tx->get_indexed_write_batch()->SingleDelete(kd.get_cf(), |
10153 | secondary_key_slice); |
10154 | bytes_written += secondary_key_slice.size(); |
10155 | } |
10156 | } |
10157 | |
10158 | tx->incr_delete_count(); |
10159 | |
10160 | if (do_bulk_commit(tx)) { |
10161 | DBUG_RETURN(HA_ERR_ROCKSDB_BULK_LOAD); |
10162 | } |
10163 | #ifdef MARIAROCKS_NOT_YET |
10164 | stats.rows_deleted++; |
10165 | #endif |
10166 | update_row_stats(ROWS_DELETED); |
10167 | tx->update_bytes_written(bytes_written); |
10168 | |
10169 | DBUG_RETURN(HA_EXIT_SUCCESS); |
10170 | } |
10171 | |
10172 | rocksdb::Status ha_rocksdb::delete_or_singledelete( |
10173 | uint index, Rdb_transaction *const tx, |
10174 | rocksdb::ColumnFamilyHandle *const column_family, |
10175 | const rocksdb::Slice &key) { |
10176 | if (can_use_single_delete(index)) |
10177 | return tx->single_delete(column_family, key); |
10178 | return tx->delete_key(column_family, key); |
10179 | } |
10180 | |
10181 | void ha_rocksdb::update_stats(void) { |
10182 | DBUG_ENTER_FUNC(); |
10183 | |
10184 | stats.records = 0; |
10185 | stats.index_file_length = 0ul; |
10186 | stats.data_file_length = 0ul; |
10187 | stats.mean_rec_length = 0; |
10188 | |
10189 | for (uint i = 0; i < m_tbl_def->m_key_count; i++) { |
10190 | if (is_pk(i, table, m_tbl_def)) { |
10191 | stats.data_file_length = m_pk_descr->m_stats.m_actual_disk_size; |
10192 | stats.records = m_pk_descr->m_stats.m_rows; |
10193 | } else { |
10194 | stats.index_file_length += m_key_descr_arr[i]->m_stats.m_actual_disk_size; |
10195 | } |
10196 | } |
10197 | |
10198 | DBUG_VOID_RETURN; |
10199 | } |
10200 | |
10201 | /** |
10202 | @return |
10203 | HA_EXIT_SUCCESS OK |
10204 | HA_EXIT_FAILURE Error |
10205 | */ |
10206 | int ha_rocksdb::info(uint flag) { |
10207 | DBUG_ENTER_FUNC(); |
10208 | |
10209 | if (!table) |
10210 | DBUG_RETURN(HA_EXIT_FAILURE); |
10211 | |
10212 | if (flag & HA_STATUS_VARIABLE) { |
10213 | /* |
10214 | Test only to simulate corrupted stats |
10215 | */ |
10216 | DBUG_EXECUTE_IF("myrocks_simulate_negative_stats" , |
10217 | m_pk_descr->m_stats.m_actual_disk_size = |
10218 | -m_pk_descr->m_stats.m_actual_disk_size;); |
10219 | |
10220 | update_stats(); |
10221 | |
10222 | /* |
10223 | If any stats are negative due to bad cached stats, re-run analyze table |
10224 | and re-retrieve the stats. |
10225 | */ |
10226 | if (static_cast<longlong>(stats.data_file_length) < 0 || |
10227 | static_cast<longlong>(stats.index_file_length) < 0 || |
10228 | static_cast<longlong>(stats.records) < 0) { |
10229 | if (analyze(nullptr, nullptr)) { |
10230 | DBUG_RETURN(HA_EXIT_FAILURE); |
10231 | } |
10232 | |
10233 | update_stats(); |
10234 | } |
10235 | |
10236 | // if number of records is hardcoded, we do not want to force computation |
10237 | // of memtable cardinalities |
10238 | if (stats.records == 0 || |
10239 | (rocksdb_force_compute_memtable_stats && |
10240 | rocksdb_debug_optimizer_n_rows == 0)) |
10241 | { |
10242 | // First, compute SST files stats |
10243 | uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; |
10244 | auto r = get_range(pk_index(table, m_tbl_def), buf); |
10245 | uint64_t sz = 0; |
10246 | uint8_t include_flags = rocksdb::DB::INCLUDE_FILES; |
10247 | // recompute SST files stats only if records count is 0 |
10248 | if (stats.records == 0) { |
10249 | rdb->GetApproximateSizes(m_pk_descr->get_cf(), &r, 1, &sz, |
10250 | include_flags); |
10251 | stats.records+= sz/ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE; |
10252 | stats.data_file_length+= sz; |
10253 | } |
10254 | // Second, compute memtable stats. This call is expensive, so cache |
10255 | // values computed for some time. |
10256 | uint64_t cachetime = rocksdb_force_compute_memtable_stats_cachetime; |
10257 | uint64_t time = (cachetime == 0) ? 0 : my_interval_timer() / 1000; |
10258 | if (cachetime == 0 || |
10259 | time > m_table_handler->m_mtcache_last_update + cachetime) { |
10260 | uint64_t memtableCount; |
10261 | uint64_t memtableSize; |
10262 | |
10263 | // the stats below are calculated from skiplist wich is a probablistic |
10264 | // data structure, so the results vary between test runs |
10265 | // it also can return 0 for quite a large tables which means that |
10266 | // cardinality for memtable only indxes will be reported as 0 |
10267 | rdb->GetApproximateMemTableStats(m_pk_descr->get_cf(), r, |
10268 | &memtableCount, &memtableSize); |
10269 | |
10270 | // Atomically update all of these fields at the same time |
10271 | if (cachetime > 0) { |
10272 | if (m_table_handler->m_mtcache_lock.fetch_add( |
10273 | 1, std::memory_order_acquire) == 0) { |
10274 | m_table_handler->m_mtcache_count = memtableCount; |
10275 | m_table_handler->m_mtcache_size = memtableSize; |
10276 | m_table_handler->m_mtcache_last_update = time; |
10277 | } |
10278 | m_table_handler->m_mtcache_lock.fetch_sub(1, |
10279 | std::memory_order_release); |
10280 | } |
10281 | |
10282 | stats.records += memtableCount; |
10283 | stats.data_file_length += memtableSize; |
10284 | } else { |
10285 | // Cached data is still valid, so use it instead |
10286 | stats.records += m_table_handler->m_mtcache_count; |
10287 | stats.data_file_length += m_table_handler->m_mtcache_size; |
10288 | } |
10289 | |
10290 | if (rocksdb_debug_optimizer_n_rows > 0) |
10291 | stats.records = rocksdb_debug_optimizer_n_rows; |
10292 | } |
10293 | |
10294 | if (stats.records != 0) |
10295 | stats.mean_rec_length = stats.data_file_length / stats.records; |
10296 | } |
10297 | if (flag & HA_STATUS_CONST) { |
10298 | ref_length = m_pk_descr->max_storage_fmt_length(); |
10299 | |
10300 | for (uint i = 0; i < m_tbl_def->m_key_count; i++) { |
10301 | if (is_hidden_pk(i, table, m_tbl_def)) { |
10302 | continue; |
10303 | } |
10304 | KEY *const k = &table->key_info[i]; |
10305 | for (uint j = 0; j < k->ext_key_parts; j++) { |
10306 | const Rdb_index_stats &k_stats = m_key_descr_arr[i]->m_stats; |
10307 | uint x = k_stats.m_distinct_keys_per_prefix.size() > j && |
10308 | k_stats.m_distinct_keys_per_prefix[j] > 0 |
10309 | ? k_stats.m_rows / k_stats.m_distinct_keys_per_prefix[j] |
10310 | : 0; |
10311 | if (x > stats.records) |
10312 | x = stats.records; |
10313 | if ((x == 0 && rocksdb_debug_optimizer_no_zero_cardinality) || |
10314 | rocksdb_debug_optimizer_n_rows > 0) { |
10315 | // Fake cardinality implementation. For example, (idx1, idx2, idx3) |
10316 | // index |
10317 | /* |
10318 | Make MariaRocks behave the same way as MyRocks does: |
10319 | 1. SQL layer thinks that unique secondary indexes are not extended |
10320 | with PK columns (both in MySQL and MariaDB) |
10321 | 2. MariaDB also thinks that indexes with partially-covered columns |
10322 | are not extended with PK columns. Use the same number of |
10323 | keyparts that MyRocks would use. |
10324 | */ |
10325 | uint ext_key_parts2; |
10326 | if (k->flags & HA_NOSAME) |
10327 | ext_key_parts2= k->ext_key_parts; // This is #1 |
10328 | else |
10329 | ext_key_parts2= m_key_descr_arr[i]->get_key_parts(); // This is #2. |
10330 | |
10331 | // will have rec_per_key for (idx1)=4, (idx1,2)=2, and (idx1,2,3)=1. |
10332 | // rec_per_key for the whole index is 1, and multiplied by 2^n if |
10333 | // n suffix columns of the index are not used. |
10334 | x = 1 << (ext_key_parts2 - j - 1); |
10335 | } |
10336 | k->rec_per_key[j] = x; |
10337 | } |
10338 | } |
10339 | } |
10340 | |
10341 | if (flag & HA_STATUS_ERRKEY) { |
10342 | /* |
10343 | Currently we support only primary keys so we know which key had a |
10344 | uniqueness violation. |
10345 | */ |
10346 | errkey = m_dupp_errkey; |
10347 | dup_ref = m_pk_tuple; // TODO(?): this should store packed PK. |
10348 | } |
10349 | |
10350 | if (flag & HA_STATUS_AUTO) { |
10351 | stats.auto_increment_value = m_tbl_def->m_auto_incr_val; |
10352 | } |
10353 | |
10354 | DBUG_RETURN(HA_EXIT_SUCCESS); |
10355 | } |
10356 | |
/*
  Store the "position" of the given record into this->ref: the packed
  primary key, zero-padded to ref_length bytes.  rnd_pos() later uses this
  to re-read the row.
*/
void ha_rocksdb::position(const uchar *const record) {
  DBUG_ENTER_FUNC();

  longlong hidden_pk_id = 0;
  if (has_hidden_pk(table) && read_hidden_pk_id_from_rowkey(&hidden_pk_id))
    DBUG_ASSERT(false);  // should never reach here

  /*
    Get packed primary key value from the record.

    (Note: m_last_rowkey has the packed PK of last-read-row, which allows to
    handle most cases, but there is an exception to this: when slave applies
    RBR events, it fills the record and then calls position(); rnd_pos())

    Also note that we *can* rely on PK column values being available. This is
    because table_flags() includes HA_PRIMARY_KEY_REQUIRED_FOR_POSITION bit.
    When that is true, table->prepare_for_position() adds PK columns into the
    read set (this may potentially disable index-only access when PK column
    cannot be restored from its mem-comparable form in the secondary indexes).
  */
  const uint packed_size = m_pk_descr->pack_record(
      table, m_pack_buffer, record, ref, nullptr, false, hidden_pk_id);

  /*
    It could be that mem-comparable form of PK occupies less than ref_length
    bytes. Fill the remainder with zeros.
  */
  if (ref_length > packed_size)
    memset(ref + packed_size, 0, ref_length - packed_size);

  DBUG_VOID_RETURN;
}
10389 | |
10390 | /** |
10391 | @return |
10392 | HA_EXIT_SUCCESS OK |
10393 | other HA_ERR error code (can be SE-specific) |
10394 | */ |
10395 | int ha_rocksdb::rnd_pos(uchar *const buf, uchar *const pos) { |
10396 | DBUG_ENTER_FUNC(); |
10397 | |
10398 | int rc; |
10399 | size_t len; |
10400 | |
10401 | #ifdef MARIAROCKS_NOT_YET |
10402 | stats.rows_requested++; |
10403 | #endif |
10404 | len = m_pk_descr->key_length(table, |
10405 | rocksdb::Slice((const char *)pos, ref_length)); |
10406 | if (len == size_t(-1)) { |
10407 | DBUG_RETURN(HA_ERR_ROCKSDB_CORRUPT_DATA); /* Data corruption? */ |
10408 | } |
10409 | |
10410 | rc = get_row_by_rowid(buf, pos, len); |
10411 | |
10412 | if (!rc) { |
10413 | #ifdef MARIAROCKS_NOT_YET |
10414 | stats.rows_read++; |
10415 | #endif |
10416 | update_row_stats(ROWS_READ); |
10417 | } |
10418 | |
10419 | DBUG_RETURN(rc); |
10420 | } |
10421 | |
10422 | /* |
10423 | @brief |
10424 | Calculate (if needed) the bitmap of indexes that are modified by the |
10425 | current query |
10426 | |
10427 | @detail |
10428 | The calculation done by checking index definitions against the |
10429 | table->write_set |
10430 | column bitmap. |
10431 | */ |
10432 | |
10433 | void ha_rocksdb::calc_updated_indexes() { |
10434 | if (!m_update_scope_is_valid) { |
10435 | m_update_scope_is_valid = true; |
10436 | m_update_scope.clear_all(); |
10437 | |
10438 | for (uint keynr = 0; keynr < table->s->keys; keynr++) { |
10439 | const Rdb_key_def &kd = *m_key_descr_arr[keynr]; |
10440 | /* Walk over all key parts, including the "extended key" suffix */ |
10441 | const uint key_parts = kd.get_key_parts(); |
10442 | for (uint kp = 0; kp < key_parts; kp++) { |
10443 | if (has_hidden_pk(table) && kp + 1 == key_parts) |
10444 | break; |
10445 | |
10446 | Field *const field = kd.get_table_field_for_part_no(table, kp); |
10447 | if (bitmap_is_set(table->write_set, field->field_index)) { |
10448 | m_update_scope.set_bit(keynr); |
10449 | break; |
10450 | } |
10451 | } |
10452 | } |
10453 | } |
10454 | } |
10455 | |
10456 | /** |
10457 | @return |
10458 | HA_EXIT_SUCCESS OK |
10459 | other HA_ERR error code (can be SE-specific) |
10460 | */ |
10461 | int ha_rocksdb::update_row(const uchar *const old_data, |
10462 | const uchar *const new_data) { |
10463 | DBUG_ENTER_FUNC(); |
10464 | |
10465 | DBUG_ASSERT(old_data != nullptr); |
10466 | DBUG_ASSERT(new_data != nullptr); |
10467 | DBUG_ASSERT(m_lock_rows == RDB_LOCK_WRITE); |
10468 | /* |
10469 | old_data points to record we're updating. It is the same as the record |
10470 | we've just read (for multi-table UPDATE, too, because SQL layer will make |
10471 | an rnd_pos() call to re-read the record before calling update_row()) |
10472 | */ |
10473 | DBUG_ASSERT(new_data == table->record[0]); |
10474 | |
10475 | const int rv = update_write_row(old_data, new_data, false); |
10476 | |
10477 | if (rv == 0) { |
10478 | #ifdef MARIAROCKS_NOT_YET |
10479 | stats.rows_updated++; |
10480 | #endif |
10481 | update_row_stats(ROWS_UPDATED); |
10482 | } |
10483 | |
10484 | DBUG_RETURN(rv); |
10485 | } |
10486 | |
10487 | /* |
10488 | MariaDB's temporary: MyRocks has this function in sql/handler.cc: |
10489 | */ |
10490 | |
10491 | bool can_hold_read_locks_on_select(THD *thd, thr_lock_type lock_type) |
10492 | { |
10493 | return (lock_type == TL_READ_WITH_SHARED_LOCKS |
10494 | || lock_type == TL_READ_NO_INSERT |
10495 | || (lock_type != TL_IGNORE |
10496 | && thd->lex->sql_command != SQLCOM_SELECT)); |
10497 | } |
10498 | |
10499 | |
/* The following function was copied from ha_blackhole::store_lock: */
/*
  Decide (a) MyRocks's internal row-locking mode (m_lock_rows) and
  (b) the table-level lock type to report back to the SQL layer.
*/
THR_LOCK_DATA **ha_rocksdb::store_lock(THD *const thd, THR_LOCK_DATA **to,
                                       enum thr_lock_type lock_type) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(thd != nullptr);
  DBUG_ASSERT(to != nullptr);

  bool in_lock_tables = my_core::thd_in_lock_tables(thd);

  /* First, make a decision about MyRocks's internal locking */
  if (lock_type >= TL_WRITE_ALLOW_WRITE) {
    m_lock_rows = RDB_LOCK_WRITE;
  } else if (lock_type == TL_READ_WITH_SHARED_LOCKS) {
    m_lock_rows = RDB_LOCK_READ;
  } else {
    m_lock_rows = RDB_LOCK_NONE;
    if (THDVAR(thd, lock_scanned_rows)) {
      /*
        The following logic was copied directly from
        ha_innobase::store_lock_with_x_type() in
        storage/innobase/handler/ha_innodb.cc and causes MyRocks to leave
        locks in place on rows that are in a table that is not being updated.
      */
      const uint sql_command = my_core::thd_sql_command(thd);
      if ((lock_type == TL_READ && in_lock_tables) ||
          (lock_type == TL_READ_HIGH_PRIORITY && in_lock_tables) ||
          can_hold_read_locks_on_select(thd, lock_type)) {
        ulong tx_isolation = my_core::thd_tx_isolation(thd);
        if (sql_command != SQLCOM_CHECKSUM &&
            ((my_core::thd_test_options(thd, OPTION_BIN_LOG) &&
              tx_isolation > ISO_READ_COMMITTED) ||
             tx_isolation == ISO_SERIALIZABLE ||
             (lock_type != TL_READ && lock_type != TL_READ_NO_INSERT) ||
             (sql_command != SQLCOM_INSERT_SELECT &&
              sql_command != SQLCOM_REPLACE_SELECT &&
              sql_command != SQLCOM_UPDATE && sql_command != SQLCOM_DELETE &&
              sql_command != SQLCOM_CREATE_TABLE))) {
          m_lock_rows = RDB_LOCK_READ;
        }
      }
    }
  }

  /* Then, tell the SQL layer what kind of locking it should use: */
  if (lock_type != TL_IGNORE && m_db_lock.type == TL_UNLOCK) {
    /*
      Here is where we get into the guts of a row level lock.
      If TL_UNLOCK is set
      If we are not doing a LOCK TABLE or DISCARD/IMPORT
      TABLESPACE, then allow multiple writers
    */

    if ((lock_type >= TL_WRITE_CONCURRENT_INSERT && lock_type <= TL_WRITE) &&
        !in_lock_tables && !my_core::thd_tablespace_op(thd)) {
      lock_type = TL_WRITE_ALLOW_WRITE;
    }

    /*
      In queries of type INSERT INTO t1 SELECT ... FROM t2 ...
      MySQL would use the lock TL_READ_NO_INSERT on t2, and that
      would conflict with TL_WRITE_ALLOW_WRITE, blocking all inserts
      to t2. Convert the lock to a normal read lock to allow
      concurrent inserts to t2.
    */

    if (lock_type == TL_READ_NO_INSERT && !in_lock_tables) {
      lock_type = TL_READ;
    }

    m_db_lock.type = lock_type;
  }

  *to++ = &m_db_lock;

  DBUG_RETURN(to);
}
10577 | |
/* Cache the checksum-related session variables in this handler object. */
void ha_rocksdb::read_thd_vars(THD *const thd) {
  m_store_row_debug_checksums = THDVAR(thd, store_row_debug_checksums);
  m_verify_row_debug_checksums = THDVAR(thd, verify_row_debug_checksums);
  m_checksums_pct = THDVAR(thd, checksums_pct);
}
10583 | |
10584 | |
10585 | /** |
10586 | @return |
10587 | HA_EXIT_SUCCESS OK |
10588 | other HA_ERR error code (cannot be SE-specific) |
10589 | */ |
10590 | int ha_rocksdb::external_lock(THD *const thd, int lock_type) { |
10591 | DBUG_ENTER_FUNC(); |
10592 | |
10593 | DBUG_ASSERT(thd != nullptr); |
10594 | |
10595 | int res = HA_EXIT_SUCCESS; |
10596 | int binlog_format = my_core::thd_binlog_format(thd); |
10597 | bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog); |
10598 | |
10599 | /* |
10600 | A note about (*) below: In fb/mysql, LOCK TABLE myrocks_table WRITE |
10601 | - produces an error when @@binlog_format='ROW'. |
10602 | - does not produce an error with @@binlog_format='STATEMENT' |
10603 | |
10604 | We cannot easily provide this behavior in MariaDB, because LOCK ROWS |
10605 | changes @@binlog_format for the duration of the statement. |
10606 | |
10607 | Instead we will make the check in ha_rocksdb::start_stmt and will produce |
10608 | an error in the first DML statement done under LOCK TABLE |
10609 | |
10610 | (**) - The same happens for other "DDL-type" commands, so allow them here |
10611 | as well. |
10612 | */ |
10613 | if (lock_type == F_WRLCK && !thd->rgi_slave && !unsafe_for_binlog && |
10614 | binlog_format != BINLOG_FORMAT_ROW && |
10615 | binlog_format != BINLOG_FORMAT_UNSPEC && |
10616 | thd->lex->sql_command != SQLCOM_LOCK_TABLES && // (*) |
10617 | thd->lex->sql_command != SQLCOM_ANALYZE && // (**) |
10618 | thd->lex->sql_command != SQLCOM_OPTIMIZE && // (**) |
10619 | thd->lex->sql_command != SQLCOM_FLUSH && // (**) |
10620 | my_core::thd_binlog_filter_ok(thd)) { |
10621 | my_error(ER_REQUIRE_ROW_BINLOG_FORMAT, MYF(0)); |
10622 | DBUG_RETURN(HA_ERR_UNSUPPORTED); |
10623 | } |
10624 | |
10625 | if (lock_type == F_UNLCK) { |
10626 | Rdb_transaction *const tx = get_tx_from_thd(thd); |
10627 | |
10628 | if (tx) { |
10629 | tx->io_perf_end_and_record(&m_io_perf); |
10630 | tx->m_n_mysql_tables_in_use--; |
10631 | if (tx->m_n_mysql_tables_in_use == 0 && |
10632 | !my_core::thd_test_options(thd, OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN)) { |
10633 | /* |
10634 | Do like InnoDB: when we get here, it's time to commit a |
10635 | single-statement transaction. |
10636 | |
10637 | If the statement involved multiple tables, this code will be executed |
10638 | for each of them, but that's ok because non-first tx->commit() calls |
10639 | will be no-ops. |
10640 | */ |
10641 | if (tx->commit_or_rollback()) { |
10642 | res = HA_ERR_INTERNAL_ERROR; |
10643 | } |
10644 | } |
10645 | } |
10646 | } else { |
10647 | if (my_core::thd_tx_isolation(thd) < ISO_READ_COMMITTED || |
10648 | my_core::thd_tx_isolation(thd) > ISO_REPEATABLE_READ) { |
10649 | my_error(ER_ISOLATION_MODE_NOT_SUPPORTED, MYF(0), |
10650 | tx_isolation_names[my_core::thd_tx_isolation(thd)]); |
10651 | DBUG_RETURN(HA_ERR_UNSUPPORTED); |
10652 | } |
10653 | /* |
10654 | It's nice to do the following on start of every statement. The problem |
10655 | is, handler->start_stmt() is not called for INSERTs. |
10656 | So, we put this code here. |
10657 | */ |
10658 | Rdb_transaction *const tx = get_or_create_tx(thd); |
10659 | read_thd_vars(thd); |
10660 | |
10661 | m_update_scope_is_valid = false; |
10662 | |
10663 | if (skip_unique_check()) { |
10664 | if ((thd->lex->sql_command == SQLCOM_INSERT || |
10665 | thd->lex->sql_command == SQLCOM_LOAD || |
10666 | thd->lex->sql_command == SQLCOM_REPLACE) && |
10667 | (thd->lex->duplicates == DUP_REPLACE || |
10668 | thd->lex->duplicates == DUP_UPDATE)) { |
10669 | my_error(ER_ON_DUPLICATE_DISABLED, MYF(0), thd->query()); |
10670 | DBUG_RETURN(HA_ERR_UNSUPPORTED); |
10671 | } |
10672 | } |
10673 | |
10674 | if (lock_type == F_WRLCK) { |
10675 | if (tx->is_tx_read_only()) { |
10676 | my_error(ER_UPDATES_WITH_CONSISTENT_SNAPSHOT, MYF(0)); |
10677 | DBUG_RETURN(HA_ERR_UNSUPPORTED); |
10678 | } |
10679 | |
10680 | /* |
10681 | SQL layer signals us to take a write lock. It does so when starting DML |
10682 | statement. We should put locks on the rows we're reading. |
10683 | |
10684 | Note: sometimes, external_lock() can be called without a prior |
10685 | ::store_lock call. That's why we need to set lock_* members here, too. |
10686 | */ |
10687 | m_lock_rows = RDB_LOCK_WRITE; |
10688 | |
10689 | if (thd->lex->sql_command == SQLCOM_CREATE_INDEX || |
10690 | thd->lex->sql_command == SQLCOM_DROP_INDEX || |
10691 | thd->lex->sql_command == SQLCOM_ALTER_TABLE) { |
10692 | tx->m_ddl_transaction = true; |
10693 | } |
10694 | } |
10695 | tx->m_n_mysql_tables_in_use++; |
10696 | rocksdb_register_tx(rocksdb_hton, thd, tx); |
10697 | tx->io_perf_start(&m_io_perf); |
10698 | } |
10699 | |
10700 | DBUG_RETURN(res); |
10701 | } |
10702 | |
10703 | /** |
10704 | @note |
10705 | A quote from ha_innobase::start_stmt(): |
10706 | <quote> |
10707 | MySQL calls this function at the start of each SQL statement inside LOCK |
10708 | TABLES. Inside LOCK TABLES the ::external_lock method does not work to |
10709 | mark SQL statement borders. |
10710 | </quote> |
10711 | |
10712 | @return |
10713 | HA_EXIT_SUCCESS OK |
10714 | */ |
10715 | |
10716 | int ha_rocksdb::start_stmt(THD *const thd, thr_lock_type lock_type) { |
10717 | DBUG_ENTER_FUNC(); |
10718 | |
10719 | /* |
10720 | MariaDB: the following is a copy of the check in ha_rocksdb::external_lock: |
10721 | */ |
10722 | int binlog_format = my_core::thd_binlog_format(thd); |
10723 | bool unsafe_for_binlog = THDVAR(ha_thd(), unsafe_for_binlog); |
10724 | if (lock_type >= TL_WRITE_ALLOW_WRITE && |
10725 | !thd->rgi_slave && !unsafe_for_binlog && |
10726 | binlog_format != BINLOG_FORMAT_ROW && |
10727 | binlog_format != BINLOG_FORMAT_UNSPEC && |
10728 | my_core::thd_binlog_filter_ok(thd)) { |
10729 | my_error(ER_REQUIRE_ROW_BINLOG_FORMAT, MYF(0)); |
10730 | DBUG_RETURN(HA_ERR_UNSUPPORTED); |
10731 | } |
10732 | |
10733 | DBUG_ASSERT(thd != nullptr); |
10734 | |
10735 | Rdb_transaction *const tx = get_or_create_tx(thd); |
10736 | read_thd_vars(thd); |
10737 | rocksdb_register_tx(ht, thd, tx); |
10738 | tx->io_perf_start(&m_io_perf); |
10739 | |
10740 | DBUG_RETURN(HA_EXIT_SUCCESS); |
10741 | } |
10742 | |
10743 | rocksdb::Range get_range(uint32_t i, |
10744 | uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2], |
10745 | int offset1, int offset2) { |
10746 | uchar *buf_begin = buf; |
10747 | uchar *buf_end = buf + Rdb_key_def::INDEX_NUMBER_SIZE; |
10748 | rdb_netbuf_store_index(buf_begin, i + offset1); |
10749 | rdb_netbuf_store_index(buf_end, i + offset2); |
10750 | |
10751 | return rocksdb::Range( |
10752 | rocksdb::Slice((const char *)buf_begin, Rdb_key_def::INDEX_NUMBER_SIZE), |
10753 | rocksdb::Slice((const char *)buf_end, Rdb_key_def::INDEX_NUMBER_SIZE)); |
10754 | } |
10755 | |
/* Convenience overload: build the range for a key definition's index number. */
static rocksdb::Range get_range(const Rdb_key_def &kd,
                                uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2],
                                int offset1, int offset2) {
  return get_range(kd.get_index_number(), buf, offset1, offset2);
}
10761 | |
10762 | rocksdb::Range get_range(const Rdb_key_def &kd, |
10763 | uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) { |
10764 | if (kd.m_is_reverse_cf) { |
10765 | return myrocks::get_range(kd, buf, 1, 0); |
10766 | } else { |
10767 | return myrocks::get_range(kd, buf, 0, 1); |
10768 | } |
10769 | } |
10770 | |
/* Build the range for this handler's i-th key definition. */
rocksdb::Range
ha_rocksdb::get_range(const int &i,
                      uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]) const {
  return myrocks::get_range(*m_key_descr_arr[i], buf);
}
10776 | |
10777 | /* |
10778 | This function is called with total_order_seek=true, but |
10779 | upper/lower bound setting is not necessary. |
10780 | Boundary set is useful when there is no matching key, |
10781 | but in drop_index_thread's case, it means index is marked as removed, |
10782 | so no further seek will happen for the index id. |
10783 | */ |
10784 | static bool is_myrocks_index_empty( |
10785 | rocksdb::ColumnFamilyHandle *cfh, const bool is_reverse_cf, |
10786 | const rocksdb::ReadOptions &read_opts, |
10787 | const uint index_id) |
10788 | { |
10789 | bool index_removed = false; |
10790 | uchar key_buf[Rdb_key_def::INDEX_NUMBER_SIZE] = {0}; |
10791 | rdb_netbuf_store_uint32(key_buf, index_id); |
10792 | const rocksdb::Slice key = |
10793 | rocksdb::Slice(reinterpret_cast<char *>(key_buf), sizeof(key_buf)); |
10794 | std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(read_opts, cfh)); |
10795 | rocksdb_smart_seek(is_reverse_cf, it.get(), key); |
10796 | if (!it->Valid()) { |
10797 | index_removed = true; |
10798 | } else { |
10799 | if (memcmp(it->key().data(), key_buf, |
10800 | Rdb_key_def::INDEX_NUMBER_SIZE)) { |
10801 | // Key does not have same prefix |
10802 | index_removed = true; |
10803 | } |
10804 | } |
10805 | return index_removed; |
10806 | } |
10807 | |
10808 | /* |
10809 | Drop index thread's main logic |
10810 | */ |
10811 | |
10812 | void Rdb_drop_index_thread::run() { |
10813 | RDB_MUTEX_LOCK_CHECK(m_signal_mutex); |
10814 | |
10815 | for (;;) { |
10816 | // The stop flag might be set by shutdown command |
10817 | // after drop_index_thread releases signal_mutex |
10818 | // (i.e. while executing expensive Seek()). To prevent drop_index_thread |
10819 | // from entering long cond_timedwait, checking if stop flag |
10820 | // is true or not is needed, with drop_index_interrupt_mutex held. |
10821 | if (m_stop) { |
10822 | break; |
10823 | } |
10824 | |
10825 | timespec ts; |
10826 | int sec= dict_manager.is_drop_index_empty() |
10827 | ? 24 * 60 * 60 // no filtering |
10828 | : 60; // filtering |
10829 | set_timespec(ts,sec); |
10830 | |
10831 | const auto ret MY_ATTRIBUTE((__unused__)) = |
10832 | mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts); |
10833 | if (m_stop) { |
10834 | break; |
10835 | } |
10836 | // make sure, no program error is returned |
10837 | DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT); |
10838 | RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex); |
10839 | |
10840 | std::unordered_set<GL_INDEX_ID> indices; |
10841 | dict_manager.get_ongoing_drop_indexes(&indices); |
10842 | if (!indices.empty()) { |
10843 | std::unordered_set<GL_INDEX_ID> finished; |
10844 | rocksdb::ReadOptions read_opts; |
10845 | read_opts.total_order_seek = true; // disable bloom filter |
10846 | |
10847 | for (const auto d : indices) { |
10848 | uint32 cf_flags = 0; |
10849 | if (!dict_manager.get_cf_flags(d.cf_id, &cf_flags)) { |
10850 | sql_print_error("RocksDB: Failed to get column family flags " |
10851 | "from cf id %u. MyRocks data dictionary may " |
10852 | "get corrupted." , |
10853 | d.cf_id); |
10854 | abort(); |
10855 | } |
10856 | rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(d.cf_id); |
10857 | DBUG_ASSERT(cfh); |
10858 | const bool is_reverse_cf = cf_flags & Rdb_key_def::REVERSE_CF_FLAG; |
10859 | |
10860 | if (is_myrocks_index_empty(cfh, is_reverse_cf, read_opts, d.index_id)) |
10861 | { |
10862 | finished.insert(d); |
10863 | continue; |
10864 | } |
10865 | uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; |
10866 | rocksdb::Range range = get_range(d.index_id, buf, is_reverse_cf ? 1 : 0, |
10867 | is_reverse_cf ? 0 : 1); |
10868 | rocksdb::Status status = DeleteFilesInRange(rdb->GetBaseDB(), cfh, |
10869 | &range.start, &range.limit); |
10870 | if (!status.ok()) { |
10871 | if (status.IsShutdownInProgress()) { |
10872 | break; |
10873 | } |
10874 | rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD); |
10875 | } |
10876 | status = rdb->CompactRange(getCompactRangeOptions(), cfh, &range.start, |
10877 | &range.limit); |
10878 | if (!status.ok()) { |
10879 | if (status.IsShutdownInProgress()) { |
10880 | break; |
10881 | } |
10882 | rdb_handle_io_error(status, RDB_IO_ERROR_BG_THREAD); |
10883 | } |
10884 | if (is_myrocks_index_empty(cfh, is_reverse_cf, read_opts, d.index_id)) |
10885 | { |
10886 | finished.insert(d); |
10887 | } |
10888 | } |
10889 | |
10890 | if (!finished.empty()) { |
10891 | dict_manager.finish_drop_indexes(finished); |
10892 | } |
10893 | } |
10894 | RDB_MUTEX_LOCK_CHECK(m_signal_mutex); |
10895 | } |
10896 | |
10897 | RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex); |
10898 | } |
10899 | |
10900 | Rdb_tbl_def *ha_rocksdb::get_table_if_exists(const char *const tablename) { |
10901 | DBUG_ASSERT(tablename != nullptr); |
10902 | |
10903 | std::string str; |
10904 | if (rdb_normalize_tablename(tablename, &str) != HA_EXIT_SUCCESS) { |
10905 | // We were not passed table name? |
10906 | DBUG_ASSERT(0); |
10907 | return nullptr; |
10908 | } |
10909 | |
10910 | return ddl_manager.find(str); |
10911 | } |
10912 | |
10913 | /* |
10914 | Note: the following function is called when the table is not open. That is, |
10915 | this->table==nullptr, pk_key_descr==nullptr, etc. |
10916 | |
  tablename is a string of the form "./dbname/tablename".
10918 | |
10919 | @return |
10920 | HA_EXIT_SUCCESS OK |
10921 | other HA_ERR error code (can be SE-specific) |
10922 | */ |
10923 | |
10924 | int ha_rocksdb::delete_table(const char *const tablename) { |
10925 | DBUG_ENTER_FUNC(); |
10926 | |
10927 | DBUG_ASSERT(tablename != nullptr); |
10928 | |
10929 | const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin(); |
10930 | rocksdb::WriteBatch *const batch = wb.get(); |
10931 | |
10932 | /* Find the table in the hash */ |
10933 | Rdb_tbl_def *const tbl = get_table_if_exists(tablename); |
10934 | if (!tbl) |
10935 | DBUG_RETURN(HA_ERR_NO_SUCH_TABLE); |
10936 | dict_manager.add_drop_table(tbl->m_key_descr_arr, tbl->m_key_count, batch); |
10937 | |
10938 | /* |
10939 | Remove the table entry in data dictionary (this will also remove it from |
10940 | the persistent data dictionary). |
10941 | */ |
10942 | ddl_manager.remove(tbl, batch, true); |
10943 | |
10944 | int err = dict_manager.commit(batch); |
10945 | if (err) { |
10946 | DBUG_RETURN(err); |
10947 | } |
10948 | |
10949 | rdb_drop_idx_thread.signal(); |
10950 | |
10951 | DBUG_RETURN(HA_EXIT_SUCCESS); |
10952 | } |
10953 | |
/*
  Delete every row of every index of the given table, one key at a time,
  by iterating each index's key range and issuing Delete/SingleDelete.

  @param tbl  table definition whose indexes are to be emptied
  @return HA_EXIT_SUCCESS, or the error produced by the transaction's
          status translation when a RocksDB write fails
*/
int ha_rocksdb::remove_rows(Rdb_tbl_def *const tbl) {
  const rocksdb::WriteOptions wo =
      rdb_get_rocksdb_write_options(handler::ha_thd());

  rocksdb::ReadOptions opts;
  // We scan whole index ranges, so prefix bloom filters must be bypassed.
  opts.total_order_seek = true;
  Rdb_transaction *const tx = get_or_create_tx(table->in_use);

  char key_buf[MAX_KEY_LENGTH];
  uint key_len;
  ulonglong bytes_written = 0;
  /*
    Remove all records in each index.
    (This is not crash-safe, but it doesn't matter, because bulk row
    deletion will be handled on rocksdb side)
  */
  for (uint i = 0; i < tbl->m_key_count; i++) {
    const Rdb_key_def &kd = *tbl->m_key_descr_arr[i];
    // Build the smallest possible key for this index (its index-number
    // prefix) to seek to the start of the index.
    kd.get_infimum_key(reinterpret_cast<uchar *>(key_buf), &key_len);
    rocksdb::ColumnFamilyHandle *cf = kd.get_cf();
    const rocksdb::Slice table_key(key_buf, key_len);
    // Bound the iterator to this index's key range. The bound slices are
    // members so they remain valid for the iterator's whole lifetime.
    setup_iterator_bounds(kd, table_key,
                          m_eq_cond_lower_bound,
                          m_eq_cond_upper_bound,
                          &m_eq_cond_lower_bound_slice,
                          &m_eq_cond_upper_bound_slice);
    opts.iterate_lower_bound = &m_eq_cond_lower_bound_slice;
    opts.iterate_upper_bound = &m_eq_cond_upper_bound_slice;
    std::unique_ptr<rocksdb::Iterator> it(rdb->NewIterator(opts, cf));

    it->Seek(table_key);
    while (it->Valid()) {
      const rocksdb::Slice key = it->key();
      // Stop once the key no longer belongs to this index.
      if (!kd.covers_key(key)) {
        break;
      }

      rocksdb::Status s;
      // SingleDelete is cheaper but only valid for keys written at most
      // once between deletions; can_use_single_delete() decides per index.
      if (can_use_single_delete(i)) {
        s = rdb->SingleDelete(wo, cf, key);
      } else {
        s = rdb->Delete(wo, cf, key);
      }

      if (!s.ok()) {
        return tx->set_status_error(table->in_use, s, *m_pk_descr, m_tbl_def,
                                    m_table_handler);
      }
      bytes_written += key.size();
      it->Next();
    }
  }

  // Account the deleted key bytes against the transaction's write stats.
  tx->update_bytes_written(bytes_written);

  return HA_EXIT_SUCCESS;
}
11011 | |
11012 | /** |
11013 | @return |
11014 | HA_EXIT_SUCCESS OK |
11015 | other HA_ERR error code (cannot be SE-specific) |
11016 | */ |
11017 | int ha_rocksdb::rename_table(const char *const from, const char *const to) { |
11018 | DBUG_ENTER_FUNC(); |
11019 | |
11020 | DBUG_ASSERT(from != nullptr); |
11021 | DBUG_ASSERT(to != nullptr); |
11022 | |
11023 | std::string from_str; |
11024 | std::string to_str; |
11025 | std::string from_db; |
11026 | std::string to_db; |
11027 | |
11028 | int rc = rdb_normalize_tablename(from, &from_str); |
11029 | if (rc != HA_EXIT_SUCCESS) { |
11030 | DBUG_RETURN(rc); |
11031 | } |
11032 | |
11033 | rc = rdb_split_normalized_tablename(from_str, &from_db); |
11034 | if (rc != HA_EXIT_SUCCESS) { |
11035 | DBUG_RETURN(rc); |
11036 | } |
11037 | |
11038 | rc = rdb_normalize_tablename(to, &to_str); |
11039 | if (rc != HA_EXIT_SUCCESS) { |
11040 | DBUG_RETURN(rc); |
11041 | } |
11042 | |
11043 | rc = rdb_split_normalized_tablename(to_str, &to_db); |
11044 | if (rc != HA_EXIT_SUCCESS) { |
11045 | DBUG_RETURN(rc); |
11046 | } |
11047 | |
11048 | // If the user changed the database part of the name then validate that the |
11049 | // 'to' database exists. |
11050 | if (from_db != to_db && !rdb_database_exists(to_db)) { |
11051 | // If we return a RocksDB specific error code here we get |
11052 | // "error: 206 - Unknown error 206". InnoDB gets |
11053 | // "error -1 - Unknown error -1" so let's match them. |
11054 | DBUG_RETURN(-1); |
11055 | } |
11056 | |
11057 | DBUG_EXECUTE_IF("gen_sql_table_name" , to_str = to_str + "#sql-test" ;); |
11058 | |
11059 | const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin(); |
11060 | rocksdb::WriteBatch *const batch = wb.get(); |
11061 | dict_manager.lock(); |
11062 | |
11063 | if (ddl_manager.rename(from_str, to_str, batch)) { |
11064 | rc = HA_ERR_NO_SUCH_TABLE; |
11065 | } else { |
11066 | rc = dict_manager.commit(batch); |
11067 | } |
11068 | dict_manager.unlock(); |
11069 | |
11070 | DBUG_RETURN(rc); |
11071 | } |
11072 | |
11073 | /** |
11074 | check_if_incompatible_data() called if ALTER TABLE can't detect otherwise |
11075 | if new and old definition are compatible |
11076 | |
  @details If there are no other explicit signs, like a changed number of
  fields, this function will be called by compare_tables()
  (sql/sql_table.cc) to decide whether we should rewrite the whole table or
  only the .frm file.
11081 | |
11082 | */ |
11083 | |
11084 | bool ha_rocksdb::check_if_incompatible_data(HA_CREATE_INFO *const info, |
11085 | uint table_changes) { |
11086 | DBUG_ENTER_FUNC(); |
11087 | |
11088 | DBUG_ASSERT(info != nullptr); |
11089 | |
11090 | // this function is needed only for online alter-table |
11091 | DBUG_RETURN(COMPATIBLE_DATA_NO); |
11092 | } |
11093 | |
11094 | /** |
11095 | @return |
11096 | HA_EXIT_SUCCESS OK |
11097 | */ |
11098 | int ha_rocksdb::(enum ha_extra_function operation) { |
11099 | DBUG_ENTER_FUNC(); |
11100 | |
11101 | switch (operation) { |
11102 | case HA_EXTRA_KEYREAD: |
11103 | m_keyread_only = true; |
11104 | break; |
11105 | case HA_EXTRA_NO_KEYREAD: |
11106 | m_keyread_only = false; |
11107 | break; |
11108 | case HA_EXTRA_FLUSH: |
11109 | /* |
11110 | If the table has blobs, then they are part of m_retrieved_record. |
11111 | This call invalidates them. |
11112 | */ |
11113 | m_retrieved_record.Reset(); |
11114 | break; |
11115 | default: |
11116 | break; |
11117 | } |
11118 | |
11119 | DBUG_RETURN(HA_EXIT_SUCCESS); |
11120 | } |
11121 | |
11122 | /* |
11123 | Given a starting key and an ending key, estimate the number of rows that |
11124 | will exist between the two keys. |
11125 | */ |
ha_rows ha_rocksdb::records_in_range(uint inx, key_range *const min_key,
                                     key_range *const max_key) {
  DBUG_ENTER_FUNC();

  // Debug/testing override: a fixed estimate from a session variable.
  ha_rows ret = THDVAR(ha_thd(), records_in_range);
  if (ret) {
    DBUG_RETURN(ret);
  }
  // Under FORCE INDEX a separate configured estimate may be returned to
  // bias the optimizer towards the forced index.
  if (table->force_index) {
    const ha_rows force_rows = THDVAR(ha_thd(), force_index_records_in_range);
    if (force_rows) {
      DBUG_RETURN(force_rows);
    }
  }

  const Rdb_key_def &kd = *m_key_descr_arr[inx];

  // Pack the lower endpoint into m_sk_packed_tuple, or use the index
  // infimum when no lower bound was supplied.
  uint size1 = 0;
  if (min_key) {
    size1 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple,
                                min_key->key, min_key->keypart_map);
    // For "after"-style flags the endpoint is exclusive, so advance the
    // packed key to its successor.
    if (min_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        min_key->flag == HA_READ_PREFIX_LAST ||
        min_key->flag == HA_READ_AFTER_KEY) {
      kd.successor(m_sk_packed_tuple, size1);
    }
  } else {
    kd.get_infimum_key(m_sk_packed_tuple, &size1);
  }

  // Pack the upper endpoint into m_sk_packed_tuple_old, or use the index
  // supremum when no upper bound was supplied.
  uint size2 = 0;
  if (max_key) {
    size2 = kd.pack_index_tuple(table, m_pack_buffer, m_sk_packed_tuple_old,
                                max_key->key, max_key->keypart_map);
    if (max_key->flag == HA_READ_PREFIX_LAST_OR_PREV ||
        max_key->flag == HA_READ_PREFIX_LAST ||
        max_key->flag == HA_READ_AFTER_KEY) {
      kd.successor(m_sk_packed_tuple_old, size2);
    }
    // pad the upper key with FFFFs to make sure it is more than the lower
    if (size1 > size2) {
      memset(m_sk_packed_tuple_old + size2, 0xff, size1 - size2);
      size2 = size1;
    }
  } else {
    kd.get_supremum_key(m_sk_packed_tuple_old, &size2);
  }

  const rocksdb::Slice slice1((const char *)m_sk_packed_tuple, size1);
  const rocksdb::Slice slice2((const char *)m_sk_packed_tuple_old, size2);

  // slice1 >= slice2 means no row will match
  if (slice1.compare(slice2) >= 0) {
    DBUG_RETURN(HA_EXIT_SUCCESS);
  }

  // For reverse column families the physical key order is inverted, so
  // swap the endpoints to form a valid ascending range.
  rocksdb::Range r(kd.m_is_reverse_cf ? slice2 : slice1,
                   kd.m_is_reverse_cf ? slice1 : slice2);

  uint64_t sz = 0;
  // Derive an average bytes-per-row from cached index statistics, falling
  // back to a fixed assumed size when no stats are available yet.
  auto disk_size = kd.m_stats.m_actual_disk_size;
  if (disk_size == 0)
    disk_size = kd.m_stats.m_data_size;
  auto rows = kd.m_stats.m_rows;
  if (rows == 0 || disk_size == 0) {
    rows = 1;
    disk_size = ROCKSDB_ASSUMED_KEY_VALUE_DISK_SIZE;
  }

  // Getting statistics, including from Memtables
  uint8_t include_flags = rocksdb::DB::INCLUDE_FILES;
  rdb->GetApproximateSizes(kd.get_cf(), &r, 1, &sz, include_flags);
  // Estimated rows = range size in bytes / average row size.
  ret = rows * sz / disk_size;
  uint64_t memTableCount;
  // Memtable entries are not covered by INCLUDE_FILES; add them separately.
  rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memTableCount, &sz);
  ret += memTableCount;

  /*
    GetApproximateSizes() gives estimates so ret might exceed stats.records.
    MySQL then decides to use full index scan rather than range scan, which
    is not efficient for most cases.
    To prevent this, changing estimated records slightly smaller than
    stats.records.
  */
  if (ret >= stats.records) {
    ret = stats.records * 0.99;
  }

  // Debug override for the optimizer; otherwise never report zero rows
  // for a non-empty range.
  if (rocksdb_debug_optimizer_n_rows > 0) {
    ret = rocksdb_debug_optimizer_n_rows;
  } else if (ret == 0) {
    ret = 1;
  }

  DBUG_RETURN(ret);
}
11222 | |
11223 | void ha_rocksdb::update_create_info(HA_CREATE_INFO *const create_info) { |
11224 | DBUG_ENTER_FUNC(); |
11225 | |
11226 | DBUG_ASSERT(create_info != nullptr); |
11227 | |
11228 | if (!(create_info->used_fields & HA_CREATE_USED_AUTO)) { |
11229 | create_info->auto_increment_value = m_tbl_def->m_auto_incr_val; |
11230 | } |
11231 | |
11232 | DBUG_VOID_RETURN; |
11233 | } |
11234 | |
11235 | /** |
11236 | @brief |
11237 | Doing manual compaction on OPTIMIZE TABLE in RocksDB. |
11238 | Compaction itself is executed by background thread in RocksDB, but |
11239 | CompactRange() waits until compaction completes so this function |
11240 | may take a long time. |
11241 | Since RocksDB dataset is allocated per index id, OPTIMIZE TABLE |
11242 | triggers manual compaction for all indexes of the table. |
11243 | @details |
11244 | Compaction range is from the beginning of the index id to |
11245 | the first row of the next index id. When using reverse order |
11246 | column family, the first row of the next index id should be |
11247 | the last row of the previous index id. |
11248 | |
11249 | @return |
11250 | HA_ADMIN_OK OK |
11251 | other HA_ADMIN error code |
11252 | */ |
11253 | int ha_rocksdb::optimize(THD *const thd, HA_CHECK_OPT *const check_opt) { |
11254 | DBUG_ENTER_FUNC(); |
11255 | |
11256 | DBUG_ASSERT(thd != nullptr); |
11257 | DBUG_ASSERT(check_opt != nullptr); |
11258 | |
11259 | for (uint i = 0; i < table->s->keys; i++) { |
11260 | uchar buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2]; |
11261 | auto range = get_range(i, buf); |
11262 | const rocksdb::Status s = rdb->CompactRange(getCompactRangeOptions(), |
11263 | m_key_descr_arr[i]->get_cf(), |
11264 | &range.start, &range.limit); |
11265 | if (!s.ok()) { |
11266 | DBUG_RETURN(rdb_error_to_mysql(s)); |
11267 | } |
11268 | } |
11269 | |
11270 | DBUG_RETURN(HA_EXIT_SUCCESS); |
11271 | } |
11272 | |
/*
  Recompute index statistics for every index of the table: row counts and
  sizes from SST file properties, plus per-prefix cardinality sampled from
  the memtables. The results are cached in the DDL manager and persisted.

  @param table_arg  table whose indexes are analyzed
  @param thd        current session (unused here beyond the interface)
  @param check_opt  ANALYZE options (unused here beyond the interface)
  @return HA_EXIT_SUCCESS or an error translated from RocksDB
*/
int ha_rocksdb::calculate_stats(const TABLE *const table_arg, THD *const thd,
                                HA_CHECK_OPT *const check_opt) {
  DBUG_ENTER_FUNC();

  // find per column family key ranges which need to be queried
  std::unordered_map<rocksdb::ColumnFamilyHandle *, std::vector<rocksdb::Range>>
      ranges;
  std::unordered_set<GL_INDEX_ID> ids_to_check;
  // One 2 * INDEX_NUMBER_SIZE scratch slot per key for range endpoints;
  // get_range() stores the endpoint bytes there, so `buf` must outlive
  // all uses of `ranges`.
  std::vector<uchar> buf(table_arg->s->keys * 2 *
                         Rdb_key_def::INDEX_NUMBER_SIZE);
  std::unordered_map<GL_INDEX_ID, Rdb_index_stats> stats;
  for (uint i = 0; i < table_arg->s->keys; i++) {
    const auto bufp = &buf[i * 2 * Rdb_key_def::INDEX_NUMBER_SIZE];
    const Rdb_key_def &kd = *m_key_descr_arr[i];
    const GL_INDEX_ID index_id = kd.get_gl_index_id();
    ranges[kd.get_cf()].push_back(get_range(i, bufp));

    ids_to_check.insert(index_id);
    // Initialize the stats to 0. If there are no files that contain
    // this gl_index_id, then 0 should be stored for the cached stats.
    stats[index_id] = Rdb_index_stats(index_id);
    DBUG_ASSERT(kd.get_key_parts() > 0);
    stats[index_id].m_distinct_keys_per_prefix.resize(kd.get_key_parts());
  }

  // get RocksDB table properties for these ranges
  rocksdb::TablePropertiesCollection props;
  for (auto it : ranges) {
    const auto old_size MY_ATTRIBUTE((__unused__)) = props.size();
    const auto status = rdb->GetPropertiesOfTablesInRange(
        it.first, &it.second[0], it.second.size(), &props);
    DBUG_ASSERT(props.size() >= old_size);
    if (!status.ok()) {
      DBUG_RETURN(
          rdb_error_to_mysql(status, "Could not access RocksDB properties" ));
    }
  }

  // Merge the per-SST index statistics into our per-index accumulators.
  int num_sst = 0;
  for (const auto &it : props) {
    std::vector<Rdb_index_stats> sst_stats;
    Rdb_tbl_prop_coll::read_stats_from_tbl_props(it.second, &sst_stats);
    /*
      sst_stats is a list of index statistics for indexes that have entries
      in the current SST file.
    */
    for (const auto &it1 : sst_stats) {
      /*
        Only update statistics for indexes that belong to this SQL table.

        The reason is: We are walking through all SST files that have
        entries from this table (and so can compute good statistics). For
        other SQL tables, it can be that we're only seeing a small fraction
        of table's entries (and so we can't update statistics based on that).
      */
      if (ids_to_check.find(it1.m_gl_index_id) == ids_to_check.end())
        continue;

      auto kd = ddl_manager.safe_find(it1.m_gl_index_id);
      DBUG_ASSERT(kd != nullptr);
      stats[it1.m_gl_index_id].merge(it1, true, kd->max_storage_fmt_length());
    }
    num_sst++;
  }

  // calculate memtable cardinality
  Rdb_tbl_card_coll cardinality_collector(rocksdb_table_stats_sampling_pct);
  auto read_opts = rocksdb::ReadOptions();
  // Restrict the scan to memtables only; SST data was already covered by
  // the table-properties pass above.
  read_opts.read_tier = rocksdb::ReadTier::kMemtableTier;
  for (uint i = 0; i < table_arg->s->keys; i++) {
    const Rdb_key_def &kd = *m_key_descr_arr[i];
    Rdb_index_stats &stat = stats[kd.get_gl_index_id()];

    uchar r_buf[Rdb_key_def::INDEX_NUMBER_SIZE * 2];
    auto r = get_range(i, r_buf);
    uint64_t memtableCount;
    uint64_t memtableSize;
    rdb->GetApproximateMemTableStats(kd.get_cf(), r, &memtableCount,
                                     &memtableSize);
    if (memtableCount < (uint64_t)stat.m_rows / 10) {
      // skip tables that already have enough stats from SST files to reduce
      // overhead and avoid degradation of big tables stats by sampling from
      // relatively tiny (less than 10% of full data set) memtable dataset
      continue;
    }

    std::unique_ptr<rocksdb::Iterator> it = std::unique_ptr<rocksdb::Iterator>(
        rdb->NewIterator(read_opts, kd.get_cf()));

    // Choose the member buffer matching the index type for the seek key.
    uchar *first_key;
    uint key_size;
    if (is_pk(i, table, m_tbl_def)) {
      first_key = m_pk_packed_tuple;
    } else {
      first_key = m_sk_packed_tuple;
    }
    kd.get_first_key(first_key, &key_size);
    rocksdb::Slice first_index_key((const char *)first_key, key_size);

    // Walk this index's memtable entries, counting rows and sampling keys
    // for per-prefix distinct-key estimates.
    cardinality_collector.Reset();
    for (it->Seek(first_index_key); is_valid(it.get()); it->Next()) {
      const rocksdb::Slice key = it->key();
      if (!kd.covers_key(key)) {
        break; // end of this index
      }
      stat.m_rows++;

      cardinality_collector.ProcessKey(key, &kd, &stat);
    }
    cardinality_collector.AdjustStats(&stat);
  }

  // set and persist new stats
  ddl_manager.set_stats(stats);
  ddl_manager.persist_stats(true);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
11391 | |
11392 | /* |
11393 | @return |
11394 | HA_ADMIN_OK OK |
11395 | other HA_ADMIN error code |
11396 | */ |
11397 | int ha_rocksdb::analyze(THD *const thd, HA_CHECK_OPT *const check_opt) { |
11398 | DBUG_ENTER_FUNC(); |
11399 | |
11400 | if (table && calculate_stats(table, thd, check_opt) != HA_EXIT_SUCCESS) { |
11401 | DBUG_RETURN(HA_ADMIN_FAILED); |
11402 | } |
11403 | |
11404 | DBUG_RETURN(HA_ADMIN_OK); |
11405 | } |
11406 | |
/*
  Produce the next auto-increment value for an insert.

  @param off                auto_increment_offset for this session
  @param inc                auto_increment_increment for this session
  @param nb_desired_values  how many values MySQL estimates it needs
  @param[out] first_value   the value handed back to the SQL layer
  @param[out] nb_reserved_values  how many consecutive values were reserved
*/
void ha_rocksdb::get_auto_increment(ulonglong off, ulonglong inc,
                                    ulonglong nb_desired_values,
                                    ulonglong *const first_value,
                                    ulonglong *const nb_reserved_values) {
  /*
    MySQL has a somewhat complicated way of handling the auto-increment value.
    The first time get_auto_increment is called for a statement,
    nb_desired_values is the estimate for how many values will be needed. The
    engine can then reserve some values, and those will be automatically used
    by MySQL, until a hard-coded value shows up in the insert statement, after
    which MySQL again calls this function to reset its starting value.

    For simplicity we will just ignore nb_desired_values - we aren't going to
    reserve any extra values for a multi-insert statement. Each row will
    simply acquire the next value as needed and we will always tell MySQL that
    we only reserved 1 value. Since we are using an atomic value for
    m_auto_incr_val this should be safe - if we had to grab a mutex, doing
    an actual reserve of some values might be a better solution.
  */
  DEBUG_SYNC(ha_thd(), "rocksdb.autoinc_vars" );

  // An offset larger than the increment makes no sense; fall back to 1.
  if (off > inc) {
    off = 1;
  }

  Field *field;
  ulonglong new_val, max_val;
  field = table->key_info[table->s->next_number_index].key_part[0].field;
  max_val = rdb_get_int_col_max_value(field);

  // Local variable reference to simplify code below
  auto &auto_incr = m_tbl_def->m_auto_incr_val;

  if (inc == 1) {
    DBUG_ASSERT(off == 1);
    // Optimization for the standard case where we are always simply
    // incrementing from the last position

    // Use a CAS operation in a loop to make sure we atomically get the next
    // auto increment value while ensuring that we don't wrap around to a
    // negative number.
    //
    // We set auto_incr to the min of max_val and new_val + 1. This means that
    // if we're at the maximum, we should be returning the same value for
    // multiple rows, resulting in duplicate key errors (as expected).
    //
    // If we return values greater than the max, the SQL layer will "truncate"
    // the value anyway, but it means that we store invalid values into
    // auto_incr that will be visible in SHOW CREATE TABLE.
    new_val = auto_incr;
    while (new_val != std::numeric_limits<ulonglong>::max()) {
      if (auto_incr.compare_exchange_weak(new_val,
                                          std::min(new_val + 1, max_val))) {
        break;
      }
    }
  } else {
    // The next value can be more complicated if either 'inc' or 'off' is not 1
    ulonglong last_val = auto_incr;

    // Loop until we can correctly update the atomic value
    do {
      DBUG_ASSERT(last_val > 0);
      // Calculate the next value in the auto increment series: offset
      // + N * increment where N is 0, 1, 2, ...
      //
      // For further information please visit:
      // http://dev.mysql.com/doc/refman/5.7/en/replication-options-master.html
      //
      // The following is confusing so here is an explanation:
      // To get the next number in the sequence above you subtract out the
      // offset, calculate the next sequence (N * increment) and then add the
      // offset back in.
      //
      // The additions are rearranged to avoid overflow. The following is
      // equivalent to (last_val - 1 + inc - off) / inc. This uses the fact
      // that (a+b)/c = a/c + b/c + (a%c + b%c)/c. To show why:
      //
      // (a+b)/c
      // = (a - a%c + a%c + b - b%c + b%c) / c
      // = (a - a%c) / c + (b - b%c) / c + (a%c + b%c) / c
      // = a/c + b/c + (a%c + b%c) / c
      //
      // Now, substitute a = last_val - 1, b = inc - off, c = inc to get the
      // following statement.
      ulonglong n =
          (last_val - 1) / inc + ((last_val - 1) % inc + inc - off) / inc;

      // Check if n * inc + off will overflow. This can only happen if we have
      // an UNSIGNED BIGINT field.
      if (n > (std::numeric_limits<ulonglong>::max() - off) / inc) {
        DBUG_ASSERT(max_val == std::numeric_limits<ulonglong>::max());
        // The 'last_val' value is already equal to or larger than the largest
        // value in the sequence. Continuing would wrap around (technically
        // the behavior would be undefined). What should we do?
        // We could:
        // 1) set the new value to the last possible number in our sequence
        // as described above. The problem with this is that this
        // number could be smaller than a value in an existing row.
        // 2) set the new value to the largest possible number. This number
        // may not be in our sequence, but it is guaranteed to be equal
        // to or larger than any other value already inserted.
        //
        // For now I'm going to take option 2.
        //
        // Returning ULLONG_MAX from get_auto_increment will cause the SQL
        // layer to fail with ER_AUTOINC_READ_FAILED. This means that due to
        // the SE API for get_auto_increment, inserts will fail with
        // ER_AUTOINC_READ_FAILED if the column is UNSIGNED BIGINT, but
        // inserts will fail with ER_DUP_ENTRY for other types (or no failure
        // if the column is in a non-unique SK).
        new_val = std::numeric_limits<ulonglong>::max();
        auto_incr = new_val; // Store the largest value into auto_incr
        break;
      }

      new_val = n * inc + off;

      // Attempt to store the new value (plus 1 since m_auto_incr_val contains
      // the next available value) into the atomic value. If the current
      // value no longer matches what we have in 'last_val' this will fail and
      // we will repeat the loop (`last_val` will automatically get updated
      // with the current value).
      //
      // See above explanation for inc == 1 for why we use std::min.
    } while (!auto_incr.compare_exchange_weak(last_val,
                                              std::min(new_val + 1, max_val)));
  }

  *first_value = new_val;
  *nb_reserved_values = 1;
}
11539 | |
11540 | #ifndef DBUG_OFF |
11541 | |
11542 | /* Debugger help function */ |
11543 | static char dbug_item_print_buf[512]; |
11544 | |
11545 | const char *dbug_print_item(Item *const item) { |
11546 | char *const buf = dbug_item_print_buf; |
11547 | String str(buf, sizeof(dbug_item_print_buf), &my_charset_bin); |
11548 | str.length(0); |
11549 | if (!item) |
11550 | return "(Item*)nullptr" ; |
11551 | item->print(&str, QT_ORDINARY); |
11552 | if (str.c_ptr() == buf) |
11553 | return buf; |
11554 | else |
11555 | return "Couldn't fit into buffer" ; |
11556 | } |
11557 | |
11558 | #endif /*DBUG_OFF*/ |
11559 | |
11560 | /** |
11561 | SQL layer calls this function to push an index condition. |
11562 | |
11563 | @details |
11564 | The condition is for index keyno (only one condition can be pushed at a |
11565 | time). |
11566 | The caller guarantees that condition refers only to index fields; besides |
11567 | that, fields must have |
11568 | |
11569 | $field->part_of_key.set_bit(keyno) |
11570 | |
11571 | which means that |
11572 | |
11573 | (handler->index_flags(keyno, $keypart, 0) & HA_KEYREAD_ONLY) == 1 |
11574 | |
11575 | which means that field value can be restored from the index tuple. |
11576 | |
11577 | @return |
11578 | Part of condition we couldn't check (always nullptr). |
11579 | */ |
11580 | |
11581 | class Item *ha_rocksdb::idx_cond_push(uint keyno, class Item *const idx_cond) { |
11582 | DBUG_ENTER_FUNC(); |
11583 | |
11584 | DBUG_ASSERT(keyno != MAX_KEY); |
11585 | DBUG_ASSERT(idx_cond != nullptr); |
11586 | |
11587 | pushed_idx_cond = idx_cond; |
11588 | pushed_idx_cond_keyno = keyno; |
11589 | in_range_check_pushed_down = TRUE; |
11590 | |
11591 | /* We will check the whole condition */ |
11592 | DBUG_RETURN(nullptr); |
11593 | } |
11594 | |
11595 | /* |
11596 | Checks if inplace alter is supported for a given operation. |
11597 | */ |
11598 | |
11599 | my_core::enum_alter_inplace_result ha_rocksdb::check_if_supported_inplace_alter( |
11600 | TABLE *altered_table, my_core::Alter_inplace_info *const ha_alter_info) { |
11601 | DBUG_ENTER_FUNC(); |
11602 | |
11603 | DBUG_ASSERT(ha_alter_info != nullptr); |
11604 | |
11605 | if (ha_alter_info->handler_flags & |
11606 | ~(ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX | |
11607 | ALTER_DROP_UNIQUE_INDEX | |
11608 | ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX | |
11609 | ALTER_PARTITIONED | |
11610 | ALTER_ADD_UNIQUE_INDEX | |
11611 | ALTER_CHANGE_CREATE_OPTION)) { |
11612 | DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED); |
11613 | } |
11614 | |
11615 | /* We don't support unique keys on table w/ no primary keys */ |
11616 | if ((ha_alter_info->handler_flags & |
11617 | ALTER_ADD_UNIQUE_INDEX) && |
11618 | has_hidden_pk(altered_table)) { |
11619 | DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED); |
11620 | } |
11621 | |
11622 | /* We only support changing auto_increment for table options. */ |
11623 | if ((ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) && |
11624 | !(ha_alter_info->create_info->used_fields & HA_CREATE_USED_AUTO)) { |
11625 | DBUG_RETURN(my_core::HA_ALTER_INPLACE_NOT_SUPPORTED); |
11626 | } |
11627 | |
11628 | /* FIXME: MDEV-16099 Use alter algorithm=nocopy or algorithm=instant |
11629 | for non-InnoDB engine */ |
11630 | DBUG_RETURN(my_core::HA_ALTER_INPLACE_COPY_LOCK); |
11631 | } |
11632 | |
11633 | /** |
11634 | Allows the storage engine to update internal structures with concurrent |
11635 | writes blocked. If check_if_supported_inplace_alter() returns |
11636 | HA_ALTER_INPLACE_COPY_NO_LOCK or HA_ALTER_INPLACE_COPY_LOCK. |
11637 | this function is called with exclusive lock otherwise the same level of |
11638 | locking as for inplace_alter_table() will be used. |
11639 | |
11640 | @note Storage engines are responsible for reporting any errors by |
11641 | calling my_error()/print_error() |
11642 | |
11643 | @note If this function reports error, commit_inplace_alter_table() |
11644 | will be called with commit= false. |
11645 | |
11646 | @note For partitioning, failing to prepare one partition, means that |
11647 | commit_inplace_alter_table() will be called to roll back changes for |
11648 | all partitions. This means that commit_inplace_alter_table() might be |
11649 | called without prepare_inplace_alter_table() having been called first |
11650 | for a given partition. |
11651 | |
11652 | @param altered_table TABLE object for new version of table. |
11653 | @param ha_alter_info Structure describing changes to be done |
11654 | by ALTER TABLE and holding data used |
11655 | during in-place alter. |
11656 | |
11657 | @retval true Error |
11658 | @retval false Success |
11659 | */ |
11660 | bool ha_rocksdb::prepare_inplace_alter_table( |
11661 | TABLE *const altered_table, |
11662 | my_core::Alter_inplace_info *const ha_alter_info) { |
11663 | DBUG_ENTER_FUNC(); |
11664 | |
11665 | DBUG_ASSERT(altered_table != nullptr); |
11666 | DBUG_ASSERT(ha_alter_info != nullptr); |
11667 | |
11668 | Rdb_tbl_def *new_tdef = nullptr; |
11669 | std::shared_ptr<Rdb_key_def> *old_key_descr = nullptr; |
11670 | std::shared_ptr<Rdb_key_def> *new_key_descr = nullptr; |
11671 | uint old_n_keys = m_tbl_def->m_key_count; |
11672 | uint new_n_keys = altered_table->s->keys; |
11673 | std::unordered_set<std::shared_ptr<Rdb_key_def>> added_indexes; |
11674 | std::unordered_set<GL_INDEX_ID> dropped_index_ids; |
11675 | uint n_dropped_keys = 0; |
11676 | uint n_added_keys = 0; |
11677 | ulonglong max_auto_incr = 0; |
11678 | |
11679 | if (ha_alter_info->handler_flags & |
11680 | (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX | |
11681 | ALTER_DROP_UNIQUE_INDEX | |
11682 | ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX | |
11683 | ALTER_ADD_UNIQUE_INDEX)) { |
11684 | |
11685 | if (has_hidden_pk(altered_table)) { |
11686 | new_n_keys += 1; |
11687 | } |
11688 | |
11689 | const TABLE *const old_table = table; |
11690 | old_key_descr = m_tbl_def->m_key_descr_arr; |
11691 | new_key_descr = new std::shared_ptr<Rdb_key_def>[new_n_keys]; |
11692 | |
11693 | new_tdef = new Rdb_tbl_def(m_tbl_def->full_tablename()); |
11694 | new_tdef->m_key_descr_arr = new_key_descr; |
11695 | new_tdef->m_key_count = new_n_keys; |
11696 | new_tdef->m_auto_incr_val = |
11697 | m_tbl_def->m_auto_incr_val.load(std::memory_order_relaxed); |
11698 | new_tdef->m_hidden_pk_val = |
11699 | m_tbl_def->m_hidden_pk_val.load(std::memory_order_relaxed); |
11700 | |
11701 | if (create_key_defs(altered_table, new_tdef, table, m_tbl_def)) { |
11702 | /* Delete the new key descriptors */ |
11703 | delete[] new_key_descr; |
11704 | |
11705 | /* |
11706 | Explicitly mark as nullptr so we don't accidentally remove entries |
11707 | from data dictionary on cleanup (or cause double delete[]). |
11708 | */ |
11709 | new_tdef->m_key_descr_arr = nullptr; |
11710 | delete new_tdef; |
11711 | |
11712 | my_error(ER_KEY_CREATE_DURING_ALTER, MYF(0)); |
11713 | DBUG_RETURN(HA_EXIT_FAILURE); |
11714 | } |
11715 | |
11716 | uint i; |
11717 | uint j; |
11718 | |
11719 | /* Determine which(if any) key definition(s) need to be dropped */ |
11720 | for (i = 0; i < ha_alter_info->index_drop_count; i++) { |
11721 | const KEY *const dropped_key = ha_alter_info->index_drop_buffer[i]; |
11722 | for (j = 0; j < old_n_keys; j++) { |
11723 | const KEY *const old_key = |
11724 | &old_table->key_info[old_key_descr[j]->get_keyno()]; |
11725 | |
11726 | if (!compare_keys(old_key, dropped_key)) { |
11727 | dropped_index_ids.insert(old_key_descr[j]->get_gl_index_id()); |
11728 | break; |
11729 | } |
11730 | } |
11731 | } |
11732 | |
11733 | /* Determine which(if any) key definitions(s) need to be added */ |
11734 | int identical_indexes_found = 0; |
11735 | for (i = 0; i < ha_alter_info->index_add_count; i++) { |
11736 | const KEY *const added_key = |
11737 | &ha_alter_info->key_info_buffer[ha_alter_info->index_add_buffer[i]]; |
11738 | for (j = 0; j < new_n_keys; j++) { |
11739 | const KEY *const new_key = |
11740 | &altered_table->key_info[new_key_descr[j]->get_keyno()]; |
11741 | if (!compare_keys(new_key, added_key)) { |
11742 | /* |
11743 | Check for cases where an 'identical' index is being dropped and |
11744 | re-added in a single ALTER statement. Turn this into a no-op as the |
11745 | index has not changed. |
11746 | |
11747 | E.G. Unique index -> non-unique index requires no change |
11748 | |
11749 | Note that cases where the index name remains the same but the |
11750 | key-parts are changed is already handled in create_inplace_key_defs. |
11751 | In these cases the index needs to be rebuilt. |
11752 | */ |
11753 | if (dropped_index_ids.count(new_key_descr[j]->get_gl_index_id())) { |
11754 | dropped_index_ids.erase(new_key_descr[j]->get_gl_index_id()); |
11755 | identical_indexes_found++; |
11756 | } else { |
11757 | added_indexes.insert(new_key_descr[j]); |
11758 | } |
11759 | |
11760 | break; |
11761 | } |
11762 | } |
11763 | } |
11764 | |
11765 | n_dropped_keys = ha_alter_info->index_drop_count - identical_indexes_found; |
11766 | n_added_keys = ha_alter_info->index_add_count - identical_indexes_found; |
11767 | DBUG_ASSERT(dropped_index_ids.size() == n_dropped_keys); |
11768 | DBUG_ASSERT(added_indexes.size() == n_added_keys); |
11769 | DBUG_ASSERT(new_n_keys == (old_n_keys - n_dropped_keys + n_added_keys)); |
11770 | } |
11771 | if (ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) { |
11772 | if (!new_tdef) { |
11773 | new_tdef = m_tbl_def; |
11774 | } |
11775 | if (table->found_next_number_field) { |
11776 | max_auto_incr = load_auto_incr_value_from_index(); |
11777 | } |
11778 | } |
11779 | |
11780 | ha_alter_info->handler_ctx = new Rdb_inplace_alter_ctx( |
11781 | new_tdef, old_key_descr, new_key_descr, old_n_keys, new_n_keys, |
11782 | added_indexes, dropped_index_ids, n_added_keys, n_dropped_keys, |
11783 | max_auto_incr); |
11784 | DBUG_RETURN(HA_EXIT_SUCCESS); |
11785 | } |
11786 | |
11787 | /** |
11788 | Alter the table structure in-place with operations specified using |
11789 | HA_ALTER_FLAGS and Alter_inplace_info. The level of concurrency allowed |
11790 | during this operation depends on the return value from |
11791 | check_if_supported_inplace_alter(). |
11792 | |
11793 | @note Storage engines are responsible for reporting any errors by |
11794 | calling my_error()/print_error() |
11795 | |
11796 | @note If this function reports error, commit_inplace_alter_table() |
11797 | will be called with commit= false. |
11798 | |
11799 | @param altered_table TABLE object for new version of table. |
11800 | @param ha_alter_info Structure describing changes to be done |
11801 | by ALTER TABLE and holding data used |
11802 | during in-place alter. |
11803 | |
11804 | @retval true Error |
11805 | @retval false Success |
11806 | */ |
bool ha_rocksdb::inplace_alter_table(
    TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);
  DBUG_ASSERT(ha_alter_info->handler_ctx != nullptr);

  /* Context that was created earlier by prepare_inplace_alter_table(). */
  Rdb_inplace_alter_ctx *const ctx =
      static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);

  /* Only ADD INDEX variants require work here; drops are handled at commit. */
  if (ha_alter_info->handler_flags &
      (ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {
    /*
      Buffers need to be set up again to account for new, possibly longer
      secondary keys.
    */
    free_key_buffers();

    DBUG_ASSERT(ctx != nullptr);

    /*
      If adding unique index, allocate special buffers for duplicate checking.
    */
    int err;
    if ((err = alloc_key_buffers(
             altered_table, ctx->m_new_tdef,
             ha_alter_info->handler_flags &
                 ALTER_ADD_UNIQUE_INDEX))) {
      my_error(ER_OUT_OF_RESOURCES, MYF(0));
      /* Non-zero err converts to 'true' (failure) for this bool interface. */
      DBUG_RETURN(err);
    }

    /* Populate all new secondary keys by scanning the primary key. */
    if ((err = inplace_populate_sk(altered_table, ctx->m_added_indexes))) {
      my_error(ER_SK_POPULATE_DURING_ALTER, MYF(0));
      DBUG_RETURN(HA_EXIT_FAILURE);
    }
  }

  /* Debug hook: simulate index-creation failure to exercise rollback. */
  DBUG_EXECUTE_IF("myrocks_simulate_index_create_rollback" , {
    dbug_create_err_inplace_alter();
    DBUG_RETURN(HA_EXIT_FAILURE);
  };);

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
11856 | |
11857 | /** |
11858 | Scan the Primary Key index entries and populate the new secondary keys. |
11859 | */ |
11860 | int ha_rocksdb::inplace_populate_sk( |
11861 | TABLE *const new_table_arg, |
11862 | const std::unordered_set<std::shared_ptr<Rdb_key_def>> &indexes) { |
11863 | DBUG_ENTER_FUNC(); |
11864 | int res = HA_EXIT_SUCCESS; |
11865 | const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin(); |
11866 | rocksdb::WriteBatch *const batch = wb.get(); |
11867 | |
11868 | /* Update the data dictionary */ |
11869 | std::unordered_set<GL_INDEX_ID> create_index_ids; |
11870 | for (const auto &index : indexes) { |
11871 | create_index_ids.insert(index->get_gl_index_id()); |
11872 | } |
11873 | dict_manager.add_create_index(create_index_ids, batch); |
11874 | res = dict_manager.commit(batch); |
11875 | if (res != HA_EXIT_SUCCESS) { |
11876 | return res; |
11877 | } |
11878 | |
11879 | /* |
11880 | Add uncommitted key definitons to ddl_manager. We need to do this |
11881 | so that the property collector can find this keydef when it needs to |
11882 | update stats. The property collector looks for the keydef in the |
11883 | data dictionary, but it won't be there yet since this key definition |
11884 | is still in the creation process. |
11885 | */ |
11886 | ddl_manager.add_uncommitted_keydefs(indexes); |
11887 | |
11888 | const bool hidden_pk_exists = has_hidden_pk(table); |
11889 | |
11890 | Rdb_transaction *tx = get_or_create_tx(table->in_use); |
11891 | |
11892 | /* |
11893 | There is one specific scenario where m_sst_info may not be nullptr. This |
11894 | happens if the handler we're using happens to be the handler where the PK |
11895 | bulk load was done on. The sequence of events that lead to this is as |
11896 | follows (T1 is PK bulk load, T2 is SK alter table): |
11897 | |
11898 | T1: Execute last INSERT statement |
11899 | T1: Return TABLE and handler object back to Table_cache_manager |
11900 | T1: Close connection |
11901 | T2: Execute ALTER statement |
11902 | T2: Take same TABLE/handler from Table_cache_manager |
11903 | T2: Call closefrm which will call finalize_bulk_load on every other open |
11904 | table/handler *except* the one it's on. |
11905 | T2: Acquire stale snapshot of PK |
11906 | T1: Call finalize_bulk_load |
11907 | |
11908 | This is rare because usually, closefrm will call the destructor (and thus |
11909 | finalize_bulk_load) on the handler where PK bulk load is done. However, if |
11910 | the thread ids of the bulk load thread and the alter thread differ by a |
11911 | multiple of table_cache_instances (8 by default), then they hash to the |
11912 | same bucket in Table_cache_manager and the alter thread will not not call |
11913 | the destructor on the handler it is holding. Thus, its m_sst_info will not |
11914 | be nullptr. |
11915 | |
11916 | At this point, it is safe to refresh the snapshot because we know all other |
11917 | open handlers have been closed at this point, and the one we're on is the |
11918 | only one left. |
11919 | */ |
11920 | if (m_sst_info) { |
11921 | if ((res = finalize_bulk_load())) { |
11922 | DBUG_RETURN(res); |
11923 | } |
11924 | tx->commit(); |
11925 | } |
11926 | |
11927 | const ulonglong rdb_merge_buf_size = THDVAR(ha_thd(), merge_buf_size); |
11928 | const ulonglong rdb_merge_combine_read_size = |
11929 | THDVAR(ha_thd(), merge_combine_read_size); |
11930 | const ulonglong rdb_merge_tmp_file_removal_delay = |
11931 | THDVAR(ha_thd(), merge_tmp_file_removal_delay_ms); |
11932 | |
11933 | for (const auto &index : indexes) { |
11934 | bool is_unique_index = |
11935 | new_table_arg->key_info[index->get_keyno()].flags & HA_NOSAME; |
11936 | |
11937 | Rdb_index_merge rdb_merge(tx->get_rocksdb_tmpdir(), rdb_merge_buf_size, |
11938 | rdb_merge_combine_read_size, |
11939 | rdb_merge_tmp_file_removal_delay, |
11940 | index->get_cf()); |
11941 | |
11942 | if ((res = rdb_merge.init())) { |
11943 | DBUG_RETURN(res); |
11944 | } |
11945 | |
11946 | /* |
11947 | Note: We pass in the currently existing table + tbl_def object here, |
11948 | as the pk index position may have changed in the case of hidden primary |
11949 | keys. |
11950 | */ |
11951 | const uint pk = pk_index(table, m_tbl_def); |
11952 | ha_index_init(pk, true); |
11953 | |
11954 | /* Scan each record in the primary key in order */ |
11955 | for (res = index_first(table->record[0]); res == 0; |
11956 | res = index_next(table->record[0])) { |
11957 | longlong hidden_pk_id = 0; |
11958 | if (hidden_pk_exists && |
11959 | (res = read_hidden_pk_id_from_rowkey(&hidden_pk_id))) { |
11960 | // NO_LINT_DEBUG |
11961 | sql_print_error("Error retrieving hidden pk id." ); |
11962 | ha_index_end(); |
11963 | DBUG_RETURN(res); |
11964 | } |
11965 | |
11966 | /* Create new secondary index entry */ |
11967 | const int new_packed_size = index->pack_record( |
11968 | new_table_arg, m_pack_buffer, table->record[0], m_sk_packed_tuple, |
11969 | &m_sk_tails, should_store_row_debug_checksums(), hidden_pk_id, 0, |
11970 | nullptr, nullptr, m_ttl_bytes); |
11971 | |
11972 | const rocksdb::Slice key = rocksdb::Slice( |
11973 | reinterpret_cast<const char *>(m_sk_packed_tuple), new_packed_size); |
11974 | const rocksdb::Slice val = |
11975 | rocksdb::Slice(reinterpret_cast<const char *>(m_sk_tails.ptr()), |
11976 | m_sk_tails.get_current_pos()); |
11977 | |
11978 | /* |
11979 | Add record to offset tree in preparation for writing out to |
11980 | disk in sorted chunks. |
11981 | */ |
11982 | if ((res = rdb_merge.add(key, val))) { |
11983 | ha_index_end(); |
11984 | DBUG_RETURN(res); |
11985 | } |
11986 | } |
11987 | |
11988 | if (res != HA_ERR_END_OF_FILE) { |
11989 | // NO_LINT_DEBUG |
11990 | sql_print_error("Error retrieving index entry from primary key." ); |
11991 | ha_index_end(); |
11992 | DBUG_RETURN(res); |
11993 | } |
11994 | |
11995 | ha_index_end(); |
11996 | |
11997 | /* |
11998 | Perform an n-way merge of n sorted buffers on disk, then writes all |
11999 | results to RocksDB via SSTFileWriter API. |
12000 | */ |
12001 | rocksdb::Slice merge_key; |
12002 | rocksdb::Slice merge_val; |
12003 | |
12004 | struct unique_sk_buf_info sk_info; |
12005 | sk_info.dup_sk_buf = m_dup_sk_packed_tuple; |
12006 | sk_info.dup_sk_buf_old = m_dup_sk_packed_tuple_old; |
12007 | |
12008 | while ((res = rdb_merge.next(&merge_key, &merge_val)) == 0) { |
12009 | /* Perform uniqueness check if needed */ |
12010 | if (is_unique_index) { |
12011 | if (check_duplicate_sk(new_table_arg, *index, &merge_key, &sk_info)) { |
12012 | /* |
12013 | Duplicate entry found when trying to create unique secondary key. |
12014 | We need to unpack the record into new_table_arg->record[0] as it |
12015 | is used inside print_keydup_error so that the error message shows |
12016 | the duplicate record. |
12017 | */ |
12018 | if (index->unpack_record(new_table_arg, new_table_arg->record[0], |
12019 | &merge_key, nullptr, |
12020 | m_verify_row_debug_checksums)) { |
12021 | /* Should never reach here */ |
12022 | DBUG_ASSERT(0); |
12023 | } |
12024 | |
12025 | print_keydup_error(new_table_arg, |
12026 | &new_table_arg->key_info[index->get_keyno()], |
12027 | MYF(0)); |
12028 | DBUG_RETURN(ER_DUP_ENTRY); |
12029 | } |
12030 | } |
12031 | |
12032 | /* |
12033 | Insert key and slice to SST via SSTFileWriter API. |
12034 | */ |
12035 | if ((res = bulk_load_key(tx, *index, merge_key, merge_val, false))) { |
12036 | break; |
12037 | } |
12038 | } |
12039 | |
12040 | /* |
12041 | Here, res == -1 means that we are finished, while > 0 means an error |
12042 | occurred. |
12043 | */ |
12044 | if (res > 0) { |
12045 | // NO_LINT_DEBUG |
12046 | sql_print_error("Error while bulk loading keys in external merge sort." ); |
12047 | DBUG_RETURN(res); |
12048 | } |
12049 | |
12050 | if ((res = tx->finish_bulk_load())) { |
12051 | // NO_LINT_DEBUG |
12052 | sql_print_error("Error finishing bulk load." ); |
12053 | DBUG_RETURN(res); |
12054 | } |
12055 | } |
12056 | |
12057 | /* |
12058 | Explicitly tell jemalloc to clean up any unused dirty pages at this point. |
12059 | See https://reviews.facebook.net/D63723 for more details. |
12060 | */ |
12061 | purge_all_jemalloc_arenas(); |
12062 | |
12063 | DBUG_EXECUTE_IF("crash_during_online_index_creation" , DBUG_SUICIDE();); |
12064 | DBUG_RETURN(res); |
12065 | } |
12066 | |
12067 | /** |
12068 | Commit or rollback the changes made during prepare_inplace_alter_table() |
12069 | and inplace_alter_table() inside the storage engine. |
12070 | Note that in case of rollback the allowed level of concurrency during |
12071 | this operation will be the same as for inplace_alter_table() and thus |
12072 | might be higher than during prepare_inplace_alter_table(). (For example, |
12073 | concurrent writes were blocked during prepare, but might not be during |
12074 | rollback). |
12075 | |
12076 | @note Storage engines are responsible for reporting any errors by |
12077 | calling my_error()/print_error() |
12078 | |
12079 | @note If this function with commit= true reports error, it will be called |
12080 | again with commit= false. |
12081 | |
12082 | @note In case of partitioning, this function might be called for rollback |
12083 | without prepare_inplace_alter_table() having been called first. |
  Also, partitioned tables set ha_alter_info->group_commit_ctx to a
  NULL-terminated array of the partition handlers. If all of them are
  committed as one, group_commit_ctx should be set to NULL to indicate
  to the partitioning handler that all partition handlers are committed.
12088 | @see prepare_inplace_alter_table(). |
12089 | |
12090 | @param altered_table TABLE object for new version of table. |
12091 | @param ha_alter_info Structure describing changes to be done |
12092 | by ALTER TABLE and holding data used |
12093 | during in-place alter. |
12094 | @param commit True => Commit, False => Rollback. |
12095 | |
12096 | @retval true Error |
12097 | @retval false Success |
12098 | */ |
bool ha_rocksdb::commit_inplace_alter_table(
    my_core::TABLE *const altered_table,
    my_core::Alter_inplace_info *const ha_alter_info, bool commit) {
  DBUG_ENTER_FUNC();

  DBUG_ASSERT(altered_table != nullptr);
  DBUG_ASSERT(ha_alter_info != nullptr);

  /* Context created by prepare_inplace_alter_table(); may be null on
     rollback of a partition that never went through prepare. */
  Rdb_inplace_alter_ctx *const ctx0 =
      static_cast<Rdb_inplace_alter_ctx *>(ha_alter_info->handler_ctx);

  DEBUG_SYNC(ha_thd(), "rocksdb.commit_in_place_alter_table" );

  /*
    IMPORTANT: When rollback is requested, mysql will abort with
    an assertion failure. That means every failed commit during inplace alter
    table will result in a fatal error on the server. Indexes ongoing creation
    will be detected when the server restarts, and dropped.

    For partitioned tables, a rollback call to this function (commit == false)
    is done for each partition. A successful commit call only executes once
    for all partitions.
  */
  if (!commit) {
    /* If ctx has not been created yet, nothing to do here */
    if (!ctx0) {
      DBUG_RETURN(HA_EXIT_SUCCESS);
    }

    /*
      Cannot call destructor for Rdb_tbl_def directly because we don't want to
      erase the mappings inside the ddl_manager, as the old_key_descr is still
      using them.
    */
    if (ctx0->m_new_key_descr) {
      /* Delete the new key descriptors */
      for (uint i = 0; i < ctx0->m_new_tdef->m_key_count; i++) {
        ctx0->m_new_key_descr[i] = nullptr;
      }

      delete[] ctx0->m_new_key_descr;
      ctx0->m_new_key_descr = nullptr;
      ctx0->m_new_tdef->m_key_descr_arr = nullptr;

      delete ctx0->m_new_tdef;
    }

    /* Remove uncommitted key definitions from ddl_manager */
    ddl_manager.remove_uncommitted_keydefs(ctx0->m_added_indexes);

    /* Rollback any partially created indexes */
    dict_manager.rollback_ongoing_index_creation();

    DBUG_RETURN(HA_EXIT_SUCCESS);
  }

  DBUG_ASSERT(ctx0);

  /*
    For partitioned tables, we need to commit all changes to all tables at
    once, unlike in the other inplace alter API methods.
  */
  inplace_alter_handler_ctx **ctx_array;
  inplace_alter_handler_ctx *ctx_single[2];

  if (ha_alter_info->group_commit_ctx) {
    DBUG_EXECUTE_IF("crash_during_index_creation_partition" , DBUG_SUICIDE(););
    ctx_array = ha_alter_info->group_commit_ctx;
  } else {
    /* Non-partitioned table: build a one-element, null-terminated array. */
    ctx_single[0] = ctx0;
    ctx_single[1] = nullptr;
    ctx_array = ctx_single;
  }

  DBUG_ASSERT(ctx0 == ctx_array[0]);
  /* Clear group_commit_ctx to signal the partitioning handler that all
     partitions are being committed here in one shot. */
  ha_alter_info->group_commit_ctx = nullptr;

  if (ha_alter_info->handler_flags &
      (ALTER_DROP_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_DROP_UNIQUE_INDEX |
       ALTER_ADD_NON_UNIQUE_NON_PRIM_INDEX |
       ALTER_ADD_UNIQUE_INDEX)) {
    const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
    rocksdb::WriteBatch *const batch = wb.get();
    std::unordered_set<GL_INDEX_ID> create_index_ids;

    /* Switch this handler over to the new table definition. */
    m_tbl_def = ctx0->m_new_tdef;
    m_key_descr_arr = m_tbl_def->m_key_descr_arr;
    m_pk_descr = m_key_descr_arr[pk_index(altered_table, m_tbl_def)];

    dict_manager.lock();
    for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
      Rdb_inplace_alter_ctx *const ctx =
          static_cast<Rdb_inplace_alter_ctx *>(*pctx);

      /* Mark indexes to be dropped */
      dict_manager.add_drop_index(ctx->m_dropped_index_ids, batch);

      for (const auto &index : ctx->m_added_indexes) {
        create_index_ids.insert(index->get_gl_index_id());
      }

      if (ddl_manager.put_and_write(ctx->m_new_tdef, batch)) {
        /*
          Failed to write new entry into data dictionary, this should never
          happen.
        */
        DBUG_ASSERT(0);
      }

      /*
        Remove uncommitted key definitions from ddl_manager, as they are now
        committed into the data dictionary.
      */
      ddl_manager.remove_uncommitted_keydefs(ctx->m_added_indexes);
    }

    if (dict_manager.commit(batch)) {
      /*
        Should never reach here. We assume MyRocks will abort if commit fails.
      */
      DBUG_ASSERT(0);
    }

    dict_manager.unlock();

    /* Mark ongoing create indexes as finished/remove from data dictionary */
    dict_manager.finish_indexes_operation(
        create_index_ids, Rdb_key_def::DDL_CREATE_INDEX_ONGOING);

    /*
      We need to recalculate the index stats here manually. The reason is that
      the secondary index does not exist inside
      m_index_num_to_keydef until it is committed to the data dictionary, which
      prevents us from updating the stats normally as the ddl_manager cannot
      find the proper gl_index_ids yet during adjust_stats calls.
    */
    if (calculate_stats(altered_table, nullptr, nullptr)) {
      /* Failed to update index statistics, should never happen */
      DBUG_ASSERT(0);
    }

    /* Wake the background thread that physically drops old index data. */
    rdb_drop_idx_thread.signal();
  }

  if (ha_alter_info->handler_flags & ALTER_CHANGE_CREATE_OPTION) {
    const std::unique_ptr<rocksdb::WriteBatch> wb = dict_manager.begin();
    rocksdb::WriteBatch *const batch = wb.get();
    std::unordered_set<GL_INDEX_ID> create_index_ids;

    ulonglong auto_incr_val = ha_alter_info->create_info->auto_increment_value;

    /* Persist the largest auto-increment value across all partitions. */
    for (inplace_alter_handler_ctx **pctx = ctx_array; *pctx; pctx++) {
      Rdb_inplace_alter_ctx *const ctx =
          static_cast<Rdb_inplace_alter_ctx *>(*pctx);
      auto_incr_val = std::max(auto_incr_val, ctx->m_max_auto_incr);
      dict_manager.put_auto_incr_val(
          batch, ctx->m_new_tdef->get_autoincr_gl_index_id(), auto_incr_val,
          true /* overwrite */);
      ctx->m_new_tdef->m_auto_incr_val = auto_incr_val;
    }

    if (dict_manager.commit(batch)) {
      DBUG_ASSERT(0);
    }
  }

  DBUG_RETURN(HA_EXIT_SUCCESS);
}
12268 | |
/* Name of the generated SHOW-status callback function for a given ticker. */
#define SHOW_FNAME(name) rocksdb_show_##name

/*
  Define a SHOW-status callback that copies the given RocksDB ticker value
  into rocksdb_status_counters.<name> and exposes it as a LONGLONG status
  variable.
*/
#define DEF_SHOW_FUNC(name, key)                                           \
  static int SHOW_FNAME(name)(MYSQL_THD thd, SHOW_VAR * var, char *buff) { \
    rocksdb_status_counters.name =                                         \
        rocksdb_stats->getTickerCount(rocksdb::key);                       \
    var->type = SHOW_LONGLONG;                                             \
    var->value = (char *)&rocksdb_status_counters.name;                    \
    return HA_EXIT_SUCCESS;                                                \
  }

/* Status variable backed by a SHOW-function defined via DEF_SHOW_FUNC. */
#define DEF_STATUS_VAR(name) \
  { "rocksdb_" #name, (char *)&SHOW_FNAME(name), SHOW_FUNC }

/* Status variable backed directly by a pointer; name gets "rocksdb_" prefix. */
#define DEF_STATUS_VAR_PTR(name, ptr, option) \
  { "rocksdb_" name, (char *)ptr, option }

/* Status variable backed by a pointer; name used verbatim (no prefix). */
#define DEF_STATUS_VAR_FUNC(name, ptr, option) \
  { name, reinterpret_cast<char *>(ptr), option }
12288 | |
/*
  Snapshot of RocksDB ticker statistics, refreshed by the DEF_SHOW_FUNC
  callbacks. Each field mirrors one rocksdb::Tickers counter and is exposed
  to SHOW STATUS as a LONGLONG value. Field names must match the 'name'
  argument used in the corresponding DEF_SHOW_FUNC invocation below.
*/
struct rocksdb_status_counters_t {
  uint64_t block_cache_miss;
  uint64_t block_cache_hit;
  uint64_t block_cache_add;
  uint64_t block_cache_add_failures;
  uint64_t block_cache_index_miss;
  uint64_t block_cache_index_hit;
  uint64_t block_cache_index_add;
  uint64_t block_cache_index_bytes_insert;
  uint64_t block_cache_index_bytes_evict;
  uint64_t block_cache_filter_miss;
  uint64_t block_cache_filter_hit;
  uint64_t block_cache_filter_add;
  uint64_t block_cache_filter_bytes_insert;
  uint64_t block_cache_filter_bytes_evict;
  uint64_t block_cache_bytes_read;
  uint64_t block_cache_bytes_write;
  uint64_t block_cache_data_bytes_insert;
  uint64_t block_cache_data_miss;
  uint64_t block_cache_data_hit;
  uint64_t block_cache_data_add;
  uint64_t bloom_filter_useful;
  uint64_t memtable_hit;
  uint64_t memtable_miss;
  uint64_t get_hit_l0;
  uint64_t get_hit_l1;
  uint64_t get_hit_l2_and_up;
  uint64_t compaction_key_drop_new;
  uint64_t compaction_key_drop_obsolete;
  uint64_t compaction_key_drop_user;
  uint64_t number_keys_written;
  uint64_t number_keys_read;
  uint64_t number_keys_updated;
  uint64_t bytes_written;
  uint64_t bytes_read;
  uint64_t number_db_seek;
  uint64_t number_db_seek_found;
  uint64_t number_db_next;
  uint64_t number_db_next_found;
  uint64_t number_db_prev;
  uint64_t number_db_prev_found;
  uint64_t iter_bytes_read;
  uint64_t no_file_closes;
  uint64_t no_file_opens;
  uint64_t no_file_errors;
  uint64_t stall_micros;
  uint64_t num_iterators;
  uint64_t number_multiget_get;
  uint64_t number_multiget_keys_read;
  uint64_t number_multiget_bytes_read;
  uint64_t number_deletes_filtered;
  uint64_t number_merge_failures;
  uint64_t bloom_filter_prefix_checked;
  uint64_t bloom_filter_prefix_useful;
  uint64_t number_reseeks_iteration;
  uint64_t getupdatessince_calls;
  uint64_t block_cachecompressed_miss;
  uint64_t block_cachecompressed_hit;
  uint64_t wal_synced;
  uint64_t wal_bytes;
  uint64_t write_self;
  uint64_t write_other;
  uint64_t write_timedout;
  uint64_t write_wal;
  uint64_t flush_write_bytes;
  uint64_t compact_read_bytes;
  uint64_t compact_write_bytes;
  uint64_t number_superversion_acquires;
  uint64_t number_superversion_releases;
  uint64_t number_superversion_cleanups;
  uint64_t number_block_not_compressed;
};

/* Single global instance written by the SHOW callbacks. */
static rocksdb_status_counters_t rocksdb_status_counters;
12363 | |
/*
  Instantiate one SHOW-status callback per exported RocksDB ticker.
  The first argument names the rocksdb_status_counters_t field (and the
  rocksdb_<name> status variable); the second is the rocksdb::Tickers enum.
*/
DEF_SHOW_FUNC(block_cache_miss, BLOCK_CACHE_MISS)
DEF_SHOW_FUNC(block_cache_hit, BLOCK_CACHE_HIT)
DEF_SHOW_FUNC(block_cache_add, BLOCK_CACHE_ADD)
DEF_SHOW_FUNC(block_cache_add_failures, BLOCK_CACHE_ADD_FAILURES)
DEF_SHOW_FUNC(block_cache_index_miss, BLOCK_CACHE_INDEX_MISS)
DEF_SHOW_FUNC(block_cache_index_hit, BLOCK_CACHE_INDEX_HIT)
DEF_SHOW_FUNC(block_cache_index_add, BLOCK_CACHE_INDEX_ADD)
DEF_SHOW_FUNC(block_cache_index_bytes_insert, BLOCK_CACHE_INDEX_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_index_bytes_evict, BLOCK_CACHE_INDEX_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_filter_miss, BLOCK_CACHE_FILTER_MISS)
DEF_SHOW_FUNC(block_cache_filter_hit, BLOCK_CACHE_FILTER_HIT)
DEF_SHOW_FUNC(block_cache_filter_add, BLOCK_CACHE_FILTER_ADD)
DEF_SHOW_FUNC(block_cache_filter_bytes_insert, BLOCK_CACHE_FILTER_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_filter_bytes_evict, BLOCK_CACHE_FILTER_BYTES_EVICT)
DEF_SHOW_FUNC(block_cache_bytes_read, BLOCK_CACHE_BYTES_READ)
DEF_SHOW_FUNC(block_cache_bytes_write, BLOCK_CACHE_BYTES_WRITE)
DEF_SHOW_FUNC(block_cache_data_bytes_insert, BLOCK_CACHE_DATA_BYTES_INSERT)
DEF_SHOW_FUNC(block_cache_data_miss, BLOCK_CACHE_DATA_MISS)
DEF_SHOW_FUNC(block_cache_data_hit, BLOCK_CACHE_DATA_HIT)
DEF_SHOW_FUNC(block_cache_data_add, BLOCK_CACHE_DATA_ADD)
DEF_SHOW_FUNC(bloom_filter_useful, BLOOM_FILTER_USEFUL)
DEF_SHOW_FUNC(memtable_hit, MEMTABLE_HIT)
DEF_SHOW_FUNC(memtable_miss, MEMTABLE_MISS)
DEF_SHOW_FUNC(get_hit_l0, GET_HIT_L0)
DEF_SHOW_FUNC(get_hit_l1, GET_HIT_L1)
DEF_SHOW_FUNC(get_hit_l2_and_up, GET_HIT_L2_AND_UP)
DEF_SHOW_FUNC(compaction_key_drop_new, COMPACTION_KEY_DROP_NEWER_ENTRY)
DEF_SHOW_FUNC(compaction_key_drop_obsolete, COMPACTION_KEY_DROP_OBSOLETE)
DEF_SHOW_FUNC(compaction_key_drop_user, COMPACTION_KEY_DROP_USER)
DEF_SHOW_FUNC(number_keys_written, NUMBER_KEYS_WRITTEN)
DEF_SHOW_FUNC(number_keys_read, NUMBER_KEYS_READ)
DEF_SHOW_FUNC(number_keys_updated, NUMBER_KEYS_UPDATED)
DEF_SHOW_FUNC(bytes_written, BYTES_WRITTEN)
DEF_SHOW_FUNC(bytes_read, BYTES_READ)
DEF_SHOW_FUNC(number_db_seek, NUMBER_DB_SEEK)
DEF_SHOW_FUNC(number_db_seek_found, NUMBER_DB_SEEK_FOUND)
DEF_SHOW_FUNC(number_db_next, NUMBER_DB_NEXT)
DEF_SHOW_FUNC(number_db_next_found, NUMBER_DB_NEXT_FOUND)
DEF_SHOW_FUNC(number_db_prev, NUMBER_DB_PREV)
DEF_SHOW_FUNC(number_db_prev_found, NUMBER_DB_PREV_FOUND)
DEF_SHOW_FUNC(iter_bytes_read, ITER_BYTES_READ)
DEF_SHOW_FUNC(no_file_closes, NO_FILE_CLOSES)
DEF_SHOW_FUNC(no_file_opens, NO_FILE_OPENS)
DEF_SHOW_FUNC(no_file_errors, NO_FILE_ERRORS)
DEF_SHOW_FUNC(stall_micros, STALL_MICROS)
DEF_SHOW_FUNC(num_iterators, NO_ITERATORS)
DEF_SHOW_FUNC(number_multiget_get, NUMBER_MULTIGET_CALLS)
DEF_SHOW_FUNC(number_multiget_keys_read, NUMBER_MULTIGET_KEYS_READ)
DEF_SHOW_FUNC(number_multiget_bytes_read, NUMBER_MULTIGET_BYTES_READ)
DEF_SHOW_FUNC(number_deletes_filtered, NUMBER_FILTERED_DELETES)
DEF_SHOW_FUNC(number_merge_failures, NUMBER_MERGE_FAILURES)
DEF_SHOW_FUNC(bloom_filter_prefix_checked, BLOOM_FILTER_PREFIX_CHECKED)
DEF_SHOW_FUNC(bloom_filter_prefix_useful, BLOOM_FILTER_PREFIX_USEFUL)
DEF_SHOW_FUNC(number_reseeks_iteration, NUMBER_OF_RESEEKS_IN_ITERATION)
DEF_SHOW_FUNC(getupdatessince_calls, GET_UPDATES_SINCE_CALLS)
DEF_SHOW_FUNC(block_cachecompressed_miss, BLOCK_CACHE_COMPRESSED_MISS)
DEF_SHOW_FUNC(block_cachecompressed_hit, BLOCK_CACHE_COMPRESSED_HIT)
DEF_SHOW_FUNC(wal_synced, WAL_FILE_SYNCED)
DEF_SHOW_FUNC(wal_bytes, WAL_FILE_BYTES)
DEF_SHOW_FUNC(write_self, WRITE_DONE_BY_SELF)
DEF_SHOW_FUNC(write_other, WRITE_DONE_BY_OTHER)
DEF_SHOW_FUNC(write_timedout, WRITE_TIMEDOUT)
DEF_SHOW_FUNC(write_wal, WRITE_WITH_WAL)
DEF_SHOW_FUNC(flush_write_bytes, FLUSH_WRITE_BYTES)
DEF_SHOW_FUNC(compact_read_bytes, COMPACT_READ_BYTES)
DEF_SHOW_FUNC(compact_write_bytes, COMPACT_WRITE_BYTES)
DEF_SHOW_FUNC(number_superversion_acquires, NUMBER_SUPERVERSION_ACQUIRES)
DEF_SHOW_FUNC(number_superversion_releases, NUMBER_SUPERVERSION_RELEASES)
DEF_SHOW_FUNC(number_superversion_cleanups, NUMBER_SUPERVERSION_CLEANUPS)
DEF_SHOW_FUNC(number_block_not_compressed, NUMBER_BLOCK_NOT_COMPRESSED)
12434 | |
// Snapshot the global MyRocks row/query counters into the plain fields of
// export_stats, which myrocks_status_variables[] below exposes through
// SHOW STATUS.
static void myrocks_update_status() {
  export_stats.rows_deleted = global_stats.rows[ROWS_DELETED];
  export_stats.rows_inserted = global_stats.rows[ROWS_INSERTED];
  export_stats.rows_read = global_stats.rows[ROWS_READ];
  export_stats.rows_updated = global_stats.rows[ROWS_UPDATED];
  export_stats.rows_deleted_blind = global_stats.rows[ROWS_DELETED_BLIND];
  export_stats.rows_expired = global_stats.rows[ROWS_EXPIRED];
  export_stats.rows_filtered = global_stats.rows[ROWS_FILTERED];

  // System (internal) tables are counted separately from user tables.
  export_stats.system_rows_deleted = global_stats.system_rows[ROWS_DELETED];
  export_stats.system_rows_inserted = global_stats.system_rows[ROWS_INSERTED];
  export_stats.system_rows_read = global_stats.system_rows[ROWS_READ];
  export_stats.system_rows_updated = global_stats.system_rows[ROWS_UPDATED];

  export_stats.queries_point = global_stats.queries[QUERIES_POINT];
  export_stats.queries_range = global_stats.queries[QUERIES_RANGE];

  export_stats.covered_secondary_key_lookups =
      global_stats.covered_secondary_key_lookups;
}
12455 | |
12456 | static void myrocks_update_memory_status() { |
12457 | std::vector<rocksdb::DB *> dbs; |
12458 | std::unordered_set<const rocksdb::Cache *> cache_set; |
12459 | dbs.push_back(rdb); |
12460 | std::map<rocksdb::MemoryUtil::UsageType, uint64_t> temp_usage_by_type; |
12461 | rocksdb::MemoryUtil::GetApproximateMemoryUsageByType(dbs, cache_set, |
12462 | &temp_usage_by_type); |
12463 | memory_stats.memtable_total = |
12464 | temp_usage_by_type[rocksdb::MemoryUtil::kMemTableTotal]; |
12465 | memory_stats.memtable_unflushed = |
12466 | temp_usage_by_type[rocksdb::MemoryUtil::kMemTableUnFlushed]; |
12467 | } |
12468 | |
// Status variables published under the "rocksdb_" prefix via the
// show_myrocks_vars() SHOW_FUNC below. Values are refreshed by
// myrocks_update_status() / myrocks_update_memory_status() right before
// display.
static SHOW_VAR myrocks_status_variables[] = {
    DEF_STATUS_VAR_FUNC("rows_deleted" , &export_stats.rows_deleted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_inserted" , &export_stats.rows_inserted,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_read" , &export_stats.rows_read, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_updated" , &export_stats.rows_updated,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_deleted_blind" , &export_stats.rows_deleted_blind,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_expired" , &export_stats.rows_expired,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("rows_filtered" , &export_stats.rows_filtered,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_deleted" ,
                        &export_stats.system_rows_deleted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_inserted" ,
                        &export_stats.system_rows_inserted, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_read" , &export_stats.system_rows_read,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("system_rows_updated" ,
                        &export_stats.system_rows_updated, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_total" , &memory_stats.memtable_total,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_unflushed" , &memory_stats.memtable_unflushed,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_point" , &export_stats.queries_point,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("queries_range" , &export_stats.queries_range,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("covered_secondary_key_lookups" ,
                        &export_stats.covered_secondary_key_lookups,
                        SHOW_LONGLONG),

    // End-of-array sentinel required by the SHOW_VAR protocol.
    {NullS, NullS, SHOW_LONG}};
12504 | |
12505 | static void show_myrocks_vars(THD *thd, SHOW_VAR *var, char *buff) { |
12506 | myrocks_update_status(); |
12507 | myrocks_update_memory_status(); |
12508 | var->type = SHOW_ARRAY; |
12509 | var->value = reinterpret_cast<char *>(&myrocks_status_variables); |
12510 | } |
12511 | |
12512 | static ulonglong |
12513 | io_stall_prop_value(const std::map<std::string, std::string> &props, |
12514 | const std::string &key) { |
12515 | std::map<std::string, std::string>::const_iterator iter = |
12516 | props.find("io_stalls." + key); |
12517 | if (iter != props.end()) { |
12518 | return std::stoull(iter->second); |
12519 | } else { |
12520 | DBUG_PRINT("warning" , |
12521 | ("RocksDB GetMapPropery hasn't returned key=%s" , key.c_str())); |
12522 | DBUG_ASSERT(0); |
12523 | return 0; |
12524 | } |
12525 | } |
12526 | |
12527 | static void update_rocksdb_stall_status() { |
12528 | st_io_stall_stats local_io_stall_stats; |
12529 | for (const auto &cf_name : cf_manager.get_cf_names()) { |
12530 | rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name); |
12531 | if (cfh == nullptr) { |
12532 | continue; |
12533 | } |
12534 | |
12535 | std::map<std::string, std::string> props; |
12536 | if (!rdb->GetMapProperty(cfh, "rocksdb.cfstats" , &props)) { |
12537 | continue; |
12538 | } |
12539 | |
12540 | local_io_stall_stats.level0_slowdown += |
12541 | io_stall_prop_value(props, "level0_slowdown" ); |
12542 | local_io_stall_stats.level0_slowdown_with_compaction += |
12543 | io_stall_prop_value(props, "level0_slowdown_with_compaction" ); |
12544 | local_io_stall_stats.level0_numfiles += |
12545 | io_stall_prop_value(props, "level0_numfiles" ); |
12546 | local_io_stall_stats.level0_numfiles_with_compaction += |
12547 | io_stall_prop_value(props, "level0_numfiles_with_compaction" ); |
12548 | local_io_stall_stats.stop_for_pending_compaction_bytes += |
12549 | io_stall_prop_value(props, "stop_for_pending_compaction_bytes" ); |
12550 | local_io_stall_stats.slowdown_for_pending_compaction_bytes += |
12551 | io_stall_prop_value(props, "slowdown_for_pending_compaction_bytes" ); |
12552 | local_io_stall_stats.memtable_compaction += |
12553 | io_stall_prop_value(props, "memtable_compaction" ); |
12554 | local_io_stall_stats.memtable_slowdown += |
12555 | io_stall_prop_value(props, "memtable_slowdown" ); |
12556 | local_io_stall_stats.total_stop += io_stall_prop_value(props, "total_stop" ); |
12557 | local_io_stall_stats.total_slowdown += |
12558 | io_stall_prop_value(props, "total_slowdown" ); |
12559 | } |
12560 | io_stall_stats = local_io_stall_stats; |
12561 | } |
12562 | |
// Write-stall status variables published under the "rocksdb_stall_" prefix
// via show_rocksdb_stall_vars() below; values come from io_stall_stats,
// refreshed by update_rocksdb_stall_status().
static SHOW_VAR rocksdb_stall_status_variables[] = {
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_slowdowns" ,
                        &io_stall_stats.level0_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_slowdowns" ,
                        &io_stall_stats.level0_slowdown_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("l0_file_count_limit_stops" ,
                        &io_stall_stats.level0_numfiles, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("locked_l0_file_count_limit_stops" ,
                        &io_stall_stats.level0_numfiles_with_compaction,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_stops" ,
                        &io_stall_stats.stop_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("pending_compaction_limit_slowdowns" ,
                        &io_stall_stats.slowdown_for_pending_compaction_bytes,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_stops" ,
                        &io_stall_stats.memtable_compaction, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("memtable_limit_slowdowns" ,
                        &io_stall_stats.memtable_slowdown, SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_stops" , &io_stall_stats.total_stop,
                        SHOW_LONGLONG),
    DEF_STATUS_VAR_FUNC("total_slowdowns" , &io_stall_stats.total_slowdown,
                        SHOW_LONGLONG),
    // end of the array marker
    {NullS, NullS, SHOW_LONG}};
12590 | |
12591 | static void show_rocksdb_stall_vars(THD *thd, SHOW_VAR *var, char *buff) { |
12592 | update_rocksdb_stall_status(); |
12593 | var->type = SHOW_ARRAY; |
12594 | var->value = reinterpret_cast<char *>(&rocksdb_stall_status_variables); |
12595 | } |
12596 | |
// Top-level status variable table registered with the server. Ticker-style
// entries (DEF_STATUS_VAR) read RocksDB statistics; pointer entries
// (DEF_STATUS_VAR_PTR) read MyRocks-maintained globals; the two SHOW_FUNC
// entries at the end expand into the nested arrays defined above.
static SHOW_VAR rocksdb_status_vars[] = {
    DEF_STATUS_VAR(block_cache_miss),
    DEF_STATUS_VAR(block_cache_hit),
    DEF_STATUS_VAR(block_cache_add),
    DEF_STATUS_VAR(block_cache_add_failures),
    DEF_STATUS_VAR(block_cache_index_miss),
    DEF_STATUS_VAR(block_cache_index_hit),
    DEF_STATUS_VAR(block_cache_index_add),
    DEF_STATUS_VAR(block_cache_index_bytes_insert),
    DEF_STATUS_VAR(block_cache_index_bytes_evict),
    DEF_STATUS_VAR(block_cache_filter_miss),
    DEF_STATUS_VAR(block_cache_filter_hit),
    DEF_STATUS_VAR(block_cache_filter_add),
    DEF_STATUS_VAR(block_cache_filter_bytes_insert),
    DEF_STATUS_VAR(block_cache_filter_bytes_evict),
    DEF_STATUS_VAR(block_cache_bytes_read),
    DEF_STATUS_VAR(block_cache_bytes_write),
    DEF_STATUS_VAR(block_cache_data_bytes_insert),
    DEF_STATUS_VAR(block_cache_data_miss),
    DEF_STATUS_VAR(block_cache_data_hit),
    DEF_STATUS_VAR(block_cache_data_add),
    DEF_STATUS_VAR(bloom_filter_useful),
    DEF_STATUS_VAR(memtable_hit),
    DEF_STATUS_VAR(memtable_miss),
    DEF_STATUS_VAR(get_hit_l0),
    DEF_STATUS_VAR(get_hit_l1),
    DEF_STATUS_VAR(get_hit_l2_and_up),
    DEF_STATUS_VAR(compaction_key_drop_new),
    DEF_STATUS_VAR(compaction_key_drop_obsolete),
    DEF_STATUS_VAR(compaction_key_drop_user),
    DEF_STATUS_VAR(number_keys_written),
    DEF_STATUS_VAR(number_keys_read),
    DEF_STATUS_VAR(number_keys_updated),
    DEF_STATUS_VAR(bytes_written),
    DEF_STATUS_VAR(bytes_read),
    DEF_STATUS_VAR(number_db_seek),
    DEF_STATUS_VAR(number_db_seek_found),
    DEF_STATUS_VAR(number_db_next),
    DEF_STATUS_VAR(number_db_next_found),
    DEF_STATUS_VAR(number_db_prev),
    DEF_STATUS_VAR(number_db_prev_found),
    DEF_STATUS_VAR(iter_bytes_read),
    DEF_STATUS_VAR(no_file_closes),
    DEF_STATUS_VAR(no_file_opens),
    DEF_STATUS_VAR(no_file_errors),
    DEF_STATUS_VAR(stall_micros),
    DEF_STATUS_VAR(num_iterators),
    DEF_STATUS_VAR(number_multiget_get),
    DEF_STATUS_VAR(number_multiget_keys_read),
    DEF_STATUS_VAR(number_multiget_bytes_read),
    DEF_STATUS_VAR(number_deletes_filtered),
    DEF_STATUS_VAR(number_merge_failures),
    DEF_STATUS_VAR(bloom_filter_prefix_checked),
    DEF_STATUS_VAR(bloom_filter_prefix_useful),
    DEF_STATUS_VAR(number_reseeks_iteration),
    DEF_STATUS_VAR(getupdatessince_calls),
    DEF_STATUS_VAR(block_cachecompressed_miss),
    DEF_STATUS_VAR(block_cachecompressed_hit),
    DEF_STATUS_VAR(wal_synced),
    DEF_STATUS_VAR(wal_bytes),
    DEF_STATUS_VAR(write_self),
    DEF_STATUS_VAR(write_other),
    DEF_STATUS_VAR(write_timedout),
    DEF_STATUS_VAR(write_wal),
    DEF_STATUS_VAR(flush_write_bytes),
    DEF_STATUS_VAR(compact_read_bytes),
    DEF_STATUS_VAR(compact_write_bytes),
    DEF_STATUS_VAR(number_superversion_acquires),
    DEF_STATUS_VAR(number_superversion_releases),
    DEF_STATUS_VAR(number_superversion_cleanups),
    DEF_STATUS_VAR(number_block_not_compressed),
    // Counters maintained directly by MyRocks (not RocksDB tickers).
    DEF_STATUS_VAR_PTR("row_lock_deadlocks" , &rocksdb_row_lock_deadlocks,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("row_lock_wait_timeouts" ,
                       &rocksdb_row_lock_wait_timeouts, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("snapshot_conflict_errors" ,
                       &rocksdb_snapshot_conflict_errors, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("wal_group_syncs" , &rocksdb_wal_group_syncs,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_put" , &rocksdb_num_sst_entry_put,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_delete" , &rocksdb_num_sst_entry_delete,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_singledelete" ,
                       &rocksdb_num_sst_entry_singledelete, SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_merge" , &rocksdb_num_sst_entry_merge,
                       SHOW_LONGLONG),
    DEF_STATUS_VAR_PTR("number_sst_entry_other" , &rocksdb_num_sst_entry_other,
                       SHOW_LONGLONG),
    // the variables generated by SHOW_FUNC are sorted only by prefix (first
    // arg in the tuple below), so make sure it is unique to make sorting
    // deterministic as quick sort is not stable
    {"rocksdb" , reinterpret_cast<char *>(&show_myrocks_vars), SHOW_FUNC},
    {"rocksdb_stall" , reinterpret_cast<char *>(&show_rocksdb_stall_vars),
     SHOW_FUNC},
    {NullS, NullS, SHOW_LONG}};
12693 | |
12694 | /* |
12695 | Background thread's main logic |
12696 | */ |
12697 | |
// Main loop of the MyRocks background thread. Wakes up once per second to
// (a) persist table statistics when requested and (b) flush the RocksDB WAL.
// Exits when another thread signals m_stop via the condition variable.
void Rdb_background_thread::run() {
  // How many seconds to wait till flushing the WAL next time.
  const int WAKE_UP_INTERVAL = 1;

  timespec ts_next_sync;
  set_timespec(ts_next_sync, WAKE_UP_INTERVAL);

  for (;;) {
    // Wait until the next timeout or until we receive a signal to stop the
    // thread. Request to stop the thread should only be triggered when the
    // storage engine is being unloaded.
    RDB_MUTEX_LOCK_CHECK(m_signal_mutex);
    const auto ret MY_ATTRIBUTE((__unused__)) =
        mysql_cond_timedwait(&m_signal_cond, &m_signal_mutex, &ts_next_sync);

    // Check that we receive only the expected error codes.
    DBUG_ASSERT(ret == 0 || ret == ETIMEDOUT);
    // Copy the flags while still holding the mutex, then reset them so the
    // next request is not lost.
    const bool local_stop = m_stop;
    const bool local_save_stats = m_save_stats;
    reset();
    RDB_MUTEX_UNLOCK_CHECK(m_signal_mutex);

    if (local_stop) {
      // If we're here then that's because condition variable was signaled by
      // another thread and we're shutting down. Break out the loop to make
      // sure that shutdown thread can proceed.
      break;
    }

    // This path should be taken only when the timer expired.
    DBUG_ASSERT(ret == ETIMEDOUT);

    if (local_save_stats) {
      ddl_manager.persist_stats();
    }

    // Set the next timestamp for mysql_cond_timedwait() (which ends up calling
    // pthread_cond_timedwait()) to wait on.
    set_timespec(ts_next_sync, WAKE_UP_INTERVAL);

    // Flush the WAL. Sync it for both background and never modes to copy
    // InnoDB's behavior. For mode never, the wal file isn't even written,
    // whereas background writes to the wal file, but issues the syncs in a
    // background thread.
    if (rdb && (rocksdb_flush_log_at_trx_commit != FLUSH_LOG_SYNC) &&
        !rocksdb_db_options->allow_mmap_writes) {
      const rocksdb::Status s = rdb->FlushWAL(true);
      if (!s.ok()) {
        rdb_handle_io_error(s, RDB_IO_ERROR_BG_THREAD);
      }
    }
  }

  // save remaining stats which might've left unsaved
  ddl_manager.persist_stats();
}
12754 | |
12755 | bool ha_rocksdb::check_bloom_and_set_bounds(THD *thd, const Rdb_key_def &kd, |
12756 | const rocksdb::Slice &eq_cond, |
12757 | const bool use_all_keys, |
12758 | uchar *lower_bound_buf, |
12759 | uchar *upper_bound_buf, |
12760 | rocksdb::Slice *out_lower_bound, |
12761 | rocksdb::Slice *out_upper_bound) { |
12762 | bool can_use_bloom = can_use_bloom_filter(thd, kd, eq_cond, use_all_keys); |
12763 | if (!can_use_bloom) { |
12764 | setup_iterator_bounds(kd, eq_cond, |
12765 | lower_bound_buf, upper_bound_buf, |
12766 | out_lower_bound, out_upper_bound); |
12767 | } |
12768 | return can_use_bloom; |
12769 | } |
12770 | |
12771 | /** |
12772 | Deciding if it is possible to use bloom filter or not. |
12773 | |
12774 | @detail |
12775 | Even if bloom filter exists, it is not always possible |
12776 | to use bloom filter. If using bloom filter when you shouldn't, |
12777 | false negative may happen -- fewer rows than expected may be returned. |
12778 | It is users' responsibility to use bloom filter correctly. |
12779 | |
12780 | If bloom filter does not exist, return value does not matter because |
12781 | RocksDB does not use bloom filter internally. |
12782 | |
12783 | @param kd |
12784 | @param eq_cond Equal condition part of the key. This always includes |
12785 | system index id (4 bytes). |
12786 | @param use_all_keys True if all key parts are set with equal conditions. |
12787 | This is aware of extended keys. |
12788 | */ |
12789 | bool ha_rocksdb::can_use_bloom_filter(THD *thd, const Rdb_key_def &kd, |
12790 | const rocksdb::Slice &eq_cond, |
12791 | const bool use_all_keys) { |
12792 | bool can_use = false; |
12793 | |
12794 | if (THDVAR(thd, skip_bloom_filter_on_read)) { |
12795 | return can_use; |
12796 | } |
12797 | |
12798 | const rocksdb::SliceTransform * = kd.get_extractor(); |
12799 | if (prefix_extractor) { |
12800 | /* |
12801 | This is an optimized use case for CappedPrefixTransform. |
12802 | If eq_cond length >= prefix extractor length and if |
12803 | all keys are used for equal lookup, it is |
12804 | always possible to use bloom filter. |
12805 | |
12806 | Prefix bloom filter can't be used on descending scan with |
12807 | prefix lookup (i.e. WHERE id1=1 ORDER BY id2 DESC), because of |
12808 | RocksDB's limitation. On ascending (or not sorting) scan, |
12809 | keys longer than the capped prefix length will be truncated down |
12810 | to the capped length and the resulting key is added to the bloom filter. |
12811 | |
12812 | Keys shorter than the capped prefix length will be added to |
12813 | the bloom filter. When keys are looked up, key conditionals |
12814 | longer than the capped length can be used; key conditionals |
12815 | shorter require all parts of the key to be available |
12816 | for the short key match. |
12817 | */ |
12818 | if ((use_all_keys && prefix_extractor->InRange(eq_cond)) |
12819 | || prefix_extractor->SameResultWhenAppended(eq_cond)) |
12820 | can_use = true; |
12821 | else |
12822 | can_use = false; |
12823 | } else { |
12824 | /* |
12825 | if prefix extractor is not defined, all key parts have to be |
12826 | used by eq_cond. |
12827 | */ |
12828 | if (use_all_keys) |
12829 | can_use = true; |
12830 | else |
12831 | can_use = false; |
12832 | } |
12833 | |
12834 | return can_use; |
12835 | } |
12836 | |
/* For modules that need access to the global data structures */
rocksdb::TransactionDB *rdb_get_rocksdb_db() { return rdb; }

Rdb_cf_manager &rdb_get_cf_manager() { return cf_manager; }

const rocksdb::BlockBasedTableOptions &rdb_get_table_options() {
  return *rocksdb_tbl_options;
}

// Accessors for the TTL-related system variables.
bool rdb_is_ttl_enabled() { return rocksdb_enable_ttl; }
bool rdb_is_ttl_read_filtering_enabled() {
  return rocksdb_enable_ttl_read_filtering;
}
// Debug-build-only hooks used by tests to override TTL timestamps/behavior.
#ifndef NDEBUG
int rdb_dbug_set_ttl_rec_ts() { return rocksdb_debug_ttl_rec_ts; }
int rdb_dbug_set_ttl_snapshot_ts() { return rocksdb_debug_ttl_snapshot_ts; }
int rdb_dbug_set_ttl_read_filter_ts() {
  return rocksdb_debug_ttl_read_filter_ts;
}
bool rdb_dbug_set_ttl_ignore_pk() { return rocksdb_debug_ttl_ignore_pk; }
#endif
12858 | |
12859 | void rdb_update_global_stats(const operation_type &type, uint count, |
12860 | bool is_system_table) { |
12861 | DBUG_ASSERT(type < ROWS_MAX); |
12862 | |
12863 | if (count == 0) { |
12864 | return; |
12865 | } |
12866 | |
12867 | if (is_system_table) { |
12868 | global_stats.system_rows[type].add(count); |
12869 | } else { |
12870 | global_stats.rows[type].add(count); |
12871 | } |
12872 | } |
12873 | |
12874 | int rdb_get_table_perf_counters(const char *const tablename, |
12875 | Rdb_perf_counters *const counters) { |
12876 | DBUG_ASSERT(counters != nullptr); |
12877 | DBUG_ASSERT(tablename != nullptr); |
12878 | |
12879 | Rdb_table_handler *table_handler; |
12880 | table_handler = rdb_open_tables.get_table_handler(tablename); |
12881 | if (table_handler == nullptr) { |
12882 | return HA_ERR_ROCKSDB_INVALID_TABLE; |
12883 | } |
12884 | |
12885 | counters->load(table_handler->m_table_perf_context); |
12886 | |
12887 | rdb_open_tables.release_table_handler(table_handler); |
12888 | return HA_EXIT_SUCCESS; |
12889 | } |
12890 | |
12891 | const char *get_rdb_io_error_string(const RDB_IO_ERROR_TYPE err_type) { |
12892 | // If this assertion fails then this means that a member has been either added |
12893 | // to or removed from RDB_IO_ERROR_TYPE enum and this function needs to be |
12894 | // changed to return the appropriate value. |
12895 | static_assert(RDB_IO_ERROR_LAST == 4, "Please handle all the error types." ); |
12896 | |
12897 | switch (err_type) { |
12898 | case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_TX_COMMIT: |
12899 | return "RDB_IO_ERROR_TX_COMMIT" ; |
12900 | case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_DICT_COMMIT: |
12901 | return "RDB_IO_ERROR_DICT_COMMIT" ; |
12902 | case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_BG_THREAD: |
12903 | return "RDB_IO_ERROR_BG_THREAD" ; |
12904 | case RDB_IO_ERROR_TYPE::RDB_IO_ERROR_GENERAL: |
12905 | return "RDB_IO_ERROR_GENERAL" ; |
12906 | default: |
12907 | DBUG_ASSERT(false); |
12908 | return "(unknown)" ; |
12909 | } |
12910 | } |
12911 | |
12912 | // In case of core dump generation we want this function NOT to be optimized |
12913 | // so that we can capture as much data as possible to debug the root cause |
12914 | // more efficiently. |
12915 | #ifdef __GNUC__ |
12916 | #pragma GCC push_options |
12917 | #pragma GCC optimize("O0") |
12918 | #endif |
12919 | |
// Central error-handling policy for RocksDB status codes. WAL-commit I/O
// errors and any detected corruption abort the server (the data on disk can
// no longer be trusted); background-thread and generic failures are logged
// and tolerated.
void rdb_handle_io_error(const rocksdb::Status status,
                         const RDB_IO_ERROR_TYPE err_type) {
  if (status.IsIOError()) {
    switch (err_type) {
    case RDB_IO_ERROR_TX_COMMIT:
    case RDB_IO_ERROR_DICT_COMMIT: {
      rdb_log_status_error(status, "failed to write to WAL" );
      /* NO_LINT_DEBUG */
      sql_print_error("MyRocks: aborting on WAL write error." );
      // A failed WAL write means the transaction's durability guarantee is
      // broken; crash rather than continue with inconsistent state.
      abort();
      break;
    }
    case RDB_IO_ERROR_BG_THREAD: {
      // Background-thread WAL flushes are best-effort; log and keep running.
      rdb_log_status_error(status, "BG thread failed to write to RocksDB" );
      break;
    }
    case RDB_IO_ERROR_GENERAL: {
      rdb_log_status_error(status, "failed on I/O" );
      /* NO_LINT_DEBUG */
      sql_print_error("MyRocks: aborting on I/O error." );
      abort();
      break;
    }
    default:
      DBUG_ASSERT(0);
      break;
    }
  } else if (status.IsCorruption()) {
    rdb_log_status_error(status, "data corruption detected!" );
    // Persist a marker so the corruption is still known after restart.
    rdb_persist_corruption_marker();
    /* NO_LINT_DEBUG */
    sql_print_error("MyRocks: aborting because of data corruption." );
    abort();
  } else if (!status.ok()) {
    switch (err_type) {
    case RDB_IO_ERROR_DICT_COMMIT: {
      rdb_log_status_error(status, "Failed to write to WAL (dictionary)" );
      /* NO_LINT_DEBUG */
      sql_print_error("MyRocks: aborting on WAL write error." );
      abort();
      break;
    }
    default:
      rdb_log_status_error(status, "Failed to read/write in RocksDB" );
      break;
    }
  }
}
12968 | #ifdef __GNUC__ |
12969 | #pragma GCC pop_options |
12970 | #endif |
12971 | |
// Accessors exposing the global manager singletons to other modules.
Rdb_dict_manager *rdb_get_dict_manager(void) { return &dict_manager; }

Rdb_ddl_manager *rdb_get_ddl_manager(void) { return &ddl_manager; }

Rdb_binlog_manager *rdb_get_binlog_manager(void) { return &binlog_manager; }
12977 | |
// Sysvar update handler shared by the compaction_sequential_deletes* system
// variables: stores the new value, then pushes the combined current set of
// compaction parameters to the properties collector factory.
void rocksdb_set_compaction_options(
    my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr, const void *const save) {
  if (var_ptr && save) {
    *(uint64_t *)var_ptr = *(const uint64_t *)save;
  }
  // Rebuild the full parameter struct from the (now updated) globals.
  const Rdb_compact_params params = {
      (uint64_t)rocksdb_compaction_sequential_deletes,
      (uint64_t)rocksdb_compaction_sequential_deletes_window,
      (uint64_t)rocksdb_compaction_sequential_deletes_file_size};
  if (properties_collector_factory) {
    properties_collector_factory->SetCompactionParams(params);
  }
}
12993 | |
// Sysvar update handler for rocksdb_table_stats_sampling_pct: stores the new
// percentage and forwards it to the properties collector factory (if one was
// created at startup). Serialized by rdb_sysvars_mutex.
void rocksdb_set_table_stats_sampling_pct(
    my_core::THD *const thd MY_ATTRIBUTE((__unused__)),
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  const uint32_t new_val = *static_cast<const uint32_t *>(save);

  if (new_val != rocksdb_table_stats_sampling_pct) {
    rocksdb_table_stats_sampling_pct = new_val;

    if (properties_collector_factory) {
      properties_collector_factory->SetTableStatsSamplingPct(
          rocksdb_table_stats_sampling_pct);
    }
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
13013 | |
13014 | /* |
13015 | This function allows setting the rate limiter's bytes per second value |
13016 | but only if the rate limiter is turned on which has to be done at startup. |
13017 | If the rate is already 0 (turned off) or we are changing it to 0 (trying |
13018 | to turn it off) this function will push a warning to the client and do |
13019 | nothing. |
13020 | This is similar to the code in innodb_doublewrite_update (found in |
13021 | storage/innobase/handler/ha_innodb.cc). |
13022 | */ |
13023 | void rocksdb_set_rate_limiter_bytes_per_sec( |
13024 | my_core::THD *const thd, |
13025 | my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)), |
13026 | void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) { |
13027 | const uint64_t new_val = *static_cast<const uint64_t *>(save); |
13028 | if (new_val == 0 || rocksdb_rate_limiter_bytes_per_sec == 0) { |
13029 | /* |
13030 | If a rate_limiter was not enabled at startup we can't change it nor |
13031 | can we disable it if one was created at startup |
13032 | */ |
13033 | push_warning_printf(thd, Sql_condition::WARN_LEVEL_WARN, ER_WRONG_ARGUMENTS, |
13034 | "RocksDB: rocksdb_rate_limiter_bytes_per_sec cannot " |
13035 | "be dynamically changed to or from 0. Do a clean " |
13036 | "shutdown if you want to change it from or to 0." ); |
13037 | } else if (new_val != rocksdb_rate_limiter_bytes_per_sec) { |
13038 | /* Apply the new value to the rate limiter and store it locally */ |
13039 | DBUG_ASSERT(rocksdb_rate_limiter != nullptr); |
13040 | rocksdb_rate_limiter_bytes_per_sec = new_val; |
13041 | rocksdb_rate_limiter->SetBytesPerSecond(new_val); |
13042 | } |
13043 | } |
13044 | |
// Sysvar update handler for rocksdb_sst_mgr_rate_bytes_per_sec: stores the
// new value and applies it as the SST file manager's delete rate. Serialized
// by rdb_sysvars_mutex.
void rocksdb_set_sst_mgr_rate_bytes_per_sec(
    my_core::THD *const thd,
    my_core::st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)),
    void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) {
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  const uint64_t new_val = *static_cast<const uint64_t *>(save);

  if (new_val != rocksdb_sst_mgr_rate_bytes_per_sec) {
    rocksdb_sst_mgr_rate_bytes_per_sec = new_val;

    rocksdb_db_options->sst_file_manager->SetDeleteRateBytesPerSecond(
        rocksdb_sst_mgr_rate_bytes_per_sec);
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
13062 | |
// Sysvar update handler for rocksdb_delayed_write_rate: updates the cached
// global and pushes the new value into the running DB via SetDBOptions().
// A SetDBOptions() failure is logged but the cached value is kept.
void rocksdb_set_delayed_write_rate(THD *thd, struct st_mysql_sys_var *var,
                                    void *var_ptr, const void *save) {
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  const uint64_t new_val = *static_cast<const uint64_t *>(save);
  if (rocksdb_delayed_write_rate != new_val) {
    rocksdb_delayed_write_rate = new_val;
    rocksdb::Status s =
        rdb->SetDBOptions({{"delayed_write_rate" , std::to_string(new_val)}});

    if (!s.ok()) {
      /* NO_LINT_DEBUG */
      sql_print_warning("MyRocks: failed to update delayed_write_rate. "
                        "status code = %d, status = %s" ,
                        s.code(), s.ToString().c_str());
    }
  }
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
13081 | |
// Sysvar update handler for rocksdb_max_latest_deadlocks: resizes the
// TransactionDB's in-memory deadlock-info buffer.
void rocksdb_set_max_latest_deadlocks(THD *thd, struct st_mysql_sys_var *var,
                                      void *var_ptr, const void *save) {
  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);
  const uint32_t new_val = *static_cast<const uint32_t *>(save);
  if (rocksdb_max_latest_deadlocks != new_val) {
    rocksdb_max_latest_deadlocks = new_val;
    rdb->SetDeadlockInfoBufferSize(rocksdb_max_latest_deadlocks);
  }
  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
13092 | |
// Parse `exception_list` into the global collation-exception pattern set;
// warn (but proceed) when some patterns fail to compile.
void rdb_set_collation_exception_list(const char *const exception_list) {
  DBUG_ASSERT(rdb_collation_exceptions != nullptr);

  if (!rdb_collation_exceptions->set_patterns(exception_list)) {
    my_core::warn_about_bad_patterns(rdb_collation_exceptions,
                                     "strict_collation_exceptions" );
  }
}
13101 | |
// Sysvar update handler for rocksdb_strict_collation_exceptions: recompiles
// the pattern list, then stores a private heap copy of the string in the
// sysvar storage.
void rocksdb_set_collation_exception_list(THD *const thd,
                                          struct st_mysql_sys_var *const var,
                                          void *const var_ptr,
                                          const void *const save) {
  const char *const val = *static_cast<const char *const *>(save);

  rdb_set_collation_exception_list(val == nullptr ? "" : val);

  // Free the previously stored string and replace it with our own heap copy,
  // so the plugin owns the memory backing the variable rather than depending
  // on the lifetime of `save`. NOTE(review): this matches the usual
  // PLUGIN_VAR_MEMALLOC string-sysvar pattern — confirm against the sysvar
  // declaration.
  const char *val_copy= val? my_strdup(val, MYF(0)): nullptr;
  my_free(*static_cast<char**>(var_ptr));
  *static_cast<const char**>(var_ptr) = val_copy;
}
13115 | |
13116 | int mysql_value_to_bool(struct st_mysql_value *value, my_bool *return_value) { |
13117 | int new_value_type = value->value_type(value); |
13118 | if (new_value_type == MYSQL_VALUE_TYPE_STRING) { |
13119 | char buf[16]; |
13120 | int len = sizeof(buf); |
13121 | const char *str = value->val_str(value, buf, &len); |
13122 | if (str && (my_strcasecmp(system_charset_info, "true" , str) == 0 || |
13123 | my_strcasecmp(system_charset_info, "on" , str) == 0)) { |
13124 | *return_value = TRUE; |
13125 | } else if (str && (my_strcasecmp(system_charset_info, "false" , str) == 0 || |
13126 | my_strcasecmp(system_charset_info, "off" , str) == 0)) { |
13127 | *return_value = FALSE; |
13128 | } else { |
13129 | return 1; |
13130 | } |
13131 | } else if (new_value_type == MYSQL_VALUE_TYPE_INT) { |
13132 | long long intbuf; |
13133 | value->val_int(value, &intbuf); |
13134 | if (intbuf > 1) |
13135 | return 1; |
13136 | *return_value = intbuf > 0 ? TRUE : FALSE; |
13137 | } else { |
13138 | return 1; |
13139 | } |
13140 | |
13141 | return 0; |
13142 | } |
13143 | |
// Sysvar check handler for rocksdb_bulk_load. Before the variable can be
// toggled, any in-flight bulk load on this connection must be finalized;
// a finalization failure rejects the SET and forces bulk_load off.
// Returns 0 to accept the new value, 1 to reject it.
int rocksdb_check_bulk_load(
    THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
    void *save, struct st_mysql_value *value) {
  my_bool new_value;
  if (mysql_value_to_bool(value, &new_value) != 0) {
    return 1;
  }

  Rdb_transaction *&tx = get_tx_from_thd(thd);
  if (tx != nullptr) {
    // Flush/close the SST files of the current bulk load, if any.
    const int rc = tx->finish_bulk_load();
    if (rc != 0) {
      // NO_LINT_DEBUG
      sql_print_error("RocksDB: Error %d finalizing last SST file while "
                      "setting bulk loading variable" ,
                      rc);
      THDVAR(thd, bulk_load) = 0;
      return 1;
    }
  }

  *static_cast<bool *>(save) = new_value;
  return 0;
}
13168 | |
// Sysvar check handler for rocksdb_bulk_load_allow_unsorted: the setting may
// only change while bulk load is NOT active on this connection.
// Returns 0 to accept the new value, 1 to reject it.
int rocksdb_check_bulk_load_allow_unsorted(
    THD *const thd, struct st_mysql_sys_var *var MY_ATTRIBUTE((__unused__)),
    void *save, struct st_mysql_value *value) {
  my_bool new_value;
  if (mysql_value_to_bool(value, &new_value) != 0) {
    return 1;
  }

  if (THDVAR(thd, bulk_load)) {
    my_error(ER_ERROR_WHEN_EXECUTING_COMMAND, MYF(0), "SET" ,
             "Cannot change this setting while bulk load is enabled" );

    return 1;
  }

  *static_cast<bool *>(save) = new_value;
  return 0;
}
13187 | |
// Sysvar update handler for rocksdb_max_background_jobs: updates the cached
// DB options and applies the new value to the running DB via SetDBOptions().
// A SetDBOptions() failure is logged but the cached value is kept.
static void rocksdb_set_max_background_jobs(THD *thd,
                                            struct st_mysql_sys_var *const var,
                                            void *const var_ptr,
                                            const void *const save) {
  DBUG_ASSERT(save != nullptr);
  DBUG_ASSERT(rocksdb_db_options != nullptr);
  DBUG_ASSERT(rocksdb_db_options->env != nullptr);

  RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex);

  const int new_val = *static_cast<const int *>(save);

  if (rocksdb_db_options->max_background_jobs != new_val) {
    rocksdb_db_options->max_background_jobs = new_val;
    rocksdb::Status s =
        rdb->SetDBOptions({{"max_background_jobs" , std::to_string(new_val)}});

    if (!s.ok()) {
      /* NO_LINT_DEBUG */
      sql_print_warning("MyRocks: failed to update max_background_jobs. "
                        "Status code = %d, status = %s." ,
                        s.code(), s.ToString().c_str());
    }
  }

  RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex);
}
13215 | |
13216 | static void rocksdb_set_bytes_per_sync( |
13217 | THD *thd MY_ATTRIBUTE((__unused__)), |
13218 | struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)), |
13219 | void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) { |
13220 | DBUG_ASSERT(save != nullptr); |
13221 | DBUG_ASSERT(rocksdb_db_options != nullptr); |
13222 | DBUG_ASSERT(rocksdb_db_options->env != nullptr); |
13223 | |
13224 | RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex); |
13225 | |
13226 | const ulonglong new_val = *static_cast<const ulonglong *>(save); |
13227 | |
13228 | if (rocksdb_db_options->bytes_per_sync != new_val) { |
13229 | rocksdb_db_options->bytes_per_sync = new_val; |
13230 | rocksdb::Status s = |
13231 | rdb->SetDBOptions({{"bytes_per_sync" , std::to_string(new_val)}}); |
13232 | |
13233 | if (!s.ok()) { |
13234 | /* NO_LINT_DEBUG */ |
13235 | sql_print_warning("MyRocks: failed to update max_background_jobs. " |
13236 | "Status code = %d, status = %s." , |
13237 | s.code(), s.ToString().c_str()); |
13238 | } |
13239 | } |
13240 | |
13241 | RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); |
13242 | } |
13243 | |
13244 | static void rocksdb_set_wal_bytes_per_sync( |
13245 | THD *thd MY_ATTRIBUTE((__unused__)), |
13246 | struct st_mysql_sys_var *const var MY_ATTRIBUTE((__unused__)), |
13247 | void *const var_ptr MY_ATTRIBUTE((__unused__)), const void *const save) { |
13248 | DBUG_ASSERT(save != nullptr); |
13249 | DBUG_ASSERT(rocksdb_db_options != nullptr); |
13250 | DBUG_ASSERT(rocksdb_db_options->env != nullptr); |
13251 | |
13252 | RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex); |
13253 | |
13254 | const ulonglong new_val = *static_cast<const ulonglong *>(save); |
13255 | |
13256 | if (rocksdb_db_options->wal_bytes_per_sync != new_val) { |
13257 | rocksdb_db_options->wal_bytes_per_sync = new_val; |
13258 | rocksdb::Status s = |
13259 | rdb->SetDBOptions({{"wal_bytes_per_sync" , std::to_string(new_val)}}); |
13260 | |
13261 | if (!s.ok()) { |
13262 | /* NO_LINT_DEBUG */ |
13263 | sql_print_warning("MyRocks: failed to update max_background_jobs. " |
13264 | "Status code = %d, status = %s." , |
13265 | s.code(), s.ToString().c_str()); |
13266 | } |
13267 | } |
13268 | |
13269 | RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); |
13270 | } |
13271 | |
13272 | static int |
13273 | rocksdb_validate_update_cf_options(THD * /* unused */, |
13274 | struct st_mysql_sys_var * /*unused*/, |
13275 | void *save, struct st_mysql_value *value) { |
13276 | |
13277 | char buff[STRING_BUFFER_USUAL_SIZE]; |
13278 | const char *str; |
13279 | int length; |
13280 | length = sizeof(buff); |
13281 | str = value->val_str(value, buff, &length); |
13282 | *(const char **)save = str; |
13283 | |
13284 | if (str == nullptr) { |
13285 | return HA_EXIT_SUCCESS; |
13286 | } |
13287 | |
13288 | Rdb_cf_options::Name_to_config_t option_map; |
13289 | |
13290 | // Basic sanity checking and parsing the options into a map. If this fails |
13291 | // then there's no point to proceed. |
13292 | if (!Rdb_cf_options::parse_cf_options(str, &option_map)) { |
13293 | my_error(ER_WRONG_VALUE_FOR_VAR, MYF(0), "rocksdb_update_cf_options" , str); |
13294 | return HA_EXIT_FAILURE; |
13295 | } |
13296 | return HA_EXIT_SUCCESS; |
13297 | } |
13298 | |
13299 | static void |
13300 | rocksdb_set_update_cf_options(THD *const /* unused */, |
13301 | struct st_mysql_sys_var *const /* unused */, |
13302 | void *const var_ptr, const void *const save) { |
13303 | const char *const val = *static_cast<const char *const *>(save); |
13304 | |
13305 | RDB_MUTEX_LOCK_CHECK(rdb_sysvars_mutex); |
13306 | |
13307 | if (!val) { |
13308 | *reinterpret_cast<char **>(var_ptr) = nullptr; |
13309 | RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); |
13310 | return; |
13311 | } |
13312 | |
13313 | DBUG_ASSERT(val != nullptr); |
13314 | |
13315 | // Reset the pointers regardless of how much success we had with updating |
13316 | // the CF options. This will results in consistent behavior and avoids |
13317 | // dealing with cases when only a subset of CF-s was successfully updated. |
13318 | *reinterpret_cast<char **>(var_ptr) = my_strdup(val, MYF(0)); |
13319 | |
13320 | // Do the real work of applying the changes. |
13321 | Rdb_cf_options::Name_to_config_t option_map; |
13322 | |
13323 | // This should never fail, because of rocksdb_validate_update_cf_options |
13324 | if (!Rdb_cf_options::parse_cf_options(val, &option_map)) { |
13325 | my_free(*reinterpret_cast<char**>(var_ptr)); |
13326 | RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); |
13327 | return; |
13328 | } |
13329 | |
13330 | // For each CF we have, see if we need to update any settings. |
13331 | for (const auto &cf_name : cf_manager.get_cf_names()) { |
13332 | DBUG_ASSERT(!cf_name.empty()); |
13333 | |
13334 | rocksdb::ColumnFamilyHandle *cfh = cf_manager.get_cf(cf_name); |
13335 | DBUG_ASSERT(cfh != nullptr); |
13336 | |
13337 | const auto it = option_map.find(cf_name); |
13338 | std::string per_cf_options = (it != option_map.end()) ? it->second : "" ; |
13339 | |
13340 | if (!per_cf_options.empty()) { |
13341 | Rdb_cf_options::Name_to_config_t opt_map; |
13342 | rocksdb::Status s = rocksdb::StringToMap(per_cf_options, &opt_map); |
13343 | |
13344 | if (s != rocksdb::Status::OK()) { |
13345 | // NO_LINT_DEBUG |
13346 | sql_print_warning("MyRocks: failed to convert the options for column " |
13347 | "family '%s' to a map. %s" , cf_name.c_str(), |
13348 | s.ToString().c_str()); |
13349 | } else { |
13350 | DBUG_ASSERT(rdb != nullptr); |
13351 | |
13352 | // Finally we can apply the options. |
13353 | s = rdb->SetOptions(cfh, opt_map); |
13354 | |
13355 | if (s != rocksdb::Status::OK()) { |
13356 | // NO_LINT_DEBUG |
13357 | sql_print_warning("MyRocks: failed to apply the options for column " |
13358 | "family '%s'. %s" , cf_name.c_str(), |
13359 | s.ToString().c_str()); |
13360 | } else { |
13361 | // NO_LINT_DEBUG |
13362 | sql_print_information("MyRocks: options for column family '%s' " |
13363 | "have been successfully updated." , |
13364 | cf_name.c_str()); |
13365 | |
13366 | // Make sure that data is internally consistent as well and update |
13367 | // the CF options. This is necessary also to make sure that the CF |
13368 | // options will be correctly reflected in the relevant table: |
13369 | // ROCKSDB_CF_OPTIONS in INFORMATION_SCHEMA. |
13370 | rocksdb::ColumnFamilyOptions cf_options = rdb->GetOptions(cfh); |
13371 | std::string updated_options; |
13372 | |
13373 | s = rocksdb::GetStringFromColumnFamilyOptions(&updated_options, |
13374 | cf_options); |
13375 | |
13376 | DBUG_ASSERT(s == rocksdb::Status::OK()); |
13377 | DBUG_ASSERT(!updated_options.empty()); |
13378 | |
13379 | cf_manager.update_options_map(cf_name, updated_options); |
13380 | } |
13381 | } |
13382 | } |
13383 | } |
13384 | |
13385 | // Our caller (`plugin_var_memalloc_global_update`) will call `my_free` to |
13386 | // free up resources used before. |
13387 | |
13388 | RDB_MUTEX_UNLOCK_CHECK(rdb_sysvars_mutex); |
13389 | } |
13390 | |
13391 | void rdb_queue_save_stats_request() { rdb_bg_thread.request_save_stats(); } |
13392 | |
13393 | #ifdef MARIAROCKS_NOT_YET // MDEV-10976 |
13394 | |
/*
  Replication hook: mark that this handler is inside a Delete_rows event so
  later code paths (e.g. read-free replication checks) can detect it.
*/
void ha_rocksdb::rpl_before_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = true;

  DBUG_VOID_RETURN;
}
13402 | |
/*
  Replication hook: clear the Delete_rows-event flag set by
  rpl_before_delete_rows().
*/
void ha_rocksdb::rpl_after_delete_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_delete_rows = false;

  DBUG_VOID_RETURN;
}
13410 | |
/*
  Replication hook: mark that this handler is inside an Update_rows event so
  later code paths (e.g. read-free replication checks) can detect it.
*/
void ha_rocksdb::rpl_before_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = true;

  DBUG_VOID_RETURN;
}
13418 | |
/*
  Replication hook: clear the Update_rows-event flag set by
  rpl_before_update_rows().
*/
void ha_rocksdb::rpl_after_update_rows() {
  DBUG_ENTER_FUNC();

  m_in_rpl_update_rows = false;

  DBUG_VOID_RETURN;
}
13426 | |
13427 | /** |
13428 | @brief |
13429 | Read Free Replication can be used or not. Returning False means |
13430 | Read Free Replication can be used. Read Free Replication can be used |
13431 | on UPDATE or DELETE row events, and table must have user defined |
13432 | primary key. |
13433 | */ |
13434 | bool ha_rocksdb::use_read_free_rpl() { |
13435 | DBUG_ENTER_FUNC(); |
13436 | |
13437 | DBUG_RETURN((m_in_rpl_delete_rows || m_in_rpl_update_rows) && |
13438 | !has_hidden_pk(table) && m_use_read_free_rpl); |
13439 | } |
13440 | #endif // MARIAROCKS_NOT_YET |
13441 | |
13442 | double ha_rocksdb::read_time(uint index, uint ranges, ha_rows rows) { |
13443 | DBUG_ENTER_FUNC(); |
13444 | |
13445 | if (index != table->s->primary_key) { |
13446 | /* Non covering index range scan */ |
13447 | DBUG_RETURN(handler::read_time(index, ranges, rows)); |
13448 | } |
13449 | |
13450 | DBUG_RETURN((rows / 20.0) + 1); |
13451 | } |
13452 | |
13453 | std::string rdb_corruption_marker_file_name() { |
13454 | std::string ret(rocksdb_datadir); |
13455 | ret.append("/ROCKSDB_CORRUPTED" ); |
13456 | return ret; |
13457 | } |
13458 | |
13459 | void sql_print_verbose_info(const char *format, ...) |
13460 | { |
13461 | va_list args; |
13462 | |
13463 | if (global_system_variables.log_warnings > 2) { |
13464 | va_start(args, format); |
13465 | sql_print_information_v(format, args); |
13466 | va_end(args); |
13467 | } |
13468 | } |
13469 | |
13470 | } // namespace myrocks |
13471 | |
13472 | |
13473 | /** |
13474 | Construct and emit duplicate key error message using information |
13475 | from table's record buffer. |
13476 | |
13477 | @sa print_keydup_error(table, key, msg, errflag, thd, org_table_name). |
13478 | */ |
13479 | |
/*
  Thin overload that forwards to the message-taking print_keydup_error()
  with the standard ER_DUP_ENTRY_WITH_KEY_NAME format.
  The 'thd' and 'org_table_name' parameters are accepted for signature
  compatibility but are not used by this overload.
*/
void print_keydup_error(TABLE *table, KEY *key, myf errflag,
                        const THD *thd, const char *org_table_name)
{
  print_keydup_error(table, key, ER(ER_DUP_ENTRY_WITH_KEY_NAME), errflag);
}
13485 | |
13486 | /* |
13487 | Register the storage engine plugin outside of myrocks namespace |
13488 | so that mysql_declare_plugin does not get confused when it does |
13489 | its name generation. |
13490 | */ |
13491 | |
13492 | |
/* Storage engine descriptor handed to the plugin registration below. */
struct st_mysql_storage_engine rocksdb_storage_engine = {
    MYSQL_HANDLERTON_INTERFACE_VERSION};
13495 | |
/* Plugin registration: the storage engine itself, followed by the
   INFORMATION_SCHEMA plugins that expose MyRocks internals. */
maria_declare_plugin(rocksdb_se){
    MYSQL_STORAGE_ENGINE_PLUGIN,       /* Plugin Type */
    &rocksdb_storage_engine,           /* Plugin Descriptor */
    "ROCKSDB" ,                        /* Plugin Name */
    "Monty Program Ab" ,               /* Plugin Author */
    "RocksDB storage engine" ,         /* Plugin Description */
    PLUGIN_LICENSE_GPL,                /* Plugin Licence */
    myrocks::rocksdb_init_func,        /* Plugin Entry Point */
    myrocks::rocksdb_done_func,        /* Plugin Deinitializer */
    0x0001,                            /* version number (0.1) */
    myrocks::rocksdb_status_vars,      /* status variables */
    myrocks::rocksdb_system_variables, /* system variables */
    "1.0" ,                            /* string version */
    myrocks::MYROCKS_MARIADB_PLUGIN_MATURITY_LEVEL
},
    /* INFORMATION_SCHEMA table plugins (ROCKSDB_CFSTATS, ROCKSDB_DBSTATS,
       perf context, CF options, DDL map, lock/transaction info, etc.). */
    myrocks::rdb_i_s_cfstats, myrocks::rdb_i_s_dbstats,
    myrocks::rdb_i_s_perf_context, myrocks::rdb_i_s_perf_context_global,
    myrocks::rdb_i_s_cfoptions, myrocks::rdb_i_s_compact_stats,
    myrocks::rdb_i_s_global_info, myrocks::rdb_i_s_ddl,
    myrocks::rdb_i_s_index_file_map, myrocks::rdb_i_s_lock_info,
    myrocks::rdb_i_s_trx_info,
    myrocks::rdb_i_s_deadlock_info
maria_declare_plugin_end;
13519 | |