| 1 | /***************************************************************************** |
| 2 | |
| 3 | Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved. |
| 4 | Copyright (C) 2014, 2018, MariaDB Corporation. |
| 5 | |
| 6 | This program is free software; you can redistribute it and/or modify it under |
| 7 | the terms of the GNU General Public License as published by the Free Software |
| 8 | Foundation; version 2 of the License. |
| 9 | |
| 10 | This program is distributed in the hope that it will be useful, but WITHOUT |
| 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
| 13 | |
| 14 | You should have received a copy of the GNU General Public License along with |
| 15 | this program; if not, write to the Free Software Foundation, Inc., |
| 16 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
| 17 | |
| 18 | *****************************************************************************/ |
| 19 | /**************************************************//** |
| 20 | @file btr/btr0defragment.cc |
| 21 | Index defragmentation. |
| 22 | |
| 23 | Created 05/29/2014 Rongrong Zhong |
| 24 | Modified 16/07/2014 Sunguck Lee |
| 25 | Modified 30/07/2014 Jan Lindström jan.lindstrom@mariadb.com |
| 26 | *******************************************************/ |
| 27 | |
| 28 | #include "btr0defragment.h" |
| 29 | #include "btr0btr.h" |
| 30 | #include "btr0cur.h" |
| 31 | #include "btr0sea.h" |
| 32 | #include "btr0pcur.h" |
| 33 | #include "dict0stats.h" |
| 34 | #include "dict0stats_bg.h" |
| 35 | #include "dict0defrag_bg.h" |
| 36 | #include "ibuf0ibuf.h" |
| 37 | #include "lock0lock.h" |
| 38 | #include "srv0start.h" |
| 39 | #include "ut0timer.h" |
| 40 | |
| 41 | #include <list> |
| 42 | |
| 43 | /**************************************************//** |
| 44 | Custom nullptr implementation for under g++ 4.6 |
| 45 | *******************************************************/ |
| 46 | // #pragma once |
| 47 | /* |
| 48 | namespace std |
| 49 | { |
| 50 | // based on SC22/WG21/N2431 = J16/07-0301 |
| 51 | struct nullptr_t |
| 52 | { |
| 53 | template<typename any> operator any * () const |
| 54 | { |
| 55 | return 0; |
| 56 | } |
| 57 | template<class any, typename T> operator T any:: * () const |
| 58 | { |
| 59 | return 0; |
| 60 | } |
| 61 | |
| 62 | #ifdef _MSC_VER |
| 63 | struct pad {}; |
| 64 | pad __[sizeof(void*)/sizeof(pad)]; |
| 65 | #else |
| 66 | char __[sizeof(void*)]; |
| 67 | #endif |
| 68 | private: |
| 69 | // nullptr_t();// {} |
| 70 | // nullptr_t(const nullptr_t&); |
| 71 | // void operator = (const nullptr_t&); |
| 72 | void operator &() const; |
| 73 | template<typename any> void operator +(any) const |
| 74 | { |
| 75 | // I Love MSVC 2005! |
| 76 | } |
| 77 | template<typename any> void operator -(any) const |
| 78 | { |
| 79 | // I Love MSVC 2005! |
| 80 | } |
| 81 | }; |
| 82 | static const nullptr_t __nullptr = {}; |
| 83 | } |
| 84 | |
| 85 | #ifndef nullptr |
| 86 | #define nullptr std::__nullptr |
| 87 | #endif |
| 88 | */ |
| 89 | |
| 90 | /**************************************************//** |
| 91 | End of Custom nullptr implementation for under g++ 4.6 |
| 92 | *******************************************************/ |
| 93 | |
/* When there is no work, either because defragmentation is disabled or
because no query has submitted a request, the thread checks the state every
BTR_DEFRAGMENT_SLEEP_IN_USECS microseconds. */
| 96 | #define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000 |
/* Reduce the target page size by this amount when a compression failure
happens during defragmentation. 512 is chosen because it is a power of 2 and
roughly 3% of the page size. When there are compression failures during
defragmentation, our goal is to get a decent defragmentation ratio with as
few compression failures as possible. From experimentation it seems that
reducing the target size by 512 each time will make the page compressible
within a couple of iterations. */
| 103 | #define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512 |
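/* Worked example (assuming the default 16 KiB page size, which is an
assumption of this comment rather than something enforced here):
512 / 16384 is about 3.1%, so each retry lowers the target by roughly 3%
of the page, and a page that compresses at all typically fits within a
few retries. */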
| 104 | |
| 105 | /* Work queue for defragmentation. */ |
| 106 | typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t; |
| 107 | static btr_defragment_wq_t btr_defragment_wq; |
| 108 | |
| 109 | /* Mutex protecting the defragmentation work queue.*/ |
| 110 | ib_mutex_t btr_defragment_mutex; |
| 111 | #ifdef UNIV_PFS_MUTEX |
| 112 | UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key; |
| 113 | #endif /* UNIV_PFS_MUTEX */ |
| 114 | |
| 115 | /* Number of compression failures caused by defragmentation since server |
| 116 | start. */ |
| 117 | ulint btr_defragment_compression_failures = 0; |
/* Number of btr_defragment_n_pages calls that altered a page but did not
manage to release any page. */
ulint btr_defragment_failures = 0;
/* Total number of btr_defragment_n_pages calls that altered a page.
| 122 | The difference between btr_defragment_count and btr_defragment_failures shows |
| 123 | the amount of effort wasted. */ |
| 124 | ulint btr_defragment_count = 0; |
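/* Illustrative reading of the counters above (hypothetical numbers): with
btr_defragment_count == 100 and btr_defragment_failures == 30, 70 of the 100
page-altering passes released at least one page; the other 30 were wasted
effort. */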
| 125 | |
| 126 | /******************************************************************//** |
| 127 | Constructor for btr_defragment_item_t. */ |
| 128 | btr_defragment_item_t::btr_defragment_item_t( |
| 129 | btr_pcur_t* pcur, |
| 130 | os_event_t event) |
| 131 | { |
| 132 | this->pcur = pcur; |
| 133 | this->event = event; |
| 134 | this->removed = false; |
| 135 | this->last_processed = 0; |
| 136 | } |
| 137 | |
| 138 | /******************************************************************//** |
| 139 | Destructor for btr_defragment_item_t. */ |
| 140 | btr_defragment_item_t::~btr_defragment_item_t() { |
| 141 | if (this->pcur) { |
| 142 | btr_pcur_free_for_mysql(this->pcur); |
| 143 | } |
| 144 | if (this->event) { |
| 145 | os_event_set(this->event); |
| 146 | } |
| 147 | } |
| 148 | |
| 149 | /******************************************************************//** |
| 150 | Initialize defragmentation. */ |
| 151 | void |
| 152 | btr_defragment_init() |
| 153 | { |
| 154 | srv_defragment_interval = ut_microseconds_to_timer( |
| 155 | (ulonglong) (1000000.0 / srv_defragment_frequency)); |
| 156 | mutex_create(LATCH_ID_BTR_DEFRAGMENT_MUTEX, &btr_defragment_mutex); |
| 157 | } |
| 158 | |
| 159 | /******************************************************************//** |
| 160 | Shutdown defragmentation. Release all resources. */ |
| 161 | void |
| 162 | btr_defragment_shutdown() |
| 163 | { |
| 164 | mutex_enter(&btr_defragment_mutex); |
| 165 | std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); |
| 166 | while(iter != btr_defragment_wq.end()) { |
| 167 | btr_defragment_item_t* item = *iter; |
| 168 | iter = btr_defragment_wq.erase(iter); |
| 169 | delete item; |
| 170 | } |
| 171 | mutex_exit(&btr_defragment_mutex); |
| 172 | mutex_free(&btr_defragment_mutex); |
| 173 | } |
| 174 | |
| 175 | |
| 176 | /******************************************************************//** |
| 177 | Functions used by the query threads: btr_defragment_xxx_index |
| 178 | Query threads find/add/remove index. */ |
| 179 | /******************************************************************//** |
| 180 | Check whether the given index is in btr_defragment_wq. We use index->id |
| 181 | to identify indices. */ |
| 182 | bool |
| 183 | btr_defragment_find_index( |
| 184 | dict_index_t* index) /*!< Index to find. */ |
| 185 | { |
| 186 | mutex_enter(&btr_defragment_mutex); |
| 187 | for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); |
| 188 | iter != btr_defragment_wq.end(); |
| 189 | ++iter) { |
| 190 | btr_defragment_item_t* item = *iter; |
| 191 | btr_pcur_t* pcur = item->pcur; |
| 192 | btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); |
| 193 | dict_index_t* idx = btr_cur_get_index(cursor); |
| 194 | if (index->id == idx->id) { |
| 195 | mutex_exit(&btr_defragment_mutex); |
| 196 | return true; |
| 197 | } |
| 198 | } |
| 199 | mutex_exit(&btr_defragment_mutex); |
| 200 | return false; |
| 201 | } |
| 202 | |
| 203 | /******************************************************************//** |
| 204 | Query thread uses this function to add an index to btr_defragment_wq. |
| 205 | Return a pointer to os_event for the query thread to wait on if this is a |
| 206 | synchronized defragmentation. */ |
| 207 | os_event_t |
| 208 | btr_defragment_add_index( |
| 209 | dict_index_t* index, /*!< index to be added */ |
| 210 | bool async, /*!< whether this is an async |
| 211 | defragmentation */ |
| 212 | dberr_t* err) /*!< out: error code */ |
| 213 | { |
| 214 | mtr_t mtr; |
| 215 | *err = DB_SUCCESS; |
| 216 | |
| 217 | mtr_start(&mtr); |
	// Load the index root page.
| 219 | buf_block_t* block = btr_block_get( |
| 220 | page_id_t(index->table->space->id, index->page), |
| 221 | page_size_t(index->table->space->flags), |
| 222 | RW_NO_LATCH, index, &mtr); |
| 223 | page_t* page = NULL; |
| 224 | |
| 225 | if (block) { |
| 226 | page = buf_block_get_frame(block); |
| 227 | } |
| 228 | |
| 229 | if (page == NULL && !index->is_readable()) { |
| 230 | mtr_commit(&mtr); |
| 231 | *err = DB_DECRYPTION_FAILED; |
| 232 | return NULL; |
| 233 | } |
| 234 | |
| 235 | ut_ad(page_is_root(page)); |
| 236 | |
| 237 | if (page_is_leaf(page)) { |
| 238 | // Index root is a leaf page, no need to defragment. |
| 239 | mtr_commit(&mtr); |
| 240 | return NULL; |
| 241 | } |
| 242 | btr_pcur_t* pcur = btr_pcur_create_for_mysql(); |
| 243 | os_event_t event = NULL; |
| 244 | if (!async) { |
| 245 | event = os_event_create(0); |
| 246 | } |
| 247 | btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur, |
| 248 | true, 0, &mtr); |
| 249 | btr_pcur_move_to_next(pcur, &mtr); |
| 250 | btr_pcur_store_position(pcur, &mtr); |
| 251 | mtr_commit(&mtr); |
| 252 | dict_stats_empty_defrag_summary(index); |
| 253 | btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event); |
| 254 | mutex_enter(&btr_defragment_mutex); |
| 255 | btr_defragment_wq.push_back(item); |
| 256 | mutex_exit(&btr_defragment_mutex); |
| 257 | return event; |
| 258 | } |
| 259 | |
| 260 | /******************************************************************//** |
When a table is dropped, this function is called to mark all of the table's
work items as removed in btr_defragment_wq. The difference between this
function and the remove_index function is that this one does not set the
event to NULL. */
| 264 | void |
| 265 | btr_defragment_remove_table( |
	dict_table_t* table) /*!< Table whose items are to be removed. */
| 267 | { |
| 268 | mutex_enter(&btr_defragment_mutex); |
| 269 | for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); |
| 270 | iter != btr_defragment_wq.end(); |
| 271 | ++iter) { |
| 272 | btr_defragment_item_t* item = *iter; |
| 273 | btr_pcur_t* pcur = item->pcur; |
| 274 | btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); |
| 275 | dict_index_t* idx = btr_cur_get_index(cursor); |
| 276 | if (table->id == idx->table->id) { |
| 277 | item->removed = true; |
| 278 | } |
| 279 | } |
| 280 | mutex_exit(&btr_defragment_mutex); |
| 281 | } |
| 282 | |
| 283 | /******************************************************************//** |
Query thread uses this function to mark an index as removed in
btr_defragment_wq. */
| 286 | void |
| 287 | btr_defragment_remove_index( |
| 288 | dict_index_t* index) /*!< Index to be removed. */ |
| 289 | { |
| 290 | mutex_enter(&btr_defragment_mutex); |
| 291 | for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); |
| 292 | iter != btr_defragment_wq.end(); |
| 293 | ++iter) { |
| 294 | btr_defragment_item_t* item = *iter; |
| 295 | btr_pcur_t* pcur = item->pcur; |
| 296 | btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); |
| 297 | dict_index_t* idx = btr_cur_get_index(cursor); |
| 298 | if (index->id == idx->id) { |
| 299 | item->removed = true; |
| 300 | item->event = NULL; |
| 301 | break; |
| 302 | } |
| 303 | } |
| 304 | mutex_exit(&btr_defragment_mutex); |
| 305 | } |
| 306 | |
| 307 | /******************************************************************//** |
Functions used by the defragmentation thread: btr_defragment_xxx_item.
The defragmentation thread operates on work *items*. It gets/removes
items from the work queue. */
| 311 | /******************************************************************//** |
| 312 | Defragment thread uses this to remove an item from btr_defragment_wq. |
| 313 | When an item is removed from the work queue, all resources associated with it |
are freed as well. */
| 315 | void |
| 316 | btr_defragment_remove_item( |
| 317 | btr_defragment_item_t* item) /*!< Item to be removed. */ |
| 318 | { |
| 319 | mutex_enter(&btr_defragment_mutex); |
| 320 | for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); |
| 321 | iter != btr_defragment_wq.end(); |
| 322 | ++iter) { |
| 323 | if (item == *iter) { |
| 324 | btr_defragment_wq.erase(iter); |
| 325 | delete item; |
| 326 | break; |
| 327 | } |
| 328 | } |
| 329 | mutex_exit(&btr_defragment_mutex); |
| 330 | } |
| 331 | |
| 332 | /******************************************************************//** |
Defragment thread uses this to get an item from btr_defragment_wq to work on.
The item is not removed from the work queue, so query threads can still
access it. We keep it this way so query threads can find and kill a
defragmentation even while its index is being worked on. Be aware that while
you work on this item you have no lock protection on it whatsoever. This is
OK as long as the query threads and the defragment thread do not modify the
same fields without lock protection: in this file, query threads only set
item->removed and item->event, while the defragment thread reads them and
updates item->last_processed.
*/
| 341 | btr_defragment_item_t* |
| 342 | btr_defragment_get_item() |
| 343 | { |
	mutex_enter(&btr_defragment_mutex);
	if (btr_defragment_wq.empty()) {
		mutex_exit(&btr_defragment_mutex);
		return NULL;
	}
	/* Return the first item without removing it from the queue. */
	btr_defragment_item_t* item = btr_defragment_wq.front();
	mutex_exit(&btr_defragment_mutex);
	return item;
| 357 | } |
| 358 | |
| 359 | /*********************************************************************//** |
| 360 | Check whether we should save defragmentation statistics to persistent storage. |
Currently we queue the index for stats persistence once it has accumulated
srv_defragment_stats_accuracy updates. */
| 362 | UNIV_INTERN |
| 363 | void |
| 364 | btr_defragment_save_defrag_stats_if_needed( |
| 365 | dict_index_t* index) /*!< in: index */ |
| 366 | { |
	if (srv_defragment_stats_accuracy != 0 // 0 means tracking is disabled
	    && index->table->space->id != 0 // do not track the system tablespace
| 369 | && index->stat_defrag_modified_counter |
| 370 | >= srv_defragment_stats_accuracy) { |
| 371 | dict_stats_defrag_pool_add(index); |
| 372 | index->stat_defrag_modified_counter = 0; |
| 373 | } |
| 374 | } |
| 375 | |
| 376 | /*********************************************************************//** |
Main defragmentation functionality used by the defragment thread. */
| 378 | /*************************************************************//** |
Calculate the number of records, counting from the beginning of the block,
that fit into size_limit.
@return number of records */
| 382 | UNIV_INTERN |
| 383 | ulint |
| 384 | btr_defragment_calc_n_recs_for_size( |
| 385 | buf_block_t* block, /*!< in: B-tree page */ |
| 386 | dict_index_t* index, /*!< in: index of the page */ |
| 387 | ulint size_limit, /*!< in: size limit to fit records in */ |
| 388 | ulint* n_recs_size) /*!< out: actual size of the records that fit |
| 389 | in size_limit. */ |
| 390 | { |
| 391 | page_t* page = buf_block_get_frame(block); |
| 392 | ulint n_recs = 0; |
| 393 | ulint offsets_[REC_OFFS_NORMAL_SIZE]; |
| 394 | ulint* offsets = offsets_; |
| 395 | rec_offs_init(offsets_); |
| 396 | mem_heap_t* heap = NULL; |
| 397 | ulint size = 0; |
| 398 | page_cur_t cur; |
| 399 | |
| 400 | page_cur_set_before_first(block, &cur); |
| 401 | page_cur_move_to_next(&cur); |
| 402 | while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) { |
| 403 | rec_t* cur_rec = page_cur_get_rec(&cur); |
| 404 | offsets = rec_get_offsets(cur_rec, index, offsets, |
| 405 | page_is_leaf(page), |
| 406 | ULINT_UNDEFINED, &heap); |
| 407 | ulint rec_size = rec_offs_size(offsets); |
| 408 | size += rec_size; |
| 409 | if (size > size_limit) { |
| 410 | size = size - rec_size; |
| 411 | break; |
| 412 | } |
| 413 | n_recs ++; |
| 414 | page_cur_move_to_next(&cur); |
| 415 | } |
| 416 | *n_recs_size = size; |
| 417 | return n_recs; |
| 418 | } |
| 419 | |
| 420 | /*************************************************************//** |
Merge as many records as possible from from_block into to_block. Free
from_block if all of its records were successfully merged into to_block.
@return the block to use as to_block in the next merge operation. */
| 424 | UNIV_INTERN |
| 425 | buf_block_t* |
| 426 | btr_defragment_merge_pages( |
| 427 | dict_index_t* index, /*!< in: index tree */ |
| 428 | buf_block_t* from_block, /*!< in: origin of merge */ |
| 429 | buf_block_t* to_block, /*!< in: destination of merge */ |
| 430 | const page_size_t page_size, /*!< in: page size of the block */ |
| 431 | ulint reserved_space, /*!< in: space reserved for future |
| 432 | insert to avoid immediate page split */ |
| 433 | ulint* max_data_size, /*!< in/out: max data size to |
| 434 | fit in a single compressed page. */ |
| 435 | mem_heap_t* heap, /*!< in/out: pointer to memory heap */ |
| 436 | mtr_t* mtr) /*!< in/out: mini-transaction */ |
| 437 | { |
| 438 | page_t* from_page = buf_block_get_frame(from_block); |
| 439 | page_t* to_page = buf_block_get_frame(to_block); |
| 440 | ulint level = btr_page_get_level(from_page); |
| 441 | ulint n_recs = page_get_n_recs(from_page); |
| 442 | ulint new_data_size = page_get_data_size(to_page); |
| 443 | ulint max_ins_size = |
| 444 | page_get_max_insert_size(to_page, n_recs); |
| 445 | ulint max_ins_size_reorg = |
| 446 | page_get_max_insert_size_after_reorganize( |
| 447 | to_page, n_recs); |
| 448 | ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space |
| 449 | ? max_ins_size_reorg - reserved_space : 0; |
| 450 | ulint move_size = 0; |
| 451 | ulint n_recs_to_move = 0; |
| 452 | rec_t* rec = NULL; |
| 453 | ulint target_n_recs = 0; |
| 454 | rec_t* orig_pred; |
| 455 | |
| 456 | // Estimate how many records can be moved from the from_page to |
| 457 | // the to_page. |
| 458 | if (page_size.is_compressed()) { |
| 459 | ulint page_diff = srv_page_size - *max_data_size; |
| 460 | max_ins_size_to_use = (max_ins_size_to_use > page_diff) |
| 461 | ? max_ins_size_to_use - page_diff : 0; |
| 462 | } |
| 463 | n_recs_to_move = btr_defragment_calc_n_recs_for_size( |
| 464 | from_block, index, max_ins_size_to_use, &move_size); |
| 465 | |
	// If max_ins_size >= move_size, we can move the records without
	// reorganizing the page; otherwise we need to reorganize the
	// page first to release more space.
| 469 | if (move_size > max_ins_size) { |
| 470 | if (!btr_page_reorganize_block(false, page_zip_level, |
| 471 | to_block, index, |
| 472 | mtr)) { |
| 473 | if (!dict_index_is_clust(index) |
| 474 | && page_is_leaf(to_page)) { |
| 475 | ibuf_reset_free_bits(to_block); |
| 476 | } |
			// If reorganization fails, the page is not
			// compressible. There is no point in trying to
			// merge into this page. Continue to the
			// next page.
| 481 | return from_block; |
| 482 | } |
| 483 | ut_ad(page_validate(to_page, index)); |
| 484 | max_ins_size = page_get_max_insert_size(to_page, n_recs); |
| 485 | ut_a(max_ins_size >= move_size); |
| 486 | } |
| 487 | |
	// Move records to pack to_page as full as possible.
| 489 | orig_pred = NULL; |
| 490 | target_n_recs = n_recs_to_move; |
| 491 | while (n_recs_to_move > 0) { |
| 492 | rec = page_rec_get_nth(from_page, |
| 493 | n_recs_to_move + 1); |
| 494 | orig_pred = page_copy_rec_list_start( |
| 495 | to_block, from_block, rec, index, mtr); |
| 496 | if (orig_pred) |
| 497 | break; |
		// If we reach here, compression failed after packing
		// n_recs_to_move records into to_page. Reduce the targeted
		// data size on the to_page by
		// BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again.
| 502 | my_atomic_addlint( |
| 503 | &btr_defragment_compression_failures, 1); |
| 504 | max_ins_size_to_use = |
| 505 | move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE |
| 506 | ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE |
| 507 | : 0; |
| 508 | if (max_ins_size_to_use == 0) { |
| 509 | n_recs_to_move = 0; |
| 510 | move_size = 0; |
| 511 | break; |
| 512 | } |
| 513 | n_recs_to_move = btr_defragment_calc_n_recs_for_size( |
| 514 | from_block, index, max_ins_size_to_use, &move_size); |
| 515 | } |
	// If fewer than target_n_recs records were moved, there were
	// compression failures during page_copy_rec_list_start. Adjust
	// the max_data_size estimate to reduce compression failures
	// in subsequent runs.
| 520 | if (target_n_recs > n_recs_to_move |
| 521 | && *max_data_size > new_data_size + move_size) { |
| 522 | *max_data_size = new_data_size + move_size; |
| 523 | } |
| 524 | // Set ibuf free bits if necessary. |
| 525 | if (!dict_index_is_clust(index) |
| 526 | && page_is_leaf(to_page)) { |
| 527 | if (page_size.is_compressed()) { |
| 528 | ibuf_reset_free_bits(to_block); |
| 529 | } else { |
| 530 | ibuf_update_free_bits_if_full( |
| 531 | to_block, |
| 532 | srv_page_size, |
| 533 | ULINT_UNDEFINED); |
| 534 | } |
| 535 | } |
| 536 | if (n_recs_to_move == n_recs) { |
| 537 | /* The whole page is merged with the previous page, |
| 538 | free it. */ |
| 539 | lock_update_merge_left(to_block, orig_pred, |
| 540 | from_block); |
| 541 | btr_search_drop_page_hash_index(from_block); |
| 542 | btr_level_list_remove( |
| 543 | index->table->space->id, |
| 544 | page_size, from_page, index, mtr); |
| 545 | btr_node_ptr_delete(index, from_block, mtr); |
| 548 | btr_page_free(index, from_block, mtr); |
| 549 | } else { |
		// There are still records left on the page; the caller will
		// count this page and continue merging into it. If records
		// were moved, the node pointer changes, so the old node
		// pointer is removed and a new one inserted below.
| 553 | if (n_recs_to_move > 0) { |
| 554 | // Part of the page is merged to left, remove |
| 555 | // the merged records, update record locks and |
| 556 | // node pointer. |
| 557 | dtuple_t* node_ptr; |
| 558 | page_delete_rec_list_start(rec, from_block, |
| 559 | index, mtr); |
| 560 | lock_update_split_and_merge(to_block, |
| 561 | orig_pred, |
| 562 | from_block); |
| 563 | btr_node_ptr_delete(index, from_block, mtr); |
| 564 | rec = page_rec_get_next( |
| 565 | page_get_infimum_rec(from_page)); |
| 566 | node_ptr = dict_index_build_node_ptr( |
| 567 | index, rec, page_get_page_no(from_page), |
| 568 | heap, level); |
| 569 | btr_insert_on_non_leaf_level(0, index, level+1, |
| 570 | node_ptr, mtr); |
| 571 | } |
| 572 | to_block = from_block; |
| 573 | } |
| 574 | return to_block; |
| 575 | } |
| 576 | |
| 577 | /*************************************************************//** |
Tries to merge N consecutive pages, starting from the given block.
Skips space 0. Only considers leaf pages.
This function first loads all N pages into memory, then for each of
the pages other than the first page, it tries to move as many records
as possible to the left sibling to keep the left sibling full. During
the process, if any page becomes empty, that page will be removed from
the level list. Record locks, the adaptive hash index, and node pointers
are updated after page reorganization.
@return pointer to the last block processed, or NULL if the end of the index
was reached */
| 587 | UNIV_INTERN |
| 588 | buf_block_t* |
| 589 | btr_defragment_n_pages( |
| 590 | buf_block_t* block, /*!< in: starting block for defragmentation */ |
| 591 | dict_index_t* index, /*!< in: index tree */ |
| 592 | uint n_pages,/*!< in: number of pages to defragment */ |
| 593 | mtr_t* mtr) /*!< in/out: mini-transaction */ |
| 594 | { |
	/* We will need to load the (n+1)th block as well: if the last of the
	n pages is freed, the prev_page_no of the block that follows it must
	be updated. */
| 597 | buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1]; |
| 598 | page_t* first_page; |
| 599 | buf_block_t* current_block; |
| 600 | ulint total_data_size = 0; |
| 601 | ulint total_n_recs = 0; |
| 602 | ulint data_size_per_rec; |
| 603 | ulint optimal_page_size; |
| 604 | ulint reserved_space; |
| 605 | ulint max_data_size = 0; |
| 606 | uint n_defragmented = 0; |
| 607 | uint n_new_slots; |
| 608 | mem_heap_t* heap; |
| 609 | ibool end_of_index = FALSE; |
| 610 | |
| 611 | /* It doesn't make sense to call this function with n_pages = 1. */ |
| 612 | ut_ad(n_pages > 1); |
| 613 | |
| 614 | if (!page_is_leaf(block->frame)) { |
| 615 | return NULL; |
| 616 | } |
| 617 | |
| 618 | if (!index->table->space || !index->table->space->id) { |
| 619 | /* Ignore space 0. */ |
| 620 | return NULL; |
| 621 | } |
| 622 | |
| 623 | if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) { |
| 624 | n_pages = BTR_DEFRAGMENT_MAX_N_PAGES; |
| 625 | } |
| 626 | |
| 627 | first_page = buf_block_get_frame(block); |
| 628 | const page_size_t page_size(index->table->space->flags); |
| 629 | |
| 630 | /* 1. Load the pages and calculate the total data size. */ |
| 631 | blocks[0] = block; |
| 632 | for (uint i = 1; i <= n_pages; i++) { |
| 633 | page_t* page = buf_block_get_frame(blocks[i-1]); |
| 634 | ulint page_no = btr_page_get_next(page, mtr); |
| 635 | total_data_size += page_get_data_size(page); |
| 636 | total_n_recs += page_get_n_recs(page); |
| 637 | if (page_no == FIL_NULL) { |
| 638 | n_pages = i; |
| 639 | end_of_index = TRUE; |
| 640 | break; |
| 641 | } |
| 642 | |
| 643 | blocks[i] = btr_block_get(page_id_t(index->table->space->id, |
| 644 | page_no), page_size, |
| 645 | RW_X_LATCH, index, mtr); |
| 646 | } |
| 647 | |
| 648 | if (n_pages == 1) { |
| 649 | if (!page_has_prev(first_page)) { |
| 650 | /* last page in the index */ |
| 651 | if (dict_index_get_page(index) |
| 652 | == page_get_page_no(first_page)) |
| 653 | return NULL; |
| 654 | /* given page is the last page. |
| 655 | Lift the records to father. */ |
| 656 | btr_lift_page_up(index, block, mtr); |
| 657 | } |
| 658 | return NULL; |
| 659 | } |
| 660 | |
	/* 2. Calculate how many pages the data can fit into. If the page
	count cannot be reduced, return early. */
| 663 | ut_a(total_n_recs != 0); |
| 664 | data_size_per_rec = total_data_size / total_n_recs; |
	// For uncompressed pages, the optimal data size is the free space of
	// an empty page.
| 667 | optimal_page_size = page_get_free_space_of_empty( |
| 668 | page_is_comp(first_page)); |
| 669 | // For compressed pages, we take compression failures into account. |
| 670 | if (page_size.is_compressed()) { |
| 671 | ulint size = 0; |
| 672 | uint i = 0; |
		// We estimate the optimal data size of the index using
		// samples of the data size. These samples are taken when
		// pages fail to compress due to an insertion on the page.
		// We use the average of all samples we have as the estimate.
		// Different pages of the same index vary in compressibility;
		// the average gives a good enough estimate.
| 679 | for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) { |
| 680 | if (index->stat_defrag_data_size_sample[i] == 0) { |
| 681 | break; |
| 682 | } |
| 683 | size += index->stat_defrag_data_size_sample[i]; |
| 684 | } |
| 685 | if (i != 0) { |
| 686 | size /= i; |
| 687 | optimal_page_size = ut_min(optimal_page_size, size); |
| 688 | } |
| 689 | max_data_size = optimal_page_size; |
| 690 | } |
| 691 | |
| 692 | reserved_space = ut_min((ulint)(optimal_page_size |
| 693 | * (1 - srv_defragment_fill_factor)), |
| 694 | (data_size_per_rec |
| 695 | * srv_defragment_fill_factor_n_recs)); |
| 696 | optimal_page_size -= reserved_space; |
| 697 | n_new_slots = uint((total_data_size + optimal_page_size - 1) |
| 698 | / optimal_page_size); |
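	/* Worked example of the two computations above (hypothetical numbers,
	assuming srv_defragment_fill_factor == 0.9 and
	srv_defragment_fill_factor_n_recs == 20): with optimal_page_size
	== 16000 and data_size_per_rec == 100, reserved_space
	= min(16000 * 0.1, 100 * 20) = 1600, the usable size per page drops
	to 14400, and total_data_size == 100000 needs
	n_new_slots == ceil(100000 / 14400) == 7 pages. */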
| 699 | if (n_new_slots >= n_pages) { |
| 700 | /* Can't defragment. */ |
| 701 | if (end_of_index) |
| 702 | return NULL; |
| 703 | return blocks[n_pages-1]; |
| 704 | } |
| 705 | |
| 706 | /* 3. Defragment pages. */ |
| 707 | heap = mem_heap_create(256); |
| 708 | // First defragmented page will be the first page. |
| 709 | current_block = blocks[0]; |
| 710 | // Start from the second page. |
| 711 | for (uint i = 1; i < n_pages; i ++) { |
| 712 | buf_block_t* new_block = btr_defragment_merge_pages( |
| 713 | index, blocks[i], current_block, page_size, |
| 714 | reserved_space, &max_data_size, heap, mtr); |
| 715 | if (new_block != current_block) { |
| 716 | n_defragmented ++; |
| 717 | current_block = new_block; |
| 718 | } |
| 719 | } |
| 720 | mem_heap_free(heap); |
| 721 | n_defragmented ++; |
| 722 | my_atomic_addlint( |
| 723 | &btr_defragment_count, 1); |
| 724 | if (n_pages == n_defragmented) { |
| 725 | my_atomic_addlint( |
| 726 | &btr_defragment_failures, 1); |
| 727 | } else { |
| 728 | index->stat_defrag_n_pages_freed += (n_pages - n_defragmented); |
| 729 | } |
| 730 | if (end_of_index) |
| 731 | return NULL; |
| 732 | return current_block; |
| 733 | } |
| 734 | |
| 735 | /** Whether btr_defragment_thread is active */ |
| 736 | bool btr_defragment_thread_active; |
| 737 | |
| 738 | /** Merge consecutive b-tree pages into fewer pages to defragment indexes */ |
| 739 | extern "C" UNIV_INTERN |
| 740 | os_thread_ret_t |
| 741 | DECLARE_THREAD(btr_defragment_thread)(void*) |
| 742 | { |
| 743 | btr_pcur_t* pcur; |
| 744 | btr_cur_t* cursor; |
| 745 | dict_index_t* index; |
| 746 | mtr_t mtr; |
| 747 | buf_block_t* first_block; |
| 748 | buf_block_t* last_block; |
| 749 | |
| 750 | while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { |
| 751 | ut_ad(btr_defragment_thread_active); |
| 752 | |
| 753 | /* If defragmentation is disabled, sleep before |
| 754 | checking whether it's enabled. */ |
| 755 | if (!srv_defragment) { |
| 756 | os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS); |
| 757 | continue; |
| 758 | } |
		/* The following call won't remove the item from the work
		queue. We only get a pointer to it to work on. This makes
		sure that when a user issues a kill command, all indices are
		still in the work queue to be searched. It also means that
		the user thread cannot directly remove the item from the
		queue (since we might be using it), so the user thread only
		marks the index as removed. */
| 765 | btr_defragment_item_t* item = btr_defragment_get_item(); |
| 766 | /* If work queue is empty, sleep and check later. */ |
| 767 | if (!item) { |
| 768 | os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS); |
| 769 | continue; |
| 770 | } |
		/* If an index is marked as removed, we remove it from the
		work queue. No other thread can be using this item at this
		point, so it is safe to remove it now. */
| 774 | if (item->removed) { |
| 775 | btr_defragment_remove_item(item); |
| 776 | continue; |
| 777 | } |
| 778 | |
| 779 | pcur = item->pcur; |
| 780 | ulonglong now = ut_timer_now(); |
| 781 | ulonglong elapsed = now - item->last_processed; |
| 782 | |
| 783 | if (elapsed < srv_defragment_interval) { |
			/* If we see an index again before the interval
			determined by the configured frequency is reached,
			we just sleep until the interval passes. Since
			defragmentation of all indices queues up on a single
			thread, the indices that follow this one likely do
			not need to sleep again. */
| 790 | os_thread_sleep(((ulint)ut_timer_to_microseconds( |
| 791 | srv_defragment_interval - elapsed))); |
| 792 | } |
| 793 | |
| 794 | now = ut_timer_now(); |
| 795 | mtr_start(&mtr); |
| 796 | cursor = btr_pcur_get_btr_cur(pcur); |
| 797 | index = btr_cur_get_index(cursor); |
| 798 | index->set_modified(mtr); |
| 799 | /* To follow the latching order defined in WL#6326, acquire index->lock X-latch. |
| 800 | This entitles us to acquire page latches in any order for the index. */ |
| 801 | mtr_x_lock(&index->lock, &mtr); |
| 802 | /* This will acquire index->lock SX-latch, which per WL#6363 is allowed |
| 803 | when we are already holding the X-latch. */ |
| 804 | btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr); |
| 805 | first_block = btr_cur_get_block(cursor); |
| 806 | |
| 807 | last_block = btr_defragment_n_pages(first_block, index, |
| 808 | srv_defragment_n_pages, |
| 809 | &mtr); |
| 810 | if (last_block) { |
| 811 | /* If we haven't reached the end of the index, |
| 812 | place the cursor on the last record of last page, |
| 813 | store the cursor position, and put back in queue. */ |
| 814 | page_t* last_page = buf_block_get_frame(last_block); |
| 815 | rec_t* rec = page_rec_get_prev( |
| 816 | page_get_supremum_rec(last_page)); |
| 817 | ut_a(page_rec_is_user_rec(rec)); |
| 818 | page_cur_position(rec, last_block, |
| 819 | btr_cur_get_page_cur(cursor)); |
| 820 | btr_pcur_store_position(pcur, &mtr); |
| 821 | mtr_commit(&mtr); |
| 822 | /* Update the last_processed time of this index. */ |
| 823 | item->last_processed = now; |
| 824 | } else { |
| 825 | dberr_t err = DB_SUCCESS; |
| 826 | mtr_commit(&mtr); |
| 827 | /* Reaching the end of the index. */ |
| 828 | dict_stats_empty_defrag_stats(index); |
| 829 | err = dict_stats_save_defrag_stats(index); |
| 830 | if (err != DB_SUCCESS) { |
| 831 | ib::error() << "Saving defragmentation stats for table " |
| 832 | << index->table->name.m_name |
| 833 | << " index " << index->name() |
| 834 | << " failed with error " << err; |
| 835 | } else { |
| 836 | err = dict_stats_save_defrag_summary(index); |
| 837 | |
| 838 | if (err != DB_SUCCESS) { |
| 839 | ib::error() << "Saving defragmentation summary for table " |
| 840 | << index->table->name.m_name |
| 841 | << " index " << index->name() |
| 842 | << " failed with error " << err; |
| 843 | } |
| 844 | } |
| 845 | |
| 846 | btr_defragment_remove_item(item); |
| 847 | } |
| 848 | } |
| 849 | |
| 850 | btr_defragment_thread_active = false; |
| 851 | os_thread_exit(); |
| 852 | OS_THREAD_DUMMY_RETURN; |
| 853 | } |
| 854 | |