/*****************************************************************************

Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved.
Copyright (C) 2014, 2018, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA

*****************************************************************************/
/**************************************************//**
@file btr/btr0defragment.cc
Index defragmentation.

Created 05/29/2014 Rongrong Zhong
Modified 16/07/2014 Sunguck Lee
Modified 30/07/2014 Jan Lindström jan.lindstrom@mariadb.com
*******************************************************/

#include "btr0defragment.h"
#include "btr0btr.h"
#include "btr0cur.h"
#include "btr0sea.h"
#include "btr0pcur.h"
#include "dict0stats.h"
#include "dict0stats_bg.h"
#include "dict0defrag_bg.h"
#include "ibuf0ibuf.h"
#include "lock0lock.h"
#include "srv0start.h"
#include "ut0timer.h"

#include <list>

/* When there is no work, either because defragmentation is disabled or because
no query has submitted an index, the thread re-checks its state every
BTR_DEFRAGMENT_SLEEP_IN_USECS (one second). */
#define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000
/* Reduce the target page size by this amount when a compression failure happens
during defragmentation. 512 is chosen because it is a power of 2 and it is about
3% of the page size. When there are compression failures in defragmentation,
our goal is to get a decent defragmentation ratio with as few compression
failures as possible. From experimentation it seems that reducing the target
size by 512 each time makes the page compressible within a couple of
iterations. */
#define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512
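/* For illustration: with the default 16 KiB (16384-byte) page size,
512 bytes is 512 / 16384 = 3.125% of the page, hence "about 3%" above. */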

/* Work queue for defragmentation. */
typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t;
static btr_defragment_wq_t btr_defragment_wq;

/* Mutex protecting the defragmentation work queue.*/
ib_mutex_t btr_defragment_mutex;
#ifdef UNIV_PFS_MUTEX
UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key;
#endif /* UNIV_PFS_MUTEX */

/* Number of compression failures caused by defragmentation since server
start. */
ulint btr_defragment_compression_failures = 0;
/* Number of btr_defragment_n_pages calls that altered a page but did not
manage to release any page. */
ulint btr_defragment_failures = 0;
/* Total number of btr_defragment_n_pages calls that altered a page.
The ratio of btr_defragment_failures to btr_defragment_count shows how much
of the defragmentation effort was wasted. */
ulint btr_defragment_count = 0;

/******************************************************************//**
Constructor for btr_defragment_item_t. */
btr_defragment_item_t::btr_defragment_item_t(
	btr_pcur_t* pcur,
	os_event_t event)
{
	this->pcur = pcur;
	this->event = event;
	this->removed = false;
	this->last_processed = 0;
}

/******************************************************************//**
Destructor for btr_defragment_item_t. */
btr_defragment_item_t::~btr_defragment_item_t() {
	if (this->pcur) {
		btr_pcur_free_for_mysql(this->pcur);
	}
	if (this->event) {
		os_event_set(this->event);
	}
}

/******************************************************************//**
Initialize defragmentation. */
void
btr_defragment_init()
{
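	/* srv_defragment_interval is the minimum gap, in timer ticks, between
	two defragmentation passes of the same index. For illustration only:
	if srv_defragment_frequency were 40, the interval would correspond to
	1000000 / 40 = 25000 microseconds (25 ms). */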
	srv_defragment_interval = ut_microseconds_to_timer(
		(ulonglong) (1000000.0 / srv_defragment_frequency));
	mutex_create(LATCH_ID_BTR_DEFRAGMENT_MUTEX, &btr_defragment_mutex);
}

/******************************************************************//**
Shutdown defragmentation. Release all resources. */
void
btr_defragment_shutdown()
{
	mutex_enter(&btr_defragment_mutex);
	std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
	while (iter != btr_defragment_wq.end()) {
		btr_defragment_item_t* item = *iter;
		iter = btr_defragment_wq.erase(iter);
		delete item;
	}
	mutex_exit(&btr_defragment_mutex);
	mutex_free(&btr_defragment_mutex);
}


/******************************************************************//**
Functions used by the query threads: btr_defragment_xxx_index
Query threads find/add/remove index. */
/******************************************************************//**
Check whether the given index is in btr_defragment_wq. We use index->id
to identify indices. */
bool
btr_defragment_find_index(
	dict_index_t*	index)	/*!< Index to find. */
{
	mutex_enter(&btr_defragment_mutex);
	for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
	     iter != btr_defragment_wq.end();
	     ++iter) {
		btr_defragment_item_t* item = *iter;
		btr_pcur_t* pcur = item->pcur;
		btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
		dict_index_t* idx = btr_cur_get_index(cursor);
		if (index->id == idx->id) {
			mutex_exit(&btr_defragment_mutex);
			return true;
		}
	}
	mutex_exit(&btr_defragment_mutex);
	return false;
}

/******************************************************************//**
Query thread uses this function to add an index to btr_defragment_wq.
Return a pointer to os_event for the query thread to wait on if this is a
synchronized defragmentation. */
os_event_t
btr_defragment_add_index(
	dict_index_t*	index,	/*!< index to be added */
	bool		async,	/*!< whether this is an async
				defragmentation */
	dberr_t*	err)	/*!< out: error code */
{
	mtr_t mtr;
	*err = DB_SUCCESS;

	mtr_start(&mtr);
	// Load the index root page.
	buf_block_t* block = btr_block_get(
		page_id_t(index->table->space->id, index->page),
		page_size_t(index->table->space->flags),
		RW_NO_LATCH, index, &mtr);
	page_t* page = NULL;

	if (block) {
		page = buf_block_get_frame(block);
	}

	if (page == NULL && !index->is_readable()) {
		mtr_commit(&mtr);
		*err = DB_DECRYPTION_FAILED;
		return NULL;
	}

	ut_ad(page_is_root(page));

	if (page_is_leaf(page)) {
		// Index root is a leaf page, no need to defragment.
		mtr_commit(&mtr);
		return NULL;
	}
	btr_pcur_t* pcur = btr_pcur_create_for_mysql();
	os_event_t event = NULL;
	if (!async) {
		event = os_event_create(0);
	}
	btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur,
				    true, 0, &mtr);
	btr_pcur_move_to_next(pcur, &mtr);
	btr_pcur_store_position(pcur, &mtr);
	mtr_commit(&mtr);
	dict_stats_empty_defrag_summary(index);
	btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event);
	mutex_enter(&btr_defragment_mutex);
	btr_defragment_wq.push_back(item);
	mutex_exit(&btr_defragment_mutex);
	return event;
}
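
/* Illustrative sketch only (not part of this file): a caller that requested
synchronous defragmentation would typically wait on the returned event, e.g.

	dberr_t		err;
	os_event_t	event = btr_defragment_add_index(index, false, &err);
	if (err == DB_SUCCESS && event) {
		os_event_wait(event);
		os_event_destroy(event);
	}
*/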

/******************************************************************//**
When a table is dropped, this function is called to mark the table's indexes
as removed in btr_defragment_wq. The difference between this function and the
remove_index function is that this function does not set the event to NULL. */
void
btr_defragment_remove_table(
	dict_table_t*	table)	/*!< Table whose indexes are to be removed. */
{
	mutex_enter(&btr_defragment_mutex);
	for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
	     iter != btr_defragment_wq.end();
	     ++iter) {
		btr_defragment_item_t* item = *iter;
		btr_pcur_t* pcur = item->pcur;
		btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
		dict_index_t* idx = btr_cur_get_index(cursor);
		if (table->id == idx->table->id) {
			item->removed = true;
		}
	}
	mutex_exit(&btr_defragment_mutex);
}

/******************************************************************//**
Query thread uses this function to mark an index as removed in
btr_defragment_wq. */
void
btr_defragment_remove_index(
	dict_index_t*	index)	/*!< Index to be removed. */
{
	mutex_enter(&btr_defragment_mutex);
	for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
	     iter != btr_defragment_wq.end();
	     ++iter) {
		btr_defragment_item_t* item = *iter;
		btr_pcur_t* pcur = item->pcur;
		btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur);
		dict_index_t* idx = btr_cur_get_index(cursor);
		if (index->id == idx->id) {
			item->removed = true;
			item->event = NULL;
			break;
		}
	}
	mutex_exit(&btr_defragment_mutex);
}

/******************************************************************//**
Functions used by defragmentation thread: btr_defragment_xxx_item.
Defragmentation thread operates on the work *item*. It gets/removes
item from the work queue. */
/******************************************************************//**
Defragment thread uses this to remove an item from btr_defragment_wq.
When an item is removed from the work queue, all resources associated with it
are freed as well. */
void
btr_defragment_remove_item(
	btr_defragment_item_t*	item)	/*!< Item to be removed. */
{
	mutex_enter(&btr_defragment_mutex);
	for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin();
	     iter != btr_defragment_wq.end();
	     ++iter) {
		if (item == *iter) {
			btr_defragment_wq.erase(iter);
			delete item;
			break;
		}
	}
	mutex_exit(&btr_defragment_mutex);
}

/******************************************************************//**
Defragment thread uses this to get an item from btr_defragment_wq to work on.
The item is not removed from the work queue so query threads can still access
this item. We keep it this way so query threads can find and kill a
defragmentation even if that index is being worked on. Be aware that while you
work on this item you have no lock protection on it whatsoever. This is OK as
long as the query threads and defragment thread won't modify the same fields
without lock protection.
*/
btr_defragment_item_t*
btr_defragment_get_item()
{
	mutex_enter(&btr_defragment_mutex);
	if (btr_defragment_wq.empty()) {
		mutex_exit(&btr_defragment_mutex);
		return NULL;
	}
	btr_defragment_item_t* item = btr_defragment_wq.front();
	mutex_exit(&btr_defragment_mutex);
	return item;
}

/*********************************************************************//**
Check whether we should save defragmentation statistics to persistent storage.
Currently we save the stats to persistent storage every
srv_defragment_stats_accuracy updates. */
UNIV_INTERN
void
btr_defragment_save_defrag_stats_if_needed(
	dict_index_t*	index)	/*!< in: index */
{
	if (srv_defragment_stats_accuracy != 0 // 0 means stats tracking is disabled
	    && index->table->space->id != 0 // do not track the system tablespace
	    && index->stat_defrag_modified_counter
	       >= srv_defragment_stats_accuracy) {
		dict_stats_defrag_pool_add(index);
		index->stat_defrag_modified_counter = 0;
	}
}

/*********************************************************************//**
Main defragment functionalities used by defragment thread.*/
/*************************************************************//**
Calculate number of records from beginning of block that can
fit into size_limit
@return number of records */
UNIV_INTERN
ulint
btr_defragment_calc_n_recs_for_size(
	buf_block_t*	block,		/*!< in: B-tree page */
	dict_index_t*	index,		/*!< in: index of the page */
	ulint		size_limit,	/*!< in: size limit to fit records in */
	ulint*		n_recs_size)	/*!< out: actual size of the records
					that fit in size_limit. */
{
	page_t* page = buf_block_get_frame(block);
	ulint n_recs = 0;
	ulint offsets_[REC_OFFS_NORMAL_SIZE];
	ulint* offsets = offsets_;
	rec_offs_init(offsets_);
	mem_heap_t* heap = NULL;
	ulint size = 0;
	page_cur_t cur;

	page_cur_set_before_first(block, &cur);
	page_cur_move_to_next(&cur);
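	/* Walk the user records from the first one, accumulating their sizes;
	stop before the record whose inclusion would push the total past
	size_limit. */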
	while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) {
		rec_t* cur_rec = page_cur_get_rec(&cur);
		offsets = rec_get_offsets(cur_rec, index, offsets,
					  page_is_leaf(page),
					  ULINT_UNDEFINED, &heap);
		ulint rec_size = rec_offs_size(offsets);
		size += rec_size;
		if (size > size_limit) {
			size = size - rec_size;
			break;
		}
		n_recs++;
		page_cur_move_to_next(&cur);
	}
	if (heap) {
		/* rec_get_offsets() may have allocated a heap; free it. */
		mem_heap_free(heap);
	}
	*n_recs_size = size;
	return n_recs;
}

/*************************************************************//**
Merge as many records as possible from the from_block to the to_block. Delete
the from_block if all records are successfully merged to the to_block.
@return the to_block to target for the next merge operation. */
UNIV_INTERN
buf_block_t*
btr_defragment_merge_pages(
	dict_index_t*	index,		/*!< in: index tree */
	buf_block_t*	from_block,	/*!< in: origin of merge */
	buf_block_t*	to_block,	/*!< in: destination of merge */
	const page_size_t page_size,	/*!< in: page size of the block */
	ulint		reserved_space,	/*!< in: space reserved for future
					insert to avoid immediate page split */
	ulint*		max_data_size,	/*!< in/out: max data size to
					fit in a single compressed page. */
	mem_heap_t*	heap,		/*!< in/out: pointer to memory heap */
	mtr_t*		mtr)		/*!< in/out: mini-transaction */
{
	page_t* from_page = buf_block_get_frame(from_block);
	page_t* to_page = buf_block_get_frame(to_block);
	ulint level = btr_page_get_level(from_page);
	ulint n_recs = page_get_n_recs(from_page);
	ulint new_data_size = page_get_data_size(to_page);
	ulint max_ins_size =
		page_get_max_insert_size(to_page, n_recs);
	ulint max_ins_size_reorg =
		page_get_max_insert_size_after_reorganize(
			to_page, n_recs);
	ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space
		? max_ins_size_reorg - reserved_space : 0;
	ulint move_size = 0;
	ulint n_recs_to_move = 0;
	rec_t* rec = NULL;
	ulint target_n_recs = 0;
	rec_t* orig_pred;

	// Estimate how many records can be moved from the from_page to
	// the to_page.
	if (page_size.is_compressed()) {
		ulint page_diff = srv_page_size - *max_data_size;
		max_ins_size_to_use = (max_ins_size_to_use > page_diff)
			? max_ins_size_to_use - page_diff : 0;
	}
	n_recs_to_move = btr_defragment_calc_n_recs_for_size(
		from_block, index, max_ins_size_to_use, &move_size);

	// If max_ins_size >= move_size, we can move the records without
	// reorganizing the page, otherwise we need to reorganize the page
	// first to release more space.
	if (move_size > max_ins_size) {
		if (!btr_page_reorganize_block(false, page_zip_level,
					       to_block, index,
					       mtr)) {
			if (!dict_index_is_clust(index)
			    && page_is_leaf(to_page)) {
				ibuf_reset_free_bits(to_block);
			}
			// If reorganization fails, the page is not
			// compressible. There is no point in trying to
			// merge into this page. Continue to the next page.
			return from_block;
		}
		ut_ad(page_validate(to_page, index));
		max_ins_size = page_get_max_insert_size(to_page, n_recs);
		ut_a(max_ins_size >= move_size);
	}

	// Move records so that to_page is packed as full as possible.
	orig_pred = NULL;
	target_n_recs = n_recs_to_move;
	while (n_recs_to_move > 0) {
		rec = page_rec_get_nth(from_page,
				       n_recs_to_move + 1);
		orig_pred = page_copy_rec_list_start(
			to_block, from_block, rec, index, mtr);
		if (orig_pred)
			break;
		// If we reach here, compression failed after packing
		// n_recs_to_move records into to_page. Reduce the targeted
		// data size on the to_page by
		// BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again.
		my_atomic_addlint(
			&btr_defragment_compression_failures, 1);
		max_ins_size_to_use =
			move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
			? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE
			: 0;
		if (max_ins_size_to_use == 0) {
			n_recs_to_move = 0;
			move_size = 0;
			break;
		}
		n_recs_to_move = btr_defragment_calc_n_recs_for_size(
			from_block, index, max_ins_size_to_use, &move_size);
	}
	// If fewer than target_n_recs records were moved, there were
	// compression failures during page_copy_rec_list_start. Adjust
	// the max_data_size estimate to reduce compression failures
	// in the following runs.
	if (target_n_recs > n_recs_to_move
	    && *max_data_size > new_data_size + move_size) {
		*max_data_size = new_data_size + move_size;
	}
	// Set ibuf free bits if necessary.
	if (!dict_index_is_clust(index)
	    && page_is_leaf(to_page)) {
		if (page_size.is_compressed()) {
			ibuf_reset_free_bits(to_block);
		} else {
			ibuf_update_free_bits_if_full(
				to_block,
				srv_page_size,
				ULINT_UNDEFINED);
		}
	}
	if (n_recs_to_move == n_recs) {
		/* The whole page is merged with the previous page,
		free it. */
		lock_update_merge_left(to_block, orig_pred,
				       from_block);
		btr_search_drop_page_hash_index(from_block);
		btr_level_list_remove(
			index->table->space->id,
			page_size, from_page, index, mtr);
		btr_node_ptr_delete(index, from_block, mtr);
		/* btr_blob_dbg_remove(from_page, index,
		"btr_defragment_n_pages"); */
		btr_page_free(index, from_block, mtr);
	} else {
		// There are still records left on the page, so the caller
		// (btr_defragment_n_pages) will count it as defragmented.
		// The node pointer will change, so remove the old one.
		if (n_recs_to_move > 0) {
			// Part of the page is merged to left, remove
			// the merged records, update record locks and
			// node pointer.
			dtuple_t* node_ptr;
			page_delete_rec_list_start(rec, from_block,
						   index, mtr);
			lock_update_split_and_merge(to_block,
						    orig_pred,
						    from_block);
			btr_node_ptr_delete(index, from_block, mtr);
			rec = page_rec_get_next(
				page_get_infimum_rec(from_page));
			node_ptr = dict_index_build_node_ptr(
				index, rec, page_get_page_no(from_page),
				heap, level);
			btr_insert_on_non_leaf_level(0, index, level + 1,
						     node_ptr, mtr);
		}
		to_block = from_block;
	}
	return to_block;
}

/*************************************************************//**
Tries to merge N consecutive pages, starting from the page pointed to by the
cursor. Skip space 0. Only consider leaf pages.
This function first loads all N pages into memory, then for each of
the pages other than the first page, it tries to move as many records
as possible to the left sibling to keep the left sibling full. During
the process, if any page becomes empty, that page will be removed from
the level list. Record locks, hash, and node pointers are updated after
page reorganization.
@return pointer to the last block processed, or NULL if reaching end of index */
UNIV_INTERN
buf_block_t*
btr_defragment_n_pages(
	buf_block_t*	block,	/*!< in: starting block for defragmentation */
	dict_index_t*	index,	/*!< in: index tree */
	uint		n_pages,/*!< in: number of pages to defragment */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	/* We will need to load the (n+1)-th block as well: if the last of the
	n pages is freed, we must update the prev_page_no of the block that
	follows it. */
	buf_block_t*	blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1];
	page_t*		first_page;
	buf_block_t*	current_block;
	ulint		total_data_size = 0;
	ulint		total_n_recs = 0;
	ulint		data_size_per_rec;
	ulint		optimal_page_size;
	ulint		reserved_space;
	ulint		max_data_size = 0;
	uint		n_defragmented = 0;
	uint		n_new_slots;
	mem_heap_t*	heap;
	ibool		end_of_index = FALSE;

	/* It doesn't make sense to call this function with n_pages = 1. */
	ut_ad(n_pages > 1);

	if (!page_is_leaf(block->frame)) {
		return NULL;
	}

	if (!index->table->space || !index->table->space->id) {
		/* Ignore space 0. */
		return NULL;
	}

	if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) {
		n_pages = BTR_DEFRAGMENT_MAX_N_PAGES;
	}

	first_page = buf_block_get_frame(block);
	const page_size_t page_size(index->table->space->flags);

	/* 1. Load the pages and calculate the total data size. */
	blocks[0] = block;
	for (uint i = 1; i <= n_pages; i++) {
		page_t* page = buf_block_get_frame(blocks[i-1]);
		ulint page_no = btr_page_get_next(page, mtr);
		total_data_size += page_get_data_size(page);
		total_n_recs += page_get_n_recs(page);
		if (page_no == FIL_NULL) {
			n_pages = i;
			end_of_index = TRUE;
			break;
		}

		blocks[i] = btr_block_get(page_id_t(index->table->space->id,
						    page_no), page_size,
					  RW_X_LATCH, index, mtr);
	}

	if (n_pages == 1) {
		if (!page_has_prev(first_page)) {
			/* The page has neither a previous nor a next page:
			it is the only page on this level. */
			if (dict_index_get_page(index)
			    == page_get_page_no(first_page))
				return NULL;
			/* The page is not the index root: lift its
			records to the father page. */
			btr_lift_page_up(index, block, mtr);
		}
		return NULL;
	}

	/* 2. Calculate how many pages the data can fit in. If the data is
	not compressible, return early. */
	ut_a(total_n_recs != 0);
	data_size_per_rec = total_data_size / total_n_recs;
	// For uncompressed pages, the optimal data size is the free space of
	// an empty page.
	optimal_page_size = page_get_free_space_of_empty(
		page_is_comp(first_page));
	// For compressed pages, we take compression failures into account.
	if (page_size.is_compressed()) {
		ulint size = 0;
		uint i = 0;
		// We estimate the optimal data size of the index using
		// samples of data size. These samples are taken when pages
		// fail to compress due to insertion on the page. We use the
		// average of all samples we have as the estimate. Different
		// pages of the same index vary in compressibility. The
		// average gives a good enough estimate.
		for (; i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) {
			if (index->stat_defrag_data_size_sample[i] == 0) {
				break;
			}
			size += index->stat_defrag_data_size_sample[i];
		}
		if (i != 0) {
			size /= i;
			optimal_page_size = ut_min(optimal_page_size, size);
		}
		max_data_size = optimal_page_size;
	}

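	/* Reserve room on each target page for future inserts: the smaller of
	the fill-factor headroom of an optimal page and the space needed for
	srv_defragment_fill_factor_n_recs records of average size. */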
	reserved_space = ut_min((ulint)(optimal_page_size
					* (1 - srv_defragment_fill_factor)),
				(data_size_per_rec
				 * srv_defragment_fill_factor_n_recs));
	optimal_page_size -= reserved_space;
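	/* Minimum number of pages needed to hold all the records at the
	target fill level (ceiling division). */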
	n_new_slots = uint((total_data_size + optimal_page_size - 1)
			   / optimal_page_size);
	if (n_new_slots >= n_pages) {
		/* Can't defragment. */
		if (end_of_index)
			return NULL;
		return blocks[n_pages-1];
	}

	/* 3. Defragment pages. */
	heap = mem_heap_create(256);
	// First defragmented page will be the first page.
	current_block = blocks[0];
	// Start from the second page.
	for (uint i = 1; i < n_pages; i++) {
		buf_block_t* new_block = btr_defragment_merge_pages(
			index, blocks[i], current_block, page_size,
			reserved_space, &max_data_size, heap, mtr);
		if (new_block != current_block) {
			n_defragmented++;
			current_block = new_block;
		}
	}
	mem_heap_free(heap);
	n_defragmented++;
	my_atomic_addlint(
		&btr_defragment_count, 1);
	if (n_pages == n_defragmented) {
		my_atomic_addlint(
			&btr_defragment_failures, 1);
	} else {
		index->stat_defrag_n_pages_freed += (n_pages - n_defragmented);
	}
	if (end_of_index)
		return NULL;
	return current_block;
}

/** Whether btr_defragment_thread is active */
bool btr_defragment_thread_active;

/** Merge consecutive b-tree pages into fewer pages to defragment indexes */
extern "C" UNIV_INTERN
os_thread_ret_t
DECLARE_THREAD(btr_defragment_thread)(void*)
{
	btr_pcur_t*	pcur;
	btr_cur_t*	cursor;
	dict_index_t*	index;
	mtr_t		mtr;
	buf_block_t*	first_block;
	buf_block_t*	last_block;

	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
		ut_ad(btr_defragment_thread_active);

		/* If defragmentation is disabled, sleep before
		checking whether it's enabled. */
		if (!srv_defragment) {
			os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS);
			continue;
		}
		/* The following call won't remove the item from the work
		queue. We only get a pointer to it to work on. This makes
		sure that when a user issues a kill command, all indices are
		still in the work queue to be searched. It also means that
		the user thread cannot directly remove the item from the
		queue (since we might be using it), so the user thread only
		marks the index as removed. */
		btr_defragment_item_t* item = btr_defragment_get_item();
		/* If work queue is empty, sleep and check later. */
		if (!item) {
			os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS);
			continue;
		}
		/* If an index is marked as removed, we remove it from the
		work queue. No other thread could be using this item at this
		point so it's safe to remove now. */
		if (item->removed) {
			btr_defragment_remove_item(item);
			continue;
		}

		pcur = item->pcur;
		ulonglong now = ut_timer_now();
		ulonglong elapsed = now - item->last_processed;

		if (elapsed < srv_defragment_interval) {
			/* If we see an index again before the interval
			determined by the configured frequency is reached,
			we just sleep until the interval passes. Since
			defragmentation of all indices queues up on a single
			thread, it's likely that other indices that follow
			this one don't need to sleep again. */
			os_thread_sleep(((ulint)ut_timer_to_microseconds(
						srv_defragment_interval - elapsed)));
		}

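		/* Re-read the clock so that item->last_processed, updated
		below, reflects the start of this defragmentation pass (we
		may have slept above). */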
		now = ut_timer_now();
		mtr_start(&mtr);
		cursor = btr_pcur_get_btr_cur(pcur);
		index = btr_cur_get_index(cursor);
		index->set_modified(mtr);
		/* To follow the latching order defined in WL#6326,
		acquire index->lock X-latch. This entitles us to acquire
		page latches in any order for the index. */
		mtr_x_lock(&index->lock, &mtr);
		/* This will acquire index->lock SX-latch, which per WL#6363
		is allowed when we are already holding the X-latch. */
		btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr);
		first_block = btr_cur_get_block(cursor);

		last_block = btr_defragment_n_pages(first_block, index,
						    srv_defragment_n_pages,
						    &mtr);
		if (last_block) {
			/* If we haven't reached the end of the index,
			place the cursor on the last record of last page,
			store the cursor position, and put back in queue. */
			page_t* last_page = buf_block_get_frame(last_block);
			rec_t* rec = page_rec_get_prev(
				page_get_supremum_rec(last_page));
			ut_a(page_rec_is_user_rec(rec));
			page_cur_position(rec, last_block,
					  btr_cur_get_page_cur(cursor));
			btr_pcur_store_position(pcur, &mtr);
			mtr_commit(&mtr);
			/* Update the last_processed time of this index. */
			item->last_processed = now;
		} else {
			dberr_t err = DB_SUCCESS;
			mtr_commit(&mtr);
			/* Reaching the end of the index. */
			dict_stats_empty_defrag_stats(index);
			err = dict_stats_save_defrag_stats(index);
			if (err != DB_SUCCESS) {
				ib::error() << "Saving defragmentation stats for table "
					    << index->table->name.m_name
					    << " index " << index->name()
					    << " failed with error " << err;
			} else {
				err = dict_stats_save_defrag_summary(index);

				if (err != DB_SUCCESS) {
					ib::error() << "Saving defragmentation summary for table "
						    << index->table->name.m_name
						    << " index " << index->name()
						    << " failed with error " << err;
				}
			}

			btr_defragment_remove_item(item);
		}
	}

	btr_defragment_thread_active = false;
	os_thread_exit();
	OS_THREAD_DUMMY_RETURN;
}