1 | /***************************************************************************** |
2 | |
3 | Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved. |
4 | Copyright (C) 2014, 2018, MariaDB Corporation. |
5 | |
6 | This program is free software; you can redistribute it and/or modify it under |
7 | the terms of the GNU General Public License as published by the Free Software |
8 | Foundation; version 2 of the License. |
9 | |
10 | This program is distributed in the hope that it will be useful, but WITHOUT |
11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
13 | |
14 | You should have received a copy of the GNU General Public License along with |
15 | this program; if not, write to the Free Software Foundation, Inc., |
16 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
17 | |
18 | *****************************************************************************/ |
19 | /**************************************************//** |
20 | @file btr/btr0defragment.cc |
21 | Index defragmentation. |
22 | |
23 | Created 05/29/2014 Rongrong Zhong |
24 | Modified 16/07/2014 Sunguck Lee |
25 | Modified 30/07/2014 Jan Lindström jan.lindstrom@mariadb.com |
26 | *******************************************************/ |
27 | |
28 | #include "btr0defragment.h" |
29 | #include "btr0btr.h" |
30 | #include "btr0cur.h" |
31 | #include "btr0sea.h" |
32 | #include "btr0pcur.h" |
33 | #include "dict0stats.h" |
34 | #include "dict0stats_bg.h" |
35 | #include "dict0defrag_bg.h" |
36 | #include "ibuf0ibuf.h" |
37 | #include "lock0lock.h" |
38 | #include "srv0start.h" |
39 | #include "ut0timer.h" |
40 | |
41 | #include <list> |
42 | |
43 | /**************************************************//** |
44 | Custom nullptr implementation for under g++ 4.6 |
45 | *******************************************************/ |
46 | // #pragma once |
47 | /* |
48 | namespace std |
49 | { |
50 | // based on SC22/WG21/N2431 = J16/07-0301 |
51 | struct nullptr_t |
52 | { |
53 | template<typename any> operator any * () const |
54 | { |
55 | return 0; |
56 | } |
57 | template<class any, typename T> operator T any:: * () const |
58 | { |
59 | return 0; |
60 | } |
61 | |
62 | #ifdef _MSC_VER |
63 | struct pad {}; |
64 | pad __[sizeof(void*)/sizeof(pad)]; |
65 | #else |
66 | char __[sizeof(void*)]; |
67 | #endif |
68 | private: |
69 | // nullptr_t();// {} |
70 | // nullptr_t(const nullptr_t&); |
71 | // void operator = (const nullptr_t&); |
72 | void operator &() const; |
73 | template<typename any> void operator +(any) const |
74 | { |
75 | // I Love MSVC 2005! |
76 | } |
77 | template<typename any> void operator -(any) const |
78 | { |
79 | // I Love MSVC 2005! |
80 | } |
81 | }; |
82 | static const nullptr_t __nullptr = {}; |
83 | } |
84 | |
85 | #ifndef nullptr |
86 | #define nullptr std::__nullptr |
87 | #endif |
88 | */ |
89 | |
90 | /**************************************************//** |
91 | End of Custom nullptr implementation for under g++ 4.6 |
92 | *******************************************************/ |
93 | |
/* When there is no work, either because defragmentation is disabled or because
no defragmentation request has been submitted, the thread checks its state
every BTR_DEFRAGMENT_SLEEP_IN_USECS. */
96 | #define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000 |
/* Reduce the target page size by this amount when a compression failure happens
during defragmentation. 512 is chosen because it is a power of 2 and is about
3% of the default 16KiB page size (512 / 16384 = 3.125%). When there are
compression failures during defragmentation, our goal is to get a decent defrag
ratio with as few compression failures as possible. From experimentation it
seems that reducing the target size by 512 each time will make sure the page is
compressible within a couple of iterations. */
103 | #define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512 |
104 | |
105 | /* Work queue for defragmentation. */ |
106 | typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t; |
107 | static btr_defragment_wq_t btr_defragment_wq; |
108 | |
109 | /* Mutex protecting the defragmentation work queue.*/ |
110 | ib_mutex_t btr_defragment_mutex; |
111 | #ifdef UNIV_PFS_MUTEX |
112 | UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key; |
113 | #endif /* UNIV_PFS_MUTEX */ |
114 | |
115 | /* Number of compression failures caused by defragmentation since server |
116 | start. */ |
117 | ulint btr_defragment_compression_failures = 0; |
118 | /* Number of btr_defragment_n_pages calls that altered page but didn't |
119 | manage to release any page. */ |
120 | ulint btr_defragment_failures = 0; |
121 | /* Total number of btr_defragment_n_pages calls that altered page. |
122 | The difference between btr_defragment_count and btr_defragment_failures shows |
123 | the amount of effort wasted. */ |
124 | ulint btr_defragment_count = 0; |
125 | |
126 | /******************************************************************//** |
127 | Constructor for btr_defragment_item_t. */ |
128 | btr_defragment_item_t::btr_defragment_item_t( |
129 | btr_pcur_t* pcur, |
130 | os_event_t event) |
131 | { |
132 | this->pcur = pcur; |
133 | this->event = event; |
134 | this->removed = false; |
135 | this->last_processed = 0; |
136 | } |
137 | |
138 | /******************************************************************//** |
139 | Destructor for btr_defragment_item_t. */ |
140 | btr_defragment_item_t::~btr_defragment_item_t() { |
141 | if (this->pcur) { |
142 | btr_pcur_free_for_mysql(this->pcur); |
143 | } |
144 | if (this->event) { |
145 | os_event_set(this->event); |
146 | } |
147 | } |
148 | |
149 | /******************************************************************//** |
150 | Initialize defragmentation. */ |
151 | void |
152 | btr_defragment_init() |
153 | { |
154 | srv_defragment_interval = ut_microseconds_to_timer( |
155 | (ulonglong) (1000000.0 / srv_defragment_frequency)); |
156 | mutex_create(LATCH_ID_BTR_DEFRAGMENT_MUTEX, &btr_defragment_mutex); |
157 | } |
158 | |
159 | /******************************************************************//** |
160 | Shutdown defragmentation. Release all resources. */ |
161 | void |
162 | btr_defragment_shutdown() |
163 | { |
164 | mutex_enter(&btr_defragment_mutex); |
165 | std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); |
166 | while(iter != btr_defragment_wq.end()) { |
167 | btr_defragment_item_t* item = *iter; |
168 | iter = btr_defragment_wq.erase(iter); |
169 | delete item; |
170 | } |
171 | mutex_exit(&btr_defragment_mutex); |
172 | mutex_free(&btr_defragment_mutex); |
173 | } |
174 | |
175 | |
176 | /******************************************************************//** |
177 | Functions used by the query threads: btr_defragment_xxx_index |
178 | Query threads find/add/remove index. */ |
179 | /******************************************************************//** |
180 | Check whether the given index is in btr_defragment_wq. We use index->id |
181 | to identify indices. */ |
182 | bool |
183 | btr_defragment_find_index( |
184 | dict_index_t* index) /*!< Index to find. */ |
185 | { |
186 | mutex_enter(&btr_defragment_mutex); |
187 | for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); |
188 | iter != btr_defragment_wq.end(); |
189 | ++iter) { |
190 | btr_defragment_item_t* item = *iter; |
191 | btr_pcur_t* pcur = item->pcur; |
192 | btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); |
193 | dict_index_t* idx = btr_cur_get_index(cursor); |
194 | if (index->id == idx->id) { |
195 | mutex_exit(&btr_defragment_mutex); |
196 | return true; |
197 | } |
198 | } |
199 | mutex_exit(&btr_defragment_mutex); |
200 | return false; |
201 | } |
202 | |
203 | /******************************************************************//** |
204 | Query thread uses this function to add an index to btr_defragment_wq. |
205 | Return a pointer to os_event for the query thread to wait on if this is a |
206 | synchronized defragmentation. */ |
207 | os_event_t |
208 | btr_defragment_add_index( |
209 | dict_index_t* index, /*!< index to be added */ |
210 | bool async, /*!< whether this is an async |
211 | defragmentation */ |
212 | dberr_t* err) /*!< out: error code */ |
213 | { |
214 | mtr_t mtr; |
215 | *err = DB_SUCCESS; |
216 | |
217 | mtr_start(&mtr); |
	// Load the index root page.
219 | buf_block_t* block = btr_block_get( |
220 | page_id_t(index->table->space->id, index->page), |
221 | page_size_t(index->table->space->flags), |
222 | RW_NO_LATCH, index, &mtr); |
223 | page_t* page = NULL; |
224 | |
225 | if (block) { |
226 | page = buf_block_get_frame(block); |
227 | } |
228 | |
229 | if (page == NULL && !index->is_readable()) { |
230 | mtr_commit(&mtr); |
231 | *err = DB_DECRYPTION_FAILED; |
232 | return NULL; |
233 | } |
234 | |
235 | ut_ad(page_is_root(page)); |
236 | |
237 | if (page_is_leaf(page)) { |
238 | // Index root is a leaf page, no need to defragment. |
239 | mtr_commit(&mtr); |
240 | return NULL; |
241 | } |
242 | btr_pcur_t* pcur = btr_pcur_create_for_mysql(); |
243 | os_event_t event = NULL; |
244 | if (!async) { |
245 | event = os_event_create(0); |
246 | } |
247 | btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur, |
248 | true, 0, &mtr); |
249 | btr_pcur_move_to_next(pcur, &mtr); |
250 | btr_pcur_store_position(pcur, &mtr); |
251 | mtr_commit(&mtr); |
252 | dict_stats_empty_defrag_summary(index); |
253 | btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event); |
254 | mutex_enter(&btr_defragment_mutex); |
255 | btr_defragment_wq.push_back(item); |
256 | mutex_exit(&btr_defragment_mutex); |
257 | return event; |
258 | } |
259 | |
260 | /******************************************************************//** |
When a table is dropped, this function is called to mark the table as removed
in btr_defragment_wq. The difference between this function and the
remove_index function is that this one does not set the event to NULL. */
264 | void |
265 | btr_defragment_remove_table( |
	dict_table_t*	table)	/*!< Table to be removed. */
267 | { |
268 | mutex_enter(&btr_defragment_mutex); |
269 | for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); |
270 | iter != btr_defragment_wq.end(); |
271 | ++iter) { |
272 | btr_defragment_item_t* item = *iter; |
273 | btr_pcur_t* pcur = item->pcur; |
274 | btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); |
275 | dict_index_t* idx = btr_cur_get_index(cursor); |
276 | if (table->id == idx->table->id) { |
277 | item->removed = true; |
278 | } |
279 | } |
280 | mutex_exit(&btr_defragment_mutex); |
281 | } |
282 | |
283 | /******************************************************************//** |
Query thread uses this function to mark an index as removed in
btr_defragment_wq. */
286 | void |
287 | btr_defragment_remove_index( |
288 | dict_index_t* index) /*!< Index to be removed. */ |
289 | { |
290 | mutex_enter(&btr_defragment_mutex); |
291 | for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); |
292 | iter != btr_defragment_wq.end(); |
293 | ++iter) { |
294 | btr_defragment_item_t* item = *iter; |
295 | btr_pcur_t* pcur = item->pcur; |
296 | btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); |
297 | dict_index_t* idx = btr_cur_get_index(cursor); |
298 | if (index->id == idx->id) { |
299 | item->removed = true; |
300 | item->event = NULL; |
301 | break; |
302 | } |
303 | } |
304 | mutex_exit(&btr_defragment_mutex); |
305 | } |
306 | |
307 | /******************************************************************//** |
308 | Functions used by defragmentation thread: btr_defragment_xxx_item. |
Defragmentation thread operates on the work *item*. It gets/removes
items from the work queue. */
311 | /******************************************************************//** |
312 | Defragment thread uses this to remove an item from btr_defragment_wq. |
313 | When an item is removed from the work queue, all resources associated with it |
are freed as well. */
315 | void |
316 | btr_defragment_remove_item( |
317 | btr_defragment_item_t* item) /*!< Item to be removed. */ |
318 | { |
319 | mutex_enter(&btr_defragment_mutex); |
320 | for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); |
321 | iter != btr_defragment_wq.end(); |
322 | ++iter) { |
323 | if (item == *iter) { |
324 | btr_defragment_wq.erase(iter); |
325 | delete item; |
326 | break; |
327 | } |
328 | } |
329 | mutex_exit(&btr_defragment_mutex); |
330 | } |
331 | |
332 | /******************************************************************//** |
333 | Defragment thread uses this to get an item from btr_defragment_wq to work on. |
334 | The item is not removed from the work queue so query threads can still access |
335 | this item. We keep it this way so query threads can find and kill a |
336 | defragmentation even if that index is being worked on. Be aware that while you |
337 | work on this item you have no lock protection on it whatsoever. This is OK as |
338 | long as the query threads and defragment thread won't modify the same fields |
339 | without lock protection. |
340 | */ |
341 | btr_defragment_item_t* |
342 | btr_defragment_get_item() |
343 | { |
	if (btr_defragment_wq.empty()) {
		return NULL;
	}
	mutex_enter(&btr_defragment_mutex);
	if (btr_defragment_wq.empty()) {
		/* The queue may have been emptied after the unlocked check. */
		mutex_exit(&btr_defragment_mutex);
		return NULL;
	}
	btr_defragment_item_t* item = btr_defragment_wq.front();
	mutex_exit(&btr_defragment_mutex);
	return item;
357 | } |
358 | |
359 | /*********************************************************************//** |
360 | Check whether we should save defragmentation statistics to persistent storage. |
361 | Currently we save the stats to persistent storage every 100 updates. */ |
362 | UNIV_INTERN |
363 | void |
364 | btr_defragment_save_defrag_stats_if_needed( |
365 | dict_index_t* index) /*!< in: index */ |
366 | { |
	if (srv_defragment_stats_accuracy != 0 // 0 means stats tracking is disabled
	    && index->table->space->id != 0 // do not track the system tablespace
369 | && index->stat_defrag_modified_counter |
370 | >= srv_defragment_stats_accuracy) { |
371 | dict_stats_defrag_pool_add(index); |
372 | index->stat_defrag_modified_counter = 0; |
373 | } |
374 | } |
375 | |
376 | /*********************************************************************//** |
377 | Main defragment functionalities used by defragment thread.*/ |
378 | /*************************************************************//** |
379 | Calculate number of records from beginning of block that can |
380 | fit into size_limit |
381 | @return number of records */ |
382 | UNIV_INTERN |
383 | ulint |
384 | btr_defragment_calc_n_recs_for_size( |
385 | buf_block_t* block, /*!< in: B-tree page */ |
386 | dict_index_t* index, /*!< in: index of the page */ |
387 | ulint size_limit, /*!< in: size limit to fit records in */ |
388 | ulint* n_recs_size) /*!< out: actual size of the records that fit |
389 | in size_limit. */ |
390 | { |
391 | page_t* page = buf_block_get_frame(block); |
392 | ulint n_recs = 0; |
393 | ulint offsets_[REC_OFFS_NORMAL_SIZE]; |
394 | ulint* offsets = offsets_; |
395 | rec_offs_init(offsets_); |
396 | mem_heap_t* heap = NULL; |
397 | ulint size = 0; |
398 | page_cur_t cur; |
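
	/* Scan the user records from the start of the page towards the
	supremum, accumulating their sizes until adding one more record
	would exceed size_limit. The accumulated size of the records that
	fit is reported back through n_recs_size. */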
399 | |
400 | page_cur_set_before_first(block, &cur); |
401 | page_cur_move_to_next(&cur); |
402 | while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) { |
403 | rec_t* cur_rec = page_cur_get_rec(&cur); |
404 | offsets = rec_get_offsets(cur_rec, index, offsets, |
405 | page_is_leaf(page), |
406 | ULINT_UNDEFINED, &heap); |
407 | ulint rec_size = rec_offs_size(offsets); |
408 | size += rec_size; |
409 | if (size > size_limit) { |
410 | size = size - rec_size; |
411 | break; |
412 | } |
413 | n_recs ++; |
414 | page_cur_move_to_next(&cur); |
415 | } |
416 | *n_recs_size = size; |
417 | return n_recs; |
418 | } |
419 | |
420 | /*************************************************************//** |
Merge as many records as possible from from_block into to_block. Delete
from_block if all of its records are successfully merged into to_block.
@return the block to use as the merge target in the next merge operation. */
424 | UNIV_INTERN |
425 | buf_block_t* |
426 | btr_defragment_merge_pages( |
427 | dict_index_t* index, /*!< in: index tree */ |
428 | buf_block_t* from_block, /*!< in: origin of merge */ |
429 | buf_block_t* to_block, /*!< in: destination of merge */ |
430 | const page_size_t page_size, /*!< in: page size of the block */ |
431 | ulint reserved_space, /*!< in: space reserved for future |
432 | insert to avoid immediate page split */ |
433 | ulint* max_data_size, /*!< in/out: max data size to |
434 | fit in a single compressed page. */ |
435 | mem_heap_t* heap, /*!< in/out: pointer to memory heap */ |
436 | mtr_t* mtr) /*!< in/out: mini-transaction */ |
437 | { |
438 | page_t* from_page = buf_block_get_frame(from_block); |
439 | page_t* to_page = buf_block_get_frame(to_block); |
440 | ulint level = btr_page_get_level(from_page); |
441 | ulint n_recs = page_get_n_recs(from_page); |
442 | ulint new_data_size = page_get_data_size(to_page); |
443 | ulint max_ins_size = |
444 | page_get_max_insert_size(to_page, n_recs); |
445 | ulint max_ins_size_reorg = |
446 | page_get_max_insert_size_after_reorganize( |
447 | to_page, n_recs); |
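	/* Space available for merged records: the insert space that a
	reorganized to_page would offer, minus the space reserved for future
	inserts so that the merged page is not split again immediately. */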
448 | ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space |
449 | ? max_ins_size_reorg - reserved_space : 0; |
450 | ulint move_size = 0; |
451 | ulint n_recs_to_move = 0; |
452 | rec_t* rec = NULL; |
453 | ulint target_n_recs = 0; |
454 | rec_t* orig_pred; |
455 | |
456 | // Estimate how many records can be moved from the from_page to |
457 | // the to_page. |
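	// For compressed pages, *max_data_size is the current estimate of how
	// much data still compresses into the zip page size, so shrink the
	// insert budget by the gap between the uncompressed page size and
	// that estimate before estimating the record count.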
458 | if (page_size.is_compressed()) { |
459 | ulint page_diff = srv_page_size - *max_data_size; |
460 | max_ins_size_to_use = (max_ins_size_to_use > page_diff) |
461 | ? max_ins_size_to_use - page_diff : 0; |
462 | } |
463 | n_recs_to_move = btr_defragment_calc_n_recs_for_size( |
464 | from_block, index, max_ins_size_to_use, &move_size); |
465 | |
466 | // If max_ins_size >= move_size, we can move the records without |
467 | // reorganizing the page, otherwise we need to reorganize the page |
468 | // first to release more space. |
469 | if (move_size > max_ins_size) { |
470 | if (!btr_page_reorganize_block(false, page_zip_level, |
471 | to_block, index, |
472 | mtr)) { |
473 | if (!dict_index_is_clust(index) |
474 | && page_is_leaf(to_page)) { |
475 | ibuf_reset_free_bits(to_block); |
476 | } |
			// If reorganization fails, the page is not
			// compressible. There is no point in trying
			// to merge into this page; continue to the
			// next page.
481 | return from_block; |
482 | } |
483 | ut_ad(page_validate(to_page, index)); |
484 | max_ins_size = page_get_max_insert_size(to_page, n_recs); |
485 | ut_a(max_ins_size >= move_size); |
486 | } |
487 | |
	// Move records to pack to_page more densely.
489 | orig_pred = NULL; |
490 | target_n_recs = n_recs_to_move; |
491 | while (n_recs_to_move > 0) { |
492 | rec = page_rec_get_nth(from_page, |
493 | n_recs_to_move + 1); |
494 | orig_pred = page_copy_rec_list_start( |
495 | to_block, from_block, rec, index, mtr); |
496 | if (orig_pred) |
497 | break; |
		// If we reach here, compression failed after packing
		// n_recs_to_move records into to_page. Try to reduce
		// the targeted data size on the to_page by
		// BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again.
502 | my_atomic_addlint( |
503 | &btr_defragment_compression_failures, 1); |
504 | max_ins_size_to_use = |
505 | move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE |
506 | ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE |
507 | : 0; |
508 | if (max_ins_size_to_use == 0) { |
509 | n_recs_to_move = 0; |
510 | move_size = 0; |
511 | break; |
512 | } |
513 | n_recs_to_move = btr_defragment_calc_n_recs_for_size( |
514 | from_block, index, max_ins_size_to_use, &move_size); |
515 | } |
	// If fewer than target_n_recs records were moved, there were
	// compression failures during page_copy_rec_list_start. Adjust
	// the max_data_size estimate to reduce compression failures
	// in the following runs.
520 | if (target_n_recs > n_recs_to_move |
521 | && *max_data_size > new_data_size + move_size) { |
522 | *max_data_size = new_data_size + move_size; |
523 | } |
524 | // Set ibuf free bits if necessary. |
525 | if (!dict_index_is_clust(index) |
526 | && page_is_leaf(to_page)) { |
527 | if (page_size.is_compressed()) { |
528 | ibuf_reset_free_bits(to_block); |
529 | } else { |
530 | ibuf_update_free_bits_if_full( |
531 | to_block, |
532 | srv_page_size, |
533 | ULINT_UNDEFINED); |
534 | } |
535 | } |
536 | if (n_recs_to_move == n_recs) { |
537 | /* The whole page is merged with the previous page, |
538 | free it. */ |
539 | lock_update_merge_left(to_block, orig_pred, |
540 | from_block); |
541 | btr_search_drop_page_hash_index(from_block); |
542 | btr_level_list_remove( |
543 | index->table->space->id, |
544 | page_size, from_page, index, mtr); |
545 | btr_node_ptr_delete(index, from_block, mtr); |
546 | /* btr_blob_dbg_remove(from_page, index, |
547 | "btr_defragment_n_pages"); */ |
548 | btr_page_free(index, from_block, mtr); |
549 | } else { |
		// There are still records left on from_block, so it becomes
		// the merge target for the next iteration and the caller
		// counts it towards n_defragmented. The node pointer will
		// change, so remove the old node pointer.
553 | if (n_recs_to_move > 0) { |
554 | // Part of the page is merged to left, remove |
555 | // the merged records, update record locks and |
556 | // node pointer. |
557 | dtuple_t* node_ptr; |
558 | page_delete_rec_list_start(rec, from_block, |
559 | index, mtr); |
560 | lock_update_split_and_merge(to_block, |
561 | orig_pred, |
562 | from_block); |
563 | btr_node_ptr_delete(index, from_block, mtr); |
564 | rec = page_rec_get_next( |
565 | page_get_infimum_rec(from_page)); |
566 | node_ptr = dict_index_build_node_ptr( |
567 | index, rec, page_get_page_no(from_page), |
568 | heap, level); |
569 | btr_insert_on_non_leaf_level(0, index, level+1, |
570 | node_ptr, mtr); |
571 | } |
572 | to_block = from_block; |
573 | } |
574 | return to_block; |
575 | } |
576 | |
577 | /*************************************************************//** |
578 | Tries to merge N consecutive pages, starting from the page pointed by the |
579 | cursor. Skip space 0. Only consider leaf pages. |
580 | This function first loads all N pages into memory, then for each of |
581 | the pages other than the first page, it tries to move as many records |
582 | as possible to the left sibling to keep the left sibling full. During |
583 | the process, if any page becomes empty, that page will be removed from |
the level list. Record locks, the adaptive hash index, and node pointers are
updated after page reorganization.
@return pointer to the last block processed, or NULL if the end of the index
was reached */
587 | UNIV_INTERN |
588 | buf_block_t* |
589 | btr_defragment_n_pages( |
590 | buf_block_t* block, /*!< in: starting block for defragmentation */ |
591 | dict_index_t* index, /*!< in: index tree */ |
592 | uint n_pages,/*!< in: number of pages to defragment */ |
593 | mtr_t* mtr) /*!< in/out: mini-transaction */ |
594 | { |
	/* We may need to load the (n+1)th block, because if the last of the
	n pages is freed, we must update the prev_page_no of the block that
	follows it. */
597 | buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1]; |
598 | page_t* first_page; |
599 | buf_block_t* current_block; |
600 | ulint total_data_size = 0; |
601 | ulint total_n_recs = 0; |
602 | ulint data_size_per_rec; |
603 | ulint optimal_page_size; |
604 | ulint reserved_space; |
605 | ulint max_data_size = 0; |
606 | uint n_defragmented = 0; |
607 | uint n_new_slots; |
608 | mem_heap_t* heap; |
609 | ibool end_of_index = FALSE; |
610 | |
611 | /* It doesn't make sense to call this function with n_pages = 1. */ |
612 | ut_ad(n_pages > 1); |
613 | |
614 | if (!page_is_leaf(block->frame)) { |
615 | return NULL; |
616 | } |
617 | |
618 | if (!index->table->space || !index->table->space->id) { |
619 | /* Ignore space 0. */ |
620 | return NULL; |
621 | } |
622 | |
623 | if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) { |
624 | n_pages = BTR_DEFRAGMENT_MAX_N_PAGES; |
625 | } |
626 | |
627 | first_page = buf_block_get_frame(block); |
628 | const page_size_t page_size(index->table->space->flags); |
629 | |
630 | /* 1. Load the pages and calculate the total data size. */ |
631 | blocks[0] = block; |
632 | for (uint i = 1; i <= n_pages; i++) { |
633 | page_t* page = buf_block_get_frame(blocks[i-1]); |
634 | ulint page_no = btr_page_get_next(page, mtr); |
635 | total_data_size += page_get_data_size(page); |
636 | total_n_recs += page_get_n_recs(page); |
637 | if (page_no == FIL_NULL) { |
638 | n_pages = i; |
639 | end_of_index = TRUE; |
640 | break; |
641 | } |
642 | |
643 | blocks[i] = btr_block_get(page_id_t(index->table->space->id, |
644 | page_no), page_size, |
645 | RW_X_LATCH, index, mtr); |
646 | } |
647 | |
648 | if (n_pages == 1) { |
649 | if (!page_has_prev(first_page)) { |
650 | /* last page in the index */ |
651 | if (dict_index_get_page(index) |
652 | == page_get_page_no(first_page)) |
653 | return NULL; |
654 | /* given page is the last page. |
655 | Lift the records to father. */ |
656 | btr_lift_page_up(index, block, mtr); |
657 | } |
658 | return NULL; |
659 | } |
660 | |
	/* 2. Calculate how many pages the data can fit in. If not
	compressible, return early. */
663 | ut_a(total_n_recs != 0); |
664 | data_size_per_rec = total_data_size / total_n_recs; |
	// For uncompressed pages, the optimal data size is the free space of
	// an empty page.
667 | optimal_page_size = page_get_free_space_of_empty( |
668 | page_is_comp(first_page)); |
669 | // For compressed pages, we take compression failures into account. |
670 | if (page_size.is_compressed()) { |
671 | ulint size = 0; |
672 | uint i = 0; |
		// We estimate the optimal data size of the index using samples
		// of data size. These samples are taken when a page fails to
		// compress due to an insertion on the page. We use the average
		// of all samples we have as the estimate. Different pages of
		// the same index vary in compressibility, but the average
		// gives a good enough estimate.
679 | for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) { |
680 | if (index->stat_defrag_data_size_sample[i] == 0) { |
681 | break; |
682 | } |
683 | size += index->stat_defrag_data_size_sample[i]; |
684 | } |
685 | if (i != 0) { |
686 | size /= i; |
687 | optimal_page_size = ut_min(optimal_page_size, size); |
688 | } |
689 | max_data_size = optimal_page_size; |
690 | } |
691 | |
692 | reserved_space = ut_min((ulint)(optimal_page_size |
693 | * (1 - srv_defragment_fill_factor)), |
694 | (data_size_per_rec |
695 | * srv_defragment_fill_factor_n_recs)); |
696 | optimal_page_size -= reserved_space; |
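	/* The data needs at least ceil(total_data_size / optimal_page_size)
	pages after packing. If that is not smaller than the number of pages
	scanned, merging cannot free a page, so stop here. */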
697 | n_new_slots = uint((total_data_size + optimal_page_size - 1) |
698 | / optimal_page_size); |
699 | if (n_new_slots >= n_pages) { |
700 | /* Can't defragment. */ |
701 | if (end_of_index) |
702 | return NULL; |
703 | return blocks[n_pages-1]; |
704 | } |
705 | |
706 | /* 3. Defragment pages. */ |
707 | heap = mem_heap_create(256); |
708 | // First defragmented page will be the first page. |
709 | current_block = blocks[0]; |
710 | // Start from the second page. |
711 | for (uint i = 1; i < n_pages; i ++) { |
712 | buf_block_t* new_block = btr_defragment_merge_pages( |
713 | index, blocks[i], current_block, page_size, |
714 | reserved_space, &max_data_size, heap, mtr); |
715 | if (new_block != current_block) { |
716 | n_defragmented ++; |
717 | current_block = new_block; |
718 | } |
719 | } |
720 | mem_heap_free(heap); |
721 | n_defragmented ++; |
722 | my_atomic_addlint( |
723 | &btr_defragment_count, 1); |
724 | if (n_pages == n_defragmented) { |
725 | my_atomic_addlint( |
726 | &btr_defragment_failures, 1); |
727 | } else { |
728 | index->stat_defrag_n_pages_freed += (n_pages - n_defragmented); |
729 | } |
730 | if (end_of_index) |
731 | return NULL; |
732 | return current_block; |
733 | } |
734 | |
735 | /** Whether btr_defragment_thread is active */ |
736 | bool btr_defragment_thread_active; |
737 | |
738 | /** Merge consecutive b-tree pages into fewer pages to defragment indexes */ |
739 | extern "C" UNIV_INTERN |
740 | os_thread_ret_t |
741 | DECLARE_THREAD(btr_defragment_thread)(void*) |
742 | { |
743 | btr_pcur_t* pcur; |
744 | btr_cur_t* cursor; |
745 | dict_index_t* index; |
746 | mtr_t mtr; |
747 | buf_block_t* first_block; |
748 | buf_block_t* last_block; |
749 | |
750 | while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { |
751 | ut_ad(btr_defragment_thread_active); |
752 | |
753 | /* If defragmentation is disabled, sleep before |
754 | checking whether it's enabled. */ |
755 | if (!srv_defragment) { |
756 | os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS); |
757 | continue; |
758 | } |
		/* The following call won't remove the item from the work
		queue. We only get a pointer to it to work on. This ensures
		that when a user issues a kill command, all indices are still
		in the work queue to be searched. It also means that the user
		thread cannot directly remove the item from the queue (since
		we might be using it), so the user thread only marks the index
		as removed. */
765 | btr_defragment_item_t* item = btr_defragment_get_item(); |
766 | /* If work queue is empty, sleep and check later. */ |
767 | if (!item) { |
768 | os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS); |
769 | continue; |
770 | } |
771 | /* If an index is marked as removed, we remove it from the work |
772 | queue. No other thread could be using this item at this point so |
773 | it's safe to remove now. */ |
774 | if (item->removed) { |
775 | btr_defragment_remove_item(item); |
776 | continue; |
777 | } |
778 | |
779 | pcur = item->pcur; |
780 | ulonglong now = ut_timer_now(); |
781 | ulonglong elapsed = now - item->last_processed; |
782 | |
783 | if (elapsed < srv_defragment_interval) { |
			/* If we see an index again before the interval
			determined by the configured frequency has passed,
			sleep until the interval elapses. Since
			defragmentation of all indices is queued on a single
			thread, the indices that follow this one will likely
			not need to sleep again. */
790 | os_thread_sleep(((ulint)ut_timer_to_microseconds( |
791 | srv_defragment_interval - elapsed))); |
792 | } |
793 | |
794 | now = ut_timer_now(); |
795 | mtr_start(&mtr); |
796 | cursor = btr_pcur_get_btr_cur(pcur); |
797 | index = btr_cur_get_index(cursor); |
798 | index->set_modified(mtr); |
799 | /* To follow the latching order defined in WL#6326, acquire index->lock X-latch. |
800 | This entitles us to acquire page latches in any order for the index. */ |
801 | mtr_x_lock(&index->lock, &mtr); |
802 | /* This will acquire index->lock SX-latch, which per WL#6363 is allowed |
803 | when we are already holding the X-latch. */ |
804 | btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr); |
805 | first_block = btr_cur_get_block(cursor); |
806 | |
807 | last_block = btr_defragment_n_pages(first_block, index, |
808 | srv_defragment_n_pages, |
809 | &mtr); |
810 | if (last_block) { |
811 | /* If we haven't reached the end of the index, |
812 | place the cursor on the last record of last page, |
813 | store the cursor position, and put back in queue. */ |
814 | page_t* last_page = buf_block_get_frame(last_block); |
815 | rec_t* rec = page_rec_get_prev( |
816 | page_get_supremum_rec(last_page)); |
817 | ut_a(page_rec_is_user_rec(rec)); |
818 | page_cur_position(rec, last_block, |
819 | btr_cur_get_page_cur(cursor)); |
820 | btr_pcur_store_position(pcur, &mtr); |
821 | mtr_commit(&mtr); |
822 | /* Update the last_processed time of this index. */ |
823 | item->last_processed = now; |
824 | } else { |
825 | dberr_t err = DB_SUCCESS; |
826 | mtr_commit(&mtr); |
827 | /* Reaching the end of the index. */ |
828 | dict_stats_empty_defrag_stats(index); |
829 | err = dict_stats_save_defrag_stats(index); |
830 | if (err != DB_SUCCESS) { |
831 | ib::error() << "Saving defragmentation stats for table " |
832 | << index->table->name.m_name |
833 | << " index " << index->name() |
834 | << " failed with error " << err; |
835 | } else { |
836 | err = dict_stats_save_defrag_summary(index); |
837 | |
838 | if (err != DB_SUCCESS) { |
839 | ib::error() << "Saving defragmentation summary for table " |
840 | << index->table->name.m_name |
841 | << " index " << index->name() |
842 | << " failed with error " << err; |
843 | } |
844 | } |
845 | |
846 | btr_defragment_remove_item(item); |
847 | } |
848 | } |
849 | |
850 | btr_defragment_thread_active = false; |
851 | os_thread_exit(); |
852 | OS_THREAD_DUMMY_RETURN; |
853 | } |
854 | |