| 1 | /***************************************************************************** |
| 2 | |
| 3 | Copyright (C) 2013, 2014 Facebook, Inc. All Rights Reserved. |
| 4 | Copyright (C) 2014, 2018, MariaDB Corporation. |
| 5 | |
| 6 | This program is free software; you can redistribute it and/or modify it under |
| 7 | the terms of the GNU General Public License as published by the Free Software |
| 8 | Foundation; version 2 of the License. |
| 9 | |
| 10 | This program is distributed in the hope that it will be useful, but WITHOUT |
| 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
| 13 | |
| 14 | You should have received a copy of the GNU General Public License along with |
| 15 | this program; if not, write to the Free Software Foundation, Inc., |
| 16 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
| 17 | |
| 18 | *****************************************************************************/ |
| 19 | /**************************************************//** |
| 20 | @file btr/btr0defragment.cc |
| 21 | Index defragmentation. |
| 22 | |
| 23 | Created 05/29/2014 Rongrong Zhong |
| 24 | Modified 16/07/2014 Sunguck Lee |
| 25 | Modified 30/07/2014 Jan Lindström jan.lindstrom@mariadb.com |
| 26 | *******************************************************/ |
| 27 | |
| 28 | #include "btr0defragment.h" |
| 29 | #include "btr0btr.h" |
| 30 | #include "btr0cur.h" |
| 31 | #include "btr0sea.h" |
| 32 | #include "btr0pcur.h" |
| 33 | #include "dict0stats.h" |
| 34 | #include "dict0stats_bg.h" |
| 35 | #include "dict0defrag_bg.h" |
| 36 | #include "ibuf0ibuf.h" |
| 37 | #include "lock0lock.h" |
| 38 | #include "srv0start.h" |
| 39 | #include "ut0timer.h" |
| 40 | |
| 41 | #include <list> |
| 42 | |
| 43 | /**************************************************//** |
| 44 | Custom nullptr implementation for under g++ 4.6 |
| 45 | *******************************************************/ |
| 46 | // #pragma once |
| 47 | /* |
| 48 | namespace std |
| 49 | { |
| 50 | // based on SC22/WG21/N2431 = J16/07-0301 |
| 51 | struct nullptr_t |
| 52 | { |
| 53 | template<typename any> operator any * () const |
| 54 | { |
| 55 | return 0; |
| 56 | } |
| 57 | template<class any, typename T> operator T any:: * () const |
| 58 | { |
| 59 | return 0; |
| 60 | } |
| 61 | |
| 62 | #ifdef _MSC_VER |
| 63 | struct pad {}; |
| 64 | pad __[sizeof(void*)/sizeof(pad)]; |
| 65 | #else |
| 66 | char __[sizeof(void*)]; |
| 67 | #endif |
| 68 | private: |
| 69 | // nullptr_t();// {} |
| 70 | // nullptr_t(const nullptr_t&); |
| 71 | // void operator = (const nullptr_t&); |
| 72 | void operator &() const; |
| 73 | template<typename any> void operator +(any) const |
| 74 | { |
| 75 | // I Love MSVC 2005! |
| 76 | } |
| 77 | template<typename any> void operator -(any) const |
| 78 | { |
| 79 | // I Love MSVC 2005! |
| 80 | } |
| 81 | }; |
| 82 | static const nullptr_t __nullptr = {}; |
| 83 | } |
| 84 | |
| 85 | #ifndef nullptr |
| 86 | #define nullptr std::__nullptr |
| 87 | #endif |
| 88 | */ |
| 89 | |
| 90 | /**************************************************//** |
| 91 | End of Custom nullptr implementation for under g++ 4.6 |
| 92 | *******************************************************/ |
| 93 | |
/* When there is no work, either because defragmentation is disabled or
because no query has submitted a request, the thread checks the state every
BTR_DEFRAGMENT_SLEEP_IN_USECS microseconds. */
| 96 | #define BTR_DEFRAGMENT_SLEEP_IN_USECS 1000000 |
/* Reduce the target page size by this amount when a compression failure
happens during defragmentation. 512 is chosen because it is a power of 2 and
roughly 3% of the page size. When there are compression failures during
defragmentation, our goal is to get a decent defragmentation ratio with as
few compression failures as possible. From experimentation it seems that
reducing the target size by 512 each time will make the page compressible
within a couple of iterations. */
| 103 | #define BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE 512 |
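/* Worked example (assuming the default 16 KiB page size, which is an
assumption of this comment rather than something enforced here):
512 / 16384 is about 3.1%, so each retry lowers the target by roughly 3%
of the page, and a page that compresses at all typically fits within a
few retries. */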
| 104 | |
| 105 | /* Work queue for defragmentation. */ |
| 106 | typedef std::list<btr_defragment_item_t*> btr_defragment_wq_t; |
| 107 | static btr_defragment_wq_t btr_defragment_wq; |
| 108 | |
| 109 | /* Mutex protecting the defragmentation work queue.*/ |
| 110 | ib_mutex_t btr_defragment_mutex; |
| 111 | #ifdef UNIV_PFS_MUTEX |
| 112 | UNIV_INTERN mysql_pfs_key_t btr_defragment_mutex_key; |
| 113 | #endif /* UNIV_PFS_MUTEX */ |
| 114 | |
| 115 | /* Number of compression failures caused by defragmentation since server |
| 116 | start. */ |
| 117 | ulint btr_defragment_compression_failures = 0; |
/* Number of btr_defragment_n_pages calls that altered a page but did not
manage to release any page. */
ulint btr_defragment_failures = 0;
/* Total number of btr_defragment_n_pages calls that altered a page.
| 122 | The difference between btr_defragment_count and btr_defragment_failures shows |
| 123 | the amount of effort wasted. */ |
| 124 | ulint btr_defragment_count = 0; |
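/* Illustrative reading of the counters above (hypothetical numbers): with
btr_defragment_count == 100 and btr_defragment_failures == 30, 70 of the 100
page-altering passes released at least one page; the other 30 were wasted
effort. */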
| 125 | |
| 126 | /******************************************************************//** |
| 127 | Constructor for btr_defragment_item_t. */ |
| 128 | btr_defragment_item_t::btr_defragment_item_t( |
| 129 | btr_pcur_t* pcur, |
| 130 | os_event_t event) |
| 131 | { |
| 132 | this->pcur = pcur; |
| 133 | this->event = event; |
| 134 | this->removed = false; |
| 135 | this->last_processed = 0; |
| 136 | } |
| 137 | |
| 138 | /******************************************************************//** |
| 139 | Destructor for btr_defragment_item_t. */ |
| 140 | btr_defragment_item_t::~btr_defragment_item_t() { |
| 141 | if (this->pcur) { |
| 142 | btr_pcur_free_for_mysql(this->pcur); |
| 143 | } |
| 144 | if (this->event) { |
| 145 | os_event_set(this->event); |
| 146 | } |
| 147 | } |
| 148 | |
| 149 | /******************************************************************//** |
| 150 | Initialize defragmentation. */ |
| 151 | void |
| 152 | btr_defragment_init() |
| 153 | { |
| 154 | srv_defragment_interval = ut_microseconds_to_timer( |
| 155 | (ulonglong) (1000000.0 / srv_defragment_frequency)); |
| 156 | mutex_create(LATCH_ID_BTR_DEFRAGMENT_MUTEX, &btr_defragment_mutex); |
| 157 | } |
| 158 | |
| 159 | /******************************************************************//** |
| 160 | Shutdown defragmentation. Release all resources. */ |
| 161 | void |
| 162 | btr_defragment_shutdown() |
| 163 | { |
| 164 | mutex_enter(&btr_defragment_mutex); |
| 165 | std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); |
| 166 | while(iter != btr_defragment_wq.end()) { |
| 167 | btr_defragment_item_t* item = *iter; |
| 168 | iter = btr_defragment_wq.erase(iter); |
| 169 | delete item; |
| 170 | } |
| 171 | mutex_exit(&btr_defragment_mutex); |
| 172 | mutex_free(&btr_defragment_mutex); |
| 173 | } |
| 174 | |
| 175 | |
| 176 | /******************************************************************//** |
| 177 | Functions used by the query threads: btr_defragment_xxx_index |
| 178 | Query threads find/add/remove index. */ |
| 179 | /******************************************************************//** |
| 180 | Check whether the given index is in btr_defragment_wq. We use index->id |
| 181 | to identify indices. */ |
| 182 | bool |
| 183 | btr_defragment_find_index( |
| 184 | dict_index_t* index) /*!< Index to find. */ |
| 185 | { |
| 186 | mutex_enter(&btr_defragment_mutex); |
| 187 | for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); |
| 188 | iter != btr_defragment_wq.end(); |
| 189 | ++iter) { |
| 190 | btr_defragment_item_t* item = *iter; |
| 191 | btr_pcur_t* pcur = item->pcur; |
| 192 | btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); |
| 193 | dict_index_t* idx = btr_cur_get_index(cursor); |
| 194 | if (index->id == idx->id) { |
| 195 | mutex_exit(&btr_defragment_mutex); |
| 196 | return true; |
| 197 | } |
| 198 | } |
| 199 | mutex_exit(&btr_defragment_mutex); |
| 200 | return false; |
| 201 | } |
| 202 | |
| 203 | /******************************************************************//** |
| 204 | Query thread uses this function to add an index to btr_defragment_wq. |
| 205 | Return a pointer to os_event for the query thread to wait on if this is a |
| 206 | synchronized defragmentation. */ |
| 207 | os_event_t |
| 208 | btr_defragment_add_index( |
| 209 | dict_index_t* index, /*!< index to be added */ |
| 210 | bool async, /*!< whether this is an async |
| 211 | defragmentation */ |
| 212 | dberr_t* err) /*!< out: error code */ |
| 213 | { |
| 214 | mtr_t mtr; |
| 215 | *err = DB_SUCCESS; |
| 216 | |
| 217 | mtr_start(&mtr); |
	// Load the index root page.
| 219 | buf_block_t* block = btr_block_get( |
| 220 | page_id_t(index->table->space->id, index->page), |
| 221 | page_size_t(index->table->space->flags), |
| 222 | RW_NO_LATCH, index, &mtr); |
| 223 | page_t* page = NULL; |
| 224 | |
| 225 | if (block) { |
| 226 | page = buf_block_get_frame(block); |
| 227 | } |
| 228 | |
| 229 | if (page == NULL && !index->is_readable()) { |
| 230 | mtr_commit(&mtr); |
| 231 | *err = DB_DECRYPTION_FAILED; |
| 232 | return NULL; |
| 233 | } |
| 234 | |
| 235 | ut_ad(page_is_root(page)); |
| 236 | |
| 237 | if (page_is_leaf(page)) { |
| 238 | // Index root is a leaf page, no need to defragment. |
| 239 | mtr_commit(&mtr); |
| 240 | return NULL; |
| 241 | } |
| 242 | btr_pcur_t* pcur = btr_pcur_create_for_mysql(); |
| 243 | os_event_t event = NULL; |
| 244 | if (!async) { |
| 245 | event = os_event_create(0); |
| 246 | } |
| 247 | btr_pcur_open_at_index_side(true, index, BTR_SEARCH_LEAF, pcur, |
| 248 | true, 0, &mtr); |
| 249 | btr_pcur_move_to_next(pcur, &mtr); |
| 250 | btr_pcur_store_position(pcur, &mtr); |
| 251 | mtr_commit(&mtr); |
| 252 | dict_stats_empty_defrag_summary(index); |
| 253 | btr_defragment_item_t* item = new btr_defragment_item_t(pcur, event); |
| 254 | mutex_enter(&btr_defragment_mutex); |
| 255 | btr_defragment_wq.push_back(item); |
| 256 | mutex_exit(&btr_defragment_mutex); |
| 257 | return event; |
| 258 | } |
| 259 | |
| 260 | /******************************************************************//** |
When a table is dropped, this function is called to mark all of the table's
work items as removed in btr_defragment_wq. The difference between this
function and the remove_index function is that this one does not set the
event to NULL. */
| 264 | void |
| 265 | btr_defragment_remove_table( |
	dict_table_t* table) /*!< Table whose items are to be removed. */
| 267 | { |
| 268 | mutex_enter(&btr_defragment_mutex); |
| 269 | for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); |
| 270 | iter != btr_defragment_wq.end(); |
| 271 | ++iter) { |
| 272 | btr_defragment_item_t* item = *iter; |
| 273 | btr_pcur_t* pcur = item->pcur; |
| 274 | btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); |
| 275 | dict_index_t* idx = btr_cur_get_index(cursor); |
| 276 | if (table->id == idx->table->id) { |
| 277 | item->removed = true; |
| 278 | } |
| 279 | } |
| 280 | mutex_exit(&btr_defragment_mutex); |
| 281 | } |
| 282 | |
| 283 | /******************************************************************//** |
Query thread uses this function to mark an index as removed in
btr_defragment_wq. */
| 286 | void |
| 287 | btr_defragment_remove_index( |
| 288 | dict_index_t* index) /*!< Index to be removed. */ |
| 289 | { |
| 290 | mutex_enter(&btr_defragment_mutex); |
| 291 | for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); |
| 292 | iter != btr_defragment_wq.end(); |
| 293 | ++iter) { |
| 294 | btr_defragment_item_t* item = *iter; |
| 295 | btr_pcur_t* pcur = item->pcur; |
| 296 | btr_cur_t* cursor = btr_pcur_get_btr_cur(pcur); |
| 297 | dict_index_t* idx = btr_cur_get_index(cursor); |
| 298 | if (index->id == idx->id) { |
| 299 | item->removed = true; |
| 300 | item->event = NULL; |
| 301 | break; |
| 302 | } |
| 303 | } |
| 304 | mutex_exit(&btr_defragment_mutex); |
| 305 | } |
| 306 | |
| 307 | /******************************************************************//** |
Functions used by the defragmentation thread: btr_defragment_xxx_item.
The defragmentation thread operates on work *items*. It gets/removes
items from the work queue. */
| 311 | /******************************************************************//** |
| 312 | Defragment thread uses this to remove an item from btr_defragment_wq. |
| 313 | When an item is removed from the work queue, all resources associated with it |
are freed as well. */
| 315 | void |
| 316 | btr_defragment_remove_item( |
| 317 | btr_defragment_item_t* item) /*!< Item to be removed. */ |
| 318 | { |
| 319 | mutex_enter(&btr_defragment_mutex); |
| 320 | for (std::list< btr_defragment_item_t* >::iterator iter = btr_defragment_wq.begin(); |
| 321 | iter != btr_defragment_wq.end(); |
| 322 | ++iter) { |
| 323 | if (item == *iter) { |
| 324 | btr_defragment_wq.erase(iter); |
| 325 | delete item; |
| 326 | break; |
| 327 | } |
| 328 | } |
| 329 | mutex_exit(&btr_defragment_mutex); |
| 330 | } |
| 331 | |
| 332 | /******************************************************************//** |
Defragment thread uses this to get an item from btr_defragment_wq to work on.
The item is not removed from the work queue, so query threads can still
access it. We keep it this way so query threads can find and kill a
defragmentation even while its index is being worked on. Be aware that while
you work on this item you have no lock protection on it whatsoever. This is
OK as long as the query threads and the defragment thread do not modify the
same fields without lock protection: in this file, query threads only set
item->removed and item->event, while the defragment thread reads them and
updates item->last_processed.
*/
| 341 | btr_defragment_item_t* |
| 342 | btr_defragment_get_item() |
| 343 | { |
	mutex_enter(&btr_defragment_mutex);
	if (btr_defragment_wq.empty()) {
		mutex_exit(&btr_defragment_mutex);
		return NULL;
	}
	/* Return the first item without removing it from the queue. */
	btr_defragment_item_t* item = btr_defragment_wq.front();
	mutex_exit(&btr_defragment_mutex);
	return item;
| 357 | } |
| 358 | |
| 359 | /*********************************************************************//** |
| 360 | Check whether we should save defragmentation statistics to persistent storage. |
Currently we queue the index for stats persistence once it has accumulated
srv_defragment_stats_accuracy updates. */
| 362 | UNIV_INTERN |
| 363 | void |
| 364 | btr_defragment_save_defrag_stats_if_needed( |
| 365 | dict_index_t* index) /*!< in: index */ |
| 366 | { |
	if (srv_defragment_stats_accuracy != 0 // 0 means tracking is disabled
	    && index->table->space->id != 0 // do not track the system tablespace
| 369 | && index->stat_defrag_modified_counter |
| 370 | >= srv_defragment_stats_accuracy) { |
| 371 | dict_stats_defrag_pool_add(index); |
| 372 | index->stat_defrag_modified_counter = 0; |
| 373 | } |
| 374 | } |
| 375 | |
| 376 | /*********************************************************************//** |
Main defragmentation functionality used by the defragment thread. */
| 378 | /*************************************************************//** |
Calculate the number of records, counting from the beginning of the block,
that fit into size_limit.
@return number of records */
| 382 | UNIV_INTERN |
| 383 | ulint |
| 384 | btr_defragment_calc_n_recs_for_size( |
| 385 | buf_block_t* block, /*!< in: B-tree page */ |
| 386 | dict_index_t* index, /*!< in: index of the page */ |
| 387 | ulint size_limit, /*!< in: size limit to fit records in */ |
| 388 | ulint* n_recs_size) /*!< out: actual size of the records that fit |
| 389 | in size_limit. */ |
| 390 | { |
| 391 | page_t* page = buf_block_get_frame(block); |
| 392 | ulint n_recs = 0; |
| 393 | ulint offsets_[REC_OFFS_NORMAL_SIZE]; |
| 394 | ulint* offsets = offsets_; |
| 395 | rec_offs_init(offsets_); |
| 396 | mem_heap_t* heap = NULL; |
| 397 | ulint size = 0; |
| 398 | page_cur_t cur; |
| 399 | |
| 400 | page_cur_set_before_first(block, &cur); |
| 401 | page_cur_move_to_next(&cur); |
| 402 | while (page_cur_get_rec(&cur) != page_get_supremum_rec(page)) { |
| 403 | rec_t* cur_rec = page_cur_get_rec(&cur); |
| 404 | offsets = rec_get_offsets(cur_rec, index, offsets, |
| 405 | page_is_leaf(page), |
| 406 | ULINT_UNDEFINED, &heap); |
| 407 | ulint rec_size = rec_offs_size(offsets); |
| 408 | size += rec_size; |
| 409 | if (size > size_limit) { |
| 410 | size = size - rec_size; |
| 411 | break; |
| 412 | } |
| 413 | n_recs ++; |
| 414 | page_cur_move_to_next(&cur); |
| 415 | } |
| 416 | *n_recs_size = size; |
| 417 | return n_recs; |
| 418 | } |
| 419 | |
| 420 | /*************************************************************//** |
Merge as many records as possible from from_block into to_block. Free
from_block if all of its records were successfully merged into to_block.
@return the block to use as to_block in the next merge operation. */
| 424 | UNIV_INTERN |
| 425 | buf_block_t* |
| 426 | btr_defragment_merge_pages( |
| 427 | dict_index_t* index, /*!< in: index tree */ |
| 428 | buf_block_t* from_block, /*!< in: origin of merge */ |
| 429 | buf_block_t* to_block, /*!< in: destination of merge */ |
| 430 | const page_size_t page_size, /*!< in: page size of the block */ |
| 431 | ulint reserved_space, /*!< in: space reserved for future |
| 432 | insert to avoid immediate page split */ |
| 433 | ulint* max_data_size, /*!< in/out: max data size to |
| 434 | fit in a single compressed page. */ |
| 435 | mem_heap_t* heap, /*!< in/out: pointer to memory heap */ |
| 436 | mtr_t* mtr) /*!< in/out: mini-transaction */ |
| 437 | { |
| 438 | page_t* from_page = buf_block_get_frame(from_block); |
| 439 | page_t* to_page = buf_block_get_frame(to_block); |
| 440 | ulint level = btr_page_get_level(from_page); |
| 441 | ulint n_recs = page_get_n_recs(from_page); |
| 442 | ulint new_data_size = page_get_data_size(to_page); |
| 443 | ulint max_ins_size = |
| 444 | page_get_max_insert_size(to_page, n_recs); |
| 445 | ulint max_ins_size_reorg = |
| 446 | page_get_max_insert_size_after_reorganize( |
| 447 | to_page, n_recs); |
| 448 | ulint max_ins_size_to_use = max_ins_size_reorg > reserved_space |
| 449 | ? max_ins_size_reorg - reserved_space : 0; |
| 450 | ulint move_size = 0; |
| 451 | ulint n_recs_to_move = 0; |
| 452 | rec_t* rec = NULL; |
| 453 | ulint target_n_recs = 0; |
| 454 | rec_t* orig_pred; |
| 455 | |
| 456 | // Estimate how many records can be moved from the from_page to |
| 457 | // the to_page. |
| 458 | if (page_size.is_compressed()) { |
| 459 | ulint page_diff = srv_page_size - *max_data_size; |
| 460 | max_ins_size_to_use = (max_ins_size_to_use > page_diff) |
| 461 | ? max_ins_size_to_use - page_diff : 0; |
| 462 | } |
| 463 | n_recs_to_move = btr_defragment_calc_n_recs_for_size( |
| 464 | from_block, index, max_ins_size_to_use, &move_size); |
| 465 | |
	// If max_ins_size >= move_size, we can move the records without
	// reorganizing the page; otherwise we need to reorganize the
	// page first to release more space.
| 469 | if (move_size > max_ins_size) { |
| 470 | if (!btr_page_reorganize_block(false, page_zip_level, |
| 471 | to_block, index, |
| 472 | mtr)) { |
| 473 | if (!dict_index_is_clust(index) |
| 474 | && page_is_leaf(to_page)) { |
| 475 | ibuf_reset_free_bits(to_block); |
| 476 | } |
			// If reorganization fails, the page is not
			// compressible. There is no point in trying to
			// merge into this page. Continue to the
			// next page.
| 481 | return from_block; |
| 482 | } |
| 483 | ut_ad(page_validate(to_page, index)); |
| 484 | max_ins_size = page_get_max_insert_size(to_page, n_recs); |
| 485 | ut_a(max_ins_size >= move_size); |
| 486 | } |
| 487 | |
	// Move records to pack to_page as full as possible.
| 489 | orig_pred = NULL; |
| 490 | target_n_recs = n_recs_to_move; |
| 491 | while (n_recs_to_move > 0) { |
| 492 | rec = page_rec_get_nth(from_page, |
| 493 | n_recs_to_move + 1); |
| 494 | orig_pred = page_copy_rec_list_start( |
| 495 | to_block, from_block, rec, index, mtr); |
| 496 | if (orig_pred) |
| 497 | break; |
		// If we reach here, compression failed after packing
		// n_recs_to_move records into to_page. Reduce the targeted
		// data size on the to_page by
		// BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE and try again.
| 502 | my_atomic_addlint( |
| 503 | &btr_defragment_compression_failures, 1); |
| 504 | max_ins_size_to_use = |
| 505 | move_size > BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE |
| 506 | ? move_size - BTR_DEFRAGMENT_PAGE_REDUCTION_STEP_SIZE |
| 507 | : 0; |
| 508 | if (max_ins_size_to_use == 0) { |
| 509 | n_recs_to_move = 0; |
| 510 | move_size = 0; |
| 511 | break; |
| 512 | } |
| 513 | n_recs_to_move = btr_defragment_calc_n_recs_for_size( |
| 514 | from_block, index, max_ins_size_to_use, &move_size); |
| 515 | } |
	// If fewer than target_n_recs records were moved, there were
	// compression failures during page_copy_rec_list_start. Adjust
	// the max_data_size estimate to reduce compression failures
	// in subsequent runs.
| 520 | if (target_n_recs > n_recs_to_move |
| 521 | && *max_data_size > new_data_size + move_size) { |
| 522 | *max_data_size = new_data_size + move_size; |
| 523 | } |
| 524 | // Set ibuf free bits if necessary. |
| 525 | if (!dict_index_is_clust(index) |
| 526 | && page_is_leaf(to_page)) { |
| 527 | if (page_size.is_compressed()) { |
| 528 | ibuf_reset_free_bits(to_block); |
| 529 | } else { |
| 530 | ibuf_update_free_bits_if_full( |
| 531 | to_block, |
| 532 | srv_page_size, |
| 533 | ULINT_UNDEFINED); |
| 534 | } |
| 535 | } |
| 536 | if (n_recs_to_move == n_recs) { |
| 537 | /* The whole page is merged with the previous page, |
| 538 | free it. */ |
| 539 | lock_update_merge_left(to_block, orig_pred, |
| 540 | from_block); |
| 541 | btr_search_drop_page_hash_index(from_block); |
| 542 | btr_level_list_remove( |
| 543 | index->table->space->id, |
| 544 | page_size, from_page, index, mtr); |
| 545 | btr_node_ptr_delete(index, from_block, mtr); |
| 548 | btr_page_free(index, from_block, mtr); |
| 549 | } else { |
		// There are still records left on the page; the caller will
		// count this page and continue merging into it. If records
		// were moved, the node pointer changes, so the old node
		// pointer is removed and a new one inserted below.
| 553 | if (n_recs_to_move > 0) { |
| 554 | // Part of the page is merged to left, remove |
| 555 | // the merged records, update record locks and |
| 556 | // node pointer. |
| 557 | dtuple_t* node_ptr; |
| 558 | page_delete_rec_list_start(rec, from_block, |
| 559 | index, mtr); |
| 560 | lock_update_split_and_merge(to_block, |
| 561 | orig_pred, |
| 562 | from_block); |
| 563 | btr_node_ptr_delete(index, from_block, mtr); |
| 564 | rec = page_rec_get_next( |
| 565 | page_get_infimum_rec(from_page)); |
| 566 | node_ptr = dict_index_build_node_ptr( |
| 567 | index, rec, page_get_page_no(from_page), |
| 568 | heap, level); |
| 569 | btr_insert_on_non_leaf_level(0, index, level+1, |
| 570 | node_ptr, mtr); |
| 571 | } |
| 572 | to_block = from_block; |
| 573 | } |
| 574 | return to_block; |
| 575 | } |
| 576 | |
| 577 | /*************************************************************//** |
Tries to merge N consecutive pages, starting from the given block.
Skips space 0. Only considers leaf pages.
This function first loads all N pages into memory, then for each of
the pages other than the first page, it tries to move as many records
as possible to the left sibling to keep the left sibling full. During
the process, if any page becomes empty, that page will be removed from
the level list. Record locks, the adaptive hash index, and node pointers
are updated after page reorganization.
@return pointer to the last block processed, or NULL if the end of the index
was reached */
| 587 | UNIV_INTERN |
| 588 | buf_block_t* |
| 589 | btr_defragment_n_pages( |
| 590 | buf_block_t* block, /*!< in: starting block for defragmentation */ |
| 591 | dict_index_t* index, /*!< in: index tree */ |
| 592 | uint n_pages,/*!< in: number of pages to defragment */ |
| 593 | mtr_t* mtr) /*!< in/out: mini-transaction */ |
| 594 | { |
	/* We will need to load the (n+1)th block as well: if the last of the
	n pages is freed, the prev_page_no of the block that follows it must
	be updated. */
| 597 | buf_block_t* blocks[BTR_DEFRAGMENT_MAX_N_PAGES + 1]; |
| 598 | page_t* first_page; |
| 599 | buf_block_t* current_block; |
| 600 | ulint total_data_size = 0; |
| 601 | ulint total_n_recs = 0; |
| 602 | ulint data_size_per_rec; |
| 603 | ulint optimal_page_size; |
| 604 | ulint reserved_space; |
| 605 | ulint max_data_size = 0; |
| 606 | uint n_defragmented = 0; |
| 607 | uint n_new_slots; |
| 608 | mem_heap_t* heap; |
| 609 | ibool end_of_index = FALSE; |
| 610 | |
| 611 | /* It doesn't make sense to call this function with n_pages = 1. */ |
| 612 | ut_ad(n_pages > 1); |
| 613 | |
| 614 | if (!page_is_leaf(block->frame)) { |
| 615 | return NULL; |
| 616 | } |
| 617 | |
| 618 | if (!index->table->space || !index->table->space->id) { |
| 619 | /* Ignore space 0. */ |
| 620 | return NULL; |
| 621 | } |
| 622 | |
| 623 | if (n_pages > BTR_DEFRAGMENT_MAX_N_PAGES) { |
| 624 | n_pages = BTR_DEFRAGMENT_MAX_N_PAGES; |
| 625 | } |
| 626 | |
| 627 | first_page = buf_block_get_frame(block); |
| 628 | const page_size_t page_size(index->table->space->flags); |
| 629 | |
| 630 | /* 1. Load the pages and calculate the total data size. */ |
| 631 | blocks[0] = block; |
| 632 | for (uint i = 1; i <= n_pages; i++) { |
| 633 | page_t* page = buf_block_get_frame(blocks[i-1]); |
| 634 | ulint page_no = btr_page_get_next(page, mtr); |
| 635 | total_data_size += page_get_data_size(page); |
| 636 | total_n_recs += page_get_n_recs(page); |
| 637 | if (page_no == FIL_NULL) { |
| 638 | n_pages = i; |
| 639 | end_of_index = TRUE; |
| 640 | break; |
| 641 | } |
| 642 | |
| 643 | blocks[i] = btr_block_get(page_id_t(index->table->space->id, |
| 644 | page_no), page_size, |
| 645 | RW_X_LATCH, index, mtr); |
| 646 | } |
| 647 | |
| 648 | if (n_pages == 1) { |
| 649 | if (!page_has_prev(first_page)) { |
| 650 | /* last page in the index */ |
| 651 | if (dict_index_get_page(index) |
| 652 | == page_get_page_no(first_page)) |
| 653 | return NULL; |
| 654 | /* given page is the last page. |
| 655 | Lift the records to father. */ |
| 656 | btr_lift_page_up(index, block, mtr); |
| 657 | } |
| 658 | return NULL; |
| 659 | } |
| 660 | |
	/* 2. Calculate how many pages the data can fit into. If the page
	count cannot be reduced, return early. */
| 663 | ut_a(total_n_recs != 0); |
| 664 | data_size_per_rec = total_data_size / total_n_recs; |
	// For uncompressed pages, the optimal data size is the free space of
	// an empty page.
| 667 | optimal_page_size = page_get_free_space_of_empty( |
| 668 | page_is_comp(first_page)); |
| 669 | // For compressed pages, we take compression failures into account. |
| 670 | if (page_size.is_compressed()) { |
| 671 | ulint size = 0; |
| 672 | uint i = 0; |
		// We estimate the optimal data size of the index using
		// samples of the data size. These samples are taken when
		// pages fail to compress due to an insertion on the page.
		// We use the average of all samples we have as the estimate.
		// Different pages of the same index vary in compressibility;
		// the average gives a good enough estimate.
| 679 | for (;i < STAT_DEFRAG_DATA_SIZE_N_SAMPLE; i++) { |
| 680 | if (index->stat_defrag_data_size_sample[i] == 0) { |
| 681 | break; |
| 682 | } |
| 683 | size += index->stat_defrag_data_size_sample[i]; |
| 684 | } |
| 685 | if (i != 0) { |
| 686 | size /= i; |
| 687 | optimal_page_size = ut_min(optimal_page_size, size); |
| 688 | } |
| 689 | max_data_size = optimal_page_size; |
| 690 | } |
| 691 | |
| 692 | reserved_space = ut_min((ulint)(optimal_page_size |
| 693 | * (1 - srv_defragment_fill_factor)), |
| 694 | (data_size_per_rec |
| 695 | * srv_defragment_fill_factor_n_recs)); |
| 696 | optimal_page_size -= reserved_space; |
| 697 | n_new_slots = uint((total_data_size + optimal_page_size - 1) |
| 698 | / optimal_page_size); |
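	/* Worked example of the two computations above (hypothetical numbers,
	assuming srv_defragment_fill_factor == 0.9 and
	srv_defragment_fill_factor_n_recs == 20): with optimal_page_size
	== 16000 and data_size_per_rec == 100, reserved_space
	= min(16000 * 0.1, 100 * 20) = 1600, the usable size per page drops
	to 14400, and total_data_size == 100000 needs
	n_new_slots == ceil(100000 / 14400) == 7 pages. */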
| 699 | if (n_new_slots >= n_pages) { |
| 700 | /* Can't defragment. */ |
| 701 | if (end_of_index) |
| 702 | return NULL; |
| 703 | return blocks[n_pages-1]; |
| 704 | } |
| 705 | |
| 706 | /* 3. Defragment pages. */ |
| 707 | heap = mem_heap_create(256); |
| 708 | // First defragmented page will be the first page. |
| 709 | current_block = blocks[0]; |
| 710 | // Start from the second page. |
| 711 | for (uint i = 1; i < n_pages; i ++) { |
| 712 | buf_block_t* new_block = btr_defragment_merge_pages( |
| 713 | index, blocks[i], current_block, page_size, |
| 714 | reserved_space, &max_data_size, heap, mtr); |
| 715 | if (new_block != current_block) { |
| 716 | n_defragmented ++; |
| 717 | current_block = new_block; |
| 718 | } |
| 719 | } |
| 720 | mem_heap_free(heap); |
| 721 | n_defragmented ++; |
| 722 | my_atomic_addlint( |
| 723 | &btr_defragment_count, 1); |
| 724 | if (n_pages == n_defragmented) { |
| 725 | my_atomic_addlint( |
| 726 | &btr_defragment_failures, 1); |
| 727 | } else { |
| 728 | index->stat_defrag_n_pages_freed += (n_pages - n_defragmented); |
| 729 | } |
| 730 | if (end_of_index) |
| 731 | return NULL; |
| 732 | return current_block; |
| 733 | } |
| 734 | |
| 735 | /** Whether btr_defragment_thread is active */ |
| 736 | bool btr_defragment_thread_active; |
| 737 | |
| 738 | /** Merge consecutive b-tree pages into fewer pages to defragment indexes */ |
| 739 | extern "C" UNIV_INTERN |
| 740 | os_thread_ret_t |
| 741 | DECLARE_THREAD(btr_defragment_thread)(void*) |
| 742 | { |
| 743 | btr_pcur_t* pcur; |
| 744 | btr_cur_t* cursor; |
| 745 | dict_index_t* index; |
| 746 | mtr_t mtr; |
| 747 | buf_block_t* first_block; |
| 748 | buf_block_t* last_block; |
| 749 | |
| 750 | while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { |
| 751 | ut_ad(btr_defragment_thread_active); |
| 752 | |
| 753 | /* If defragmentation is disabled, sleep before |
| 754 | checking whether it's enabled. */ |
| 755 | if (!srv_defragment) { |
| 756 | os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS); |
| 757 | continue; |
| 758 | } |
		/* The following call won't remove the item from the work
		queue. We only get a pointer to it to work on. This makes
		sure that when a user issues a kill command, all indices are
		still in the work queue to be searched. It also means that
		the user thread cannot directly remove the item from the
		queue (since we might be using it), so the user thread only
		marks the index as removed. */
| 765 | btr_defragment_item_t* item = btr_defragment_get_item(); |
| 766 | /* If work queue is empty, sleep and check later. */ |
| 767 | if (!item) { |
| 768 | os_thread_sleep(BTR_DEFRAGMENT_SLEEP_IN_USECS); |
| 769 | continue; |
| 770 | } |
		/* If an index is marked as removed, we remove it from the
		work queue. No other thread can be using this item at this
		point, so it is safe to remove it now. */
| 774 | if (item->removed) { |
| 775 | btr_defragment_remove_item(item); |
| 776 | continue; |
| 777 | } |
| 778 | |
| 779 | pcur = item->pcur; |
| 780 | ulonglong now = ut_timer_now(); |
| 781 | ulonglong elapsed = now - item->last_processed; |
| 782 | |
| 783 | if (elapsed < srv_defragment_interval) { |
			/* If we see an index again before the interval
			determined by the configured frequency is reached,
			we just sleep until the interval passes. Since
			defragmentation of all indices queues up on a single
			thread, the indices that follow this one likely do
			not need to sleep again. */
| 790 | os_thread_sleep(((ulint)ut_timer_to_microseconds( |
| 791 | srv_defragment_interval - elapsed))); |
| 792 | } |
| 793 | |
| 794 | now = ut_timer_now(); |
| 795 | mtr_start(&mtr); |
| 796 | cursor = btr_pcur_get_btr_cur(pcur); |
| 797 | index = btr_cur_get_index(cursor); |
| 798 | index->set_modified(mtr); |
| 799 | /* To follow the latching order defined in WL#6326, acquire index->lock X-latch. |
| 800 | This entitles us to acquire page latches in any order for the index. */ |
| 801 | mtr_x_lock(&index->lock, &mtr); |
| 802 | /* This will acquire index->lock SX-latch, which per WL#6363 is allowed |
| 803 | when we are already holding the X-latch. */ |
| 804 | btr_pcur_restore_position(BTR_MODIFY_TREE, pcur, &mtr); |
| 805 | first_block = btr_cur_get_block(cursor); |
| 806 | |
| 807 | last_block = btr_defragment_n_pages(first_block, index, |
| 808 | srv_defragment_n_pages, |
| 809 | &mtr); |
| 810 | if (last_block) { |
| 811 | /* If we haven't reached the end of the index, |
| 812 | place the cursor on the last record of last page, |
| 813 | store the cursor position, and put back in queue. */ |
| 814 | page_t* last_page = buf_block_get_frame(last_block); |
| 815 | rec_t* rec = page_rec_get_prev( |
| 816 | page_get_supremum_rec(last_page)); |
| 817 | ut_a(page_rec_is_user_rec(rec)); |
| 818 | page_cur_position(rec, last_block, |
| 819 | btr_cur_get_page_cur(cursor)); |
| 820 | btr_pcur_store_position(pcur, &mtr); |
| 821 | mtr_commit(&mtr); |
| 822 | /* Update the last_processed time of this index. */ |
| 823 | item->last_processed = now; |
| 824 | } else { |
| 825 | dberr_t err = DB_SUCCESS; |
| 826 | mtr_commit(&mtr); |
| 827 | /* Reaching the end of the index. */ |
| 828 | dict_stats_empty_defrag_stats(index); |
| 829 | err = dict_stats_save_defrag_stats(index); |
| 830 | if (err != DB_SUCCESS) { |
| 831 | ib::error() << "Saving defragmentation stats for table " |
| 832 | << index->table->name.m_name |
| 833 | << " index " << index->name() |
| 834 | << " failed with error " << err; |
| 835 | } else { |
| 836 | err = dict_stats_save_defrag_summary(index); |
| 837 | |
| 838 | if (err != DB_SUCCESS) { |
| 839 | ib::error() << "Saving defragmentation summary for table " |
| 840 | << index->table->name.m_name |
| 841 | << " index " << index->name() |
| 842 | << " failed with error " << err; |
| 843 | } |
| 844 | } |
| 845 | |
| 846 | btr_defragment_remove_item(item); |
| 847 | } |
| 848 | } |
| 849 | |
| 850 | btr_defragment_thread_active = false; |
| 851 | os_thread_exit(); |
| 852 | OS_THREAD_DUMMY_RETURN; |
| 853 | } |
| 854 | |