1/*****************************************************************************
2
3Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2013, 2018, MariaDB Corporation.
5Copyright (c) 2013, 2014, Fusion-io
6
7This program is free software; you can redistribute it and/or modify it under
8the terms of the GNU General Public License as published by the Free Software
9Foundation; version 2 of the License.
10
11This program is distributed in the hope that it will be useful, but WITHOUT
12ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
14
15You should have received a copy of the GNU General Public License along with
16this program; if not, write to the Free Software Foundation, Inc.,
1751 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
18
19*****************************************************************************/
20
21/**************************************************//**
22@file buf/buf0flu.cc
23The database buffer buf_pool flush algorithm
24
25Created 11/11/1995 Heikki Tuuri
26*******************************************************/
27
28#include "ha_prototypes.h"
29#include <mysql/service_thd_wait.h>
30#include <my_dbug.h>
31#include <sql_class.h>
32
33#include "buf0flu.h"
34#include "buf0buf.h"
35#include "buf0checksum.h"
36#include "srv0start.h"
37#include "srv0srv.h"
38#include "page0zip.h"
39#include "ut0byte.h"
40#include "page0page.h"
41#include "fil0fil.h"
42#include "buf0lru.h"
43#include "buf0rea.h"
44#include "ibuf0ibuf.h"
45#include "log0log.h"
46#include "os0file.h"
47#include "trx0sys.h"
48#include "srv0mon.h"
49#include "fsp0sysspace.h"
50#include "ut0stage.h"
51#include "fil0pagecompress.h"
52#ifdef UNIV_LINUX
53/* include defs for CPU time priority settings */
54#include <unistd.h>
55#include <sys/syscall.h>
56#include <sys/time.h>
57#include <sys/resource.h>
58static const int buf_flush_page_cleaner_priority = -20;
59#endif /* UNIV_LINUX */
60
61/** Sleep time in microseconds for loop waiting for the oldest
62modification lsn */
63static const ulint buf_flush_wait_flushed_sleep_time = 10000;
64
65#include <my_service_manager.h>
66
/** Number of pages flushed through non-flush_list flushes. */
68static ulint buf_lru_flush_page_count = 0;
69
70/** Flag indicating if the page_cleaner is in active state. This flag
71is set to TRUE by the page_cleaner thread when it is spawned and is set
back to FALSE at shutdown by the page_cleaner as well. Therefore there is
no need to protect it with a mutex. It is only ever read by the thread
doing the shutdown. */
75bool buf_page_cleaner_is_active;
76
77/** Factor for scan length to determine n_pages for intended oldest LSN
78progress */
79static ulint buf_flush_lsn_scan_factor = 3;
80
81/** Average redo generation rate */
82static lsn_t lsn_avg_rate = 0;
83
84/** Target oldest LSN for the requested flush_sync */
85static lsn_t buf_flush_sync_lsn = 0;
86
87#ifdef UNIV_PFS_THREAD
88mysql_pfs_key_t page_cleaner_thread_key;
89#endif /* UNIV_PFS_THREAD */
90
91/** Event to synchronise with the flushing. */
92os_event_t buf_flush_event;
93
94/** State for page cleaner array slot */
95enum page_cleaner_state_t {
	/** No flushing has been requested yet.
	Moved from FINISHED by the coordinator. */
98 PAGE_CLEANER_STATE_NONE = 0,
99 /** Requested but not started flushing.
100 Moved from NONE by the coordinator. */
101 PAGE_CLEANER_STATE_REQUESTED,
	/** Flushing is ongoing.
103 Moved from REQUESTED by the worker. */
104 PAGE_CLEANER_STATE_FLUSHING,
105 /** Flushing was finished.
106 Moved from FLUSHING by the worker. */
107 PAGE_CLEANER_STATE_FINISHED
108};
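
/* The states above form a cycle for each slot:
NONE -> REQUESTED (moved by the coordinator),
REQUESTED -> FLUSHING -> FINISHED (moved by the worker),
FINISHED -> NONE (moved by the coordinator). */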
109
110/** Page cleaner request state for each buffer pool instance */
111struct page_cleaner_slot_t {
	page_cleaner_state_t	state;	/*!< state of the request,
					protected by page_cleaner_t::mutex.
					Once the worker thread has got the
					slot and set it to
					PAGE_CLEANER_STATE_FLUSHING,
					n_flushed_lru and n_flushed_list can
					be updated only by that worker */
118 /* This value is set during state==PAGE_CLEANER_STATE_NONE */
119 ulint n_pages_requested;
120 /*!< number of requested pages
121 for the slot */
	/* These values are updated during state==PAGE_CLEANER_STATE_FLUSHING,
	and committed with state==PAGE_CLEANER_STATE_FINISHED.
	Their consistency is protected by the 'state'. */
125 ulint n_flushed_lru;
126 /*!< number of flushed pages
127 by LRU scan flushing */
128 ulint n_flushed_list;
129 /*!< number of flushed pages
130 by flush_list flushing */
131 bool succeeded_list;
132 /*!< true if flush_list flushing
133 succeeded. */
134 ulint flush_lru_time;
135 /*!< elapsed time for LRU flushing */
136 ulint flush_list_time;
137 /*!< elapsed time for flush_list
138 flushing */
	ulint			flush_lru_pass;
					/*!< number of LRU flushing passes
					attempted */
	ulint			flush_list_pass;
					/*!< number of flush_list flushing
					passes attempted */
144};
145
146/** Page cleaner structure common for all threads */
147struct page_cleaner_t {
148 ib_mutex_t mutex; /*!< mutex to protect whole of
149 page_cleaner_t struct and
150 page_cleaner_slot_t slots. */
151 os_event_t is_requested; /*!< event to activate worker
152 threads. */
153 os_event_t is_finished; /*!< event to signal that all
154 slots were finished. */
155 os_event_t is_started; /*!< event to signal that
156 thread is started/exiting */
157 volatile ulint n_workers; /*!< number of worker threads
158 in existence */
	bool			requested;	/*!< true if flushing of pages
						has been requested */
161 lsn_t lsn_limit; /*!< upper limit of LSN to be
162 flushed */
163 ulint n_slots; /*!< total number of slots */
164 ulint n_slots_requested;
165 /*!< number of slots
166 in the state
167 PAGE_CLEANER_STATE_REQUESTED */
168 ulint n_slots_flushing;
169 /*!< number of slots
170 in the state
171 PAGE_CLEANER_STATE_FLUSHING */
172 ulint n_slots_finished;
173 /*!< number of slots
174 in the state
175 PAGE_CLEANER_STATE_FINISHED */
176 ulint flush_time; /*!< elapsed time to flush
177 requests for all slots */
	ulint			flush_pass;	/*!< number of passes needed to
						finish flushing the requests
						for all slots */
180 page_cleaner_slot_t slots[MAX_BUFFER_POOLS];
	bool			is_running;	/*!< false if a shutdown is
						being attempted */
183
184#ifdef UNIV_DEBUG
185 ulint n_disabled_debug;
186 /*<! how many of pc threads
					/*!< number of page cleaner threads
					that have been disabled */
189};
190
191static page_cleaner_t page_cleaner;
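
/* Illustrative sketch (not compiled in): roughly how a worker thread could
claim one request slot using the fields above. The helper name and the exact
loop shape are assumptions for illustration only; the real coordination also
involves the is_requested/is_finished events. */
#if 0
static void page_cleaner_claim_slot_sketch()
{
	mutex_enter(&page_cleaner.mutex);

	for (ulint i = 0; i < page_cleaner.n_slots; i++) {
		page_cleaner_slot_t*	slot = &page_cleaner.slots[i];

		if (slot->state == PAGE_CLEANER_STATE_REQUESTED) {
			/* REQUESTED -> FLUSHING is done by the worker. */
			slot->state = PAGE_CLEANER_STATE_FLUSHING;
			page_cleaner.n_slots_requested--;
			page_cleaner.n_slots_flushing++;
			mutex_exit(&page_cleaner.mutex);

			/* ... flush up to slot->n_pages_requested pages
			of buffer pool instance i, then mark the slot
			PAGE_CLEANER_STATE_FINISHED ... */
			return;
		}
	}

	mutex_exit(&page_cleaner.mutex);
}
#endif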
192
193#ifdef UNIV_DEBUG
194my_bool innodb_page_cleaner_disabled_debug;
195#endif /* UNIV_DEBUG */
196
/** If the LRU list of a buffer pool instance is shorter than this, LRU
eviction should not happen. This is because when we do LRU flushing we also
put the blocks on the free list. If the LRU list is very small, we can end
up thrashing. */
201#define BUF_LRU_MIN_LEN 256
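
/* For example, the LRU batch in buf_flush_LRU_list_batch() below keeps
scanning only while UT_LIST_GET_LEN(buf_pool->LRU) stays above
BUF_LRU_MIN_LEN. */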
202
203/* @} */
204
205/******************************************************************//**
Increase the flush_list size in bytes by the physical page size. */
207static inline
208void
209incr_flush_list_size_in_bytes(
210/*==========================*/
211 buf_block_t* block, /*!< in: control block */
212 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
213{
214 ut_ad(buf_flush_list_mutex_own(buf_pool));
215
216 buf_pool->stat.flush_list_bytes += block->page.size.physical();
217
218 ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
219}
220
221#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
222/******************************************************************//**
223Validates the flush list.
224@return TRUE if ok */
225static
226ibool
227buf_flush_validate_low(
228/*===================*/
229 buf_pool_t* buf_pool); /*!< in: Buffer pool instance */
230
231/******************************************************************//**
232Validates the flush list some of the time.
233@return TRUE if ok or the check was skipped */
234static
235ibool
236buf_flush_validate_skip(
237/*====================*/
238 buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
239{
240/** Try buf_flush_validate_low() every this many times */
241# define BUF_FLUSH_VALIDATE_SKIP 23
242
243 /** The buf_flush_validate_low() call skip counter.
244 Use a signed type because of the race condition below. */
245 static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
246
247 /* There is a race condition below, but it does not matter,
248 because this call is only for heuristic purposes. We want to
249 reduce the call frequency of the costly buf_flush_validate_low()
250 check in debug builds. */
251 if (--buf_flush_validate_count > 0) {
252 return(TRUE);
253 }
254
255 buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
256 return(buf_flush_validate_low(buf_pool));
257}
258#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
259
260/******************************************************************//**
Inserts a block into the flush_rbt and returns a pointer to its
predecessor, or NULL if there is no predecessor. The ordering is maintained
263on the basis of the <oldest_modification, space, offset> key.
264@return pointer to the predecessor or NULL if no predecessor. */
265static
266buf_page_t*
267buf_flush_insert_in_flush_rbt(
268/*==========================*/
269 buf_page_t* bpage) /*!< in: bpage to be inserted. */
270{
271 const ib_rbt_node_t* c_node;
272 const ib_rbt_node_t* p_node;
273 buf_page_t* prev = NULL;
274 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
275
276 ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
277 ut_ad(buf_flush_list_mutex_own(buf_pool));
278
279 /* Insert this buffer into the rbt. */
280 c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
281 ut_a(c_node != NULL);
282
283 /* Get the predecessor. */
284 p_node = rbt_prev(buf_pool->flush_rbt, c_node);
285
286 if (p_node != NULL) {
287 buf_page_t** value;
288 value = rbt_value(buf_page_t*, p_node);
289 prev = *value;
290 ut_a(prev != NULL);
291 }
292
293 return(prev);
294}
295
296/*********************************************************//**
297Delete a bpage from the flush_rbt. */
298static
299void
300buf_flush_delete_from_flush_rbt(
301/*============================*/
302 buf_page_t* bpage) /*!< in: bpage to be removed. */
303{
304#ifdef UNIV_DEBUG
305 ibool ret = FALSE;
306#endif /* UNIV_DEBUG */
307 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
308
309 ut_ad(buf_flush_list_mutex_own(buf_pool));
310
311#ifdef UNIV_DEBUG
312 ret =
313#endif /* UNIV_DEBUG */
314 rbt_delete(buf_pool->flush_rbt, &bpage);
315
316 ut_ad(ret);
317}
318
319/*****************************************************************//**
320Compare two modified blocks in the buffer pool. The key for comparison
321is:
322key = <oldest_modification, space, offset>
This comparison is used to maintain the ordering of blocks in the
324buf_pool->flush_rbt.
325Note that for the purpose of flush_rbt, we only need to order blocks
326on the oldest_modification. The other two fields are used to uniquely
327identify the blocks.
328@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
329static
330int
331buf_flush_block_cmp(
332/*================*/
333 const void* p1, /*!< in: block1 */
334 const void* p2) /*!< in: block2 */
335{
336 int ret;
337 const buf_page_t* b1 = *(const buf_page_t**) p1;
338 const buf_page_t* b2 = *(const buf_page_t**) p2;
339
340 ut_ad(b1 != NULL);
341 ut_ad(b2 != NULL);
342
343#ifdef UNIV_DEBUG
344 buf_pool_t* buf_pool = buf_pool_from_bpage(b1);
345#endif /* UNIV_DEBUG */
346
347 ut_ad(buf_flush_list_mutex_own(buf_pool));
348
349 ut_ad(b1->in_flush_list);
350 ut_ad(b2->in_flush_list);
351
352 if (b2->oldest_modification > b1->oldest_modification) {
353 return(1);
354 } else if (b2->oldest_modification < b1->oldest_modification) {
355 return(-1);
356 }
357
358 /* If oldest_modification is same then decide on the space. */
359 ret = (int)(b2->id.space() - b1->id.space());
360
361 /* Or else decide ordering on the page number. */
362 return(ret ? ret : (int) (b2->id.page_no() - b1->id.page_no()));
363}
364
365/********************************************************************//**
366Initialize the red-black tree to speed up insertions into the flush_list
during the recovery process. Should be called at the start of the recovery
process, before any page has been read or written. */
369void
370buf_flush_init_flush_rbt(void)
371/*==========================*/
372{
373 ulint i;
374
375 for (i = 0; i < srv_buf_pool_instances; i++) {
376 buf_pool_t* buf_pool;
377
378 buf_pool = buf_pool_from_array(i);
379
380 buf_flush_list_mutex_enter(buf_pool);
381
382 ut_ad(buf_pool->flush_rbt == NULL);
383
384 /* Create red black tree for speedy insertions in flush list. */
385 buf_pool->flush_rbt = rbt_create(
386 sizeof(buf_page_t*), buf_flush_block_cmp);
387
388 buf_flush_list_mutex_exit(buf_pool);
389 }
390}
391
392/********************************************************************//**
393Frees up the red-black tree. */
394void
395buf_flush_free_flush_rbt(void)
396/*==========================*/
397{
398 ulint i;
399
400 for (i = 0; i < srv_buf_pool_instances; i++) {
401 buf_pool_t* buf_pool;
402
403 buf_pool = buf_pool_from_array(i);
404
405 buf_flush_list_mutex_enter(buf_pool);
406
407#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
408 ut_a(buf_flush_validate_low(buf_pool));
409#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
410
411 rbt_free(buf_pool->flush_rbt);
412 buf_pool->flush_rbt = NULL;
413
414 buf_flush_list_mutex_exit(buf_pool);
415 }
416}
417
418/********************************************************************//**
419Inserts a modified block into the flush list. */
420void
421buf_flush_insert_into_flush_list(
422/*=============================*/
423 buf_pool_t* buf_pool, /*!< buffer pool instance */
424 buf_block_t* block, /*!< in/out: block which is modified */
425 lsn_t lsn) /*!< in: oldest modification */
426{
427 ut_ad(!buf_pool_mutex_own(buf_pool));
428 ut_ad(log_flush_order_mutex_own());
429 ut_ad(buf_page_mutex_own(block));
430
431 buf_flush_list_mutex_enter(buf_pool);
432
433 ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
434 || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
435 <= lsn));
436
437 /* If we are in the recovery then we need to update the flush
438 red-black tree as well. */
439 if (buf_pool->flush_rbt != NULL) {
440 buf_flush_list_mutex_exit(buf_pool);
441 buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
442 return;
443 }
444
445 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
446 ut_ad(!block->page.in_flush_list);
447
448 ut_d(block->page.in_flush_list = TRUE);
449 block->page.oldest_modification = lsn;
450
451 UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
452
453 incr_flush_list_size_in_bytes(block, buf_pool);
454
455#ifdef UNIV_DEBUG_VALGRIND
456 void* p;
457
458 if (block->page.size.is_compressed()) {
459 p = block->page.zip.data;
460 } else {
461 p = block->frame;
462 }
463
464 UNIV_MEM_ASSERT_RW(p, block->page.size.physical());
465#endif /* UNIV_DEBUG_VALGRIND */
466
467#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
468 ut_a(buf_flush_validate_skip(buf_pool));
469#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
470
471 buf_flush_list_mutex_exit(buf_pool);
472}
473
474/********************************************************************//**
475Inserts a modified block into the flush list in the right sorted position.
476This function is used by recovery, because there the modifications do not
477necessarily come in the order of lsn's. */
478void
479buf_flush_insert_sorted_into_flush_list(
480/*====================================*/
481 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
482 buf_block_t* block, /*!< in/out: block which is modified */
483 lsn_t lsn) /*!< in: oldest modification */
484{
485 buf_page_t* prev_b;
486 buf_page_t* b;
487
488 ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
489 ut_ad(!buf_pool_mutex_own(buf_pool));
490 ut_ad(log_flush_order_mutex_own());
491 ut_ad(buf_page_mutex_own(block));
492 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
493
494 buf_flush_list_mutex_enter(buf_pool);
495
496 /* The field in_LRU_list is protected by buf_pool->mutex, which
497 we are not holding. However, while a block is in the flush
	list, it is dirty and cannot be discarded, neither from the
	page_hash nor from the LRU list. At most, the uncompressed
500 page frame of a compressed block may be discarded or created
501 (copying the block->page to or from a buf_page_t that is
502 dynamically allocated from buf_buddy_alloc()). Because those
503 transitions hold block->mutex and the flush list mutex (via
504 buf_flush_relocate_on_flush_list()), there is no possibility
505 of a race condition in the assertions below. */
506 ut_ad(block->page.in_LRU_list);
507 ut_ad(block->page.in_page_hash);
508 /* buf_buddy_block_register() will take a block in the
509 BUF_BLOCK_MEMORY state, not a file page. */
510 ut_ad(!block->page.in_zip_hash);
511
512 ut_ad(!block->page.in_flush_list);
513 ut_d(block->page.in_flush_list = TRUE);
514 block->page.oldest_modification = lsn;
515
516#ifdef UNIV_DEBUG_VALGRIND
517 void* p;
518
519 if (block->page.size.is_compressed()) {
520 p = block->page.zip.data;
521 } else {
522 p = block->frame;
523 }
524
525 UNIV_MEM_ASSERT_RW(p, block->page.size.physical());
526#endif /* UNIV_DEBUG_VALGRIND */
527
528 prev_b = NULL;
529
530 /* For the most part when this function is called the flush_rbt
531 should not be NULL. In a very rare boundary case it is possible
532 that the flush_rbt has already been freed by the recovery thread
533 before the last page was hooked up in the flush_list by the
534 io-handler thread. In that case we'll just do a simple
535 linear search in the else block. */
536 if (buf_pool->flush_rbt != NULL) {
537
538 prev_b = buf_flush_insert_in_flush_rbt(&block->page);
539
540 } else {
541
542 b = UT_LIST_GET_FIRST(buf_pool->flush_list);
543
544 while (b != NULL && b->oldest_modification
545 > block->page.oldest_modification) {
546
547 ut_ad(b->in_flush_list);
548 prev_b = b;
549 b = UT_LIST_GET_NEXT(list, b);
550 }
551 }
552
553 if (prev_b == NULL) {
554 UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
555 } else {
556 UT_LIST_INSERT_AFTER(buf_pool->flush_list, prev_b, &block->page);
557 }
558
559 incr_flush_list_size_in_bytes(block, buf_pool);
560
561#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
562 ut_a(buf_flush_validate_low(buf_pool));
563#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
564
565 buf_flush_list_mutex_exit(buf_pool);
566}
567
568/********************************************************************//**
569Returns TRUE if the file page block is immediately suitable for replacement,
i.e., the transition FILE_PAGE => NOT_USED is allowed.
571@return TRUE if can replace immediately */
572ibool
573buf_flush_ready_for_replace(
574/*========================*/
575 buf_page_t* bpage) /*!< in: buffer control block, must be
576 buf_page_in_file(bpage) and in the LRU list */
577{
578#ifdef UNIV_DEBUG
579 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
580 ut_ad(buf_pool_mutex_own(buf_pool));
581#endif /* UNIV_DEBUG */
582 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
583 ut_ad(bpage->in_LRU_list);
584
585 if (buf_page_in_file(bpage)) {
586
587 return(bpage->oldest_modification == 0
588 && bpage->buf_fix_count == 0
589 && buf_page_get_io_fix(bpage) == BUF_IO_NONE);
590 }
591
592 ib::fatal() << "Buffer block " << bpage << " state " << bpage->state
593 << " in the LRU list!";
594
595 return(FALSE);
596}
597
598/********************************************************************//**
599Returns true if the block is modified and ready for flushing.
600@return true if can flush immediately */
601bool
602buf_flush_ready_for_flush(
603/*======================*/
604 buf_page_t* bpage, /*!< in: buffer control block, must be
605 buf_page_in_file(bpage) */
606 buf_flush_t flush_type)/*!< in: type of flush */
607{
608#ifdef UNIV_DEBUG
609 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
610 ut_ad(buf_pool_mutex_own(buf_pool));
611#endif /* UNIV_DEBUG */
612
613 ut_a(buf_page_in_file(bpage));
614 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
615 ut_ad(flush_type < BUF_FLUSH_N_TYPES);
616
617 if (bpage->oldest_modification == 0
618 || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
619 return(false);
620 }
621
622 ut_ad(bpage->in_flush_list);
623
624 switch (flush_type) {
625 case BUF_FLUSH_LIST:
626 case BUF_FLUSH_LRU:
627 case BUF_FLUSH_SINGLE_PAGE:
628 return(true);
629
630 case BUF_FLUSH_N_TYPES:
631 break;
632 }
633
634 ut_error;
635 return(false);
636}
637
638/********************************************************************//**
639Remove a block from the flush list of modified blocks. */
640void
641buf_flush_remove(
642/*=============*/
643 buf_page_t* bpage) /*!< in: pointer to the block in question */
644{
645 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
646
647#if 0 // FIXME: Rate-limit the output. Move this to the page cleaner?
648 if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)) {
649 service_manager_extend_timeout(
650 INNODB_EXTEND_TIMEOUT_INTERVAL,
651 "Flush and remove page with tablespace id %u"
652 ", Poolid " ULINTPF ", flush list length " ULINTPF,
653 bpage->space, buf_pool->instance_no,
654 UT_LIST_GET_LEN(buf_pool->flush_list));
655 }
656#endif
657
658 ut_ad(buf_pool_mutex_own(buf_pool));
659 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
660 ut_ad(bpage->in_flush_list);
661
662 buf_flush_list_mutex_enter(buf_pool);
663
664 /* Important that we adjust the hazard pointer before removing
665 the bpage from flush list. */
666 buf_pool->flush_hp.adjust(bpage);
667
668 switch (buf_page_get_state(bpage)) {
669 case BUF_BLOCK_POOL_WATCH:
670 case BUF_BLOCK_ZIP_PAGE:
671 /* Clean compressed pages should not be on the flush list */
672 case BUF_BLOCK_NOT_USED:
673 case BUF_BLOCK_READY_FOR_USE:
674 case BUF_BLOCK_MEMORY:
675 case BUF_BLOCK_REMOVE_HASH:
676 ut_error;
677 return;
678 case BUF_BLOCK_ZIP_DIRTY:
679 buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
680 UT_LIST_REMOVE(buf_pool->flush_list, bpage);
681#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
682 buf_LRU_insert_zip_clean(bpage);
683#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
684 break;
685 case BUF_BLOCK_FILE_PAGE:
686 UT_LIST_REMOVE(buf_pool->flush_list, bpage);
687 break;
688 }
689
690 /* If the flush_rbt is active then delete from there as well. */
691 if (buf_pool->flush_rbt != NULL) {
692 buf_flush_delete_from_flush_rbt(bpage);
693 }
694
695 /* Must be done after we have removed it from the flush_rbt
696 because we assert on in_flush_list in comparison function. */
697 ut_d(bpage->in_flush_list = FALSE);
698
699 buf_pool->stat.flush_list_bytes -= bpage->size.physical();
700
701 bpage->oldest_modification = 0;
702
703#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
704 ut_a(buf_flush_validate_skip(buf_pool));
705#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
706
	/* If there is an observer that wants to know whether the asynchronous
	flushing was done, notify it. */
709 if (bpage->flush_observer != NULL) {
710 bpage->flush_observer->notify_remove(buf_pool, bpage);
711
712 bpage->flush_observer = NULL;
713 }
714
715 buf_flush_list_mutex_exit(buf_pool);
716}
717
718/*******************************************************************//**
719Relocates a buffer control block on the flush_list.
720Note that it is assumed that the contents of bpage have already been
721copied to dpage.
722IMPORTANT: When this function is called bpage and dpage are not
723exact copies of each other. For example, they both will have different
724::state. Also the ::list pointers in dpage may be stale. We need to
725use the current list node (bpage) to do the list manipulation because
726the list pointers could have changed between the time that we copied
727the contents of bpage to the dpage and the flush list manipulation
728below. */
729void
730buf_flush_relocate_on_flush_list(
731/*=============================*/
732 buf_page_t* bpage, /*!< in/out: control block being moved */
733 buf_page_t* dpage) /*!< in/out: destination block */
734{
735 buf_page_t* prev;
736 buf_page_t* prev_b = NULL;
737 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
738
739 ut_ad(buf_pool_mutex_own(buf_pool));
740 /* Must reside in the same buffer pool. */
741 ut_ad(buf_pool == buf_pool_from_bpage(dpage));
742
743 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
744
745 buf_flush_list_mutex_enter(buf_pool);
746
747 /* FIXME: At this point we have both buf_pool and flush_list
748 mutexes. Theoretically removal of a block from flush list is
749 only covered by flush_list mutex but currently we do
750 have buf_pool mutex in buf_flush_remove() therefore this block
751 is guaranteed to be in the flush list. We need to check if
752 this will work without the assumption of block removing code
753 having the buf_pool mutex. */
754 ut_ad(bpage->in_flush_list);
755 ut_ad(dpage->in_flush_list);
756
757 /* If recovery is active we must swap the control blocks in
758 the flush_rbt as well. */
759 if (buf_pool->flush_rbt != NULL) {
760 buf_flush_delete_from_flush_rbt(bpage);
761 prev_b = buf_flush_insert_in_flush_rbt(dpage);
762 }
763
764 /* Important that we adjust the hazard pointer before removing
765 the bpage from the flush list. */
766 buf_pool->flush_hp.adjust(bpage);
767
768 /* Must be done after we have removed it from the flush_rbt
769 because we assert on in_flush_list in comparison function. */
770 ut_d(bpage->in_flush_list = FALSE);
771
772 prev = UT_LIST_GET_PREV(list, bpage);
773 UT_LIST_REMOVE(buf_pool->flush_list, bpage);
774
775 if (prev) {
776 ut_ad(prev->in_flush_list);
777 UT_LIST_INSERT_AFTER( buf_pool->flush_list, prev, dpage);
778 } else {
779 UT_LIST_ADD_FIRST(buf_pool->flush_list, dpage);
780 }
781
782 /* Just an extra check. Previous in flush_list
783 should be the same control block as in flush_rbt. */
784 ut_a(buf_pool->flush_rbt == NULL || prev_b == prev);
785
786#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
787 ut_a(buf_flush_validate_low(buf_pool));
788#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
789
790 buf_flush_list_mutex_exit(buf_pool);
791}
792
793/** Update the flush system data structures when a write is completed.
794@param[in,out] bpage flushed page
795@param[in] dblwr whether the doublewrite buffer was used */
796void buf_flush_write_complete(buf_page_t* bpage, bool dblwr)
797{
798 buf_flush_t flush_type;
799 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
800
801 ut_ad(bpage);
802
803 buf_flush_remove(bpage);
804
805 flush_type = buf_page_get_flush_type(bpage);
806 buf_pool->n_flush[flush_type]--;
807 ut_ad(buf_pool->n_flush[flush_type] != ULINT_MAX);
808
809 ut_ad(buf_pool_mutex_own(buf_pool));
810
811 if (buf_pool->n_flush[flush_type] == 0
812 && buf_pool->init_flush[flush_type] == FALSE) {
813
814 /* The running flush batch has ended */
815
816 os_event_set(buf_pool->no_flush[flush_type]);
817 }
818
819 if (dblwr) {
820 buf_dblwr_update(bpage, flush_type);
821 }
822}
823
/** Calculate the checksum of a page of a compressed table and update
825the page.
826@param[in,out] page page to update
827@param[in] size compressed page size
828@param[in] lsn LSN to stamp on the page */
829void
830buf_flush_update_zip_checksum(
831 buf_frame_t* page,
832 ulint size,
833 lsn_t lsn)
834{
835 ut_a(size > 0);
836
837 const uint32_t checksum = page_zip_calc_checksum(
838 page, size,
839 static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm));
840
841 mach_write_to_8(page + FIL_PAGE_LSN, lsn);
842 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
843}
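
/* Illustrative sketch (not compiled in): reading back the fields stamped by
buf_flush_update_zip_checksum() above. A sketch only; real verification of
compressed pages is done by page_zip_verify_checksum(). */
#if 0
static void buf_flush_zip_stamp_readback_sketch(const byte* page, lsn_t lsn)
{
	/* The LSN written to FIL_PAGE_LSN above can be read back as-is. */
	ut_ad(mach_read_from_8(page + FIL_PAGE_LSN) == lsn);

	/* The stored checksum lives in FIL_PAGE_SPACE_OR_CHKSUM. */
	const uint32_t	stored = static_cast<uint32_t>(
		mach_read_from_4(page + FIL_PAGE_SPACE_OR_CHKSUM));
	(void) stored;
}
#endif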
844
845/** Initialize a page for writing to the tablespace.
846@param[in] block buffer block; NULL if bypassing the buffer pool
847@param[in,out] page page frame
848@param[in,out] page_zip_ compressed page, or NULL if uncompressed
849@param[in] newest_lsn newest modification LSN to the page */
850void
851buf_flush_init_for_writing(
852 const buf_block_t* block,
853 byte* page,
854 void* page_zip_,
855 lsn_t newest_lsn)
856{
857 ut_ad(block == NULL || block->frame == page);
858 ut_ad(block == NULL || page_zip_ == NULL
859 || &block->page.zip == page_zip_);
860 ut_ad(page);
861
862 if (page_zip_) {
863 page_zip_des_t* page_zip;
864 ulint size;
865
866 page_zip = static_cast<page_zip_des_t*>(page_zip_);
867 size = page_zip_get_size(page_zip);
868
869 ut_ad(size);
870 ut_ad(ut_is_2pow(size));
871 ut_ad(size <= UNIV_ZIP_SIZE_MAX);
872
873 switch (fil_page_get_type(page)) {
874 case FIL_PAGE_TYPE_ALLOCATED:
875 case FIL_PAGE_INODE:
876 case FIL_PAGE_IBUF_BITMAP:
877 case FIL_PAGE_TYPE_FSP_HDR:
878 case FIL_PAGE_TYPE_XDES:
879 /* These are essentially uncompressed pages. */
880 memcpy(page_zip->data, page, size);
881 /* fall through */
882 case FIL_PAGE_TYPE_ZBLOB:
883 case FIL_PAGE_TYPE_ZBLOB2:
884 case FIL_PAGE_INDEX:
885 case FIL_PAGE_RTREE:
886
887 buf_flush_update_zip_checksum(
888 page_zip->data, size, newest_lsn);
889
890 return;
891 }
892
893 ib::error() << "The compressed page to be written"
894 " seems corrupt:";
895 ut_print_buf(stderr, page, size);
896 fputs("\nInnoDB: Possibly older version of the page:", stderr);
897 ut_print_buf(stderr, page_zip->data, size);
898 putc('\n', stderr);
899 ut_error;
900 }
901
902 /* Write the newest modification lsn to the page header and trailer */
903 mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
904
905 mach_write_to_8(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
906 newest_lsn);
907
908 if (block && srv_page_size == 16384) {
909 /* The page type could be garbage in old files
910 created before MySQL 5.5. Such files always
911 had a page size of 16 kilobytes. */
912 ulint page_type = fil_page_get_type(page);
913 ulint reset_type = page_type;
914
915 switch (block->page.id.page_no() % 16384) {
916 case 0:
917 reset_type = block->page.id.page_no() == 0
918 ? FIL_PAGE_TYPE_FSP_HDR
919 : FIL_PAGE_TYPE_XDES;
920 break;
921 case 1:
922 reset_type = FIL_PAGE_IBUF_BITMAP;
923 break;
924 case FSP_TRX_SYS_PAGE_NO:
925 if (block->page.id.page_no()
926 == TRX_SYS_PAGE_NO
927 && block->page.id.space()
928 == TRX_SYS_SPACE) {
929 reset_type = FIL_PAGE_TYPE_TRX_SYS;
930 break;
931 }
932 /* fall through */
933 default:
934 switch (page_type) {
935 case FIL_PAGE_INDEX:
936 case FIL_PAGE_TYPE_INSTANT:
937 case FIL_PAGE_RTREE:
938 case FIL_PAGE_UNDO_LOG:
939 case FIL_PAGE_INODE:
940 case FIL_PAGE_IBUF_FREE_LIST:
941 case FIL_PAGE_TYPE_ALLOCATED:
942 case FIL_PAGE_TYPE_SYS:
943 case FIL_PAGE_TYPE_TRX_SYS:
944 case FIL_PAGE_TYPE_BLOB:
945 case FIL_PAGE_TYPE_ZBLOB:
946 case FIL_PAGE_TYPE_ZBLOB2:
947 break;
948 case FIL_PAGE_TYPE_FSP_HDR:
949 case FIL_PAGE_TYPE_XDES:
950 case FIL_PAGE_IBUF_BITMAP:
951 /* These pages should have
952 predetermined page numbers
953 (see above). */
954 default:
955 reset_type = FIL_PAGE_TYPE_UNKNOWN;
956 break;
957 }
958 }
959
960 if (UNIV_UNLIKELY(page_type != reset_type)) {
961 ib::info()
962 << "Resetting invalid page "
963 << block->page.id << " type "
964 << page_type << " to "
965 << reset_type << " when flushing.";
966 fil_page_set_type(page, reset_type);
967 }
968 }
969
970 uint32_t checksum= 0;
971
972 switch (srv_checksum_algorithm_t(srv_checksum_algorithm)) {
973 case SRV_CHECKSUM_ALGORITHM_INNODB:
974 case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
975 checksum = buf_calc_page_new_checksum(page);
976 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
977 checksum);
978 /* With the InnoDB checksum, we overwrite the first 4 bytes of
979 the end lsn field to store the old formula checksum. Since it
980 depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
981 be calculated after storing the new formula checksum. */
982 checksum = buf_calc_page_old_checksum(page);
983 break;
984 case SRV_CHECKSUM_ALGORITHM_CRC32:
985 case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
986 /* In other cases we write the same checksum to both fields. */
987 checksum = buf_calc_page_crc32(page);
988 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
989 checksum);
990 break;
991 case SRV_CHECKSUM_ALGORITHM_NONE:
992 case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
993 checksum = BUF_NO_CHECKSUM_MAGIC;
994 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
995 checksum);
996 break;
997 /* no default so the compiler will emit a warning if
998 new enum is added and not handled here */
999 }
1000
1001 mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
1002 checksum);
1003}
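
/* Illustrative sketch (not compiled in): how the two checksum fields written
above relate for an uncompressed page stamped with the legacy InnoDB checksum
algorithm. The header field holds the "new" formula checksum, while the first
4 bytes of the trailer hold the "old" formula checksum, which itself covers
the already-written header field. This is a sketch, not a verification
routine. */
#if 0
static void buf_flush_innodb_checksum_fields_sketch(const byte* page)
{
	const uint32_t	header = static_cast<uint32_t>(
		mach_read_from_4(page + FIL_PAGE_SPACE_OR_CHKSUM));
	const uint32_t	trailer = static_cast<uint32_t>(
		mach_read_from_4(page + srv_page_size
				 - FIL_PAGE_END_LSN_OLD_CHKSUM));

	ut_ad(header == buf_calc_page_new_checksum(page));
	ut_ad(trailer == buf_calc_page_old_checksum(page));
}
#endif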
1004
1005/********************************************************************//**
1006Does an asynchronous write of a buffer page. NOTE: in simulated aio and
1007also when the doublewrite buffer is used, we must call
1008buf_dblwr_flush_buffered_writes after we have posted a batch of
1009writes! */
1010static
1011void
1012buf_flush_write_block_low(
1013/*======================*/
1014 buf_page_t* bpage, /*!< in: buffer block to write */
1015 buf_flush_t flush_type, /*!< in: type of flush */
1016 bool sync) /*!< in: true if sync IO request */
1017{
1018 fil_space_t* space = fil_space_acquire_for_io(bpage->id.space());
1019 if (!space) {
1020 return;
1021 }
1022 ut_ad(space->purpose == FIL_TYPE_TEMPORARY
1023 || space->purpose == FIL_TYPE_IMPORT
1024 || space->purpose == FIL_TYPE_TABLESPACE);
1025 ut_ad((space->purpose == FIL_TYPE_TEMPORARY)
1026 == (space == fil_system.temp_space));
1027 page_t* frame = NULL;
1028#ifdef UNIV_DEBUG
1029 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1030 ut_ad(!buf_pool_mutex_own(buf_pool));
1031#endif /* UNIV_DEBUG */
1032
1033 DBUG_PRINT("ib_buf", ("flush %s %u page %u:%u",
1034 sync ? "sync" : "async", (unsigned) flush_type,
1035 bpage->id.space(), bpage->id.page_no()));
1036
1037 ut_ad(buf_page_in_file(bpage));
1038
1039 /* We are not holding buf_pool->mutex or block_mutex here.
1040 Nevertheless, it is safe to access bpage, because it is
1041 io_fixed and oldest_modification != 0. Thus, it cannot be
1042 relocated in the buffer pool or removed from flush_list or
1043 LRU_list. */
1044 ut_ad(!buf_pool_mutex_own(buf_pool));
1045 ut_ad(!buf_flush_list_mutex_own(buf_pool));
1046 ut_ad(!buf_page_get_mutex(bpage)->is_owned());
1047 ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
1048 ut_ad(bpage->oldest_modification != 0);
1049
1050#ifdef UNIV_IBUF_COUNT_DEBUG
1051 ut_a(ibuf_count_get(bpage->id) == 0);
1052#endif /* UNIV_IBUF_COUNT_DEBUG */
1053
1054 ut_ad(bpage->newest_modification != 0);
1055
1056 /* Force the log to the disk before writing the modified block */
1057 if (!srv_read_only_mode) {
1058 log_write_up_to(bpage->newest_modification, true);
1059 }
1060
1061 switch (buf_page_get_state(bpage)) {
1062 case BUF_BLOCK_POOL_WATCH:
1063 case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
1064 case BUF_BLOCK_NOT_USED:
1065 case BUF_BLOCK_READY_FOR_USE:
1066 case BUF_BLOCK_MEMORY:
1067 case BUF_BLOCK_REMOVE_HASH:
1068 ut_error;
1069 break;
1070 case BUF_BLOCK_ZIP_DIRTY:
1071 frame = bpage->zip.data;
1072
1073 mach_write_to_8(frame + FIL_PAGE_LSN,
1074 bpage->newest_modification);
1075
1076 ut_a(page_zip_verify_checksum(frame, bpage->size.physical()));
1077 break;
1078 case BUF_BLOCK_FILE_PAGE:
1079 frame = bpage->zip.data;
1080 if (!frame) {
1081 frame = ((buf_block_t*) bpage)->frame;
1082 }
1083
1084 buf_flush_init_for_writing(
1085 reinterpret_cast<const buf_block_t*>(bpage),
1086 reinterpret_cast<const buf_block_t*>(bpage)->frame,
1087 bpage->zip.data ? &bpage->zip : NULL,
1088 bpage->newest_modification);
1089 break;
1090 }
1091
1092 frame = buf_page_encrypt_before_write(space, bpage, frame);
1093
1094 ut_ad(space->purpose == FIL_TYPE_TABLESPACE
1095 || space->atomic_write_supported);
1096 if (!space->use_doublewrite()) {
1097 ulint type = IORequest::WRITE | IORequest::DO_NOT_WAKE;
1098
1099 IORequest request(type, bpage);
1100
1101 /* TODO: pass the tablespace to fil_io() */
1102 fil_io(request,
1103 sync, bpage->id, bpage->size, 0, bpage->size.physical(),
1104 frame, bpage);
1105 } else {
1106 ut_ad(!srv_read_only_mode);
1107
1108 if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
1109 buf_dblwr_write_single_page(bpage, sync);
1110 } else {
1111 ut_ad(!sync);
1112 buf_dblwr_add_to_batch(bpage);
1113 }
1114 }
1115
1116 /* When doing single page flushing the IO is done synchronously
1117 and we flush the changes to disk only for the tablespace we
1118 are working on. */
1119 if (sync) {
1120 ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
1121 if (space->purpose != FIL_TYPE_TEMPORARY) {
1122 fil_flush(space);
1123 }
1124
1125 /* The tablespace could already have been dropped,
1126 because fil_io(request, sync) would already have
1127 decremented the node->n_pending. However,
1128 buf_page_io_complete() only needs to look up the
1129 tablespace during read requests, not during writes. */
1130 ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
1131#ifdef UNIV_DEBUG
1132 dberr_t err =
1133#endif
1134 /* true means we want to evict this page from the
1135 LRU list as well. */
1136 buf_page_io_complete(bpage, space->use_doublewrite(), true);
1137
1138 ut_ad(err == DB_SUCCESS);
1139 }
1140
1141 space->release_for_io();
1142
1143 /* Increment the counter of I/O operations used
1144 for selecting LRU policy. */
1145 buf_LRU_stat_inc_io();
1146}
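
/* Illustrative sketch (not compiled in): per the NOTE above, a caller that
posts a batch of asynchronous writes (simulated AIO or doublewrite) must
complete the batch afterwards, roughly as below. */
#if 0
static void buf_flush_post_batch_sketch()
{
	/* ... post asynchronous page writes, e.g. via
	buf_flush_page(buf_pool, bpage, BUF_FLUSH_LIST, false) ... */

	/* Write out the doublewrite buffer (and wake the simulated AIO
	handler threads) so the posted writes reach the data files. */
	buf_dblwr_flush_buffered_writes();
}
#endif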
1147
1148/********************************************************************//**
1149Writes a flushable page asynchronously from the buffer pool to a file.
1150NOTE: in simulated aio we must call
1151os_aio_simulated_wake_handler_threads after we have posted a batch of
1152writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
1153held upon entering this function, and they will be released by this
1154function if it returns true.
1155@return TRUE if the page was flushed */
1156ibool
1157buf_flush_page(
1158/*===========*/
1159 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1160 buf_page_t* bpage, /*!< in: buffer control block */
1161 buf_flush_t flush_type, /*!< in: type of flush */
1162 bool sync) /*!< in: true if sync IO request */
1163{
1164 BPageMutex* block_mutex;
1165
1166 ut_ad(flush_type < BUF_FLUSH_N_TYPES);
1167 ut_ad(buf_pool_mutex_own(buf_pool));
1168 ut_ad(buf_page_in_file(bpage));
1169 ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
1170
1171 block_mutex = buf_page_get_mutex(bpage);
1172 ut_ad(mutex_own(block_mutex));
1173
1174 ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
1175
1176 bool is_uncompressed;
1177
1178 is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
1179 ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
1180
1181 ibool flush;
1182 rw_lock_t* rw_lock;
1183 bool no_fix_count = bpage->buf_fix_count == 0;
1184
1185 if (!is_uncompressed) {
1186 flush = TRUE;
1187 rw_lock = NULL;
1188 } else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST)
1189 || (!no_fix_count
1190 && srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP
1191 && fsp_is_system_temporary(bpage->id.space()))) {
1192 /* This is a heuristic, to avoid expensive SX attempts. */
		/* For a table residing in the temporary tablespace, sync is
		done using IO_FIX, so before scheduling the flush, ensure
		that the page is not fixed. */
1196 flush = FALSE;
1197 } else {
1198 rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock;
1199 if (flush_type != BUF_FLUSH_LIST) {
1200 flush = rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE);
1201 } else {
1202 /* Will SX lock later */
1203 flush = TRUE;
1204 }
1205 }
1206
1207 if (flush) {
1208
1209 /* We are committed to flushing by the time we get here */
1210
1211 buf_page_set_io_fix(bpage, BUF_IO_WRITE);
1212
1213 buf_page_set_flush_type(bpage, flush_type);
1214
1215 if (buf_pool->n_flush[flush_type] == 0) {
1216 os_event_reset(buf_pool->no_flush[flush_type]);
1217 }
1218
1219 ++buf_pool->n_flush[flush_type];
1220 ut_ad(buf_pool->n_flush[flush_type] != 0);
1221
1222 mutex_exit(block_mutex);
1223
1224 buf_pool_mutex_exit(buf_pool);
1225
1226 if (flush_type == BUF_FLUSH_LIST
1227 && is_uncompressed
1228 && !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) {
1229
1230 if (!fsp_is_system_temporary(bpage->id.space())) {
				/* To avoid a possible deadlock involving the
				doublewrite buffer, flush it, because it
				might hold another block->lock. */
1234 buf_dblwr_flush_buffered_writes();
1235 } else {
1236 buf_dblwr_sync_datafiles();
1237 }
1238
1239 rw_lock_sx_lock_gen(rw_lock, BUF_IO_WRITE);
1240 }
1241
		/* If there is an observer that wants to know when the
		asynchronous flush request was sent, notify it.
		Note: we set the flush observer on a page with the x-latch
		held, so we can guarantee that notify_flush and notify_remove
		are called in pairs with the s-latch held on an uncompressed
		page. */
1247 if (bpage->flush_observer != NULL) {
1248 buf_pool_mutex_enter(buf_pool);
1249
1250 bpage->flush_observer->notify_flush(buf_pool, bpage);
1251
1252 buf_pool_mutex_exit(buf_pool);
1253 }
1254
1255 /* Even though bpage is not protected by any mutex at this
1256 point, it is safe to access bpage, because it is io_fixed and
1257 oldest_modification != 0. Thus, it cannot be relocated in the
1258 buffer pool or removed from flush_list or LRU_list. */
1259
1260 buf_flush_write_block_low(bpage, flush_type, sync);
1261 }
1262
1263 return(flush);
1264}
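
/* Illustrative sketch (not compiled in): the locking protocol a caller of
buf_flush_page() is expected to follow, as described in the NOTE above. */
#if 0
static void buf_flush_page_caller_sketch(
	buf_pool_t*	buf_pool,
	buf_page_t*	bpage)
{
	buf_pool_mutex_enter(buf_pool);
	mutex_enter(buf_page_get_mutex(bpage));

	if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_LRU)
	    && buf_flush_page(buf_pool, bpage, BUF_FLUSH_LRU, false)) {
		/* Both mutexes were released inside buf_flush_page(). */
	} else {
		mutex_exit(buf_page_get_mutex(bpage));
		buf_pool_mutex_exit(buf_pool);
	}
}
#endif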
1265
1266# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
1267/********************************************************************//**
1268Writes a flushable page asynchronously from the buffer pool to a file.
1269NOTE: buf_pool->mutex and block->mutex must be held upon entering this
1270function, and they will be released by this function after flushing.
1271This is loosely based on buf_flush_batch() and buf_flush_page().
1272@return TRUE if the page was flushed and the mutexes released */
1273ibool
1274buf_flush_page_try(
1275/*===============*/
1276 buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
1277 buf_block_t* block) /*!< in/out: buffer control block */
1278{
1279 ut_ad(buf_pool_mutex_own(buf_pool));
1280 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
1281 ut_ad(buf_page_mutex_own(block));
1282
1283 if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
1284 return(FALSE);
1285 }
1286
1287 /* The following call will release the buffer pool and
1288 block mutex. */
1289 return(buf_flush_page(
1290 buf_pool, &block->page,
1291 BUF_FLUSH_SINGLE_PAGE, true));
1292}
1293# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
1294
/** Check whether the page is in the buffer pool and can be flushed.
1296@param[in] page_id page id
1297@param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST
1298@return true if the page can be flushed. */
1299static
1300bool
1301buf_flush_check_neighbor(
1302 const page_id_t& page_id,
1303 buf_flush_t flush_type)
1304{
1305 buf_page_t* bpage;
1306 buf_pool_t* buf_pool = buf_pool_get(page_id);
1307 bool ret;
1308
1309 ut_ad(flush_type == BUF_FLUSH_LRU
1310 || flush_type == BUF_FLUSH_LIST);
1311
1312 buf_pool_mutex_enter(buf_pool);
1313
1314 /* We only want to flush pages from this buffer pool. */
1315 bpage = buf_page_hash_get(buf_pool, page_id);
1316
1317 if (!bpage) {
1318
1319 buf_pool_mutex_exit(buf_pool);
1320 return(false);
1321 }
1322
1323 ut_a(buf_page_in_file(bpage));
1324
1325 /* We avoid flushing 'non-old' blocks in an LRU flush,
1326 because the flushed blocks are soon freed */
1327
1328 ret = false;
1329 if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) {
1330 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1331
1332 mutex_enter(block_mutex);
1333 if (buf_flush_ready_for_flush(bpage, flush_type)) {
1334 ret = true;
1335 }
1336 mutex_exit(block_mutex);
1337 }
1338 buf_pool_mutex_exit(buf_pool);
1339
1340 return(ret);
1341}
1342
1343/** Flushes to disk all flushable pages within the flush area.
1344@param[in] page_id page id
1345@param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST
1346@param[in] n_flushed number of pages flushed so far in this batch
1347@param[in] n_to_flush maximum number of pages we are allowed to flush
1348@return number of pages flushed */
1349static
1350ulint
1351buf_flush_try_neighbors(
1352 const page_id_t& page_id,
1353 buf_flush_t flush_type,
1354 ulint n_flushed,
1355 ulint n_to_flush)
1356{
1357 ulint i;
1358 ulint low;
1359 ulint high;
1360 ulint count = 0;
1361 buf_pool_t* buf_pool = buf_pool_get(page_id);
1362
1363 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1364
1365 if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
1366 || srv_flush_neighbors == 0) {
1367 /* If there is little space or neighbor flushing is
1368 not enabled then just flush the victim. */
1369 low = page_id.page_no();
1370 high = page_id.page_no() + 1;
1371 } else {
1372 /* When flushed, dirty blocks are searched in
1373 neighborhoods of this size, and flushed along with the
1374 original page. */
1375
1376 ulint buf_flush_area;
1377
1378 buf_flush_area = ut_min(
1379 BUF_READ_AHEAD_AREA(buf_pool),
1380 buf_pool->curr_size / 16);
1381
1382 low = (page_id.page_no() / buf_flush_area) * buf_flush_area;
1383 high = (page_id.page_no() / buf_flush_area + 1) * buf_flush_area;
1384
1385 if (srv_flush_neighbors == 1) {
1386 /* adjust 'low' and 'high' to limit
1387 for contiguous dirty area */
1388 if (page_id.page_no() > low) {
1389 for (i = page_id.page_no() - 1; i >= low; i--) {
1390 if (!buf_flush_check_neighbor(
1391 page_id_t(page_id.space(), i),
1392 flush_type)) {
1393
1394 break;
1395 }
1396
1397 if (i == low) {
						/* Avoid wrap-around when
						low == 0, which would
						otherwise call
						buf_flush_check_neighbor()
						with i == (ulint) -1 */
1402 i--;
1403 break;
1404 }
1405 }
1406 low = i + 1;
1407 }
1408
1409 for (i = page_id.page_no() + 1;
1410 i < high
1411 && buf_flush_check_neighbor(
1412 page_id_t(page_id.space(), i),
1413 flush_type);
1414 i++) {
1415 /* do nothing */
1416 }
1417 high = i;
1418 }
1419 }
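
	/* Worked example with illustrative numbers: if buf_flush_area is 64,
	then for page_no = 130 the initial window computed above is
	low = (130 / 64) * 64 = 128 and high = (130 / 64 + 1) * 64 = 192,
	before the contiguity scan above (when srv_flush_neighbors == 1) and
	the space-size clamp below narrow it further. */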
1420
1421 const ulint space_size = fil_space_get_size(page_id.space());
1422 if (high > space_size) {
1423 high = space_size;
1424 }
1425
1426 DBUG_PRINT("ib_buf", ("flush %u:%u..%u",
1427 page_id.space(),
1428 (unsigned) low, (unsigned) high));
1429
1430 for (ulint i = low; i < high; i++) {
1431 buf_page_t* bpage;
1432
1433 if ((count + n_flushed) >= n_to_flush) {
1434
1435 /* We have already flushed enough pages and
1436 should call it a day. There is, however, one
1437 exception. If the page whose neighbors we
1438 are flushing has not been flushed yet then
1439 we'll try to flush the victim that we
1440 selected originally. */
1441 if (i <= page_id.page_no()) {
1442 i = page_id.page_no();
1443 } else {
1444 break;
1445 }
1446 }
1447
1448 const page_id_t cur_page_id(page_id.space(), i);
1449
1450 buf_pool = buf_pool_get(cur_page_id);
1451
1452 buf_pool_mutex_enter(buf_pool);
1453
1454 /* We only want to flush pages from this buffer pool. */
1455 bpage = buf_page_hash_get(buf_pool, cur_page_id);
1456
1457 if (bpage == NULL) {
1458
1459 buf_pool_mutex_exit(buf_pool);
1460 continue;
1461 }
1462
1463 ut_a(buf_page_in_file(bpage));
1464
1465 /* We avoid flushing 'non-old' blocks in an LRU flush,
1466 because the flushed blocks are soon freed */
1467
1468 if (flush_type != BUF_FLUSH_LRU
1469 || i == page_id.page_no()
1470 || buf_page_is_old(bpage)) {
1471
1472 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1473
1474 mutex_enter(block_mutex);
1475
1476 if (buf_flush_ready_for_flush(bpage, flush_type)
1477 && (i == page_id.page_no()
1478 || bpage->buf_fix_count == 0)) {
1479
1480 /* We also try to flush those
1481 neighbors != offset */
1482
1483 if (buf_flush_page(
1484 buf_pool, bpage, flush_type, false)) {
1485
1486 ++count;
1487 } else {
1488 mutex_exit(block_mutex);
1489 buf_pool_mutex_exit(buf_pool);
1490 }
1491
1492 continue;
1493 } else {
1494 mutex_exit(block_mutex);
1495 }
1496 }
1497 buf_pool_mutex_exit(buf_pool);
1498 }
1499
1500 if (count > 1) {
1501 MONITOR_INC_VALUE_CUMULATIVE(
1502 MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
1503 MONITOR_FLUSH_NEIGHBOR_COUNT,
1504 MONITOR_FLUSH_NEIGHBOR_PAGES,
1505 (count - 1));
1506 }
1507
1508 return(count);
1509}
1510
1511/** Check if the block is modified and ready for flushing.
If the block is ready to flush, then flush the page and try to flush
1513its neighbors.
1514@param[in] bpage buffer control block,
1515must be buf_page_in_file(bpage)
1516@param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST
1517@param[in] n_to_flush number of pages to flush
1518@param[in,out] count number of pages flushed
@return TRUE if the buf_pool mutex was released during this function.
This does not guarantee that any pages were actually written.
The number of pages written is added to the count. */
1522static
1523bool
1524buf_flush_page_and_try_neighbors(
1525 buf_page_t* bpage,
1526 buf_flush_t flush_type,
1527 ulint n_to_flush,
1528 ulint* count)
1529{
1530#ifdef UNIV_DEBUG
1531 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1532
1533 ut_ad(buf_pool_mutex_own(buf_pool));
1534#endif /* UNIV_DEBUG */
1535
1536 bool flushed;
1537 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1538
1539 mutex_enter(block_mutex);
1540
1541 ut_a(buf_page_in_file(bpage));
1542
1543 if (buf_flush_ready_for_flush(bpage, flush_type)) {
1544 buf_pool_t* buf_pool;
1545
1546 buf_pool = buf_pool_from_bpage(bpage);
1547
1548 const page_id_t page_id = bpage->id;
1549
1550 mutex_exit(block_mutex);
1551
1552 buf_pool_mutex_exit(buf_pool);
1553
1554 /* Try to flush also all the neighbors */
1555 *count += buf_flush_try_neighbors(
1556 page_id, flush_type, *count, n_to_flush);
1557
1558 buf_pool_mutex_enter(buf_pool);
		flushed = true;
1560 } else {
1561 mutex_exit(block_mutex);
1562
1563 flushed = false;
1564 }
1565
1566 ut_ad(buf_pool_mutex_own(buf_pool));
1567
1568 return(flushed);
1569}
1570
1571/*******************************************************************//**
1572This utility moves the uncompressed frames of pages to the free list.
1573Note that this function does not actually flush any data to disk. It
1574just detaches the uncompressed frames from the compressed pages at the
1575tail of the unzip_LRU and puts those freed frames in the free list.
1576Note that it is a best effort attempt and it is not guaranteed that
1577after a call to this function there will be 'max' blocks in the free
1578list.
1579@return number of blocks moved to the free list. */
1580static
1581ulint
1582buf_free_from_unzip_LRU_list_batch(
1583/*===============================*/
1584 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1585 ulint max) /*!< in: desired number of
1586 blocks in the free_list */
1587{
1588 ulint scanned = 0;
1589 ulint count = 0;
1590 ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
1591 ulint lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1592
1593 ut_ad(buf_pool_mutex_own(buf_pool));
1594
1595 buf_block_t* block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1596
1597 while (block != NULL
1598 && count < max
1599 && free_len < srv_LRU_scan_depth
1600 && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
1601
1602 ++scanned;
1603 if (buf_LRU_free_page(&block->page, false)) {
1604 /* Block was freed. buf_pool->mutex potentially
1605 released and reacquired */
1606 ++count;
1607 block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1608
1609 } else {
1610
1611 block = UT_LIST_GET_PREV(unzip_LRU, block);
1612 }
1613
1614 free_len = UT_LIST_GET_LEN(buf_pool->free);
1615 lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1616 }
1617
1618 ut_ad(buf_pool_mutex_own(buf_pool));
1619
1620 if (scanned) {
1621 MONITOR_INC_VALUE_CUMULATIVE(
1622 MONITOR_LRU_BATCH_SCANNED,
1623 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1624 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1625 scanned);
1626 }
1627
1628 return(count);
1629}
1630
1631/*******************************************************************//**
1632This utility flushes dirty blocks from the end of the LRU list.
1633The calling thread is not allowed to own any latches on pages!
1634It attempts to make 'max' blocks available in the free list. Note that
1635it is a best effort attempt and it is not guaranteed that after a call
1636to this function there will be 'max' blocks in the free list.*/
1637
1638void
1639buf_flush_LRU_list_batch(
1640/*=====================*/
1641 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1642 ulint max, /*!< in: desired number of
1643 blocks in the free_list */
1644 flush_counters_t* n) /*!< out: flushed/evicted page
1645 counts */
1646{
1647 buf_page_t* bpage;
1648 ulint scanned = 0;
1649 ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
1650 ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1651 ulint withdraw_depth = 0;
1652
1653 n->flushed = 0;
1654 n->evicted = 0;
1655 n->unzip_LRU_evicted = 0;
1656 ut_ad(buf_pool_mutex_own(buf_pool));
1657 if (buf_pool->curr_size < buf_pool->old_size
1658 && buf_pool->withdraw_target > 0) {
1659 withdraw_depth = buf_pool->withdraw_target
1660 - UT_LIST_GET_LEN(buf_pool->withdraw);
1661 }
1662
1663 for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1664 bpage != NULL && n->flushed + n->evicted < max
1665 && free_len < srv_LRU_scan_depth + withdraw_depth
1666 && lru_len > BUF_LRU_MIN_LEN;
1667 ++scanned,
1668 bpage = buf_pool->lru_hp.get()) {
1669
1670 buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
1671 buf_pool->lru_hp.set(prev);
1672
1673 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1674
1675 mutex_enter(block_mutex);
1676
1677 if (buf_flush_ready_for_replace(bpage)) {
1678 /* block is ready for eviction i.e., it is
1679 clean and is not IO-fixed or buffer fixed. */
1680 mutex_exit(block_mutex);
1681 if (buf_LRU_free_page(bpage, true)) {
1682 ++n->evicted;
1683 }
1684 } else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_LRU)) {
1685 /* Block is ready for flush. Dispatch an IO
1686 request. The IO helper thread will put it on
1687 free list in IO completion routine. */
1688 mutex_exit(block_mutex);
1689 buf_flush_page_and_try_neighbors(
1690 bpage, BUF_FLUSH_LRU, max, &n->flushed);
1691 } else {
1692 /* Can't evict or dispatch this block. Go to
1693 previous. */
1694 ut_ad(buf_pool->lru_hp.is_hp(prev));
1695 mutex_exit(block_mutex);
1696 }
1697
1698 ut_ad(!mutex_own(block_mutex));
1699 ut_ad(buf_pool_mutex_own(buf_pool));
1700
1701 free_len = UT_LIST_GET_LEN(buf_pool->free);
1702 lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1703 }
1704
1705 buf_pool->lru_hp.set(NULL);
1706
1707 /* We keep track of all flushes happening as part of LRU
1708 flush. When estimating the desired rate at which flush_list
1709 should be flushed, we factor in this value. */
1710 buf_lru_flush_page_count += n->flushed;
1711
1712 ut_ad(buf_pool_mutex_own(buf_pool));
1713
1714 if (n->evicted) {
1715 MONITOR_INC_VALUE_CUMULATIVE(
1716 MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
1717 MONITOR_LRU_BATCH_EVICT_COUNT,
1718 MONITOR_LRU_BATCH_EVICT_PAGES,
1719 n->evicted);
1720 }
1721
1722 if (scanned) {
1723 MONITOR_INC_VALUE_CUMULATIVE(
1724 MONITOR_LRU_BATCH_SCANNED,
1725 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1726 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1727 scanned);
1728 }
1729}
1730
1731/*******************************************************************//**
1732Flush and move pages from LRU or unzip_LRU list to the free list.
1733Whether LRU or unzip_LRU is used depends on the state of the system.*/
1734
1735static
1736void
1737buf_do_LRU_batch(
1738/*=============*/
1739 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1740 ulint max, /*!< in: desired number of
1741 blocks in the free_list */
1742 flush_counters_t* n) /*!< out: flushed/evicted page
1743 counts */
1744{
1745 if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
1746 n->unzip_LRU_evicted = buf_free_from_unzip_LRU_list_batch(buf_pool, max);
1747 } else {
1748 n->unzip_LRU_evicted = 0;
1749 }
1750
1751 if (max > n->unzip_LRU_evicted) {
1752 buf_flush_LRU_list_batch(buf_pool, max - n->unzip_LRU_evicted, n);
1753 } else {
1754 n->evicted = 0;
1755 n->flushed = 0;
1756 }
1757
1758 /* Add evicted pages from unzip_LRU to the evicted pages from
1759 the simple LRU. */
1760 n->evicted += n->unzip_LRU_evicted;
1761}
1762
1763/** This utility flushes dirty blocks from the end of the flush_list.
1764The calling thread is not allowed to own any latches on pages!
1765@param[in] buf_pool buffer pool instance
@param[in]	min_n		wished minimum number of blocks flushed (it is
1767not guaranteed that the actual number is that big, though)
1768@param[in] lsn_limit all blocks whose oldest_modification is smaller
1769than this should be flushed (if their number does not exceed min_n)
1770@return number of blocks for which the write request was queued;
1771ULINT_UNDEFINED if there was a flush of the same type already
1772running */
1773static
1774ulint
1775buf_do_flush_list_batch(
1776 buf_pool_t* buf_pool,
1777 ulint min_n,
1778 lsn_t lsn_limit)
1779{
1780 ulint count = 0;
1781 ulint scanned = 0;
1782
1783 ut_ad(buf_pool_mutex_own(buf_pool));
1784
1785 /* Start from the end of the list looking for a suitable
1786 block to be flushed. */
1787 buf_flush_list_mutex_enter(buf_pool);
1788 ulint len = UT_LIST_GET_LEN(buf_pool->flush_list);
1789
	/* In order to prevent this scan from degenerating into O(n*n)
	we attempt to preserve a pointer to the previous block in the
	flush list by declaring it a hazard pointer. Any thread working
	on the flush list must check the hazard pointer, and if it is
	removing that same block it must reset the hazard pointer. */
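	/* For illustration: suppose this thread saves block B's
	predecessor A as the hazard pointer and then releases the flush
	list mutex while flushing B. If another thread removes A from
	the flush list in the meantime (e.g. in buf_flush_remove()),
	that thread must adjust the hazard pointer, so flush_hp.get()
	below never returns a dangling pointer; the scan simply resumes
	from whatever block the hazard pointer then designates. */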
1795 for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1796 count < min_n && bpage != NULL && len > 0
1797 && bpage->oldest_modification < lsn_limit;
1798 bpage = buf_pool->flush_hp.get(),
1799 ++scanned) {
1800
1801 buf_page_t* prev;
1802
1803 ut_a(bpage->oldest_modification > 0);
1804 ut_ad(bpage->in_flush_list);
1805
1806 prev = UT_LIST_GET_PREV(list, bpage);
1807 buf_pool->flush_hp.set(prev);
1808 buf_flush_list_mutex_exit(buf_pool);
1809
1810#ifdef UNIV_DEBUG
1811 bool flushed =
1812#endif /* UNIV_DEBUG */
1813 buf_flush_page_and_try_neighbors(
1814 bpage, BUF_FLUSH_LIST, min_n, &count);
1815
1816 buf_flush_list_mutex_enter(buf_pool);
1817
1818 ut_ad(flushed || buf_pool->flush_hp.is_hp(prev));
1819
1820 --len;
1821 }
1822
1823 buf_pool->flush_hp.set(NULL);
1824 buf_flush_list_mutex_exit(buf_pool);
1825
1826 if (scanned) {
1827 MONITOR_INC_VALUE_CUMULATIVE(
1828 MONITOR_FLUSH_BATCH_SCANNED,
1829 MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
1830 MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
1831 scanned);
1832 }
1833
1834 if (count) {
1835 MONITOR_INC_VALUE_CUMULATIVE(
1836 MONITOR_FLUSH_BATCH_TOTAL_PAGE,
1837 MONITOR_FLUSH_BATCH_COUNT,
1838 MONITOR_FLUSH_BATCH_PAGES,
1839 count);
1840 }
1841
1842 ut_ad(buf_pool_mutex_own(buf_pool));
1843
1844 return(count);
1845}
1846
1847/** This utility flushes dirty blocks from the end of the LRU list or
1848flush_list.
1849NOTE 1: in the case of an LRU flush the calling thread may own latches to
1850pages: to avoid deadlocks, this function must be written so that it cannot
1851end up waiting for these latches! NOTE 2: in the case of a flush list flush,
1852the calling thread is not allowed to own any latches on pages!
1853@param[in] buf_pool buffer pool instance
1854@param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
1855BUF_FLUSH_LIST, then the caller must not own any latches on pages
@param[in]	min_n		wished minimum number of blocks flushed (it is
1857not guaranteed that the actual number is that big, though)
1858@param[in] lsn_limit in the case of BUF_FLUSH_LIST all blocks whose
1859oldest_modification is smaller than this should be flushed (if their number
1860does not exceed min_n), otherwise ignored */
1861static
1862void
1863buf_flush_batch(
1864 buf_pool_t* buf_pool,
1865 buf_flush_t flush_type,
1866 ulint min_n,
1867 lsn_t lsn_limit,
1868 flush_counters_t* n) /*!< out: flushed/evicted page
1869 counts */
1870{
1871 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1872 ut_ad(flush_type == BUF_FLUSH_LRU
1873 || !sync_check_iterate(dict_sync_check()));
1874
1875 buf_pool_mutex_enter(buf_pool);
1876
1877 /* Note: The buffer pool mutex is released and reacquired within
1878 the flush functions. */
1879 switch (flush_type) {
1880 case BUF_FLUSH_LRU:
1881 buf_do_LRU_batch(buf_pool, min_n, n);
1882 break;
1883 case BUF_FLUSH_LIST:
1884 n->flushed = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit);
1885 n->evicted = 0;
1886 break;
1887 default:
1888 ut_error;
1889 }
1890
1891 buf_pool_mutex_exit(buf_pool);
1892
1893 DBUG_LOG("ib_buf", "flush " << flush_type << " completed");
1894}
1895
1896/******************************************************************//**
1897Gather the aggregated stats for both flush list and LRU list flushing.
1898@param page_count_flush number of pages flushed from the end of the flush_list
1899@param page_count_LRU number of pages flushed from the end of the LRU list
1900*/
1901static
1902void
1903buf_flush_stats(
1904/*============*/
1905 ulint page_count_flush,
1906 ulint page_count_LRU)
1907{
1908 DBUG_PRINT("ib_buf", ("flush completed, from flush_list %u pages, "
1909 "from LRU_list %u pages",
1910 unsigned(page_count_flush),
1911 unsigned(page_count_LRU)));
1912
1913 srv_stats.buf_pool_flushed.add(page_count_flush + page_count_LRU);
1914}
1915
1916/******************************************************************//**
1917Start a buffer flush batch for LRU or flush list */
1918static
1919ibool
1920buf_flush_start(
1921/*============*/
1922 buf_pool_t* buf_pool, /*!< buffer pool instance */
1923 buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
1924 or BUF_FLUSH_LIST */
1925{
1926 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1927
1928 buf_pool_mutex_enter(buf_pool);
1929
1930 if (buf_pool->n_flush[flush_type] > 0
1931 || buf_pool->init_flush[flush_type] == TRUE) {
1932
1933 /* There is already a flush batch of the same type running */
1934
1935 buf_pool_mutex_exit(buf_pool);
1936
1937 return(FALSE);
1938 }
1939
1940 buf_pool->init_flush[flush_type] = TRUE;
1941
1942 os_event_reset(buf_pool->no_flush[flush_type]);
1943
1944 buf_pool_mutex_exit(buf_pool);
1945
1946 return(TRUE);
1947}
1948
1949/******************************************************************//**
1950End a buffer flush batch for LRU or flush list */
1951static
1952void
1953buf_flush_end(
1954/*==========*/
1955 buf_pool_t* buf_pool, /*!< buffer pool instance */
1956 buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
1957 or BUF_FLUSH_LIST */
1958{
1959 buf_pool_mutex_enter(buf_pool);
1960
1961 buf_pool->init_flush[flush_type] = FALSE;
1962
1963 buf_pool->try_LRU_scan = TRUE;
1964
1965 if (buf_pool->n_flush[flush_type] == 0) {
1966
1967 /* The running flush batch has ended */
1968
1969 os_event_set(buf_pool->no_flush[flush_type]);
1970 }
1971
1972 buf_pool_mutex_exit(buf_pool);
1973
1974 if (!srv_read_only_mode) {
1975 buf_dblwr_flush_buffered_writes();
1976 } else {
1977 os_aio_simulated_wake_handler_threads();
1978 }
1979}
1980
1981/******************************************************************//**
1982Waits until a flush batch of the given type ends */
1983void
1984buf_flush_wait_batch_end(
1985/*=====================*/
1986 buf_pool_t* buf_pool, /*!< buffer pool instance */
1987 buf_flush_t type) /*!< in: BUF_FLUSH_LRU
1988 or BUF_FLUSH_LIST */
1989{
1990 ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
1991
1992 if (buf_pool == NULL) {
1993 ulint i;
1994
1995 for (i = 0; i < srv_buf_pool_instances; ++i) {
1996 buf_pool_t* buf_pool;
1997
1998 buf_pool = buf_pool_from_array(i);
1999
2000 thd_wait_begin(NULL, THD_WAIT_DISKIO);
2001 os_event_wait(buf_pool->no_flush[type]);
2002 thd_wait_end(NULL);
2003 }
2004 } else {
2005 thd_wait_begin(NULL, THD_WAIT_DISKIO);
2006 os_event_wait(buf_pool->no_flush[type]);
2007 thd_wait_end(NULL);
2008 }
2009}
2010
2011/** Do flushing batch of a given type.
2012NOTE: The calling thread is not allowed to own any latches on pages!
2013@param[in,out] buf_pool buffer pool instance
2014@param[in] type flush type
@param[in]	min_n		wished minimum number of blocks flushed
2016(it is not guaranteed that the actual number is that big, though)
2017@param[in] lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2018oldest_modification is smaller than this should be flushed (if their number
2019does not exceed min_n), otherwise ignored
@param[out]	n		counters with the number of pages flushed and
evicted, passed back to the caller. Ignored if NULL
2022@retval true if a batch was queued successfully.
2023@retval false if another batch of same type was already running. */
2024bool
2025buf_flush_do_batch(
2026 buf_pool_t* buf_pool,
2027 buf_flush_t type,
2028 ulint min_n,
2029 lsn_t lsn_limit,
2030 flush_counters_t* n)
2031{
2032 ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
2033
2034 if (n != NULL) {
2035 n->flushed = 0;
2036 }
2037
2038 if (!buf_flush_start(buf_pool, type)) {
2039 return(false);
2040 }
2041
2042 buf_flush_batch(buf_pool, type, min_n, lsn_limit, n);
2043
2044 buf_flush_end(buf_pool, type);
2045
2046 return(true);
2047}
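
/* A minimal usage sketch of buf_flush_do_batch() (illustrative values;
the real callers, such as buf_flush_lists() and buf_flush_LRU_list()
below, follow this pattern):

	flush_counters_t	n;
	memset(&n, 0, sizeof n);
	if (buf_flush_do_batch(buf_pool_from_array(0), BUF_FLUSH_LIST,
			       100, LSN_MAX, &n)) {
		// a batch of about 100 flush_list pages was queued on
		// instance 0; n.flushed holds the number dispatched
	} else {
		// a flush_list batch was already running on instance 0
	}
*/
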
2048/**
Waits until all dirty pages with oldest_modification smaller than the
given LSN have been flushed (system temporary pages are ignored)
2050@param[in] new_oldest target oldest_modified_lsn to wait for */
2051
2052void
2053buf_flush_wait_flushed(
2054 lsn_t new_oldest)
2055{
2056 for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
2057 buf_pool_t* buf_pool;
2058 lsn_t oldest;
2059
2060 buf_pool = buf_pool_from_array(i);
2061
2062 for (;;) {
			/* We do not need to wait for the fsync of the
			flushed blocks, because the checkpoint will issue
			its own fsync anyway. So, we do not need to wait
			for the batch end here. */
2066
2067 buf_flush_list_mutex_enter(buf_pool);
2068
2069 buf_page_t* bpage;
2070
2071 /* We don't need to wait for system temporary pages */
2072 for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
2073 bpage != NULL
2074 && fsp_is_system_temporary(bpage->id.space());
2075 bpage = UT_LIST_GET_PREV(list, bpage)) {
2076 /* Do nothing. */
2077 }
2078
2079 if (bpage != NULL) {
2080 ut_ad(bpage->in_flush_list);
2081 oldest = bpage->oldest_modification;
2082 } else {
2083 oldest = 0;
2084 }
2085
2086 buf_flush_list_mutex_exit(buf_pool);
2087
2088 if (oldest == 0 || oldest >= new_oldest) {
2089 break;
2090 }
2091
2092 /* sleep and retry */
2093 os_thread_sleep(buf_flush_wait_flushed_sleep_time);
2094
2095 MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
2096 }
2097 }
2098}
2099
2100/** This utility flushes dirty blocks from the end of the flush list of all
2101buffer pool instances.
2102NOTE: The calling thread is not allowed to own any latches on pages!
@param[in]	min_n		wished minimum number of blocks flushed (it is
2104not guaranteed that the actual number is that big, though)
2105@param[in] lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2106oldest_modification is smaller than this should be flushed (if their number
2107does not exceed min_n), otherwise ignored
2108@param[out] n_processed the number of pages which were processed is
2109passed back to caller. Ignored if NULL.
2110@return true if a batch was queued successfully for each buffer pool
instance. false if another batch of the same type was already running in
at least one of the buffer pool instances */
2113bool
2114buf_flush_lists(
2115 ulint min_n,
2116 lsn_t lsn_limit,
2117 ulint* n_processed)
2118{
2119 ulint i;
2120 ulint n_flushed = 0;
2121 bool success = true;
2122
2123 if (n_processed) {
2124 *n_processed = 0;
2125 }
2126
2127 if (min_n != ULINT_MAX) {
2128 /* Ensure that flushing is spread evenly amongst the
2129 buffer pool instances. When min_n is ULINT_MAX
2130 we need to flush everything up to the lsn limit
2131 so no limit here. */
2132 min_n = (min_n + srv_buf_pool_instances - 1)
2133 / srv_buf_pool_instances;
2134 }
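	/* For example, with min_n = 1000 and 8 buffer pool instances the
	rounding-up division above requests (1000 + 7) / 8 = 125 pages
	from every instance, so at least min_n pages are requested in
	total across all instances. */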
2135
2136 /* Flush to lsn_limit in all buffer pool instances */
2137 for (i = 0; i < srv_buf_pool_instances; i++) {
2138 buf_pool_t* buf_pool;
2139 flush_counters_t n;
2140
2141 memset(&n, 0, sizeof(flush_counters_t));
2142 buf_pool = buf_pool_from_array(i);
2143
2144 if (!buf_flush_do_batch(buf_pool,
2145 BUF_FLUSH_LIST,
2146 min_n,
2147 lsn_limit,
2148 &n)) {
2149 /* We have two choices here. If lsn_limit was
2150 specified then skipping an instance of buffer
2151 pool means we cannot guarantee that all pages
			up to lsn_limit have been flushed. We can
2153 return right now with failure or we can try
2154 to flush remaining buffer pools up to the
2155 lsn_limit. We attempt to flush other buffer
2156 pools based on the assumption that it will
2157 help in the retry which will follow the
2158 failure. */
2159 success = false;
2160
2161 }
2162
2163 n_flushed += n.flushed;
2164 }
2165
2166 if (n_flushed) {
2167 buf_flush_stats(n_flushed, 0);
2168 if (n_processed) {
2169 *n_processed = n_flushed;
2170 }
2171 }
2172
2173 return(success);
2174}
2175
2176/******************************************************************//**
2177This function picks up a single page from the tail of the LRU
2178list, flushes it (if it is dirty), removes it from page_hash and LRU
2179list and puts it on the free list. It is called from user threads when
2180they are unable to find a replaceable page at the tail of the LRU
2181list i.e.: when the background LRU flushing in the page_cleaner thread
2182is not fast enough to keep pace with the workload.
2183@return true if success. */
2184bool
2185buf_flush_single_page_from_LRU(
2186/*===========================*/
2187 buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */
2188{
2189 ulint scanned;
2190 buf_page_t* bpage;
2191 ibool freed;
2192
2193 buf_pool_mutex_enter(buf_pool);
2194
2195 for (bpage = buf_pool->single_scan_itr.start(), scanned = 0,
2196 freed = false;
2197 bpage != NULL;
2198 ++scanned, bpage = buf_pool->single_scan_itr.get()) {
2199
2200 ut_ad(buf_pool_mutex_own(buf_pool));
2201
2202 buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
2203 buf_pool->single_scan_itr.set(prev);
2204 BPageMutex* block_mutex;
2205
2206 block_mutex = buf_page_get_mutex(bpage);
2207
2208 mutex_enter(block_mutex);
2209
2210 if (buf_flush_ready_for_replace(bpage)) {
2211 /* block is ready for eviction i.e., it is
2212 clean and is not IO-fixed or buffer fixed. */
2213 mutex_exit(block_mutex);
2214
2215 if (buf_LRU_free_page(bpage, true)) {
2216 buf_pool_mutex_exit(buf_pool);
2217 freed = true;
2218 break;
2219 }
2220
2221 } else if (buf_flush_ready_for_flush(
2222 bpage, BUF_FLUSH_SINGLE_PAGE)) {
2223
2224 /* Block is ready for flush. Try and dispatch an IO
2225 request. We'll put it on free list in IO completion
2226 routine if it is not buffer fixed. The following call
2227 will release the buffer pool and block mutex.
2228
2229 Note: There is no guarantee that this page has actually
2230 been freed, only that it has been flushed to disk */
2231
2232 freed = buf_flush_page(
2233 buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
2234
2235 if (freed) {
2236 break;
2237 }
2238
2239 mutex_exit(block_mutex);
2240 } else {
2241 mutex_exit(block_mutex);
2242 }
2243 ut_ad(!mutex_own(block_mutex));
2244 }
2245 if (!freed) {
2246 /* Can't find a single flushable page. */
2247 ut_ad(!bpage);
2248 buf_pool_mutex_exit(buf_pool);
2249 }
2250
2251 if (scanned) {
2252 MONITOR_INC_VALUE_CUMULATIVE(
2253 MONITOR_LRU_SINGLE_FLUSH_SCANNED,
2254 MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
2255 MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
2256 scanned);
2257 }
2258
2259 ut_ad(!buf_pool_mutex_own(buf_pool));
2260 return(freed);
2261}
2262
2263/**
2264Clears up tail of the LRU list of a given buffer pool instance:
2265* Put replaceable pages at the tail of LRU to the free list
2266* Flush dirty pages at the tail of LRU to the disk
2267The depth to which we scan each buffer pool is controlled by dynamic
2268config parameter innodb_LRU_scan_depth.
2269@param buf_pool buffer pool instance
2270@return total pages flushed */
2271static
2272ulint
2273buf_flush_LRU_list(
2274 buf_pool_t* buf_pool)
2275{
2276 ulint scan_depth, withdraw_depth;
2277 flush_counters_t n;
2278
2279 memset(&n, 0, sizeof(flush_counters_t));
2280
2281 ut_ad(buf_pool);
	/* srv_LRU_scan_depth can be an arbitrarily large value.
	We cap it at the current LRU size. */
2284 buf_pool_mutex_enter(buf_pool);
2285 scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
2286 if (buf_pool->curr_size < buf_pool->old_size
2287 && buf_pool->withdraw_target > 0) {
2288 withdraw_depth = buf_pool->withdraw_target
2289 - UT_LIST_GET_LEN(buf_pool->withdraw);
2290 } else {
2291 withdraw_depth = 0;
2292 }
2293 buf_pool_mutex_exit(buf_pool);
2294 if (withdraw_depth > srv_LRU_scan_depth) {
2295 scan_depth = ut_min(withdraw_depth, scan_depth);
2296 } else {
2297 scan_depth = ut_min(static_cast<ulint>(srv_LRU_scan_depth),
2298 scan_depth);
2299 }
	/* Currently, only one page_cleaner thread can trigger an
	LRU flush for a buffer pool instance at a time. So it is not
	possible that a batch triggered during the last iteration is
	still running. */
2304 buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU, scan_depth,
2305 0, &n);
2306
2307 return(n.flushed);
2308}
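
/* For example, with innodb_lru_scan_depth = 1024 and no buffer pool
resizing in progress, each call scans at most 1024 blocks from the tail
of the LRU list of this instance. While the pool is being shrunk, the
scan depth may instead grow up to the number of pages that still have
to be withdrawn (capped by the LRU length). */
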
2309
2310/*********************************************************************//**
2311Wait for any possible LRU flushes that are in progress to end. */
2312void
2313buf_flush_wait_LRU_batch_end(void)
2314/*==============================*/
2315{
2316 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2317 buf_pool_t* buf_pool;
2318
2319 buf_pool = buf_pool_from_array(i);
2320
2321 buf_pool_mutex_enter(buf_pool);
2322
2323 if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
2324 || buf_pool->init_flush[BUF_FLUSH_LRU]) {
2325
2326 buf_pool_mutex_exit(buf_pool);
2327 buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
2328 } else {
2329 buf_pool_mutex_exit(buf_pool);
2330 }
2331 }
2332}
2333
2334/*********************************************************************//**
Calculates whether flushing is required based on the number of dirty pages
in the buffer pool.
@return percent of io_capacity to flush in order to manage the dirty page ratio */
2338static
2339ulint
2340af_get_pct_for_dirty()
2341/*==================*/
2342{
2343 double dirty_pct = buf_get_modified_ratio_pct();
2344
2345 if (dirty_pct == 0.0) {
2346 /* No pages modified */
2347 return(0);
2348 }
2349
2350 ut_a(srv_max_dirty_pages_pct_lwm
2351 <= srv_max_buf_pool_modified_pct);
2352
2353 if (srv_max_dirty_pages_pct_lwm == 0) {
2354 /* The user has not set the option to preflush dirty
2355 pages as we approach the high water mark. */
2356 if (dirty_pct >= srv_max_buf_pool_modified_pct) {
			/* We have crossed the high water mark of dirty
			pages. In this case we start flushing at 100% of
			innodb_io_capacity. */
2360 return(100);
2361 }
2362 } else if (dirty_pct >= srv_max_dirty_pages_pct_lwm) {
2363 /* We should start flushing pages gradually. */
2364 return(static_cast<ulint>((dirty_pct * 100)
2365 / (srv_max_buf_pool_modified_pct + 1)));
2366 }
2367
2368 return(0);
2369}
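
/* A worked example for af_get_pct_for_dirty(), with illustrative
settings: innodb_max_dirty_pages_pct = 75, innodb_max_dirty_pages_pct_lwm
= 10 and a measured dirty page ratio of 30%. The low water mark has been
crossed, so the function returns (30 * 100) / (75 + 1) = 39, i.e. flushing
is recommended at roughly 39% of innodb_io_capacity. */
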
2370
2371/*********************************************************************//**
Calculates whether flushing is required based on the redo generation rate.
@return percent of io_capacity to flush in order to manage redo log space */
2374static
2375ulint
2376af_get_pct_for_lsn(
2377/*===============*/
2378 lsn_t age) /*!< in: current age of LSN. */
2379{
2380 lsn_t max_async_age;
2381 lsn_t lsn_age_factor;
2382 lsn_t af_lwm = (lsn_t) ((srv_adaptive_flushing_lwm
2383 * log_get_capacity()) / 100);
2384
2385 if (age < af_lwm) {
2386 /* No adaptive flushing. */
2387 return(0);
2388 }
2389
2390 max_async_age = log_get_max_modified_age_async();
2391
2392 if (age < max_async_age && !srv_adaptive_flushing) {
2393 /* We have still not reached the max_async point and
2394 the user has disabled adaptive flushing. */
2395 return(0);
2396 }
2397
2398 /* If we are here then we know that either:
2399 1) User has enabled adaptive flushing
2400 2) User may have disabled adaptive flushing but we have reached
2401 max_async_age. */
2402 lsn_age_factor = (age * 100) / max_async_age;
2403
2404 ut_ad(srv_max_io_capacity >= srv_io_capacity);
2405 return(static_cast<ulint>(
2406 ((srv_max_io_capacity / srv_io_capacity)
2407 * (lsn_age_factor * sqrt((double)lsn_age_factor)))
2408 / 7.5));
2409}
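
/* A worked example for af_get_pct_for_lsn(), with illustrative settings:
if the current age is one quarter of log_get_max_modified_age_async(),
then lsn_age_factor = 25. With innodb_io_capacity_max = 400 and
innodb_io_capacity = 200 the integer ratio is 2, so the function returns
(2 * 25 * sqrt(25)) / 7.5 = 250 / 7.5 = 33, i.e. about 33% of
innodb_io_capacity. The caller then takes the maximum of this value and
af_get_pct_for_dirty(). */
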
2410
2411/*********************************************************************//**
2412This function is called approximately once every second by the
2413page_cleaner thread. Based on various factors it decides if there is a
2414need to do flushing.
2415@return number of pages recommended to be flushed
2416@param lsn_limit pointer to return LSN up to which flushing must happen
2417@param last_pages_in the number of pages flushed by the last flush_list
2418 flushing. */
2419static
2420ulint
2421page_cleaner_flush_pages_recommendation(
2422/*====================================*/
2423 lsn_t* lsn_limit,
2424 ulint last_pages_in)
2425{
2426 static lsn_t prev_lsn = 0;
2427 static ulint sum_pages = 0;
2428 static ulint avg_page_rate = 0;
2429 static ulint n_iterations = 0;
2430 static time_t prev_time;
2431 lsn_t oldest_lsn;
2432 lsn_t cur_lsn;
2433 lsn_t age;
2434 lsn_t lsn_rate;
2435 ulint n_pages = 0;
2436 ulint pct_for_dirty = 0;
2437 ulint pct_for_lsn = 0;
2438 ulint pct_total = 0;
2439
2440 cur_lsn = log_get_lsn_nowait();
2441
	/* log_get_lsn_nowait tries to acquire log_sys.mutex with
	mutex_enter_nowait; if this does not succeed, it returns 0.
	Do not use that value to update the statistics. */
2445 if (cur_lsn == 0) {
2446 return(0);
2447 }
2448
2449 if (prev_lsn == 0) {
2450 /* First time around. */
2451 prev_lsn = cur_lsn;
2452 prev_time = ut_time();
2453 return(0);
2454 }
2455
2456 if (prev_lsn == cur_lsn) {
2457 return(0);
2458 }
2459
2460 sum_pages += last_pages_in;
2461
2462 time_t curr_time = ut_time();
2463 double time_elapsed = difftime(curr_time, prev_time);
2464
2465 /* We update our variables every srv_flushing_avg_loops
2466 iterations to smooth out transition in workload. */
2467 if (++n_iterations >= srv_flushing_avg_loops
2468 || time_elapsed >= srv_flushing_avg_loops) {
2469
2470 if (time_elapsed < 1) {
2471 time_elapsed = 1;
2472 }
2473
2474 avg_page_rate = static_cast<ulint>(
2475 ((static_cast<double>(sum_pages)
2476 / time_elapsed)
2477 + avg_page_rate) / 2);
2478
2479 /* How much LSN we have generated since last call. */
2480 lsn_rate = static_cast<lsn_t>(
2481 static_cast<double>(cur_lsn - prev_lsn)
2482 / time_elapsed);
2483
2484 lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
2485
2486 /* aggregate stats of all slots */
2487 mutex_enter(&page_cleaner.mutex);
2488
2489 ulint flush_tm = page_cleaner.flush_time;
2490 ulint flush_pass = page_cleaner.flush_pass;
2491
2492 page_cleaner.flush_time = 0;
2493 page_cleaner.flush_pass = 0;
2494
2495 ulint lru_tm = 0;
2496 ulint list_tm = 0;
2497 ulint lru_pass = 0;
2498 ulint list_pass = 0;
2499
2500 for (ulint i = 0; i < page_cleaner.n_slots; i++) {
2501 page_cleaner_slot_t* slot;
2502
2503 slot = &page_cleaner.slots[i];
2504
2505 lru_tm += slot->flush_lru_time;
2506 lru_pass += slot->flush_lru_pass;
2507 list_tm += slot->flush_list_time;
2508 list_pass += slot->flush_list_pass;
2509
2510 slot->flush_lru_time = 0;
2511 slot->flush_lru_pass = 0;
2512 slot->flush_list_time = 0;
2513 slot->flush_list_pass = 0;
2514 }
2515
2516 mutex_exit(&page_cleaner.mutex);
2517
2518 /* minimum values are 1, to avoid dividing by zero. */
2519 if (lru_tm < 1) {
2520 lru_tm = 1;
2521 }
2522 if (list_tm < 1) {
2523 list_tm = 1;
2524 }
2525 if (flush_tm < 1) {
2526 flush_tm = 1;
2527 }
2528
2529 if (lru_pass < 1) {
2530 lru_pass = 1;
2531 }
2532 if (list_pass < 1) {
2533 list_pass = 1;
2534 }
2535 if (flush_pass < 1) {
2536 flush_pass = 1;
2537 }
2538
2539 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT,
2540 list_tm / list_pass);
2541 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_SLOT,
2542 lru_tm / lru_pass);
2543
2544 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD,
2545 list_tm / (srv_n_page_cleaners * flush_pass));
2546 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_THREAD,
2547 lru_tm / (srv_n_page_cleaners * flush_pass));
2548 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST,
2549 flush_tm * list_tm / flush_pass
2550 / (list_tm + lru_tm));
2551 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_EST,
2552 flush_tm * lru_tm / flush_pass
2553 / (list_tm + lru_tm));
2554 MONITOR_SET(MONITOR_FLUSH_AVG_TIME, flush_tm / flush_pass);
2555
2556 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
2557 list_pass / page_cleaner.n_slots);
2558 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_PASS,
2559 lru_pass / page_cleaner.n_slots);
2560 MONITOR_SET(MONITOR_FLUSH_AVG_PASS, flush_pass);
2561
2562 prev_lsn = cur_lsn;
2563 prev_time = curr_time;
2564
2565 n_iterations = 0;
2566
2567 sum_pages = 0;
2568 }
2569
2570 oldest_lsn = buf_pool_get_oldest_modification();
2571
2572 ut_ad(oldest_lsn <= log_get_lsn());
2573
2574 age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
2575
2576 pct_for_dirty = af_get_pct_for_dirty();
2577 pct_for_lsn = af_get_pct_for_lsn(age);
2578
2579 pct_total = ut_max(pct_for_dirty, pct_for_lsn);
2580
2581 /* Estimate pages to be flushed for the lsn progress */
2582 ulint sum_pages_for_lsn = 0;
2583 lsn_t target_lsn = oldest_lsn
2584 + lsn_avg_rate * buf_flush_lsn_scan_factor;
2585
2586 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2587 buf_pool_t* buf_pool = buf_pool_from_array(i);
2588 ulint pages_for_lsn = 0;
2589
2590 buf_flush_list_mutex_enter(buf_pool);
2591 for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool->flush_list);
2592 b != NULL;
2593 b = UT_LIST_GET_PREV(list, b)) {
2594 if (b->oldest_modification > target_lsn) {
2595 break;
2596 }
2597 ++pages_for_lsn;
2598 }
2599 buf_flush_list_mutex_exit(buf_pool);
2600
2601 sum_pages_for_lsn += pages_for_lsn;
2602
2603 mutex_enter(&page_cleaner.mutex);
2604 ut_ad(page_cleaner.slots[i].state
2605 == PAGE_CLEANER_STATE_NONE);
2606 page_cleaner.slots[i].n_pages_requested
2607 = pages_for_lsn / buf_flush_lsn_scan_factor + 1;
2608 mutex_exit(&page_cleaner.mutex);
2609 }
2610
2611 sum_pages_for_lsn /= buf_flush_lsn_scan_factor;
	if (sum_pages_for_lsn < 1) {
2613 sum_pages_for_lsn = 1;
2614 }
2615
	/* Cap the maximum IO capacity that we are going to use at
	max_io_capacity. Limit the value to avoid a too rapid increase. */
2618 ulint pages_for_lsn =
2619 std::min<ulint>(sum_pages_for_lsn, srv_max_io_capacity * 2);
2620
2621 n_pages = (PCT_IO(pct_total) + avg_page_rate + pages_for_lsn) / 3;
2622
2623 if (n_pages > srv_max_io_capacity) {
2624 n_pages = srv_max_io_capacity;
2625 }
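	/* For example (illustrative values): with pct_total = 50 and
	innodb_io_capacity = 200, PCT_IO(pct_total) = 100; together with
	avg_page_rate = 80 and pages_for_lsn = 120 the recommendation is
	(100 + 80 + 120) / 3 = 100 pages, which the check above would cap
	at innodb_io_capacity_max if it were larger. */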
2626
2627 /* Normalize request for each instance */
2628 mutex_enter(&page_cleaner.mutex);
2629 ut_ad(page_cleaner.n_slots_requested == 0);
2630 ut_ad(page_cleaner.n_slots_flushing == 0);
2631 ut_ad(page_cleaner.n_slots_finished == 0);
2632
2633 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		/* If the redo log has enough free space, do not take
		the age distribution of the pages into account. */
2636 page_cleaner.slots[i].n_pages_requested = pct_for_lsn > 30 ?
2637 page_cleaner.slots[i].n_pages_requested
2638 * n_pages / sum_pages_for_lsn + 1
2639 : n_pages / srv_buf_pool_instances;
2640 }
2641 mutex_exit(&page_cleaner.mutex);
2642
2643 MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
2644
2645 MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, sum_pages_for_lsn);
2646
2647 MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
2648 MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
2649 MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
2650 MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
2651
2652 *lsn_limit = LSN_MAX;
2653
2654 return(n_pages);
2655}
2656
2657/*********************************************************************//**
2658Puts the page_cleaner thread to sleep if it has finished work in less
2659than a second
@retval 0 if woken up by the event being set
@retval OS_SYNC_TIME_EXCEEDED if the timeout was exceeded
2662@param next_loop_time time when next loop iteration should start
2663@param sig_count zero or the value returned by previous call of
2664 os_event_reset()
2665@param cur_time current time as in ut_time_ms() */
2666static
2667ulint
2668pc_sleep_if_needed(
2669/*===============*/
2670 ulint next_loop_time,
2671 int64_t sig_count,
2672 ulint cur_time)
2673{
2674 /* No sleep if we are cleaning the buffer pool during the shutdown
2675 with everything else finished */
2676 if (srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)
2677 return OS_SYNC_TIME_EXCEEDED;
2678
2679 if (next_loop_time > cur_time) {
		/* Get the sleep interval in microseconds. We use
		ut_min() to avoid a long sleep in case of wraparound. */
2682 ulint sleep_us;
2683
2684 sleep_us = ut_min(static_cast<ulint>(1000000),
2685 (next_loop_time - cur_time) * 1000);
2686
2687 return(os_event_wait_time_low(buf_flush_event,
2688 sleep_us, sig_count));
2689 }
2690
2691 return(OS_SYNC_TIME_EXCEEDED);
2692}
2693
2694/******************************************************************//**
2695Initialize page_cleaner. */
2696void
2697buf_flush_page_cleaner_init(void)
2698/*=============================*/
2699{
2700 ut_ad(!page_cleaner.is_running);
2701
2702 mutex_create(LATCH_ID_PAGE_CLEANER, &page_cleaner.mutex);
2703
2704 page_cleaner.is_requested = os_event_create("pc_is_requested");
2705 page_cleaner.is_finished = os_event_create("pc_is_finished");
2706 page_cleaner.is_started = os_event_create("pc_is_started");
2707 page_cleaner.n_slots = static_cast<ulint>(srv_buf_pool_instances);
2708
2709 ut_d(page_cleaner.n_disabled_debug = 0);
2710
2711 page_cleaner.is_running = true;
2712}
2713
2714/**
Requests that all slots flush their buffer pool instances.
@param min_n	wished minimum number of blocks flushed
		(it is not guaranteed that the actual number is that big)
2718@param lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2719 oldest_modification is smaller than this should be flushed
2720 (if their number does not exceed min_n), otherwise ignored
2721*/
2722static
2723void
2724pc_request(
2725 ulint min_n,
2726 lsn_t lsn_limit)
2727{
2728 if (min_n != ULINT_MAX) {
2729 /* Ensure that flushing is spread evenly amongst the
2730 buffer pool instances. When min_n is ULINT_MAX
2731 we need to flush everything up to the lsn limit
2732 so no limit here. */
2733 min_n = (min_n + srv_buf_pool_instances - 1)
2734 / srv_buf_pool_instances;
2735 }
2736
2737 mutex_enter(&page_cleaner.mutex);
2738
2739 ut_ad(page_cleaner.n_slots_requested == 0);
2740 ut_ad(page_cleaner.n_slots_flushing == 0);
2741 ut_ad(page_cleaner.n_slots_finished == 0);
2742
2743 page_cleaner.requested = (min_n > 0);
2744 page_cleaner.lsn_limit = lsn_limit;
2745
2746 for (ulint i = 0; i < page_cleaner.n_slots; i++) {
2747 page_cleaner_slot_t* slot = &page_cleaner.slots[i];
2748
2749 ut_ad(slot->state == PAGE_CLEANER_STATE_NONE);
2750
2751 if (min_n == ULINT_MAX) {
2752 slot->n_pages_requested = ULINT_MAX;
2753 } else if (min_n == 0) {
2754 slot->n_pages_requested = 0;
2755 }
2756
2757 /* slot->n_pages_requested was already set by
2758 page_cleaner_flush_pages_recommendation() */
2759
2760 slot->state = PAGE_CLEANER_STATE_REQUESTED;
2761 }
2762
2763 page_cleaner.n_slots_requested = page_cleaner.n_slots;
2764 page_cleaner.n_slots_flushing = 0;
2765 page_cleaner.n_slots_finished = 0;
2766
2767 os_event_set(page_cleaner.is_requested);
2768
2769 mutex_exit(&page_cleaner.mutex);
2770}
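
/* The request/flush/wait protocol between the coordinator and the worker
threads is, in outline (a sketch of the call sequence only):

	pc_request(n, lsn);		// mark every slot as REQUESTED
	while (pc_flush_slot() > 0) {}	// each call claims one REQUESTED
					// slot and flushes it
	pc_wait_finished(&lru, &list);	// wait until every slot is
					// FINISHED and collect counters

The coordinator runs this sequence in buf_flush_page_cleaner_coordinator()
while the workers call pc_flush_slot() from
buf_flush_page_cleaner_worker(). */
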
2771
2772/**
2773Do flush for one slot.
@return the number of slots which have not been processed yet. */
2775static
2776ulint
2777pc_flush_slot(void)
2778{
2779 ulint lru_tm = 0;
2780 ulint list_tm = 0;
2781 ulint lru_pass = 0;
2782 ulint list_pass = 0;
2783
2784 mutex_enter(&page_cleaner.mutex);
2785
2786 if (!page_cleaner.n_slots_requested) {
2787 os_event_reset(page_cleaner.is_requested);
2788 } else {
2789 page_cleaner_slot_t* slot = NULL;
2790 ulint i;
2791
2792 for (i = 0; i < page_cleaner.n_slots; i++) {
2793 slot = &page_cleaner.slots[i];
2794
2795 if (slot->state == PAGE_CLEANER_STATE_REQUESTED) {
2796 break;
2797 }
2798 }
2799
2800 /* slot should be found because
2801 page_cleaner.n_slots_requested > 0 */
2802 ut_a(i < page_cleaner.n_slots);
2803
2804 buf_pool_t* buf_pool = buf_pool_from_array(i);
2805
2806 page_cleaner.n_slots_requested--;
2807 page_cleaner.n_slots_flushing++;
2808 slot->state = PAGE_CLEANER_STATE_FLUSHING;
2809
2810 if (UNIV_UNLIKELY(!page_cleaner.is_running)) {
2811 slot->n_flushed_lru = 0;
2812 slot->n_flushed_list = 0;
2813 goto finish_mutex;
2814 }
2815
2816 if (page_cleaner.n_slots_requested == 0) {
2817 os_event_reset(page_cleaner.is_requested);
2818 }
2819
2820 mutex_exit(&page_cleaner.mutex);
2821
2822 lru_tm = ut_time_ms();
2823
2824 /* Flush pages from end of LRU if required */
2825 slot->n_flushed_lru = buf_flush_LRU_list(buf_pool);
2826
2827 lru_tm = ut_time_ms() - lru_tm;
2828 lru_pass++;
2829
2830 if (UNIV_UNLIKELY(!page_cleaner.is_running)) {
2831 slot->n_flushed_list = 0;
2832 goto finish;
2833 }
2834
2835 /* Flush pages from flush_list if required */
2836 if (page_cleaner.requested) {
2837 flush_counters_t n;
2838 memset(&n, 0, sizeof(flush_counters_t));
2839 list_tm = ut_time_ms();
2840
2841 slot->succeeded_list = buf_flush_do_batch(
2842 buf_pool, BUF_FLUSH_LIST,
2843 slot->n_pages_requested,
2844 page_cleaner.lsn_limit,
2845 &n);
2846
2847 slot->n_flushed_list = n.flushed;
2848
2849 list_tm = ut_time_ms() - list_tm;
2850 list_pass++;
2851 } else {
2852 slot->n_flushed_list = 0;
2853 slot->succeeded_list = true;
2854 }
2855finish:
2856 mutex_enter(&page_cleaner.mutex);
2857finish_mutex:
2858 page_cleaner.n_slots_flushing--;
2859 page_cleaner.n_slots_finished++;
2860 slot->state = PAGE_CLEANER_STATE_FINISHED;
2861
2862 slot->flush_lru_time += lru_tm;
2863 slot->flush_list_time += list_tm;
2864 slot->flush_lru_pass += lru_pass;
2865 slot->flush_list_pass += list_pass;
2866
2867 if (page_cleaner.n_slots_requested == 0
2868 && page_cleaner.n_slots_flushing == 0) {
2869 os_event_set(page_cleaner.is_finished);
2870 }
2871 }
2872
2873 ulint ret = page_cleaner.n_slots_requested;
2874
2875 mutex_exit(&page_cleaner.mutex);
2876
2877 return(ret);
2878}
2879
2880/**
2881Wait until all flush requests are finished.
2882@param n_flushed_lru number of pages flushed from the end of the LRU list.
2883@param n_flushed_list number of pages flushed from the end of the
2884 flush_list.
@return true if all flush_list flushing batches were successful. */
2886static
2887bool
2888pc_wait_finished(
2889 ulint* n_flushed_lru,
2890 ulint* n_flushed_list)
2891{
2892 bool all_succeeded = true;
2893
2894 *n_flushed_lru = 0;
2895 *n_flushed_list = 0;
2896
2897 os_event_wait(page_cleaner.is_finished);
2898
2899 mutex_enter(&page_cleaner.mutex);
2900
2901 ut_ad(page_cleaner.n_slots_requested == 0);
2902 ut_ad(page_cleaner.n_slots_flushing == 0);
2903 ut_ad(page_cleaner.n_slots_finished == page_cleaner.n_slots);
2904
2905 for (ulint i = 0; i < page_cleaner.n_slots; i++) {
2906 page_cleaner_slot_t* slot = &page_cleaner.slots[i];
2907
2908 ut_ad(slot->state == PAGE_CLEANER_STATE_FINISHED);
2909
2910 *n_flushed_lru += slot->n_flushed_lru;
2911 *n_flushed_list += slot->n_flushed_list;
2912 all_succeeded &= slot->succeeded_list;
2913
2914 slot->state = PAGE_CLEANER_STATE_NONE;
2915
2916 slot->n_pages_requested = 0;
2917 }
2918
2919 page_cleaner.n_slots_finished = 0;
2920
2921 os_event_reset(page_cleaner.is_finished);
2922
2923 mutex_exit(&page_cleaner.mutex);
2924
2925 return(all_succeeded);
2926}
2927
2928#ifdef UNIV_LINUX
2929/**
2930Set priority for page_cleaner threads.
@param[in]	priority	the priority to set
2932@return true if set as intended */
2933static
2934bool
2935buf_flush_page_cleaner_set_priority(
2936 int priority)
2937{
2938 setpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid),
2939 priority);
2940 return(getpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid))
2941 == priority);
2942}
2943#endif /* UNIV_LINUX */
2944
2945#ifdef UNIV_DEBUG
2946/** Loop used to disable page cleaner threads. */
2947static
2948void
2949buf_flush_page_cleaner_disabled_loop(void)
2950{
2951 if (!innodb_page_cleaner_disabled_debug) {
2952 /* We return to avoid entering and exiting mutex. */
2953 return;
2954 }
2955
2956 mutex_enter(&page_cleaner.mutex);
2957 page_cleaner.n_disabled_debug++;
2958 mutex_exit(&page_cleaner.mutex);
2959
2960 while (innodb_page_cleaner_disabled_debug
2961 && srv_shutdown_state == SRV_SHUTDOWN_NONE
2962 && page_cleaner.is_running) {
2963
2964 os_thread_sleep(100000); /* [A] */
2965 }
2966
	/* We need to wait for threads to exit here, otherwise we would
	encounter a problem when we quickly perform the following steps:
	1) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
	2) SET GLOBAL innodb_page_cleaner_disabled_debug = 0;
	3) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
	That is because after step 1 this thread could still be sleeping
	inside the loop above at [A], and steps 2 and 3 could happen before
	this thread wakes up from [A]. In such a case this thread would
	not re-increment n_disabled_debug and we would wait for it
	forever in buf_flush_page_cleaner_disabled_debug_update(...).
2977
2978 Therefore we are waiting in step 2 for this thread exiting here. */
2979
2980 mutex_enter(&page_cleaner.mutex);
2981 page_cleaner.n_disabled_debug--;
2982 mutex_exit(&page_cleaner.mutex);
2983}
2984
2985/** Disables page cleaner threads (coordinator and workers).
2986@param[in] save immediate result from check function */
2987void buf_flush_page_cleaner_disabled_debug_update(THD*,
2988 st_mysql_sys_var*, void*,
2989 const void* save)
2990{
2991 if (!page_cleaner.is_running) {
2992 return;
2993 }
2994
2995 if (!*static_cast<const my_bool*>(save)) {
2996 if (!innodb_page_cleaner_disabled_debug) {
2997 return;
2998 }
2999
3000 innodb_page_cleaner_disabled_debug = false;
3001
3002 /* Enable page cleaner threads. */
3003 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3004 mutex_enter(&page_cleaner.mutex);
3005 const ulint n = page_cleaner.n_disabled_debug;
3006 mutex_exit(&page_cleaner.mutex);
3007 /* Check if all threads have been enabled, to avoid
3008 problem when we decide to re-disable them soon. */
3009 if (n == 0) {
3010 break;
3011 }
3012 }
3013 return;
3014 }
3015
3016 if (innodb_page_cleaner_disabled_debug) {
3017 return;
3018 }
3019
3020 innodb_page_cleaner_disabled_debug = true;
3021
3022 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3023 /* Workers are possibly sleeping on is_requested.
3024
		We have to wake them; otherwise they might never notice
		that they should be disabled, and we would wait for them
		here forever.

		That is why we have a sleep loop instead of simply
		waiting on some disabled_debug_event. */
3031 os_event_set(page_cleaner.is_requested);
3032
3033 mutex_enter(&page_cleaner.mutex);
3034
3035 ut_ad(page_cleaner.n_disabled_debug
3036 <= srv_n_page_cleaners);
3037
3038 if (page_cleaner.n_disabled_debug
3039 == srv_n_page_cleaners) {
3040
3041 mutex_exit(&page_cleaner.mutex);
3042 break;
3043 }
3044
3045 mutex_exit(&page_cleaner.mutex);
3046
3047 os_thread_sleep(100000);
3048 }
3049}
3050#endif /* UNIV_DEBUG */
3051
3052/******************************************************************//**
3053page_cleaner thread tasked with flushing dirty pages from the buffer
3054pools. As of now we'll have only one coordinator.
3055@return a dummy parameter */
3056extern "C"
3057os_thread_ret_t
3058DECLARE_THREAD(buf_flush_page_cleaner_coordinator)(void*)
3059{
3060 my_thread_init();
3061#ifdef UNIV_PFS_THREAD
3062 pfs_register_thread(page_cleaner_thread_key);
3063#endif /* UNIV_PFS_THREAD */
3064 ut_ad(!srv_read_only_mode);
3065
3066#ifdef UNIV_DEBUG_THREAD_CREATION
3067 ib::info() << "page_cleaner thread running, id "
3068 << os_thread_pf(os_thread_get_curr_id());
3069#endif /* UNIV_DEBUG_THREAD_CREATION */
3070#ifdef UNIV_LINUX
	/* Linux may allow a different priority setting for each thread,
	so it is worth trying to set a high priority for the page cleaner
	threads. */
3073 if (buf_flush_page_cleaner_set_priority(
3074 buf_flush_page_cleaner_priority)) {
3075
3076 ib::info() << "page_cleaner coordinator priority: "
3077 << buf_flush_page_cleaner_priority;
3078 } else {
3079 ib::info() << "If the mysqld execution user is authorized,"
3080 " page cleaner thread priority can be changed."
3081 " See the man page of setpriority().";
3082 }
3083 /* Signal that setpriority() has been attempted. */
3084 os_event_set(recv_sys->flush_end);
3085#endif /* UNIV_LINUX */
3086
3087 do {
		/* Handle flushing requests during recovery. */
3089 ulint n_flushed_lru = 0;
3090 ulint n_flushed_list = 0;
3091
3092 os_event_wait(recv_sys->flush_start);
3093
3094 if (!recv_writer_thread_active) {
3095 break;
3096 }
3097
3098 switch (recv_sys->flush_type) {
3099 case BUF_FLUSH_LRU:
3100 /* Flush pages from end of LRU if required */
3101 pc_request(0, LSN_MAX);
3102 while (pc_flush_slot() > 0) {}
3103 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3104 break;
3105
3106 case BUF_FLUSH_LIST:
3107 /* Flush all pages */
3108 do {
3109 pc_request(ULINT_MAX, LSN_MAX);
3110 while (pc_flush_slot() > 0) {}
3111 } while (!pc_wait_finished(&n_flushed_lru,
3112 &n_flushed_list));
3113 break;
3114
3115 default:
3116 ut_ad(0);
3117 }
3118
3119 os_event_reset(recv_sys->flush_start);
3120 os_event_set(recv_sys->flush_end);
3121 } while (recv_writer_thread_active);
3122
3123 os_event_wait(buf_flush_event);
3124
3125 ulint ret_sleep = 0;
3126 ulint n_evicted = 0;
3127 ulint n_flushed_last = 0;
3128 ulint warn_interval = 1;
3129 ulint warn_count = 0;
3130 int64_t sig_count = os_event_reset(buf_flush_event);
3131 ulint next_loop_time = ut_time_ms() + 1000;
3132 ulint n_flushed = 0;
3133 ulint last_activity = srv_get_activity_count();
3134 ulint last_pages = 0;
3135
3136 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3137 ulint curr_time = ut_time_ms();
3138
		/* The page_cleaner skips the sleep if the server is
		idle, there are no pending read IOs in the buffer pool,
		and the previous iteration still flushed some pages,
		i.e. there is work to do. */
3142 if (srv_check_activity(last_activity)
3143 || buf_get_n_pending_read_ios()
3144 || n_flushed == 0) {
3145
3146 ret_sleep = pc_sleep_if_needed(
3147 next_loop_time, sig_count, curr_time);
3148 } else if (curr_time > next_loop_time) {
3149 ret_sleep = OS_SYNC_TIME_EXCEEDED;
3150 } else {
3151 ret_sleep = 0;
3152 }
3153
3154 if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
3155 break;
3156 }
3157
3158 sig_count = os_event_reset(buf_flush_event);
3159
3160 if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3161 if (global_system_variables.log_warnings > 2
3162 && curr_time > next_loop_time + 3000
3163 && !(test_flags & TEST_SIGINT)) {
3164 if (warn_count == 0) {
3165 ib::info() << "page_cleaner: 1000ms"
3166 " intended loop took "
3167 << 1000 + curr_time
3168 - next_loop_time
3169 << "ms. The settings might not"
3170 " be optimal. (flushed="
3171 << n_flushed_last
3172 << " and evicted="
3173 << n_evicted
3174 << ", during the time.)";
3175 if (warn_interval > 300) {
3176 warn_interval = 600;
3177 } else {
3178 warn_interval *= 2;
3179 }
3180
3181 warn_count = warn_interval;
3182 } else {
3183 --warn_count;
3184 }
3185 } else {
3186 /* reset counter */
3187 warn_interval = 1;
3188 warn_count = 0;
3189 }
3190
3191 next_loop_time = curr_time + 1000;
3192 n_flushed_last = n_evicted = 0;
3193 }
3194
3195 if (ret_sleep != OS_SYNC_TIME_EXCEEDED
3196 && srv_flush_sync
3197 && buf_flush_sync_lsn > 0) {
3198 /* woke up for flush_sync */
3199 mutex_enter(&page_cleaner.mutex);
3200 lsn_t lsn_limit = buf_flush_sync_lsn;
3201 buf_flush_sync_lsn = 0;
3202 mutex_exit(&page_cleaner.mutex);
3203
3204 /* Request flushing for threads */
3205 pc_request(ULINT_MAX, lsn_limit);
3206
3207 ulint tm = ut_time_ms();
3208
			/* The coordinator also handles requests. */
3210 while (pc_flush_slot() > 0) {}
3211
			/* Only the coordinator uses these counters, so
			there is no need to protect them with a lock. */
3214 page_cleaner.flush_time += ut_time_ms() - tm;
3215 page_cleaner.flush_pass++;
3216
3217 /* Wait for all slots to be finished */
3218 ulint n_flushed_lru = 0;
3219 ulint n_flushed_list = 0;
3220 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3221
3222 if (n_flushed_list > 0 || n_flushed_lru > 0) {
3223 buf_flush_stats(n_flushed_list, n_flushed_lru);
3224
3225 MONITOR_INC_VALUE_CUMULATIVE(
3226 MONITOR_FLUSH_SYNC_TOTAL_PAGE,
3227 MONITOR_FLUSH_SYNC_COUNT,
3228 MONITOR_FLUSH_SYNC_PAGES,
3229 n_flushed_lru + n_flushed_list);
3230 }
3231
3232 n_flushed = n_flushed_lru + n_flushed_list;
3233
3234 } else if (srv_check_activity(last_activity)) {
3235 ulint n_to_flush;
3236 lsn_t lsn_limit = 0;
3237
3238 /* Estimate pages from flush_list to be flushed */
3239 if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3240 last_activity = srv_get_activity_count();
3241 n_to_flush =
3242 page_cleaner_flush_pages_recommendation(
3243 &lsn_limit, last_pages);
3244 } else {
3245 n_to_flush = 0;
3246 }
3247
3248 /* Request flushing for threads */
3249 pc_request(n_to_flush, lsn_limit);
3250
3251 ulint tm = ut_time_ms();
3252
			/* The coordinator also handles requests. */
3254 while (pc_flush_slot() > 0) {
3255 /* No op */
3256 }
3257
			/* Only the coordinator uses these counters, so
			there is no need to protect them with a lock. */
3260 page_cleaner.flush_time += ut_time_ms() - tm;
			page_cleaner.flush_pass++;
3262
3263 /* Wait for all slots to be finished */
3264 ulint n_flushed_lru = 0;
3265 ulint n_flushed_list = 0;
3266
3267 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3268
3269 if (n_flushed_list > 0 || n_flushed_lru > 0) {
3270 buf_flush_stats(n_flushed_list, n_flushed_lru);
3271 }
3272
3273 if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3274 last_pages = n_flushed_list;
3275 }
3276
3277 n_evicted += n_flushed_lru;
3278 n_flushed_last += n_flushed_list;
3279
3280 n_flushed = n_flushed_lru + n_flushed_list;
3281
3282 if (n_flushed_lru) {
3283 MONITOR_INC_VALUE_CUMULATIVE(
3284 MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
3285 MONITOR_LRU_BATCH_FLUSH_COUNT,
3286 MONITOR_LRU_BATCH_FLUSH_PAGES,
3287 n_flushed_lru);
3288 }
3289
3290 if (n_flushed_list) {
3291 MONITOR_INC_VALUE_CUMULATIVE(
3292 MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
3293 MONITOR_FLUSH_ADAPTIVE_COUNT,
3294 MONITOR_FLUSH_ADAPTIVE_PAGES,
3295 n_flushed_list);
3296 }
3297
3298 } else if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3299 /* no activity, slept enough */
3300 buf_flush_lists(PCT_IO(100), LSN_MAX, &n_flushed);
3301
3302 n_flushed_last += n_flushed;
3303
3304 if (n_flushed) {
3305 MONITOR_INC_VALUE_CUMULATIVE(
3306 MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
3307 MONITOR_FLUSH_BACKGROUND_COUNT,
3308 MONITOR_FLUSH_BACKGROUND_PAGES,
3309 n_flushed);
3310
3311 }
3312
3313 } else {
3314 /* no activity, but woken up by event */
3315 n_flushed = 0;
3316 }
3317
3318 ut_d(buf_flush_page_cleaner_disabled_loop());
3319 }
3320
3321 ut_ad(srv_shutdown_state > 0);
3322 if (srv_fast_shutdown == 2
3323 || srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
		/* In a very fast shutdown, or when InnoDB failed to start, we
3325 simulate a crash of the buffer pool. We are not required to do
3326 any flushing. */
3327 goto thread_exit;
3328 }
3329
3330 /* In case of normal and slow shutdown the page_cleaner thread
3331 must wait for all other activity in the server to die down.
3332 Note that we can start flushing the buffer pool as soon as the
3333 server enters shutdown phase but we must stay alive long enough
3334 to ensure that any work done by the master or purge threads is
3335 also flushed.
3336 During shutdown we pass through two stages. In the first stage,
3337 when SRV_SHUTDOWN_CLEANUP is set other threads like the master
3338 and the purge threads may be working as well. We start flushing
3339 the buffer pool but can't be sure that no new pages are being
3340 dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. */
3341
3342 do {
3343 pc_request(ULINT_MAX, LSN_MAX);
3344
3345 while (pc_flush_slot() > 0) {}
3346
3347 ulint n_flushed_lru = 0;
3348 ulint n_flushed_list = 0;
3349 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3350
3351 n_flushed = n_flushed_lru + n_flushed_list;
3352
3353 /* We sleep only if there are no pages to flush */
3354 if (n_flushed == 0) {
3355 os_thread_sleep(100000);
3356 }
3357 } while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
3358
3359 /* At this point all threads including the master and the purge
3360 thread must have been suspended. */
3361 ut_a(srv_get_active_thread_type() == SRV_NONE);
3362 ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
3363
	/* We can now make a final sweep on flushing the buffer pool
	and exit after we have cleaned the whole buffer pool.
	It is important that we wait for any running batch that has
	been triggered by us to finish. Otherwise we could end up
	considering the end of that batch as the finish of our final
	sweep, and we would come out of the loop leaving behind dirty
	pages in the flush_list. */
3371 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3372 buf_flush_wait_LRU_batch_end();
3373
3374 bool success;
3375
3376 do {
3377 pc_request(ULINT_MAX, LSN_MAX);
3378
3379 while (pc_flush_slot() > 0) {}
3380
3381 ulint n_flushed_lru = 0;
3382 ulint n_flushed_list = 0;
3383 success = pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3384
3385 n_flushed = n_flushed_lru + n_flushed_list;
3386
3387 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3388 buf_flush_wait_LRU_batch_end();
3389
3390 } while (!success || n_flushed > 0);
3391
3392 /* Some sanity checks */
3393 ut_a(srv_get_active_thread_type() == SRV_NONE);
3394 ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
3395
3396 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3397 buf_pool_t* buf_pool = buf_pool_from_array(i);
3398 ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
3399 }
3400
3401 /* We have lived our life. Time to die. */
3402
3403thread_exit:
	/* All worker threads are waiting on the event here and no
	longer access the page_cleaner structure. Wake the worker
	threads up just to make them exit. */
3407 page_cleaner.is_running = false;
3408
	/* Wait for all worker threads to exit. */
3410 while (page_cleaner.n_workers) {
3411 os_event_set(page_cleaner.is_requested);
3412 os_thread_sleep(10000);
3413 }
3414
3415 mutex_destroy(&page_cleaner.mutex);
3416
3417 os_event_destroy(page_cleaner.is_finished);
3418 os_event_destroy(page_cleaner.is_requested);
3419 os_event_destroy(page_cleaner.is_started);
3420
3421 buf_page_cleaner_is_active = false;
3422
3423 my_thread_end();
3424 /* We count the number of threads in os_thread_exit(). A created
3425 thread should always use that to exit and not use return() to exit. */
3426 os_thread_exit();
3427
3428 OS_THREAD_DUMMY_RETURN;
3429}
3430
3431/** Adjust thread count for page cleaner workers.
3432@param[in] new_cnt Number of threads to be used */
3433void
3434buf_flush_set_page_cleaner_thread_cnt(ulong new_cnt)
3435{
3436 mutex_enter(&page_cleaner.mutex);
3437
3438 srv_n_page_cleaners = new_cnt;
3439 if (new_cnt > page_cleaner.n_workers) {
3440 /* User has increased the number of page
3441 cleaner threads. */
3442 ulint add = new_cnt - page_cleaner.n_workers;
3443 for (ulint i = 0; i < add; i++) {
3444 os_thread_id_t cleaner_thread_id;
3445 os_thread_create(buf_flush_page_cleaner_worker, NULL, &cleaner_thread_id);
3446 }
3447 }
3448
3449 mutex_exit(&page_cleaner.mutex);
3450
	/* Wait until the defined number of workers has started. */
3452 while (page_cleaner.is_running &&
3453 page_cleaner.n_workers != (srv_n_page_cleaners - 1)) {
3454 os_event_set(page_cleaner.is_requested);
3455 os_event_reset(page_cleaner.is_started);
3456 os_event_wait_time(page_cleaner.is_started, 1000000);
3457 }
3458}
3459
3460/******************************************************************//**
3461Worker thread of page_cleaner.
3462@return a dummy parameter */
3463extern "C"
3464os_thread_ret_t
3465DECLARE_THREAD(buf_flush_page_cleaner_worker)(
3466/*==========================================*/
3467 void* arg MY_ATTRIBUTE((unused)))
3468 /*!< in: a dummy parameter required by
3469 os_thread_create */
3470{
3471 my_thread_init();
3472#ifndef DBUG_OFF
3473 os_thread_id_t cleaner_thread_id = os_thread_get_curr_id();
3474#endif
3475
3476 mutex_enter(&page_cleaner.mutex);
3477 ulint thread_no = page_cleaner.n_workers++;
3478
3479 DBUG_LOG("ib_buf", "Thread " << cleaner_thread_id
3480 << " started; n_workers=" << page_cleaner.n_workers);
3481
3482 /* Signal that we have started */
3483 os_event_set(page_cleaner.is_started);
3484 mutex_exit(&page_cleaner.mutex);
3485
3486#ifdef UNIV_LINUX
	/* Linux may allow a different priority setting for each thread,
	so it is worth trying to set a high priority for the page cleaner
	threads. */
3489 if (buf_flush_page_cleaner_set_priority(
3490 buf_flush_page_cleaner_priority)) {
3491
3492 ib::info() << "page_cleaner worker priority: "
3493 << buf_flush_page_cleaner_priority;
3494 }
3495#endif /* UNIV_LINUX */
3496
3497 while (true) {
3498 os_event_wait(page_cleaner.is_requested);
3499
3500 ut_d(buf_flush_page_cleaner_disabled_loop());
3501
3502 if (!page_cleaner.is_running) {
3503 break;
3504 }
3505
3506 ut_ad(srv_n_page_cleaners >= 1);
3507
		/* If the number of page cleaner threads has been
		decreased, exit those that are no longer needed. */
3510 if (srv_shutdown_state == SRV_SHUTDOWN_NONE &&
3511 thread_no >= (srv_n_page_cleaners - 1)) {
3512 DBUG_LOG("ib_buf", "Exiting "
3513 << thread_no
3514 << " page cleaner worker thread_id "
3515 << os_thread_pf(cleaner_thread_id)
3516 << " total threads " << srv_n_page_cleaners << ".");
3517 break;
3518 }
3519
3520 pc_flush_slot();
3521 }
3522
3523 mutex_enter(&page_cleaner.mutex);
3524 page_cleaner.n_workers--;
3525
3526 DBUG_LOG("ib_buf", "Thread " << cleaner_thread_id
3527 << " exiting; n_workers=" << page_cleaner.n_workers);
3528
3529 /* Signal that we have stopped */
3530 os_event_set(page_cleaner.is_started);
3531 mutex_exit(&page_cleaner.mutex);
3532
3533 my_thread_end();
3534
3535 os_thread_exit();
3536
3537 OS_THREAD_DUMMY_RETURN;
3538}
3539
3540/*******************************************************************//**
3541Synchronously flush dirty blocks from the end of the flush list of all buffer
3542pool instances.
3543NOTE: The calling thread is not allowed to own any latches on pages! */
3544void
3545buf_flush_sync_all_buf_pools(void)
3546/*==============================*/
3547{
3548 bool success;
3549 do {
3550 success = buf_flush_lists(ULINT_MAX, LSN_MAX, NULL);
3551 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3552 } while (!success);
3553
3554 ut_a(success);
3555}
3556
3557/** Request IO burst and wake page_cleaner up.
3558@param[in] lsn_limit upper limit of LSN to be flushed */
3559void
3560buf_flush_request_force(
3561 lsn_t lsn_limit)
3562{
	/* Adjust the target based on lsn_avg_rate so that it does not
	become stale. */
3564 lsn_t lsn_target = lsn_limit + lsn_avg_rate * 3;
3565
3566 mutex_enter(&page_cleaner.mutex);
3567 if (lsn_target > buf_flush_sync_lsn) {
3568 buf_flush_sync_lsn = lsn_target;
3569 }
3570 mutex_exit(&page_cleaner.mutex);
3571
3572 os_event_set(buf_flush_event);
3573}
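
/* For example, if the measured lsn_avg_rate corresponds to 1MB of redo
per second and a caller requests flushing up to lsn_limit, the target
stored in buf_flush_sync_lsn becomes lsn_limit + 3MB, i.e. roughly three
seconds worth of extra redo, so that the target has not already become
stale by the time the page_cleaner acts on it. (The figure is
illustrative; the actual rate is whatever the page_cleaner has measured
into lsn_avg_rate.) */
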
3574#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3575
3576/** Functor to validate the flush list. */
3577struct Check {
3578 void operator()(const buf_page_t* elem)
3579 {
3580 ut_a(elem->in_flush_list);
3581 }
3582};
3583
3584/******************************************************************//**
3585Validates the flush list.
3586@return TRUE if ok */
3587static
3588ibool
3589buf_flush_validate_low(
3590/*===================*/
3591 buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
3592{
3593 buf_page_t* bpage;
3594 const ib_rbt_node_t* rnode = NULL;
3595 Check check;
3596
3597 ut_ad(buf_flush_list_mutex_own(buf_pool));
3598
3599 ut_list_validate(buf_pool->flush_list, check);
3600
3601 bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
3602
3603 /* If we are in recovery mode i.e.: flush_rbt != NULL
3604 then each block in the flush_list must also be present
3605 in the flush_rbt. */
3606 if (buf_pool->flush_rbt != NULL) {
3607 rnode = rbt_first(buf_pool->flush_rbt);
3608 }
3609
3610 while (bpage != NULL) {
3611 const lsn_t om = bpage->oldest_modification;
3612
3613 ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
3614
3615 ut_ad(bpage->in_flush_list);
3616
3617 /* A page in buf_pool->flush_list can be in
3618 BUF_BLOCK_REMOVE_HASH state. This happens when a page
3619 is in the middle of being relocated. In that case the
3620 original descriptor can have this state and still be
3621 in the flush list waiting to acquire the
3622 buf_pool->flush_list_mutex to complete the relocation. */
3623 ut_a(buf_page_in_file(bpage)
3624 || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
3625 ut_a(om > 0);
3626
3627 if (buf_pool->flush_rbt != NULL) {
3628 buf_page_t** prpage;
3629
3630 ut_a(rnode != NULL);
3631 prpage = rbt_value(buf_page_t*, rnode);
3632
3633 ut_a(*prpage != NULL);
3634 ut_a(*prpage == bpage);
3635 rnode = rbt_next(buf_pool->flush_rbt, rnode);
3636 }
3637
3638 bpage = UT_LIST_GET_NEXT(list, bpage);
3639
3640 ut_a(bpage == NULL || om >= bpage->oldest_modification);
3641 }
3642
3643 /* By this time we must have exhausted the traversal of
3644 flush_rbt (if active) as well. */
3645 ut_a(rnode == NULL);
3646
3647 return(TRUE);
3648}
3649
3650/******************************************************************//**
3651Validates the flush list.
3652@return TRUE if ok */
3653ibool
3654buf_flush_validate(
3655/*===============*/
3656 buf_pool_t* buf_pool) /*!< buffer pool instance */
3657{
3658 ibool ret;
3659
3660 buf_flush_list_mutex_enter(buf_pool);
3661
3662 ret = buf_flush_validate_low(buf_pool);
3663
3664 buf_flush_list_mutex_exit(buf_pool);
3665
3666 return(ret);
3667}
3668
3669#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3670
3671/******************************************************************//**
3672Check if there are any dirty pages that belong to a space id in the flush
3673list in a particular buffer pool.
3674@return number of dirty pages present in a single buffer pool */
ulint
buf_pool_get_dirty_pages_count(
/*===========================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool */
	ulint		id,		/*!< in: space id to check */
	FlushObserver*	observer)	/*!< in: flush observer to check */

{
	ulint		count = 0;

	buf_pool_mutex_enter(buf_pool);
	buf_flush_list_mutex_enter(buf_pool);

	buf_page_t*	bpage;

	for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
	     bpage != 0;
	     bpage = UT_LIST_GET_NEXT(list, bpage)) {

		ut_ad(buf_page_in_file(bpage));
		ut_ad(bpage->in_flush_list);
		ut_ad(bpage->oldest_modification > 0);

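		/* Count the pages attached to the given flush observer,
		or, when no observer is given, the pages of the given
		space id. */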
		if ((observer != NULL
		     && observer == bpage->flush_observer)
		    || (observer == NULL
			&& id == bpage->id.space())) {
			++count;
		}
	}

	buf_flush_list_mutex_exit(buf_pool);
	buf_pool_mutex_exit(buf_pool);

	return(count);
}

/******************************************************************//**
Count the dirty pages in the flush lists of all buffer pool instances that
belong to a given space id or to a given flush observer.
@return number of dirty pages present in all the buffer pools */
static
ulint
buf_flush_get_dirty_pages_count(
/*============================*/
	ulint		id,		/*!< in: space id to check */
	FlushObserver*	observer)	/*!< in: flush observer to check */
{
	ulint	count = 0;

	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		count += buf_pool_get_dirty_pages_count(buf_pool, id, observer);
	}

	return(count);
}

/** FlushObserver constructor
@param[in]	space	tablespace
@param[in]	trx	trx instance
@param[in]	stage	performance schema accounting object,
used by ALTER TABLE. It is passed to log_preflush_pool_modified_pages()
for accounting. */
FlushObserver::FlushObserver(
	fil_space_t*	space,
	trx_t*		trx,
	ut_stage_alter_t*	stage)
	:
	m_space(space),
	m_trx(trx),
	m_stage(stage),
	m_interrupted(false)
{
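	/* One counter per buffer pool instance, tracking the pages
	flushed and the pages removed from the flush list on behalf of
	this observer. */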
	m_flushed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));
	m_removed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));

	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		m_flushed->at(i) = 0;
		m_removed->at(i) = 0;
	}

	DBUG_LOG("flush", "FlushObserver(): trx->id=" << m_trx->id);
}

/** FlushObserver destructor */
FlushObserver::~FlushObserver()
{
	ut_ad(buf_flush_get_dirty_pages_count(m_space->id, this) == 0);

	UT_DELETE(m_flushed);
	UT_DELETE(m_removed);

	DBUG_LOG("flush", "~FlushObserver(): trx->id=" << m_trx->id);
}

/** Check whether trx is interrupted
@return true if trx is interrupted */
bool
FlushObserver::check_interrupted()
{
	if (trx_is_interrupted(m_trx)) {
		interrupted();

		return(true);
	}

	return(false);
}

/** Notify observer of a flush
@param[in]	buf_pool	buffer pool instance
@param[in]	bpage		buffer page to flush */
void
FlushObserver::notify_flush(
	buf_pool_t*	buf_pool,
	buf_page_t*	bpage)
{
	ut_ad(buf_pool_mutex_own(buf_pool));

	m_flushed->at(buf_pool->instance_no)++;

	if (m_stage != NULL) {
		m_stage->inc();
	}

	DBUG_LOG("flush", "Flush " << bpage->id);
}

/** Notify observer of a remove
@param[in]	buf_pool	buffer pool instance
@param[in]	bpage		buffer page flushed */
void
FlushObserver::notify_remove(
	buf_pool_t*	buf_pool,
	buf_page_t*	bpage)
{
	ut_ad(buf_pool_mutex_own(buf_pool));

	m_removed->at(buf_pool->instance_no)++;

	DBUG_LOG("flush", "Remove " << bpage->id);
}

/** Flush dirty pages and wait. */
void
FlushObserver::flush()
{
	ut_ad(m_trx);

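	/* When an ALTER TABLE stage tracker is attached, seed the flush
	phase with the number of dirty pages that still need to be
	written for this tablespace. */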
	if (!m_interrupted && m_stage) {
		m_stage->begin_phase_flush(buf_flush_get_dirty_pages_count(
						m_space->id, this));
	}

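	/* Request that the dirty pages of this tablespace be flushed
	from every buffer pool instance. */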
	buf_LRU_flush_or_remove_pages(m_space->id, this);

	/* Wait until all the dirty pages have been flushed. */
	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		while (!is_complete(i)) {

			os_thread_sleep(2000);
		}
	}
}
