1/*****************************************************************************
2
3Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2013, 2018, MariaDB Corporation.
5Copyright (c) 2013, 2014, Fusion-io
6
7This program is free software; you can redistribute it and/or modify it under
8the terms of the GNU General Public License as published by the Free Software
9Foundation; version 2 of the License.
10
11This program is distributed in the hope that it will be useful, but WITHOUT
12ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
13FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
14
15You should have received a copy of the GNU General Public License along with
16this program; if not, write to the Free Software Foundation, Inc.,
1751 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
18
19*****************************************************************************/
20
21/**************************************************//**
22@file buf/buf0flu.cc
23The database buffer buf_pool flush algorithm
24
25Created 11/11/1995 Heikki Tuuri
26*******************************************************/
27
28#include "ha_prototypes.h"
29#include <mysql/service_thd_wait.h>
30#include <my_dbug.h>
31#include <sql_class.h>
32
33#include "buf0flu.h"
34#include "buf0buf.h"
35#include "buf0checksum.h"
36#include "srv0start.h"
37#include "srv0srv.h"
38#include "page0zip.h"
39#include "ut0byte.h"
40#include "page0page.h"
41#include "fil0fil.h"
42#include "buf0lru.h"
43#include "buf0rea.h"
44#include "ibuf0ibuf.h"
45#include "log0log.h"
46#include "os0file.h"
47#include "trx0sys.h"
48#include "srv0mon.h"
49#include "fsp0sysspace.h"
50#include "ut0stage.h"
51#include "fil0pagecompress.h"
52#ifdef UNIV_LINUX
53/* include defs for CPU time priority settings */
54#include <unistd.h>
55#include <sys/syscall.h>
56#include <sys/time.h>
57#include <sys/resource.h>
58static const int buf_flush_page_cleaner_priority = -20;
59#endif /* UNIV_LINUX */
60
61/** Sleep time in microseconds for loop waiting for the oldest
62modification lsn */
63static const ulint buf_flush_wait_flushed_sleep_time = 10000;
64
65#include <my_service_manager.h>
66
/** Number of pages flushed through non-flush_list flushes. */
68static ulint buf_lru_flush_page_count = 0;
69
70/** Flag indicating if the page_cleaner is in active state. This flag
71is set to TRUE by the page_cleaner thread when it is spawned and is set
back to FALSE at shutdown by the page_cleaner as well. Therefore there is
no need to protect it with a mutex. It is only ever read by the thread
doing the shutdown. */
75bool buf_page_cleaner_is_active;
76
77/** Factor for scan length to determine n_pages for intended oldest LSN
78progress */
79static ulint buf_flush_lsn_scan_factor = 3;
80
81/** Average redo generation rate */
82static lsn_t lsn_avg_rate = 0;
83
84/** Target oldest LSN for the requested flush_sync */
85static lsn_t buf_flush_sync_lsn = 0;
86
87#ifdef UNIV_PFS_THREAD
88mysql_pfs_key_t page_cleaner_thread_key;
89#endif /* UNIV_PFS_THREAD */
90
91/** Event to synchronise with the flushing. */
92os_event_t buf_flush_event;
93
94/** State for page cleaner array slot */
95enum page_cleaner_state_t {
	/** No flushing has been requested yet.
	Moved from FINISHED by the coordinator. */
98 PAGE_CLEANER_STATE_NONE = 0,
99 /** Requested but not started flushing.
100 Moved from NONE by the coordinator. */
101 PAGE_CLEANER_STATE_REQUESTED,
	/** Flushing is ongoing.
103 Moved from REQUESTED by the worker. */
104 PAGE_CLEANER_STATE_FLUSHING,
105 /** Flushing was finished.
106 Moved from FLUSHING by the worker. */
107 PAGE_CLEANER_STATE_FINISHED
108};
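
/* The states above form a cycle for each slot:
NONE -> REQUESTED (moved by the coordinator),
REQUESTED -> FLUSHING -> FINISHED (moved by the worker),
FINISHED -> NONE (moved by the coordinator). */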
109
110/** Page cleaner request state for each buffer pool instance */
111struct page_cleaner_slot_t {
	page_cleaner_state_t	state;	/*!< state of the request,
					protected by page_cleaner_t::mutex.
					Once the worker thread has got the
					slot and set it to
					PAGE_CLEANER_STATE_FLUSHING,
					n_flushed_lru and n_flushed_list can
					be updated only by that worker */
118 /* This value is set during state==PAGE_CLEANER_STATE_NONE */
119 ulint n_pages_requested;
120 /*!< number of requested pages
121 for the slot */
	/* These values are updated during state==PAGE_CLEANER_STATE_FLUSHING,
	and committed with state==PAGE_CLEANER_STATE_FINISHED.
	Their consistency is protected by the 'state'. */
125 ulint n_flushed_lru;
126 /*!< number of flushed pages
127 by LRU scan flushing */
128 ulint n_flushed_list;
129 /*!< number of flushed pages
130 by flush_list flushing */
131 bool succeeded_list;
132 /*!< true if flush_list flushing
133 succeeded. */
134 ulint flush_lru_time;
135 /*!< elapsed time for LRU flushing */
136 ulint flush_list_time;
137 /*!< elapsed time for flush_list
138 flushing */
	ulint			flush_lru_pass;
					/*!< number of LRU flushing passes
					attempted */
	ulint			flush_list_pass;
					/*!< number of flush_list flushing
					passes attempted */
144};
145
146/** Page cleaner structure common for all threads */
147struct page_cleaner_t {
148 ib_mutex_t mutex; /*!< mutex to protect whole of
149 page_cleaner_t struct and
150 page_cleaner_slot_t slots. */
151 os_event_t is_requested; /*!< event to activate worker
152 threads. */
153 os_event_t is_finished; /*!< event to signal that all
154 slots were finished. */
155 os_event_t is_started; /*!< event to signal that
156 thread is started/exiting */
157 volatile ulint n_workers; /*!< number of worker threads
158 in existence */
	bool			requested;	/*!< true if flushing of pages
						has been requested */
161 lsn_t lsn_limit; /*!< upper limit of LSN to be
162 flushed */
163 ulint n_slots; /*!< total number of slots */
164 ulint n_slots_requested;
165 /*!< number of slots
166 in the state
167 PAGE_CLEANER_STATE_REQUESTED */
168 ulint n_slots_flushing;
169 /*!< number of slots
170 in the state
171 PAGE_CLEANER_STATE_FLUSHING */
172 ulint n_slots_finished;
173 /*!< number of slots
174 in the state
175 PAGE_CLEANER_STATE_FINISHED */
176 ulint flush_time; /*!< elapsed time to flush
177 requests for all slots */
	ulint			flush_pass;	/*!< number of passes needed to
						finish flushing the requests
						for all slots */
180 page_cleaner_slot_t slots[MAX_BUFFER_POOLS];
	bool			is_running;	/*!< false if a shutdown is
						being attempted */
183
184#ifdef UNIV_DEBUG
185 ulint n_disabled_debug;
186 /*<! how many of pc threads
					/*!< number of page cleaner threads
					that have been disabled */
189};
190
191static page_cleaner_t page_cleaner;
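
/* Illustrative sketch (not compiled in): roughly how a worker thread could
claim one request slot using the fields above. The helper name and the exact
loop shape are assumptions for illustration only; the real coordination also
involves the is_requested/is_finished events. */
#if 0
static void page_cleaner_claim_slot_sketch()
{
	mutex_enter(&page_cleaner.mutex);

	for (ulint i = 0; i < page_cleaner.n_slots; i++) {
		page_cleaner_slot_t*	slot = &page_cleaner.slots[i];

		if (slot->state == PAGE_CLEANER_STATE_REQUESTED) {
			/* REQUESTED -> FLUSHING is done by the worker. */
			slot->state = PAGE_CLEANER_STATE_FLUSHING;
			page_cleaner.n_slots_requested--;
			page_cleaner.n_slots_flushing++;
			mutex_exit(&page_cleaner.mutex);

			/* ... flush up to slot->n_pages_requested pages
			of buffer pool instance i, then mark the slot
			PAGE_CLEANER_STATE_FINISHED ... */
			return;
		}
	}

	mutex_exit(&page_cleaner.mutex);
}
#endif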
192
193#ifdef UNIV_DEBUG
194my_bool innodb_page_cleaner_disabled_debug;
195#endif /* UNIV_DEBUG */
196
/** If the LRU list of a buffer pool instance is shorter than this, LRU
eviction should not happen. This is because when we do LRU flushing we also
put the blocks on the free list. If the LRU list is very small, we can end
up thrashing. */
201#define BUF_LRU_MIN_LEN 256
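
/* For example, the LRU batch in buf_flush_LRU_list_batch() below keeps
scanning only while UT_LIST_GET_LEN(buf_pool->LRU) stays above
BUF_LRU_MIN_LEN. */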
202
203/* @} */
204
205/******************************************************************//**
Increase the flush_list size in bytes by the physical page size. */
207static inline
208void
209incr_flush_list_size_in_bytes(
210/*==========================*/
211 buf_block_t* block, /*!< in: control block */
212 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
213{
214 ut_ad(buf_flush_list_mutex_own(buf_pool));
215
216 buf_pool->stat.flush_list_bytes += block->page.size.physical();
217
218 ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size);
219}
220
221#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
222/******************************************************************//**
223Validates the flush list.
224@return TRUE if ok */
225static
226ibool
227buf_flush_validate_low(
228/*===================*/
229 buf_pool_t* buf_pool); /*!< in: Buffer pool instance */
230
231/******************************************************************//**
232Validates the flush list some of the time.
233@return TRUE if ok or the check was skipped */
234static
235ibool
236buf_flush_validate_skip(
237/*====================*/
238 buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
239{
240/** Try buf_flush_validate_low() every this many times */
241# define BUF_FLUSH_VALIDATE_SKIP 23
242
243 /** The buf_flush_validate_low() call skip counter.
244 Use a signed type because of the race condition below. */
245 static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
246
247 /* There is a race condition below, but it does not matter,
248 because this call is only for heuristic purposes. We want to
249 reduce the call frequency of the costly buf_flush_validate_low()
250 check in debug builds. */
251 if (--buf_flush_validate_count > 0) {
252 return(TRUE);
253 }
254
255 buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP;
256 return(buf_flush_validate_low(buf_pool));
257}
258#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
259
260/******************************************************************//**
Inserts a block into the flush_rbt and returns a pointer to its
predecessor, or NULL if there is no predecessor. The ordering is maintained
263on the basis of the <oldest_modification, space, offset> key.
264@return pointer to the predecessor or NULL if no predecessor. */
265static
266buf_page_t*
267buf_flush_insert_in_flush_rbt(
268/*==========================*/
269 buf_page_t* bpage) /*!< in: bpage to be inserted. */
270{
271 const ib_rbt_node_t* c_node;
272 const ib_rbt_node_t* p_node;
273 buf_page_t* prev = NULL;
274 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
275
276 ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
277 ut_ad(buf_flush_list_mutex_own(buf_pool));
278
279 /* Insert this buffer into the rbt. */
280 c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage);
281 ut_a(c_node != NULL);
282
283 /* Get the predecessor. */
284 p_node = rbt_prev(buf_pool->flush_rbt, c_node);
285
286 if (p_node != NULL) {
287 buf_page_t** value;
288 value = rbt_value(buf_page_t*, p_node);
289 prev = *value;
290 ut_a(prev != NULL);
291 }
292
293 return(prev);
294}
295
296/*********************************************************//**
297Delete a bpage from the flush_rbt. */
298static
299void
300buf_flush_delete_from_flush_rbt(
301/*============================*/
302 buf_page_t* bpage) /*!< in: bpage to be removed. */
303{
304#ifdef UNIV_DEBUG
305 ibool ret = FALSE;
306#endif /* UNIV_DEBUG */
307 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
308
309 ut_ad(buf_flush_list_mutex_own(buf_pool));
310
311#ifdef UNIV_DEBUG
312 ret =
313#endif /* UNIV_DEBUG */
314 rbt_delete(buf_pool->flush_rbt, &bpage);
315
316 ut_ad(ret);
317}
318
319/*****************************************************************//**
320Compare two modified blocks in the buffer pool. The key for comparison
321is:
322key = <oldest_modification, space, offset>
This comparison is used to maintain the ordering of blocks in the
324buf_pool->flush_rbt.
325Note that for the purpose of flush_rbt, we only need to order blocks
326on the oldest_modification. The other two fields are used to uniquely
327identify the blocks.
328@return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */
329static
330int
331buf_flush_block_cmp(
332/*================*/
333 const void* p1, /*!< in: block1 */
334 const void* p2) /*!< in: block2 */
335{
336 int ret;
337 const buf_page_t* b1 = *(const buf_page_t**) p1;
338 const buf_page_t* b2 = *(const buf_page_t**) p2;
339
340 ut_ad(b1 != NULL);
341 ut_ad(b2 != NULL);
342
343#ifdef UNIV_DEBUG
344 buf_pool_t* buf_pool = buf_pool_from_bpage(b1);
345#endif /* UNIV_DEBUG */
346
347 ut_ad(buf_flush_list_mutex_own(buf_pool));
348
349 ut_ad(b1->in_flush_list);
350 ut_ad(b2->in_flush_list);
351
352 if (b2->oldest_modification > b1->oldest_modification) {
353 return(1);
354 } else if (b2->oldest_modification < b1->oldest_modification) {
355 return(-1);
356 }
357
358 /* If oldest_modification is same then decide on the space. */
359 ret = (int)(b2->id.space() - b1->id.space());
360
361 /* Or else decide ordering on the page number. */
362 return(ret ? ret : (int) (b2->id.page_no() - b1->id.page_no()));
363}
364
365/********************************************************************//**
366Initialize the red-black tree to speed up insertions into the flush_list
during the recovery process. Should be called at the start of the recovery
process, before any page has been read or written. */
369void
370buf_flush_init_flush_rbt(void)
371/*==========================*/
372{
373 ulint i;
374
375 for (i = 0; i < srv_buf_pool_instances; i++) {
376 buf_pool_t* buf_pool;
377
378 buf_pool = buf_pool_from_array(i);
379
380 buf_flush_list_mutex_enter(buf_pool);
381
382 ut_ad(buf_pool->flush_rbt == NULL);
383
384 /* Create red black tree for speedy insertions in flush list. */
385 buf_pool->flush_rbt = rbt_create(
386 sizeof(buf_page_t*), buf_flush_block_cmp);
387
388 buf_flush_list_mutex_exit(buf_pool);
389 }
390}
391
392/********************************************************************//**
393Frees up the red-black tree. */
394void
395buf_flush_free_flush_rbt(void)
396/*==========================*/
397{
398 ulint i;
399
400 for (i = 0; i < srv_buf_pool_instances; i++) {
401 buf_pool_t* buf_pool;
402
403 buf_pool = buf_pool_from_array(i);
404
405 buf_flush_list_mutex_enter(buf_pool);
406
407#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
408 ut_a(buf_flush_validate_low(buf_pool));
409#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
410
411 rbt_free(buf_pool->flush_rbt);
412 buf_pool->flush_rbt = NULL;
413
414 buf_flush_list_mutex_exit(buf_pool);
415 }
416}
417
418/********************************************************************//**
419Inserts a modified block into the flush list. */
420void
421buf_flush_insert_into_flush_list(
422/*=============================*/
423 buf_pool_t* buf_pool, /*!< buffer pool instance */
424 buf_block_t* block, /*!< in/out: block which is modified */
425 lsn_t lsn) /*!< in: oldest modification */
426{
427 ut_ad(!buf_pool_mutex_own(buf_pool));
428 ut_ad(log_flush_order_mutex_own());
429 ut_ad(buf_page_mutex_own(block));
430
431 buf_flush_list_mutex_enter(buf_pool);
432
433 ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL)
434 || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification
435 <= lsn));
436
437 /* If we are in the recovery then we need to update the flush
438 red-black tree as well. */
439 if (buf_pool->flush_rbt != NULL) {
440 buf_flush_list_mutex_exit(buf_pool);
441 buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn);
442 return;
443 }
444
445 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
446 ut_ad(!block->page.in_flush_list);
447
448 ut_d(block->page.in_flush_list = TRUE);
449 block->page.oldest_modification = lsn;
450
451 UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
452
453 incr_flush_list_size_in_bytes(block, buf_pool);
454
455#ifdef UNIV_DEBUG_VALGRIND
456 void* p;
457
458 if (block->page.size.is_compressed()) {
459 p = block->page.zip.data;
460 } else {
461 p = block->frame;
462 }
463
464 UNIV_MEM_ASSERT_RW(p, block->page.size.physical());
465#endif /* UNIV_DEBUG_VALGRIND */
466
467#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
468 ut_a(buf_flush_validate_skip(buf_pool));
469#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
470
471 buf_flush_list_mutex_exit(buf_pool);
472}
473
474/********************************************************************//**
475Inserts a modified block into the flush list in the right sorted position.
476This function is used by recovery, because there the modifications do not
477necessarily come in the order of lsn's. */
478void
479buf_flush_insert_sorted_into_flush_list(
480/*====================================*/
481 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
482 buf_block_t* block, /*!< in/out: block which is modified */
483 lsn_t lsn) /*!< in: oldest modification */
484{
485 buf_page_t* prev_b;
486 buf_page_t* b;
487
488 ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE);
489 ut_ad(!buf_pool_mutex_own(buf_pool));
490 ut_ad(log_flush_order_mutex_own());
491 ut_ad(buf_page_mutex_own(block));
492 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
493
494 buf_flush_list_mutex_enter(buf_pool);
495
496 /* The field in_LRU_list is protected by buf_pool->mutex, which
497 we are not holding. However, while a block is in the flush
	list, it is dirty and cannot be discarded, neither from the
	page_hash nor from the LRU list. At most, the uncompressed
500 page frame of a compressed block may be discarded or created
501 (copying the block->page to or from a buf_page_t that is
502 dynamically allocated from buf_buddy_alloc()). Because those
503 transitions hold block->mutex and the flush list mutex (via
504 buf_flush_relocate_on_flush_list()), there is no possibility
505 of a race condition in the assertions below. */
506 ut_ad(block->page.in_LRU_list);
507 ut_ad(block->page.in_page_hash);
508 /* buf_buddy_block_register() will take a block in the
509 BUF_BLOCK_MEMORY state, not a file page. */
510 ut_ad(!block->page.in_zip_hash);
511
512 ut_ad(!block->page.in_flush_list);
513 ut_d(block->page.in_flush_list = TRUE);
514 block->page.oldest_modification = lsn;
515
516#ifdef UNIV_DEBUG_VALGRIND
517 void* p;
518
519 if (block->page.size.is_compressed()) {
520 p = block->page.zip.data;
521 } else {
522 p = block->frame;
523 }
524
525 UNIV_MEM_ASSERT_RW(p, block->page.size.physical());
526#endif /* UNIV_DEBUG_VALGRIND */
527
528 prev_b = NULL;
529
530 /* For the most part when this function is called the flush_rbt
531 should not be NULL. In a very rare boundary case it is possible
532 that the flush_rbt has already been freed by the recovery thread
533 before the last page was hooked up in the flush_list by the
534 io-handler thread. In that case we'll just do a simple
535 linear search in the else block. */
536 if (buf_pool->flush_rbt != NULL) {
537
538 prev_b = buf_flush_insert_in_flush_rbt(&block->page);
539
540 } else {
541
542 b = UT_LIST_GET_FIRST(buf_pool->flush_list);
543
544 while (b != NULL && b->oldest_modification
545 > block->page.oldest_modification) {
546
547 ut_ad(b->in_flush_list);
548 prev_b = b;
549 b = UT_LIST_GET_NEXT(list, b);
550 }
551 }
552
553 if (prev_b == NULL) {
554 UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page);
555 } else {
556 UT_LIST_INSERT_AFTER(buf_pool->flush_list, prev_b, &block->page);
557 }
558
559 incr_flush_list_size_in_bytes(block, buf_pool);
560
561#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
562 ut_a(buf_flush_validate_low(buf_pool));
563#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
564
565 buf_flush_list_mutex_exit(buf_pool);
566}
567
568/********************************************************************//**
569Returns TRUE if the file page block is immediately suitable for replacement,
i.e., the transition FILE_PAGE => NOT_USED is allowed.
571@return TRUE if can replace immediately */
572ibool
573buf_flush_ready_for_replace(
574/*========================*/
575 buf_page_t* bpage) /*!< in: buffer control block, must be
576 buf_page_in_file(bpage) and in the LRU list */
577{
578#ifdef UNIV_DEBUG
579 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
580 ut_ad(buf_pool_mutex_own(buf_pool));
581#endif /* UNIV_DEBUG */
582 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
583 ut_ad(bpage->in_LRU_list);
584
585 if (buf_page_in_file(bpage)) {
586
587 return(bpage->oldest_modification == 0
588 && bpage->buf_fix_count == 0
589 && buf_page_get_io_fix(bpage) == BUF_IO_NONE);
590 }
591
592 ib::fatal() << "Buffer block " << bpage << " state " << bpage->state
593 << " in the LRU list!";
594
595 return(FALSE);
596}
597
598/********************************************************************//**
599Returns true if the block is modified and ready for flushing.
600@return true if can flush immediately */
601bool
602buf_flush_ready_for_flush(
603/*======================*/
604 buf_page_t* bpage, /*!< in: buffer control block, must be
605 buf_page_in_file(bpage) */
606 buf_flush_t flush_type)/*!< in: type of flush */
607{
608#ifdef UNIV_DEBUG
609 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
610 ut_ad(buf_pool_mutex_own(buf_pool));
611#endif /* UNIV_DEBUG */
612
613 ut_a(buf_page_in_file(bpage));
614 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
615 ut_ad(flush_type < BUF_FLUSH_N_TYPES);
616
617 if (bpage->oldest_modification == 0
618 || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
619 return(false);
620 }
621
622 ut_ad(bpage->in_flush_list);
623
624 switch (flush_type) {
625 case BUF_FLUSH_LIST:
626 case BUF_FLUSH_LRU:
627 case BUF_FLUSH_SINGLE_PAGE:
628 return(true);
629
630 case BUF_FLUSH_N_TYPES:
631 break;
632 }
633
634 ut_error;
635 return(false);
636}
637
638/********************************************************************//**
639Remove a block from the flush list of modified blocks. */
640void
641buf_flush_remove(
642/*=============*/
643 buf_page_t* bpage) /*!< in: pointer to the block in question */
644{
645 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
646
647#if 0 // FIXME: Rate-limit the output. Move this to the page cleaner?
648 if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)) {
649 service_manager_extend_timeout(
650 INNODB_EXTEND_TIMEOUT_INTERVAL,
651 "Flush and remove page with tablespace id %u"
652 ", Poolid " ULINTPF ", flush list length " ULINTPF,
653 bpage->space, buf_pool->instance_no,
654 UT_LIST_GET_LEN(buf_pool->flush_list));
655 }
656#endif
657
658 ut_ad(buf_pool_mutex_own(buf_pool));
659 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
660 ut_ad(bpage->in_flush_list);
661
662 buf_flush_list_mutex_enter(buf_pool);
663
664 /* Important that we adjust the hazard pointer before removing
665 the bpage from flush list. */
666 buf_pool->flush_hp.adjust(bpage);
667
668 switch (buf_page_get_state(bpage)) {
669 case BUF_BLOCK_POOL_WATCH:
670 case BUF_BLOCK_ZIP_PAGE:
671 /* Clean compressed pages should not be on the flush list */
672 case BUF_BLOCK_NOT_USED:
673 case BUF_BLOCK_READY_FOR_USE:
674 case BUF_BLOCK_MEMORY:
675 case BUF_BLOCK_REMOVE_HASH:
676 ut_error;
677 return;
678 case BUF_BLOCK_ZIP_DIRTY:
679 buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE);
680 UT_LIST_REMOVE(buf_pool->flush_list, bpage);
681#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
682 buf_LRU_insert_zip_clean(bpage);
683#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
684 break;
685 case BUF_BLOCK_FILE_PAGE:
686 UT_LIST_REMOVE(buf_pool->flush_list, bpage);
687 break;
688 }
689
690 /* If the flush_rbt is active then delete from there as well. */
691 if (buf_pool->flush_rbt != NULL) {
692 buf_flush_delete_from_flush_rbt(bpage);
693 }
694
695 /* Must be done after we have removed it from the flush_rbt
696 because we assert on in_flush_list in comparison function. */
697 ut_d(bpage->in_flush_list = FALSE);
698
699 buf_pool->stat.flush_list_bytes -= bpage->size.physical();
700
701 bpage->oldest_modification = 0;
702
703#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
704 ut_a(buf_flush_validate_skip(buf_pool));
705#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
706
	/* If there is an observer that wants to know whether the asynchronous
	flushing was done, notify it. */
709 if (bpage->flush_observer != NULL) {
710 bpage->flush_observer->notify_remove(buf_pool, bpage);
711
712 bpage->flush_observer = NULL;
713 }
714
715 buf_flush_list_mutex_exit(buf_pool);
716}
717
718/*******************************************************************//**
719Relocates a buffer control block on the flush_list.
720Note that it is assumed that the contents of bpage have already been
721copied to dpage.
722IMPORTANT: When this function is called bpage and dpage are not
723exact copies of each other. For example, they both will have different
724::state. Also the ::list pointers in dpage may be stale. We need to
725use the current list node (bpage) to do the list manipulation because
726the list pointers could have changed between the time that we copied
727the contents of bpage to the dpage and the flush list manipulation
728below. */
729void
730buf_flush_relocate_on_flush_list(
731/*=============================*/
732 buf_page_t* bpage, /*!< in/out: control block being moved */
733 buf_page_t* dpage) /*!< in/out: destination block */
734{
735 buf_page_t* prev;
736 buf_page_t* prev_b = NULL;
737 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
738
739 ut_ad(buf_pool_mutex_own(buf_pool));
740 /* Must reside in the same buffer pool. */
741 ut_ad(buf_pool == buf_pool_from_bpage(dpage));
742
743 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
744
745 buf_flush_list_mutex_enter(buf_pool);
746
747 /* FIXME: At this point we have both buf_pool and flush_list
748 mutexes. Theoretically removal of a block from flush list is
749 only covered by flush_list mutex but currently we do
750 have buf_pool mutex in buf_flush_remove() therefore this block
751 is guaranteed to be in the flush list. We need to check if
752 this will work without the assumption of block removing code
753 having the buf_pool mutex. */
754 ut_ad(bpage->in_flush_list);
755 ut_ad(dpage->in_flush_list);
756
757 /* If recovery is active we must swap the control blocks in
758 the flush_rbt as well. */
759 if (buf_pool->flush_rbt != NULL) {
760 buf_flush_delete_from_flush_rbt(bpage);
761 prev_b = buf_flush_insert_in_flush_rbt(dpage);
762 }
763
764 /* Important that we adjust the hazard pointer before removing
765 the bpage from the flush list. */
766 buf_pool->flush_hp.adjust(bpage);
767
768 /* Must be done after we have removed it from the flush_rbt
769 because we assert on in_flush_list in comparison function. */
770 ut_d(bpage->in_flush_list = FALSE);
771
772 prev = UT_LIST_GET_PREV(list, bpage);
773 UT_LIST_REMOVE(buf_pool->flush_list, bpage);
774
775 if (prev) {
776 ut_ad(prev->in_flush_list);
777 UT_LIST_INSERT_AFTER( buf_pool->flush_list, prev, dpage);
778 } else {
779 UT_LIST_ADD_FIRST(buf_pool->flush_list, dpage);
780 }
781
782 /* Just an extra check. Previous in flush_list
783 should be the same control block as in flush_rbt. */
784 ut_a(buf_pool->flush_rbt == NULL || prev_b == prev);
785
786#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
787 ut_a(buf_flush_validate_low(buf_pool));
788#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
789
790 buf_flush_list_mutex_exit(buf_pool);
791}
792
793/** Update the flush system data structures when a write is completed.
794@param[in,out] bpage flushed page
795@param[in] dblwr whether the doublewrite buffer was used */
796void buf_flush_write_complete(buf_page_t* bpage, bool dblwr)
797{
798 buf_flush_t flush_type;
799 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
800
801 ut_ad(bpage);
802
803 buf_flush_remove(bpage);
804
805 flush_type = buf_page_get_flush_type(bpage);
806 buf_pool->n_flush[flush_type]--;
807 ut_ad(buf_pool->n_flush[flush_type] != ULINT_MAX);
808
809 ut_ad(buf_pool_mutex_own(buf_pool));
810
811 if (buf_pool->n_flush[flush_type] == 0
812 && buf_pool->init_flush[flush_type] == FALSE) {
813
814 /* The running flush batch has ended */
815
816 os_event_set(buf_pool->no_flush[flush_type]);
817 }
818
819 if (dblwr) {
820 buf_dblwr_update(bpage, flush_type);
821 }
822}
823
/** Calculate the checksum of a page of a compressed table and update
825the page.
826@param[in,out] page page to update
827@param[in] size compressed page size
828@param[in] lsn LSN to stamp on the page */
829void
830buf_flush_update_zip_checksum(
831 buf_frame_t* page,
832 ulint size,
833 lsn_t lsn)
834{
835 ut_a(size > 0);
836
837 const uint32_t checksum = page_zip_calc_checksum(
838 page, size,
839 static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm));
840
841 mach_write_to_8(page + FIL_PAGE_LSN, lsn);
842 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum);
843}
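
/* Illustrative sketch (not compiled in): reading back the fields stamped by
buf_flush_update_zip_checksum() above. A sketch only; real verification of
compressed pages is done by page_zip_verify_checksum(). */
#if 0
static void buf_flush_zip_stamp_readback_sketch(const byte* page, lsn_t lsn)
{
	/* The LSN written to FIL_PAGE_LSN above can be read back as-is. */
	ut_ad(mach_read_from_8(page + FIL_PAGE_LSN) == lsn);

	/* The stored checksum lives in FIL_PAGE_SPACE_OR_CHKSUM. */
	const uint32_t	stored = static_cast<uint32_t>(
		mach_read_from_4(page + FIL_PAGE_SPACE_OR_CHKSUM));
	(void) stored;
}
#endif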
844
845/** Initialize a page for writing to the tablespace.
846@param[in] block buffer block; NULL if bypassing the buffer pool
847@param[in,out] page page frame
848@param[in,out] page_zip_ compressed page, or NULL if uncompressed
849@param[in] newest_lsn newest modification LSN to the page */
850void
851buf_flush_init_for_writing(
852 const buf_block_t* block,
853 byte* page,
854 void* page_zip_,
855 lsn_t newest_lsn)
856{
857 ut_ad(block == NULL || block->frame == page);
858 ut_ad(block == NULL || page_zip_ == NULL
859 || &block->page.zip == page_zip_);
860 ut_ad(page);
861
862 if (page_zip_) {
863 page_zip_des_t* page_zip;
864 ulint size;
865
866 page_zip = static_cast<page_zip_des_t*>(page_zip_);
867 size = page_zip_get_size(page_zip);
868
869 ut_ad(size);
870 ut_ad(ut_is_2pow(size));
871 ut_ad(size <= UNIV_ZIP_SIZE_MAX);
872
873 switch (fil_page_get_type(page)) {
874 case FIL_PAGE_TYPE_ALLOCATED:
875 case FIL_PAGE_INODE:
876 case FIL_PAGE_IBUF_BITMAP:
877 case FIL_PAGE_TYPE_FSP_HDR:
878 case FIL_PAGE_TYPE_XDES:
879 /* These are essentially uncompressed pages. */
880 memcpy(page_zip->data, page, size);
881 /* fall through */
882 case FIL_PAGE_TYPE_ZBLOB:
883 case FIL_PAGE_TYPE_ZBLOB2:
884 case FIL_PAGE_INDEX:
885 case FIL_PAGE_RTREE:
886
887 buf_flush_update_zip_checksum(
888 page_zip->data, size, newest_lsn);
889
890 return;
891 }
892
893 ib::error() << "The compressed page to be written"
894 " seems corrupt:";
895 ut_print_buf(stderr, page, size);
896 fputs("\nInnoDB: Possibly older version of the page:", stderr);
897 ut_print_buf(stderr, page_zip->data, size);
898 putc('\n', stderr);
899 ut_error;
900 }
901
902 /* Write the newest modification lsn to the page header and trailer */
903 mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn);
904
905 mach_write_to_8(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
906 newest_lsn);
907
908 if (block && srv_page_size == 16384) {
909 /* The page type could be garbage in old files
910 created before MySQL 5.5. Such files always
911 had a page size of 16 kilobytes. */
912 ulint page_type = fil_page_get_type(page);
913 ulint reset_type = page_type;
914
915 switch (block->page.id.page_no() % 16384) {
916 case 0:
917 reset_type = block->page.id.page_no() == 0
918 ? FIL_PAGE_TYPE_FSP_HDR
919 : FIL_PAGE_TYPE_XDES;
920 break;
921 case 1:
922 reset_type = FIL_PAGE_IBUF_BITMAP;
923 break;
924 case FSP_TRX_SYS_PAGE_NO:
925 if (block->page.id.page_no()
926 == TRX_SYS_PAGE_NO
927 && block->page.id.space()
928 == TRX_SYS_SPACE) {
929 reset_type = FIL_PAGE_TYPE_TRX_SYS;
930 break;
931 }
932 /* fall through */
933 default:
934 switch (page_type) {
935 case FIL_PAGE_INDEX:
936 case FIL_PAGE_TYPE_INSTANT:
937 case FIL_PAGE_RTREE:
938 case FIL_PAGE_UNDO_LOG:
939 case FIL_PAGE_INODE:
940 case FIL_PAGE_IBUF_FREE_LIST:
941 case FIL_PAGE_TYPE_ALLOCATED:
942 case FIL_PAGE_TYPE_SYS:
943 case FIL_PAGE_TYPE_TRX_SYS:
944 case FIL_PAGE_TYPE_BLOB:
945 case FIL_PAGE_TYPE_ZBLOB:
946 case FIL_PAGE_TYPE_ZBLOB2:
947 break;
948 case FIL_PAGE_TYPE_FSP_HDR:
949 case FIL_PAGE_TYPE_XDES:
950 case FIL_PAGE_IBUF_BITMAP:
951 /* These pages should have
952 predetermined page numbers
953 (see above). */
954 default:
955 reset_type = FIL_PAGE_TYPE_UNKNOWN;
956 break;
957 }
958 }
959
960 if (UNIV_UNLIKELY(page_type != reset_type)) {
961 ib::info()
962 << "Resetting invalid page "
963 << block->page.id << " type "
964 << page_type << " to "
965 << reset_type << " when flushing.";
966 fil_page_set_type(page, reset_type);
967 }
968 }
969
970 uint32_t checksum= 0;
971
972 switch (srv_checksum_algorithm_t(srv_checksum_algorithm)) {
973 case SRV_CHECKSUM_ALGORITHM_INNODB:
974 case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:
975 checksum = buf_calc_page_new_checksum(page);
976 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
977 checksum);
978 /* With the InnoDB checksum, we overwrite the first 4 bytes of
979 the end lsn field to store the old formula checksum. Since it
980 depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to
981 be calculated after storing the new formula checksum. */
982 checksum = buf_calc_page_old_checksum(page);
983 break;
984 case SRV_CHECKSUM_ALGORITHM_CRC32:
985 case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:
986 /* In other cases we write the same checksum to both fields. */
987 checksum = buf_calc_page_crc32(page);
988 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
989 checksum);
990 break;
991 case SRV_CHECKSUM_ALGORITHM_NONE:
992 case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:
993 checksum = BUF_NO_CHECKSUM_MAGIC;
994 mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM,
995 checksum);
996 break;
997 /* no default so the compiler will emit a warning if
998 new enum is added and not handled here */
999 }
1000
1001 mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM,
1002 checksum);
1003}
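
/* Illustrative sketch (not compiled in): how the two checksum fields written
above relate for an uncompressed page stamped with the legacy InnoDB checksum
algorithm. The header field holds the "new" formula checksum, while the first
4 bytes of the trailer hold the "old" formula checksum, which itself covers
the already-written header field. This is a sketch, not a verification
routine. */
#if 0
static void buf_flush_innodb_checksum_fields_sketch(const byte* page)
{
	const uint32_t	header = static_cast<uint32_t>(
		mach_read_from_4(page + FIL_PAGE_SPACE_OR_CHKSUM));
	const uint32_t	trailer = static_cast<uint32_t>(
		mach_read_from_4(page + srv_page_size
				 - FIL_PAGE_END_LSN_OLD_CHKSUM));

	ut_ad(header == buf_calc_page_new_checksum(page));
	ut_ad(trailer == buf_calc_page_old_checksum(page));
}
#endif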
1004
1005/********************************************************************//**
1006Does an asynchronous write of a buffer page. NOTE: in simulated aio and
1007also when the doublewrite buffer is used, we must call
1008buf_dblwr_flush_buffered_writes after we have posted a batch of
1009writes! */
1010static
1011void
1012buf_flush_write_block_low(
1013/*======================*/
1014 buf_page_t* bpage, /*!< in: buffer block to write */
1015 buf_flush_t flush_type, /*!< in: type of flush */
1016 bool sync) /*!< in: true if sync IO request */
1017{
1018 fil_space_t* space = fil_space_acquire_for_io(bpage->id.space());
1019 if (!space) {
1020 return;
1021 }
1022 ut_ad(space->purpose == FIL_TYPE_TEMPORARY
1023 || space->purpose == FIL_TYPE_IMPORT
1024 || space->purpose == FIL_TYPE_TABLESPACE);
1025 ut_ad((space->purpose == FIL_TYPE_TEMPORARY)
1026 == (space == fil_system.temp_space));
1027 page_t* frame = NULL;
1028#ifdef UNIV_DEBUG
1029 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1030 ut_ad(!buf_pool_mutex_own(buf_pool));
1031#endif /* UNIV_DEBUG */
1032
1033 DBUG_PRINT("ib_buf", ("flush %s %u page %u:%u",
1034 sync ? "sync" : "async", (unsigned) flush_type,
1035 bpage->id.space(), bpage->id.page_no()));
1036
1037 ut_ad(buf_page_in_file(bpage));
1038
1039 /* We are not holding buf_pool->mutex or block_mutex here.
1040 Nevertheless, it is safe to access bpage, because it is
1041 io_fixed and oldest_modification != 0. Thus, it cannot be
1042 relocated in the buffer pool or removed from flush_list or
1043 LRU_list. */
1044 ut_ad(!buf_pool_mutex_own(buf_pool));
1045 ut_ad(!buf_flush_list_mutex_own(buf_pool));
1046 ut_ad(!buf_page_get_mutex(bpage)->is_owned());
1047 ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
1048 ut_ad(bpage->oldest_modification != 0);
1049
1050#ifdef UNIV_IBUF_COUNT_DEBUG
1051 ut_a(ibuf_count_get(bpage->id) == 0);
1052#endif /* UNIV_IBUF_COUNT_DEBUG */
1053
1054 ut_ad(bpage->newest_modification != 0);
1055
1056 /* Force the log to the disk before writing the modified block */
1057 if (!srv_read_only_mode) {
1058 log_write_up_to(bpage->newest_modification, true);
1059 }
1060
1061 switch (buf_page_get_state(bpage)) {
1062 case BUF_BLOCK_POOL_WATCH:
1063 case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */
1064 case BUF_BLOCK_NOT_USED:
1065 case BUF_BLOCK_READY_FOR_USE:
1066 case BUF_BLOCK_MEMORY:
1067 case BUF_BLOCK_REMOVE_HASH:
1068 ut_error;
1069 break;
1070 case BUF_BLOCK_ZIP_DIRTY:
1071 frame = bpage->zip.data;
1072
1073 mach_write_to_8(frame + FIL_PAGE_LSN,
1074 bpage->newest_modification);
1075
1076 ut_a(page_zip_verify_checksum(frame, bpage->size.physical()));
1077 break;
1078 case BUF_BLOCK_FILE_PAGE:
1079 frame = bpage->zip.data;
1080 if (!frame) {
1081 frame = ((buf_block_t*) bpage)->frame;
1082 }
1083
1084 buf_flush_init_for_writing(
1085 reinterpret_cast<const buf_block_t*>(bpage),
1086 reinterpret_cast<const buf_block_t*>(bpage)->frame,
1087 bpage->zip.data ? &bpage->zip : NULL,
1088 bpage->newest_modification);
1089 break;
1090 }
1091
1092 frame = buf_page_encrypt_before_write(space, bpage, frame);
1093
1094 ut_ad(space->purpose == FIL_TYPE_TABLESPACE
1095 || space->atomic_write_supported);
1096 if (!space->use_doublewrite()) {
1097 ulint type = IORequest::WRITE | IORequest::DO_NOT_WAKE;
1098
1099 IORequest request(type, bpage);
1100
1101 /* TODO: pass the tablespace to fil_io() */
1102 fil_io(request,
1103 sync, bpage->id, bpage->size, 0, bpage->size.physical(),
1104 frame, bpage);
1105 } else {
1106 ut_ad(!srv_read_only_mode);
1107
1108 if (flush_type == BUF_FLUSH_SINGLE_PAGE) {
1109 buf_dblwr_write_single_page(bpage, sync);
1110 } else {
1111 ut_ad(!sync);
1112 buf_dblwr_add_to_batch(bpage);
1113 }
1114 }
1115
1116 /* When doing single page flushing the IO is done synchronously
1117 and we flush the changes to disk only for the tablespace we
1118 are working on. */
1119 if (sync) {
1120 ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE);
1121 if (space->purpose != FIL_TYPE_TEMPORARY) {
1122 fil_flush(space);
1123 }
1124
1125 /* The tablespace could already have been dropped,
1126 because fil_io(request, sync) would already have
1127 decremented the node->n_pending. However,
1128 buf_page_io_complete() only needs to look up the
1129 tablespace during read requests, not during writes. */
1130 ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE);
1131#ifdef UNIV_DEBUG
1132 dberr_t err =
1133#endif
1134 /* true means we want to evict this page from the
1135 LRU list as well. */
1136 buf_page_io_complete(bpage, space->use_doublewrite(), true);
1137
1138 ut_ad(err == DB_SUCCESS);
1139 }
1140
1141 space->release_for_io();
1142
1143 /* Increment the counter of I/O operations used
1144 for selecting LRU policy. */
1145 buf_LRU_stat_inc_io();
1146}
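
/* Illustrative sketch (not compiled in): per the NOTE above, a caller that
posts a batch of asynchronous writes (simulated AIO or doublewrite) must
complete the batch afterwards, roughly as below. */
#if 0
static void buf_flush_post_batch_sketch()
{
	/* ... post asynchronous page writes, e.g. via
	buf_flush_page(buf_pool, bpage, BUF_FLUSH_LIST, false) ... */

	/* Write out the doublewrite buffer (and wake the simulated AIO
	handler threads) so the posted writes reach the data files. */
	buf_dblwr_flush_buffered_writes();
}
#endif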
1147
1148/********************************************************************//**
1149Writes a flushable page asynchronously from the buffer pool to a file.
1150NOTE: in simulated aio we must call
1151os_aio_simulated_wake_handler_threads after we have posted a batch of
1152writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be
1153held upon entering this function, and they will be released by this
1154function if it returns true.
1155@return TRUE if the page was flushed */
1156ibool
1157buf_flush_page(
1158/*===========*/
1159 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1160 buf_page_t* bpage, /*!< in: buffer control block */
1161 buf_flush_t flush_type, /*!< in: type of flush */
1162 bool sync) /*!< in: true if sync IO request */
1163{
1164 BPageMutex* block_mutex;
1165
1166 ut_ad(flush_type < BUF_FLUSH_N_TYPES);
1167 ut_ad(buf_pool_mutex_own(buf_pool));
1168 ut_ad(buf_page_in_file(bpage));
1169 ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE);
1170
1171 block_mutex = buf_page_get_mutex(bpage);
1172 ut_ad(mutex_own(block_mutex));
1173
1174 ut_ad(buf_flush_ready_for_flush(bpage, flush_type));
1175
1176 bool is_uncompressed;
1177
1178 is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE);
1179 ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex));
1180
1181 ibool flush;
1182 rw_lock_t* rw_lock;
1183 bool no_fix_count = bpage->buf_fix_count == 0;
1184
1185 if (!is_uncompressed) {
1186 flush = TRUE;
1187 rw_lock = NULL;
1188 } else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST)
1189 || (!no_fix_count
1190 && srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP
1191 && fsp_is_system_temporary(bpage->id.space()))) {
1192 /* This is a heuristic, to avoid expensive SX attempts. */
		/* For a table residing in the temporary tablespace, sync is
		done using IO_FIX, so before scheduling the flush, ensure
		that the page is not fixed. */
1196 flush = FALSE;
1197 } else {
1198 rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock;
1199 if (flush_type != BUF_FLUSH_LIST) {
1200 flush = rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE);
1201 } else {
1202 /* Will SX lock later */
1203 flush = TRUE;
1204 }
1205 }
1206
1207 if (flush) {
1208
1209 /* We are committed to flushing by the time we get here */
1210
1211 buf_page_set_io_fix(bpage, BUF_IO_WRITE);
1212
1213 buf_page_set_flush_type(bpage, flush_type);
1214
1215 if (buf_pool->n_flush[flush_type] == 0) {
1216 os_event_reset(buf_pool->no_flush[flush_type]);
1217 }
1218
1219 ++buf_pool->n_flush[flush_type];
1220 ut_ad(buf_pool->n_flush[flush_type] != 0);
1221
1222 mutex_exit(block_mutex);
1223
1224 buf_pool_mutex_exit(buf_pool);
1225
1226 if (flush_type == BUF_FLUSH_LIST
1227 && is_uncompressed
1228 && !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) {
1229
1230 if (!fsp_is_system_temporary(bpage->id.space())) {
				/* To avoid a possible deadlock involving the
				doublewrite buffer, flush it, because it
				might hold another block->lock. */
1234 buf_dblwr_flush_buffered_writes();
1235 } else {
1236 buf_dblwr_sync_datafiles();
1237 }
1238
1239 rw_lock_sx_lock_gen(rw_lock, BUF_IO_WRITE);
1240 }
1241
		/* If there is an observer that wants to know when the
		asynchronous flush request was sent, notify it.
		Note: we set the flush observer on a page with the x-latch
		held, so we can guarantee that notify_flush and notify_remove
		are called in pairs with the s-latch held on an uncompressed
		page. */
1247 if (bpage->flush_observer != NULL) {
1248 buf_pool_mutex_enter(buf_pool);
1249
1250 bpage->flush_observer->notify_flush(buf_pool, bpage);
1251
1252 buf_pool_mutex_exit(buf_pool);
1253 }
1254
1255 /* Even though bpage is not protected by any mutex at this
1256 point, it is safe to access bpage, because it is io_fixed and
1257 oldest_modification != 0. Thus, it cannot be relocated in the
1258 buffer pool or removed from flush_list or LRU_list. */
1259
1260 buf_flush_write_block_low(bpage, flush_type, sync);
1261 }
1262
1263 return(flush);
1264}
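
/* Illustrative sketch (not compiled in): the locking protocol a caller of
buf_flush_page() is expected to follow, as described in the NOTE above. */
#if 0
static void buf_flush_page_caller_sketch(
	buf_pool_t*	buf_pool,
	buf_page_t*	bpage)
{
	buf_pool_mutex_enter(buf_pool);
	mutex_enter(buf_page_get_mutex(bpage));

	if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_LRU)
	    && buf_flush_page(buf_pool, bpage, BUF_FLUSH_LRU, false)) {
		/* Both mutexes were released inside buf_flush_page(). */
	} else {
		mutex_exit(buf_page_get_mutex(bpage));
		buf_pool_mutex_exit(buf_pool);
	}
}
#endif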
1265
1266# if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
1267/********************************************************************//**
1268Writes a flushable page asynchronously from the buffer pool to a file.
1269NOTE: buf_pool->mutex and block->mutex must be held upon entering this
1270function, and they will be released by this function after flushing.
1271This is loosely based on buf_flush_batch() and buf_flush_page().
1272@return TRUE if the page was flushed and the mutexes released */
1273ibool
1274buf_flush_page_try(
1275/*===============*/
1276 buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
1277 buf_block_t* block) /*!< in/out: buffer control block */
1278{
1279 ut_ad(buf_pool_mutex_own(buf_pool));
1280 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
1281 ut_ad(buf_page_mutex_own(block));
1282
1283 if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) {
1284 return(FALSE);
1285 }
1286
1287 /* The following call will release the buffer pool and
1288 block mutex. */
1289 return(buf_flush_page(
1290 buf_pool, &block->page,
1291 BUF_FLUSH_SINGLE_PAGE, true));
1292}
1293# endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
1294
/** Check whether the page is in the buffer pool and can be flushed.
1296@param[in] page_id page id
1297@param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST
1298@return true if the page can be flushed. */
1299static
1300bool
1301buf_flush_check_neighbor(
1302 const page_id_t& page_id,
1303 buf_flush_t flush_type)
1304{
1305 buf_page_t* bpage;
1306 buf_pool_t* buf_pool = buf_pool_get(page_id);
1307 bool ret;
1308
1309 ut_ad(flush_type == BUF_FLUSH_LRU
1310 || flush_type == BUF_FLUSH_LIST);
1311
1312 buf_pool_mutex_enter(buf_pool);
1313
1314 /* We only want to flush pages from this buffer pool. */
1315 bpage = buf_page_hash_get(buf_pool, page_id);
1316
1317 if (!bpage) {
1318
1319 buf_pool_mutex_exit(buf_pool);
1320 return(false);
1321 }
1322
1323 ut_a(buf_page_in_file(bpage));
1324
1325 /* We avoid flushing 'non-old' blocks in an LRU flush,
1326 because the flushed blocks are soon freed */
1327
1328 ret = false;
1329 if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) {
1330 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1331
1332 mutex_enter(block_mutex);
1333 if (buf_flush_ready_for_flush(bpage, flush_type)) {
1334 ret = true;
1335 }
1336 mutex_exit(block_mutex);
1337 }
1338 buf_pool_mutex_exit(buf_pool);
1339
1340 return(ret);
1341}
1342
1343/** Flushes to disk all flushable pages within the flush area.
1344@param[in] page_id page id
1345@param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST
1346@param[in] n_flushed number of pages flushed so far in this batch
1347@param[in] n_to_flush maximum number of pages we are allowed to flush
1348@return number of pages flushed */
1349static
1350ulint
1351buf_flush_try_neighbors(
1352 const page_id_t& page_id,
1353 buf_flush_t flush_type,
1354 ulint n_flushed,
1355 ulint n_to_flush)
1356{
1357 ulint i;
1358 ulint low;
1359 ulint high;
1360 ulint count = 0;
1361 buf_pool_t* buf_pool = buf_pool_get(page_id);
1362
1363 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1364
1365 if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN
1366 || srv_flush_neighbors == 0) {
1367 /* If there is little space or neighbor flushing is
1368 not enabled then just flush the victim. */
1369 low = page_id.page_no();
1370 high = page_id.page_no() + 1;
1371 } else {
1372 /* When flushed, dirty blocks are searched in
1373 neighborhoods of this size, and flushed along with the
1374 original page. */
1375
1376 ulint buf_flush_area;
1377
1378 buf_flush_area = ut_min(
1379 BUF_READ_AHEAD_AREA(buf_pool),
1380 buf_pool->curr_size / 16);
1381
1382 low = (page_id.page_no() / buf_flush_area) * buf_flush_area;
1383 high = (page_id.page_no() / buf_flush_area + 1) * buf_flush_area;
1384
1385 if (srv_flush_neighbors == 1) {
1386 /* adjust 'low' and 'high' to limit
1387 for contiguous dirty area */
1388 if (page_id.page_no() > low) {
1389 for (i = page_id.page_no() - 1; i >= low; i--) {
1390 if (!buf_flush_check_neighbor(
1391 page_id_t(page_id.space(), i),
1392 flush_type)) {
1393
1394 break;
1395 }
1396
1397 if (i == low) {
						/* Avoid wrap-around when
						low == 0, which would
						otherwise call
						buf_flush_check_neighbor()
						with i == (ulint) -1 */
1402 i--;
1403 break;
1404 }
1405 }
1406 low = i + 1;
1407 }
1408
1409 for (i = page_id.page_no() + 1;
1410 i < high
1411 && buf_flush_check_neighbor(
1412 page_id_t(page_id.space(), i),
1413 flush_type);
1414 i++) {
1415 /* do nothing */
1416 }
1417 high = i;
1418 }
1419 }
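
	/* Worked example with illustrative numbers: if buf_flush_area is 64,
	then for page_no = 130 the initial window computed above is
	low = (130 / 64) * 64 = 128 and high = (130 / 64 + 1) * 64 = 192,
	before the contiguity scan above (when srv_flush_neighbors == 1) and
	the space-size clamp below narrow it further. */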
1420
1421 const ulint space_size = fil_space_get_size(page_id.space());
1422 if (high > space_size) {
1423 high = space_size;
1424 }
1425
1426 DBUG_PRINT("ib_buf", ("flush %u:%u..%u",
1427 page_id.space(),
1428 (unsigned) low, (unsigned) high));
1429
1430 for (ulint i = low; i < high; i++) {
1431 buf_page_t* bpage;
1432
1433 if ((count + n_flushed) >= n_to_flush) {
1434
1435 /* We have already flushed enough pages and
1436 should call it a day. There is, however, one
1437 exception. If the page whose neighbors we
1438 are flushing has not been flushed yet then
1439 we'll try to flush the victim that we
1440 selected originally. */
1441 if (i <= page_id.page_no()) {
1442 i = page_id.page_no();
1443 } else {
1444 break;
1445 }
1446 }
1447
1448 const page_id_t cur_page_id(page_id.space(), i);
1449
1450 buf_pool = buf_pool_get(cur_page_id);
1451
1452 buf_pool_mutex_enter(buf_pool);
1453
1454 /* We only want to flush pages from this buffer pool. */
1455 bpage = buf_page_hash_get(buf_pool, cur_page_id);
1456
1457 if (bpage == NULL) {
1458
1459 buf_pool_mutex_exit(buf_pool);
1460 continue;
1461 }
1462
1463 ut_a(buf_page_in_file(bpage));
1464
1465 /* We avoid flushing 'non-old' blocks in an LRU flush,
1466 because the flushed blocks are soon freed */
1467
1468 if (flush_type != BUF_FLUSH_LRU
1469 || i == page_id.page_no()
1470 || buf_page_is_old(bpage)) {
1471
1472 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1473
1474 mutex_enter(block_mutex);
1475
1476 if (buf_flush_ready_for_flush(bpage, flush_type)
1477 && (i == page_id.page_no()
1478 || bpage->buf_fix_count == 0)) {
1479
1480 /* We also try to flush those
1481 neighbors != offset */
1482
1483 if (buf_flush_page(
1484 buf_pool, bpage, flush_type, false)) {
1485
1486 ++count;
1487 } else {
1488 mutex_exit(block_mutex);
1489 buf_pool_mutex_exit(buf_pool);
1490 }
1491
1492 continue;
1493 } else {
1494 mutex_exit(block_mutex);
1495 }
1496 }
1497 buf_pool_mutex_exit(buf_pool);
1498 }
1499
1500 if (count > 1) {
1501 MONITOR_INC_VALUE_CUMULATIVE(
1502 MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE,
1503 MONITOR_FLUSH_NEIGHBOR_COUNT,
1504 MONITOR_FLUSH_NEIGHBOR_PAGES,
1505 (count - 1));
1506 }
1507
1508 return(count);
1509}
1510
1511/** Check if the block is modified and ready for flushing.
If the block is ready to flush, then flush the page and try to flush
1513its neighbors.
1514@param[in] bpage buffer control block,
1515must be buf_page_in_file(bpage)
1516@param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST
1517@param[in] n_to_flush number of pages to flush
1518@param[in,out] count number of pages flushed
@return TRUE if the buf_pool mutex was released during this function.
This does not guarantee that any pages were actually written.
The number of pages written is added to the count. */
1522static
1523bool
1524buf_flush_page_and_try_neighbors(
1525 buf_page_t* bpage,
1526 buf_flush_t flush_type,
1527 ulint n_to_flush,
1528 ulint* count)
1529{
1530#ifdef UNIV_DEBUG
1531 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1532
1533 ut_ad(buf_pool_mutex_own(buf_pool));
1534#endif /* UNIV_DEBUG */
1535
1536 bool flushed;
1537 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1538
1539 mutex_enter(block_mutex);
1540
1541 ut_a(buf_page_in_file(bpage));
1542
1543 if (buf_flush_ready_for_flush(bpage, flush_type)) {
1544 buf_pool_t* buf_pool;
1545
1546 buf_pool = buf_pool_from_bpage(bpage);
1547
1548 const page_id_t page_id = bpage->id;
1549
1550 mutex_exit(block_mutex);
1551
1552 buf_pool_mutex_exit(buf_pool);
1553
1554 /* Try to flush also all the neighbors */
1555 *count += buf_flush_try_neighbors(
1556 page_id, flush_type, *count, n_to_flush);
1557
1558 buf_pool_mutex_enter(buf_pool);
		flushed = true;
1560 } else {
1561 mutex_exit(block_mutex);
1562
1563 flushed = false;
1564 }
1565
1566 ut_ad(buf_pool_mutex_own(buf_pool));
1567
1568 return(flushed);
1569}
1570
1571/*******************************************************************//**
1572This utility moves the uncompressed frames of pages to the free list.
1573Note that this function does not actually flush any data to disk. It
1574just detaches the uncompressed frames from the compressed pages at the
1575tail of the unzip_LRU and puts those freed frames in the free list.
1576Note that it is a best effort attempt and it is not guaranteed that
1577after a call to this function there will be 'max' blocks in the free
1578list.
1579@return number of blocks moved to the free list. */
1580static
1581ulint
1582buf_free_from_unzip_LRU_list_batch(
1583/*===============================*/
1584 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1585 ulint max) /*!< in: desired number of
1586 blocks in the free_list */
1587{
1588 ulint scanned = 0;
1589 ulint count = 0;
1590 ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
1591 ulint lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1592
1593 ut_ad(buf_pool_mutex_own(buf_pool));
1594
1595 buf_block_t* block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1596
1597 while (block != NULL
1598 && count < max
1599 && free_len < srv_LRU_scan_depth
1600 && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
1601
1602 ++scanned;
1603 if (buf_LRU_free_page(&block->page, false)) {
1604 /* Block was freed. buf_pool->mutex potentially
1605 released and reacquired */
1606 ++count;
1607 block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
1608
1609 } else {
1610
1611 block = UT_LIST_GET_PREV(unzip_LRU, block);
1612 }
1613
1614 free_len = UT_LIST_GET_LEN(buf_pool->free);
1615 lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
1616 }
1617
1618 ut_ad(buf_pool_mutex_own(buf_pool));
1619
1620 if (scanned) {
1621 MONITOR_INC_VALUE_CUMULATIVE(
1622 MONITOR_LRU_BATCH_SCANNED,
1623 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1624 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1625 scanned);
1626 }
1627
1628 return(count);
1629}
1630
1631/*******************************************************************//**
1632This utility flushes dirty blocks from the end of the LRU list.
1633The calling thread is not allowed to own any latches on pages!
1634It attempts to make 'max' blocks available in the free list. Note that
1635it is a best effort attempt and it is not guaranteed that after a call
1636to this function there will be 'max' blocks in the free list.*/
1637
1638void
1639buf_flush_LRU_list_batch(
1640/*=====================*/
1641 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1642 ulint max, /*!< in: desired number of
1643 blocks in the free_list */
1644 flush_counters_t* n) /*!< out: flushed/evicted page
1645 counts */
1646{
1647 buf_page_t* bpage;
1648 ulint scanned = 0;
1649 ulint free_len = UT_LIST_GET_LEN(buf_pool->free);
1650 ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1651 ulint withdraw_depth = 0;
1652
1653 n->flushed = 0;
1654 n->evicted = 0;
1655 n->unzip_LRU_evicted = 0;
1656 ut_ad(buf_pool_mutex_own(buf_pool));
1657 if (buf_pool->curr_size < buf_pool->old_size
1658 && buf_pool->withdraw_target > 0) {
1659 withdraw_depth = buf_pool->withdraw_target
1660 - UT_LIST_GET_LEN(buf_pool->withdraw);
1661 }
1662
1663 for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1664 bpage != NULL && n->flushed + n->evicted < max
1665 && free_len < srv_LRU_scan_depth + withdraw_depth
1666 && lru_len > BUF_LRU_MIN_LEN;
1667 ++scanned,
1668 bpage = buf_pool->lru_hp.get()) {
1669
1670 buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
1671 buf_pool->lru_hp.set(prev);
1672
1673 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1674
1675 mutex_enter(block_mutex);
1676
1677 if (buf_flush_ready_for_replace(bpage)) {
1678 /* block is ready for eviction i.e., it is
1679 clean and is not IO-fixed or buffer fixed. */
1680 mutex_exit(block_mutex);
1681 if (buf_LRU_free_page(bpage, true)) {
1682 ++n->evicted;
1683 }
1684 } else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_LRU)) {
1685 /* Block is ready for flush. Dispatch an IO
1686 request. The IO helper thread will put it on
1687 free list in IO completion routine. */
1688 mutex_exit(block_mutex);
1689 buf_flush_page_and_try_neighbors(
1690 bpage, BUF_FLUSH_LRU, max, &n->flushed);
1691 } else {
1692 /* Can't evict or dispatch this block. Go to
1693 previous. */
1694 ut_ad(buf_pool->lru_hp.is_hp(prev));
1695 mutex_exit(block_mutex);
1696 }
1697
1698 ut_ad(!mutex_own(block_mutex));
1699 ut_ad(buf_pool_mutex_own(buf_pool));
1700
1701 free_len = UT_LIST_GET_LEN(buf_pool->free);
1702 lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1703 }
1704
1705 buf_pool->lru_hp.set(NULL);
1706
1707 /* We keep track of all flushes happening as part of LRU
1708 flush. When estimating the desired rate at which flush_list
1709 should be flushed, we factor in this value. */
1710 buf_lru_flush_page_count += n->flushed;
1711
1712 ut_ad(buf_pool_mutex_own(buf_pool));
1713
1714 if (n->evicted) {
1715 MONITOR_INC_VALUE_CUMULATIVE(
1716 MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE,
1717 MONITOR_LRU_BATCH_EVICT_COUNT,
1718 MONITOR_LRU_BATCH_EVICT_PAGES,
1719 n->evicted);
1720 }
1721
1722 if (scanned) {
1723 MONITOR_INC_VALUE_CUMULATIVE(
1724 MONITOR_LRU_BATCH_SCANNED,
1725 MONITOR_LRU_BATCH_SCANNED_NUM_CALL,
1726 MONITOR_LRU_BATCH_SCANNED_PER_CALL,
1727 scanned);
1728 }
1729}
1730
1731/*******************************************************************//**
1732Flush and move pages from LRU or unzip_LRU list to the free list.
1733Whether LRU or unzip_LRU is used depends on the state of the system.*/
1734
1735static
1736void
1737buf_do_LRU_batch(
1738/*=============*/
1739 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1740 ulint max, /*!< in: desired number of
1741 blocks in the free_list */
1742 flush_counters_t* n) /*!< out: flushed/evicted page
1743 counts */
1744{
1745 if (buf_LRU_evict_from_unzip_LRU(buf_pool)) {
1746 n->unzip_LRU_evicted = buf_free_from_unzip_LRU_list_batch(buf_pool, max);
1747 } else {
1748 n->unzip_LRU_evicted = 0;
1749 }
1750
1751 if (max > n->unzip_LRU_evicted) {
1752 buf_flush_LRU_list_batch(buf_pool, max - n->unzip_LRU_evicted, n);
1753 } else {
1754 n->evicted = 0;
1755 n->flushed = 0;
1756 }
1757
1758 /* Add evicted pages from unzip_LRU to the evicted pages from
1759 the simple LRU. */
1760 n->evicted += n->unzip_LRU_evicted;
1761}
1762
1763/** This utility flushes dirty blocks from the end of the flush_list.
1764The calling thread is not allowed to own any latches on pages!
1765@param[in] buf_pool buffer pool instance
@param[in]	min_n		wished minimum number of blocks flushed (it is
1767not guaranteed that the actual number is that big, though)
1768@param[in] lsn_limit all blocks whose oldest_modification is smaller
1769than this should be flushed (if their number does not exceed min_n)
1770@return number of blocks for which the write request was queued;
1771ULINT_UNDEFINED if there was a flush of the same type already
1772running */
1773static
1774ulint
1775buf_do_flush_list_batch(
1776 buf_pool_t* buf_pool,
1777 ulint min_n,
1778 lsn_t lsn_limit)
1779{
1780 ulint count = 0;
1781 ulint scanned = 0;
1782
1783 ut_ad(buf_pool_mutex_own(buf_pool));
1784
1785 /* Start from the end of the list looking for a suitable
1786 block to be flushed. */
1787 buf_flush_list_mutex_enter(buf_pool);
1788 ulint len = UT_LIST_GET_LEN(buf_pool->flush_list);
1789
	/* In order to prevent this scan from degenerating into O(n*n)
	we attempt to preserve a pointer to the previous block in the
	flush list by declaring it a hazard pointer. Any thread working
	on the flush list must check the hazard pointer, and if it is
	removing that same block it must reset the hazard pointer. */
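	/* For illustration: suppose this thread saves block B's
	predecessor A as the hazard pointer and then releases the flush
	list mutex while flushing B. If another thread removes A from
	the flush list in the meantime (e.g. in buf_flush_remove()),
	that thread must adjust the hazard pointer, so flush_hp.get()
	below never returns a dangling pointer; the scan simply resumes
	from whatever block the hazard pointer then designates. */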
1795 for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
1796 count < min_n && bpage != NULL && len > 0
1797 && bpage->oldest_modification < lsn_limit;
1798 bpage = buf_pool->flush_hp.get(),
1799 ++scanned) {
1800
1801 buf_page_t* prev;
1802
1803 ut_a(bpage->oldest_modification > 0);
1804 ut_ad(bpage->in_flush_list);
1805
1806 prev = UT_LIST_GET_PREV(list, bpage);
1807 buf_pool->flush_hp.set(prev);
1808 buf_flush_list_mutex_exit(buf_pool);
1809
1810#ifdef UNIV_DEBUG
1811 bool flushed =
1812#endif /* UNIV_DEBUG */
1813 buf_flush_page_and_try_neighbors(
1814 bpage, BUF_FLUSH_LIST, min_n, &count);
1815
1816 buf_flush_list_mutex_enter(buf_pool);
1817
1818 ut_ad(flushed || buf_pool->flush_hp.is_hp(prev));
1819
1820 --len;
1821 }
1822
1823 buf_pool->flush_hp.set(NULL);
1824 buf_flush_list_mutex_exit(buf_pool);
1825
1826 if (scanned) {
1827 MONITOR_INC_VALUE_CUMULATIVE(
1828 MONITOR_FLUSH_BATCH_SCANNED,
1829 MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL,
1830 MONITOR_FLUSH_BATCH_SCANNED_PER_CALL,
1831 scanned);
1832 }
1833
1834 if (count) {
1835 MONITOR_INC_VALUE_CUMULATIVE(
1836 MONITOR_FLUSH_BATCH_TOTAL_PAGE,
1837 MONITOR_FLUSH_BATCH_COUNT,
1838 MONITOR_FLUSH_BATCH_PAGES,
1839 count);
1840 }
1841
1842 ut_ad(buf_pool_mutex_own(buf_pool));
1843
1844 return(count);
1845}
1846
1847/** This utility flushes dirty blocks from the end of the LRU list or
1848flush_list.
1849NOTE 1: in the case of an LRU flush the calling thread may own latches to
1850pages: to avoid deadlocks, this function must be written so that it cannot
1851end up waiting for these latches! NOTE 2: in the case of a flush list flush,
1852the calling thread is not allowed to own any latches on pages!
1853@param[in] buf_pool buffer pool instance
1854@param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST; if
1855BUF_FLUSH_LIST, then the caller must not own any latches on pages
@param[in]	min_n		wished minimum number of blocks flushed (it is
1857not guaranteed that the actual number is that big, though)
1858@param[in] lsn_limit in the case of BUF_FLUSH_LIST all blocks whose
1859oldest_modification is smaller than this should be flushed (if their number
1860does not exceed min_n), otherwise ignored */
1861static
1862void
1863buf_flush_batch(
1864 buf_pool_t* buf_pool,
1865 buf_flush_t flush_type,
1866 ulint min_n,
1867 lsn_t lsn_limit,
1868 flush_counters_t* n) /*!< out: flushed/evicted page
1869 counts */
1870{
1871 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1872 ut_ad(flush_type == BUF_FLUSH_LRU
1873 || !sync_check_iterate(dict_sync_check()));
1874
1875 buf_pool_mutex_enter(buf_pool);
1876
1877 /* Note: The buffer pool mutex is released and reacquired within
1878 the flush functions. */
1879 switch (flush_type) {
1880 case BUF_FLUSH_LRU:
1881 buf_do_LRU_batch(buf_pool, min_n, n);
1882 break;
1883 case BUF_FLUSH_LIST:
1884 n->flushed = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit);
1885 n->evicted = 0;
1886 break;
1887 default:
1888 ut_error;
1889 }
1890
1891 buf_pool_mutex_exit(buf_pool);
1892
1893 DBUG_LOG("ib_buf", "flush " << flush_type << " completed");
1894}
1895
1896/******************************************************************//**
1897Gather the aggregated stats for both flush list and LRU list flushing.
1898@param page_count_flush number of pages flushed from the end of the flush_list
1899@param page_count_LRU number of pages flushed from the end of the LRU list
1900*/
1901static
1902void
1903buf_flush_stats(
1904/*============*/
1905 ulint page_count_flush,
1906 ulint page_count_LRU)
1907{
1908 DBUG_PRINT("ib_buf", ("flush completed, from flush_list %u pages, "
1909 "from LRU_list %u pages",
1910 unsigned(page_count_flush),
1911 unsigned(page_count_LRU)));
1912
1913 srv_stats.buf_pool_flushed.add(page_count_flush + page_count_LRU);
1914}
1915
1916/******************************************************************//**
1917Start a buffer flush batch for LRU or flush list */
1918static
1919ibool
1920buf_flush_start(
1921/*============*/
1922 buf_pool_t* buf_pool, /*!< buffer pool instance */
1923 buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
1924 or BUF_FLUSH_LIST */
1925{
1926 ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST);
1927
1928 buf_pool_mutex_enter(buf_pool);
1929
1930 if (buf_pool->n_flush[flush_type] > 0
1931 || buf_pool->init_flush[flush_type] == TRUE) {
1932
1933 /* There is already a flush batch of the same type running */
1934
1935 buf_pool_mutex_exit(buf_pool);
1936
1937 return(FALSE);
1938 }
1939
1940 buf_pool->init_flush[flush_type] = TRUE;
1941
1942 os_event_reset(buf_pool->no_flush[flush_type]);
1943
1944 buf_pool_mutex_exit(buf_pool);
1945
1946 return(TRUE);
1947}
1948
1949/******************************************************************//**
1950End a buffer flush batch for LRU or flush list */
1951static
1952void
1953buf_flush_end(
1954/*==========*/
1955 buf_pool_t* buf_pool, /*!< buffer pool instance */
1956 buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU
1957 or BUF_FLUSH_LIST */
1958{
1959 buf_pool_mutex_enter(buf_pool);
1960
1961 buf_pool->init_flush[flush_type] = FALSE;
1962
1963 buf_pool->try_LRU_scan = TRUE;
1964
1965 if (buf_pool->n_flush[flush_type] == 0) {
1966
1967 /* The running flush batch has ended */
1968
1969 os_event_set(buf_pool->no_flush[flush_type]);
1970 }
1971
1972 buf_pool_mutex_exit(buf_pool);
1973
1974 if (!srv_read_only_mode) {
1975 buf_dblwr_flush_buffered_writes();
1976 } else {
1977 os_aio_simulated_wake_handler_threads();
1978 }
1979}
1980
1981/******************************************************************//**
1982Waits until a flush batch of the given type ends */
1983void
1984buf_flush_wait_batch_end(
1985/*=====================*/
1986 buf_pool_t* buf_pool, /*!< buffer pool instance */
1987 buf_flush_t type) /*!< in: BUF_FLUSH_LRU
1988 or BUF_FLUSH_LIST */
1989{
1990 ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
1991
1992 if (buf_pool == NULL) {
1993 ulint i;
1994
1995 for (i = 0; i < srv_buf_pool_instances; ++i) {
1996 buf_pool_t* buf_pool;
1997
1998 buf_pool = buf_pool_from_array(i);
1999
2000 thd_wait_begin(NULL, THD_WAIT_DISKIO);
2001 os_event_wait(buf_pool->no_flush[type]);
2002 thd_wait_end(NULL);
2003 }
2004 } else {
2005 thd_wait_begin(NULL, THD_WAIT_DISKIO);
2006 os_event_wait(buf_pool->no_flush[type]);
2007 thd_wait_end(NULL);
2008 }
2009}
2010
2011/** Do flushing batch of a given type.
2012NOTE: The calling thread is not allowed to own any latches on pages!
2013@param[in,out] buf_pool buffer pool instance
2014@param[in] type flush type
@param[in]	min_n		wished minimum number of blocks flushed
2016(it is not guaranteed that the actual number is that big, though)
2017@param[in] lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2018oldest_modification is smaller than this should be flushed (if their number
2019does not exceed min_n), otherwise ignored
@param[out]	n		counters with the number of pages flushed and
evicted, passed back to the caller. Ignored if NULL
2022@retval true if a batch was queued successfully.
2023@retval false if another batch of same type was already running. */
2024bool
2025buf_flush_do_batch(
2026 buf_pool_t* buf_pool,
2027 buf_flush_t type,
2028 ulint min_n,
2029 lsn_t lsn_limit,
2030 flush_counters_t* n)
2031{
2032 ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST);
2033
2034 if (n != NULL) {
2035 n->flushed = 0;
2036 }
2037
2038 if (!buf_flush_start(buf_pool, type)) {
2039 return(false);
2040 }
2041
2042 buf_flush_batch(buf_pool, type, min_n, lsn_limit, n);
2043
2044 buf_flush_end(buf_pool, type);
2045
2046 return(true);
2047}
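
/* A minimal usage sketch of buf_flush_do_batch() (illustrative values;
the real callers, such as buf_flush_lists() and buf_flush_LRU_list()
below, follow this pattern):

	flush_counters_t	n;
	memset(&n, 0, sizeof n);
	if (buf_flush_do_batch(buf_pool_from_array(0), BUF_FLUSH_LIST,
			       100, LSN_MAX, &n)) {
		// a batch of about 100 flush_list pages was queued on
		// instance 0; n.flushed holds the number dispatched
	} else {
		// a flush_list batch was already running on instance 0
	}
*/
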
2048/**
Waits until all dirty pages with oldest_modification smaller than the
given LSN have been flushed (system temporary pages are ignored)
2050@param[in] new_oldest target oldest_modified_lsn to wait for */
2051
2052void
2053buf_flush_wait_flushed(
2054 lsn_t new_oldest)
2055{
2056 for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
2057 buf_pool_t* buf_pool;
2058 lsn_t oldest;
2059
2060 buf_pool = buf_pool_from_array(i);
2061
2062 for (;;) {
			/* We do not need to wait for the fsync of the
			flushed blocks, because the checkpoint will issue
			its own fsync anyway. So, we do not need to wait
			for the batch end here. */
2066
2067 buf_flush_list_mutex_enter(buf_pool);
2068
2069 buf_page_t* bpage;
2070
2071 /* We don't need to wait for system temporary pages */
2072 for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
2073 bpage != NULL
2074 && fsp_is_system_temporary(bpage->id.space());
2075 bpage = UT_LIST_GET_PREV(list, bpage)) {
2076 /* Do nothing. */
2077 }
2078
2079 if (bpage != NULL) {
2080 ut_ad(bpage->in_flush_list);
2081 oldest = bpage->oldest_modification;
2082 } else {
2083 oldest = 0;
2084 }
2085
2086 buf_flush_list_mutex_exit(buf_pool);
2087
2088 if (oldest == 0 || oldest >= new_oldest) {
2089 break;
2090 }
2091
2092 /* sleep and retry */
2093 os_thread_sleep(buf_flush_wait_flushed_sleep_time);
2094
2095 MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS);
2096 }
2097 }
2098}
2099
2100/** This utility flushes dirty blocks from the end of the flush list of all
2101buffer pool instances.
2102NOTE: The calling thread is not allowed to own any latches on pages!
@param[in]	min_n		wished minimum number of blocks flushed (it is
2104not guaranteed that the actual number is that big, though)
2105@param[in] lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2106oldest_modification is smaller than this should be flushed (if their number
2107does not exceed min_n), otherwise ignored
2108@param[out] n_processed the number of pages which were processed is
2109passed back to caller. Ignored if NULL.
2110@return true if a batch was queued successfully for each buffer pool
instance. false if another batch of the same type was already running in
at least one of the buffer pool instances */
2113bool
2114buf_flush_lists(
2115 ulint min_n,
2116 lsn_t lsn_limit,
2117 ulint* n_processed)
2118{
2119 ulint i;
2120 ulint n_flushed = 0;
2121 bool success = true;
2122
2123 if (n_processed) {
2124 *n_processed = 0;
2125 }
2126
2127 if (min_n != ULINT_MAX) {
2128 /* Ensure that flushing is spread evenly amongst the
2129 buffer pool instances. When min_n is ULINT_MAX
2130 we need to flush everything up to the lsn limit
2131 so no limit here. */
2132 min_n = (min_n + srv_buf_pool_instances - 1)
2133 / srv_buf_pool_instances;
2134 }
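	/* For example, with min_n = 1000 and 8 buffer pool instances the
	rounding-up division above requests (1000 + 7) / 8 = 125 pages
	from every instance, so at least min_n pages are requested in
	total across all instances. */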
2135
2136 /* Flush to lsn_limit in all buffer pool instances */
2137 for (i = 0; i < srv_buf_pool_instances; i++) {
2138 buf_pool_t* buf_pool;
2139 flush_counters_t n;
2140
2141 memset(&n, 0, sizeof(flush_counters_t));
2142 buf_pool = buf_pool_from_array(i);
2143
2144 if (!buf_flush_do_batch(buf_pool,
2145 BUF_FLUSH_LIST,
2146 min_n,
2147 lsn_limit,
2148 &n)) {
2149 /* We have two choices here. If lsn_limit was
2150 specified then skipping an instance of buffer
2151 pool means we cannot guarantee that all pages
			up to lsn_limit have been flushed. We can
2153 return right now with failure or we can try
2154 to flush remaining buffer pools up to the
2155 lsn_limit. We attempt to flush other buffer
2156 pools based on the assumption that it will
2157 help in the retry which will follow the
2158 failure. */
2159 success = false;
2160
2161 }
2162
2163 n_flushed += n.flushed;
2164 }
2165
2166 if (n_flushed) {
2167 buf_flush_stats(n_flushed, 0);
2168 if (n_processed) {
2169 *n_processed = n_flushed;
2170 }
2171 }
2172
2173 return(success);
2174}
2175
2176/******************************************************************//**
2177This function picks up a single page from the tail of the LRU
2178list, flushes it (if it is dirty), removes it from page_hash and LRU
2179list and puts it on the free list. It is called from user threads when
2180they are unable to find a replaceable page at the tail of the LRU
2181list i.e.: when the background LRU flushing in the page_cleaner thread
2182is not fast enough to keep pace with the workload.
2183@return true if success. */
2184bool
2185buf_flush_single_page_from_LRU(
2186/*===========================*/
2187 buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */
2188{
2189 ulint scanned;
2190 buf_page_t* bpage;
2191 ibool freed;
2192
2193 buf_pool_mutex_enter(buf_pool);
2194
2195 for (bpage = buf_pool->single_scan_itr.start(), scanned = 0,
2196 freed = false;
2197 bpage != NULL;
2198 ++scanned, bpage = buf_pool->single_scan_itr.get()) {
2199
2200 ut_ad(buf_pool_mutex_own(buf_pool));
2201
2202 buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
2203 buf_pool->single_scan_itr.set(prev);
2204 BPageMutex* block_mutex;
2205
2206 block_mutex = buf_page_get_mutex(bpage);
2207
2208 mutex_enter(block_mutex);
2209
2210 if (buf_flush_ready_for_replace(bpage)) {
2211 /* block is ready for eviction i.e., it is
2212 clean and is not IO-fixed or buffer fixed. */
2213 mutex_exit(block_mutex);
2214
2215 if (buf_LRU_free_page(bpage, true)) {
2216 buf_pool_mutex_exit(buf_pool);
2217 freed = true;
2218 break;
2219 }
2220
2221 } else if (buf_flush_ready_for_flush(
2222 bpage, BUF_FLUSH_SINGLE_PAGE)) {
2223
2224 /* Block is ready for flush. Try and dispatch an IO
2225 request. We'll put it on free list in IO completion
2226 routine if it is not buffer fixed. The following call
2227 will release the buffer pool and block mutex.
2228
2229 Note: There is no guarantee that this page has actually
2230 been freed, only that it has been flushed to disk */
2231
2232 freed = buf_flush_page(
2233 buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true);
2234
2235 if (freed) {
2236 break;
2237 }
2238
2239 mutex_exit(block_mutex);
2240 } else {
2241 mutex_exit(block_mutex);
2242 }
2243 ut_ad(!mutex_own(block_mutex));
2244 }
2245 if (!freed) {
2246 /* Can't find a single flushable page. */
2247 ut_ad(!bpage);
2248 buf_pool_mutex_exit(buf_pool);
2249 }
2250
2251 if (scanned) {
2252 MONITOR_INC_VALUE_CUMULATIVE(
2253 MONITOR_LRU_SINGLE_FLUSH_SCANNED,
2254 MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL,
2255 MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL,
2256 scanned);
2257 }
2258
2259 ut_ad(!buf_pool_mutex_own(buf_pool));
2260 return(freed);
2261}
2262
2263/**
2264Clears up tail of the LRU list of a given buffer pool instance:
2265* Put replaceable pages at the tail of LRU to the free list
2266* Flush dirty pages at the tail of LRU to the disk
2267The depth to which we scan each buffer pool is controlled by dynamic
2268config parameter innodb_LRU_scan_depth.
2269@param buf_pool buffer pool instance
2270@return total pages flushed */
2271static
2272ulint
2273buf_flush_LRU_list(
2274 buf_pool_t* buf_pool)
2275{
2276 ulint scan_depth, withdraw_depth;
2277 flush_counters_t n;
2278
2279 memset(&n, 0, sizeof(flush_counters_t));
2280
2281 ut_ad(buf_pool);
	/* srv_LRU_scan_depth can be an arbitrarily large value.
	We cap it at the current LRU size. */
2284 buf_pool_mutex_enter(buf_pool);
2285 scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
2286 if (buf_pool->curr_size < buf_pool->old_size
2287 && buf_pool->withdraw_target > 0) {
2288 withdraw_depth = buf_pool->withdraw_target
2289 - UT_LIST_GET_LEN(buf_pool->withdraw);
2290 } else {
2291 withdraw_depth = 0;
2292 }
2293 buf_pool_mutex_exit(buf_pool);
2294 if (withdraw_depth > srv_LRU_scan_depth) {
2295 scan_depth = ut_min(withdraw_depth, scan_depth);
2296 } else {
2297 scan_depth = ut_min(static_cast<ulint>(srv_LRU_scan_depth),
2298 scan_depth);
2299 }
	/* Currently, only one page_cleaner thread can trigger an
	LRU flush for a buffer pool instance at a time. So it is not
	possible that a batch triggered during the last iteration is
	still running. */
2304 buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU, scan_depth,
2305 0, &n);
2306
2307 return(n.flushed);
2308}
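
/* For example, with innodb_lru_scan_depth = 1024 and no buffer pool
resizing in progress, each call scans at most 1024 blocks from the tail
of the LRU list of this instance. While the pool is being shrunk, the
scan depth may instead grow up to the number of pages that still have
to be withdrawn (capped by the LRU length). */
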
2309
2310/*********************************************************************//**
2311Wait for any possible LRU flushes that are in progress to end. */
2312void
2313buf_flush_wait_LRU_batch_end(void)
2314/*==============================*/
2315{
2316 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2317 buf_pool_t* buf_pool;
2318
2319 buf_pool = buf_pool_from_array(i);
2320
2321 buf_pool_mutex_enter(buf_pool);
2322
2323 if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0
2324 || buf_pool->init_flush[BUF_FLUSH_LRU]) {
2325
2326 buf_pool_mutex_exit(buf_pool);
2327 buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);
2328 } else {
2329 buf_pool_mutex_exit(buf_pool);
2330 }
2331 }
2332}
2333
2334/*********************************************************************//**
Calculates whether flushing is required based on the number of dirty pages
in the buffer pool.
@return percent of io_capacity to flush in order to manage the dirty page ratio */
2338static
2339ulint
2340af_get_pct_for_dirty()
2341/*==================*/
2342{
2343 double dirty_pct = buf_get_modified_ratio_pct();
2344
2345 if (dirty_pct == 0.0) {
2346 /* No pages modified */
2347 return(0);
2348 }
2349
2350 ut_a(srv_max_dirty_pages_pct_lwm
2351 <= srv_max_buf_pool_modified_pct);
2352
2353 if (srv_max_dirty_pages_pct_lwm == 0) {
2354 /* The user has not set the option to preflush dirty
2355 pages as we approach the high water mark. */
2356 if (dirty_pct >= srv_max_buf_pool_modified_pct) {
			/* We have crossed the high water mark of dirty
			pages. In this case we start flushing at 100% of
			innodb_io_capacity. */
2360 return(100);
2361 }
2362 } else if (dirty_pct >= srv_max_dirty_pages_pct_lwm) {
2363 /* We should start flushing pages gradually. */
2364 return(static_cast<ulint>((dirty_pct * 100)
2365 / (srv_max_buf_pool_modified_pct + 1)));
2366 }
2367
2368 return(0);
2369}
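
/* A worked example for af_get_pct_for_dirty(), with illustrative
settings: innodb_max_dirty_pages_pct = 75, innodb_max_dirty_pages_pct_lwm
= 10 and a measured dirty page ratio of 30%. The low water mark has been
crossed, so the function returns (30 * 100) / (75 + 1) = 39, i.e. flushing
is recommended at roughly 39% of innodb_io_capacity. */
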
2370
2371/*********************************************************************//**
Calculates whether flushing is required based on the redo generation rate.
@return percent of io_capacity to flush in order to manage redo log space */
2374static
2375ulint
2376af_get_pct_for_lsn(
2377/*===============*/
2378 lsn_t age) /*!< in: current age of LSN. */
2379{
2380 lsn_t max_async_age;
2381 lsn_t lsn_age_factor;
2382 lsn_t af_lwm = (lsn_t) ((srv_adaptive_flushing_lwm
2383 * log_get_capacity()) / 100);
2384
2385 if (age < af_lwm) {
2386 /* No adaptive flushing. */
2387 return(0);
2388 }
2389
2390 max_async_age = log_get_max_modified_age_async();
2391
2392 if (age < max_async_age && !srv_adaptive_flushing) {
2393 /* We have still not reached the max_async point and
2394 the user has disabled adaptive flushing. */
2395 return(0);
2396 }
2397
2398 /* If we are here then we know that either:
2399 1) User has enabled adaptive flushing
2400 2) User may have disabled adaptive flushing but we have reached
2401 max_async_age. */
2402 lsn_age_factor = (age * 100) / max_async_age;
2403
2404 ut_ad(srv_max_io_capacity >= srv_io_capacity);
2405 return(static_cast<ulint>(
2406 ((srv_max_io_capacity / srv_io_capacity)
2407 * (lsn_age_factor * sqrt((double)lsn_age_factor)))
2408 / 7.5));
2409}
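
/* A worked example for af_get_pct_for_lsn(), with illustrative settings:
if the current age is one quarter of log_get_max_modified_age_async(),
then lsn_age_factor = 25. With innodb_io_capacity_max = 400 and
innodb_io_capacity = 200 the integer ratio is 2, so the function returns
(2 * 25 * sqrt(25)) / 7.5 = 250 / 7.5 = 33, i.e. about 33% of
innodb_io_capacity. The caller then takes the maximum of this value and
af_get_pct_for_dirty(). */
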
2410
2411/*********************************************************************//**
2412This function is called approximately once every second by the
2413page_cleaner thread. Based on various factors it decides if there is a
2414need to do flushing.
2415@return number of pages recommended to be flushed
2416@param lsn_limit pointer to return LSN up to which flushing must happen
2417@param last_pages_in the number of pages flushed by the last flush_list
2418 flushing. */
2419static
2420ulint
2421page_cleaner_flush_pages_recommendation(
2422/*====================================*/
2423 lsn_t* lsn_limit,
2424 ulint last_pages_in)
2425{
2426 static lsn_t prev_lsn = 0;
2427 static ulint sum_pages = 0;
2428 static ulint avg_page_rate = 0;
2429 static ulint n_iterations = 0;
2430 static time_t prev_time;
2431 lsn_t oldest_lsn;
2432 lsn_t cur_lsn;
2433 lsn_t age;
2434 lsn_t lsn_rate;
2435 ulint n_pages = 0;
2436 ulint pct_for_dirty = 0;
2437 ulint pct_for_lsn = 0;
2438 ulint pct_total = 0;
2439
2440 cur_lsn = log_get_lsn_nowait();
2441
	/* log_get_lsn_nowait tries to acquire log_sys.mutex with
	mutex_enter_nowait; if this does not succeed, it returns 0.
	Do not use that value to update the statistics. */
2445 if (cur_lsn == 0) {
2446 return(0);
2447 }
2448
2449 if (prev_lsn == 0) {
2450 /* First time around. */
2451 prev_lsn = cur_lsn;
2452 prev_time = ut_time();
2453 return(0);
2454 }
2455
2456 if (prev_lsn == cur_lsn) {
2457 return(0);
2458 }
2459
2460 sum_pages += last_pages_in;
2461
2462 time_t curr_time = ut_time();
2463 double time_elapsed = difftime(curr_time, prev_time);
2464
2465 /* We update our variables every srv_flushing_avg_loops
2466 iterations to smooth out transition in workload. */
2467 if (++n_iterations >= srv_flushing_avg_loops
2468 || time_elapsed >= srv_flushing_avg_loops) {
2469
2470 if (time_elapsed < 1) {
2471 time_elapsed = 1;
2472 }
2473
2474 avg_page_rate = static_cast<ulint>(
2475 ((static_cast<double>(sum_pages)
2476 / time_elapsed)
2477 + avg_page_rate) / 2);
2478
2479 /* How much LSN we have generated since last call. */
2480 lsn_rate = static_cast<lsn_t>(
2481 static_cast<double>(cur_lsn - prev_lsn)
2482 / time_elapsed);
2483
2484 lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2;
2485
2486 /* aggregate stats of all slots */
2487 mutex_enter(&page_cleaner.mutex);
2488
2489 ulint flush_tm = page_cleaner.flush_time;
2490 ulint flush_pass = page_cleaner.flush_pass;
2491
2492 page_cleaner.flush_time = 0;
2493 page_cleaner.flush_pass = 0;
2494
2495 ulint lru_tm = 0;
2496 ulint list_tm = 0;
2497 ulint lru_pass = 0;
2498 ulint list_pass = 0;
2499
2500 for (ulint i = 0; i < page_cleaner.n_slots; i++) {
2501 page_cleaner_slot_t* slot;
2502
2503 slot = &page_cleaner.slots[i];
2504
2505 lru_tm += slot->flush_lru_time;
2506 lru_pass += slot->flush_lru_pass;
2507 list_tm += slot->flush_list_time;
2508 list_pass += slot->flush_list_pass;
2509
2510 slot->flush_lru_time = 0;
2511 slot->flush_lru_pass = 0;
2512 slot->flush_list_time = 0;
2513 slot->flush_list_pass = 0;
2514 }
2515
2516 mutex_exit(&page_cleaner.mutex);
2517
2518 /* minimum values are 1, to avoid dividing by zero. */
2519 if (lru_tm < 1) {
2520 lru_tm = 1;
2521 }
2522 if (list_tm < 1) {
2523 list_tm = 1;
2524 }
2525 if (flush_tm < 1) {
2526 flush_tm = 1;
2527 }
2528
2529 if (lru_pass < 1) {
2530 lru_pass = 1;
2531 }
2532 if (list_pass < 1) {
2533 list_pass = 1;
2534 }
2535 if (flush_pass < 1) {
2536 flush_pass = 1;
2537 }
2538
2539 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT,
2540 list_tm / list_pass);
2541 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_SLOT,
2542 lru_tm / lru_pass);
2543
2544 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD,
2545 list_tm / (srv_n_page_cleaners * flush_pass));
2546 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_THREAD,
2547 lru_tm / (srv_n_page_cleaners * flush_pass));
2548 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST,
2549 flush_tm * list_tm / flush_pass
2550 / (list_tm + lru_tm));
2551 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_EST,
2552 flush_tm * lru_tm / flush_pass
2553 / (list_tm + lru_tm));
2554 MONITOR_SET(MONITOR_FLUSH_AVG_TIME, flush_tm / flush_pass);
2555
2556 MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS,
2557 list_pass / page_cleaner.n_slots);
2558 MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_PASS,
2559 lru_pass / page_cleaner.n_slots);
2560 MONITOR_SET(MONITOR_FLUSH_AVG_PASS, flush_pass);
2561
2562 prev_lsn = cur_lsn;
2563 prev_time = curr_time;
2564
2565 n_iterations = 0;
2566
2567 sum_pages = 0;
2568 }
2569
2570 oldest_lsn = buf_pool_get_oldest_modification();
2571
2572 ut_ad(oldest_lsn <= log_get_lsn());
2573
2574 age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0;
2575
2576 pct_for_dirty = af_get_pct_for_dirty();
2577 pct_for_lsn = af_get_pct_for_lsn(age);
2578
2579 pct_total = ut_max(pct_for_dirty, pct_for_lsn);
2580
2581 /* Estimate pages to be flushed for the lsn progress */
2582 ulint sum_pages_for_lsn = 0;
2583 lsn_t target_lsn = oldest_lsn
2584 + lsn_avg_rate * buf_flush_lsn_scan_factor;
2585
2586 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2587 buf_pool_t* buf_pool = buf_pool_from_array(i);
2588 ulint pages_for_lsn = 0;
2589
2590 buf_flush_list_mutex_enter(buf_pool);
2591 for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool->flush_list);
2592 b != NULL;
2593 b = UT_LIST_GET_PREV(list, b)) {
2594 if (b->oldest_modification > target_lsn) {
2595 break;
2596 }
2597 ++pages_for_lsn;
2598 }
2599 buf_flush_list_mutex_exit(buf_pool);
2600
2601 sum_pages_for_lsn += pages_for_lsn;
2602
2603 mutex_enter(&page_cleaner.mutex);
2604 ut_ad(page_cleaner.slots[i].state
2605 == PAGE_CLEANER_STATE_NONE);
2606 page_cleaner.slots[i].n_pages_requested
2607 = pages_for_lsn / buf_flush_lsn_scan_factor + 1;
2608 mutex_exit(&page_cleaner.mutex);
2609 }
2610
2611 sum_pages_for_lsn /= buf_flush_lsn_scan_factor;
	if (sum_pages_for_lsn < 1) {
2613 sum_pages_for_lsn = 1;
2614 }
2615
	/* Cap the maximum IO capacity that we are going to use at
	max_io_capacity. Limit the value to avoid a too rapid increase. */
2618 ulint pages_for_lsn =
2619 std::min<ulint>(sum_pages_for_lsn, srv_max_io_capacity * 2);
2620
2621 n_pages = (PCT_IO(pct_total) + avg_page_rate + pages_for_lsn) / 3;
2622
2623 if (n_pages > srv_max_io_capacity) {
2624 n_pages = srv_max_io_capacity;
2625 }
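	/* For example (illustrative values): with pct_total = 50 and
	innodb_io_capacity = 200, PCT_IO(pct_total) = 100; together with
	avg_page_rate = 80 and pages_for_lsn = 120 the recommendation is
	(100 + 80 + 120) / 3 = 100 pages, which the check above would cap
	at innodb_io_capacity_max if it were larger. */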
2626
2627 /* Normalize request for each instance */
2628 mutex_enter(&page_cleaner.mutex);
2629 ut_ad(page_cleaner.n_slots_requested == 0);
2630 ut_ad(page_cleaner.n_slots_flushing == 0);
2631 ut_ad(page_cleaner.n_slots_finished == 0);
2632
2633 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		/* If the redo log has enough free space, do not take
		the age distribution of the pages into account. */
2636 page_cleaner.slots[i].n_pages_requested = pct_for_lsn > 30 ?
2637 page_cleaner.slots[i].n_pages_requested
2638 * n_pages / sum_pages_for_lsn + 1
2639 : n_pages / srv_buf_pool_instances;
2640 }
2641 mutex_exit(&page_cleaner.mutex);
2642
2643 MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages);
2644
2645 MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, sum_pages_for_lsn);
2646
2647 MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate);
2648 MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate);
2649 MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty);
2650 MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn);
2651
2652 *lsn_limit = LSN_MAX;
2653
2654 return(n_pages);
2655}
2656
2657/*********************************************************************//**
2658Puts the page_cleaner thread to sleep if it has finished work in less
2659than a second
@retval 0 if woken up by the event being set
@retval OS_SYNC_TIME_EXCEEDED if the timeout was exceeded
2662@param next_loop_time time when next loop iteration should start
2663@param sig_count zero or the value returned by previous call of
2664 os_event_reset()
2665@param cur_time current time as in ut_time_ms() */
2666static
2667ulint
2668pc_sleep_if_needed(
2669/*===============*/
2670 ulint next_loop_time,
2671 int64_t sig_count,
2672 ulint cur_time)
2673{
2674 /* No sleep if we are cleaning the buffer pool during the shutdown
2675 with everything else finished */
2676 if (srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)
2677 return OS_SYNC_TIME_EXCEEDED;
2678
2679 if (next_loop_time > cur_time) {
		/* Get the sleep interval in microseconds. We use
		ut_min() to avoid a long sleep in case of wraparound. */
2682 ulint sleep_us;
2683
2684 sleep_us = ut_min(static_cast<ulint>(1000000),
2685 (next_loop_time - cur_time) * 1000);
2686
2687 return(os_event_wait_time_low(buf_flush_event,
2688 sleep_us, sig_count));
2689 }
2690
2691 return(OS_SYNC_TIME_EXCEEDED);
2692}
2693
2694/******************************************************************//**
2695Initialize page_cleaner. */
2696void
2697buf_flush_page_cleaner_init(void)
2698/*=============================*/
2699{
2700 ut_ad(!page_cleaner.is_running);
2701
2702 mutex_create(LATCH_ID_PAGE_CLEANER, &page_cleaner.mutex);
2703
2704 page_cleaner.is_requested = os_event_create("pc_is_requested");
2705 page_cleaner.is_finished = os_event_create("pc_is_finished");
2706 page_cleaner.is_started = os_event_create("pc_is_started");
2707 page_cleaner.n_slots = static_cast<ulint>(srv_buf_pool_instances);
2708
2709 ut_d(page_cleaner.n_disabled_debug = 0);
2710
2711 page_cleaner.is_running = true;
2712}
2713
2714/**
Requests that all slots flush their buffer pool instances.
@param min_n	wished minimum number of blocks flushed
		(it is not guaranteed that the actual number is that big)
2718@param lsn_limit in the case BUF_FLUSH_LIST all blocks whose
2719 oldest_modification is smaller than this should be flushed
2720 (if their number does not exceed min_n), otherwise ignored
2721*/
2722static
2723void
2724pc_request(
2725 ulint min_n,
2726 lsn_t lsn_limit)
2727{
2728 if (min_n != ULINT_MAX) {
2729 /* Ensure that flushing is spread evenly amongst the
2730 buffer pool instances. When min_n is ULINT_MAX
2731 we need to flush everything up to the lsn limit
2732 so no limit here. */
2733 min_n = (min_n + srv_buf_pool_instances - 1)
2734 / srv_buf_pool_instances;
2735 }
2736
2737 mutex_enter(&page_cleaner.mutex);
2738
2739 ut_ad(page_cleaner.n_slots_requested == 0);
2740 ut_ad(page_cleaner.n_slots_flushing == 0);
2741 ut_ad(page_cleaner.n_slots_finished == 0);
2742
2743 page_cleaner.requested = (min_n > 0);
2744 page_cleaner.lsn_limit = lsn_limit;
2745
2746 for (ulint i = 0; i < page_cleaner.n_slots; i++) {
2747 page_cleaner_slot_t* slot = &page_cleaner.slots[i];
2748
2749 ut_ad(slot->state == PAGE_CLEANER_STATE_NONE);
2750
2751 if (min_n == ULINT_MAX) {
2752 slot->n_pages_requested = ULINT_MAX;
2753 } else if (min_n == 0) {
2754 slot->n_pages_requested = 0;
2755 }
2756
2757 /* slot->n_pages_requested was already set by
2758 page_cleaner_flush_pages_recommendation() */
2759
2760 slot->state = PAGE_CLEANER_STATE_REQUESTED;
2761 }
2762
2763 page_cleaner.n_slots_requested = page_cleaner.n_slots;
2764 page_cleaner.n_slots_flushing = 0;
2765 page_cleaner.n_slots_finished = 0;
2766
2767 os_event_set(page_cleaner.is_requested);
2768
2769 mutex_exit(&page_cleaner.mutex);
2770}
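
/* The request/flush/wait protocol between the coordinator and the worker
threads is, in outline (a sketch of the call sequence only):

	pc_request(n, lsn);		// mark every slot as REQUESTED
	while (pc_flush_slot() > 0) {}	// each call claims one REQUESTED
					// slot and flushes it
	pc_wait_finished(&lru, &list);	// wait until every slot is
					// FINISHED and collect counters

The coordinator runs this sequence in buf_flush_page_cleaner_coordinator()
while the workers call pc_flush_slot() from
buf_flush_page_cleaner_worker(). */
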
2771
2772/**
2773Do flush for one slot.
@return the number of slots which have not been processed yet. */
2775static
2776ulint
2777pc_flush_slot(void)
2778{
2779 ulint lru_tm = 0;
2780 ulint list_tm = 0;
2781 ulint lru_pass = 0;
2782 ulint list_pass = 0;
2783
2784 mutex_enter(&page_cleaner.mutex);
2785
2786 if (!page_cleaner.n_slots_requested) {
2787 os_event_reset(page_cleaner.is_requested);
2788 } else {
2789 page_cleaner_slot_t* slot = NULL;
2790 ulint i;
2791
2792 for (i = 0; i < page_cleaner.n_slots; i++) {
2793 slot = &page_cleaner.slots[i];
2794
2795 if (slot->state == PAGE_CLEANER_STATE_REQUESTED) {
2796 break;
2797 }
2798 }
2799
2800 /* slot should be found because
2801 page_cleaner.n_slots_requested > 0 */
2802 ut_a(i < page_cleaner.n_slots);
2803
2804 buf_pool_t* buf_pool = buf_pool_from_array(i);
2805
2806 page_cleaner.n_slots_requested--;
2807 page_cleaner.n_slots_flushing++;
2808 slot->state = PAGE_CLEANER_STATE_FLUSHING;
2809
2810 if (UNIV_UNLIKELY(!page_cleaner.is_running)) {
2811 slot->n_flushed_lru = 0;
2812 slot->n_flushed_list = 0;
2813 goto finish_mutex;
2814 }
2815
2816 if (page_cleaner.n_slots_requested == 0) {
2817 os_event_reset(page_cleaner.is_requested);
2818 }
2819
2820 mutex_exit(&page_cleaner.mutex);
2821
2822 lru_tm = ut_time_ms();
2823
2824 /* Flush pages from end of LRU if required */
2825 slot->n_flushed_lru = buf_flush_LRU_list(buf_pool);
2826
2827 lru_tm = ut_time_ms() - lru_tm;
2828 lru_pass++;
2829
2830 if (UNIV_UNLIKELY(!page_cleaner.is_running)) {
2831 slot->n_flushed_list = 0;
2832 goto finish;
2833 }
2834
2835 /* Flush pages from flush_list if required */
2836 if (page_cleaner.requested) {
2837 flush_counters_t n;
2838 memset(&n, 0, sizeof(flush_counters_t));
2839 list_tm = ut_time_ms();
2840
2841 slot->succeeded_list = buf_flush_do_batch(
2842 buf_pool, BUF_FLUSH_LIST,
2843 slot->n_pages_requested,
2844 page_cleaner.lsn_limit,
2845 &n);
2846
2847 slot->n_flushed_list = n.flushed;
2848
2849 list_tm = ut_time_ms() - list_tm;
2850 list_pass++;
2851 } else {
2852 slot->n_flushed_list = 0;
2853 slot->succeeded_list = true;
2854 }
2855finish:
2856 mutex_enter(&page_cleaner.mutex);
2857finish_mutex:
2858 page_cleaner.n_slots_flushing--;
2859 page_cleaner.n_slots_finished++;
2860 slot->state = PAGE_CLEANER_STATE_FINISHED;
2861
2862 slot->flush_lru_time += lru_tm;
2863 slot->flush_list_time += list_tm;
2864 slot->flush_lru_pass += lru_pass;
2865 slot->flush_list_pass += list_pass;
2866
2867 if (page_cleaner.n_slots_requested == 0
2868 && page_cleaner.n_slots_flushing == 0) {
2869 os_event_set(page_cleaner.is_finished);
2870 }
2871 }
2872
2873 ulint ret = page_cleaner.n_slots_requested;
2874
2875 mutex_exit(&page_cleaner.mutex);
2876
2877 return(ret);
2878}
2879
2880/**
2881Wait until all flush requests are finished.
2882@param n_flushed_lru number of pages flushed from the end of the LRU list.
2883@param n_flushed_list number of pages flushed from the end of the
2884 flush_list.
@return true if all flush_list flushing batches were successful. */
2886static
2887bool
2888pc_wait_finished(
2889 ulint* n_flushed_lru,
2890 ulint* n_flushed_list)
2891{
2892 bool all_succeeded = true;
2893
2894 *n_flushed_lru = 0;
2895 *n_flushed_list = 0;
2896
2897 os_event_wait(page_cleaner.is_finished);
2898
2899 mutex_enter(&page_cleaner.mutex);
2900
2901 ut_ad(page_cleaner.n_slots_requested == 0);
2902 ut_ad(page_cleaner.n_slots_flushing == 0);
2903 ut_ad(page_cleaner.n_slots_finished == page_cleaner.n_slots);
2904
2905 for (ulint i = 0; i < page_cleaner.n_slots; i++) {
2906 page_cleaner_slot_t* slot = &page_cleaner.slots[i];
2907
2908 ut_ad(slot->state == PAGE_CLEANER_STATE_FINISHED);
2909
2910 *n_flushed_lru += slot->n_flushed_lru;
2911 *n_flushed_list += slot->n_flushed_list;
2912 all_succeeded &= slot->succeeded_list;
2913
2914 slot->state = PAGE_CLEANER_STATE_NONE;
2915
2916 slot->n_pages_requested = 0;
2917 }
2918
2919 page_cleaner.n_slots_finished = 0;
2920
2921 os_event_reset(page_cleaner.is_finished);
2922
2923 mutex_exit(&page_cleaner.mutex);
2924
2925 return(all_succeeded);
2926}
2927
2928#ifdef UNIV_LINUX
2929/**
2930Set priority for page_cleaner threads.
@param[in]	priority	the priority to set
2932@return true if set as intended */
2933static
2934bool
2935buf_flush_page_cleaner_set_priority(
2936 int priority)
2937{
2938 setpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid),
2939 priority);
2940 return(getpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid))
2941 == priority);
2942}
2943#endif /* UNIV_LINUX */
2944
2945#ifdef UNIV_DEBUG
2946/** Loop used to disable page cleaner threads. */
2947static
2948void
2949buf_flush_page_cleaner_disabled_loop(void)
2950{
2951 if (!innodb_page_cleaner_disabled_debug) {
2952 /* We return to avoid entering and exiting mutex. */
2953 return;
2954 }
2955
2956 mutex_enter(&page_cleaner.mutex);
2957 page_cleaner.n_disabled_debug++;
2958 mutex_exit(&page_cleaner.mutex);
2959
2960 while (innodb_page_cleaner_disabled_debug
2961 && srv_shutdown_state == SRV_SHUTDOWN_NONE
2962 && page_cleaner.is_running) {
2963
2964 os_thread_sleep(100000); /* [A] */
2965 }
2966
	/* We need to wait for threads to exit here, otherwise we would
	encounter a problem when we quickly perform the following steps:
	1) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
	2) SET GLOBAL innodb_page_cleaner_disabled_debug = 0;
	3) SET GLOBAL innodb_page_cleaner_disabled_debug = 1;
	That is because after step 1 this thread could still be sleeping
	inside the loop above at [A], and steps 2 and 3 could happen before
	this thread wakes up from [A]. In such a case this thread would
	not re-increment n_disabled_debug and we would wait for it
	forever in buf_flush_page_cleaner_disabled_debug_update(...).
2977
2978 Therefore we are waiting in step 2 for this thread exiting here. */
2979
2980 mutex_enter(&page_cleaner.mutex);
2981 page_cleaner.n_disabled_debug--;
2982 mutex_exit(&page_cleaner.mutex);
2983}
2984
2985/** Disables page cleaner threads (coordinator and workers).
2986@param[in] save immediate result from check function */
2987void buf_flush_page_cleaner_disabled_debug_update(THD*,
2988 st_mysql_sys_var*, void*,
2989 const void* save)
2990{
2991 if (!page_cleaner.is_running) {
2992 return;
2993 }
2994
2995 if (!*static_cast<const my_bool*>(save)) {
2996 if (!innodb_page_cleaner_disabled_debug) {
2997 return;
2998 }
2999
3000 innodb_page_cleaner_disabled_debug = false;
3001
3002 /* Enable page cleaner threads. */
3003 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3004 mutex_enter(&page_cleaner.mutex);
3005 const ulint n = page_cleaner.n_disabled_debug;
3006 mutex_exit(&page_cleaner.mutex);
3007 /* Check if all threads have been enabled, to avoid
3008 problem when we decide to re-disable them soon. */
3009 if (n == 0) {
3010 break;
3011 }
3012 }
3013 return;
3014 }
3015
3016 if (innodb_page_cleaner_disabled_debug) {
3017 return;
3018 }
3019
3020 innodb_page_cleaner_disabled_debug = true;
3021
3022 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3023 /* Workers are possibly sleeping on is_requested.
3024
		We have to wake them; otherwise they might never notice
		that they should be disabled, and we would wait for them
		here forever.

		That is why we have a sleep loop instead of simply
		waiting on some disabled_debug_event. */
3031 os_event_set(page_cleaner.is_requested);
3032
3033 mutex_enter(&page_cleaner.mutex);
3034
3035 ut_ad(page_cleaner.n_disabled_debug
3036 <= srv_n_page_cleaners);
3037
3038 if (page_cleaner.n_disabled_debug
3039 == srv_n_page_cleaners) {
3040
3041 mutex_exit(&page_cleaner.mutex);
3042 break;
3043 }
3044
3045 mutex_exit(&page_cleaner.mutex);
3046
3047 os_thread_sleep(100000);
3048 }
3049}
3050#endif /* UNIV_DEBUG */
3051
3052/******************************************************************//**
3053page_cleaner thread tasked with flushing dirty pages from the buffer
3054pools. As of now we'll have only one coordinator.
3055@return a dummy parameter */
3056extern "C"
3057os_thread_ret_t
3058DECLARE_THREAD(buf_flush_page_cleaner_coordinator)(void*)
3059{
3060 my_thread_init();
3061#ifdef UNIV_PFS_THREAD
3062 pfs_register_thread(page_cleaner_thread_key);
3063#endif /* UNIV_PFS_THREAD */
3064 ut_ad(!srv_read_only_mode);
3065
3066#ifdef UNIV_DEBUG_THREAD_CREATION
3067 ib::info() << "page_cleaner thread running, id "
3068 << os_thread_pf(os_thread_get_curr_id());
3069#endif /* UNIV_DEBUG_THREAD_CREATION */
3070#ifdef UNIV_LINUX
	/* Linux may allow a different priority setting for each thread,
	so it is worth trying to set a high priority for the page cleaner
	threads. */
3073 if (buf_flush_page_cleaner_set_priority(
3074 buf_flush_page_cleaner_priority)) {
3075
3076 ib::info() << "page_cleaner coordinator priority: "
3077 << buf_flush_page_cleaner_priority;
3078 } else {
3079 ib::info() << "If the mysqld execution user is authorized,"
3080 " page cleaner thread priority can be changed."
3081 " See the man page of setpriority().";
3082 }
3083 /* Signal that setpriority() has been attempted. */
3084 os_event_set(recv_sys->flush_end);
3085#endif /* UNIV_LINUX */
3086
3087 do {
		/* Handle flushing requests during recovery. */
3089 ulint n_flushed_lru = 0;
3090 ulint n_flushed_list = 0;
3091
3092 os_event_wait(recv_sys->flush_start);
3093
3094 if (!recv_writer_thread_active) {
3095 break;
3096 }
3097
3098 switch (recv_sys->flush_type) {
3099 case BUF_FLUSH_LRU:
3100 /* Flush pages from end of LRU if required */
3101 pc_request(0, LSN_MAX);
3102 while (pc_flush_slot() > 0) {}
3103 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3104 break;
3105
3106 case BUF_FLUSH_LIST:
3107 /* Flush all pages */
3108 do {
3109 pc_request(ULINT_MAX, LSN_MAX);
3110 while (pc_flush_slot() > 0) {}
3111 } while (!pc_wait_finished(&n_flushed_lru,
3112 &n_flushed_list));
3113 break;
3114
3115 default:
3116 ut_ad(0);
3117 }
3118
3119 os_event_reset(recv_sys->flush_start);
3120 os_event_set(recv_sys->flush_end);
3121 } while (recv_writer_thread_active);
3122
3123 os_event_wait(buf_flush_event);
3124
3125 ulint ret_sleep = 0;
3126 ulint n_evicted = 0;
3127 ulint n_flushed_last = 0;
3128 ulint warn_interval = 1;
3129 ulint warn_count = 0;
3130 int64_t sig_count = os_event_reset(buf_flush_event);
3131 ulint next_loop_time = ut_time_ms() + 1000;
3132 ulint n_flushed = 0;
3133 ulint last_activity = srv_get_activity_count();
3134 ulint last_pages = 0;
3135
3136 while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
3137 ulint curr_time = ut_time_ms();
3138
		/* The page_cleaner skips the sleep if the server is
		idle, there are no pending read IOs in the buffer pool,
		and the previous iteration still flushed some pages,
		i.e. there is work to do. */
3142 if (srv_check_activity(last_activity)
3143 || buf_get_n_pending_read_ios()
3144 || n_flushed == 0) {
3145
3146 ret_sleep = pc_sleep_if_needed(
3147 next_loop_time, sig_count, curr_time);
3148 } else if (curr_time > next_loop_time) {
3149 ret_sleep = OS_SYNC_TIME_EXCEEDED;
3150 } else {
3151 ret_sleep = 0;
3152 }
3153
3154 if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
3155 break;
3156 }
3157
3158 sig_count = os_event_reset(buf_flush_event);
3159
3160 if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3161 if (global_system_variables.log_warnings > 2
3162 && curr_time > next_loop_time + 3000
3163 && !(test_flags & TEST_SIGINT)) {
3164 if (warn_count == 0) {
3165 ib::info() << "page_cleaner: 1000ms"
3166 " intended loop took "
3167 << 1000 + curr_time
3168 - next_loop_time
3169 << "ms. The settings might not"
3170 " be optimal. (flushed="
3171 << n_flushed_last
3172 << " and evicted="
3173 << n_evicted
3174 << ", during the time.)";
3175 if (warn_interval > 300) {
3176 warn_interval = 600;
3177 } else {
3178 warn_interval *= 2;
3179 }
3180
3181 warn_count = warn_interval;
3182 } else {
3183 --warn_count;
3184 }
3185 } else {
3186 /* reset counter */
3187 warn_interval = 1;
3188 warn_count = 0;
3189 }
3190
3191 next_loop_time = curr_time + 1000;
3192 n_flushed_last = n_evicted = 0;
3193 }
3194
3195 if (ret_sleep != OS_SYNC_TIME_EXCEEDED
3196 && srv_flush_sync
3197 && buf_flush_sync_lsn > 0) {
3198 /* woke up for flush_sync */
3199 mutex_enter(&page_cleaner.mutex);
3200 lsn_t lsn_limit = buf_flush_sync_lsn;
3201 buf_flush_sync_lsn = 0;
3202 mutex_exit(&page_cleaner.mutex);
3203
3204 /* Request flushing for threads */
3205 pc_request(ULINT_MAX, lsn_limit);
3206
3207 ulint tm = ut_time_ms();
3208
			/* The coordinator also handles requests. */
3210 while (pc_flush_slot() > 0) {}
3211
			/* Only the coordinator uses these counters, so
			there is no need to protect them with a lock. */
3214 page_cleaner.flush_time += ut_time_ms() - tm;
3215 page_cleaner.flush_pass++;
3216
3217 /* Wait for all slots to be finished */
3218 ulint n_flushed_lru = 0;
3219 ulint n_flushed_list = 0;
3220 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3221
3222 if (n_flushed_list > 0 || n_flushed_lru > 0) {
3223 buf_flush_stats(n_flushed_list, n_flushed_lru);
3224
3225 MONITOR_INC_VALUE_CUMULATIVE(
3226 MONITOR_FLUSH_SYNC_TOTAL_PAGE,
3227 MONITOR_FLUSH_SYNC_COUNT,
3228 MONITOR_FLUSH_SYNC_PAGES,
3229 n_flushed_lru + n_flushed_list);
3230 }
3231
3232 n_flushed = n_flushed_lru + n_flushed_list;
3233
3234 } else if (srv_check_activity(last_activity)) {
3235 ulint n_to_flush;
3236 lsn_t lsn_limit = 0;
3237
3238 /* Estimate pages from flush_list to be flushed */
3239 if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3240 last_activity = srv_get_activity_count();
3241 n_to_flush =
3242 page_cleaner_flush_pages_recommendation(
3243 &lsn_limit, last_pages);
3244 } else {
3245 n_to_flush = 0;
3246 }
3247
3248 /* Request flushing for threads */
3249 pc_request(n_to_flush, lsn_limit);
3250
3251 ulint tm = ut_time_ms();
3252
			/* The coordinator also handles requests. */
3254 while (pc_flush_slot() > 0) {
3255 /* No op */
3256 }
3257
			/* Only the coordinator uses these counters, so
			there is no need to protect them with a lock. */
3260 page_cleaner.flush_time += ut_time_ms() - tm;
			page_cleaner.flush_pass++;
3262
3263 /* Wait for all slots to be finished */
3264 ulint n_flushed_lru = 0;
3265 ulint n_flushed_list = 0;
3266
3267 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3268
3269 if (n_flushed_list > 0 || n_flushed_lru > 0) {
3270 buf_flush_stats(n_flushed_list, n_flushed_lru);
3271 }
3272
3273 if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3274 last_pages = n_flushed_list;
3275 }
3276
3277 n_evicted += n_flushed_lru;
3278 n_flushed_last += n_flushed_list;
3279
3280 n_flushed = n_flushed_lru + n_flushed_list;
3281
3282 if (n_flushed_lru) {
3283 MONITOR_INC_VALUE_CUMULATIVE(
3284 MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
3285 MONITOR_LRU_BATCH_FLUSH_COUNT,
3286 MONITOR_LRU_BATCH_FLUSH_PAGES,
3287 n_flushed_lru);
3288 }
3289
3290 if (n_flushed_list) {
3291 MONITOR_INC_VALUE_CUMULATIVE(
3292 MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE,
3293 MONITOR_FLUSH_ADAPTIVE_COUNT,
3294 MONITOR_FLUSH_ADAPTIVE_PAGES,
3295 n_flushed_list);
3296 }
3297
3298 } else if (ret_sleep == OS_SYNC_TIME_EXCEEDED) {
3299 /* no activity, slept enough */
3300 buf_flush_lists(PCT_IO(100), LSN_MAX, &n_flushed);
3301
3302 n_flushed_last += n_flushed;
3303
3304 if (n_flushed) {
3305 MONITOR_INC_VALUE_CUMULATIVE(
3306 MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE,
3307 MONITOR_FLUSH_BACKGROUND_COUNT,
3308 MONITOR_FLUSH_BACKGROUND_PAGES,
3309 n_flushed);
3310
3311 }
3312
3313 } else {
3314 /* no activity, but woken up by event */
3315 n_flushed = 0;
3316 }
3317
3318 ut_d(buf_flush_page_cleaner_disabled_loop());
3319 }
3320
3321 ut_ad(srv_shutdown_state > 0);
3322 if (srv_fast_shutdown == 2
3323 || srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) {
		/* In a very fast shutdown, or when InnoDB failed to start, we
3325 simulate a crash of the buffer pool. We are not required to do
3326 any flushing. */
3327 goto thread_exit;
3328 }
3329
3330 /* In case of normal and slow shutdown the page_cleaner thread
3331 must wait for all other activity in the server to die down.
3332 Note that we can start flushing the buffer pool as soon as the
3333 server enters shutdown phase but we must stay alive long enough
3334 to ensure that any work done by the master or purge threads is
3335 also flushed.
3336 During shutdown we pass through two stages. In the first stage,
3337 when SRV_SHUTDOWN_CLEANUP is set other threads like the master
3338 and the purge threads may be working as well. We start flushing
3339 the buffer pool but can't be sure that no new pages are being
3340 dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE phase. */
3341
3342 do {
3343 pc_request(ULINT_MAX, LSN_MAX);
3344
3345 while (pc_flush_slot() > 0) {}
3346
3347 ulint n_flushed_lru = 0;
3348 ulint n_flushed_list = 0;
3349 pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3350
3351 n_flushed = n_flushed_lru + n_flushed_list;
3352
3353 /* We sleep only if there are no pages to flush */
3354 if (n_flushed == 0) {
3355 os_thread_sleep(100000);
3356 }
3357 } while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP);
3358
3359 /* At this point all threads including the master and the purge
3360 thread must have been suspended. */
3361 ut_a(srv_get_active_thread_type() == SRV_NONE);
3362 ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
3363
	/* We can now make a final sweep on flushing the buffer pool
	and exit after we have cleaned the whole buffer pool.
	It is important that we wait for any running batch that has
	been triggered by us to finish. Otherwise we could end up
	considering the end of that batch as the finish of our final
	sweep, and we would come out of the loop leaving behind dirty
	pages in the flush_list. */
3371 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3372 buf_flush_wait_LRU_batch_end();
3373
3374 bool success;
3375
3376 do {
3377 pc_request(ULINT_MAX, LSN_MAX);
3378
3379 while (pc_flush_slot() > 0) {}
3380
3381 ulint n_flushed_lru = 0;
3382 ulint n_flushed_list = 0;
3383 success = pc_wait_finished(&n_flushed_lru, &n_flushed_list);
3384
3385 n_flushed = n_flushed_lru + n_flushed_list;
3386
3387 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3388 buf_flush_wait_LRU_batch_end();
3389
3390 } while (!success || n_flushed > 0);
3391
3392 /* Some sanity checks */
3393 ut_a(srv_get_active_thread_type() == SRV_NONE);
3394 ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE);
3395
3396 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
3397 buf_pool_t* buf_pool = buf_pool_from_array(i);
3398 ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0);
3399 }
3400
3401 /* We have lived our life. Time to die. */
3402
3403thread_exit:
	/* All worker threads are waiting on the event here and no
	longer access the page_cleaner structure. Wake the worker
	threads up just to make them exit. */
3407 page_cleaner.is_running = false;
3408
	/* Wait for all worker threads to exit. */
3410 while (page_cleaner.n_workers) {
3411 os_event_set(page_cleaner.is_requested);
3412 os_thread_sleep(10000);
3413 }
3414
3415 mutex_destroy(&page_cleaner.mutex);
3416
3417 os_event_destroy(page_cleaner.is_finished);
3418 os_event_destroy(page_cleaner.is_requested);
3419 os_event_destroy(page_cleaner.is_started);
3420
3421 buf_page_cleaner_is_active = false;
3422
3423 my_thread_end();
3424 /* We count the number of threads in os_thread_exit(). A created
3425 thread should always use that to exit and not use return() to exit. */
3426 os_thread_exit();
3427
3428 OS_THREAD_DUMMY_RETURN;
3429}
3430
3431/** Adjust thread count for page cleaner workers.
3432@param[in] new_cnt Number of threads to be used */
3433void
3434buf_flush_set_page_cleaner_thread_cnt(ulong new_cnt)
3435{
3436 mutex_enter(&page_cleaner.mutex);
3437
3438 srv_n_page_cleaners = new_cnt;
3439 if (new_cnt > page_cleaner.n_workers) {
3440 /* User has increased the number of page
3441 cleaner threads. */
3442 ulint add = new_cnt - page_cleaner.n_workers;
3443 for (ulint i = 0; i < add; i++) {
3444 os_thread_id_t cleaner_thread_id;
3445 os_thread_create(buf_flush_page_cleaner_worker, NULL, &cleaner_thread_id);
3446 }
3447 }
3448
3449 mutex_exit(&page_cleaner.mutex);
3450
	/* Wait until the defined number of workers has started. */
3452 while (page_cleaner.is_running &&
3453 page_cleaner.n_workers != (srv_n_page_cleaners - 1)) {
3454 os_event_set(page_cleaner.is_requested);
3455 os_event_reset(page_cleaner.is_started);
3456 os_event_wait_time(page_cleaner.is_started, 1000000);
3457 }
3458}
3459
3460/******************************************************************//**
3461Worker thread of page_cleaner.
3462@return a dummy parameter */
3463extern "C"
3464os_thread_ret_t
3465DECLARE_THREAD(buf_flush_page_cleaner_worker)(
3466/*==========================================*/
3467 void* arg MY_ATTRIBUTE((unused)))
3468 /*!< in: a dummy parameter required by
3469 os_thread_create */
3470{
3471 my_thread_init();
3472#ifndef DBUG_OFF
3473 os_thread_id_t cleaner_thread_id = os_thread_get_curr_id();
3474#endif
3475
3476 mutex_enter(&page_cleaner.mutex);
3477 ulint thread_no = page_cleaner.n_workers++;
3478
3479 DBUG_LOG("ib_buf", "Thread " << cleaner_thread_id
3480 << " started; n_workers=" << page_cleaner.n_workers);
3481
3482 /* Signal that we have started */
3483 os_event_set(page_cleaner.is_started);
3484 mutex_exit(&page_cleaner.mutex);
3485
3486#ifdef UNIV_LINUX
	/* Linux may allow a different priority setting for each thread,
	so it is worth trying to set a high priority for the page cleaner
	threads. */
3489 if (buf_flush_page_cleaner_set_priority(
3490 buf_flush_page_cleaner_priority)) {
3491
3492 ib::info() << "page_cleaner worker priority: "
3493 << buf_flush_page_cleaner_priority;
3494 }
3495#endif /* UNIV_LINUX */
3496
3497 while (true) {
3498 os_event_wait(page_cleaner.is_requested);
3499
3500 ut_d(buf_flush_page_cleaner_disabled_loop());
3501
3502 if (!page_cleaner.is_running) {
3503 break;
3504 }
3505
3506 ut_ad(srv_n_page_cleaners >= 1);
3507
		/* If the number of page cleaner threads has been
		decreased, exit those that are no longer needed. */
3510 if (srv_shutdown_state == SRV_SHUTDOWN_NONE &&
3511 thread_no >= (srv_n_page_cleaners - 1)) {
3512 DBUG_LOG("ib_buf", "Exiting "
3513 << thread_no
3514 << " page cleaner worker thread_id "
3515 << os_thread_pf(cleaner_thread_id)
3516 << " total threads " << srv_n_page_cleaners << ".");
3517 break;
3518 }
3519
3520 pc_flush_slot();
3521 }
3522
3523 mutex_enter(&page_cleaner.mutex);
3524 page_cleaner.n_workers--;
3525
3526 DBUG_LOG("ib_buf", "Thread " << cleaner_thread_id
3527 << " exiting; n_workers=" << page_cleaner.n_workers);
3528
3529 /* Signal that we have stopped */
3530 os_event_set(page_cleaner.is_started);
3531 mutex_exit(&page_cleaner.mutex);
3532
3533 my_thread_end();
3534
3535 os_thread_exit();
3536
3537 OS_THREAD_DUMMY_RETURN;
3538}
3539
3540/*******************************************************************//**
3541Synchronously flush dirty blocks from the end of the flush list of all buffer
3542pool instances.
3543NOTE: The calling thread is not allowed to own any latches on pages! */
3544void
3545buf_flush_sync_all_buf_pools(void)
3546/*==============================*/
3547{
3548 bool success;
3549 do {
3550 success = buf_flush_lists(ULINT_MAX, LSN_MAX, NULL);
3551 buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST);
3552 } while (!success);
3553
3554 ut_a(success);
3555}
3556
3557/** Request IO burst and wake page_cleaner up.
3558@param[in] lsn_limit upper limit of LSN to be flushed */
3559void
3560buf_flush_request_force(
3561 lsn_t lsn_limit)
3562{
	/* Adjust the target based on lsn_avg_rate so that it does not
	become stale. */
3564 lsn_t lsn_target = lsn_limit + lsn_avg_rate * 3;
3565
3566 mutex_enter(&page_cleaner.mutex);
3567 if (lsn_target > buf_flush_sync_lsn) {
3568 buf_flush_sync_lsn = lsn_target;
3569 }
3570 mutex_exit(&page_cleaner.mutex);
3571
3572 os_event_set(buf_flush_event);
3573}
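
/* For example, if the measured lsn_avg_rate corresponds to 1MB of redo
per second and a caller requests flushing up to lsn_limit, the target
stored in buf_flush_sync_lsn becomes lsn_limit + 3MB, i.e. roughly three
seconds worth of extra redo, so that the target has not already become
stale by the time the page_cleaner acts on it. (The figure is
illustrative; the actual rate is whatever the page_cleaner has measured
into lsn_avg_rate.) */
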
3574#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
3575
3576/** Functor to validate the flush list. */
3577struct Check {
3578 void operator()(const buf_page_t* elem)
3579 {
3580 ut_a(elem->in_flush_list);
3581 }
3582};
3583
3584/******************************************************************//**
3585Validates the flush list.
3586@return TRUE if ok */
3587static
3588ibool
3589buf_flush_validate_low(
3590/*===================*/
3591 buf_pool_t* buf_pool) /*!< in: Buffer pool instance */
3592{
3593 buf_page_t* bpage;
3594 const ib_rbt_node_t* rnode = NULL;
3595 Check check;
3596
3597 ut_ad(buf_flush_list_mutex_own(buf_pool));
3598
3599 ut_list_validate(buf_pool->flush_list, check);
3600
3601 bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
3602
3603 /* If we are in recovery mode i.e.: flush_rbt != NULL
3604 then each block in the flush_list must also be present
3605 in the flush_rbt. */
3606 if (buf_pool->flush_rbt != NULL) {
3607 rnode = rbt_first(buf_pool->flush_rbt);
3608 }
3609
3610 while (bpage != NULL) {
3611 const lsn_t om = bpage->oldest_modification;
3612
3613 ut_ad(buf_pool_from_bpage(bpage) == buf_pool);
3614
3615 ut_ad(bpage->in_flush_list);
3616
3617 /* A page in buf_pool->flush_list can be in
3618 BUF_BLOCK_REMOVE_HASH state. This happens when a page
3619 is in the middle of being relocated. In that case the
3620 original descriptor can have this state and still be
3621 in the flush list waiting to acquire the
3622 buf_pool->flush_list_mutex to complete the relocation. */
3623 ut_a(buf_page_in_file(bpage)
3624 || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH);
3625 ut_a(om > 0);
3626
3627 if (buf_pool->flush_rbt != NULL) {
3628 buf_page_t** prpage;
3629
3630 ut_a(rnode != NULL);
3631 prpage = rbt_value(buf_page_t*, rnode);
3632
3633 ut_a(*prpage != NULL);
3634 ut_a(*prpage == bpage);
3635 rnode = rbt_next(buf_pool->flush_rbt, rnode);
3636 }
3637
3638 bpage = UT_LIST_GET_NEXT(list, bpage);
3639
3640 ut_a(bpage == NULL || om >= bpage->oldest_modification);
3641 }
3642
3643 /* By this time we must have exhausted the traversal of
3644 flush_rbt (if active) as well. */
3645 ut_a(rnode == NULL);
3646
3647 return(TRUE);
3648}
3649
3650/******************************************************************//**
3651Validates the flush list.
3652@return TRUE if ok */
3653ibool
3654buf_flush_validate(
3655/*===============*/
3656 buf_pool_t* buf_pool) /*!< buffer pool instance */
3657{
3658 ibool ret;
3659
3660 buf_flush_list_mutex_enter(buf_pool);
3661
3662 ret = buf_flush_validate_low(buf_pool);
3663
3664 buf_flush_list_mutex_exit(buf_pool);
3665
3666 return(ret);
3667}
3668
3669#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
3670
3671/******************************************************************//**
3672Check if there are any dirty pages that belong to a space id in the flush
3673list in a particular buffer pool.
3674@return number of dirty pages present in a single buffer pool */
ulint
buf_pool_get_dirty_pages_count(
/*===========================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool */
	ulint		id,		/*!< in: space id to check */
	FlushObserver*	observer)	/*!< in: flush observer to check */

{
	ulint		count = 0;

	buf_pool_mutex_enter(buf_pool);
	buf_flush_list_mutex_enter(buf_pool);

	buf_page_t*	bpage;

	for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
	     bpage != 0;
	     bpage = UT_LIST_GET_NEXT(list, bpage)) {

		ut_ad(buf_page_in_file(bpage));
		ut_ad(bpage->in_flush_list);
		ut_ad(bpage->oldest_modification > 0);

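		/* Count the pages attached to the given flush observer,
		or, when no observer is given, the pages of the given
		space id. */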
		if ((observer != NULL
		     && observer == bpage->flush_observer)
		    || (observer == NULL
			&& id == bpage->id.space())) {
			++count;
		}
	}

	buf_flush_list_mutex_exit(buf_pool);
	buf_pool_mutex_exit(buf_pool);

	return(count);
}

/******************************************************************//**
Count the dirty pages in the flush lists of all buffer pool instances that
belong to a given space id or to a given flush observer.
@return number of dirty pages present in all the buffer pools */
static
ulint
buf_flush_get_dirty_pages_count(
/*============================*/
	ulint		id,		/*!< in: space id to check */
	FlushObserver*	observer)	/*!< in: flush observer to check */
{
	ulint	count = 0;

	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		count += buf_pool_get_dirty_pages_count(buf_pool, id, observer);
	}

	return(count);
}

/** FlushObserver constructor
@param[in]	space	tablespace
@param[in]	trx	trx instance
@param[in]	stage	performance schema accounting object,
used by ALTER TABLE. It is passed to log_preflush_pool_modified_pages()
for accounting. */
FlushObserver::FlushObserver(
	fil_space_t*	space,
	trx_t*		trx,
	ut_stage_alter_t*	stage)
	:
	m_space(space),
	m_trx(trx),
	m_stage(stage),
	m_interrupted(false)
{
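	/* One counter per buffer pool instance, tracking the pages
	flushed and the pages removed from the flush list on behalf of
	this observer. */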
	m_flushed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));
	m_removed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances));

	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		m_flushed->at(i) = 0;
		m_removed->at(i) = 0;
	}

	DBUG_LOG("flush", "FlushObserver(): trx->id=" << m_trx->id);
}

/** FlushObserver destructor */
FlushObserver::~FlushObserver()
{
	ut_ad(buf_flush_get_dirty_pages_count(m_space->id, this) == 0);

	UT_DELETE(m_flushed);
	UT_DELETE(m_removed);

	DBUG_LOG("flush", "~FlushObserver(): trx->id=" << m_trx->id);
}

/** Check whether trx is interrupted
@return true if trx is interrupted */
bool
FlushObserver::check_interrupted()
{
	if (trx_is_interrupted(m_trx)) {
		interrupted();

		return(true);
	}

	return(false);
}

/** Notify observer of a flush
@param[in]	buf_pool	buffer pool instance
@param[in]	bpage		buffer page to flush */
void
FlushObserver::notify_flush(
	buf_pool_t*	buf_pool,
	buf_page_t*	bpage)
{
	ut_ad(buf_pool_mutex_own(buf_pool));

	m_flushed->at(buf_pool->instance_no)++;

	if (m_stage != NULL) {
		m_stage->inc();
	}

	DBUG_LOG("flush", "Flush " << bpage->id);
}

/** Notify observer of a remove
@param[in]	buf_pool	buffer pool instance
@param[in]	bpage		buffer page flushed */
void
FlushObserver::notify_remove(
	buf_pool_t*	buf_pool,
	buf_page_t*	bpage)
{
	ut_ad(buf_pool_mutex_own(buf_pool));

	m_removed->at(buf_pool->instance_no)++;

	DBUG_LOG("flush", "Remove " << bpage->id);
}

/** Flush dirty pages and wait. */
void
FlushObserver::flush()
{
	ut_ad(m_trx);

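	/* When an ALTER TABLE stage tracker is attached, seed the flush
	phase with the number of dirty pages that still need to be
	written for this tablespace. */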
	if (!m_interrupted && m_stage) {
		m_stage->begin_phase_flush(buf_flush_get_dirty_pages_count(
						m_space->id, this));
	}

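	/* Request that the dirty pages of this tablespace be flushed
	from every buffer pool instance. */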
	buf_LRU_flush_or_remove_pages(m_space->id, this);

	/* Wait until all the dirty pages have been flushed. */
	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		while (!is_complete(i)) {

			os_thread_sleep(2000);
		}
	}
}
