| 1 | /***************************************************************************** |
| 2 | |
| 3 | Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. |
| 4 | Copyright (c) 2013, 2018, MariaDB Corporation. |
| 5 | Copyright (c) 2013, 2014, Fusion-io |
| 6 | |
| 7 | This program is free software; you can redistribute it and/or modify it under |
| 8 | the terms of the GNU General Public License as published by the Free Software |
| 9 | Foundation; version 2 of the License. |
| 10 | |
| 11 | This program is distributed in the hope that it will be useful, but WITHOUT |
| 12 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 13 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
| 14 | |
| 15 | You should have received a copy of the GNU General Public License along with |
| 16 | this program; if not, write to the Free Software Foundation, Inc., |
| 17 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
| 18 | |
| 19 | *****************************************************************************/ |
| 20 | |
| 21 | /**************************************************//** |
| 22 | @file buf/buf0flu.cc |
| 23 | The database buffer buf_pool flush algorithm |
| 24 | |
| 25 | Created 11/11/1995 Heikki Tuuri |
| 26 | *******************************************************/ |
| 27 | |
| 28 | #include "ha_prototypes.h" |
| 29 | #include <mysql/service_thd_wait.h> |
| 30 | #include <my_dbug.h> |
| 31 | #include <sql_class.h> |
| 32 | |
| 33 | #include "buf0flu.h" |
| 34 | #include "buf0buf.h" |
| 35 | #include "buf0checksum.h" |
| 36 | #include "srv0start.h" |
| 37 | #include "srv0srv.h" |
| 38 | #include "page0zip.h" |
| 39 | #include "ut0byte.h" |
| 40 | #include "page0page.h" |
| 41 | #include "fil0fil.h" |
| 42 | #include "buf0lru.h" |
| 43 | #include "buf0rea.h" |
| 44 | #include "ibuf0ibuf.h" |
| 45 | #include "log0log.h" |
| 46 | #include "os0file.h" |
| 47 | #include "trx0sys.h" |
| 48 | #include "srv0mon.h" |
| 49 | #include "fsp0sysspace.h" |
| 50 | #include "ut0stage.h" |
| 51 | #include "fil0pagecompress.h" |
| 52 | #ifdef UNIV_LINUX |
| 53 | /* include defs for CPU time priority settings */ |
| 54 | #include <unistd.h> |
| 55 | #include <sys/syscall.h> |
| 56 | #include <sys/time.h> |
| 57 | #include <sys/resource.h> |
| 58 | static const int buf_flush_page_cleaner_priority = -20; |
| 59 | #endif /* UNIV_LINUX */ |
| 60 | |
| 61 | /** Sleep time in microseconds for loop waiting for the oldest |
| 62 | modification lsn */ |
| 63 | static const ulint buf_flush_wait_flushed_sleep_time = 10000; |
| 64 | |
| 65 | #include <my_service_manager.h> |
| 66 | |
/** Number of pages flushed through non-flush_list flushes. */
| 68 | static ulint buf_lru_flush_page_count = 0; |
| 69 | |
/** Flag indicating whether the page_cleaner is in an active state. It is
set to TRUE by the page_cleaner thread when the thread is spawned, and set
back to FALSE by the page_cleaner itself at shutdown. Therefore there is no
need to protect it with a mutex: it is only ever read by the thread
performing the shutdown. */
| 75 | bool buf_page_cleaner_is_active; |
| 76 | |
| 77 | /** Factor for scan length to determine n_pages for intended oldest LSN |
| 78 | progress */ |
| 79 | static ulint buf_flush_lsn_scan_factor = 3; |
| 80 | |
| 81 | /** Average redo generation rate */ |
| 82 | static lsn_t lsn_avg_rate = 0; |
| 83 | |
| 84 | /** Target oldest LSN for the requested flush_sync */ |
| 85 | static lsn_t buf_flush_sync_lsn = 0; |
| 86 | |
| 87 | #ifdef UNIV_PFS_THREAD |
| 88 | mysql_pfs_key_t page_cleaner_thread_key; |
| 89 | #endif /* UNIV_PFS_THREAD */ |
| 90 | |
| 91 | /** Event to synchronise with the flushing. */ |
| 92 | os_event_t buf_flush_event; |
| 93 | |
| 94 | /** State for page cleaner array slot */ |
| 95 | enum page_cleaner_state_t { |
/** No flushing requested yet.
Moved from FINISHED by the coordinator. */
| 98 | PAGE_CLEANER_STATE_NONE = 0, |
| 99 | /** Requested but not started flushing. |
| 100 | Moved from NONE by the coordinator. */ |
| 101 | PAGE_CLEANER_STATE_REQUESTED, |
/** Flushing is ongoing.
Moved from REQUESTED by the worker. */
| 104 | PAGE_CLEANER_STATE_FLUSHING, |
| 105 | /** Flushing was finished. |
| 106 | Moved from FLUSHING by the worker. */ |
| 107 | PAGE_CLEANER_STATE_FINISHED |
| 108 | }; |
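
/* Summary of the slot state machine implied by the comments above:
NONE -> REQUESTED      (coordinator)
REQUESTED -> FLUSHING  (worker)
FLUSHING -> FINISHED   (worker)
FINISHED -> NONE       (coordinator) */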
| 109 | |
| 110 | /** Page cleaner request state for each buffer pool instance */ |
| 111 | struct page_cleaner_slot_t { |
page_cleaner_state_t state; /*!< state of the request,
protected by page_cleaner_t::mutex.
Once a worker thread has taken the slot
and set it to PAGE_CLEANER_STATE_FLUSHING,
n_flushed_lru and n_flushed_list may be
updated only by that worker thread */
| 118 | /* This value is set during state==PAGE_CLEANER_STATE_NONE */ |
| 119 | ulint n_pages_requested; |
| 120 | /*!< number of requested pages |
| 121 | for the slot */ |
/* These values are updated during state==PAGE_CLEANER_STATE_FLUSHING,
and committed with state==PAGE_CLEANER_STATE_FINISHED.
Their consistency is protected by the 'state' field. */
| 125 | ulint n_flushed_lru; |
| 126 | /*!< number of flushed pages |
| 127 | by LRU scan flushing */ |
| 128 | ulint n_flushed_list; |
| 129 | /*!< number of flushed pages |
| 130 | by flush_list flushing */ |
| 131 | bool succeeded_list; |
| 132 | /*!< true if flush_list flushing |
| 133 | succeeded. */ |
| 134 | ulint flush_lru_time; |
| 135 | /*!< elapsed time for LRU flushing */ |
| 136 | ulint flush_list_time; |
| 137 | /*!< elapsed time for flush_list |
| 138 | flushing */ |
ulint flush_lru_pass;
/*!< number of attempted LRU flushing passes */
ulint flush_list_pass;
/*!< number of attempted flush_list
flushing passes */
| 144 | }; |
| 145 | |
| 146 | /** Page cleaner structure common for all threads */ |
| 147 | struct page_cleaner_t { |
| 148 | ib_mutex_t mutex; /*!< mutex to protect whole of |
| 149 | page_cleaner_t struct and |
| 150 | page_cleaner_slot_t slots. */ |
| 151 | os_event_t is_requested; /*!< event to activate worker |
| 152 | threads. */ |
| 153 | os_event_t is_finished; /*!< event to signal that all |
| 154 | slots were finished. */ |
| 155 | os_event_t is_started; /*!< event to signal that |
| 156 | thread is started/exiting */ |
| 157 | volatile ulint n_workers; /*!< number of worker threads |
| 158 | in existence */ |
| 159 | bool requested; /*!< true if requested pages |
| 160 | to flush */ |
| 161 | lsn_t lsn_limit; /*!< upper limit of LSN to be |
| 162 | flushed */ |
| 163 | ulint n_slots; /*!< total number of slots */ |
| 164 | ulint n_slots_requested; |
| 165 | /*!< number of slots |
| 166 | in the state |
| 167 | PAGE_CLEANER_STATE_REQUESTED */ |
| 168 | ulint n_slots_flushing; |
| 169 | /*!< number of slots |
| 170 | in the state |
| 171 | PAGE_CLEANER_STATE_FLUSHING */ |
| 172 | ulint n_slots_finished; |
| 173 | /*!< number of slots |
| 174 | in the state |
| 175 | PAGE_CLEANER_STATE_FINISHED */ |
| 176 | ulint flush_time; /*!< elapsed time to flush |
| 177 | requests for all slots */ |
ulint flush_pass; /*!< number of passes made to flush
the requests for all slots */
| 180 | page_cleaner_slot_t slots[MAX_BUFFER_POOLS]; |
bool is_running; /*!< false if a shutdown is
being attempted */
| 183 | |
| 184 | #ifdef UNIV_DEBUG |
| 185 | ulint n_disabled_debug; |
/*!< number of page cleaner threads
that have been disabled */
| 188 | #endif /* UNIV_DEBUG */ |
| 189 | }; |
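
/* The coordinator (the page_cleaner thread) fills one slot per buffer pool
instance with a flush request and signals is_requested to wake the worker
threads; workers pick up slots in the REQUESTED state and flush them, and
is_finished is signalled once every slot has reached
PAGE_CLEANER_STATE_FINISHED. */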
| 190 | |
| 191 | static page_cleaner_t page_cleaner; |
| 192 | |
| 193 | #ifdef UNIV_DEBUG |
| 194 | my_bool innodb_page_cleaner_disabled_debug; |
| 195 | #endif /* UNIV_DEBUG */ |
| 196 | |
/** If the LRU list of a buffer pool instance is shorter than this,
LRU eviction should not happen. This is because when we do LRU flushing
we also put the blocks on the free list. If the LRU list is very small,
we can end up thrashing. */
| 201 | #define BUF_LRU_MIN_LEN 256 |
| 202 | |
| 203 | /* @} */ |
| 204 | |
| 205 | /******************************************************************//** |
Increments the flush_list size in bytes by the physical page size of the block. */
| 207 | static inline |
| 208 | void |
| 209 | incr_flush_list_size_in_bytes( |
| 210 | /*==========================*/ |
| 211 | buf_block_t* block, /*!< in: control block */ |
| 212 | buf_pool_t* buf_pool) /*!< in: buffer pool instance */ |
| 213 | { |
| 214 | ut_ad(buf_flush_list_mutex_own(buf_pool)); |
| 215 | |
| 216 | buf_pool->stat.flush_list_bytes += block->page.size.physical(); |
| 217 | |
| 218 | ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size); |
| 219 | } |
| 220 | |
| 221 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
| 222 | /******************************************************************//** |
| 223 | Validates the flush list. |
| 224 | @return TRUE if ok */ |
| 225 | static |
| 226 | ibool |
| 227 | buf_flush_validate_low( |
| 228 | /*===================*/ |
| 229 | buf_pool_t* buf_pool); /*!< in: Buffer pool instance */ |
| 230 | |
| 231 | /******************************************************************//** |
| 232 | Validates the flush list some of the time. |
| 233 | @return TRUE if ok or the check was skipped */ |
| 234 | static |
| 235 | ibool |
| 236 | buf_flush_validate_skip( |
| 237 | /*====================*/ |
| 238 | buf_pool_t* buf_pool) /*!< in: Buffer pool instance */ |
| 239 | { |
| 240 | /** Try buf_flush_validate_low() every this many times */ |
| 241 | # define BUF_FLUSH_VALIDATE_SKIP 23 |
| 242 | |
| 243 | /** The buf_flush_validate_low() call skip counter. |
| 244 | Use a signed type because of the race condition below. */ |
| 245 | static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP; |
| 246 | |
| 247 | /* There is a race condition below, but it does not matter, |
| 248 | because this call is only for heuristic purposes. We want to |
| 249 | reduce the call frequency of the costly buf_flush_validate_low() |
| 250 | check in debug builds. */ |
| 251 | if (--buf_flush_validate_count > 0) { |
| 252 | return(TRUE); |
| 253 | } |
| 254 | |
| 255 | buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP; |
| 256 | return(buf_flush_validate_low(buf_pool)); |
| 257 | } |
| 258 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
| 259 | |
| 260 | /******************************************************************//** |
Inserts a block into the flush_rbt and returns a pointer to its
predecessor, or NULL if there is no predecessor. The ordering is maintained
on the basis of the <oldest_modification, space, offset> key.
@return pointer to the predecessor, or NULL if there is no predecessor */
| 265 | static |
| 266 | buf_page_t* |
| 267 | buf_flush_insert_in_flush_rbt( |
| 268 | /*==========================*/ |
| 269 | buf_page_t* bpage) /*!< in: bpage to be inserted. */ |
| 270 | { |
| 271 | const ib_rbt_node_t* c_node; |
| 272 | const ib_rbt_node_t* p_node; |
| 273 | buf_page_t* prev = NULL; |
| 274 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 275 | |
| 276 | ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE); |
| 277 | ut_ad(buf_flush_list_mutex_own(buf_pool)); |
| 278 | |
| 279 | /* Insert this buffer into the rbt. */ |
| 280 | c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage); |
| 281 | ut_a(c_node != NULL); |
| 282 | |
| 283 | /* Get the predecessor. */ |
| 284 | p_node = rbt_prev(buf_pool->flush_rbt, c_node); |
| 285 | |
| 286 | if (p_node != NULL) { |
| 287 | buf_page_t** value; |
| 288 | value = rbt_value(buf_page_t*, p_node); |
| 289 | prev = *value; |
| 290 | ut_a(prev != NULL); |
| 291 | } |
| 292 | |
| 293 | return(prev); |
| 294 | } |
| 295 | |
| 296 | /*********************************************************//** |
| 297 | Delete a bpage from the flush_rbt. */ |
| 298 | static |
| 299 | void |
| 300 | buf_flush_delete_from_flush_rbt( |
| 301 | /*============================*/ |
| 302 | buf_page_t* bpage) /*!< in: bpage to be removed. */ |
| 303 | { |
| 304 | #ifdef UNIV_DEBUG |
| 305 | ibool ret = FALSE; |
| 306 | #endif /* UNIV_DEBUG */ |
| 307 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 308 | |
| 309 | ut_ad(buf_flush_list_mutex_own(buf_pool)); |
| 310 | |
| 311 | #ifdef UNIV_DEBUG |
| 312 | ret = |
| 313 | #endif /* UNIV_DEBUG */ |
| 314 | rbt_delete(buf_pool->flush_rbt, &bpage); |
| 315 | |
| 316 | ut_ad(ret); |
| 317 | } |
| 318 | |
| 319 | /*****************************************************************//** |
| 320 | Compare two modified blocks in the buffer pool. The key for comparison |
| 321 | is: |
| 322 | key = <oldest_modification, space, offset> |
This comparison is used to maintain the ordering of blocks in the
| 324 | buf_pool->flush_rbt. |
| 325 | Note that for the purpose of flush_rbt, we only need to order blocks |
| 326 | on the oldest_modification. The other two fields are used to uniquely |
| 327 | identify the blocks. |
| 328 | @return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */ |
| 329 | static |
| 330 | int |
| 331 | buf_flush_block_cmp( |
| 332 | /*================*/ |
| 333 | const void* p1, /*!< in: block1 */ |
| 334 | const void* p2) /*!< in: block2 */ |
| 335 | { |
| 336 | int ret; |
| 337 | const buf_page_t* b1 = *(const buf_page_t**) p1; |
| 338 | const buf_page_t* b2 = *(const buf_page_t**) p2; |
| 339 | |
| 340 | ut_ad(b1 != NULL); |
| 341 | ut_ad(b2 != NULL); |
| 342 | |
| 343 | #ifdef UNIV_DEBUG |
| 344 | buf_pool_t* buf_pool = buf_pool_from_bpage(b1); |
| 345 | #endif /* UNIV_DEBUG */ |
| 346 | |
| 347 | ut_ad(buf_flush_list_mutex_own(buf_pool)); |
| 348 | |
| 349 | ut_ad(b1->in_flush_list); |
| 350 | ut_ad(b2->in_flush_list); |
| 351 | |
| 352 | if (b2->oldest_modification > b1->oldest_modification) { |
| 353 | return(1); |
| 354 | } else if (b2->oldest_modification < b1->oldest_modification) { |
| 355 | return(-1); |
| 356 | } |
| 357 | |
/* If oldest_modification is the same, then decide on the space id. */
| 359 | ret = (int)(b2->id.space() - b1->id.space()); |
| 360 | |
| 361 | /* Or else decide ordering on the page number. */ |
| 362 | return(ret ? ret : (int) (b2->id.page_no() - b1->id.page_no())); |
| 363 | } |
| 364 | |
| 365 | /********************************************************************//** |
Initialize the red-black tree to speed up insertions into the flush_list
during the recovery process. Should be called at the start of the recovery
process, before any page has been read or written. */
| 369 | void |
| 370 | buf_flush_init_flush_rbt(void) |
| 371 | /*==========================*/ |
| 372 | { |
| 373 | ulint i; |
| 374 | |
| 375 | for (i = 0; i < srv_buf_pool_instances; i++) { |
| 376 | buf_pool_t* buf_pool; |
| 377 | |
| 378 | buf_pool = buf_pool_from_array(i); |
| 379 | |
| 380 | buf_flush_list_mutex_enter(buf_pool); |
| 381 | |
| 382 | ut_ad(buf_pool->flush_rbt == NULL); |
| 383 | |
| 384 | /* Create red black tree for speedy insertions in flush list. */ |
| 385 | buf_pool->flush_rbt = rbt_create( |
| 386 | sizeof(buf_page_t*), buf_flush_block_cmp); |
| 387 | |
| 388 | buf_flush_list_mutex_exit(buf_pool); |
| 389 | } |
| 390 | } |
| 391 | |
| 392 | /********************************************************************//** |
| 393 | Frees up the red-black tree. */ |
| 394 | void |
| 395 | buf_flush_free_flush_rbt(void) |
| 396 | /*==========================*/ |
| 397 | { |
| 398 | ulint i; |
| 399 | |
| 400 | for (i = 0; i < srv_buf_pool_instances; i++) { |
| 401 | buf_pool_t* buf_pool; |
| 402 | |
| 403 | buf_pool = buf_pool_from_array(i); |
| 404 | |
| 405 | buf_flush_list_mutex_enter(buf_pool); |
| 406 | |
| 407 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
| 408 | ut_a(buf_flush_validate_low(buf_pool)); |
| 409 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
| 410 | |
| 411 | rbt_free(buf_pool->flush_rbt); |
| 412 | buf_pool->flush_rbt = NULL; |
| 413 | |
| 414 | buf_flush_list_mutex_exit(buf_pool); |
| 415 | } |
| 416 | } |
| 417 | |
| 418 | /********************************************************************//** |
| 419 | Inserts a modified block into the flush list. */ |
| 420 | void |
| 421 | buf_flush_insert_into_flush_list( |
| 422 | /*=============================*/ |
| 423 | buf_pool_t* buf_pool, /*!< buffer pool instance */ |
| 424 | buf_block_t* block, /*!< in/out: block which is modified */ |
| 425 | lsn_t lsn) /*!< in: oldest modification */ |
| 426 | { |
| 427 | ut_ad(!buf_pool_mutex_own(buf_pool)); |
| 428 | ut_ad(log_flush_order_mutex_own()); |
| 429 | ut_ad(buf_page_mutex_own(block)); |
| 430 | |
| 431 | buf_flush_list_mutex_enter(buf_pool); |
| 432 | |
| 433 | ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL) |
| 434 | || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification |
| 435 | <= lsn)); |
| 436 | |
| 437 | /* If we are in the recovery then we need to update the flush |
| 438 | red-black tree as well. */ |
| 439 | if (buf_pool->flush_rbt != NULL) { |
| 440 | buf_flush_list_mutex_exit(buf_pool); |
| 441 | buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn); |
| 442 | return; |
| 443 | } |
| 444 | |
| 445 | ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); |
| 446 | ut_ad(!block->page.in_flush_list); |
| 447 | |
| 448 | ut_d(block->page.in_flush_list = TRUE); |
| 449 | block->page.oldest_modification = lsn; |
| 450 | |
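/* Add the block at the head of the flush_list. This keeps the list
sorted by oldest_modification in descending order from head to tail,
as guaranteed by the assertion at the top of this function. */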
| 451 | UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page); |
| 452 | |
| 453 | incr_flush_list_size_in_bytes(block, buf_pool); |
| 454 | |
| 455 | #ifdef UNIV_DEBUG_VALGRIND |
| 456 | void* p; |
| 457 | |
| 458 | if (block->page.size.is_compressed()) { |
| 459 | p = block->page.zip.data; |
| 460 | } else { |
| 461 | p = block->frame; |
| 462 | } |
| 463 | |
| 464 | UNIV_MEM_ASSERT_RW(p, block->page.size.physical()); |
| 465 | #endif /* UNIV_DEBUG_VALGRIND */ |
| 466 | |
| 467 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
| 468 | ut_a(buf_flush_validate_skip(buf_pool)); |
| 469 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
| 470 | |
| 471 | buf_flush_list_mutex_exit(buf_pool); |
| 472 | } |
| 473 | |
| 474 | /********************************************************************//** |
| 475 | Inserts a modified block into the flush list in the right sorted position. |
| 476 | This function is used by recovery, because there the modifications do not |
| 477 | necessarily come in the order of lsn's. */ |
| 478 | void |
| 479 | buf_flush_insert_sorted_into_flush_list( |
| 480 | /*====================================*/ |
| 481 | buf_pool_t* buf_pool, /*!< in: buffer pool instance */ |
| 482 | buf_block_t* block, /*!< in/out: block which is modified */ |
| 483 | lsn_t lsn) /*!< in: oldest modification */ |
| 484 | { |
| 485 | buf_page_t* prev_b; |
| 486 | buf_page_t* b; |
| 487 | |
| 488 | ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE); |
| 489 | ut_ad(!buf_pool_mutex_own(buf_pool)); |
| 490 | ut_ad(log_flush_order_mutex_own()); |
| 491 | ut_ad(buf_page_mutex_own(block)); |
| 492 | ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); |
| 493 | |
| 494 | buf_flush_list_mutex_enter(buf_pool); |
| 495 | |
| 496 | /* The field in_LRU_list is protected by buf_pool->mutex, which |
| 497 | we are not holding. However, while a block is in the flush |
list, it is dirty and cannot be discarded, neither from the
page_hash nor from the LRU list. At most, the uncompressed
| 500 | page frame of a compressed block may be discarded or created |
| 501 | (copying the block->page to or from a buf_page_t that is |
| 502 | dynamically allocated from buf_buddy_alloc()). Because those |
| 503 | transitions hold block->mutex and the flush list mutex (via |
| 504 | buf_flush_relocate_on_flush_list()), there is no possibility |
| 505 | of a race condition in the assertions below. */ |
| 506 | ut_ad(block->page.in_LRU_list); |
| 507 | ut_ad(block->page.in_page_hash); |
| 508 | /* buf_buddy_block_register() will take a block in the |
| 509 | BUF_BLOCK_MEMORY state, not a file page. */ |
| 510 | ut_ad(!block->page.in_zip_hash); |
| 511 | |
| 512 | ut_ad(!block->page.in_flush_list); |
| 513 | ut_d(block->page.in_flush_list = TRUE); |
| 514 | block->page.oldest_modification = lsn; |
| 515 | |
| 516 | #ifdef UNIV_DEBUG_VALGRIND |
| 517 | void* p; |
| 518 | |
| 519 | if (block->page.size.is_compressed()) { |
| 520 | p = block->page.zip.data; |
| 521 | } else { |
| 522 | p = block->frame; |
| 523 | } |
| 524 | |
| 525 | UNIV_MEM_ASSERT_RW(p, block->page.size.physical()); |
| 526 | #endif /* UNIV_DEBUG_VALGRIND */ |
| 527 | |
| 528 | prev_b = NULL; |
| 529 | |
| 530 | /* For the most part when this function is called the flush_rbt |
| 531 | should not be NULL. In a very rare boundary case it is possible |
| 532 | that the flush_rbt has already been freed by the recovery thread |
| 533 | before the last page was hooked up in the flush_list by the |
| 534 | io-handler thread. In that case we'll just do a simple |
| 535 | linear search in the else block. */ |
| 536 | if (buf_pool->flush_rbt != NULL) { |
| 537 | |
| 538 | prev_b = buf_flush_insert_in_flush_rbt(&block->page); |
| 539 | |
| 540 | } else { |
| 541 | |
| 542 | b = UT_LIST_GET_FIRST(buf_pool->flush_list); |
| 543 | |
| 544 | while (b != NULL && b->oldest_modification |
| 545 | > block->page.oldest_modification) { |
| 546 | |
| 547 | ut_ad(b->in_flush_list); |
| 548 | prev_b = b; |
| 549 | b = UT_LIST_GET_NEXT(list, b); |
| 550 | } |
| 551 | } |
| 552 | |
| 553 | if (prev_b == NULL) { |
| 554 | UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page); |
| 555 | } else { |
| 556 | UT_LIST_INSERT_AFTER(buf_pool->flush_list, prev_b, &block->page); |
| 557 | } |
| 558 | |
| 559 | incr_flush_list_size_in_bytes(block, buf_pool); |
| 560 | |
| 561 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
| 562 | ut_a(buf_flush_validate_low(buf_pool)); |
| 563 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
| 564 | |
| 565 | buf_flush_list_mutex_exit(buf_pool); |
| 566 | } |
| 567 | |
| 568 | /********************************************************************//** |
| 569 | Returns TRUE if the file page block is immediately suitable for replacement, |
i.e., the transition FILE_PAGE => NOT_USED is allowed.
| 571 | @return TRUE if can replace immediately */ |
| 572 | ibool |
| 573 | buf_flush_ready_for_replace( |
| 574 | /*========================*/ |
| 575 | buf_page_t* bpage) /*!< in: buffer control block, must be |
| 576 | buf_page_in_file(bpage) and in the LRU list */ |
| 577 | { |
| 578 | #ifdef UNIV_DEBUG |
| 579 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 580 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 581 | #endif /* UNIV_DEBUG */ |
| 582 | ut_ad(mutex_own(buf_page_get_mutex(bpage))); |
| 583 | ut_ad(bpage->in_LRU_list); |
| 584 | |
| 585 | if (buf_page_in_file(bpage)) { |
| 586 | |
| 587 | return(bpage->oldest_modification == 0 |
| 588 | && bpage->buf_fix_count == 0 |
| 589 | && buf_page_get_io_fix(bpage) == BUF_IO_NONE); |
| 590 | } |
| 591 | |
ib::fatal() << "Buffer block " << bpage << " state " << bpage->state
<< " in the LRU list!";
| 594 | |
| 595 | return(FALSE); |
| 596 | } |
| 597 | |
| 598 | /********************************************************************//** |
| 599 | Returns true if the block is modified and ready for flushing. |
| 600 | @return true if can flush immediately */ |
| 601 | bool |
| 602 | buf_flush_ready_for_flush( |
| 603 | /*======================*/ |
| 604 | buf_page_t* bpage, /*!< in: buffer control block, must be |
| 605 | buf_page_in_file(bpage) */ |
| 606 | buf_flush_t flush_type)/*!< in: type of flush */ |
| 607 | { |
| 608 | #ifdef UNIV_DEBUG |
| 609 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 610 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 611 | #endif /* UNIV_DEBUG */ |
| 612 | |
| 613 | ut_a(buf_page_in_file(bpage)); |
| 614 | ut_ad(mutex_own(buf_page_get_mutex(bpage))); |
| 615 | ut_ad(flush_type < BUF_FLUSH_N_TYPES); |
| 616 | |
| 617 | if (bpage->oldest_modification == 0 |
| 618 | || buf_page_get_io_fix(bpage) != BUF_IO_NONE) { |
| 619 | return(false); |
| 620 | } |
| 621 | |
| 622 | ut_ad(bpage->in_flush_list); |
| 623 | |
| 624 | switch (flush_type) { |
| 625 | case BUF_FLUSH_LIST: |
| 626 | case BUF_FLUSH_LRU: |
| 627 | case BUF_FLUSH_SINGLE_PAGE: |
| 628 | return(true); |
| 629 | |
| 630 | case BUF_FLUSH_N_TYPES: |
| 631 | break; |
| 632 | } |
| 633 | |
| 634 | ut_error; |
| 635 | return(false); |
| 636 | } |
| 637 | |
| 638 | /********************************************************************//** |
| 639 | Remove a block from the flush list of modified blocks. */ |
| 640 | void |
| 641 | buf_flush_remove( |
| 642 | /*=============*/ |
| 643 | buf_page_t* bpage) /*!< in: pointer to the block in question */ |
| 644 | { |
| 645 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 646 | |
| 647 | #if 0 // FIXME: Rate-limit the output. Move this to the page cleaner? |
| 648 | if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)) { |
| 649 | service_manager_extend_timeout( |
| 650 | INNODB_EXTEND_TIMEOUT_INTERVAL, |
| 651 | "Flush and remove page with tablespace id %u" |
| 652 | ", Poolid " ULINTPF ", flush list length " ULINTPF, |
| 653 | bpage->space, buf_pool->instance_no, |
| 654 | UT_LIST_GET_LEN(buf_pool->flush_list)); |
| 655 | } |
| 656 | #endif |
| 657 | |
| 658 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 659 | ut_ad(mutex_own(buf_page_get_mutex(bpage))); |
| 660 | ut_ad(bpage->in_flush_list); |
| 661 | |
| 662 | buf_flush_list_mutex_enter(buf_pool); |
| 663 | |
| 664 | /* Important that we adjust the hazard pointer before removing |
| 665 | the bpage from flush list. */ |
| 666 | buf_pool->flush_hp.adjust(bpage); |
| 667 | |
| 668 | switch (buf_page_get_state(bpage)) { |
| 669 | case BUF_BLOCK_POOL_WATCH: |
| 670 | case BUF_BLOCK_ZIP_PAGE: |
| 671 | /* Clean compressed pages should not be on the flush list */ |
| 672 | case BUF_BLOCK_NOT_USED: |
| 673 | case BUF_BLOCK_READY_FOR_USE: |
| 674 | case BUF_BLOCK_MEMORY: |
| 675 | case BUF_BLOCK_REMOVE_HASH: |
| 676 | ut_error; |
| 677 | return; |
| 678 | case BUF_BLOCK_ZIP_DIRTY: |
| 679 | buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE); |
| 680 | UT_LIST_REMOVE(buf_pool->flush_list, bpage); |
| 681 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
| 682 | buf_LRU_insert_zip_clean(bpage); |
| 683 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
| 684 | break; |
| 685 | case BUF_BLOCK_FILE_PAGE: |
| 686 | UT_LIST_REMOVE(buf_pool->flush_list, bpage); |
| 687 | break; |
| 688 | } |
| 689 | |
| 690 | /* If the flush_rbt is active then delete from there as well. */ |
| 691 | if (buf_pool->flush_rbt != NULL) { |
| 692 | buf_flush_delete_from_flush_rbt(bpage); |
| 693 | } |
| 694 | |
| 695 | /* Must be done after we have removed it from the flush_rbt |
| 696 | because we assert on in_flush_list in comparison function. */ |
| 697 | ut_d(bpage->in_flush_list = FALSE); |
| 698 | |
| 699 | buf_pool->stat.flush_list_bytes -= bpage->size.physical(); |
| 700 | |
| 701 | bpage->oldest_modification = 0; |
| 702 | |
| 703 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
| 704 | ut_a(buf_flush_validate_skip(buf_pool)); |
| 705 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
| 706 | |
/* If there is an observer that wants to know whether the
asynchronous flushing was done, then notify it. */
| 709 | if (bpage->flush_observer != NULL) { |
| 710 | bpage->flush_observer->notify_remove(buf_pool, bpage); |
| 711 | |
| 712 | bpage->flush_observer = NULL; |
| 713 | } |
| 714 | |
| 715 | buf_flush_list_mutex_exit(buf_pool); |
| 716 | } |
| 717 | |
| 718 | /*******************************************************************//** |
| 719 | Relocates a buffer control block on the flush_list. |
| 720 | Note that it is assumed that the contents of bpage have already been |
| 721 | copied to dpage. |
| 722 | IMPORTANT: When this function is called bpage and dpage are not |
| 723 | exact copies of each other. For example, they both will have different |
| 724 | ::state. Also the ::list pointers in dpage may be stale. We need to |
| 725 | use the current list node (bpage) to do the list manipulation because |
| 726 | the list pointers could have changed between the time that we copied |
| 727 | the contents of bpage to the dpage and the flush list manipulation |
| 728 | below. */ |
| 729 | void |
| 730 | buf_flush_relocate_on_flush_list( |
| 731 | /*=============================*/ |
| 732 | buf_page_t* bpage, /*!< in/out: control block being moved */ |
| 733 | buf_page_t* dpage) /*!< in/out: destination block */ |
| 734 | { |
| 735 | buf_page_t* prev; |
| 736 | buf_page_t* prev_b = NULL; |
| 737 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 738 | |
| 739 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 740 | /* Must reside in the same buffer pool. */ |
| 741 | ut_ad(buf_pool == buf_pool_from_bpage(dpage)); |
| 742 | |
| 743 | ut_ad(mutex_own(buf_page_get_mutex(bpage))); |
| 744 | |
| 745 | buf_flush_list_mutex_enter(buf_pool); |
| 746 | |
| 747 | /* FIXME: At this point we have both buf_pool and flush_list |
| 748 | mutexes. Theoretically removal of a block from flush list is |
| 749 | only covered by flush_list mutex but currently we do |
| 750 | have buf_pool mutex in buf_flush_remove() therefore this block |
| 751 | is guaranteed to be in the flush list. We need to check if |
| 752 | this will work without the assumption of block removing code |
| 753 | having the buf_pool mutex. */ |
| 754 | ut_ad(bpage->in_flush_list); |
| 755 | ut_ad(dpage->in_flush_list); |
| 756 | |
| 757 | /* If recovery is active we must swap the control blocks in |
| 758 | the flush_rbt as well. */ |
| 759 | if (buf_pool->flush_rbt != NULL) { |
| 760 | buf_flush_delete_from_flush_rbt(bpage); |
| 761 | prev_b = buf_flush_insert_in_flush_rbt(dpage); |
| 762 | } |
| 763 | |
| 764 | /* Important that we adjust the hazard pointer before removing |
| 765 | the bpage from the flush list. */ |
| 766 | buf_pool->flush_hp.adjust(bpage); |
| 767 | |
| 768 | /* Must be done after we have removed it from the flush_rbt |
| 769 | because we assert on in_flush_list in comparison function. */ |
| 770 | ut_d(bpage->in_flush_list = FALSE); |
| 771 | |
| 772 | prev = UT_LIST_GET_PREV(list, bpage); |
| 773 | UT_LIST_REMOVE(buf_pool->flush_list, bpage); |
| 774 | |
| 775 | if (prev) { |
| 776 | ut_ad(prev->in_flush_list); |
| 777 | UT_LIST_INSERT_AFTER( buf_pool->flush_list, prev, dpage); |
| 778 | } else { |
| 779 | UT_LIST_ADD_FIRST(buf_pool->flush_list, dpage); |
| 780 | } |
| 781 | |
| 782 | /* Just an extra check. Previous in flush_list |
| 783 | should be the same control block as in flush_rbt. */ |
| 784 | ut_a(buf_pool->flush_rbt == NULL || prev_b == prev); |
| 785 | |
| 786 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
| 787 | ut_a(buf_flush_validate_low(buf_pool)); |
| 788 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
| 789 | |
| 790 | buf_flush_list_mutex_exit(buf_pool); |
| 791 | } |
| 792 | |
| 793 | /** Update the flush system data structures when a write is completed. |
| 794 | @param[in,out] bpage flushed page |
| 795 | @param[in] dblwr whether the doublewrite buffer was used */ |
| 796 | void buf_flush_write_complete(buf_page_t* bpage, bool dblwr) |
| 797 | { |
| 798 | buf_flush_t flush_type; |
| 799 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 800 | |
| 801 | ut_ad(bpage); |
| 802 | |
| 803 | buf_flush_remove(bpage); |
| 804 | |
| 805 | flush_type = buf_page_get_flush_type(bpage); |
| 806 | buf_pool->n_flush[flush_type]--; |
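/* Since n_flush is unsigned, an underflow in the decrement above
would wrap around to ULINT_MAX; the assertion below catches that. */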
| 807 | ut_ad(buf_pool->n_flush[flush_type] != ULINT_MAX); |
| 808 | |
| 809 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 810 | |
| 811 | if (buf_pool->n_flush[flush_type] == 0 |
| 812 | && buf_pool->init_flush[flush_type] == FALSE) { |
| 813 | |
| 814 | /* The running flush batch has ended */ |
| 815 | |
| 816 | os_event_set(buf_pool->no_flush[flush_type]); |
| 817 | } |
| 818 | |
| 819 | if (dblwr) { |
| 820 | buf_dblwr_update(bpage, flush_type); |
| 821 | } |
| 822 | } |
| 823 | |
/** Calculate the checksum of a page of a compressed table and update
the page.
| 826 | @param[in,out] page page to update |
| 827 | @param[in] size compressed page size |
| 828 | @param[in] lsn LSN to stamp on the page */ |
| 829 | void |
| 830 | buf_flush_update_zip_checksum( |
| 831 | buf_frame_t* page, |
| 832 | ulint size, |
| 833 | lsn_t lsn) |
| 834 | { |
| 835 | ut_a(size > 0); |
| 836 | |
| 837 | const uint32_t checksum = page_zip_calc_checksum( |
| 838 | page, size, |
| 839 | static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm)); |
| 840 | |
| 841 | mach_write_to_8(page + FIL_PAGE_LSN, lsn); |
| 842 | mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum); |
| 843 | } |
| 844 | |
| 845 | /** Initialize a page for writing to the tablespace. |
| 846 | @param[in] block buffer block; NULL if bypassing the buffer pool |
| 847 | @param[in,out] page page frame |
| 848 | @param[in,out] page_zip_ compressed page, or NULL if uncompressed |
| 849 | @param[in] newest_lsn newest modification LSN to the page */ |
| 850 | void |
| 851 | buf_flush_init_for_writing( |
| 852 | const buf_block_t* block, |
| 853 | byte* page, |
| 854 | void* page_zip_, |
| 855 | lsn_t newest_lsn) |
| 856 | { |
| 857 | ut_ad(block == NULL || block->frame == page); |
| 858 | ut_ad(block == NULL || page_zip_ == NULL |
| 859 | || &block->page.zip == page_zip_); |
| 860 | ut_ad(page); |
| 861 | |
| 862 | if (page_zip_) { |
| 863 | page_zip_des_t* page_zip; |
| 864 | ulint size; |
| 865 | |
| 866 | page_zip = static_cast<page_zip_des_t*>(page_zip_); |
| 867 | size = page_zip_get_size(page_zip); |
| 868 | |
| 869 | ut_ad(size); |
| 870 | ut_ad(ut_is_2pow(size)); |
| 871 | ut_ad(size <= UNIV_ZIP_SIZE_MAX); |
| 872 | |
| 873 | switch (fil_page_get_type(page)) { |
| 874 | case FIL_PAGE_TYPE_ALLOCATED: |
| 875 | case FIL_PAGE_INODE: |
| 876 | case FIL_PAGE_IBUF_BITMAP: |
| 877 | case FIL_PAGE_TYPE_FSP_HDR: |
| 878 | case FIL_PAGE_TYPE_XDES: |
| 879 | /* These are essentially uncompressed pages. */ |
| 880 | memcpy(page_zip->data, page, size); |
| 881 | /* fall through */ |
| 882 | case FIL_PAGE_TYPE_ZBLOB: |
| 883 | case FIL_PAGE_TYPE_ZBLOB2: |
| 884 | case FIL_PAGE_INDEX: |
| 885 | case FIL_PAGE_RTREE: |
| 886 | |
| 887 | buf_flush_update_zip_checksum( |
| 888 | page_zip->data, size, newest_lsn); |
| 889 | |
| 890 | return; |
| 891 | } |
| 892 | |
ib::error() << "The compressed page to be written"
" seems corrupt:";
ut_print_buf(stderr, page, size);
fputs("\nInnoDB: Possibly older version of the page:", stderr);
| 897 | ut_print_buf(stderr, page_zip->data, size); |
| 898 | putc('\n', stderr); |
| 899 | ut_error; |
| 900 | } |
| 901 | |
| 902 | /* Write the newest modification lsn to the page header and trailer */ |
| 903 | mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn); |
| 904 | |
| 905 | mach_write_to_8(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, |
| 906 | newest_lsn); |
| 907 | |
| 908 | if (block && srv_page_size == 16384) { |
| 909 | /* The page type could be garbage in old files |
| 910 | created before MySQL 5.5. Such files always |
| 911 | had a page size of 16 kilobytes. */ |
| 912 | ulint page_type = fil_page_get_type(page); |
| 913 | ulint reset_type = page_type; |
| 914 | |
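/* In a tablespace of 16KiB pages, every 16384th page (0, 16384,
32768, ...) is an FSP_HDR or XDES page and the page right after it
(1, 16385, ...) is an ibuf bitmap page, hence the modulo below. */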
| 915 | switch (block->page.id.page_no() % 16384) { |
| 916 | case 0: |
| 917 | reset_type = block->page.id.page_no() == 0 |
| 918 | ? FIL_PAGE_TYPE_FSP_HDR |
| 919 | : FIL_PAGE_TYPE_XDES; |
| 920 | break; |
| 921 | case 1: |
| 922 | reset_type = FIL_PAGE_IBUF_BITMAP; |
| 923 | break; |
| 924 | case FSP_TRX_SYS_PAGE_NO: |
| 925 | if (block->page.id.page_no() |
| 926 | == TRX_SYS_PAGE_NO |
| 927 | && block->page.id.space() |
| 928 | == TRX_SYS_SPACE) { |
| 929 | reset_type = FIL_PAGE_TYPE_TRX_SYS; |
| 930 | break; |
| 931 | } |
| 932 | /* fall through */ |
| 933 | default: |
| 934 | switch (page_type) { |
| 935 | case FIL_PAGE_INDEX: |
| 936 | case FIL_PAGE_TYPE_INSTANT: |
| 937 | case FIL_PAGE_RTREE: |
| 938 | case FIL_PAGE_UNDO_LOG: |
| 939 | case FIL_PAGE_INODE: |
| 940 | case FIL_PAGE_IBUF_FREE_LIST: |
| 941 | case FIL_PAGE_TYPE_ALLOCATED: |
| 942 | case FIL_PAGE_TYPE_SYS: |
| 943 | case FIL_PAGE_TYPE_TRX_SYS: |
| 944 | case FIL_PAGE_TYPE_BLOB: |
| 945 | case FIL_PAGE_TYPE_ZBLOB: |
| 946 | case FIL_PAGE_TYPE_ZBLOB2: |
| 947 | break; |
| 948 | case FIL_PAGE_TYPE_FSP_HDR: |
| 949 | case FIL_PAGE_TYPE_XDES: |
| 950 | case FIL_PAGE_IBUF_BITMAP: |
| 951 | /* These pages should have |
| 952 | predetermined page numbers |
| 953 | (see above). */ |
| 954 | default: |
| 955 | reset_type = FIL_PAGE_TYPE_UNKNOWN; |
| 956 | break; |
| 957 | } |
| 958 | } |
| 959 | |
| 960 | if (UNIV_UNLIKELY(page_type != reset_type)) { |
| 961 | ib::info() |
| 962 | << "Resetting invalid page " |
| 963 | << block->page.id << " type " |
| 964 | << page_type << " to " |
<< reset_type << " when flushing.";
| 966 | fil_page_set_type(page, reset_type); |
| 967 | } |
| 968 | } |
| 969 | |
| 970 | uint32_t checksum= 0; |
| 971 | |
| 972 | switch (srv_checksum_algorithm_t(srv_checksum_algorithm)) { |
| 973 | case SRV_CHECKSUM_ALGORITHM_INNODB: |
| 974 | case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: |
| 975 | checksum = buf_calc_page_new_checksum(page); |
| 976 | mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, |
| 977 | checksum); |
| 978 | /* With the InnoDB checksum, we overwrite the first 4 bytes of |
| 979 | the end lsn field to store the old formula checksum. Since it |
| 980 | depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to |
| 981 | be calculated after storing the new formula checksum. */ |
| 982 | checksum = buf_calc_page_old_checksum(page); |
| 983 | break; |
| 984 | case SRV_CHECKSUM_ALGORITHM_CRC32: |
| 985 | case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: |
| 986 | /* In other cases we write the same checksum to both fields. */ |
| 987 | checksum = buf_calc_page_crc32(page); |
| 988 | mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, |
| 989 | checksum); |
| 990 | break; |
| 991 | case SRV_CHECKSUM_ALGORITHM_NONE: |
| 992 | case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: |
| 993 | checksum = BUF_NO_CHECKSUM_MAGIC; |
| 994 | mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, |
| 995 | checksum); |
| 996 | break; |
| 997 | /* no default so the compiler will emit a warning if |
| 998 | new enum is added and not handled here */ |
| 999 | } |
| 1000 | |
| 1001 | mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, |
| 1002 | checksum); |
| 1003 | } |
| 1004 | |
| 1005 | /********************************************************************//** |
| 1006 | Does an asynchronous write of a buffer page. NOTE: in simulated aio and |
| 1007 | also when the doublewrite buffer is used, we must call |
| 1008 | buf_dblwr_flush_buffered_writes after we have posted a batch of |
| 1009 | writes! */ |
| 1010 | static |
| 1011 | void |
| 1012 | buf_flush_write_block_low( |
| 1013 | /*======================*/ |
| 1014 | buf_page_t* bpage, /*!< in: buffer block to write */ |
| 1015 | buf_flush_t flush_type, /*!< in: type of flush */ |
| 1016 | bool sync) /*!< in: true if sync IO request */ |
| 1017 | { |
| 1018 | fil_space_t* space = fil_space_acquire_for_io(bpage->id.space()); |
| 1019 | if (!space) { |
| 1020 | return; |
| 1021 | } |
| 1022 | ut_ad(space->purpose == FIL_TYPE_TEMPORARY |
| 1023 | || space->purpose == FIL_TYPE_IMPORT |
| 1024 | || space->purpose == FIL_TYPE_TABLESPACE); |
| 1025 | ut_ad((space->purpose == FIL_TYPE_TEMPORARY) |
| 1026 | == (space == fil_system.temp_space)); |
| 1027 | page_t* frame = NULL; |
| 1028 | #ifdef UNIV_DEBUG |
| 1029 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 1030 | ut_ad(!buf_pool_mutex_own(buf_pool)); |
| 1031 | #endif /* UNIV_DEBUG */ |
| 1032 | |
DBUG_PRINT("ib_buf", ("flush %s %u page %u:%u",
sync ? "sync" : "async", (unsigned) flush_type,
bpage->id.space(), bpage->id.page_no()));
| 1036 | |
| 1037 | ut_ad(buf_page_in_file(bpage)); |
| 1038 | |
| 1039 | /* We are not holding buf_pool->mutex or block_mutex here. |
| 1040 | Nevertheless, it is safe to access bpage, because it is |
| 1041 | io_fixed and oldest_modification != 0. Thus, it cannot be |
| 1042 | relocated in the buffer pool or removed from flush_list or |
| 1043 | LRU_list. */ |
| 1044 | ut_ad(!buf_pool_mutex_own(buf_pool)); |
| 1045 | ut_ad(!buf_flush_list_mutex_own(buf_pool)); |
| 1046 | ut_ad(!buf_page_get_mutex(bpage)->is_owned()); |
| 1047 | ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE); |
| 1048 | ut_ad(bpage->oldest_modification != 0); |
| 1049 | |
| 1050 | #ifdef UNIV_IBUF_COUNT_DEBUG |
| 1051 | ut_a(ibuf_count_get(bpage->id) == 0); |
| 1052 | #endif /* UNIV_IBUF_COUNT_DEBUG */ |
| 1053 | |
| 1054 | ut_ad(bpage->newest_modification != 0); |
| 1055 | |
| 1056 | /* Force the log to the disk before writing the modified block */ |
| 1057 | if (!srv_read_only_mode) { |
| 1058 | log_write_up_to(bpage->newest_modification, true); |
| 1059 | } |
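
/* This enforces the write-ahead-logging protocol: the redo log must
be durable up to newest_modification before the new version of the
page may overwrite the old one on disk. */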
| 1060 | |
| 1061 | switch (buf_page_get_state(bpage)) { |
| 1062 | case BUF_BLOCK_POOL_WATCH: |
| 1063 | case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */ |
| 1064 | case BUF_BLOCK_NOT_USED: |
| 1065 | case BUF_BLOCK_READY_FOR_USE: |
| 1066 | case BUF_BLOCK_MEMORY: |
| 1067 | case BUF_BLOCK_REMOVE_HASH: |
| 1068 | ut_error; |
| 1069 | break; |
| 1070 | case BUF_BLOCK_ZIP_DIRTY: |
| 1071 | frame = bpage->zip.data; |
| 1072 | |
| 1073 | mach_write_to_8(frame + FIL_PAGE_LSN, |
| 1074 | bpage->newest_modification); |
| 1075 | |
| 1076 | ut_a(page_zip_verify_checksum(frame, bpage->size.physical())); |
| 1077 | break; |
| 1078 | case BUF_BLOCK_FILE_PAGE: |
| 1079 | frame = bpage->zip.data; |
| 1080 | if (!frame) { |
| 1081 | frame = ((buf_block_t*) bpage)->frame; |
| 1082 | } |
| 1083 | |
| 1084 | buf_flush_init_for_writing( |
| 1085 | reinterpret_cast<const buf_block_t*>(bpage), |
| 1086 | reinterpret_cast<const buf_block_t*>(bpage)->frame, |
| 1087 | bpage->zip.data ? &bpage->zip : NULL, |
| 1088 | bpage->newest_modification); |
| 1089 | break; |
| 1090 | } |
| 1091 | |
| 1092 | frame = buf_page_encrypt_before_write(space, bpage, frame); |
| 1093 | |
| 1094 | ut_ad(space->purpose == FIL_TYPE_TABLESPACE |
| 1095 | || space->atomic_write_supported); |
| 1096 | if (!space->use_doublewrite()) { |
| 1097 | ulint type = IORequest::WRITE | IORequest::DO_NOT_WAKE; |
| 1098 | |
| 1099 | IORequest request(type, bpage); |
| 1100 | |
| 1101 | /* TODO: pass the tablespace to fil_io() */ |
| 1102 | fil_io(request, |
| 1103 | sync, bpage->id, bpage->size, 0, bpage->size.physical(), |
| 1104 | frame, bpage); |
| 1105 | } else { |
| 1106 | ut_ad(!srv_read_only_mode); |
| 1107 | |
| 1108 | if (flush_type == BUF_FLUSH_SINGLE_PAGE) { |
| 1109 | buf_dblwr_write_single_page(bpage, sync); |
| 1110 | } else { |
| 1111 | ut_ad(!sync); |
| 1112 | buf_dblwr_add_to_batch(bpage); |
| 1113 | } |
| 1114 | } |
| 1115 | |
| 1116 | /* When doing single page flushing the IO is done synchronously |
| 1117 | and we flush the changes to disk only for the tablespace we |
| 1118 | are working on. */ |
| 1119 | if (sync) { |
| 1120 | ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE); |
| 1121 | if (space->purpose != FIL_TYPE_TEMPORARY) { |
| 1122 | fil_flush(space); |
| 1123 | } |
| 1124 | |
| 1125 | /* The tablespace could already have been dropped, |
| 1126 | because fil_io(request, sync) would already have |
| 1127 | decremented the node->n_pending. However, |
| 1128 | buf_page_io_complete() only needs to look up the |
| 1129 | tablespace during read requests, not during writes. */ |
| 1130 | ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE); |
| 1131 | #ifdef UNIV_DEBUG |
| 1132 | dberr_t err = |
| 1133 | #endif |
| 1134 | /* true means we want to evict this page from the |
| 1135 | LRU list as well. */ |
| 1136 | buf_page_io_complete(bpage, space->use_doublewrite(), true); |
| 1137 | |
| 1138 | ut_ad(err == DB_SUCCESS); |
| 1139 | } |
| 1140 | |
| 1141 | space->release_for_io(); |
| 1142 | |
| 1143 | /* Increment the counter of I/O operations used |
| 1144 | for selecting LRU policy. */ |
| 1145 | buf_LRU_stat_inc_io(); |
| 1146 | } |
| 1147 | |
| 1148 | /********************************************************************//** |
| 1149 | Writes a flushable page asynchronously from the buffer pool to a file. |
| 1150 | NOTE: in simulated aio we must call |
| 1151 | os_aio_simulated_wake_handler_threads after we have posted a batch of |
| 1152 | writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be |
| 1153 | held upon entering this function, and they will be released by this |
| 1154 | function if it returns true. |
| 1155 | @return TRUE if the page was flushed */ |
| 1156 | ibool |
| 1157 | buf_flush_page( |
| 1158 | /*===========*/ |
| 1159 | buf_pool_t* buf_pool, /*!< in: buffer pool instance */ |
| 1160 | buf_page_t* bpage, /*!< in: buffer control block */ |
| 1161 | buf_flush_t flush_type, /*!< in: type of flush */ |
| 1162 | bool sync) /*!< in: true if sync IO request */ |
| 1163 | { |
| 1164 | BPageMutex* block_mutex; |
| 1165 | |
| 1166 | ut_ad(flush_type < BUF_FLUSH_N_TYPES); |
| 1167 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1168 | ut_ad(buf_page_in_file(bpage)); |
| 1169 | ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE); |
| 1170 | |
| 1171 | block_mutex = buf_page_get_mutex(bpage); |
| 1172 | ut_ad(mutex_own(block_mutex)); |
| 1173 | |
| 1174 | ut_ad(buf_flush_ready_for_flush(bpage, flush_type)); |
| 1175 | |
| 1176 | bool is_uncompressed; |
| 1177 | |
| 1178 | is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); |
| 1179 | ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex)); |
| 1180 | |
| 1181 | ibool flush; |
| 1182 | rw_lock_t* rw_lock; |
| 1183 | bool no_fix_count = bpage->buf_fix_count == 0; |
| 1184 | |
| 1185 | if (!is_uncompressed) { |
| 1186 | flush = TRUE; |
| 1187 | rw_lock = NULL; |
| 1188 | } else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST) |
| 1189 | || (!no_fix_count |
| 1190 | && srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP |
| 1191 | && fsp_is_system_temporary(bpage->id.space()))) { |
/* This is a heuristic, to avoid expensive SX lock attempts. */
/* For a table residing in the temporary tablespace, sync is done
using IO_FIX, so before scheduling the page for flushing make sure
that it is not buffer-fixed. */
| 1196 | flush = FALSE; |
| 1197 | } else { |
| 1198 | rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock; |
| 1199 | if (flush_type != BUF_FLUSH_LIST) { |
| 1200 | flush = rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE); |
| 1201 | } else { |
| 1202 | /* Will SX lock later */ |
| 1203 | flush = TRUE; |
| 1204 | } |
| 1205 | } |
| 1206 | |
| 1207 | if (flush) { |
| 1208 | |
| 1209 | /* We are committed to flushing by the time we get here */ |
| 1210 | |
| 1211 | buf_page_set_io_fix(bpage, BUF_IO_WRITE); |
| 1212 | |
| 1213 | buf_page_set_flush_type(bpage, flush_type); |
| 1214 | |
| 1215 | if (buf_pool->n_flush[flush_type] == 0) { |
| 1216 | os_event_reset(buf_pool->no_flush[flush_type]); |
| 1217 | } |
| 1218 | |
| 1219 | ++buf_pool->n_flush[flush_type]; |
| 1220 | ut_ad(buf_pool->n_flush[flush_type] != 0); |
| 1221 | |
| 1222 | mutex_exit(block_mutex); |
| 1223 | |
| 1224 | buf_pool_mutex_exit(buf_pool); |
| 1225 | |
| 1226 | if (flush_type == BUF_FLUSH_LIST |
| 1227 | && is_uncompressed |
| 1228 | && !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) { |
| 1229 | |
| 1230 | if (!fsp_is_system_temporary(bpage->id.space())) { |
/* To avoid a possible deadlock involving the
doublewrite buffer, flush it first, because it
might be holding another block->lock. */
| 1234 | buf_dblwr_flush_buffered_writes(); |
| 1235 | } else { |
| 1236 | buf_dblwr_sync_datafiles(); |
| 1237 | } |
| 1238 | |
| 1239 | rw_lock_sx_lock_gen(rw_lock, BUF_IO_WRITE); |
| 1240 | } |
| 1241 | |
/* If there is an observer that wants to know whether the
asynchronous flushing was sent, then notify it.
Note: we set the flush observer on a page with x-latch, so we can
guarantee that notify_flush and notify_remove are called in pairs
with s-latch on an uncompressed page. */
| 1247 | if (bpage->flush_observer != NULL) { |
| 1248 | buf_pool_mutex_enter(buf_pool); |
| 1249 | |
| 1250 | bpage->flush_observer->notify_flush(buf_pool, bpage); |
| 1251 | |
| 1252 | buf_pool_mutex_exit(buf_pool); |
| 1253 | } |
| 1254 | |
| 1255 | /* Even though bpage is not protected by any mutex at this |
| 1256 | point, it is safe to access bpage, because it is io_fixed and |
| 1257 | oldest_modification != 0. Thus, it cannot be relocated in the |
| 1258 | buffer pool or removed from flush_list or LRU_list. */ |
| 1259 | |
| 1260 | buf_flush_write_block_low(bpage, flush_type, sync); |
| 1261 | } |
| 1262 | |
| 1263 | return(flush); |
| 1264 | } |
| 1265 | |
| 1266 | # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG |
| 1267 | /********************************************************************//** |
| 1268 | Writes a flushable page asynchronously from the buffer pool to a file. |
| 1269 | NOTE: buf_pool->mutex and block->mutex must be held upon entering this |
| 1270 | function, and they will be released by this function after flushing. |
| 1271 | This is loosely based on buf_flush_batch() and buf_flush_page(). |
| 1272 | @return TRUE if the page was flushed and the mutexes released */ |
| 1273 | ibool |
| 1274 | buf_flush_page_try( |
| 1275 | /*===============*/ |
| 1276 | buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ |
| 1277 | buf_block_t* block) /*!< in/out: buffer control block */ |
| 1278 | { |
| 1279 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1280 | ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); |
| 1281 | ut_ad(buf_page_mutex_own(block)); |
| 1282 | |
| 1283 | if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) { |
| 1284 | return(FALSE); |
| 1285 | } |
| 1286 | |
| 1287 | /* The following call will release the buffer pool and |
| 1288 | block mutex. */ |
| 1289 | return(buf_flush_page( |
| 1290 | buf_pool, &block->page, |
| 1291 | BUF_FLUSH_SINGLE_PAGE, true)); |
| 1292 | } |
| 1293 | # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ |
| 1294 | |
/** Check whether the page is in the buffer pool and can be flushed.
| 1296 | @param[in] page_id page id |
| 1297 | @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST |
| 1298 | @return true if the page can be flushed. */ |
| 1299 | static |
| 1300 | bool |
| 1301 | buf_flush_check_neighbor( |
| 1302 | const page_id_t& page_id, |
| 1303 | buf_flush_t flush_type) |
| 1304 | { |
| 1305 | buf_page_t* bpage; |
| 1306 | buf_pool_t* buf_pool = buf_pool_get(page_id); |
| 1307 | bool ret; |
| 1308 | |
| 1309 | ut_ad(flush_type == BUF_FLUSH_LRU |
| 1310 | || flush_type == BUF_FLUSH_LIST); |
| 1311 | |
| 1312 | buf_pool_mutex_enter(buf_pool); |
| 1313 | |
| 1314 | /* We only want to flush pages from this buffer pool. */ |
| 1315 | bpage = buf_page_hash_get(buf_pool, page_id); |
| 1316 | |
| 1317 | if (!bpage) { |
| 1318 | |
| 1319 | buf_pool_mutex_exit(buf_pool); |
| 1320 | return(false); |
| 1321 | } |
| 1322 | |
| 1323 | ut_a(buf_page_in_file(bpage)); |
| 1324 | |
| 1325 | /* We avoid flushing 'non-old' blocks in an LRU flush, |
| 1326 | because the flushed blocks are soon freed */ |
| 1327 | |
| 1328 | ret = false; |
| 1329 | if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) { |
| 1330 | BPageMutex* block_mutex = buf_page_get_mutex(bpage); |
| 1331 | |
| 1332 | mutex_enter(block_mutex); |
| 1333 | if (buf_flush_ready_for_flush(bpage, flush_type)) { |
| 1334 | ret = true; |
| 1335 | } |
| 1336 | mutex_exit(block_mutex); |
| 1337 | } |
| 1338 | buf_pool_mutex_exit(buf_pool); |
| 1339 | |
| 1340 | return(ret); |
| 1341 | } |
| 1342 | |
| 1343 | /** Flushes to disk all flushable pages within the flush area. |
| 1344 | @param[in] page_id page id |
| 1345 | @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST |
| 1346 | @param[in] n_flushed number of pages flushed so far in this batch |
| 1347 | @param[in] n_to_flush maximum number of pages we are allowed to flush |
| 1348 | @return number of pages flushed */ |
| 1349 | static |
| 1350 | ulint |
| 1351 | buf_flush_try_neighbors( |
| 1352 | const page_id_t& page_id, |
| 1353 | buf_flush_t flush_type, |
| 1354 | ulint n_flushed, |
| 1355 | ulint n_to_flush) |
| 1356 | { |
| 1357 | ulint i; |
| 1358 | ulint low; |
| 1359 | ulint high; |
| 1360 | ulint count = 0; |
| 1361 | buf_pool_t* buf_pool = buf_pool_get(page_id); |
| 1362 | |
| 1363 | ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); |
| 1364 | |
| 1365 | if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN |
| 1366 | || srv_flush_neighbors == 0) { |
| 1367 | /* If there is little space or neighbor flushing is |
| 1368 | not enabled then just flush the victim. */ |
| 1369 | low = page_id.page_no(); |
| 1370 | high = page_id.page_no() + 1; |
| 1371 | } else { |
| 1372 | /* When flushed, dirty blocks are searched in |
| 1373 | neighborhoods of this size, and flushed along with the |
| 1374 | original page. */ |
| 1375 | |
| 1376 | ulint buf_flush_area; |
| 1377 | |
| 1378 | buf_flush_area = ut_min( |
| 1379 | BUF_READ_AHEAD_AREA(buf_pool), |
| 1380 | buf_pool->curr_size / 16); |
| 1381 | |
| 1382 | low = (page_id.page_no() / buf_flush_area) * buf_flush_area; |
| 1383 | high = (page_id.page_no() / buf_flush_area + 1) * buf_flush_area; |
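
/* For example, with buf_flush_area == 64 and page_id.page_no() == 100,
low == 64 and high == 128: the whole 64-page aligned block that
contains the victim page. */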
| 1384 | |
| 1385 | if (srv_flush_neighbors == 1) { |
| 1386 | /* adjust 'low' and 'high' to limit |
| 1387 | for contiguous dirty area */ |
| 1388 | if (page_id.page_no() > low) { |
| 1389 | for (i = page_id.page_no() - 1; i >= low; i--) { |
| 1390 | if (!buf_flush_check_neighbor( |
| 1391 | page_id_t(page_id.space(), i), |
| 1392 | flush_type)) { |
| 1393 | |
| 1394 | break; |
| 1395 | } |
| 1396 | |
| 1397 | if (i == low) { |
/* Avoid unsigned wrap-around when low == 0,
which would end up calling
buf_flush_check_neighbor() with
i == (ulint) -1 */
| 1402 | i--; |
| 1403 | break; |
| 1404 | } |
| 1405 | } |
| 1406 | low = i + 1; |
| 1407 | } |
| 1408 | |
| 1409 | for (i = page_id.page_no() + 1; |
| 1410 | i < high |
| 1411 | && buf_flush_check_neighbor( |
| 1412 | page_id_t(page_id.space(), i), |
| 1413 | flush_type); |
| 1414 | i++) { |
| 1415 | /* do nothing */ |
| 1416 | } |
| 1417 | high = i; |
| 1418 | } |
| 1419 | } |
| 1420 | |
| 1421 | const ulint space_size = fil_space_get_size(page_id.space()); |
| 1422 | if (high > space_size) { |
| 1423 | high = space_size; |
| 1424 | } |
| 1425 | |
DBUG_PRINT("ib_buf", ("flush %u:%u..%u",
page_id.space(),
(unsigned) low, (unsigned) high));
| 1429 | |
| 1430 | for (ulint i = low; i < high; i++) { |
| 1431 | buf_page_t* bpage; |
| 1432 | |
| 1433 | if ((count + n_flushed) >= n_to_flush) { |
| 1434 | |
| 1435 | /* We have already flushed enough pages and |
| 1436 | should call it a day. There is, however, one |
| 1437 | exception. If the page whose neighbors we |
| 1438 | are flushing has not been flushed yet then |
| 1439 | we'll try to flush the victim that we |
| 1440 | selected originally. */ |
| 1441 | if (i <= page_id.page_no()) { |
| 1442 | i = page_id.page_no(); |
| 1443 | } else { |
| 1444 | break; |
| 1445 | } |
| 1446 | } |
| 1447 | |
| 1448 | const page_id_t cur_page_id(page_id.space(), i); |
| 1449 | |
| 1450 | buf_pool = buf_pool_get(cur_page_id); |
| 1451 | |
| 1452 | buf_pool_mutex_enter(buf_pool); |
| 1453 | |
| 1454 | /* We only want to flush pages from this buffer pool. */ |
| 1455 | bpage = buf_page_hash_get(buf_pool, cur_page_id); |
| 1456 | |
| 1457 | if (bpage == NULL) { |
| 1458 | |
| 1459 | buf_pool_mutex_exit(buf_pool); |
| 1460 | continue; |
| 1461 | } |
| 1462 | |
| 1463 | ut_a(buf_page_in_file(bpage)); |
| 1464 | |
| 1465 | /* We avoid flushing 'non-old' blocks in an LRU flush, |
| 1466 | because the flushed blocks are soon freed */ |
| 1467 | |
| 1468 | if (flush_type != BUF_FLUSH_LRU |
| 1469 | || i == page_id.page_no() |
| 1470 | || buf_page_is_old(bpage)) { |
| 1471 | |
| 1472 | BPageMutex* block_mutex = buf_page_get_mutex(bpage); |
| 1473 | |
| 1474 | mutex_enter(block_mutex); |
| 1475 | |
| 1476 | if (buf_flush_ready_for_flush(bpage, flush_type) |
| 1477 | && (i == page_id.page_no() |
| 1478 | || bpage->buf_fix_count == 0)) { |
| 1479 | |
| 1480 | /* We also try to flush those |
| 1481 | neighbors != offset */ |
| 1482 | |
| 1483 | if (buf_flush_page( |
| 1484 | buf_pool, bpage, flush_type, false)) { |
| 1485 | |
| 1486 | ++count; |
| 1487 | } else { |
| 1488 | mutex_exit(block_mutex); |
| 1489 | buf_pool_mutex_exit(buf_pool); |
| 1490 | } |
| 1491 | |
| 1492 | continue; |
| 1493 | } else { |
| 1494 | mutex_exit(block_mutex); |
| 1495 | } |
| 1496 | } |
| 1497 | buf_pool_mutex_exit(buf_pool); |
| 1498 | } |
| 1499 | |
| 1500 | if (count > 1) { |
| 1501 | MONITOR_INC_VALUE_CUMULATIVE( |
| 1502 | MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, |
| 1503 | MONITOR_FLUSH_NEIGHBOR_COUNT, |
| 1504 | MONITOR_FLUSH_NEIGHBOR_PAGES, |
| 1505 | (count - 1)); |
| 1506 | } |
| 1507 | |
| 1508 | return(count); |
| 1509 | } |
| 1510 | |
| 1511 | /** Check if the block is modified and ready for flushing. |
If the block is ready to flush then flush the page and try to flush
its neighbors.
| 1514 | @param[in] bpage buffer control block, |
| 1515 | must be buf_page_in_file(bpage) |
| 1516 | @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST |
| 1517 | @param[in] n_to_flush number of pages to flush |
| 1518 | @param[in,out] count number of pages flushed |
@return TRUE if the buf_pool mutex was released during this function.
This does not guarantee that any pages were written.
The number of pages written is added to *count. */
| 1522 | static |
| 1523 | bool |
| 1524 | buf_flush_page_and_try_neighbors( |
| 1525 | buf_page_t* bpage, |
| 1526 | buf_flush_t flush_type, |
| 1527 | ulint n_to_flush, |
| 1528 | ulint* count) |
| 1529 | { |
| 1530 | #ifdef UNIV_DEBUG |
| 1531 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 1532 | |
| 1533 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1534 | #endif /* UNIV_DEBUG */ |
| 1535 | |
| 1536 | bool flushed; |
| 1537 | BPageMutex* block_mutex = buf_page_get_mutex(bpage); |
| 1538 | |
| 1539 | mutex_enter(block_mutex); |
| 1540 | |
| 1541 | ut_a(buf_page_in_file(bpage)); |
| 1542 | |
| 1543 | if (buf_flush_ready_for_flush(bpage, flush_type)) { |
| 1544 | buf_pool_t* buf_pool; |
| 1545 | |
| 1546 | buf_pool = buf_pool_from_bpage(bpage); |
| 1547 | |
| 1548 | const page_id_t page_id = bpage->id; |
| 1549 | |
| 1550 | mutex_exit(block_mutex); |
| 1551 | |
| 1552 | buf_pool_mutex_exit(buf_pool); |
| 1553 | |
| 1554 | /* Try to flush also all the neighbors */ |
| 1555 | *count += buf_flush_try_neighbors( |
| 1556 | page_id, flush_type, *count, n_to_flush); |
| 1557 | |
| 1558 | buf_pool_mutex_enter(buf_pool); |
		flushed = true;
| 1560 | } else { |
| 1561 | mutex_exit(block_mutex); |
| 1562 | |
| 1563 | flushed = false; |
| 1564 | } |
| 1565 | |
| 1566 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1567 | |
| 1568 | return(flushed); |
| 1569 | } |
| 1570 | |
| 1571 | /*******************************************************************//** |
| 1572 | This utility moves the uncompressed frames of pages to the free list. |
| 1573 | Note that this function does not actually flush any data to disk. It |
| 1574 | just detaches the uncompressed frames from the compressed pages at the |
| 1575 | tail of the unzip_LRU and puts those freed frames in the free list. |
| 1576 | Note that it is a best effort attempt and it is not guaranteed that |
| 1577 | after a call to this function there will be 'max' blocks in the free |
| 1578 | list. |
| 1579 | @return number of blocks moved to the free list. */ |
| 1580 | static |
| 1581 | ulint |
| 1582 | buf_free_from_unzip_LRU_list_batch( |
| 1583 | /*===============================*/ |
| 1584 | buf_pool_t* buf_pool, /*!< in: buffer pool instance */ |
| 1585 | ulint max) /*!< in: desired number of |
| 1586 | blocks in the free_list */ |
| 1587 | { |
| 1588 | ulint scanned = 0; |
| 1589 | ulint count = 0; |
| 1590 | ulint free_len = UT_LIST_GET_LEN(buf_pool->free); |
| 1591 | ulint lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU); |
| 1592 | |
| 1593 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1594 | |
| 1595 | buf_block_t* block = UT_LIST_GET_LAST(buf_pool->unzip_LRU); |
| 1596 | |
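	/* Stop the scan once 'max' blocks have been freed, the free
	list has grown to srv_LRU_scan_depth, or the unzip_LRU has
	shrunk below one tenth of the LRU length. */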
| 1597 | while (block != NULL |
| 1598 | && count < max |
| 1599 | && free_len < srv_LRU_scan_depth |
| 1600 | && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) { |
| 1601 | |
| 1602 | ++scanned; |
| 1603 | if (buf_LRU_free_page(&block->page, false)) { |
| 1604 | /* Block was freed. buf_pool->mutex potentially |
| 1605 | released and reacquired */ |
| 1606 | ++count; |
| 1607 | block = UT_LIST_GET_LAST(buf_pool->unzip_LRU); |
| 1608 | |
| 1609 | } else { |
| 1610 | |
| 1611 | block = UT_LIST_GET_PREV(unzip_LRU, block); |
| 1612 | } |
| 1613 | |
| 1614 | free_len = UT_LIST_GET_LEN(buf_pool->free); |
| 1615 | lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU); |
| 1616 | } |
| 1617 | |
| 1618 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1619 | |
| 1620 | if (scanned) { |
| 1621 | MONITOR_INC_VALUE_CUMULATIVE( |
| 1622 | MONITOR_LRU_BATCH_SCANNED, |
| 1623 | MONITOR_LRU_BATCH_SCANNED_NUM_CALL, |
| 1624 | MONITOR_LRU_BATCH_SCANNED_PER_CALL, |
| 1625 | scanned); |
| 1626 | } |
| 1627 | |
| 1628 | return(count); |
| 1629 | } |
| 1630 | |
| 1631 | /*******************************************************************//** |
| 1632 | This utility flushes dirty blocks from the end of the LRU list. |
| 1633 | The calling thread is not allowed to own any latches on pages! |
| 1634 | It attempts to make 'max' blocks available in the free list. Note that |
| 1635 | it is a best effort attempt and it is not guaranteed that after a call |
| 1636 | to this function there will be 'max' blocks in the free list.*/ |
| 1637 | |
| 1638 | void |
| 1639 | buf_flush_LRU_list_batch( |
| 1640 | /*=====================*/ |
| 1641 | buf_pool_t* buf_pool, /*!< in: buffer pool instance */ |
| 1642 | ulint max, /*!< in: desired number of |
| 1643 | blocks in the free_list */ |
| 1644 | flush_counters_t* n) /*!< out: flushed/evicted page |
| 1645 | counts */ |
| 1646 | { |
| 1647 | buf_page_t* bpage; |
| 1648 | ulint scanned = 0; |
| 1649 | ulint free_len = UT_LIST_GET_LEN(buf_pool->free); |
| 1650 | ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU); |
| 1651 | ulint withdraw_depth = 0; |
| 1652 | |
| 1653 | n->flushed = 0; |
| 1654 | n->evicted = 0; |
| 1655 | n->unzip_LRU_evicted = 0; |
| 1656 | ut_ad(buf_pool_mutex_own(buf_pool)); |
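	/* When the buffer pool is being shrunk, additional blocks must
	be freed until the withdraw list reaches withdraw_target, so
	allow the free list to grow deeper by that amount. */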
| 1657 | if (buf_pool->curr_size < buf_pool->old_size |
| 1658 | && buf_pool->withdraw_target > 0) { |
| 1659 | withdraw_depth = buf_pool->withdraw_target |
| 1660 | - UT_LIST_GET_LEN(buf_pool->withdraw); |
| 1661 | } |
| 1662 | |
| 1663 | for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); |
| 1664 | bpage != NULL && n->flushed + n->evicted < max |
| 1665 | && free_len < srv_LRU_scan_depth + withdraw_depth |
| 1666 | && lru_len > BUF_LRU_MIN_LEN; |
| 1667 | ++scanned, |
| 1668 | bpage = buf_pool->lru_hp.get()) { |
| 1669 | |
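		/* Remember the predecessor as a hazard pointer:
		buf_LRU_free_page() and buf_flush_page_and_try_neighbors()
		may release buf_pool->mutex, and any thread removing that
		block from the LRU list must then adjust the hazard
		pointer. */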
| 1670 | buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); |
| 1671 | buf_pool->lru_hp.set(prev); |
| 1672 | |
| 1673 | BPageMutex* block_mutex = buf_page_get_mutex(bpage); |
| 1674 | |
| 1675 | mutex_enter(block_mutex); |
| 1676 | |
| 1677 | if (buf_flush_ready_for_replace(bpage)) { |
| 1678 | /* block is ready for eviction i.e., it is |
| 1679 | clean and is not IO-fixed or buffer fixed. */ |
| 1680 | mutex_exit(block_mutex); |
| 1681 | if (buf_LRU_free_page(bpage, true)) { |
| 1682 | ++n->evicted; |
| 1683 | } |
| 1684 | } else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_LRU)) { |
| 1685 | /* Block is ready for flush. Dispatch an IO |
| 1686 | request. The IO helper thread will put it on |
| 1687 | free list in IO completion routine. */ |
| 1688 | mutex_exit(block_mutex); |
| 1689 | buf_flush_page_and_try_neighbors( |
| 1690 | bpage, BUF_FLUSH_LRU, max, &n->flushed); |
| 1691 | } else { |
| 1692 | /* Can't evict or dispatch this block. Go to |
| 1693 | previous. */ |
| 1694 | ut_ad(buf_pool->lru_hp.is_hp(prev)); |
| 1695 | mutex_exit(block_mutex); |
| 1696 | } |
| 1697 | |
| 1698 | ut_ad(!mutex_own(block_mutex)); |
| 1699 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1700 | |
| 1701 | free_len = UT_LIST_GET_LEN(buf_pool->free); |
| 1702 | lru_len = UT_LIST_GET_LEN(buf_pool->LRU); |
| 1703 | } |
| 1704 | |
| 1705 | buf_pool->lru_hp.set(NULL); |
| 1706 | |
| 1707 | /* We keep track of all flushes happening as part of LRU |
| 1708 | flush. When estimating the desired rate at which flush_list |
| 1709 | should be flushed, we factor in this value. */ |
| 1710 | buf_lru_flush_page_count += n->flushed; |
| 1711 | |
| 1712 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1713 | |
| 1714 | if (n->evicted) { |
| 1715 | MONITOR_INC_VALUE_CUMULATIVE( |
| 1716 | MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, |
| 1717 | MONITOR_LRU_BATCH_EVICT_COUNT, |
| 1718 | MONITOR_LRU_BATCH_EVICT_PAGES, |
| 1719 | n->evicted); |
| 1720 | } |
| 1721 | |
| 1722 | if (scanned) { |
| 1723 | MONITOR_INC_VALUE_CUMULATIVE( |
| 1724 | MONITOR_LRU_BATCH_SCANNED, |
| 1725 | MONITOR_LRU_BATCH_SCANNED_NUM_CALL, |
| 1726 | MONITOR_LRU_BATCH_SCANNED_PER_CALL, |
| 1727 | scanned); |
| 1728 | } |
| 1729 | } |
| 1730 | |
| 1731 | /*******************************************************************//** |
| 1732 | Flush and move pages from LRU or unzip_LRU list to the free list. |
| 1733 | Whether LRU or unzip_LRU is used depends on the state of the system.*/ |
| 1734 | |
| 1735 | static |
| 1736 | void |
| 1737 | buf_do_LRU_batch( |
| 1738 | /*=============*/ |
| 1739 | buf_pool_t* buf_pool, /*!< in: buffer pool instance */ |
| 1740 | ulint max, /*!< in: desired number of |
| 1741 | blocks in the free_list */ |
| 1742 | flush_counters_t* n) /*!< out: flushed/evicted page |
| 1743 | counts */ |
| 1744 | { |
| 1745 | if (buf_LRU_evict_from_unzip_LRU(buf_pool)) { |
| 1746 | n->unzip_LRU_evicted = buf_free_from_unzip_LRU_list_batch(buf_pool, max); |
| 1747 | } else { |
| 1748 | n->unzip_LRU_evicted = 0; |
| 1749 | } |
| 1750 | |
| 1751 | if (max > n->unzip_LRU_evicted) { |
| 1752 | buf_flush_LRU_list_batch(buf_pool, max - n->unzip_LRU_evicted, n); |
| 1753 | } else { |
| 1754 | n->evicted = 0; |
| 1755 | n->flushed = 0; |
| 1756 | } |
| 1757 | |
| 1758 | /* Add evicted pages from unzip_LRU to the evicted pages from |
| 1759 | the simple LRU. */ |
| 1760 | n->evicted += n->unzip_LRU_evicted; |
| 1761 | } |
| 1762 | |
| 1763 | /** This utility flushes dirty blocks from the end of the flush_list. |
| 1764 | The calling thread is not allowed to own any latches on pages! |
| 1765 | @param[in] buf_pool buffer pool instance |
@param[in]	min_n		wished minimum number of blocks flushed (it is
| 1767 | not guaranteed that the actual number is that big, though) |
| 1768 | @param[in] lsn_limit all blocks whose oldest_modification is smaller |
| 1769 | than this should be flushed (if their number does not exceed min_n) |
| 1770 | @return number of blocks for which the write request was queued; |
| 1771 | ULINT_UNDEFINED if there was a flush of the same type already |
| 1772 | running */ |
| 1773 | static |
| 1774 | ulint |
| 1775 | buf_do_flush_list_batch( |
| 1776 | buf_pool_t* buf_pool, |
| 1777 | ulint min_n, |
| 1778 | lsn_t lsn_limit) |
| 1779 | { |
| 1780 | ulint count = 0; |
| 1781 | ulint scanned = 0; |
| 1782 | |
| 1783 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1784 | |
| 1785 | /* Start from the end of the list looking for a suitable |
| 1786 | block to be flushed. */ |
| 1787 | buf_flush_list_mutex_enter(buf_pool); |
| 1788 | ulint len = UT_LIST_GET_LEN(buf_pool->flush_list); |
| 1789 | |
| 1790 | /* In order not to degenerate this scan to O(n*n) we attempt |
| 1791 | to preserve pointer of previous block in the flush list. To do |
| 1792 | so we declare it a hazard pointer. Any thread working on the |
| 1793 | flush list must check the hazard pointer and if it is removing |
| 1794 | the same block then it must reset it. */ |
| 1795 | for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list); |
| 1796 | count < min_n && bpage != NULL && len > 0 |
| 1797 | && bpage->oldest_modification < lsn_limit; |
| 1798 | bpage = buf_pool->flush_hp.get(), |
| 1799 | ++scanned) { |
| 1800 | |
| 1801 | buf_page_t* prev; |
| 1802 | |
| 1803 | ut_a(bpage->oldest_modification > 0); |
| 1804 | ut_ad(bpage->in_flush_list); |
| 1805 | |
| 1806 | prev = UT_LIST_GET_PREV(list, bpage); |
| 1807 | buf_pool->flush_hp.set(prev); |
| 1808 | buf_flush_list_mutex_exit(buf_pool); |
| 1809 | |
| 1810 | #ifdef UNIV_DEBUG |
| 1811 | bool flushed = |
| 1812 | #endif /* UNIV_DEBUG */ |
| 1813 | buf_flush_page_and_try_neighbors( |
| 1814 | bpage, BUF_FLUSH_LIST, min_n, &count); |
| 1815 | |
| 1816 | buf_flush_list_mutex_enter(buf_pool); |
| 1817 | |
| 1818 | ut_ad(flushed || buf_pool->flush_hp.is_hp(prev)); |
| 1819 | |
| 1820 | --len; |
| 1821 | } |
| 1822 | |
| 1823 | buf_pool->flush_hp.set(NULL); |
| 1824 | buf_flush_list_mutex_exit(buf_pool); |
| 1825 | |
| 1826 | if (scanned) { |
| 1827 | MONITOR_INC_VALUE_CUMULATIVE( |
| 1828 | MONITOR_FLUSH_BATCH_SCANNED, |
| 1829 | MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, |
| 1830 | MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, |
| 1831 | scanned); |
| 1832 | } |
| 1833 | |
| 1834 | if (count) { |
| 1835 | MONITOR_INC_VALUE_CUMULATIVE( |
| 1836 | MONITOR_FLUSH_BATCH_TOTAL_PAGE, |
| 1837 | MONITOR_FLUSH_BATCH_COUNT, |
| 1838 | MONITOR_FLUSH_BATCH_PAGES, |
| 1839 | count); |
| 1840 | } |
| 1841 | |
| 1842 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1843 | |
| 1844 | return(count); |
| 1845 | } |
| 1846 | |
| 1847 | /** This utility flushes dirty blocks from the end of the LRU list or |
| 1848 | flush_list. |
| 1849 | NOTE 1: in the case of an LRU flush the calling thread may own latches to |
| 1850 | pages: to avoid deadlocks, this function must be written so that it cannot |
| 1851 | end up waiting for these latches! NOTE 2: in the case of a flush list flush, |
| 1852 | the calling thread is not allowed to own any latches on pages! |
| 1853 | @param[in] buf_pool buffer pool instance |
| 1854 | @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST; if |
| 1855 | BUF_FLUSH_LIST, then the caller must not own any latches on pages |
@param[in]	min_n		wished minimum number of blocks flushed (it is
| 1857 | not guaranteed that the actual number is that big, though) |
| 1858 | @param[in] lsn_limit in the case of BUF_FLUSH_LIST all blocks whose |
| 1859 | oldest_modification is smaller than this should be flushed (if their number |
| 1860 | does not exceed min_n), otherwise ignored */ |
| 1861 | static |
| 1862 | void |
| 1863 | buf_flush_batch( |
| 1864 | buf_pool_t* buf_pool, |
| 1865 | buf_flush_t flush_type, |
| 1866 | ulint min_n, |
| 1867 | lsn_t lsn_limit, |
| 1868 | flush_counters_t* n) /*!< out: flushed/evicted page |
| 1869 | counts */ |
| 1870 | { |
| 1871 | ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); |
| 1872 | ut_ad(flush_type == BUF_FLUSH_LRU |
| 1873 | || !sync_check_iterate(dict_sync_check())); |
| 1874 | |
| 1875 | buf_pool_mutex_enter(buf_pool); |
| 1876 | |
| 1877 | /* Note: The buffer pool mutex is released and reacquired within |
| 1878 | the flush functions. */ |
| 1879 | switch (flush_type) { |
| 1880 | case BUF_FLUSH_LRU: |
| 1881 | buf_do_LRU_batch(buf_pool, min_n, n); |
| 1882 | break; |
| 1883 | case BUF_FLUSH_LIST: |
| 1884 | n->flushed = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit); |
| 1885 | n->evicted = 0; |
| 1886 | break; |
| 1887 | default: |
| 1888 | ut_error; |
| 1889 | } |
| 1890 | |
| 1891 | buf_pool_mutex_exit(buf_pool); |
| 1892 | |
| 1893 | DBUG_LOG("ib_buf" , "flush " << flush_type << " completed" ); |
| 1894 | } |
| 1895 | |
| 1896 | /******************************************************************//** |
| 1897 | Gather the aggregated stats for both flush list and LRU list flushing. |
| 1898 | @param page_count_flush number of pages flushed from the end of the flush_list |
| 1899 | @param page_count_LRU number of pages flushed from the end of the LRU list |
| 1900 | */ |
| 1901 | static |
| 1902 | void |
| 1903 | buf_flush_stats( |
| 1904 | /*============*/ |
| 1905 | ulint page_count_flush, |
| 1906 | ulint page_count_LRU) |
| 1907 | { |
| 1908 | DBUG_PRINT("ib_buf" , ("flush completed, from flush_list %u pages, " |
| 1909 | "from LRU_list %u pages" , |
| 1910 | unsigned(page_count_flush), |
| 1911 | unsigned(page_count_LRU))); |
| 1912 | |
| 1913 | srv_stats.buf_pool_flushed.add(page_count_flush + page_count_LRU); |
| 1914 | } |
| 1915 | |
| 1916 | /******************************************************************//** |
| 1917 | Start a buffer flush batch for LRU or flush list */ |
| 1918 | static |
| 1919 | ibool |
| 1920 | buf_flush_start( |
| 1921 | /*============*/ |
| 1922 | buf_pool_t* buf_pool, /*!< buffer pool instance */ |
| 1923 | buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU |
| 1924 | or BUF_FLUSH_LIST */ |
| 1925 | { |
| 1926 | ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); |
| 1927 | |
| 1928 | buf_pool_mutex_enter(buf_pool); |
| 1929 | |
| 1930 | if (buf_pool->n_flush[flush_type] > 0 |
| 1931 | || buf_pool->init_flush[flush_type] == TRUE) { |
| 1932 | |
| 1933 | /* There is already a flush batch of the same type running */ |
| 1934 | |
| 1935 | buf_pool_mutex_exit(buf_pool); |
| 1936 | |
| 1937 | return(FALSE); |
| 1938 | } |
| 1939 | |
| 1940 | buf_pool->init_flush[flush_type] = TRUE; |
| 1941 | |
| 1942 | os_event_reset(buf_pool->no_flush[flush_type]); |
| 1943 | |
| 1944 | buf_pool_mutex_exit(buf_pool); |
| 1945 | |
| 1946 | return(TRUE); |
| 1947 | } |
| 1948 | |
| 1949 | /******************************************************************//** |
| 1950 | End a buffer flush batch for LRU or flush list */ |
| 1951 | static |
| 1952 | void |
| 1953 | buf_flush_end( |
| 1954 | /*==========*/ |
| 1955 | buf_pool_t* buf_pool, /*!< buffer pool instance */ |
| 1956 | buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU |
| 1957 | or BUF_FLUSH_LIST */ |
| 1958 | { |
| 1959 | buf_pool_mutex_enter(buf_pool); |
| 1960 | |
| 1961 | buf_pool->init_flush[flush_type] = FALSE; |
| 1962 | |
| 1963 | buf_pool->try_LRU_scan = TRUE; |
| 1964 | |
| 1965 | if (buf_pool->n_flush[flush_type] == 0) { |
| 1966 | |
| 1967 | /* The running flush batch has ended */ |
| 1968 | |
| 1969 | os_event_set(buf_pool->no_flush[flush_type]); |
| 1970 | } |
| 1971 | |
| 1972 | buf_pool_mutex_exit(buf_pool); |
| 1973 | |
| 1974 | if (!srv_read_only_mode) { |
| 1975 | buf_dblwr_flush_buffered_writes(); |
| 1976 | } else { |
| 1977 | os_aio_simulated_wake_handler_threads(); |
| 1978 | } |
| 1979 | } |
| 1980 | |
| 1981 | /******************************************************************//** |
| 1982 | Waits until a flush batch of the given type ends */ |
| 1983 | void |
| 1984 | buf_flush_wait_batch_end( |
| 1985 | /*=====================*/ |
| 1986 | buf_pool_t* buf_pool, /*!< buffer pool instance */ |
| 1987 | buf_flush_t type) /*!< in: BUF_FLUSH_LRU |
| 1988 | or BUF_FLUSH_LIST */ |
| 1989 | { |
| 1990 | ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST); |
| 1991 | |
| 1992 | if (buf_pool == NULL) { |
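		/* buf_pool == NULL means: wait for the batch to end in
		every buffer pool instance. */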
| 1993 | ulint i; |
| 1994 | |
| 1995 | for (i = 0; i < srv_buf_pool_instances; ++i) { |
| 1996 | buf_pool_t* buf_pool; |
| 1997 | |
| 1998 | buf_pool = buf_pool_from_array(i); |
| 1999 | |
| 2000 | thd_wait_begin(NULL, THD_WAIT_DISKIO); |
| 2001 | os_event_wait(buf_pool->no_flush[type]); |
| 2002 | thd_wait_end(NULL); |
| 2003 | } |
| 2004 | } else { |
| 2005 | thd_wait_begin(NULL, THD_WAIT_DISKIO); |
| 2006 | os_event_wait(buf_pool->no_flush[type]); |
| 2007 | thd_wait_end(NULL); |
| 2008 | } |
| 2009 | } |
| 2010 | |
| 2011 | /** Do flushing batch of a given type. |
| 2012 | NOTE: The calling thread is not allowed to own any latches on pages! |
| 2013 | @param[in,out] buf_pool buffer pool instance |
| 2014 | @param[in] type flush type |
@param[in]	min_n		wished minimum number of blocks flushed
| 2016 | (it is not guaranteed that the actual number is that big, though) |
| 2017 | @param[in] lsn_limit in the case BUF_FLUSH_LIST all blocks whose |
| 2018 | oldest_modification is smaller than this should be flushed (if their number |
| 2019 | does not exceed min_n), otherwise ignored |
| 2020 | @param[out] n_processed the number of pages which were processed is |
| 2021 | passed back to caller. Ignored if NULL |
| 2022 | @retval true if a batch was queued successfully. |
| 2023 | @retval false if another batch of same type was already running. */ |
| 2024 | bool |
| 2025 | buf_flush_do_batch( |
| 2026 | buf_pool_t* buf_pool, |
| 2027 | buf_flush_t type, |
| 2028 | ulint min_n, |
| 2029 | lsn_t lsn_limit, |
| 2030 | flush_counters_t* n) |
| 2031 | { |
| 2032 | ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST); |
| 2033 | |
| 2034 | if (n != NULL) { |
| 2035 | n->flushed = 0; |
| 2036 | } |
| 2037 | |
| 2038 | if (!buf_flush_start(buf_pool, type)) { |
| 2039 | return(false); |
| 2040 | } |
| 2041 | |
| 2042 | buf_flush_batch(buf_pool, type, min_n, lsn_limit, n); |
| 2043 | |
| 2044 | buf_flush_end(buf_pool, type); |
| 2045 | |
| 2046 | return(true); |
| 2047 | } |
| 2048 | /** |
Waits until all dirty blocks with oldest_modification smaller than
new_oldest have been flushed to disk.
| 2050 | @param[in] new_oldest target oldest_modified_lsn to wait for */ |
| 2051 | |
| 2052 | void |
| 2053 | buf_flush_wait_flushed( |
| 2054 | lsn_t new_oldest) |
| 2055 | { |
| 2056 | for (ulint i = 0; i < srv_buf_pool_instances; ++i) { |
| 2057 | buf_pool_t* buf_pool; |
| 2058 | lsn_t oldest; |
| 2059 | |
| 2060 | buf_pool = buf_pool_from_array(i); |
| 2061 | |
| 2062 | for (;;) { |
| 2063 | /* We don't need to wait for fsync of the flushed |
			blocks, because anyway we need fsync to make a checkpoint.
| 2065 | So, we don't need to wait for the batch end here. */ |
| 2066 | |
| 2067 | buf_flush_list_mutex_enter(buf_pool); |
| 2068 | |
| 2069 | buf_page_t* bpage; |
| 2070 | |
| 2071 | /* We don't need to wait for system temporary pages */ |
| 2072 | for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list); |
| 2073 | bpage != NULL |
| 2074 | && fsp_is_system_temporary(bpage->id.space()); |
| 2075 | bpage = UT_LIST_GET_PREV(list, bpage)) { |
| 2076 | /* Do nothing. */ |
| 2077 | } |
| 2078 | |
| 2079 | if (bpage != NULL) { |
| 2080 | ut_ad(bpage->in_flush_list); |
| 2081 | oldest = bpage->oldest_modification; |
| 2082 | } else { |
| 2083 | oldest = 0; |
| 2084 | } |
| 2085 | |
| 2086 | buf_flush_list_mutex_exit(buf_pool); |
| 2087 | |
| 2088 | if (oldest == 0 || oldest >= new_oldest) { |
| 2089 | break; |
| 2090 | } |
| 2091 | |
| 2092 | /* sleep and retry */ |
| 2093 | os_thread_sleep(buf_flush_wait_flushed_sleep_time); |
| 2094 | |
| 2095 | MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); |
| 2096 | } |
| 2097 | } |
| 2098 | } |
| 2099 | |
| 2100 | /** This utility flushes dirty blocks from the end of the flush list of all |
| 2101 | buffer pool instances. |
| 2102 | NOTE: The calling thread is not allowed to own any latches on pages! |
@param[in]	min_n		wished minimum number of blocks flushed (it is
| 2104 | not guaranteed that the actual number is that big, though) |
| 2105 | @param[in] lsn_limit in the case BUF_FLUSH_LIST all blocks whose |
| 2106 | oldest_modification is smaller than this should be flushed (if their number |
| 2107 | does not exceed min_n), otherwise ignored |
| 2108 | @param[out] n_processed the number of pages which were processed is |
| 2109 | passed back to caller. Ignored if NULL. |
@return true if a batch was queued successfully for each buffer pool
instance. false if another batch of the same type was already running in
at least one of the buffer pool instances */
| 2113 | bool |
| 2114 | buf_flush_lists( |
| 2115 | ulint min_n, |
| 2116 | lsn_t lsn_limit, |
| 2117 | ulint* n_processed) |
| 2118 | { |
| 2119 | ulint i; |
| 2120 | ulint n_flushed = 0; |
| 2121 | bool success = true; |
| 2122 | |
| 2123 | if (n_processed) { |
| 2124 | *n_processed = 0; |
| 2125 | } |
| 2126 | |
| 2127 | if (min_n != ULINT_MAX) { |
| 2128 | /* Ensure that flushing is spread evenly amongst the |
| 2129 | buffer pool instances. When min_n is ULINT_MAX |
| 2130 | we need to flush everything up to the lsn limit |
| 2131 | so no limit here. */ |
| 2132 | min_n = (min_n + srv_buf_pool_instances - 1) |
| 2133 | / srv_buf_pool_instances; |
| 2134 | } |
| 2135 | |
| 2136 | /* Flush to lsn_limit in all buffer pool instances */ |
| 2137 | for (i = 0; i < srv_buf_pool_instances; i++) { |
| 2138 | buf_pool_t* buf_pool; |
| 2139 | flush_counters_t n; |
| 2140 | |
| 2141 | memset(&n, 0, sizeof(flush_counters_t)); |
| 2142 | buf_pool = buf_pool_from_array(i); |
| 2143 | |
| 2144 | if (!buf_flush_do_batch(buf_pool, |
| 2145 | BUF_FLUSH_LIST, |
| 2146 | min_n, |
| 2147 | lsn_limit, |
| 2148 | &n)) { |
| 2149 | /* We have two choices here. If lsn_limit was |
| 2150 | specified then skipping an instance of buffer |
| 2151 | pool means we cannot guarantee that all pages |
			up to lsn_limit have been flushed. We can
| 2153 | return right now with failure or we can try |
| 2154 | to flush remaining buffer pools up to the |
| 2155 | lsn_limit. We attempt to flush other buffer |
| 2156 | pools based on the assumption that it will |
| 2157 | help in the retry which will follow the |
| 2158 | failure. */ |
| 2159 | success = false; |
| 2160 | |
| 2161 | } |
| 2162 | |
| 2163 | n_flushed += n.flushed; |
| 2164 | } |
| 2165 | |
| 2166 | if (n_flushed) { |
| 2167 | buf_flush_stats(n_flushed, 0); |
| 2168 | if (n_processed) { |
| 2169 | *n_processed = n_flushed; |
| 2170 | } |
| 2171 | } |
| 2172 | |
| 2173 | return(success); |
| 2174 | } |
| 2175 | |
| 2176 | /******************************************************************//** |
| 2177 | This function picks up a single page from the tail of the LRU |
| 2178 | list, flushes it (if it is dirty), removes it from page_hash and LRU |
| 2179 | list and puts it on the free list. It is called from user threads when |
| 2180 | they are unable to find a replaceable page at the tail of the LRU |
list, i.e. when the background LRU flushing in the page_cleaner thread
| 2182 | is not fast enough to keep pace with the workload. |
| 2183 | @return true if success. */ |
| 2184 | bool |
| 2185 | buf_flush_single_page_from_LRU( |
| 2186 | /*===========================*/ |
| 2187 | buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */ |
| 2188 | { |
| 2189 | ulint scanned; |
| 2190 | buf_page_t* bpage; |
	bool		freed;
| 2192 | |
| 2193 | buf_pool_mutex_enter(buf_pool); |
| 2194 | |
| 2195 | for (bpage = buf_pool->single_scan_itr.start(), scanned = 0, |
| 2196 | freed = false; |
| 2197 | bpage != NULL; |
| 2198 | ++scanned, bpage = buf_pool->single_scan_itr.get()) { |
| 2199 | |
| 2200 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 2201 | |
| 2202 | buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); |
| 2203 | buf_pool->single_scan_itr.set(prev); |
| 2204 | BPageMutex* block_mutex; |
| 2205 | |
| 2206 | block_mutex = buf_page_get_mutex(bpage); |
| 2207 | |
| 2208 | mutex_enter(block_mutex); |
| 2209 | |
| 2210 | if (buf_flush_ready_for_replace(bpage)) { |
| 2211 | /* block is ready for eviction i.e., it is |
| 2212 | clean and is not IO-fixed or buffer fixed. */ |
| 2213 | mutex_exit(block_mutex); |
| 2214 | |
| 2215 | if (buf_LRU_free_page(bpage, true)) { |
| 2216 | buf_pool_mutex_exit(buf_pool); |
| 2217 | freed = true; |
| 2218 | break; |
| 2219 | } |
| 2220 | |
| 2221 | } else if (buf_flush_ready_for_flush( |
| 2222 | bpage, BUF_FLUSH_SINGLE_PAGE)) { |
| 2223 | |
| 2224 | /* Block is ready for flush. Try and dispatch an IO |
| 2225 | request. We'll put it on free list in IO completion |
| 2226 | routine if it is not buffer fixed. The following call |
| 2227 | will release the buffer pool and block mutex. |
| 2228 | |
| 2229 | Note: There is no guarantee that this page has actually |
| 2230 | been freed, only that it has been flushed to disk */ |
| 2231 | |
| 2232 | freed = buf_flush_page( |
| 2233 | buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true); |
| 2234 | |
| 2235 | if (freed) { |
| 2236 | break; |
| 2237 | } |
| 2238 | |
| 2239 | mutex_exit(block_mutex); |
| 2240 | } else { |
| 2241 | mutex_exit(block_mutex); |
| 2242 | } |
| 2243 | ut_ad(!mutex_own(block_mutex)); |
| 2244 | } |
| 2245 | if (!freed) { |
| 2246 | /* Can't find a single flushable page. */ |
| 2247 | ut_ad(!bpage); |
| 2248 | buf_pool_mutex_exit(buf_pool); |
| 2249 | } |
| 2250 | |
| 2251 | if (scanned) { |
| 2252 | MONITOR_INC_VALUE_CUMULATIVE( |
| 2253 | MONITOR_LRU_SINGLE_FLUSH_SCANNED, |
| 2254 | MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL, |
| 2255 | MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL, |
| 2256 | scanned); |
| 2257 | } |
| 2258 | |
| 2259 | ut_ad(!buf_pool_mutex_own(buf_pool)); |
| 2260 | return(freed); |
| 2261 | } |
| 2262 | |
| 2263 | /** |
| 2264 | Clears up tail of the LRU list of a given buffer pool instance: |
| 2265 | * Put replaceable pages at the tail of LRU to the free list |
| 2266 | * Flush dirty pages at the tail of LRU to the disk |
| 2267 | The depth to which we scan each buffer pool is controlled by dynamic |
| 2268 | config parameter innodb_LRU_scan_depth. |
| 2269 | @param buf_pool buffer pool instance |
| 2270 | @return total pages flushed */ |
| 2271 | static |
| 2272 | ulint |
| 2273 | buf_flush_LRU_list( |
| 2274 | buf_pool_t* buf_pool) |
| 2275 | { |
| 2276 | ulint scan_depth, withdraw_depth; |
| 2277 | flush_counters_t n; |
| 2278 | |
| 2279 | memset(&n, 0, sizeof(flush_counters_t)); |
| 2280 | |
| 2281 | ut_ad(buf_pool); |
	/* srv_LRU_scan_depth can be an arbitrarily large value.
	We cap it with the current LRU size. */
| 2284 | buf_pool_mutex_enter(buf_pool); |
| 2285 | scan_depth = UT_LIST_GET_LEN(buf_pool->LRU); |
| 2286 | if (buf_pool->curr_size < buf_pool->old_size |
| 2287 | && buf_pool->withdraw_target > 0) { |
| 2288 | withdraw_depth = buf_pool->withdraw_target |
| 2289 | - UT_LIST_GET_LEN(buf_pool->withdraw); |
| 2290 | } else { |
| 2291 | withdraw_depth = 0; |
| 2292 | } |
| 2293 | buf_pool_mutex_exit(buf_pool); |
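	/* Scan to the larger of srv_LRU_scan_depth and withdraw_depth,
	still capped by the current LRU length. */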
| 2294 | if (withdraw_depth > srv_LRU_scan_depth) { |
| 2295 | scan_depth = ut_min(withdraw_depth, scan_depth); |
| 2296 | } else { |
| 2297 | scan_depth = ut_min(static_cast<ulint>(srv_LRU_scan_depth), |
| 2298 | scan_depth); |
| 2299 | } |
	/* Currently only one of the page_cleaner threads can trigger an
	LRU flush for a given buffer pool instance at a time. So it is
	not possible that a batch triggered during the last iteration is
	still running. */
| 2304 | buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU, scan_depth, |
| 2305 | 0, &n); |
| 2306 | |
| 2307 | return(n.flushed); |
| 2308 | } |
| 2309 | |
| 2310 | /*********************************************************************//** |
| 2311 | Wait for any possible LRU flushes that are in progress to end. */ |
| 2312 | void |
| 2313 | buf_flush_wait_LRU_batch_end(void) |
| 2314 | /*==============================*/ |
| 2315 | { |
| 2316 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
| 2317 | buf_pool_t* buf_pool; |
| 2318 | |
| 2319 | buf_pool = buf_pool_from_array(i); |
| 2320 | |
| 2321 | buf_pool_mutex_enter(buf_pool); |
| 2322 | |
| 2323 | if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0 |
| 2324 | || buf_pool->init_flush[BUF_FLUSH_LRU]) { |
| 2325 | |
| 2326 | buf_pool_mutex_exit(buf_pool); |
| 2327 | buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU); |
| 2328 | } else { |
| 2329 | buf_pool_mutex_exit(buf_pool); |
| 2330 | } |
| 2331 | } |
| 2332 | } |
| 2333 | |
| 2334 | /*********************************************************************//** |
| 2335 | Calculates if flushing is required based on number of dirty pages in |
| 2336 | the buffer pool. |
| 2337 | @return percent of io_capacity to flush to manage dirty page ratio */ |
| 2338 | static |
| 2339 | ulint |
| 2340 | af_get_pct_for_dirty() |
| 2341 | /*==================*/ |
| 2342 | { |
| 2343 | double dirty_pct = buf_get_modified_ratio_pct(); |
| 2344 | |
| 2345 | if (dirty_pct == 0.0) { |
| 2346 | /* No pages modified */ |
| 2347 | return(0); |
| 2348 | } |
| 2349 | |
| 2350 | ut_a(srv_max_dirty_pages_pct_lwm |
| 2351 | <= srv_max_buf_pool_modified_pct); |
| 2352 | |
| 2353 | if (srv_max_dirty_pages_pct_lwm == 0) { |
| 2354 | /* The user has not set the option to preflush dirty |
| 2355 | pages as we approach the high water mark. */ |
| 2356 | if (dirty_pct >= srv_max_buf_pool_modified_pct) { |
| 2357 | /* We have crossed the high water mark of dirty |
		pages. In this case we start flushing at 100% of
| 2359 | innodb_io_capacity. */ |
| 2360 | return(100); |
| 2361 | } |
| 2362 | } else if (dirty_pct >= srv_max_dirty_pages_pct_lwm) { |
| 2363 | /* We should start flushing pages gradually. */ |
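		/* For example (hypothetical values): with
		srv_max_dirty_pages_pct_lwm = 10,
		srv_max_buf_pool_modified_pct = 75 and dirty_pct = 45,
		this returns 45 * 100 / 76 = 59, i.e. flush at about
		59% of innodb_io_capacity. */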
| 2364 | return(static_cast<ulint>((dirty_pct * 100) |
| 2365 | / (srv_max_buf_pool_modified_pct + 1))); |
| 2366 | } |
| 2367 | |
| 2368 | return(0); |
| 2369 | } |
| 2370 | |
| 2371 | /*********************************************************************//** |
| 2372 | Calculates if flushing is required based on redo generation rate. |
| 2373 | @return percent of io_capacity to flush to manage redo space */ |
| 2374 | static |
| 2375 | ulint |
| 2376 | af_get_pct_for_lsn( |
| 2377 | /*===============*/ |
| 2378 | lsn_t age) /*!< in: current age of LSN. */ |
| 2379 | { |
| 2380 | lsn_t max_async_age; |
| 2381 | lsn_t lsn_age_factor; |
| 2382 | lsn_t af_lwm = (lsn_t) ((srv_adaptive_flushing_lwm |
| 2383 | * log_get_capacity()) / 100); |
| 2384 | |
| 2385 | if (age < af_lwm) { |
| 2386 | /* No adaptive flushing. */ |
| 2387 | return(0); |
| 2388 | } |
| 2389 | |
| 2390 | max_async_age = log_get_max_modified_age_async(); |
| 2391 | |
| 2392 | if (age < max_async_age && !srv_adaptive_flushing) { |
| 2393 | /* We have still not reached the max_async point and |
| 2394 | the user has disabled adaptive flushing. */ |
| 2395 | return(0); |
| 2396 | } |
| 2397 | |
| 2398 | /* If we are here then we know that either: |
| 2399 | 1) User has enabled adaptive flushing |
| 2400 | 2) User may have disabled adaptive flushing but we have reached |
| 2401 | max_async_age. */ |
| 2402 | lsn_age_factor = (age * 100) / max_async_age; |
| 2403 | |
| 2404 | ut_ad(srv_max_io_capacity >= srv_io_capacity); |
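	/* For example (hypothetical values): with srv_io_capacity = 200,
	srv_max_io_capacity = 2000 and lsn_age_factor = 50, this evaluates
	to 10 * 50 * sqrt(50) / 7.5, roughly 471 percent of
	innodb_io_capacity. */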
| 2405 | return(static_cast<ulint>( |
| 2406 | ((srv_max_io_capacity / srv_io_capacity) |
| 2407 | * (lsn_age_factor * sqrt((double)lsn_age_factor))) |
| 2408 | / 7.5)); |
| 2409 | } |
| 2410 | |
| 2411 | /*********************************************************************//** |
| 2412 | This function is called approximately once every second by the |
| 2413 | page_cleaner thread. Based on various factors it decides if there is a |
| 2414 | need to do flushing. |
| 2415 | @return number of pages recommended to be flushed |
| 2416 | @param lsn_limit pointer to return LSN up to which flushing must happen |
| 2417 | @param last_pages_in the number of pages flushed by the last flush_list |
| 2418 | flushing. */ |
| 2419 | static |
| 2420 | ulint |
| 2421 | page_cleaner_flush_pages_recommendation( |
| 2422 | /*====================================*/ |
| 2423 | lsn_t* lsn_limit, |
| 2424 | ulint last_pages_in) |
| 2425 | { |
| 2426 | static lsn_t prev_lsn = 0; |
| 2427 | static ulint sum_pages = 0; |
| 2428 | static ulint avg_page_rate = 0; |
| 2429 | static ulint n_iterations = 0; |
| 2430 | static time_t prev_time; |
| 2431 | lsn_t oldest_lsn; |
| 2432 | lsn_t cur_lsn; |
| 2433 | lsn_t age; |
| 2434 | lsn_t lsn_rate; |
| 2435 | ulint n_pages = 0; |
| 2436 | ulint pct_for_dirty = 0; |
| 2437 | ulint pct_for_lsn = 0; |
| 2438 | ulint pct_total = 0; |
| 2439 | |
| 2440 | cur_lsn = log_get_lsn_nowait(); |
| 2441 | |
	/* log_get_lsn_nowait() tries to acquire log_sys.mutex with
	mutex_enter_nowait(); if that fails, it returns 0. Do not
	use that value to update the statistics. */
| 2445 | if (cur_lsn == 0) { |
| 2446 | return(0); |
| 2447 | } |
| 2448 | |
| 2449 | if (prev_lsn == 0) { |
| 2450 | /* First time around. */ |
| 2451 | prev_lsn = cur_lsn; |
| 2452 | prev_time = ut_time(); |
| 2453 | return(0); |
| 2454 | } |
| 2455 | |
| 2456 | if (prev_lsn == cur_lsn) { |
| 2457 | return(0); |
| 2458 | } |
| 2459 | |
| 2460 | sum_pages += last_pages_in; |
| 2461 | |
| 2462 | time_t curr_time = ut_time(); |
| 2463 | double time_elapsed = difftime(curr_time, prev_time); |
| 2464 | |
	/* We update our variables every srv_flushing_avg_loops
	iterations to smooth out transitions in the workload. */
| 2467 | if (++n_iterations >= srv_flushing_avg_loops |
| 2468 | || time_elapsed >= srv_flushing_avg_loops) { |
| 2469 | |
| 2470 | if (time_elapsed < 1) { |
| 2471 | time_elapsed = 1; |
| 2472 | } |
| 2473 | |
| 2474 | avg_page_rate = static_cast<ulint>( |
| 2475 | ((static_cast<double>(sum_pages) |
| 2476 | / time_elapsed) |
| 2477 | + avg_page_rate) / 2); |
| 2478 | |
| 2479 | /* How much LSN we have generated since last call. */ |
| 2480 | lsn_rate = static_cast<lsn_t>( |
| 2481 | static_cast<double>(cur_lsn - prev_lsn) |
| 2482 | / time_elapsed); |
| 2483 | |
| 2484 | lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2; |
| 2485 | |
| 2486 | /* aggregate stats of all slots */ |
| 2487 | mutex_enter(&page_cleaner.mutex); |
| 2488 | |
| 2489 | ulint flush_tm = page_cleaner.flush_time; |
| 2490 | ulint flush_pass = page_cleaner.flush_pass; |
| 2491 | |
| 2492 | page_cleaner.flush_time = 0; |
| 2493 | page_cleaner.flush_pass = 0; |
| 2494 | |
| 2495 | ulint lru_tm = 0; |
| 2496 | ulint list_tm = 0; |
| 2497 | ulint lru_pass = 0; |
| 2498 | ulint list_pass = 0; |
| 2499 | |
| 2500 | for (ulint i = 0; i < page_cleaner.n_slots; i++) { |
| 2501 | page_cleaner_slot_t* slot; |
| 2502 | |
| 2503 | slot = &page_cleaner.slots[i]; |
| 2504 | |
| 2505 | lru_tm += slot->flush_lru_time; |
| 2506 | lru_pass += slot->flush_lru_pass; |
| 2507 | list_tm += slot->flush_list_time; |
| 2508 | list_pass += slot->flush_list_pass; |
| 2509 | |
| 2510 | slot->flush_lru_time = 0; |
| 2511 | slot->flush_lru_pass = 0; |
| 2512 | slot->flush_list_time = 0; |
| 2513 | slot->flush_list_pass = 0; |
| 2514 | } |
| 2515 | |
| 2516 | mutex_exit(&page_cleaner.mutex); |
| 2517 | |
| 2518 | /* minimum values are 1, to avoid dividing by zero. */ |
| 2519 | if (lru_tm < 1) { |
| 2520 | lru_tm = 1; |
| 2521 | } |
| 2522 | if (list_tm < 1) { |
| 2523 | list_tm = 1; |
| 2524 | } |
| 2525 | if (flush_tm < 1) { |
| 2526 | flush_tm = 1; |
| 2527 | } |
| 2528 | |
| 2529 | if (lru_pass < 1) { |
| 2530 | lru_pass = 1; |
| 2531 | } |
| 2532 | if (list_pass < 1) { |
| 2533 | list_pass = 1; |
| 2534 | } |
| 2535 | if (flush_pass < 1) { |
| 2536 | flush_pass = 1; |
| 2537 | } |
| 2538 | |
| 2539 | MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT, |
| 2540 | list_tm / list_pass); |
| 2541 | MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_SLOT, |
| 2542 | lru_tm / lru_pass); |
| 2543 | |
| 2544 | MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD, |
| 2545 | list_tm / (srv_n_page_cleaners * flush_pass)); |
| 2546 | MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_THREAD, |
| 2547 | lru_tm / (srv_n_page_cleaners * flush_pass)); |
| 2548 | MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST, |
| 2549 | flush_tm * list_tm / flush_pass |
| 2550 | / (list_tm + lru_tm)); |
| 2551 | MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_EST, |
| 2552 | flush_tm * lru_tm / flush_pass |
| 2553 | / (list_tm + lru_tm)); |
| 2554 | MONITOR_SET(MONITOR_FLUSH_AVG_TIME, flush_tm / flush_pass); |
| 2555 | |
| 2556 | MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, |
| 2557 | list_pass / page_cleaner.n_slots); |
| 2558 | MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_PASS, |
| 2559 | lru_pass / page_cleaner.n_slots); |
| 2560 | MONITOR_SET(MONITOR_FLUSH_AVG_PASS, flush_pass); |
| 2561 | |
| 2562 | prev_lsn = cur_lsn; |
| 2563 | prev_time = curr_time; |
| 2564 | |
| 2565 | n_iterations = 0; |
| 2566 | |
| 2567 | sum_pages = 0; |
| 2568 | } |
| 2569 | |
| 2570 | oldest_lsn = buf_pool_get_oldest_modification(); |
| 2571 | |
| 2572 | ut_ad(oldest_lsn <= log_get_lsn()); |
| 2573 | |
| 2574 | age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0; |
| 2575 | |
| 2576 | pct_for_dirty = af_get_pct_for_dirty(); |
| 2577 | pct_for_lsn = af_get_pct_for_lsn(age); |
| 2578 | |
| 2579 | pct_total = ut_max(pct_for_dirty, pct_for_lsn); |
| 2580 | |
| 2581 | /* Estimate pages to be flushed for the lsn progress */ |
| 2582 | ulint sum_pages_for_lsn = 0; |
| 2583 | lsn_t target_lsn = oldest_lsn |
| 2584 | + lsn_avg_rate * buf_flush_lsn_scan_factor; |
| 2585 | |
| 2586 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
| 2587 | buf_pool_t* buf_pool = buf_pool_from_array(i); |
| 2588 | ulint pages_for_lsn = 0; |
| 2589 | |
| 2590 | buf_flush_list_mutex_enter(buf_pool); |
| 2591 | for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool->flush_list); |
| 2592 | b != NULL; |
| 2593 | b = UT_LIST_GET_PREV(list, b)) { |
| 2594 | if (b->oldest_modification > target_lsn) { |
| 2595 | break; |
| 2596 | } |
| 2597 | ++pages_for_lsn; |
| 2598 | } |
| 2599 | buf_flush_list_mutex_exit(buf_pool); |
| 2600 | |
| 2601 | sum_pages_for_lsn += pages_for_lsn; |
| 2602 | |
| 2603 | mutex_enter(&page_cleaner.mutex); |
| 2604 | ut_ad(page_cleaner.slots[i].state |
| 2605 | == PAGE_CLEANER_STATE_NONE); |
| 2606 | page_cleaner.slots[i].n_pages_requested |
| 2607 | = pages_for_lsn / buf_flush_lsn_scan_factor + 1; |
| 2608 | mutex_exit(&page_cleaner.mutex); |
| 2609 | } |
| 2610 | |
| 2611 | sum_pages_for_lsn /= buf_flush_lsn_scan_factor; |
	if (sum_pages_for_lsn < 1) {
| 2613 | sum_pages_for_lsn = 1; |
| 2614 | } |
| 2615 | |
| 2616 | /* Cap the maximum IO capacity that we are going to use by |
| 2617 | max_io_capacity. Limit the value to avoid too quick increase */ |
| 2618 | ulint pages_for_lsn = |
| 2619 | std::min<ulint>(sum_pages_for_lsn, srv_max_io_capacity * 2); |
| 2620 | |
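	/* The recommendation is the average of three signals: the
	io_capacity percentage derived from the dirty page and redo
	pressure, the observed page flush rate, and the pages needed
	for the intended LSN progress. */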
| 2621 | n_pages = (PCT_IO(pct_total) + avg_page_rate + pages_for_lsn) / 3; |
| 2622 | |
| 2623 | if (n_pages > srv_max_io_capacity) { |
| 2624 | n_pages = srv_max_io_capacity; |
| 2625 | } |
| 2626 | |
| 2627 | /* Normalize request for each instance */ |
| 2628 | mutex_enter(&page_cleaner.mutex); |
| 2629 | ut_ad(page_cleaner.n_slots_requested == 0); |
| 2630 | ut_ad(page_cleaner.n_slots_flushing == 0); |
| 2631 | ut_ad(page_cleaner.n_slots_finished == 0); |
| 2632 | |
| 2633 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
		/* If the redo log has enough free space, do not care
		about the age distribution of the pages. */
| 2636 | page_cleaner.slots[i].n_pages_requested = pct_for_lsn > 30 ? |
| 2637 | page_cleaner.slots[i].n_pages_requested |
| 2638 | * n_pages / sum_pages_for_lsn + 1 |
| 2639 | : n_pages / srv_buf_pool_instances; |
| 2640 | } |
| 2641 | mutex_exit(&page_cleaner.mutex); |
| 2642 | |
| 2643 | MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages); |
| 2644 | |
| 2645 | MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, sum_pages_for_lsn); |
| 2646 | |
| 2647 | MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate); |
| 2648 | MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate); |
| 2649 | MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty); |
| 2650 | MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn); |
| 2651 | |
| 2652 | *lsn_limit = LSN_MAX; |
| 2653 | |
| 2654 | return(n_pages); |
| 2655 | } |
| 2656 | |
| 2657 | /*********************************************************************//** |
| 2658 | Puts the page_cleaner thread to sleep if it has finished work in less |
| 2659 | than a second |
@retval 0 if woken up by the event being set,
| 2661 | @retval OS_SYNC_TIME_EXCEEDED if timeout was exceeded |
| 2662 | @param next_loop_time time when next loop iteration should start |
| 2663 | @param sig_count zero or the value returned by previous call of |
| 2664 | os_event_reset() |
| 2665 | @param cur_time current time as in ut_time_ms() */ |
| 2666 | static |
| 2667 | ulint |
| 2668 | pc_sleep_if_needed( |
| 2669 | /*===============*/ |
| 2670 | ulint next_loop_time, |
| 2671 | int64_t sig_count, |
| 2672 | ulint cur_time) |
| 2673 | { |
| 2674 | /* No sleep if we are cleaning the buffer pool during the shutdown |
| 2675 | with everything else finished */ |
| 2676 | if (srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE) |
| 2677 | return OS_SYNC_TIME_EXCEEDED; |
| 2678 | |
| 2679 | if (next_loop_time > cur_time) { |
| 2680 | /* Get sleep interval in micro seconds. We use |
| 2681 | ut_min() to avoid long sleep in case of wrap around. */ |
| 2682 | ulint sleep_us; |
| 2683 | |
| 2684 | sleep_us = ut_min(static_cast<ulint>(1000000), |
| 2685 | (next_loop_time - cur_time) * 1000); |
| 2686 | |
| 2687 | return(os_event_wait_time_low(buf_flush_event, |
| 2688 | sleep_us, sig_count)); |
| 2689 | } |
| 2690 | |
| 2691 | return(OS_SYNC_TIME_EXCEEDED); |
| 2692 | } |
| 2693 | |
| 2694 | /******************************************************************//** |
| 2695 | Initialize page_cleaner. */ |
| 2696 | void |
| 2697 | buf_flush_page_cleaner_init(void) |
| 2698 | /*=============================*/ |
| 2699 | { |
| 2700 | ut_ad(!page_cleaner.is_running); |
| 2701 | |
| 2702 | mutex_create(LATCH_ID_PAGE_CLEANER, &page_cleaner.mutex); |
| 2703 | |
	page_cleaner.is_requested = os_event_create("pc_is_requested");
	page_cleaner.is_finished = os_event_create("pc_is_finished");
	page_cleaner.is_started = os_event_create("pc_is_started");
| 2707 | page_cleaner.n_slots = static_cast<ulint>(srv_buf_pool_instances); |
| 2708 | |
| 2709 | ut_d(page_cleaner.n_disabled_debug = 0); |
| 2710 | |
| 2711 | page_cleaner.is_running = true; |
| 2712 | } |
| 2713 | |
| 2714 | /** |
| 2715 | Requests for all slots to flush all buffer pool instances. |
@param min_n wished minimum number of blocks flushed
| 2717 | (it is not guaranteed that the actual number is that big) |
| 2718 | @param lsn_limit in the case BUF_FLUSH_LIST all blocks whose |
| 2719 | oldest_modification is smaller than this should be flushed |
| 2720 | (if their number does not exceed min_n), otherwise ignored |
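A typical caller (see buf_flush_page_cleaner_coordinator()) issues
pc_request(n, lsn_limit), calls pc_flush_slot() until it returns 0,
and then calls pc_wait_finished() to collect the flushed page counts.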
| 2721 | */ |
| 2722 | static |
| 2723 | void |
| 2724 | pc_request( |
| 2725 | ulint min_n, |
| 2726 | lsn_t lsn_limit) |
| 2727 | { |
| 2728 | if (min_n != ULINT_MAX) { |
| 2729 | /* Ensure that flushing is spread evenly amongst the |
| 2730 | buffer pool instances. When min_n is ULINT_MAX |
| 2731 | we need to flush everything up to the lsn limit |
| 2732 | so no limit here. */ |
| 2733 | min_n = (min_n + srv_buf_pool_instances - 1) |
| 2734 | / srv_buf_pool_instances; |
| 2735 | } |
| 2736 | |
| 2737 | mutex_enter(&page_cleaner.mutex); |
| 2738 | |
| 2739 | ut_ad(page_cleaner.n_slots_requested == 0); |
| 2740 | ut_ad(page_cleaner.n_slots_flushing == 0); |
| 2741 | ut_ad(page_cleaner.n_slots_finished == 0); |
| 2742 | |
| 2743 | page_cleaner.requested = (min_n > 0); |
| 2744 | page_cleaner.lsn_limit = lsn_limit; |
| 2745 | |
| 2746 | for (ulint i = 0; i < page_cleaner.n_slots; i++) { |
| 2747 | page_cleaner_slot_t* slot = &page_cleaner.slots[i]; |
| 2748 | |
| 2749 | ut_ad(slot->state == PAGE_CLEANER_STATE_NONE); |
| 2750 | |
| 2751 | if (min_n == ULINT_MAX) { |
| 2752 | slot->n_pages_requested = ULINT_MAX; |
| 2753 | } else if (min_n == 0) { |
| 2754 | slot->n_pages_requested = 0; |
| 2755 | } |
| 2756 | |
| 2757 | /* slot->n_pages_requested was already set by |
| 2758 | page_cleaner_flush_pages_recommendation() */ |
| 2759 | |
| 2760 | slot->state = PAGE_CLEANER_STATE_REQUESTED; |
| 2761 | } |
| 2762 | |
| 2763 | page_cleaner.n_slots_requested = page_cleaner.n_slots; |
| 2764 | page_cleaner.n_slots_flushing = 0; |
| 2765 | page_cleaner.n_slots_finished = 0; |
| 2766 | |
| 2767 | os_event_set(page_cleaner.is_requested); |
| 2768 | |
| 2769 | mutex_exit(&page_cleaner.mutex); |
| 2770 | } |
| 2771 | |
| 2772 | /** |
| 2773 | Do flush for one slot. |
@return the number of slots which have not been treated yet. */
| 2775 | static |
| 2776 | ulint |
| 2777 | pc_flush_slot(void) |
| 2778 | { |
| 2779 | ulint lru_tm = 0; |
| 2780 | ulint list_tm = 0; |
| 2781 | ulint lru_pass = 0; |
| 2782 | ulint list_pass = 0; |
| 2783 | |
| 2784 | mutex_enter(&page_cleaner.mutex); |
| 2785 | |
| 2786 | if (!page_cleaner.n_slots_requested) { |
| 2787 | os_event_reset(page_cleaner.is_requested); |
| 2788 | } else { |
| 2789 | page_cleaner_slot_t* slot = NULL; |
| 2790 | ulint i; |
| 2791 | |
| 2792 | for (i = 0; i < page_cleaner.n_slots; i++) { |
| 2793 | slot = &page_cleaner.slots[i]; |
| 2794 | |
| 2795 | if (slot->state == PAGE_CLEANER_STATE_REQUESTED) { |
| 2796 | break; |
| 2797 | } |
| 2798 | } |
| 2799 | |
| 2800 | /* slot should be found because |
| 2801 | page_cleaner.n_slots_requested > 0 */ |
| 2802 | ut_a(i < page_cleaner.n_slots); |
| 2803 | |
| 2804 | buf_pool_t* buf_pool = buf_pool_from_array(i); |
| 2805 | |
| 2806 | page_cleaner.n_slots_requested--; |
| 2807 | page_cleaner.n_slots_flushing++; |
| 2808 | slot->state = PAGE_CLEANER_STATE_FLUSHING; |
| 2809 | |
| 2810 | if (UNIV_UNLIKELY(!page_cleaner.is_running)) { |
| 2811 | slot->n_flushed_lru = 0; |
| 2812 | slot->n_flushed_list = 0; |
| 2813 | goto finish_mutex; |
| 2814 | } |
| 2815 | |
| 2816 | if (page_cleaner.n_slots_requested == 0) { |
| 2817 | os_event_reset(page_cleaner.is_requested); |
| 2818 | } |
| 2819 | |
| 2820 | mutex_exit(&page_cleaner.mutex); |
| 2821 | |
| 2822 | lru_tm = ut_time_ms(); |
| 2823 | |
| 2824 | /* Flush pages from end of LRU if required */ |
| 2825 | slot->n_flushed_lru = buf_flush_LRU_list(buf_pool); |
| 2826 | |
| 2827 | lru_tm = ut_time_ms() - lru_tm; |
| 2828 | lru_pass++; |
| 2829 | |
| 2830 | if (UNIV_UNLIKELY(!page_cleaner.is_running)) { |
| 2831 | slot->n_flushed_list = 0; |
| 2832 | goto finish; |
| 2833 | } |
| 2834 | |
| 2835 | /* Flush pages from flush_list if required */ |
| 2836 | if (page_cleaner.requested) { |
| 2837 | flush_counters_t n; |
| 2838 | memset(&n, 0, sizeof(flush_counters_t)); |
| 2839 | list_tm = ut_time_ms(); |
| 2840 | |
| 2841 | slot->succeeded_list = buf_flush_do_batch( |
| 2842 | buf_pool, BUF_FLUSH_LIST, |
| 2843 | slot->n_pages_requested, |
| 2844 | page_cleaner.lsn_limit, |
| 2845 | &n); |
| 2846 | |
| 2847 | slot->n_flushed_list = n.flushed; |
| 2848 | |
| 2849 | list_tm = ut_time_ms() - list_tm; |
| 2850 | list_pass++; |
| 2851 | } else { |
| 2852 | slot->n_flushed_list = 0; |
| 2853 | slot->succeeded_list = true; |
| 2854 | } |
| 2855 | finish: |
| 2856 | mutex_enter(&page_cleaner.mutex); |
| 2857 | finish_mutex: |
| 2858 | page_cleaner.n_slots_flushing--; |
| 2859 | page_cleaner.n_slots_finished++; |
| 2860 | slot->state = PAGE_CLEANER_STATE_FINISHED; |
| 2861 | |
| 2862 | slot->flush_lru_time += lru_tm; |
| 2863 | slot->flush_list_time += list_tm; |
| 2864 | slot->flush_lru_pass += lru_pass; |
| 2865 | slot->flush_list_pass += list_pass; |
| 2866 | |
| 2867 | if (page_cleaner.n_slots_requested == 0 |
| 2868 | && page_cleaner.n_slots_flushing == 0) { |
| 2869 | os_event_set(page_cleaner.is_finished); |
| 2870 | } |
| 2871 | } |
| 2872 | |
| 2873 | ulint ret = page_cleaner.n_slots_requested; |
| 2874 | |
| 2875 | mutex_exit(&page_cleaner.mutex); |
| 2876 | |
| 2877 | return(ret); |
| 2878 | } |
| 2879 | |
| 2880 | /** |
| 2881 | Wait until all flush requests are finished. |
| 2882 | @param n_flushed_lru number of pages flushed from the end of the LRU list. |
| 2883 | @param n_flushed_list number of pages flushed from the end of the |
| 2884 | flush_list. |
@return true if all flush_list flushing batches were successful. */
| 2886 | static |
| 2887 | bool |
| 2888 | pc_wait_finished( |
| 2889 | ulint* n_flushed_lru, |
| 2890 | ulint* n_flushed_list) |
| 2891 | { |
| 2892 | bool all_succeeded = true; |
| 2893 | |
| 2894 | *n_flushed_lru = 0; |
| 2895 | *n_flushed_list = 0; |
| 2896 | |
| 2897 | os_event_wait(page_cleaner.is_finished); |
| 2898 | |
| 2899 | mutex_enter(&page_cleaner.mutex); |
| 2900 | |
| 2901 | ut_ad(page_cleaner.n_slots_requested == 0); |
| 2902 | ut_ad(page_cleaner.n_slots_flushing == 0); |
| 2903 | ut_ad(page_cleaner.n_slots_finished == page_cleaner.n_slots); |
| 2904 | |
| 2905 | for (ulint i = 0; i < page_cleaner.n_slots; i++) { |
| 2906 | page_cleaner_slot_t* slot = &page_cleaner.slots[i]; |
| 2907 | |
| 2908 | ut_ad(slot->state == PAGE_CLEANER_STATE_FINISHED); |
| 2909 | |
| 2910 | *n_flushed_lru += slot->n_flushed_lru; |
| 2911 | *n_flushed_list += slot->n_flushed_list; |
| 2912 | all_succeeded &= slot->succeeded_list; |
| 2913 | |
| 2914 | slot->state = PAGE_CLEANER_STATE_NONE; |
| 2915 | |
| 2916 | slot->n_pages_requested = 0; |
| 2917 | } |
| 2918 | |
| 2919 | page_cleaner.n_slots_finished = 0; |
| 2920 | |
| 2921 | os_event_reset(page_cleaner.is_finished); |
| 2922 | |
| 2923 | mutex_exit(&page_cleaner.mutex); |
| 2924 | |
| 2925 | return(all_succeeded); |
| 2926 | } |
| 2927 | |
| 2928 | #ifdef UNIV_LINUX |
| 2929 | /** |
| 2930 | Set priority for page_cleaner threads. |
@param[in]	priority	the priority intended to be set
| 2932 | @return true if set as intended */ |
| 2933 | static |
| 2934 | bool |
| 2935 | buf_flush_page_cleaner_set_priority( |
| 2936 | int priority) |
| 2937 | { |
| 2938 | setpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid), |
| 2939 | priority); |
| 2940 | return(getpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid)) |
| 2941 | == priority); |
| 2942 | } |
| 2943 | #endif /* UNIV_LINUX */ |
| 2944 | |
| 2945 | #ifdef UNIV_DEBUG |
| 2946 | /** Loop used to disable page cleaner threads. */ |
| 2947 | static |
| 2948 | void |
| 2949 | buf_flush_page_cleaner_disabled_loop(void) |
| 2950 | { |
| 2951 | if (!innodb_page_cleaner_disabled_debug) { |
| 2952 | /* We return to avoid entering and exiting mutex. */ |
| 2953 | return; |
| 2954 | } |
| 2955 | |
| 2956 | mutex_enter(&page_cleaner.mutex); |
| 2957 | page_cleaner.n_disabled_debug++; |
| 2958 | mutex_exit(&page_cleaner.mutex); |
| 2959 | |
| 2960 | while (innodb_page_cleaner_disabled_debug |
| 2961 | && srv_shutdown_state == SRV_SHUTDOWN_NONE |
| 2962 | && page_cleaner.is_running) { |
| 2963 | |
| 2964 | os_thread_sleep(100000); /* [A] */ |
| 2965 | } |
| 2966 | |
	/* We need to wait here for the threads to exit; otherwise we would
	encounter a problem when we quickly perform the following steps:
| 2969 | 1) SET GLOBAL innodb_page_cleaner_disabled_debug = 1; |
| 2970 | 2) SET GLOBAL innodb_page_cleaner_disabled_debug = 0; |
| 2971 | 3) SET GLOBAL innodb_page_cleaner_disabled_debug = 1; |
| 2972 | That's because after step 1 this thread could still be sleeping |
| 2973 | inside the loop above at [A] and steps 2, 3 could happen before |
	this thread wakes up from [A]. In such a case this thread would
	not re-increment n_disabled_debug and we would be waiting for
	it forever in buf_flush_page_cleaner_disabled_debug_update(...).
| 2977 | |
| 2978 | Therefore we are waiting in step 2 for this thread exiting here. */ |
| 2979 | |
| 2980 | mutex_enter(&page_cleaner.mutex); |
| 2981 | page_cleaner.n_disabled_debug--; |
| 2982 | mutex_exit(&page_cleaner.mutex); |
| 2983 | } |
| 2984 | |
| 2985 | /** Disables page cleaner threads (coordinator and workers). |
| 2986 | @param[in] save immediate result from check function */ |
| 2987 | void buf_flush_page_cleaner_disabled_debug_update(THD*, |
| 2988 | st_mysql_sys_var*, void*, |
| 2989 | const void* save) |
| 2990 | { |
| 2991 | if (!page_cleaner.is_running) { |
| 2992 | return; |
| 2993 | } |
| 2994 | |
| 2995 | if (!*static_cast<const my_bool*>(save)) { |
| 2996 | if (!innodb_page_cleaner_disabled_debug) { |
| 2997 | return; |
| 2998 | } |
| 2999 | |
| 3000 | innodb_page_cleaner_disabled_debug = false; |
| 3001 | |
| 3002 | /* Enable page cleaner threads. */ |
| 3003 | while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { |
| 3004 | mutex_enter(&page_cleaner.mutex); |
| 3005 | const ulint n = page_cleaner.n_disabled_debug; |
| 3006 | mutex_exit(&page_cleaner.mutex); |
| 3007 | /* Check if all threads have been enabled, to avoid |
| 3008 | problem when we decide to re-disable them soon. */ |
| 3009 | if (n == 0) { |
| 3010 | break; |
| 3011 | } |
| 3012 | } |
| 3013 | return; |
| 3014 | } |
| 3015 | |
| 3016 | if (innodb_page_cleaner_disabled_debug) { |
| 3017 | return; |
| 3018 | } |
| 3019 | |
| 3020 | innodb_page_cleaner_disabled_debug = true; |
| 3021 | |
| 3022 | while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { |
| 3023 | /* Workers are possibly sleeping on is_requested. |
| 3024 | |
| 3025 | We have to wake them, otherwise they could possibly |
| 3026 | have never noticed, that they should be disabled, |
| 3027 | and we would wait for them here forever. |
| 3028 | |
| 3029 | That's why we have sleep-loop instead of simply |
| 3030 | waiting on some disabled_debug_event. */ |
| 3031 | os_event_set(page_cleaner.is_requested); |
| 3032 | |
| 3033 | mutex_enter(&page_cleaner.mutex); |
| 3034 | |
| 3035 | ut_ad(page_cleaner.n_disabled_debug |
| 3036 | <= srv_n_page_cleaners); |
| 3037 | |
| 3038 | if (page_cleaner.n_disabled_debug |
| 3039 | == srv_n_page_cleaners) { |
| 3040 | |
| 3041 | mutex_exit(&page_cleaner.mutex); |
| 3042 | break; |
| 3043 | } |
| 3044 | |
| 3045 | mutex_exit(&page_cleaner.mutex); |
| 3046 | |
| 3047 | os_thread_sleep(100000); |
| 3048 | } |
| 3049 | } |
| 3050 | #endif /* UNIV_DEBUG */ |
| 3051 | |
| 3052 | /******************************************************************//** |
page_cleaner coordinator thread, tasked with flushing dirty pages from the
buffer pool instances. For now there is only one coordinator.
| 3055 | @return a dummy parameter */ |
| 3056 | extern "C" |
| 3057 | os_thread_ret_t |
| 3058 | DECLARE_THREAD(buf_flush_page_cleaner_coordinator)(void*) |
| 3059 | { |
| 3060 | my_thread_init(); |
| 3061 | #ifdef UNIV_PFS_THREAD |
| 3062 | pfs_register_thread(page_cleaner_thread_key); |
| 3063 | #endif /* UNIV_PFS_THREAD */ |
| 3064 | ut_ad(!srv_read_only_mode); |
| 3065 | |
| 3066 | #ifdef UNIV_DEBUG_THREAD_CREATION |
| 3067 | ib::info() << "page_cleaner thread running, id " |
| 3068 | << os_thread_pf(os_thread_get_curr_id()); |
| 3069 | #endif /* UNIV_DEBUG_THREAD_CREATION */ |
| 3070 | #ifdef UNIV_LINUX |
/* Linux may allow a different priority setting for each thread;
it is worth trying to set a high priority for the page cleaner threads. */
| 3073 | if (buf_flush_page_cleaner_set_priority( |
| 3074 | buf_flush_page_cleaner_priority)) { |
| 3075 | |
| 3076 | ib::info() << "page_cleaner coordinator priority: " |
| 3077 | << buf_flush_page_cleaner_priority; |
| 3078 | } else { |
| 3079 | ib::info() << "If the mysqld execution user is authorized," |
| 3080 | " page cleaner thread priority can be changed." |
| 3081 | " See the man page of setpriority()." ; |
| 3082 | } |
| 3083 | /* Signal that setpriority() has been attempted. */ |
| 3084 | os_event_set(recv_sys->flush_end); |
| 3085 | #endif /* UNIV_LINUX */ |
| 3086 | |
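/* During recovery, flush requests arrive through recv_sys->flush_start
while recv_writer_thread_active is set: perform a flush of the requested
recv_sys->flush_type and acknowledge completion by setting
recv_sys->flush_end. */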
| 3087 | do { |
/* Handle flushing requests during recovery. */
| 3089 | ulint n_flushed_lru = 0; |
| 3090 | ulint n_flushed_list = 0; |
| 3091 | |
| 3092 | os_event_wait(recv_sys->flush_start); |
| 3093 | |
| 3094 | if (!recv_writer_thread_active) { |
| 3095 | break; |
| 3096 | } |
| 3097 | |
| 3098 | switch (recv_sys->flush_type) { |
| 3099 | case BUF_FLUSH_LRU: |
| 3100 | /* Flush pages from end of LRU if required */ |
| 3101 | pc_request(0, LSN_MAX); |
| 3102 | while (pc_flush_slot() > 0) {} |
| 3103 | pc_wait_finished(&n_flushed_lru, &n_flushed_list); |
| 3104 | break; |
| 3105 | |
| 3106 | case BUF_FLUSH_LIST: |
| 3107 | /* Flush all pages */ |
| 3108 | do { |
| 3109 | pc_request(ULINT_MAX, LSN_MAX); |
| 3110 | while (pc_flush_slot() > 0) {} |
| 3111 | } while (!pc_wait_finished(&n_flushed_lru, |
| 3112 | &n_flushed_list)); |
| 3113 | break; |
| 3114 | |
| 3115 | default: |
| 3116 | ut_ad(0); |
| 3117 | } |
| 3118 | |
| 3119 | os_event_reset(recv_sys->flush_start); |
| 3120 | os_event_set(recv_sys->flush_end); |
| 3121 | } while (recv_writer_thread_active); |
| 3122 | |
| 3123 | os_event_wait(buf_flush_event); |
| 3124 | |
| 3125 | ulint ret_sleep = 0; |
| 3126 | ulint n_evicted = 0; |
| 3127 | ulint n_flushed_last = 0; |
| 3128 | ulint warn_interval = 1; |
| 3129 | ulint warn_count = 0; |
| 3130 | int64_t sig_count = os_event_reset(buf_flush_event); |
| 3131 | ulint next_loop_time = ut_time_ms() + 1000; |
| 3132 | ulint n_flushed = 0; |
| 3133 | ulint last_activity = srv_get_activity_count(); |
| 3134 | ulint last_pages = 0; |
| 3135 | |
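/* Main loop of the coordinator. Each iteration either serves a
flush_sync request (buf_flush_sync_lsn, see buf_flush_request_force()),
performs adaptive flushing based on
page_cleaner_flush_pages_recommendation() while the server is active,
or does background flushing of PCT_IO(100) pages once the server has
been idle for the whole 1000ms interval. */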
| 3136 | while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { |
| 3137 | ulint curr_time = ut_time_ms(); |
| 3138 | |
| 3139 | /* The page_cleaner skips sleep if the server is |
| 3140 | idle and there are no pending IOs in the buffer pool |
| 3141 | and there is work to do. */ |
| 3142 | if (srv_check_activity(last_activity) |
| 3143 | || buf_get_n_pending_read_ios() |
| 3144 | || n_flushed == 0) { |
| 3145 | |
| 3146 | ret_sleep = pc_sleep_if_needed( |
| 3147 | next_loop_time, sig_count, curr_time); |
| 3148 | } else if (curr_time > next_loop_time) { |
| 3149 | ret_sleep = OS_SYNC_TIME_EXCEEDED; |
| 3150 | } else { |
| 3151 | ret_sleep = 0; |
| 3152 | } |
| 3153 | |
| 3154 | if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { |
| 3155 | break; |
| 3156 | } |
| 3157 | |
| 3158 | sig_count = os_event_reset(buf_flush_event); |
| 3159 | |
| 3160 | if (ret_sleep == OS_SYNC_TIME_EXCEEDED) { |
| 3161 | if (global_system_variables.log_warnings > 2 |
| 3162 | && curr_time > next_loop_time + 3000 |
| 3163 | && !(test_flags & TEST_SIGINT)) { |
| 3164 | if (warn_count == 0) { |
| 3165 | ib::info() << "page_cleaner: 1000ms" |
| 3166 | " intended loop took " |
| 3167 | << 1000 + curr_time |
| 3168 | - next_loop_time |
| 3169 | << "ms. The settings might not" |
| 3170 | " be optimal. (flushed=" |
| 3171 | << n_flushed_last |
| 3172 | << " and evicted=" |
| 3173 | << n_evicted |
<< " during that time.)";
| 3175 | if (warn_interval > 300) { |
| 3176 | warn_interval = 600; |
| 3177 | } else { |
| 3178 | warn_interval *= 2; |
| 3179 | } |
| 3180 | |
| 3181 | warn_count = warn_interval; |
| 3182 | } else { |
| 3183 | --warn_count; |
| 3184 | } |
| 3185 | } else { |
| 3186 | /* reset counter */ |
| 3187 | warn_interval = 1; |
| 3188 | warn_count = 0; |
| 3189 | } |
| 3190 | |
| 3191 | next_loop_time = curr_time + 1000; |
| 3192 | n_flushed_last = n_evicted = 0; |
| 3193 | } |
| 3194 | |
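/* A sync flush has been requested (see buf_flush_request_force(),
which sets buf_flush_sync_lsn and signals buf_flush_event);
consume the target LSN under page_cleaner.mutex. */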
| 3195 | if (ret_sleep != OS_SYNC_TIME_EXCEEDED |
| 3196 | && srv_flush_sync |
| 3197 | && buf_flush_sync_lsn > 0) { |
| 3198 | /* woke up for flush_sync */ |
| 3199 | mutex_enter(&page_cleaner.mutex); |
| 3200 | lsn_t lsn_limit = buf_flush_sync_lsn; |
| 3201 | buf_flush_sync_lsn = 0; |
| 3202 | mutex_exit(&page_cleaner.mutex); |
| 3203 | |
| 3204 | /* Request flushing for threads */ |
| 3205 | pc_request(ULINT_MAX, lsn_limit); |
| 3206 | |
| 3207 | ulint tm = ut_time_ms(); |
| 3208 | |
/* The coordinator also processes flush requests */
| 3210 | while (pc_flush_slot() > 0) {} |
| 3211 | |
/* Only the coordinator uses these counters, so there
is no need to protect them with a lock. */
| 3214 | page_cleaner.flush_time += ut_time_ms() - tm; |
| 3215 | page_cleaner.flush_pass++; |
| 3216 | |
| 3217 | /* Wait for all slots to be finished */ |
| 3218 | ulint n_flushed_lru = 0; |
| 3219 | ulint n_flushed_list = 0; |
| 3220 | pc_wait_finished(&n_flushed_lru, &n_flushed_list); |
| 3221 | |
| 3222 | if (n_flushed_list > 0 || n_flushed_lru > 0) { |
| 3223 | buf_flush_stats(n_flushed_list, n_flushed_lru); |
| 3224 | |
| 3225 | MONITOR_INC_VALUE_CUMULATIVE( |
| 3226 | MONITOR_FLUSH_SYNC_TOTAL_PAGE, |
| 3227 | MONITOR_FLUSH_SYNC_COUNT, |
| 3228 | MONITOR_FLUSH_SYNC_PAGES, |
| 3229 | n_flushed_lru + n_flushed_list); |
| 3230 | } |
| 3231 | |
| 3232 | n_flushed = n_flushed_lru + n_flushed_list; |
| 3233 | |
| 3234 | } else if (srv_check_activity(last_activity)) { |
| 3235 | ulint n_to_flush; |
| 3236 | lsn_t lsn_limit = 0; |
| 3237 | |
| 3238 | /* Estimate pages from flush_list to be flushed */ |
| 3239 | if (ret_sleep == OS_SYNC_TIME_EXCEEDED) { |
| 3240 | last_activity = srv_get_activity_count(); |
| 3241 | n_to_flush = |
| 3242 | page_cleaner_flush_pages_recommendation( |
| 3243 | &lsn_limit, last_pages); |
| 3244 | } else { |
| 3245 | n_to_flush = 0; |
| 3246 | } |
| 3247 | |
| 3248 | /* Request flushing for threads */ |
| 3249 | pc_request(n_to_flush, lsn_limit); |
| 3250 | |
| 3251 | ulint tm = ut_time_ms(); |
| 3252 | |
/* The coordinator also processes flush requests */
| 3254 | while (pc_flush_slot() > 0) { |
| 3255 | /* No op */ |
| 3256 | } |
| 3257 | |
/* Only the coordinator uses these counters, so there
is no need to protect them with a lock. */
| 3260 | page_cleaner.flush_time += ut_time_ms() - tm; |
| 3261 | page_cleaner.flush_pass++ ; |
| 3262 | |
| 3263 | /* Wait for all slots to be finished */ |
| 3264 | ulint n_flushed_lru = 0; |
| 3265 | ulint n_flushed_list = 0; |
| 3266 | |
| 3267 | pc_wait_finished(&n_flushed_lru, &n_flushed_list); |
| 3268 | |
| 3269 | if (n_flushed_list > 0 || n_flushed_lru > 0) { |
| 3270 | buf_flush_stats(n_flushed_list, n_flushed_lru); |
| 3271 | } |
| 3272 | |
| 3273 | if (ret_sleep == OS_SYNC_TIME_EXCEEDED) { |
| 3274 | last_pages = n_flushed_list; |
| 3275 | } |
| 3276 | |
| 3277 | n_evicted += n_flushed_lru; |
| 3278 | n_flushed_last += n_flushed_list; |
| 3279 | |
| 3280 | n_flushed = n_flushed_lru + n_flushed_list; |
| 3281 | |
| 3282 | if (n_flushed_lru) { |
| 3283 | MONITOR_INC_VALUE_CUMULATIVE( |
| 3284 | MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, |
| 3285 | MONITOR_LRU_BATCH_FLUSH_COUNT, |
| 3286 | MONITOR_LRU_BATCH_FLUSH_PAGES, |
| 3287 | n_flushed_lru); |
| 3288 | } |
| 3289 | |
| 3290 | if (n_flushed_list) { |
| 3291 | MONITOR_INC_VALUE_CUMULATIVE( |
| 3292 | MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, |
| 3293 | MONITOR_FLUSH_ADAPTIVE_COUNT, |
| 3294 | MONITOR_FLUSH_ADAPTIVE_PAGES, |
| 3295 | n_flushed_list); |
| 3296 | } |
| 3297 | |
| 3298 | } else if (ret_sleep == OS_SYNC_TIME_EXCEEDED) { |
| 3299 | /* no activity, slept enough */ |
| 3300 | buf_flush_lists(PCT_IO(100), LSN_MAX, &n_flushed); |
| 3301 | |
| 3302 | n_flushed_last += n_flushed; |
| 3303 | |
| 3304 | if (n_flushed) { |
| 3305 | MONITOR_INC_VALUE_CUMULATIVE( |
| 3306 | MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, |
| 3307 | MONITOR_FLUSH_BACKGROUND_COUNT, |
| 3308 | MONITOR_FLUSH_BACKGROUND_PAGES, |
| 3309 | n_flushed); |
| 3310 | |
| 3311 | } |
| 3312 | |
| 3313 | } else { |
| 3314 | /* no activity, but woken up by event */ |
| 3315 | n_flushed = 0; |
| 3316 | } |
| 3317 | |
| 3318 | ut_d(buf_flush_page_cleaner_disabled_loop()); |
| 3319 | } |
| 3320 | |
| 3321 | ut_ad(srv_shutdown_state > 0); |
| 3322 | if (srv_fast_shutdown == 2 |
| 3323 | || srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { |
| 3324 | /* In very fast shutdown or when innodb failed to start, we |
| 3325 | simulate a crash of the buffer pool. We are not required to do |
| 3326 | any flushing. */ |
| 3327 | goto thread_exit; |
| 3328 | } |
| 3329 | |
/* In the case of a normal or slow shutdown, the page_cleaner thread
must wait for all other activity in the server to die down.
Note that we can start flushing the buffer pool as soon as the
server enters the shutdown phase, but we must stay alive long enough
to ensure that any work done by the master or purge threads is
also flushed.
During shutdown we pass through two stages. In the first stage,
when SRV_SHUTDOWN_CLEANUP is set, other threads such as the master
and purge threads may still be working. We start flushing the
buffer pool, but we cannot be sure that no new pages are being
dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE. */
| 3341 | |
| 3342 | do { |
| 3343 | pc_request(ULINT_MAX, LSN_MAX); |
| 3344 | |
| 3345 | while (pc_flush_slot() > 0) {} |
| 3346 | |
| 3347 | ulint n_flushed_lru = 0; |
| 3348 | ulint n_flushed_list = 0; |
| 3349 | pc_wait_finished(&n_flushed_lru, &n_flushed_list); |
| 3350 | |
| 3351 | n_flushed = n_flushed_lru + n_flushed_list; |
| 3352 | |
| 3353 | /* We sleep only if there are no pages to flush */ |
| 3354 | if (n_flushed == 0) { |
| 3355 | os_thread_sleep(100000); |
| 3356 | } |
| 3357 | } while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP); |
| 3358 | |
| 3359 | /* At this point all threads including the master and the purge |
| 3360 | thread must have been suspended. */ |
| 3361 | ut_a(srv_get_active_thread_type() == SRV_NONE); |
| 3362 | ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE); |
| 3363 | |
/* We can now make a final sweep of flushing the buffer pool and
exit once the whole buffer pool has been cleaned. It is important
to wait for any running batch that was triggered by us to finish.
Otherwise we could mistake the end of that batch for the end of
our final sweep, and we would leave the loop with dirty pages
still in the flush_list. */
| 3371 | buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); |
| 3372 | buf_flush_wait_LRU_batch_end(); |
| 3373 | |
| 3374 | bool success; |
| 3375 | |
| 3376 | do { |
| 3377 | pc_request(ULINT_MAX, LSN_MAX); |
| 3378 | |
| 3379 | while (pc_flush_slot() > 0) {} |
| 3380 | |
| 3381 | ulint n_flushed_lru = 0; |
| 3382 | ulint n_flushed_list = 0; |
| 3383 | success = pc_wait_finished(&n_flushed_lru, &n_flushed_list); |
| 3384 | |
| 3385 | n_flushed = n_flushed_lru + n_flushed_list; |
| 3386 | |
| 3387 | buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); |
| 3388 | buf_flush_wait_LRU_batch_end(); |
| 3389 | |
| 3390 | } while (!success || n_flushed > 0); |
| 3391 | |
| 3392 | /* Some sanity checks */ |
| 3393 | ut_a(srv_get_active_thread_type() == SRV_NONE); |
| 3394 | ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE); |
| 3395 | |
| 3396 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
| 3397 | buf_pool_t* buf_pool = buf_pool_from_array(i); |
| 3398 | ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0); |
| 3399 | } |
| 3400 | |
| 3401 | /* We have lived our life. Time to die. */ |
| 3402 | |
| 3403 | thread_exit: |
/* All worker threads are waiting on the event here and will no
longer access the page_cleaner structure. Wake the worker threads
up just to make them exit. */
| 3407 | page_cleaner.is_running = false; |
| 3408 | |
/* Wait for all worker threads to exit. */
| 3410 | while (page_cleaner.n_workers) { |
| 3411 | os_event_set(page_cleaner.is_requested); |
| 3412 | os_thread_sleep(10000); |
| 3413 | } |
| 3414 | |
| 3415 | mutex_destroy(&page_cleaner.mutex); |
| 3416 | |
| 3417 | os_event_destroy(page_cleaner.is_finished); |
| 3418 | os_event_destroy(page_cleaner.is_requested); |
| 3419 | os_event_destroy(page_cleaner.is_started); |
| 3420 | |
| 3421 | buf_page_cleaner_is_active = false; |
| 3422 | |
| 3423 | my_thread_end(); |
/* We count the number of threads in os_thread_exit(). A created
thread should always use that to exit instead of a plain return. */
| 3426 | os_thread_exit(); |
| 3427 | |
| 3428 | OS_THREAD_DUMMY_RETURN; |
| 3429 | } |
| 3430 | |
| 3431 | /** Adjust thread count for page cleaner workers. |
| 3432 | @param[in] new_cnt Number of threads to be used */ |
| 3433 | void |
| 3434 | buf_flush_set_page_cleaner_thread_cnt(ulong new_cnt) |
| 3435 | { |
| 3436 | mutex_enter(&page_cleaner.mutex); |
| 3437 | |
| 3438 | srv_n_page_cleaners = new_cnt; |
| 3439 | if (new_cnt > page_cleaner.n_workers) { |
| 3440 | /* User has increased the number of page |
| 3441 | cleaner threads. */ |
| 3442 | ulint add = new_cnt - page_cleaner.n_workers; |
| 3443 | for (ulint i = 0; i < add; i++) { |
| 3444 | os_thread_id_t cleaner_thread_id; |
| 3445 | os_thread_create(buf_flush_page_cleaner_worker, NULL, &cleaner_thread_id); |
| 3446 | } |
| 3447 | } |
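/* If the count was decreased, the superfluous workers notice the new
value of srv_n_page_cleaners at their next wakeup in
buf_flush_page_cleaner_worker() and exit on their own; the loop below
nudges them via is_requested. */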
| 3448 | |
| 3449 | mutex_exit(&page_cleaner.mutex); |
| 3450 | |
/* Wait until the number of running worker threads matches the requested count. */
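/* The coordinator is counted in srv_n_page_cleaners, hence the
expected number of worker threads is srv_n_page_cleaners - 1. */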
| 3452 | while (page_cleaner.is_running && |
| 3453 | page_cleaner.n_workers != (srv_n_page_cleaners - 1)) { |
| 3454 | os_event_set(page_cleaner.is_requested); |
| 3455 | os_event_reset(page_cleaner.is_started); |
| 3456 | os_event_wait_time(page_cleaner.is_started, 1000000); |
| 3457 | } |
| 3458 | } |
| 3459 | |
| 3460 | /******************************************************************//** |
| 3461 | Worker thread of page_cleaner. |
| 3462 | @return a dummy parameter */ |
| 3463 | extern "C" |
| 3464 | os_thread_ret_t |
| 3465 | DECLARE_THREAD(buf_flush_page_cleaner_worker)( |
| 3466 | /*==========================================*/ |
| 3467 | void* arg MY_ATTRIBUTE((unused))) |
| 3468 | /*!< in: a dummy parameter required by |
| 3469 | os_thread_create */ |
| 3470 | { |
| 3471 | my_thread_init(); |
| 3472 | #ifndef DBUG_OFF |
| 3473 | os_thread_id_t cleaner_thread_id = os_thread_get_curr_id(); |
| 3474 | #endif |
| 3475 | |
| 3476 | mutex_enter(&page_cleaner.mutex); |
| 3477 | ulint thread_no = page_cleaner.n_workers++; |
| 3478 | |
| 3479 | DBUG_LOG("ib_buf" , "Thread " << cleaner_thread_id |
| 3480 | << " started; n_workers=" << page_cleaner.n_workers); |
| 3481 | |
| 3482 | /* Signal that we have started */ |
| 3483 | os_event_set(page_cleaner.is_started); |
| 3484 | mutex_exit(&page_cleaner.mutex); |
| 3485 | |
| 3486 | #ifdef UNIV_LINUX |
/* Linux may allow a different priority setting for each thread;
it is worth trying to set a high priority for the page cleaner threads. */
| 3489 | if (buf_flush_page_cleaner_set_priority( |
| 3490 | buf_flush_page_cleaner_priority)) { |
| 3491 | |
| 3492 | ib::info() << "page_cleaner worker priority: " |
| 3493 | << buf_flush_page_cleaner_priority; |
| 3494 | } |
| 3495 | #endif /* UNIV_LINUX */ |
| 3496 | |
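/* Worker main loop: wait until the coordinator signals is_requested,
then process flush slots via pc_flush_slot(). Exit when the coordinator
clears is_running, or when this worker is no longer needed after a
decrease of srv_n_page_cleaners. */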
| 3497 | while (true) { |
| 3498 | os_event_wait(page_cleaner.is_requested); |
| 3499 | |
| 3500 | ut_d(buf_flush_page_cleaner_disabled_loop()); |
| 3501 | |
| 3502 | if (!page_cleaner.is_running) { |
| 3503 | break; |
| 3504 | } |
| 3505 | |
| 3506 | ut_ad(srv_n_page_cleaners >= 1); |
| 3507 | |
/* If the number of page cleaner threads has been decreased,
exit the workers that are no longer needed. */
| 3510 | if (srv_shutdown_state == SRV_SHUTDOWN_NONE && |
| 3511 | thread_no >= (srv_n_page_cleaners - 1)) { |
| 3512 | DBUG_LOG("ib_buf" , "Exiting " |
| 3513 | << thread_no |
| 3514 | << " page cleaner worker thread_id " |
| 3515 | << os_thread_pf(cleaner_thread_id) |
| 3516 | << " total threads " << srv_n_page_cleaners << "." ); |
| 3517 | break; |
| 3518 | } |
| 3519 | |
| 3520 | pc_flush_slot(); |
| 3521 | } |
| 3522 | |
| 3523 | mutex_enter(&page_cleaner.mutex); |
| 3524 | page_cleaner.n_workers--; |
| 3525 | |
| 3526 | DBUG_LOG("ib_buf" , "Thread " << cleaner_thread_id |
| 3527 | << " exiting; n_workers=" << page_cleaner.n_workers); |
| 3528 | |
| 3529 | /* Signal that we have stopped */ |
| 3530 | os_event_set(page_cleaner.is_started); |
| 3531 | mutex_exit(&page_cleaner.mutex); |
| 3532 | |
| 3533 | my_thread_end(); |
| 3534 | |
| 3535 | os_thread_exit(); |
| 3536 | |
| 3537 | OS_THREAD_DUMMY_RETURN; |
| 3538 | } |
| 3539 | |
| 3540 | /*******************************************************************//** |
| 3541 | Synchronously flush dirty blocks from the end of the flush list of all buffer |
| 3542 | pool instances. |
| 3543 | NOTE: The calling thread is not allowed to own any latches on pages! */ |
| 3544 | void |
| 3545 | buf_flush_sync_all_buf_pools(void) |
| 3546 | /*==============================*/ |
| 3547 | { |
| 3548 | bool success; |
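/* buf_flush_lists() returns false when the flush batch could not be
started in every buffer pool instance (typically because a batch was
already running there); in that case wait for the running batch to
end and retry. */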
| 3549 | do { |
| 3550 | success = buf_flush_lists(ULINT_MAX, LSN_MAX, NULL); |
| 3551 | buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); |
| 3552 | } while (!success); |
| 3553 | |
| 3554 | ut_a(success); |
| 3555 | } |
| 3556 | |
| 3557 | /** Request IO burst and wake page_cleaner up. |
| 3558 | @param[in] lsn_limit upper limit of LSN to be flushed */ |
| 3559 | void |
| 3560 | buf_flush_request_force( |
| 3561 | lsn_t lsn_limit) |
| 3562 | { |
/* Adjust the target by lsn_avg_rate so that the request does not become stale. */
| 3564 | lsn_t lsn_target = lsn_limit + lsn_avg_rate * 3; |
| 3565 | |
| 3566 | mutex_enter(&page_cleaner.mutex); |
| 3567 | if (lsn_target > buf_flush_sync_lsn) { |
| 3568 | buf_flush_sync_lsn = lsn_target; |
| 3569 | } |
| 3570 | mutex_exit(&page_cleaner.mutex); |
| 3571 | |
| 3572 | os_event_set(buf_flush_event); |
| 3573 | } |
| 3574 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
| 3575 | |
| 3576 | /** Functor to validate the flush list. */ |
| 3577 | struct Check { |
| 3578 | void operator()(const buf_page_t* elem) |
| 3579 | { |
| 3580 | ut_a(elem->in_flush_list); |
| 3581 | } |
| 3582 | }; |
| 3583 | |
| 3584 | /******************************************************************//** |
| 3585 | Validates the flush list. |
| 3586 | @return TRUE if ok */ |
| 3587 | static |
| 3588 | ibool |
| 3589 | buf_flush_validate_low( |
| 3590 | /*===================*/ |
| 3591 | buf_pool_t* buf_pool) /*!< in: Buffer pool instance */ |
| 3592 | { |
| 3593 | buf_page_t* bpage; |
| 3594 | const ib_rbt_node_t* rnode = NULL; |
| 3595 | Check check; |
| 3596 | |
| 3597 | ut_ad(buf_flush_list_mutex_own(buf_pool)); |
| 3598 | |
| 3599 | ut_list_validate(buf_pool->flush_list, check); |
| 3600 | |
| 3601 | bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); |
| 3602 | |
/* If we are in recovery mode, i.e. flush_rbt != NULL,
then each block in the flush_list must also be present
in the flush_rbt. */
| 3606 | if (buf_pool->flush_rbt != NULL) { |
| 3607 | rnode = rbt_first(buf_pool->flush_rbt); |
| 3608 | } |
| 3609 | |
| 3610 | while (bpage != NULL) { |
| 3611 | const lsn_t om = bpage->oldest_modification; |
| 3612 | |
| 3613 | ut_ad(buf_pool_from_bpage(bpage) == buf_pool); |
| 3614 | |
| 3615 | ut_ad(bpage->in_flush_list); |
| 3616 | |
| 3617 | /* A page in buf_pool->flush_list can be in |
| 3618 | BUF_BLOCK_REMOVE_HASH state. This happens when a page |
| 3619 | is in the middle of being relocated. In that case the |
| 3620 | original descriptor can have this state and still be |
| 3621 | in the flush list waiting to acquire the |
| 3622 | buf_pool->flush_list_mutex to complete the relocation. */ |
| 3623 | ut_a(buf_page_in_file(bpage) |
| 3624 | || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH); |
| 3625 | ut_a(om > 0); |
| 3626 | |
| 3627 | if (buf_pool->flush_rbt != NULL) { |
| 3628 | buf_page_t** prpage; |
| 3629 | |
| 3630 | ut_a(rnode != NULL); |
| 3631 | prpage = rbt_value(buf_page_t*, rnode); |
| 3632 | |
| 3633 | ut_a(*prpage != NULL); |
| 3634 | ut_a(*prpage == bpage); |
| 3635 | rnode = rbt_next(buf_pool->flush_rbt, rnode); |
| 3636 | } |
| 3637 | |
| 3638 | bpage = UT_LIST_GET_NEXT(list, bpage); |
| 3639 | |
| 3640 | ut_a(bpage == NULL || om >= bpage->oldest_modification); |
| 3641 | } |
| 3642 | |
| 3643 | /* By this time we must have exhausted the traversal of |
| 3644 | flush_rbt (if active) as well. */ |
| 3645 | ut_a(rnode == NULL); |
| 3646 | |
| 3647 | return(TRUE); |
| 3648 | } |
| 3649 | |
| 3650 | /******************************************************************//** |
| 3651 | Validates the flush list. |
| 3652 | @return TRUE if ok */ |
| 3653 | ibool |
| 3654 | buf_flush_validate( |
| 3655 | /*===============*/ |
| 3656 | buf_pool_t* buf_pool) /*!< buffer pool instance */ |
| 3657 | { |
| 3658 | ibool ret; |
| 3659 | |
| 3660 | buf_flush_list_mutex_enter(buf_pool); |
| 3661 | |
| 3662 | ret = buf_flush_validate_low(buf_pool); |
| 3663 | |
| 3664 | buf_flush_list_mutex_exit(buf_pool); |
| 3665 | |
| 3666 | return(ret); |
| 3667 | } |
| 3668 | |
| 3669 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
| 3670 | |
| 3671 | /******************************************************************//** |
| 3672 | Check if there are any dirty pages that belong to a space id in the flush |
| 3673 | list in a particular buffer pool. |
| 3674 | @return number of dirty pages present in a single buffer pool */ |
| 3675 | ulint |
| 3676 | buf_pool_get_dirty_pages_count( |
| 3677 | /*===========================*/ |
| 3678 | buf_pool_t* buf_pool, /*!< in: buffer pool */ |
| 3679 | ulint id, /*!< in: space id to check */ |
FlushObserver*	observer)	/*!< in: flush observer to check */
{
| 3683 | ulint count = 0; |
| 3684 | |
| 3685 | buf_pool_mutex_enter(buf_pool); |
| 3686 | buf_flush_list_mutex_enter(buf_pool); |
| 3687 | |
| 3688 | buf_page_t* bpage; |
| 3689 | |
| 3690 | for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); |
| 3691 | bpage != 0; |
| 3692 | bpage = UT_LIST_GET_NEXT(list, bpage)) { |
| 3693 | |
| 3694 | ut_ad(buf_page_in_file(bpage)); |
| 3695 | ut_ad(bpage->in_flush_list); |
| 3696 | ut_ad(bpage->oldest_modification > 0); |
| 3697 | |
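/* When a flush observer is given, count the pages attached to that
observer; otherwise count the pages that belong to the given
tablespace id. */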
| 3698 | if ((observer != NULL |
| 3699 | && observer == bpage->flush_observer) |
| 3700 | || (observer == NULL |
| 3701 | && id == bpage->id.space())) { |
| 3702 | ++count; |
| 3703 | } |
| 3704 | } |
| 3705 | |
| 3706 | buf_flush_list_mutex_exit(buf_pool); |
| 3707 | buf_pool_mutex_exit(buf_pool); |
| 3708 | |
| 3709 | return(count); |
| 3710 | } |
| 3711 | |
| 3712 | /******************************************************************//** |
| 3713 | Check if there are any dirty pages that belong to a space id in the flush list. |
| 3714 | @return number of dirty pages present in all the buffer pools */ |
| 3715 | static |
| 3716 | ulint |
| 3717 | buf_flush_get_dirty_pages_count( |
| 3718 | /*============================*/ |
| 3719 | ulint id, /*!< in: space id to check */ |
| 3720 | FlushObserver* observer) /*!< in: flush observer to check */ |
| 3721 | { |
| 3722 | ulint count = 0; |
| 3723 | |
| 3724 | for (ulint i = 0; i < srv_buf_pool_instances; ++i) { |
| 3725 | buf_pool_t* buf_pool; |
| 3726 | |
| 3727 | buf_pool = buf_pool_from_array(i); |
| 3728 | |
| 3729 | count += buf_pool_get_dirty_pages_count(buf_pool, id, observer); |
| 3730 | } |
| 3731 | |
| 3732 | return(count); |
| 3733 | } |
| 3734 | |
| 3735 | /** FlushObserver constructor |
| 3736 | @param[in] space tablespace |
| 3737 | @param[in] trx trx instance |
| 3738 | @param[in] stage performance schema accounting object, |
| 3739 | used by ALTER TABLE. It is passed to log_preflush_pool_modified_pages() |
| 3740 | for accounting. */ |
| 3741 | FlushObserver::FlushObserver( |
| 3742 | fil_space_t* space, |
| 3743 | trx_t* trx, |
| 3744 | ut_stage_alter_t* stage) |
| 3745 | : |
| 3746 | m_space(space), |
| 3747 | m_trx(trx), |
| 3748 | m_stage(stage), |
| 3749 | m_interrupted(false) |
| 3750 | { |
| 3751 | m_flushed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances)); |
| 3752 | m_removed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances)); |
| 3753 | |
| 3754 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
| 3755 | m_flushed->at(i) = 0; |
| 3756 | m_removed->at(i) = 0; |
| 3757 | } |
| 3758 | |
| 3759 | DBUG_LOG("flush" , "FlushObserver(): trx->id=" << m_trx->id); |
| 3760 | } |
| 3761 | |
/** FlushObserver destructor */
| 3763 | FlushObserver::~FlushObserver() |
| 3764 | { |
| 3765 | ut_ad(buf_flush_get_dirty_pages_count(m_space->id, this) == 0); |
| 3766 | |
| 3767 | UT_DELETE(m_flushed); |
| 3768 | UT_DELETE(m_removed); |
| 3769 | |
| 3770 | DBUG_LOG("flush" , "~FlushObserver(): trx->id=" << m_trx->id); |
| 3771 | } |
| 3772 | |
| 3773 | /** Check whether trx is interrupted |
| 3774 | @return true if trx is interrupted */ |
| 3775 | bool |
| 3776 | FlushObserver::check_interrupted() |
| 3777 | { |
| 3778 | if (trx_is_interrupted(m_trx)) { |
| 3779 | interrupted(); |
| 3780 | |
| 3781 | return(true); |
| 3782 | } |
| 3783 | |
| 3784 | return(false); |
| 3785 | } |
| 3786 | |
| 3787 | /** Notify observer of a flush |
| 3788 | @param[in] buf_pool buffer pool instance |
| 3789 | @param[in] bpage buffer page to flush */ |
| 3790 | void |
| 3791 | FlushObserver::notify_flush( |
| 3792 | buf_pool_t* buf_pool, |
| 3793 | buf_page_t* bpage) |
| 3794 | { |
| 3795 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 3796 | |
| 3797 | m_flushed->at(buf_pool->instance_no)++; |
| 3798 | |
| 3799 | if (m_stage != NULL) { |
| 3800 | m_stage->inc(); |
| 3801 | } |
| 3802 | |
| 3803 | DBUG_LOG("flush" , "Flush " << bpage->id); |
| 3804 | } |
| 3805 | |
| 3806 | /** Notify observer of a remove |
| 3807 | @param[in] buf_pool buffer pool instance |
@param[in]	bpage	buffer page being removed */
| 3809 | void |
| 3810 | FlushObserver::notify_remove( |
| 3811 | buf_pool_t* buf_pool, |
| 3812 | buf_page_t* bpage) |
| 3813 | { |
| 3814 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 3815 | |
| 3816 | m_removed->at(buf_pool->instance_no)++; |
| 3817 | |
| 3818 | DBUG_LOG("flush" , "Remove " << bpage->id); |
| 3819 | } |
| 3820 | |
| 3821 | /** Flush dirty pages and wait. */ |
| 3822 | void |
| 3823 | FlushObserver::flush() |
| 3824 | { |
| 3825 | ut_ad(m_trx); |
| 3826 | |
| 3827 | if (!m_interrupted && m_stage) { |
| 3828 | m_stage->begin_phase_flush(buf_flush_get_dirty_pages_count( |
| 3829 | m_space->id, this)); |
| 3830 | } |
| 3831 | |
| 3832 | buf_LRU_flush_or_remove_pages(m_space->id, this); |
| 3833 | |
/* Wait until all dirty pages have been flushed. */
| 3835 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
| 3836 | while (!is_complete(i)) { |
| 3837 | |
| 3838 | os_thread_sleep(2000); |
| 3839 | } |
| 3840 | } |
| 3841 | } |
| 3842 | |