1 | /***************************************************************************** |
2 | |
3 | Copyright (c) 1995, 2017, Oracle and/or its affiliates. All Rights Reserved. |
4 | Copyright (c) 2013, 2018, MariaDB Corporation. |
5 | Copyright (c) 2013, 2014, Fusion-io |
6 | |
7 | This program is free software; you can redistribute it and/or modify it under |
8 | the terms of the GNU General Public License as published by the Free Software |
9 | Foundation; version 2 of the License. |
10 | |
11 | This program is distributed in the hope that it will be useful, but WITHOUT |
12 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
13 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
14 | |
15 | You should have received a copy of the GNU General Public License along with |
16 | this program; if not, write to the Free Software Foundation, Inc., |
17 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
18 | |
19 | *****************************************************************************/ |
20 | |
21 | /**************************************************//** |
22 | @file buf/buf0flu.cc |
23 | The database buffer buf_pool flush algorithm |
24 | |
25 | Created 11/11/1995 Heikki Tuuri |
26 | *******************************************************/ |
27 | |
28 | #include "ha_prototypes.h" |
29 | #include <mysql/service_thd_wait.h> |
30 | #include <my_dbug.h> |
31 | #include <sql_class.h> |
32 | |
33 | #include "buf0flu.h" |
34 | #include "buf0buf.h" |
35 | #include "buf0checksum.h" |
36 | #include "srv0start.h" |
37 | #include "srv0srv.h" |
38 | #include "page0zip.h" |
39 | #include "ut0byte.h" |
40 | #include "page0page.h" |
41 | #include "fil0fil.h" |
42 | #include "buf0lru.h" |
43 | #include "buf0rea.h" |
44 | #include "ibuf0ibuf.h" |
45 | #include "log0log.h" |
46 | #include "os0file.h" |
47 | #include "trx0sys.h" |
48 | #include "srv0mon.h" |
49 | #include "fsp0sysspace.h" |
50 | #include "ut0stage.h" |
51 | #include "fil0pagecompress.h" |
52 | #ifdef UNIV_LINUX |
53 | /* include defs for CPU time priority settings */ |
54 | #include <unistd.h> |
55 | #include <sys/syscall.h> |
56 | #include <sys/time.h> |
57 | #include <sys/resource.h> |
58 | static const int buf_flush_page_cleaner_priority = -20; |
59 | #endif /* UNIV_LINUX */ |
60 | |
61 | /** Sleep time in microseconds for loop waiting for the oldest |
62 | modification lsn */ |
63 | static const ulint buf_flush_wait_flushed_sleep_time = 10000; |
64 | |
65 | #include <my_service_manager.h> |
66 | |
/** Number of pages flushed through non-flush_list flushes. */
68 | static ulint buf_lru_flush_page_count = 0; |
69 | |
/** Flag indicating if the page_cleaner is in an active state. This flag
is set to true by the page_cleaner thread when it is spawned and is set
back to false at shutdown by the page_cleaner as well. Therefore there is
no need to protect it by a mutex. It is only ever read by the thread
doing the shutdown. */
75 | bool buf_page_cleaner_is_active; |
76 | |
77 | /** Factor for scan length to determine n_pages for intended oldest LSN |
78 | progress */ |
79 | static ulint buf_flush_lsn_scan_factor = 3; |
80 | |
81 | /** Average redo generation rate */ |
82 | static lsn_t lsn_avg_rate = 0; |
83 | |
84 | /** Target oldest LSN for the requested flush_sync */ |
85 | static lsn_t buf_flush_sync_lsn = 0; |
86 | |
87 | #ifdef UNIV_PFS_THREAD |
88 | mysql_pfs_key_t page_cleaner_thread_key; |
89 | #endif /* UNIV_PFS_THREAD */ |
90 | |
91 | /** Event to synchronise with the flushing. */ |
92 | os_event_t buf_flush_event; |
93 | |
94 | /** State for page cleaner array slot */ |
95 | enum page_cleaner_state_t { |
	/** Nothing requested yet.
97 | Moved from FINISHED by the coordinator. */ |
98 | PAGE_CLEANER_STATE_NONE = 0, |
99 | /** Requested but not started flushing. |
100 | Moved from NONE by the coordinator. */ |
101 | PAGE_CLEANER_STATE_REQUESTED, |
	/** Flushing is ongoing.
103 | Moved from REQUESTED by the worker. */ |
104 | PAGE_CLEANER_STATE_FLUSHING, |
105 | /** Flushing was finished. |
106 | Moved from FLUSHING by the worker. */ |
107 | PAGE_CLEANER_STATE_FINISHED |
108 | }; |
109 | |
110 | /** Page cleaner request state for each buffer pool instance */ |
111 | struct page_cleaner_slot_t { |
	page_cleaner_state_t	state;	/*!< state of the request,
					protected by page_cleaner_t::mutex.
					Once the worker thread has taken the
					slot and set it to
					PAGE_CLEANER_STATE_FLUSHING,
					n_flushed_lru and n_flushed_list may
					be updated only by that worker
					thread */
118 | /* This value is set during state==PAGE_CLEANER_STATE_NONE */ |
119 | ulint n_pages_requested; |
120 | /*!< number of requested pages |
121 | for the slot */ |
122 | /* These values are updated during state==PAGE_CLEANER_STATE_FLUSHING, |
	and committed with state==PAGE_CLEANER_STATE_FINISHED.
124 | The consistency is protected by the 'state' */ |
125 | ulint n_flushed_lru; |
126 | /*!< number of flushed pages |
127 | by LRU scan flushing */ |
128 | ulint n_flushed_list; |
129 | /*!< number of flushed pages |
130 | by flush_list flushing */ |
131 | bool succeeded_list; |
132 | /*!< true if flush_list flushing |
133 | succeeded. */ |
134 | ulint flush_lru_time; |
135 | /*!< elapsed time for LRU flushing */ |
136 | ulint flush_list_time; |
137 | /*!< elapsed time for flush_list |
138 | flushing */ |
139 | ulint flush_lru_pass; |
					/*!< number of LRU flushing attempts */
141 | ulint flush_list_pass; |
					/*!< number of flush_list flushing
					attempts */
144 | }; |
145 | |
146 | /** Page cleaner structure common for all threads */ |
147 | struct page_cleaner_t { |
148 | ib_mutex_t mutex; /*!< mutex to protect whole of |
149 | page_cleaner_t struct and |
150 | page_cleaner_slot_t slots. */ |
151 | os_event_t is_requested; /*!< event to activate worker |
152 | threads. */ |
153 | os_event_t is_finished; /*!< event to signal that all |
154 | slots were finished. */ |
155 | os_event_t is_started; /*!< event to signal that |
156 | thread is started/exiting */ |
157 | volatile ulint n_workers; /*!< number of worker threads |
158 | in existence */ |
	bool			requested;	/*!< true if a flush of pages
					has been requested */
161 | lsn_t lsn_limit; /*!< upper limit of LSN to be |
162 | flushed */ |
163 | ulint n_slots; /*!< total number of slots */ |
164 | ulint n_slots_requested; |
165 | /*!< number of slots |
166 | in the state |
167 | PAGE_CLEANER_STATE_REQUESTED */ |
168 | ulint n_slots_flushing; |
169 | /*!< number of slots |
170 | in the state |
171 | PAGE_CLEANER_STATE_FLUSHING */ |
172 | ulint n_slots_finished; |
173 | /*!< number of slots |
174 | in the state |
175 | PAGE_CLEANER_STATE_FINISHED */ |
176 | ulint flush_time; /*!< elapsed time to flush |
177 | requests for all slots */ |
	ulint			flush_pass;	/*!< number of passes made to
					flush requests for all slots */
180 | page_cleaner_slot_t slots[MAX_BUFFER_POOLS]; |
	bool			is_running;	/*!< false if a shutdown
					has been initiated */
183 | |
184 | #ifdef UNIV_DEBUG |
185 | ulint n_disabled_debug; |
					/*!< number of page cleaner threads
					that have been disabled */
188 | #endif /* UNIV_DEBUG */ |
189 | }; |
190 | |
191 | static page_cleaner_t page_cleaner; |
192 | |
193 | #ifdef UNIV_DEBUG |
194 | my_bool innodb_page_cleaner_disabled_debug; |
195 | #endif /* UNIV_DEBUG */ |
196 | |
/** If the LRU list of a buf_pool is shorter than this, LRU eviction
should not happen. This is because when we do LRU flushing we also put
the blocks on the free list. If the LRU list is very small, we can end
up thrashing. */
201 | #define BUF_LRU_MIN_LEN 256 |
202 | |
203 | /* @} */ |
204 | |
205 | /******************************************************************//** |
Increases the flush_list size in bytes by the physical page size. */
207 | static inline |
208 | void |
209 | incr_flush_list_size_in_bytes( |
210 | /*==========================*/ |
211 | buf_block_t* block, /*!< in: control block */ |
212 | buf_pool_t* buf_pool) /*!< in: buffer pool instance */ |
213 | { |
214 | ut_ad(buf_flush_list_mutex_own(buf_pool)); |
215 | |
216 | buf_pool->stat.flush_list_bytes += block->page.size.physical(); |
217 | |
218 | ut_ad(buf_pool->stat.flush_list_bytes <= buf_pool->curr_pool_size); |
219 | } |
220 | |
221 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
222 | /******************************************************************//** |
223 | Validates the flush list. |
224 | @return TRUE if ok */ |
225 | static |
226 | ibool |
227 | buf_flush_validate_low( |
228 | /*===================*/ |
229 | buf_pool_t* buf_pool); /*!< in: Buffer pool instance */ |
230 | |
231 | /******************************************************************//** |
232 | Validates the flush list some of the time. |
233 | @return TRUE if ok or the check was skipped */ |
234 | static |
235 | ibool |
236 | buf_flush_validate_skip( |
237 | /*====================*/ |
238 | buf_pool_t* buf_pool) /*!< in: Buffer pool instance */ |
239 | { |
	/** Call buf_flush_validate_low() once in this many invocations */
241 | # define BUF_FLUSH_VALIDATE_SKIP 23 |
242 | |
243 | /** The buf_flush_validate_low() call skip counter. |
244 | Use a signed type because of the race condition below. */ |
245 | static int buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP; |
246 | |
247 | /* There is a race condition below, but it does not matter, |
248 | because this call is only for heuristic purposes. We want to |
249 | reduce the call frequency of the costly buf_flush_validate_low() |
250 | check in debug builds. */ |
251 | if (--buf_flush_validate_count > 0) { |
252 | return(TRUE); |
253 | } |
254 | |
255 | buf_flush_validate_count = BUF_FLUSH_VALIDATE_SKIP; |
256 | return(buf_flush_validate_low(buf_pool)); |
257 | } |
258 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
259 | |
260 | /******************************************************************//** |
Inserts a block into the flush_rbt and returns a pointer to its
predecessor, or NULL if there is no predecessor. The ordering is maintained
263 | on the basis of the <oldest_modification, space, offset> key. |
264 | @return pointer to the predecessor or NULL if no predecessor. */ |
265 | static |
266 | buf_page_t* |
267 | buf_flush_insert_in_flush_rbt( |
268 | /*==========================*/ |
269 | buf_page_t* bpage) /*!< in: bpage to be inserted. */ |
270 | { |
271 | const ib_rbt_node_t* c_node; |
272 | const ib_rbt_node_t* p_node; |
273 | buf_page_t* prev = NULL; |
274 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
275 | |
276 | ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE); |
277 | ut_ad(buf_flush_list_mutex_own(buf_pool)); |
278 | |
279 | /* Insert this buffer into the rbt. */ |
280 | c_node = rbt_insert(buf_pool->flush_rbt, &bpage, &bpage); |
281 | ut_a(c_node != NULL); |
282 | |
283 | /* Get the predecessor. */ |
284 | p_node = rbt_prev(buf_pool->flush_rbt, c_node); |
285 | |
286 | if (p_node != NULL) { |
287 | buf_page_t** value; |
288 | value = rbt_value(buf_page_t*, p_node); |
289 | prev = *value; |
290 | ut_a(prev != NULL); |
291 | } |
292 | |
293 | return(prev); |
294 | } |
295 | |
296 | /*********************************************************//** |
297 | Delete a bpage from the flush_rbt. */ |
298 | static |
299 | void |
300 | buf_flush_delete_from_flush_rbt( |
301 | /*============================*/ |
302 | buf_page_t* bpage) /*!< in: bpage to be removed. */ |
303 | { |
304 | #ifdef UNIV_DEBUG |
305 | ibool ret = FALSE; |
306 | #endif /* UNIV_DEBUG */ |
307 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
308 | |
309 | ut_ad(buf_flush_list_mutex_own(buf_pool)); |
310 | |
311 | #ifdef UNIV_DEBUG |
312 | ret = |
313 | #endif /* UNIV_DEBUG */ |
314 | rbt_delete(buf_pool->flush_rbt, &bpage); |
315 | |
316 | ut_ad(ret); |
317 | } |
318 | |
319 | /*****************************************************************//** |
320 | Compare two modified blocks in the buffer pool. The key for comparison |
321 | is: |
322 | key = <oldest_modification, space, offset> |
This comparison is used to maintain the ordering of blocks in the
324 | buf_pool->flush_rbt. |
325 | Note that for the purpose of flush_rbt, we only need to order blocks |
326 | on the oldest_modification. The other two fields are used to uniquely |
327 | identify the blocks. |
328 | @return < 0 if b2 < b1, 0 if b2 == b1, > 0 if b2 > b1 */ |
329 | static |
330 | int |
331 | buf_flush_block_cmp( |
332 | /*================*/ |
333 | const void* p1, /*!< in: block1 */ |
334 | const void* p2) /*!< in: block2 */ |
335 | { |
336 | int ret; |
337 | const buf_page_t* b1 = *(const buf_page_t**) p1; |
338 | const buf_page_t* b2 = *(const buf_page_t**) p2; |
339 | |
340 | ut_ad(b1 != NULL); |
341 | ut_ad(b2 != NULL); |
342 | |
343 | #ifdef UNIV_DEBUG |
344 | buf_pool_t* buf_pool = buf_pool_from_bpage(b1); |
345 | #endif /* UNIV_DEBUG */ |
346 | |
347 | ut_ad(buf_flush_list_mutex_own(buf_pool)); |
348 | |
349 | ut_ad(b1->in_flush_list); |
350 | ut_ad(b2->in_flush_list); |
351 | |
352 | if (b2->oldest_modification > b1->oldest_modification) { |
353 | return(1); |
354 | } else if (b2->oldest_modification < b1->oldest_modification) { |
355 | return(-1); |
356 | } |
357 | |
	/* If oldest_modification is the same, then decide on the space id. */
359 | ret = (int)(b2->id.space() - b1->id.space()); |
360 | |
361 | /* Or else decide ordering on the page number. */ |
362 | return(ret ? ret : (int) (b2->id.page_no() - b1->id.page_no())); |
363 | } |
364 | |
365 | /********************************************************************//** |
366 | Initialize the red-black tree to speed up insertions into the flush_list |
during the recovery process. Should be called at the start of the recovery
process, before any page has been read or written. */
369 | void |
370 | buf_flush_init_flush_rbt(void) |
371 | /*==========================*/ |
372 | { |
373 | ulint i; |
374 | |
375 | for (i = 0; i < srv_buf_pool_instances; i++) { |
376 | buf_pool_t* buf_pool; |
377 | |
378 | buf_pool = buf_pool_from_array(i); |
379 | |
380 | buf_flush_list_mutex_enter(buf_pool); |
381 | |
382 | ut_ad(buf_pool->flush_rbt == NULL); |
383 | |
384 | /* Create red black tree for speedy insertions in flush list. */ |
385 | buf_pool->flush_rbt = rbt_create( |
386 | sizeof(buf_page_t*), buf_flush_block_cmp); |
387 | |
388 | buf_flush_list_mutex_exit(buf_pool); |
389 | } |
390 | } |
391 | |
392 | /********************************************************************//** |
393 | Frees up the red-black tree. */ |
394 | void |
395 | buf_flush_free_flush_rbt(void) |
396 | /*==========================*/ |
397 | { |
398 | ulint i; |
399 | |
400 | for (i = 0; i < srv_buf_pool_instances; i++) { |
401 | buf_pool_t* buf_pool; |
402 | |
403 | buf_pool = buf_pool_from_array(i); |
404 | |
405 | buf_flush_list_mutex_enter(buf_pool); |
406 | |
407 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
408 | ut_a(buf_flush_validate_low(buf_pool)); |
409 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
410 | |
411 | rbt_free(buf_pool->flush_rbt); |
412 | buf_pool->flush_rbt = NULL; |
413 | |
414 | buf_flush_list_mutex_exit(buf_pool); |
415 | } |
416 | } |
417 | |
418 | /********************************************************************//** |
419 | Inserts a modified block into the flush list. */ |
420 | void |
421 | buf_flush_insert_into_flush_list( |
422 | /*=============================*/ |
423 | buf_pool_t* buf_pool, /*!< buffer pool instance */ |
424 | buf_block_t* block, /*!< in/out: block which is modified */ |
425 | lsn_t lsn) /*!< in: oldest modification */ |
426 | { |
427 | ut_ad(!buf_pool_mutex_own(buf_pool)); |
428 | ut_ad(log_flush_order_mutex_own()); |
429 | ut_ad(buf_page_mutex_own(block)); |
430 | |
431 | buf_flush_list_mutex_enter(buf_pool); |
432 | |
433 | ut_ad((UT_LIST_GET_FIRST(buf_pool->flush_list) == NULL) |
434 | || (UT_LIST_GET_FIRST(buf_pool->flush_list)->oldest_modification |
435 | <= lsn)); |
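
	/* Note: the flush_list is kept ordered by oldest_modification,
	descending from head to tail, and the block inserted here becomes
	the new head; hence the assertion above that the current head does
	not carry a newer oldest_modification than 'lsn'. */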
436 | |
437 | /* If we are in the recovery then we need to update the flush |
438 | red-black tree as well. */ |
439 | if (buf_pool->flush_rbt != NULL) { |
440 | buf_flush_list_mutex_exit(buf_pool); |
441 | buf_flush_insert_sorted_into_flush_list(buf_pool, block, lsn); |
442 | return; |
443 | } |
444 | |
445 | ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); |
446 | ut_ad(!block->page.in_flush_list); |
447 | |
448 | ut_d(block->page.in_flush_list = TRUE); |
449 | block->page.oldest_modification = lsn; |
450 | |
451 | UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page); |
452 | |
453 | incr_flush_list_size_in_bytes(block, buf_pool); |
454 | |
455 | #ifdef UNIV_DEBUG_VALGRIND |
456 | void* p; |
457 | |
458 | if (block->page.size.is_compressed()) { |
459 | p = block->page.zip.data; |
460 | } else { |
461 | p = block->frame; |
462 | } |
463 | |
464 | UNIV_MEM_ASSERT_RW(p, block->page.size.physical()); |
465 | #endif /* UNIV_DEBUG_VALGRIND */ |
466 | |
467 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
468 | ut_a(buf_flush_validate_skip(buf_pool)); |
469 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
470 | |
471 | buf_flush_list_mutex_exit(buf_pool); |
472 | } |
473 | |
474 | /********************************************************************//** |
475 | Inserts a modified block into the flush list in the right sorted position. |
476 | This function is used by recovery, because there the modifications do not |
477 | necessarily come in the order of lsn's. */ |
478 | void |
479 | buf_flush_insert_sorted_into_flush_list( |
480 | /*====================================*/ |
481 | buf_pool_t* buf_pool, /*!< in: buffer pool instance */ |
482 | buf_block_t* block, /*!< in/out: block which is modified */ |
483 | lsn_t lsn) /*!< in: oldest modification */ |
484 | { |
485 | buf_page_t* prev_b; |
486 | buf_page_t* b; |
487 | |
488 | ut_ad(srv_shutdown_state != SRV_SHUTDOWN_FLUSH_PHASE); |
489 | ut_ad(!buf_pool_mutex_own(buf_pool)); |
490 | ut_ad(log_flush_order_mutex_own()); |
491 | ut_ad(buf_page_mutex_own(block)); |
492 | ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); |
493 | |
494 | buf_flush_list_mutex_enter(buf_pool); |
495 | |
496 | /* The field in_LRU_list is protected by buf_pool->mutex, which |
497 | we are not holding. However, while a block is in the flush |
	list, it is dirty and cannot be discarded, neither from the
	page_hash nor from the LRU list. At most, the uncompressed
500 | page frame of a compressed block may be discarded or created |
501 | (copying the block->page to or from a buf_page_t that is |
502 | dynamically allocated from buf_buddy_alloc()). Because those |
503 | transitions hold block->mutex and the flush list mutex (via |
504 | buf_flush_relocate_on_flush_list()), there is no possibility |
505 | of a race condition in the assertions below. */ |
506 | ut_ad(block->page.in_LRU_list); |
507 | ut_ad(block->page.in_page_hash); |
508 | /* buf_buddy_block_register() will take a block in the |
509 | BUF_BLOCK_MEMORY state, not a file page. */ |
510 | ut_ad(!block->page.in_zip_hash); |
511 | |
512 | ut_ad(!block->page.in_flush_list); |
513 | ut_d(block->page.in_flush_list = TRUE); |
514 | block->page.oldest_modification = lsn; |
515 | |
516 | #ifdef UNIV_DEBUG_VALGRIND |
517 | void* p; |
518 | |
519 | if (block->page.size.is_compressed()) { |
520 | p = block->page.zip.data; |
521 | } else { |
522 | p = block->frame; |
523 | } |
524 | |
525 | UNIV_MEM_ASSERT_RW(p, block->page.size.physical()); |
526 | #endif /* UNIV_DEBUG_VALGRIND */ |
527 | |
528 | prev_b = NULL; |
529 | |
530 | /* For the most part when this function is called the flush_rbt |
531 | should not be NULL. In a very rare boundary case it is possible |
532 | that the flush_rbt has already been freed by the recovery thread |
533 | before the last page was hooked up in the flush_list by the |
534 | io-handler thread. In that case we'll just do a simple |
535 | linear search in the else block. */ |
536 | if (buf_pool->flush_rbt != NULL) { |
537 | |
538 | prev_b = buf_flush_insert_in_flush_rbt(&block->page); |
539 | |
540 | } else { |
541 | |
542 | b = UT_LIST_GET_FIRST(buf_pool->flush_list); |
543 | |
544 | while (b != NULL && b->oldest_modification |
545 | > block->page.oldest_modification) { |
546 | |
547 | ut_ad(b->in_flush_list); |
548 | prev_b = b; |
549 | b = UT_LIST_GET_NEXT(list, b); |
550 | } |
551 | } |
552 | |
553 | if (prev_b == NULL) { |
554 | UT_LIST_ADD_FIRST(buf_pool->flush_list, &block->page); |
555 | } else { |
556 | UT_LIST_INSERT_AFTER(buf_pool->flush_list, prev_b, &block->page); |
557 | } |
558 | |
559 | incr_flush_list_size_in_bytes(block, buf_pool); |
560 | |
561 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
562 | ut_a(buf_flush_validate_low(buf_pool)); |
563 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
564 | |
565 | buf_flush_list_mutex_exit(buf_pool); |
566 | } |
567 | |
568 | /********************************************************************//** |
569 | Returns TRUE if the file page block is immediately suitable for replacement, |
i.e., the transition FILE_PAGE => NOT_USED is allowed.
571 | @return TRUE if can replace immediately */ |
572 | ibool |
573 | buf_flush_ready_for_replace( |
574 | /*========================*/ |
575 | buf_page_t* bpage) /*!< in: buffer control block, must be |
576 | buf_page_in_file(bpage) and in the LRU list */ |
577 | { |
578 | #ifdef UNIV_DEBUG |
579 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
580 | ut_ad(buf_pool_mutex_own(buf_pool)); |
581 | #endif /* UNIV_DEBUG */ |
582 | ut_ad(mutex_own(buf_page_get_mutex(bpage))); |
583 | ut_ad(bpage->in_LRU_list); |
584 | |
585 | if (buf_page_in_file(bpage)) { |
586 | |
587 | return(bpage->oldest_modification == 0 |
588 | && bpage->buf_fix_count == 0 |
589 | && buf_page_get_io_fix(bpage) == BUF_IO_NONE); |
590 | } |
591 | |
592 | ib::fatal() << "Buffer block " << bpage << " state " << bpage->state |
593 | << " in the LRU list!" ; |
594 | |
595 | return(FALSE); |
596 | } |
597 | |
598 | /********************************************************************//** |
599 | Returns true if the block is modified and ready for flushing. |
600 | @return true if can flush immediately */ |
601 | bool |
602 | buf_flush_ready_for_flush( |
603 | /*======================*/ |
604 | buf_page_t* bpage, /*!< in: buffer control block, must be |
605 | buf_page_in_file(bpage) */ |
606 | buf_flush_t flush_type)/*!< in: type of flush */ |
607 | { |
608 | #ifdef UNIV_DEBUG |
609 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
610 | ut_ad(buf_pool_mutex_own(buf_pool)); |
611 | #endif /* UNIV_DEBUG */ |
612 | |
613 | ut_a(buf_page_in_file(bpage)); |
614 | ut_ad(mutex_own(buf_page_get_mutex(bpage))); |
615 | ut_ad(flush_type < BUF_FLUSH_N_TYPES); |
616 | |
617 | if (bpage->oldest_modification == 0 |
618 | || buf_page_get_io_fix(bpage) != BUF_IO_NONE) { |
619 | return(false); |
620 | } |
621 | |
622 | ut_ad(bpage->in_flush_list); |
623 | |
624 | switch (flush_type) { |
625 | case BUF_FLUSH_LIST: |
626 | case BUF_FLUSH_LRU: |
627 | case BUF_FLUSH_SINGLE_PAGE: |
628 | return(true); |
629 | |
630 | case BUF_FLUSH_N_TYPES: |
631 | break; |
632 | } |
633 | |
634 | ut_error; |
635 | return(false); |
636 | } |
637 | |
638 | /********************************************************************//** |
639 | Remove a block from the flush list of modified blocks. */ |
640 | void |
641 | buf_flush_remove( |
642 | /*=============*/ |
643 | buf_page_t* bpage) /*!< in: pointer to the block in question */ |
644 | { |
645 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
646 | |
647 | #if 0 // FIXME: Rate-limit the output. Move this to the page cleaner? |
648 | if (UNIV_UNLIKELY(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE)) { |
649 | service_manager_extend_timeout( |
650 | INNODB_EXTEND_TIMEOUT_INTERVAL, |
651 | "Flush and remove page with tablespace id %u" |
652 | ", Poolid " ULINTPF ", flush list length " ULINTPF, |
653 | bpage->space, buf_pool->instance_no, |
654 | UT_LIST_GET_LEN(buf_pool->flush_list)); |
655 | } |
656 | #endif |
657 | |
658 | ut_ad(buf_pool_mutex_own(buf_pool)); |
659 | ut_ad(mutex_own(buf_page_get_mutex(bpage))); |
660 | ut_ad(bpage->in_flush_list); |
661 | |
662 | buf_flush_list_mutex_enter(buf_pool); |
663 | |
	/* It is important that we adjust the hazard pointer before
	removing the bpage from the flush list. */
666 | buf_pool->flush_hp.adjust(bpage); |
667 | |
668 | switch (buf_page_get_state(bpage)) { |
669 | case BUF_BLOCK_POOL_WATCH: |
670 | case BUF_BLOCK_ZIP_PAGE: |
671 | /* Clean compressed pages should not be on the flush list */ |
672 | case BUF_BLOCK_NOT_USED: |
673 | case BUF_BLOCK_READY_FOR_USE: |
674 | case BUF_BLOCK_MEMORY: |
675 | case BUF_BLOCK_REMOVE_HASH: |
676 | ut_error; |
677 | return; |
678 | case BUF_BLOCK_ZIP_DIRTY: |
679 | buf_page_set_state(bpage, BUF_BLOCK_ZIP_PAGE); |
680 | UT_LIST_REMOVE(buf_pool->flush_list, bpage); |
681 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
682 | buf_LRU_insert_zip_clean(bpage); |
683 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
684 | break; |
685 | case BUF_BLOCK_FILE_PAGE: |
686 | UT_LIST_REMOVE(buf_pool->flush_list, bpage); |
687 | break; |
688 | } |
689 | |
690 | /* If the flush_rbt is active then delete from there as well. */ |
691 | if (buf_pool->flush_rbt != NULL) { |
692 | buf_flush_delete_from_flush_rbt(bpage); |
693 | } |
694 | |
695 | /* Must be done after we have removed it from the flush_rbt |
696 | because we assert on in_flush_list in comparison function. */ |
697 | ut_d(bpage->in_flush_list = FALSE); |
698 | |
699 | buf_pool->stat.flush_list_bytes -= bpage->size.physical(); |
700 | |
701 | bpage->oldest_modification = 0; |
702 | |
703 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
704 | ut_a(buf_flush_validate_skip(buf_pool)); |
705 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
706 | |
	/* If there is an observer that wants to know when the asynchronous
	flushing is done, notify it. */
709 | if (bpage->flush_observer != NULL) { |
710 | bpage->flush_observer->notify_remove(buf_pool, bpage); |
711 | |
712 | bpage->flush_observer = NULL; |
713 | } |
714 | |
715 | buf_flush_list_mutex_exit(buf_pool); |
716 | } |
717 | |
718 | /*******************************************************************//** |
719 | Relocates a buffer control block on the flush_list. |
720 | Note that it is assumed that the contents of bpage have already been |
721 | copied to dpage. |
IMPORTANT: When this function is called, bpage and dpage are not
exact copies of each other. For example, they will have different
::state. Also, the ::list pointers in dpage may be stale. We need to
725 | use the current list node (bpage) to do the list manipulation because |
726 | the list pointers could have changed between the time that we copied |
727 | the contents of bpage to the dpage and the flush list manipulation |
728 | below. */ |
729 | void |
730 | buf_flush_relocate_on_flush_list( |
731 | /*=============================*/ |
732 | buf_page_t* bpage, /*!< in/out: control block being moved */ |
733 | buf_page_t* dpage) /*!< in/out: destination block */ |
734 | { |
735 | buf_page_t* prev; |
736 | buf_page_t* prev_b = NULL; |
737 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
738 | |
739 | ut_ad(buf_pool_mutex_own(buf_pool)); |
740 | /* Must reside in the same buffer pool. */ |
741 | ut_ad(buf_pool == buf_pool_from_bpage(dpage)); |
742 | |
743 | ut_ad(mutex_own(buf_page_get_mutex(bpage))); |
744 | |
745 | buf_flush_list_mutex_enter(buf_pool); |
746 | |
747 | /* FIXME: At this point we have both buf_pool and flush_list |
748 | mutexes. Theoretically removal of a block from flush list is |
749 | only covered by flush_list mutex but currently we do |
750 | have buf_pool mutex in buf_flush_remove() therefore this block |
751 | is guaranteed to be in the flush list. We need to check if |
752 | this will work without the assumption of block removing code |
753 | having the buf_pool mutex. */ |
754 | ut_ad(bpage->in_flush_list); |
755 | ut_ad(dpage->in_flush_list); |
756 | |
757 | /* If recovery is active we must swap the control blocks in |
758 | the flush_rbt as well. */ |
759 | if (buf_pool->flush_rbt != NULL) { |
760 | buf_flush_delete_from_flush_rbt(bpage); |
761 | prev_b = buf_flush_insert_in_flush_rbt(dpage); |
762 | } |
763 | |
764 | /* Important that we adjust the hazard pointer before removing |
765 | the bpage from the flush list. */ |
766 | buf_pool->flush_hp.adjust(bpage); |
767 | |
768 | /* Must be done after we have removed it from the flush_rbt |
769 | because we assert on in_flush_list in comparison function. */ |
770 | ut_d(bpage->in_flush_list = FALSE); |
771 | |
772 | prev = UT_LIST_GET_PREV(list, bpage); |
773 | UT_LIST_REMOVE(buf_pool->flush_list, bpage); |
774 | |
775 | if (prev) { |
776 | ut_ad(prev->in_flush_list); |
		UT_LIST_INSERT_AFTER(buf_pool->flush_list, prev, dpage);
778 | } else { |
779 | UT_LIST_ADD_FIRST(buf_pool->flush_list, dpage); |
780 | } |
781 | |
782 | /* Just an extra check. Previous in flush_list |
783 | should be the same control block as in flush_rbt. */ |
784 | ut_a(buf_pool->flush_rbt == NULL || prev_b == prev); |
785 | |
786 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
787 | ut_a(buf_flush_validate_low(buf_pool)); |
788 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
789 | |
790 | buf_flush_list_mutex_exit(buf_pool); |
791 | } |
792 | |
793 | /** Update the flush system data structures when a write is completed. |
794 | @param[in,out] bpage flushed page |
795 | @param[in] dblwr whether the doublewrite buffer was used */ |
796 | void buf_flush_write_complete(buf_page_t* bpage, bool dblwr) |
797 | { |
798 | buf_flush_t flush_type; |
799 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
800 | |
801 | ut_ad(bpage); |
802 | |
803 | buf_flush_remove(bpage); |
804 | |
805 | flush_type = buf_page_get_flush_type(bpage); |
806 | buf_pool->n_flush[flush_type]--; |
807 | ut_ad(buf_pool->n_flush[flush_type] != ULINT_MAX); |
808 | |
809 | ut_ad(buf_pool_mutex_own(buf_pool)); |
810 | |
811 | if (buf_pool->n_flush[flush_type] == 0 |
812 | && buf_pool->init_flush[flush_type] == FALSE) { |
813 | |
814 | /* The running flush batch has ended */ |
815 | |
816 | os_event_set(buf_pool->no_flush[flush_type]); |
817 | } |
818 | |
819 | if (dblwr) { |
820 | buf_dblwr_update(bpage, flush_type); |
821 | } |
822 | } |
823 | |
/** Calculate the checksum of a page from a compressed table and update
825 | the page. |
826 | @param[in,out] page page to update |
827 | @param[in] size compressed page size |
828 | @param[in] lsn LSN to stamp on the page */ |
829 | void |
830 | buf_flush_update_zip_checksum( |
831 | buf_frame_t* page, |
832 | ulint size, |
833 | lsn_t lsn) |
834 | { |
835 | ut_a(size > 0); |
836 | |
837 | const uint32_t checksum = page_zip_calc_checksum( |
838 | page, size, |
839 | static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm)); |
840 | |
841 | mach_write_to_8(page + FIL_PAGE_LSN, lsn); |
842 | mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, checksum); |
843 | } |
844 | |
845 | /** Initialize a page for writing to the tablespace. |
846 | @param[in] block buffer block; NULL if bypassing the buffer pool |
847 | @param[in,out] page page frame |
848 | @param[in,out] page_zip_ compressed page, or NULL if uncompressed |
849 | @param[in] newest_lsn newest modification LSN to the page */ |
850 | void |
851 | buf_flush_init_for_writing( |
852 | const buf_block_t* block, |
853 | byte* page, |
854 | void* page_zip_, |
855 | lsn_t newest_lsn) |
856 | { |
857 | ut_ad(block == NULL || block->frame == page); |
858 | ut_ad(block == NULL || page_zip_ == NULL |
859 | || &block->page.zip == page_zip_); |
860 | ut_ad(page); |
861 | |
862 | if (page_zip_) { |
863 | page_zip_des_t* page_zip; |
864 | ulint size; |
865 | |
866 | page_zip = static_cast<page_zip_des_t*>(page_zip_); |
867 | size = page_zip_get_size(page_zip); |
868 | |
869 | ut_ad(size); |
870 | ut_ad(ut_is_2pow(size)); |
871 | ut_ad(size <= UNIV_ZIP_SIZE_MAX); |
872 | |
873 | switch (fil_page_get_type(page)) { |
874 | case FIL_PAGE_TYPE_ALLOCATED: |
875 | case FIL_PAGE_INODE: |
876 | case FIL_PAGE_IBUF_BITMAP: |
877 | case FIL_PAGE_TYPE_FSP_HDR: |
878 | case FIL_PAGE_TYPE_XDES: |
879 | /* These are essentially uncompressed pages. */ |
880 | memcpy(page_zip->data, page, size); |
881 | /* fall through */ |
882 | case FIL_PAGE_TYPE_ZBLOB: |
883 | case FIL_PAGE_TYPE_ZBLOB2: |
884 | case FIL_PAGE_INDEX: |
885 | case FIL_PAGE_RTREE: |
886 | |
887 | buf_flush_update_zip_checksum( |
888 | page_zip->data, size, newest_lsn); |
889 | |
890 | return; |
891 | } |
892 | |
	ib::error() << "The compressed page to be written"
		" seems corrupt:";
895 | ut_print_buf(stderr, page, size); |
	fputs("\nInnoDB: Possibly older version of the page:", stderr);
897 | ut_print_buf(stderr, page_zip->data, size); |
898 | putc('\n', stderr); |
899 | ut_error; |
900 | } |
901 | |
902 | /* Write the newest modification lsn to the page header and trailer */ |
903 | mach_write_to_8(page + FIL_PAGE_LSN, newest_lsn); |
904 | |
905 | mach_write_to_8(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, |
906 | newest_lsn); |
907 | |
908 | if (block && srv_page_size == 16384) { |
909 | /* The page type could be garbage in old files |
910 | created before MySQL 5.5. Such files always |
911 | had a page size of 16 kilobytes. */ |
912 | ulint page_type = fil_page_get_type(page); |
913 | ulint reset_type = page_type; |
914 | |
915 | switch (block->page.id.page_no() % 16384) { |
916 | case 0: |
917 | reset_type = block->page.id.page_no() == 0 |
918 | ? FIL_PAGE_TYPE_FSP_HDR |
919 | : FIL_PAGE_TYPE_XDES; |
920 | break; |
921 | case 1: |
922 | reset_type = FIL_PAGE_IBUF_BITMAP; |
923 | break; |
924 | case FSP_TRX_SYS_PAGE_NO: |
925 | if (block->page.id.page_no() |
926 | == TRX_SYS_PAGE_NO |
927 | && block->page.id.space() |
928 | == TRX_SYS_SPACE) { |
929 | reset_type = FIL_PAGE_TYPE_TRX_SYS; |
930 | break; |
931 | } |
932 | /* fall through */ |
933 | default: |
934 | switch (page_type) { |
935 | case FIL_PAGE_INDEX: |
936 | case FIL_PAGE_TYPE_INSTANT: |
937 | case FIL_PAGE_RTREE: |
938 | case FIL_PAGE_UNDO_LOG: |
939 | case FIL_PAGE_INODE: |
940 | case FIL_PAGE_IBUF_FREE_LIST: |
941 | case FIL_PAGE_TYPE_ALLOCATED: |
942 | case FIL_PAGE_TYPE_SYS: |
943 | case FIL_PAGE_TYPE_TRX_SYS: |
944 | case FIL_PAGE_TYPE_BLOB: |
945 | case FIL_PAGE_TYPE_ZBLOB: |
946 | case FIL_PAGE_TYPE_ZBLOB2: |
947 | break; |
948 | case FIL_PAGE_TYPE_FSP_HDR: |
949 | case FIL_PAGE_TYPE_XDES: |
950 | case FIL_PAGE_IBUF_BITMAP: |
951 | /* These pages should have |
952 | predetermined page numbers |
953 | (see above). */ |
954 | default: |
955 | reset_type = FIL_PAGE_TYPE_UNKNOWN; |
956 | break; |
957 | } |
958 | } |
959 | |
960 | if (UNIV_UNLIKELY(page_type != reset_type)) { |
961 | ib::info() |
962 | << "Resetting invalid page " |
963 | << block->page.id << " type " |
964 | << page_type << " to " |
				<< reset_type << " when flushing.";
966 | fil_page_set_type(page, reset_type); |
967 | } |
968 | } |
969 | |
970 | uint32_t checksum= 0; |
971 | |
972 | switch (srv_checksum_algorithm_t(srv_checksum_algorithm)) { |
973 | case SRV_CHECKSUM_ALGORITHM_INNODB: |
974 | case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB: |
975 | checksum = buf_calc_page_new_checksum(page); |
976 | mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, |
977 | checksum); |
978 | /* With the InnoDB checksum, we overwrite the first 4 bytes of |
979 | the end lsn field to store the old formula checksum. Since it |
980 | depends also on the field FIL_PAGE_SPACE_OR_CHKSUM, it has to |
981 | be calculated after storing the new formula checksum. */ |
982 | checksum = buf_calc_page_old_checksum(page); |
983 | break; |
984 | case SRV_CHECKSUM_ALGORITHM_CRC32: |
985 | case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32: |
986 | /* In other cases we write the same checksum to both fields. */ |
987 | checksum = buf_calc_page_crc32(page); |
988 | mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, |
989 | checksum); |
990 | break; |
991 | case SRV_CHECKSUM_ALGORITHM_NONE: |
992 | case SRV_CHECKSUM_ALGORITHM_STRICT_NONE: |
993 | checksum = BUF_NO_CHECKSUM_MAGIC; |
994 | mach_write_to_4(page + FIL_PAGE_SPACE_OR_CHKSUM, |
995 | checksum); |
996 | break; |
997 | /* no default so the compiler will emit a warning if |
998 | new enum is added and not handled here */ |
999 | } |
1000 | |
1001 | mach_write_to_4(page + srv_page_size - FIL_PAGE_END_LSN_OLD_CHKSUM, |
1002 | checksum); |
1003 | } |
1004 | |
1005 | /********************************************************************//** |
1006 | Does an asynchronous write of a buffer page. NOTE: in simulated aio and |
1007 | also when the doublewrite buffer is used, we must call |
1008 | buf_dblwr_flush_buffered_writes after we have posted a batch of |
1009 | writes! */ |
1010 | static |
1011 | void |
1012 | buf_flush_write_block_low( |
1013 | /*======================*/ |
1014 | buf_page_t* bpage, /*!< in: buffer block to write */ |
1015 | buf_flush_t flush_type, /*!< in: type of flush */ |
1016 | bool sync) /*!< in: true if sync IO request */ |
1017 | { |
1018 | fil_space_t* space = fil_space_acquire_for_io(bpage->id.space()); |
1019 | if (!space) { |
1020 | return; |
1021 | } |
1022 | ut_ad(space->purpose == FIL_TYPE_TEMPORARY |
1023 | || space->purpose == FIL_TYPE_IMPORT |
1024 | || space->purpose == FIL_TYPE_TABLESPACE); |
1025 | ut_ad((space->purpose == FIL_TYPE_TEMPORARY) |
1026 | == (space == fil_system.temp_space)); |
1027 | page_t* frame = NULL; |
1028 | #ifdef UNIV_DEBUG |
1029 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
1030 | ut_ad(!buf_pool_mutex_own(buf_pool)); |
1031 | #endif /* UNIV_DEBUG */ |
1032 | |
	DBUG_PRINT("ib_buf", ("flush %s %u page %u:%u",
			      sync ? "sync" : "async", (unsigned) flush_type,
1035 | bpage->id.space(), bpage->id.page_no())); |
1036 | |
1037 | ut_ad(buf_page_in_file(bpage)); |
1038 | |
1039 | /* We are not holding buf_pool->mutex or block_mutex here. |
1040 | Nevertheless, it is safe to access bpage, because it is |
1041 | io_fixed and oldest_modification != 0. Thus, it cannot be |
1042 | relocated in the buffer pool or removed from flush_list or |
1043 | LRU_list. */ |
1044 | ut_ad(!buf_pool_mutex_own(buf_pool)); |
1045 | ut_ad(!buf_flush_list_mutex_own(buf_pool)); |
1046 | ut_ad(!buf_page_get_mutex(bpage)->is_owned()); |
1047 | ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE); |
1048 | ut_ad(bpage->oldest_modification != 0); |
1049 | |
1050 | #ifdef UNIV_IBUF_COUNT_DEBUG |
1051 | ut_a(ibuf_count_get(bpage->id) == 0); |
1052 | #endif /* UNIV_IBUF_COUNT_DEBUG */ |
1053 | |
1054 | ut_ad(bpage->newest_modification != 0); |
1055 | |
1056 | /* Force the log to the disk before writing the modified block */ |
1057 | if (!srv_read_only_mode) { |
1058 | log_write_up_to(bpage->newest_modification, true); |
1059 | } |
1060 | |
1061 | switch (buf_page_get_state(bpage)) { |
1062 | case BUF_BLOCK_POOL_WATCH: |
1063 | case BUF_BLOCK_ZIP_PAGE: /* The page should be dirty. */ |
1064 | case BUF_BLOCK_NOT_USED: |
1065 | case BUF_BLOCK_READY_FOR_USE: |
1066 | case BUF_BLOCK_MEMORY: |
1067 | case BUF_BLOCK_REMOVE_HASH: |
1068 | ut_error; |
1069 | break; |
1070 | case BUF_BLOCK_ZIP_DIRTY: |
1071 | frame = bpage->zip.data; |
1072 | |
1073 | mach_write_to_8(frame + FIL_PAGE_LSN, |
1074 | bpage->newest_modification); |
1075 | |
1076 | ut_a(page_zip_verify_checksum(frame, bpage->size.physical())); |
1077 | break; |
1078 | case BUF_BLOCK_FILE_PAGE: |
1079 | frame = bpage->zip.data; |
1080 | if (!frame) { |
1081 | frame = ((buf_block_t*) bpage)->frame; |
1082 | } |
1083 | |
1084 | buf_flush_init_for_writing( |
1085 | reinterpret_cast<const buf_block_t*>(bpage), |
1086 | reinterpret_cast<const buf_block_t*>(bpage)->frame, |
1087 | bpage->zip.data ? &bpage->zip : NULL, |
1088 | bpage->newest_modification); |
1089 | break; |
1090 | } |
1091 | |
1092 | frame = buf_page_encrypt_before_write(space, bpage, frame); |
1093 | |
1094 | ut_ad(space->purpose == FIL_TYPE_TABLESPACE |
1095 | || space->atomic_write_supported); |
1096 | if (!space->use_doublewrite()) { |
1097 | ulint type = IORequest::WRITE | IORequest::DO_NOT_WAKE; |
1098 | |
1099 | IORequest request(type, bpage); |
1100 | |
1101 | /* TODO: pass the tablespace to fil_io() */ |
1102 | fil_io(request, |
1103 | sync, bpage->id, bpage->size, 0, bpage->size.physical(), |
1104 | frame, bpage); |
1105 | } else { |
1106 | ut_ad(!srv_read_only_mode); |
1107 | |
1108 | if (flush_type == BUF_FLUSH_SINGLE_PAGE) { |
1109 | buf_dblwr_write_single_page(bpage, sync); |
1110 | } else { |
1111 | ut_ad(!sync); |
1112 | buf_dblwr_add_to_batch(bpage); |
1113 | } |
1114 | } |
1115 | |
1116 | /* When doing single page flushing the IO is done synchronously |
1117 | and we flush the changes to disk only for the tablespace we |
1118 | are working on. */ |
1119 | if (sync) { |
1120 | ut_ad(flush_type == BUF_FLUSH_SINGLE_PAGE); |
1121 | if (space->purpose != FIL_TYPE_TEMPORARY) { |
1122 | fil_flush(space); |
1123 | } |
1124 | |
1125 | /* The tablespace could already have been dropped, |
1126 | because fil_io(request, sync) would already have |
1127 | decremented the node->n_pending. However, |
1128 | buf_page_io_complete() only needs to look up the |
1129 | tablespace during read requests, not during writes. */ |
1130 | ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_WRITE); |
1131 | #ifdef UNIV_DEBUG |
1132 | dberr_t err = |
1133 | #endif |
1134 | /* true means we want to evict this page from the |
1135 | LRU list as well. */ |
1136 | buf_page_io_complete(bpage, space->use_doublewrite(), true); |
1137 | |
1138 | ut_ad(err == DB_SUCCESS); |
1139 | } |
1140 | |
1141 | space->release_for_io(); |
1142 | |
1143 | /* Increment the counter of I/O operations used |
1144 | for selecting LRU policy. */ |
1145 | buf_LRU_stat_inc_io(); |
1146 | } |
1147 | |
1148 | /********************************************************************//** |
1149 | Writes a flushable page asynchronously from the buffer pool to a file. |
1150 | NOTE: in simulated aio we must call |
1151 | os_aio_simulated_wake_handler_threads after we have posted a batch of |
1152 | writes! NOTE: buf_pool->mutex and buf_page_get_mutex(bpage) must be |
1153 | held upon entering this function, and they will be released by this |
1154 | function if it returns true. |
1155 | @return TRUE if the page was flushed */ |
1156 | ibool |
1157 | buf_flush_page( |
1158 | /*===========*/ |
1159 | buf_pool_t* buf_pool, /*!< in: buffer pool instance */ |
1160 | buf_page_t* bpage, /*!< in: buffer control block */ |
1161 | buf_flush_t flush_type, /*!< in: type of flush */ |
1162 | bool sync) /*!< in: true if sync IO request */ |
1163 | { |
1164 | BPageMutex* block_mutex; |
1165 | |
1166 | ut_ad(flush_type < BUF_FLUSH_N_TYPES); |
1167 | ut_ad(buf_pool_mutex_own(buf_pool)); |
1168 | ut_ad(buf_page_in_file(bpage)); |
1169 | ut_ad(!sync || flush_type == BUF_FLUSH_SINGLE_PAGE); |
1170 | |
1171 | block_mutex = buf_page_get_mutex(bpage); |
1172 | ut_ad(mutex_own(block_mutex)); |
1173 | |
1174 | ut_ad(buf_flush_ready_for_flush(bpage, flush_type)); |
1175 | |
1176 | bool is_uncompressed; |
1177 | |
1178 | is_uncompressed = (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE); |
1179 | ut_ad(is_uncompressed == (block_mutex != &buf_pool->zip_mutex)); |
1180 | |
1181 | ibool flush; |
1182 | rw_lock_t* rw_lock; |
1183 | bool no_fix_count = bpage->buf_fix_count == 0; |
1184 | |
1185 | if (!is_uncompressed) { |
1186 | flush = TRUE; |
1187 | rw_lock = NULL; |
1188 | } else if (!(no_fix_count || flush_type == BUF_FLUSH_LIST) |
1189 | || (!no_fix_count |
1190 | && srv_shutdown_state <= SRV_SHUTDOWN_CLEANUP |
1191 | && fsp_is_system_temporary(bpage->id.space()))) { |
1192 | /* This is a heuristic, to avoid expensive SX attempts. */ |
		/* For a table residing in the temporary tablespace, sync is
		done using IO_FIX, so before scheduling a flush ensure that
		the page is not fixed. */
1196 | flush = FALSE; |
1197 | } else { |
1198 | rw_lock = &reinterpret_cast<buf_block_t*>(bpage)->lock; |
1199 | if (flush_type != BUF_FLUSH_LIST) { |
1200 | flush = rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE); |
1201 | } else { |
1202 | /* Will SX lock later */ |
1203 | flush = TRUE; |
1204 | } |
1205 | } |
1206 | |
1207 | if (flush) { |
1208 | |
1209 | /* We are committed to flushing by the time we get here */ |
1210 | |
1211 | buf_page_set_io_fix(bpage, BUF_IO_WRITE); |
1212 | |
1213 | buf_page_set_flush_type(bpage, flush_type); |
1214 | |
1215 | if (buf_pool->n_flush[flush_type] == 0) { |
1216 | os_event_reset(buf_pool->no_flush[flush_type]); |
1217 | } |
1218 | |
1219 | ++buf_pool->n_flush[flush_type]; |
1220 | ut_ad(buf_pool->n_flush[flush_type] != 0); |
1221 | |
1222 | mutex_exit(block_mutex); |
1223 | |
1224 | buf_pool_mutex_exit(buf_pool); |
1225 | |
1226 | if (flush_type == BUF_FLUSH_LIST |
1227 | && is_uncompressed |
1228 | && !rw_lock_sx_lock_nowait(rw_lock, BUF_IO_WRITE)) { |
1229 | |
1230 | if (!fsp_is_system_temporary(bpage->id.space())) { |
				/* To avoid a possible deadlock involving the
				doublewrite buffer, flush it here, because it
				might be holding another block->lock. */
1234 | buf_dblwr_flush_buffered_writes(); |
1235 | } else { |
1236 | buf_dblwr_sync_datafiles(); |
1237 | } |
1238 | |
1239 | rw_lock_sx_lock_gen(rw_lock, BUF_IO_WRITE); |
1240 | } |
1241 | |
		/* If there is an observer that wants to know when the
		asynchronous flushing has been dispatched, notify it.
		Note: we set the flush observer on a page with the x-latch
		held, so we can guarantee that notify_flush and notify_remove
		are called in pairs with the s-latch held on an uncompressed
		page. */
1247 | if (bpage->flush_observer != NULL) { |
1248 | buf_pool_mutex_enter(buf_pool); |
1249 | |
1250 | bpage->flush_observer->notify_flush(buf_pool, bpage); |
1251 | |
1252 | buf_pool_mutex_exit(buf_pool); |
1253 | } |
1254 | |
1255 | /* Even though bpage is not protected by any mutex at this |
1256 | point, it is safe to access bpage, because it is io_fixed and |
1257 | oldest_modification != 0. Thus, it cannot be relocated in the |
1258 | buffer pool or removed from flush_list or LRU_list. */ |
1259 | |
1260 | buf_flush_write_block_low(bpage, flush_type, sync); |
1261 | } |
1262 | |
1263 | return(flush); |
1264 | } |
1265 | |
1266 | # if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG |
1267 | /********************************************************************//** |
1268 | Writes a flushable page asynchronously from the buffer pool to a file. |
1269 | NOTE: buf_pool->mutex and block->mutex must be held upon entering this |
1270 | function, and they will be released by this function after flushing. |
1271 | This is loosely based on buf_flush_batch() and buf_flush_page(). |
1272 | @return TRUE if the page was flushed and the mutexes released */ |
1273 | ibool |
1274 | buf_flush_page_try( |
1275 | /*===============*/ |
1276 | buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ |
1277 | buf_block_t* block) /*!< in/out: buffer control block */ |
1278 | { |
1279 | ut_ad(buf_pool_mutex_own(buf_pool)); |
1280 | ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); |
1281 | ut_ad(buf_page_mutex_own(block)); |
1282 | |
1283 | if (!buf_flush_ready_for_flush(&block->page, BUF_FLUSH_SINGLE_PAGE)) { |
1284 | return(FALSE); |
1285 | } |
1286 | |
1287 | /* The following call will release the buffer pool and |
1288 | block mutex. */ |
1289 | return(buf_flush_page( |
1290 | buf_pool, &block->page, |
1291 | BUF_FLUSH_SINGLE_PAGE, true)); |
1292 | } |
1293 | # endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */ |
1294 | |
/** Check if the page is in the buffer pool and can be flushed.
1296 | @param[in] page_id page id |
1297 | @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST |
1298 | @return true if the page can be flushed. */ |
1299 | static |
1300 | bool |
1301 | buf_flush_check_neighbor( |
1302 | const page_id_t& page_id, |
1303 | buf_flush_t flush_type) |
1304 | { |
1305 | buf_page_t* bpage; |
1306 | buf_pool_t* buf_pool = buf_pool_get(page_id); |
1307 | bool ret; |
1308 | |
1309 | ut_ad(flush_type == BUF_FLUSH_LRU |
1310 | || flush_type == BUF_FLUSH_LIST); |
1311 | |
1312 | buf_pool_mutex_enter(buf_pool); |
1313 | |
1314 | /* We only want to flush pages from this buffer pool. */ |
1315 | bpage = buf_page_hash_get(buf_pool, page_id); |
1316 | |
1317 | if (!bpage) { |
1318 | |
1319 | buf_pool_mutex_exit(buf_pool); |
1320 | return(false); |
1321 | } |
1322 | |
1323 | ut_a(buf_page_in_file(bpage)); |
1324 | |
1325 | /* We avoid flushing 'non-old' blocks in an LRU flush, |
1326 | because the flushed blocks are soon freed */ |
1327 | |
1328 | ret = false; |
1329 | if (flush_type != BUF_FLUSH_LRU || buf_page_is_old(bpage)) { |
1330 | BPageMutex* block_mutex = buf_page_get_mutex(bpage); |
1331 | |
1332 | mutex_enter(block_mutex); |
1333 | if (buf_flush_ready_for_flush(bpage, flush_type)) { |
1334 | ret = true; |
1335 | } |
1336 | mutex_exit(block_mutex); |
1337 | } |
1338 | buf_pool_mutex_exit(buf_pool); |
1339 | |
1340 | return(ret); |
1341 | } |
1342 | |
1343 | /** Flushes to disk all flushable pages within the flush area. |
1344 | @param[in] page_id page id |
1345 | @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST |
1346 | @param[in] n_flushed number of pages flushed so far in this batch |
1347 | @param[in] n_to_flush maximum number of pages we are allowed to flush |
1348 | @return number of pages flushed */ |
1349 | static |
1350 | ulint |
1351 | buf_flush_try_neighbors( |
1352 | const page_id_t& page_id, |
1353 | buf_flush_t flush_type, |
1354 | ulint n_flushed, |
1355 | ulint n_to_flush) |
1356 | { |
1357 | ulint i; |
1358 | ulint low; |
1359 | ulint high; |
1360 | ulint count = 0; |
1361 | buf_pool_t* buf_pool = buf_pool_get(page_id); |
1362 | |
1363 | ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); |
1364 | |
1365 | if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN |
1366 | || srv_flush_neighbors == 0) { |
1367 | /* If there is little space or neighbor flushing is |
1368 | not enabled then just flush the victim. */ |
1369 | low = page_id.page_no(); |
1370 | high = page_id.page_no() + 1; |
1371 | } else { |
1372 | /* When flushed, dirty blocks are searched in |
1373 | neighborhoods of this size, and flushed along with the |
1374 | original page. */ |
1375 | |
1376 | ulint buf_flush_area; |
1377 | |
1378 | buf_flush_area = ut_min( |
1379 | BUF_READ_AHEAD_AREA(buf_pool), |
1380 | buf_pool->curr_size / 16); |
1381 | |
1382 | low = (page_id.page_no() / buf_flush_area) * buf_flush_area; |
1383 | high = (page_id.page_no() / buf_flush_area + 1) * buf_flush_area; |
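		/* For example (illustrative numbers only): with
		buf_flush_area == 64 and page_no == 130, the computation
		above yields low == 128 and high == 192. */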
1384 | |
1385 | if (srv_flush_neighbors == 1) { |
			/* adjust 'low' and 'high' to limit the range
			   to the contiguous dirty area */
1388 | if (page_id.page_no() > low) { |
1389 | for (i = page_id.page_no() - 1; i >= low; i--) { |
1390 | if (!buf_flush_check_neighbor( |
1391 | page_id_t(page_id.space(), i), |
1392 | flush_type)) { |
1393 | |
1394 | break; |
1395 | } |
1396 | |
1397 | if (i == low) { |
					/* Avoid wrap-around when low == 0,
					which would otherwise call
					buf_flush_check_neighbor() with
					i == (ulint) -1 */
1402 | i--; |
1403 | break; |
1404 | } |
1405 | } |
1406 | low = i + 1; |
1407 | } |
1408 | |
1409 | for (i = page_id.page_no() + 1; |
1410 | i < high |
1411 | && buf_flush_check_neighbor( |
1412 | page_id_t(page_id.space(), i), |
1413 | flush_type); |
1414 | i++) { |
1415 | /* do nothing */ |
1416 | } |
1417 | high = i; |
1418 | } |
1419 | } |
1420 | |
1421 | const ulint space_size = fil_space_get_size(page_id.space()); |
1422 | if (high > space_size) { |
1423 | high = space_size; |
1424 | } |
1425 | |
	DBUG_PRINT("ib_buf", ("flush %u:%u..%u",
1427 | page_id.space(), |
1428 | (unsigned) low, (unsigned) high)); |
1429 | |
1430 | for (ulint i = low; i < high; i++) { |
1431 | buf_page_t* bpage; |
1432 | |
1433 | if ((count + n_flushed) >= n_to_flush) { |
1434 | |
1435 | /* We have already flushed enough pages and |
1436 | should call it a day. There is, however, one |
1437 | exception. If the page whose neighbors we |
1438 | are flushing has not been flushed yet then |
1439 | we'll try to flush the victim that we |
1440 | selected originally. */ |
1441 | if (i <= page_id.page_no()) { |
1442 | i = page_id.page_no(); |
1443 | } else { |
1444 | break; |
1445 | } |
1446 | } |
1447 | |
1448 | const page_id_t cur_page_id(page_id.space(), i); |
1449 | |
1450 | buf_pool = buf_pool_get(cur_page_id); |
1451 | |
1452 | buf_pool_mutex_enter(buf_pool); |
1453 | |
1454 | /* We only want to flush pages from this buffer pool. */ |
1455 | bpage = buf_page_hash_get(buf_pool, cur_page_id); |
1456 | |
1457 | if (bpage == NULL) { |
1458 | |
1459 | buf_pool_mutex_exit(buf_pool); |
1460 | continue; |
1461 | } |
1462 | |
1463 | ut_a(buf_page_in_file(bpage)); |
1464 | |
1465 | /* We avoid flushing 'non-old' blocks in an LRU flush, |
1466 | because the flushed blocks are soon freed */ |
1467 | |
1468 | if (flush_type != BUF_FLUSH_LRU |
1469 | || i == page_id.page_no() |
1470 | || buf_page_is_old(bpage)) { |
1471 | |
1472 | BPageMutex* block_mutex = buf_page_get_mutex(bpage); |
1473 | |
1474 | mutex_enter(block_mutex); |
1475 | |
1476 | if (buf_flush_ready_for_flush(bpage, flush_type) |
1477 | && (i == page_id.page_no() |
1478 | || bpage->buf_fix_count == 0)) { |
1479 | |
1480 | /* We also try to flush those |
1481 | neighbors != offset */ |
1482 | |
1483 | if (buf_flush_page( |
1484 | buf_pool, bpage, flush_type, false)) { |
1485 | |
1486 | ++count; |
1487 | } else { |
1488 | mutex_exit(block_mutex); |
1489 | buf_pool_mutex_exit(buf_pool); |
1490 | } |
1491 | |
1492 | continue; |
1493 | } else { |
1494 | mutex_exit(block_mutex); |
1495 | } |
1496 | } |
1497 | buf_pool_mutex_exit(buf_pool); |
1498 | } |
1499 | |
1500 | if (count > 1) { |
1501 | MONITOR_INC_VALUE_CUMULATIVE( |
1502 | MONITOR_FLUSH_NEIGHBOR_TOTAL_PAGE, |
1503 | MONITOR_FLUSH_NEIGHBOR_COUNT, |
1504 | MONITOR_FLUSH_NEIGHBOR_PAGES, |
1505 | (count - 1)); |
1506 | } |
1507 | |
1508 | return(count); |
1509 | } |
1510 | |
1511 | /** Check if the block is modified and ready for flushing. |
If the block is ready to flush, then flush the page and try to flush
1513 | its neighbors. |
1514 | @param[in] bpage buffer control block, |
1515 | must be buf_page_in_file(bpage) |
1516 | @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST |
1517 | @param[in] n_to_flush number of pages to flush |
1518 | @param[in,out] count number of pages flushed |
1519 | @return TRUE if buf_pool mutex was released during this function. |
This does not guarantee that any pages were written.
The number of pages written is added to the count. */
1522 | static |
1523 | bool |
1524 | buf_flush_page_and_try_neighbors( |
1525 | buf_page_t* bpage, |
1526 | buf_flush_t flush_type, |
1527 | ulint n_to_flush, |
1528 | ulint* count) |
1529 | { |
1530 | #ifdef UNIV_DEBUG |
1531 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
1532 | |
1533 | ut_ad(buf_pool_mutex_own(buf_pool)); |
1534 | #endif /* UNIV_DEBUG */ |
1535 | |
1536 | bool flushed; |
1537 | BPageMutex* block_mutex = buf_page_get_mutex(bpage); |
1538 | |
1539 | mutex_enter(block_mutex); |
1540 | |
1541 | ut_a(buf_page_in_file(bpage)); |
1542 | |
1543 | if (buf_flush_ready_for_flush(bpage, flush_type)) { |
1544 | buf_pool_t* buf_pool; |
1545 | |
1546 | buf_pool = buf_pool_from_bpage(bpage); |
1547 | |
1548 | const page_id_t page_id = bpage->id; |
1549 | |
1550 | mutex_exit(block_mutex); |
1551 | |
1552 | buf_pool_mutex_exit(buf_pool); |
1553 | |
1554 | /* Try to flush also all the neighbors */ |
1555 | *count += buf_flush_try_neighbors( |
1556 | page_id, flush_type, *count, n_to_flush); |
1557 | |
1558 | buf_pool_mutex_enter(buf_pool); |
		flushed = true;
1560 | } else { |
1561 | mutex_exit(block_mutex); |
1562 | |
1563 | flushed = false; |
1564 | } |
1565 | |
1566 | ut_ad(buf_pool_mutex_own(buf_pool)); |
1567 | |
1568 | return(flushed); |
1569 | } |
1570 | |
1571 | /*******************************************************************//** |
1572 | This utility moves the uncompressed frames of pages to the free list. |
1573 | Note that this function does not actually flush any data to disk. It |
1574 | just detaches the uncompressed frames from the compressed pages at the |
1575 | tail of the unzip_LRU and puts those freed frames in the free list. |
1576 | Note that it is a best effort attempt and it is not guaranteed that |
1577 | after a call to this function there will be 'max' blocks in the free |
1578 | list. |
1579 | @return number of blocks moved to the free list. */ |
1580 | static |
1581 | ulint |
1582 | buf_free_from_unzip_LRU_list_batch( |
1583 | /*===============================*/ |
1584 | buf_pool_t* buf_pool, /*!< in: buffer pool instance */ |
1585 | ulint max) /*!< in: desired number of |
1586 | blocks in the free_list */ |
1587 | { |
1588 | ulint scanned = 0; |
1589 | ulint count = 0; |
1590 | ulint free_len = UT_LIST_GET_LEN(buf_pool->free); |
1591 | ulint lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU); |
1592 | |
1593 | ut_ad(buf_pool_mutex_own(buf_pool)); |
1594 | |
1595 | buf_block_t* block = UT_LIST_GET_LAST(buf_pool->unzip_LRU); |
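	/* Stop early once enough blocks have been freed, once the
	free list has grown to srv_LRU_scan_depth, or once the
	unzip_LRU has shrunk to 10% of the LRU length. */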
1596 | |
1597 | while (block != NULL |
1598 | && count < max |
1599 | && free_len < srv_LRU_scan_depth |
1600 | && lru_len > UT_LIST_GET_LEN(buf_pool->LRU) / 10) { |
1601 | |
1602 | ++scanned; |
1603 | if (buf_LRU_free_page(&block->page, false)) { |
1604 | /* Block was freed. buf_pool->mutex potentially |
1605 | released and reacquired */ |
1606 | ++count; |
1607 | block = UT_LIST_GET_LAST(buf_pool->unzip_LRU); |
1608 | |
1609 | } else { |
1610 | |
1611 | block = UT_LIST_GET_PREV(unzip_LRU, block); |
1612 | } |
1613 | |
1614 | free_len = UT_LIST_GET_LEN(buf_pool->free); |
1615 | lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU); |
1616 | } |
1617 | |
1618 | ut_ad(buf_pool_mutex_own(buf_pool)); |
1619 | |
1620 | if (scanned) { |
1621 | MONITOR_INC_VALUE_CUMULATIVE( |
1622 | MONITOR_LRU_BATCH_SCANNED, |
1623 | MONITOR_LRU_BATCH_SCANNED_NUM_CALL, |
1624 | MONITOR_LRU_BATCH_SCANNED_PER_CALL, |
1625 | scanned); |
1626 | } |
1627 | |
1628 | return(count); |
1629 | } |
1630 | |
1631 | /*******************************************************************//** |
1632 | This utility flushes dirty blocks from the end of the LRU list. |
1633 | The calling thread is not allowed to own any latches on pages! |
1634 | It attempts to make 'max' blocks available in the free list. Note that |
1635 | it is a best effort attempt and it is not guaranteed that after a call |
1636 | to this function there will be 'max' blocks in the free list.*/ |
1637 | |
1638 | void |
1639 | buf_flush_LRU_list_batch( |
1640 | /*=====================*/ |
1641 | buf_pool_t* buf_pool, /*!< in: buffer pool instance */ |
1642 | ulint max, /*!< in: desired number of |
1643 | blocks in the free_list */ |
1644 | flush_counters_t* n) /*!< out: flushed/evicted page |
1645 | counts */ |
1646 | { |
1647 | buf_page_t* bpage; |
1648 | ulint scanned = 0; |
1649 | ulint free_len = UT_LIST_GET_LEN(buf_pool->free); |
1650 | ulint lru_len = UT_LIST_GET_LEN(buf_pool->LRU); |
1651 | ulint withdraw_depth = 0; |
1652 | |
1653 | n->flushed = 0; |
1654 | n->evicted = 0; |
1655 | n->unzip_LRU_evicted = 0; |
1656 | ut_ad(buf_pool_mutex_own(buf_pool)); |
1657 | if (buf_pool->curr_size < buf_pool->old_size |
1658 | && buf_pool->withdraw_target > 0) { |
1659 | withdraw_depth = buf_pool->withdraw_target |
1660 | - UT_LIST_GET_LEN(buf_pool->withdraw); |
1661 | } |
1662 | |
1663 | for (bpage = UT_LIST_GET_LAST(buf_pool->LRU); |
1664 | bpage != NULL && n->flushed + n->evicted < max |
1665 | && free_len < srv_LRU_scan_depth + withdraw_depth |
1666 | && lru_len > BUF_LRU_MIN_LEN; |
1667 | ++scanned, |
1668 | bpage = buf_pool->lru_hp.get()) { |
1669 | |
1670 | buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); |
1671 | buf_pool->lru_hp.set(prev); |
1672 | |
1673 | BPageMutex* block_mutex = buf_page_get_mutex(bpage); |
1674 | |
1675 | mutex_enter(block_mutex); |
1676 | |
1677 | if (buf_flush_ready_for_replace(bpage)) { |
1678 | /* block is ready for eviction i.e., it is |
1679 | clean and is not IO-fixed or buffer fixed. */ |
1680 | mutex_exit(block_mutex); |
1681 | if (buf_LRU_free_page(bpage, true)) { |
1682 | ++n->evicted; |
1683 | } |
1684 | } else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_LRU)) { |
1685 | /* Block is ready for flush. Dispatch an IO |
1686 | request. The IO helper thread will put it on |
1687 | free list in IO completion routine. */ |
1688 | mutex_exit(block_mutex); |
1689 | buf_flush_page_and_try_neighbors( |
1690 | bpage, BUF_FLUSH_LRU, max, &n->flushed); |
1691 | } else { |
1692 | /* Can't evict or dispatch this block. Go to |
1693 | previous. */ |
1694 | ut_ad(buf_pool->lru_hp.is_hp(prev)); |
1695 | mutex_exit(block_mutex); |
1696 | } |
1697 | |
1698 | ut_ad(!mutex_own(block_mutex)); |
1699 | ut_ad(buf_pool_mutex_own(buf_pool)); |
1700 | |
1701 | free_len = UT_LIST_GET_LEN(buf_pool->free); |
1702 | lru_len = UT_LIST_GET_LEN(buf_pool->LRU); |
1703 | } |
1704 | |
1705 | buf_pool->lru_hp.set(NULL); |
1706 | |
1707 | /* We keep track of all flushes happening as part of LRU |
1708 | flush. When estimating the desired rate at which flush_list |
1709 | should be flushed, we factor in this value. */ |
1710 | buf_lru_flush_page_count += n->flushed; |
1711 | |
1712 | ut_ad(buf_pool_mutex_own(buf_pool)); |
1713 | |
1714 | if (n->evicted) { |
1715 | MONITOR_INC_VALUE_CUMULATIVE( |
1716 | MONITOR_LRU_BATCH_EVICT_TOTAL_PAGE, |
1717 | MONITOR_LRU_BATCH_EVICT_COUNT, |
1718 | MONITOR_LRU_BATCH_EVICT_PAGES, |
1719 | n->evicted); |
1720 | } |
1721 | |
1722 | if (scanned) { |
1723 | MONITOR_INC_VALUE_CUMULATIVE( |
1724 | MONITOR_LRU_BATCH_SCANNED, |
1725 | MONITOR_LRU_BATCH_SCANNED_NUM_CALL, |
1726 | MONITOR_LRU_BATCH_SCANNED_PER_CALL, |
1727 | scanned); |
1728 | } |
1729 | } |
1730 | |
1731 | /*******************************************************************//** |
1732 | Flush and move pages from LRU or unzip_LRU list to the free list. |
1733 | Whether LRU or unzip_LRU is used depends on the state of the system.*/ |
1734 | |
1735 | static |
1736 | void |
1737 | buf_do_LRU_batch( |
1738 | /*=============*/ |
1739 | buf_pool_t* buf_pool, /*!< in: buffer pool instance */ |
1740 | ulint max, /*!< in: desired number of |
1741 | blocks in the free_list */ |
1742 | flush_counters_t* n) /*!< out: flushed/evicted page |
1743 | counts */ |
1744 | { |
1745 | if (buf_LRU_evict_from_unzip_LRU(buf_pool)) { |
1746 | n->unzip_LRU_evicted = buf_free_from_unzip_LRU_list_batch(buf_pool, max); |
1747 | } else { |
1748 | n->unzip_LRU_evicted = 0; |
1749 | } |
1750 | |
1751 | if (max > n->unzip_LRU_evicted) { |
1752 | buf_flush_LRU_list_batch(buf_pool, max - n->unzip_LRU_evicted, n); |
1753 | } else { |
1754 | n->evicted = 0; |
1755 | n->flushed = 0; |
1756 | } |
1757 | |
1758 | /* Add evicted pages from unzip_LRU to the evicted pages from |
1759 | the simple LRU. */ |
1760 | n->evicted += n->unzip_LRU_evicted; |
1761 | } |
1762 | |
1763 | /** This utility flushes dirty blocks from the end of the flush_list. |
1764 | The calling thread is not allowed to own any latches on pages! |
1765 | @param[in] buf_pool buffer pool instance |
@param[in]	min_n		wished minimum number of blocks flushed (it is
1767 | not guaranteed that the actual number is that big, though) |
1768 | @param[in] lsn_limit all blocks whose oldest_modification is smaller |
1769 | than this should be flushed (if their number does not exceed min_n) |
1770 | @return number of blocks for which the write request was queued; |
1771 | ULINT_UNDEFINED if there was a flush of the same type already |
1772 | running */ |
1773 | static |
1774 | ulint |
1775 | buf_do_flush_list_batch( |
1776 | buf_pool_t* buf_pool, |
1777 | ulint min_n, |
1778 | lsn_t lsn_limit) |
1779 | { |
1780 | ulint count = 0; |
1781 | ulint scanned = 0; |
1782 | |
1783 | ut_ad(buf_pool_mutex_own(buf_pool)); |
1784 | |
1785 | /* Start from the end of the list looking for a suitable |
1786 | block to be flushed. */ |
1787 | buf_flush_list_mutex_enter(buf_pool); |
1788 | ulint len = UT_LIST_GET_LEN(buf_pool->flush_list); |
1789 | |
	/* In order not to degenerate this scan to O(n*n) we attempt
	to preserve the pointer to the previous block in the flush
	list. To do so we declare it a hazard pointer. Any thread
	working on the flush list must check the hazard pointer and
	if it is removing the same block then it must reset it. */
1795 | for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->flush_list); |
1796 | count < min_n && bpage != NULL && len > 0 |
1797 | && bpage->oldest_modification < lsn_limit; |
1798 | bpage = buf_pool->flush_hp.get(), |
1799 | ++scanned) { |
1800 | |
1801 | buf_page_t* prev; |
1802 | |
1803 | ut_a(bpage->oldest_modification > 0); |
1804 | ut_ad(bpage->in_flush_list); |
1805 | |
1806 | prev = UT_LIST_GET_PREV(list, bpage); |
1807 | buf_pool->flush_hp.set(prev); |
1808 | buf_flush_list_mutex_exit(buf_pool); |
1809 | |
1810 | #ifdef UNIV_DEBUG |
1811 | bool flushed = |
1812 | #endif /* UNIV_DEBUG */ |
1813 | buf_flush_page_and_try_neighbors( |
1814 | bpage, BUF_FLUSH_LIST, min_n, &count); |
1815 | |
1816 | buf_flush_list_mutex_enter(buf_pool); |
1817 | |
1818 | ut_ad(flushed || buf_pool->flush_hp.is_hp(prev)); |
1819 | |
1820 | --len; |
1821 | } |
1822 | |
1823 | buf_pool->flush_hp.set(NULL); |
1824 | buf_flush_list_mutex_exit(buf_pool); |
1825 | |
1826 | if (scanned) { |
1827 | MONITOR_INC_VALUE_CUMULATIVE( |
1828 | MONITOR_FLUSH_BATCH_SCANNED, |
1829 | MONITOR_FLUSH_BATCH_SCANNED_NUM_CALL, |
1830 | MONITOR_FLUSH_BATCH_SCANNED_PER_CALL, |
1831 | scanned); |
1832 | } |
1833 | |
1834 | if (count) { |
1835 | MONITOR_INC_VALUE_CUMULATIVE( |
1836 | MONITOR_FLUSH_BATCH_TOTAL_PAGE, |
1837 | MONITOR_FLUSH_BATCH_COUNT, |
1838 | MONITOR_FLUSH_BATCH_PAGES, |
1839 | count); |
1840 | } |
1841 | |
1842 | ut_ad(buf_pool_mutex_own(buf_pool)); |
1843 | |
1844 | return(count); |
1845 | } |
1846 | |
1847 | /** This utility flushes dirty blocks from the end of the LRU list or |
1848 | flush_list. |
1849 | NOTE 1: in the case of an LRU flush the calling thread may own latches to |
1850 | pages: to avoid deadlocks, this function must be written so that it cannot |
1851 | end up waiting for these latches! NOTE 2: in the case of a flush list flush, |
1852 | the calling thread is not allowed to own any latches on pages! |
1853 | @param[in] buf_pool buffer pool instance |
1854 | @param[in] flush_type BUF_FLUSH_LRU or BUF_FLUSH_LIST; if |
1855 | BUF_FLUSH_LIST, then the caller must not own any latches on pages |
@param[in]	min_n		wished minimum number of blocks flushed (it is
1857 | not guaranteed that the actual number is that big, though) |
1858 | @param[in] lsn_limit in the case of BUF_FLUSH_LIST all blocks whose |
1859 | oldest_modification is smaller than this should be flushed (if their number |
1860 | does not exceed min_n), otherwise ignored */ |
1861 | static |
1862 | void |
1863 | buf_flush_batch( |
1864 | buf_pool_t* buf_pool, |
1865 | buf_flush_t flush_type, |
1866 | ulint min_n, |
1867 | lsn_t lsn_limit, |
1868 | flush_counters_t* n) /*!< out: flushed/evicted page |
1869 | counts */ |
1870 | { |
1871 | ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); |
1872 | ut_ad(flush_type == BUF_FLUSH_LRU |
1873 | || !sync_check_iterate(dict_sync_check())); |
1874 | |
1875 | buf_pool_mutex_enter(buf_pool); |
1876 | |
1877 | /* Note: The buffer pool mutex is released and reacquired within |
1878 | the flush functions. */ |
1879 | switch (flush_type) { |
1880 | case BUF_FLUSH_LRU: |
1881 | buf_do_LRU_batch(buf_pool, min_n, n); |
1882 | break; |
1883 | case BUF_FLUSH_LIST: |
1884 | n->flushed = buf_do_flush_list_batch(buf_pool, min_n, lsn_limit); |
1885 | n->evicted = 0; |
1886 | break; |
1887 | default: |
1888 | ut_error; |
1889 | } |
1890 | |
1891 | buf_pool_mutex_exit(buf_pool); |
1892 | |
1893 | DBUG_LOG("ib_buf" , "flush " << flush_type << " completed" ); |
1894 | } |
1895 | |
1896 | /******************************************************************//** |
1897 | Gather the aggregated stats for both flush list and LRU list flushing. |
1898 | @param page_count_flush number of pages flushed from the end of the flush_list |
1899 | @param page_count_LRU number of pages flushed from the end of the LRU list |
1900 | */ |
1901 | static |
1902 | void |
1903 | buf_flush_stats( |
1904 | /*============*/ |
1905 | ulint page_count_flush, |
1906 | ulint page_count_LRU) |
1907 | { |
1908 | DBUG_PRINT("ib_buf" , ("flush completed, from flush_list %u pages, " |
1909 | "from LRU_list %u pages" , |
1910 | unsigned(page_count_flush), |
1911 | unsigned(page_count_LRU))); |
1912 | |
1913 | srv_stats.buf_pool_flushed.add(page_count_flush + page_count_LRU); |
1914 | } |
1915 | |
1916 | /******************************************************************//** |
1917 | Start a buffer flush batch for LRU or flush list */ |
1918 | static |
1919 | ibool |
1920 | buf_flush_start( |
1921 | /*============*/ |
1922 | buf_pool_t* buf_pool, /*!< buffer pool instance */ |
1923 | buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU |
1924 | or BUF_FLUSH_LIST */ |
1925 | { |
1926 | ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST); |
1927 | |
1928 | buf_pool_mutex_enter(buf_pool); |
1929 | |
1930 | if (buf_pool->n_flush[flush_type] > 0 |
1931 | || buf_pool->init_flush[flush_type] == TRUE) { |
1932 | |
1933 | /* There is already a flush batch of the same type running */ |
1934 | |
1935 | buf_pool_mutex_exit(buf_pool); |
1936 | |
1937 | return(FALSE); |
1938 | } |
1939 | |
1940 | buf_pool->init_flush[flush_type] = TRUE; |
1941 | |
1942 | os_event_reset(buf_pool->no_flush[flush_type]); |
1943 | |
1944 | buf_pool_mutex_exit(buf_pool); |
1945 | |
1946 | return(TRUE); |
1947 | } |
1948 | |
1949 | /******************************************************************//** |
1950 | End a buffer flush batch for LRU or flush list */ |
1951 | static |
1952 | void |
1953 | buf_flush_end( |
1954 | /*==========*/ |
1955 | buf_pool_t* buf_pool, /*!< buffer pool instance */ |
1956 | buf_flush_t flush_type) /*!< in: BUF_FLUSH_LRU |
1957 | or BUF_FLUSH_LIST */ |
1958 | { |
1959 | buf_pool_mutex_enter(buf_pool); |
1960 | |
1961 | buf_pool->init_flush[flush_type] = FALSE; |
1962 | |
1963 | buf_pool->try_LRU_scan = TRUE; |
1964 | |
1965 | if (buf_pool->n_flush[flush_type] == 0) { |
1966 | |
1967 | /* The running flush batch has ended */ |
1968 | |
1969 | os_event_set(buf_pool->no_flush[flush_type]); |
1970 | } |
1971 | |
1972 | buf_pool_mutex_exit(buf_pool); |
1973 | |
1974 | if (!srv_read_only_mode) { |
1975 | buf_dblwr_flush_buffered_writes(); |
1976 | } else { |
1977 | os_aio_simulated_wake_handler_threads(); |
1978 | } |
1979 | } |
1980 | |
1981 | /******************************************************************//** |
1982 | Waits until a flush batch of the given type ends */ |
1983 | void |
1984 | buf_flush_wait_batch_end( |
1985 | /*=====================*/ |
1986 | buf_pool_t* buf_pool, /*!< buffer pool instance */ |
1987 | buf_flush_t type) /*!< in: BUF_FLUSH_LRU |
1988 | or BUF_FLUSH_LIST */ |
1989 | { |
1990 | ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST); |
1991 | |
1992 | if (buf_pool == NULL) { |
1993 | ulint i; |
1994 | |
1995 | for (i = 0; i < srv_buf_pool_instances; ++i) { |
1996 | buf_pool_t* buf_pool; |
1997 | |
1998 | buf_pool = buf_pool_from_array(i); |
1999 | |
2000 | thd_wait_begin(NULL, THD_WAIT_DISKIO); |
2001 | os_event_wait(buf_pool->no_flush[type]); |
2002 | thd_wait_end(NULL); |
2003 | } |
2004 | } else { |
2005 | thd_wait_begin(NULL, THD_WAIT_DISKIO); |
2006 | os_event_wait(buf_pool->no_flush[type]); |
2007 | thd_wait_end(NULL); |
2008 | } |
2009 | } |
2010 | |
2011 | /** Do flushing batch of a given type. |
2012 | NOTE: The calling thread is not allowed to own any latches on pages! |
2013 | @param[in,out] buf_pool buffer pool instance |
2014 | @param[in] type flush type |
@param[in]	min_n		wished minimum number of blocks flushed
(it is not guaranteed that the actual number is that big, though)
@param[in]	lsn_limit	in the case of BUF_FLUSH_LIST all blocks whose
oldest_modification is smaller than this should be flushed (if their number
does not exceed min_n), otherwise ignored
@param[out]	n		counters for the flushed and evicted pages,
passed back to the caller. Ignored if NULL
@retval true if a batch was queued successfully.
@retval false if another batch of the same type was already running. */
2024 | bool |
2025 | buf_flush_do_batch( |
2026 | buf_pool_t* buf_pool, |
2027 | buf_flush_t type, |
2028 | ulint min_n, |
2029 | lsn_t lsn_limit, |
2030 | flush_counters_t* n) |
2031 | { |
2032 | ut_ad(type == BUF_FLUSH_LRU || type == BUF_FLUSH_LIST); |
2033 | |
2034 | if (n != NULL) { |
2035 | n->flushed = 0; |
2036 | } |
2037 | |
2038 | if (!buf_flush_start(buf_pool, type)) { |
2039 | return(false); |
2040 | } |
2041 | |
2042 | buf_flush_batch(buf_pool, type, min_n, lsn_limit, n); |
2043 | |
2044 | buf_flush_end(buf_pool, type); |
2045 | |
2046 | return(true); |
2047 | } |
2048 | /** |
Waits until all blocks whose oldest_modification is smaller than the given
lsn have been flushed to disk.
@param[in]	new_oldest	target oldest_modified_lsn to wait for */
2051 | |
2052 | void |
2053 | buf_flush_wait_flushed( |
2054 | lsn_t new_oldest) |
2055 | { |
2056 | for (ulint i = 0; i < srv_buf_pool_instances; ++i) { |
2057 | buf_pool_t* buf_pool; |
2058 | lsn_t oldest; |
2059 | |
2060 | buf_pool = buf_pool_from_array(i); |
2061 | |
2062 | for (;;) { |
			/* We don't need to wait for the fsync of the
			flushed blocks, because an fsync is needed anyway
			to make a checkpoint. So, we don't need to wait
			for the batch end here. */
2066 | |
2067 | buf_flush_list_mutex_enter(buf_pool); |
2068 | |
2069 | buf_page_t* bpage; |
2070 | |
2071 | /* We don't need to wait for system temporary pages */ |
2072 | for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list); |
2073 | bpage != NULL |
2074 | && fsp_is_system_temporary(bpage->id.space()); |
2075 | bpage = UT_LIST_GET_PREV(list, bpage)) { |
2076 | /* Do nothing. */ |
2077 | } |
2078 | |
2079 | if (bpage != NULL) { |
2080 | ut_ad(bpage->in_flush_list); |
2081 | oldest = bpage->oldest_modification; |
2082 | } else { |
2083 | oldest = 0; |
2084 | } |
2085 | |
2086 | buf_flush_list_mutex_exit(buf_pool); |
2087 | |
2088 | if (oldest == 0 || oldest >= new_oldest) { |
2089 | break; |
2090 | } |
2091 | |
2092 | /* sleep and retry */ |
2093 | os_thread_sleep(buf_flush_wait_flushed_sleep_time); |
2094 | |
2095 | MONITOR_INC(MONITOR_FLUSH_SYNC_WAITS); |
2096 | } |
2097 | } |
2098 | } |
2099 | |
2100 | /** This utility flushes dirty blocks from the end of the flush list of all |
2101 | buffer pool instances. |
2102 | NOTE: The calling thread is not allowed to own any latches on pages! |
@param[in]	min_n		wished minimum number of blocks flushed (it is
not guaranteed that the actual number is that big, though)
@param[in]	lsn_limit	in the case of BUF_FLUSH_LIST all blocks whose
2106 | oldest_modification is smaller than this should be flushed (if their number |
2107 | does not exceed min_n), otherwise ignored |
2108 | @param[out] n_processed the number of pages which were processed is |
2109 | passed back to caller. Ignored if NULL. |
2110 | @return true if a batch was queued successfully for each buffer pool |
instance. false if another batch of the same type was already running in
at least one of the buffer pool instances */
2113 | bool |
2114 | buf_flush_lists( |
2115 | ulint min_n, |
2116 | lsn_t lsn_limit, |
2117 | ulint* n_processed) |
2118 | { |
2119 | ulint i; |
2120 | ulint n_flushed = 0; |
2121 | bool success = true; |
2122 | |
2123 | if (n_processed) { |
2124 | *n_processed = 0; |
2125 | } |
2126 | |
2127 | if (min_n != ULINT_MAX) { |
2128 | /* Ensure that flushing is spread evenly amongst the |
2129 | buffer pool instances. When min_n is ULINT_MAX |
2130 | we need to flush everything up to the lsn limit |
2131 | so no limit here. */ |
2132 | min_n = (min_n + srv_buf_pool_instances - 1) |
2133 | / srv_buf_pool_instances; |
2134 | } |
2135 | |
2136 | /* Flush to lsn_limit in all buffer pool instances */ |
2137 | for (i = 0; i < srv_buf_pool_instances; i++) { |
2138 | buf_pool_t* buf_pool; |
2139 | flush_counters_t n; |
2140 | |
2141 | memset(&n, 0, sizeof(flush_counters_t)); |
2142 | buf_pool = buf_pool_from_array(i); |
2143 | |
2144 | if (!buf_flush_do_batch(buf_pool, |
2145 | BUF_FLUSH_LIST, |
2146 | min_n, |
2147 | lsn_limit, |
2148 | &n)) { |
2149 | /* We have two choices here. If lsn_limit was |
2150 | specified then skipping an instance of buffer |
2151 | pool means we cannot guarantee that all pages |
2152 | up to lsn_limit has been flushed. We can |
2153 | return right now with failure or we can try |
2154 | to flush remaining buffer pools up to the |
2155 | lsn_limit. We attempt to flush other buffer |
2156 | pools based on the assumption that it will |
2157 | help in the retry which will follow the |
2158 | failure. */ |
2159 | success = false; |
2160 | |
2161 | } |
2162 | |
2163 | n_flushed += n.flushed; |
2164 | } |
2165 | |
2166 | if (n_flushed) { |
2167 | buf_flush_stats(n_flushed, 0); |
2168 | if (n_processed) { |
2169 | *n_processed = n_flushed; |
2170 | } |
2171 | } |
2172 | |
2173 | return(success); |
2174 | } |
2175 | |
2176 | /******************************************************************//** |
2177 | This function picks up a single page from the tail of the LRU |
2178 | list, flushes it (if it is dirty), removes it from page_hash and LRU |
2179 | list and puts it on the free list. It is called from user threads when |
they are unable to find a replaceable page at the tail of the LRU
list, i.e. when the background LRU flushing in the page_cleaner thread
is not fast enough to keep pace with the workload.
2183 | @return true if success. */ |
2184 | bool |
2185 | buf_flush_single_page_from_LRU( |
2186 | /*===========================*/ |
2187 | buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */ |
2188 | { |
2189 | ulint scanned; |
2190 | buf_page_t* bpage; |
2191 | ibool freed; |
2192 | |
2193 | buf_pool_mutex_enter(buf_pool); |
2194 | |
2195 | for (bpage = buf_pool->single_scan_itr.start(), scanned = 0, |
2196 | freed = false; |
2197 | bpage != NULL; |
2198 | ++scanned, bpage = buf_pool->single_scan_itr.get()) { |
2199 | |
2200 | ut_ad(buf_pool_mutex_own(buf_pool)); |
2201 | |
2202 | buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); |
2203 | buf_pool->single_scan_itr.set(prev); |
2204 | BPageMutex* block_mutex; |
2205 | |
2206 | block_mutex = buf_page_get_mutex(bpage); |
2207 | |
2208 | mutex_enter(block_mutex); |
2209 | |
2210 | if (buf_flush_ready_for_replace(bpage)) { |
2211 | /* block is ready for eviction i.e., it is |
2212 | clean and is not IO-fixed or buffer fixed. */ |
2213 | mutex_exit(block_mutex); |
2214 | |
2215 | if (buf_LRU_free_page(bpage, true)) { |
2216 | buf_pool_mutex_exit(buf_pool); |
2217 | freed = true; |
2218 | break; |
2219 | } |
2220 | |
2221 | } else if (buf_flush_ready_for_flush( |
2222 | bpage, BUF_FLUSH_SINGLE_PAGE)) { |
2223 | |
2224 | /* Block is ready for flush. Try and dispatch an IO |
2225 | request. We'll put it on free list in IO completion |
2226 | routine if it is not buffer fixed. The following call |
2227 | will release the buffer pool and block mutex. |
2228 | |
2229 | Note: There is no guarantee that this page has actually |
2230 | been freed, only that it has been flushed to disk */ |
2231 | |
2232 | freed = buf_flush_page( |
2233 | buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, true); |
2234 | |
2235 | if (freed) { |
2236 | break; |
2237 | } |
2238 | |
2239 | mutex_exit(block_mutex); |
2240 | } else { |
2241 | mutex_exit(block_mutex); |
2242 | } |
2243 | ut_ad(!mutex_own(block_mutex)); |
2244 | } |
2245 | if (!freed) { |
2246 | /* Can't find a single flushable page. */ |
2247 | ut_ad(!bpage); |
2248 | buf_pool_mutex_exit(buf_pool); |
2249 | } |
2250 | |
2251 | if (scanned) { |
2252 | MONITOR_INC_VALUE_CUMULATIVE( |
2253 | MONITOR_LRU_SINGLE_FLUSH_SCANNED, |
2254 | MONITOR_LRU_SINGLE_FLUSH_SCANNED_NUM_CALL, |
2255 | MONITOR_LRU_SINGLE_FLUSH_SCANNED_PER_CALL, |
2256 | scanned); |
2257 | } |
2258 | |
2259 | ut_ad(!buf_pool_mutex_own(buf_pool)); |
2260 | return(freed); |
2261 | } |
2262 | |
2263 | /** |
2264 | Clears up tail of the LRU list of a given buffer pool instance: |
2265 | * Put replaceable pages at the tail of LRU to the free list |
2266 | * Flush dirty pages at the tail of LRU to the disk |
2267 | The depth to which we scan each buffer pool is controlled by dynamic |
2268 | config parameter innodb_LRU_scan_depth. |
2269 | @param buf_pool buffer pool instance |
2270 | @return total pages flushed */ |
2271 | static |
2272 | ulint |
2273 | buf_flush_LRU_list( |
2274 | buf_pool_t* buf_pool) |
2275 | { |
2276 | ulint scan_depth, withdraw_depth; |
2277 | flush_counters_t n; |
2278 | |
2279 | memset(&n, 0, sizeof(flush_counters_t)); |
2280 | |
2281 | ut_ad(buf_pool); |
	/* srv_LRU_scan_depth can be an arbitrarily large value.
	We cap it at the current LRU size. */
2284 | buf_pool_mutex_enter(buf_pool); |
2285 | scan_depth = UT_LIST_GET_LEN(buf_pool->LRU); |
2286 | if (buf_pool->curr_size < buf_pool->old_size |
2287 | && buf_pool->withdraw_target > 0) { |
2288 | withdraw_depth = buf_pool->withdraw_target |
2289 | - UT_LIST_GET_LEN(buf_pool->withdraw); |
2290 | } else { |
2291 | withdraw_depth = 0; |
2292 | } |
2293 | buf_pool_mutex_exit(buf_pool); |
2294 | if (withdraw_depth > srv_LRU_scan_depth) { |
2295 | scan_depth = ut_min(withdraw_depth, scan_depth); |
2296 | } else { |
2297 | scan_depth = ut_min(static_cast<ulint>(srv_LRU_scan_depth), |
2298 | scan_depth); |
2299 | } |
	/* Currently, only one of the page_cleaner threads can trigger
	an LRU flush for a given buffer pool instance at a time. So it
	is not possible that a batch triggered during the last
	iteration is still running. */
2304 | buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU, scan_depth, |
2305 | 0, &n); |
2306 | |
2307 | return(n.flushed); |
2308 | } |
2309 | |
2310 | /*********************************************************************//** |
2311 | Wait for any possible LRU flushes that are in progress to end. */ |
2312 | void |
2313 | buf_flush_wait_LRU_batch_end(void) |
2314 | /*==============================*/ |
2315 | { |
2316 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
2317 | buf_pool_t* buf_pool; |
2318 | |
2319 | buf_pool = buf_pool_from_array(i); |
2320 | |
2321 | buf_pool_mutex_enter(buf_pool); |
2322 | |
2323 | if (buf_pool->n_flush[BUF_FLUSH_LRU] > 0 |
2324 | || buf_pool->init_flush[BUF_FLUSH_LRU]) { |
2325 | |
2326 | buf_pool_mutex_exit(buf_pool); |
2327 | buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU); |
2328 | } else { |
2329 | buf_pool_mutex_exit(buf_pool); |
2330 | } |
2331 | } |
2332 | } |
2333 | |
2334 | /*********************************************************************//** |
2335 | Calculates if flushing is required based on number of dirty pages in |
2336 | the buffer pool. |
2337 | @return percent of io_capacity to flush to manage dirty page ratio */ |
2338 | static |
2339 | ulint |
2340 | af_get_pct_for_dirty() |
2341 | /*==================*/ |
2342 | { |
2343 | double dirty_pct = buf_get_modified_ratio_pct(); |
2344 | |
2345 | if (dirty_pct == 0.0) { |
2346 | /* No pages modified */ |
2347 | return(0); |
2348 | } |
2349 | |
2350 | ut_a(srv_max_dirty_pages_pct_lwm |
2351 | <= srv_max_buf_pool_modified_pct); |
2352 | |
2353 | if (srv_max_dirty_pages_pct_lwm == 0) { |
2354 | /* The user has not set the option to preflush dirty |
2355 | pages as we approach the high water mark. */ |
2356 | if (dirty_pct >= srv_max_buf_pool_modified_pct) { |
			/* We have crossed the high water mark of dirty
			pages. In this case we start flushing at 100% of
			innodb_io_capacity. */
2360 | return(100); |
2361 | } |
2362 | } else if (dirty_pct >= srv_max_dirty_pages_pct_lwm) { |
2363 | /* We should start flushing pages gradually. */ |
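		/* As a rough illustration (assuming
		innodb_max_dirty_pages_pct is at its default of 75),
		a dirty page ratio of 60% yields
		(60 * 100) / 76 = 78, i.e. roughly 78% of
		innodb_io_capacity. */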
2364 | return(static_cast<ulint>((dirty_pct * 100) |
2365 | / (srv_max_buf_pool_modified_pct + 1))); |
2366 | } |
2367 | |
2368 | return(0); |
2369 | } |
2370 | |
2371 | /*********************************************************************//** |
2372 | Calculates if flushing is required based on redo generation rate. |
2373 | @return percent of io_capacity to flush to manage redo space */ |
2374 | static |
2375 | ulint |
2376 | af_get_pct_for_lsn( |
2377 | /*===============*/ |
2378 | lsn_t age) /*!< in: current age of LSN. */ |
2379 | { |
2380 | lsn_t max_async_age; |
2381 | lsn_t lsn_age_factor; |
2382 | lsn_t af_lwm = (lsn_t) ((srv_adaptive_flushing_lwm |
2383 | * log_get_capacity()) / 100); |
2384 | |
2385 | if (age < af_lwm) { |
2386 | /* No adaptive flushing. */ |
2387 | return(0); |
2388 | } |
2389 | |
2390 | max_async_age = log_get_max_modified_age_async(); |
2391 | |
2392 | if (age < max_async_age && !srv_adaptive_flushing) { |
2393 | /* We have still not reached the max_async point and |
2394 | the user has disabled adaptive flushing. */ |
2395 | return(0); |
2396 | } |
2397 | |
2398 | /* If we are here then we know that either: |
2399 | 1) User has enabled adaptive flushing |
2400 | 2) User may have disabled adaptive flushing but we have reached |
2401 | max_async_age. */ |
2402 | lsn_age_factor = (age * 100) / max_async_age; |
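	/* The percentage below grows superlinearly with
	lsn_age_factor. As a rough illustration, assuming
	srv_max_io_capacity is twice srv_io_capacity and
	lsn_age_factor == 100, the result is
	2 * (100 * sqrt(100)) / 7.5 = 266 percent of
	innodb_io_capacity. */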
2403 | |
2404 | ut_ad(srv_max_io_capacity >= srv_io_capacity); |
2405 | return(static_cast<ulint>( |
2406 | ((srv_max_io_capacity / srv_io_capacity) |
2407 | * (lsn_age_factor * sqrt((double)lsn_age_factor))) |
2408 | / 7.5)); |
2409 | } |
2410 | |
2411 | /*********************************************************************//** |
2412 | This function is called approximately once every second by the |
2413 | page_cleaner thread. Based on various factors it decides if there is a |
2414 | need to do flushing. |
@param	lsn_limit	pointer to return LSN up to which flushing must happen
@param	last_pages_in	the number of pages flushed in the previous
			flush_list flushing
@return number of pages recommended to be flushed */
2419 | static |
2420 | ulint |
2421 | page_cleaner_flush_pages_recommendation( |
2422 | /*====================================*/ |
2423 | lsn_t* lsn_limit, |
2424 | ulint last_pages_in) |
2425 | { |
2426 | static lsn_t prev_lsn = 0; |
2427 | static ulint sum_pages = 0; |
2428 | static ulint avg_page_rate = 0; |
2429 | static ulint n_iterations = 0; |
2430 | static time_t prev_time; |
2431 | lsn_t oldest_lsn; |
2432 | lsn_t cur_lsn; |
2433 | lsn_t age; |
2434 | lsn_t lsn_rate; |
2435 | ulint n_pages = 0; |
2436 | ulint pct_for_dirty = 0; |
2437 | ulint pct_for_lsn = 0; |
2438 | ulint pct_total = 0; |
2439 | |
2440 | cur_lsn = log_get_lsn_nowait(); |
2441 | |
	/* log_get_lsn_nowait tries to acquire log_sys.mutex with
	mutex_enter_nowait; if this does not succeed the function
	returns 0, and we do not use that value to update the stats. */
2445 | if (cur_lsn == 0) { |
2446 | return(0); |
2447 | } |
2448 | |
2449 | if (prev_lsn == 0) { |
2450 | /* First time around. */ |
2451 | prev_lsn = cur_lsn; |
2452 | prev_time = ut_time(); |
2453 | return(0); |
2454 | } |
2455 | |
2456 | if (prev_lsn == cur_lsn) { |
2457 | return(0); |
2458 | } |
2459 | |
2460 | sum_pages += last_pages_in; |
2461 | |
2462 | time_t curr_time = ut_time(); |
2463 | double time_elapsed = difftime(curr_time, prev_time); |
2464 | |
	/* We update our variables every srv_flushing_avg_loops
	iterations to smooth out transitions in the workload. */
2467 | if (++n_iterations >= srv_flushing_avg_loops |
2468 | || time_elapsed >= srv_flushing_avg_loops) { |
2469 | |
2470 | if (time_elapsed < 1) { |
2471 | time_elapsed = 1; |
2472 | } |
2473 | |
2474 | avg_page_rate = static_cast<ulint>( |
2475 | ((static_cast<double>(sum_pages) |
2476 | / time_elapsed) |
2477 | + avg_page_rate) / 2); |
2478 | |
2479 | /* How much LSN we have generated since last call. */ |
2480 | lsn_rate = static_cast<lsn_t>( |
2481 | static_cast<double>(cur_lsn - prev_lsn) |
2482 | / time_elapsed); |
2483 | |
2484 | lsn_avg_rate = (lsn_avg_rate + lsn_rate) / 2; |
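		/* Both averages above blend the new sample with the
		previous average in equal parts, so the weight of an
		old sample halves with every update period. */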
2485 | |
2486 | /* aggregate stats of all slots */ |
2487 | mutex_enter(&page_cleaner.mutex); |
2488 | |
2489 | ulint flush_tm = page_cleaner.flush_time; |
2490 | ulint flush_pass = page_cleaner.flush_pass; |
2491 | |
2492 | page_cleaner.flush_time = 0; |
2493 | page_cleaner.flush_pass = 0; |
2494 | |
2495 | ulint lru_tm = 0; |
2496 | ulint list_tm = 0; |
2497 | ulint lru_pass = 0; |
2498 | ulint list_pass = 0; |
2499 | |
2500 | for (ulint i = 0; i < page_cleaner.n_slots; i++) { |
2501 | page_cleaner_slot_t* slot; |
2502 | |
2503 | slot = &page_cleaner.slots[i]; |
2504 | |
2505 | lru_tm += slot->flush_lru_time; |
2506 | lru_pass += slot->flush_lru_pass; |
2507 | list_tm += slot->flush_list_time; |
2508 | list_pass += slot->flush_list_pass; |
2509 | |
2510 | slot->flush_lru_time = 0; |
2511 | slot->flush_lru_pass = 0; |
2512 | slot->flush_list_time = 0; |
2513 | slot->flush_list_pass = 0; |
2514 | } |
2515 | |
2516 | mutex_exit(&page_cleaner.mutex); |
2517 | |
2518 | /* minimum values are 1, to avoid dividing by zero. */ |
2519 | if (lru_tm < 1) { |
2520 | lru_tm = 1; |
2521 | } |
2522 | if (list_tm < 1) { |
2523 | list_tm = 1; |
2524 | } |
2525 | if (flush_tm < 1) { |
2526 | flush_tm = 1; |
2527 | } |
2528 | |
2529 | if (lru_pass < 1) { |
2530 | lru_pass = 1; |
2531 | } |
2532 | if (list_pass < 1) { |
2533 | list_pass = 1; |
2534 | } |
2535 | if (flush_pass < 1) { |
2536 | flush_pass = 1; |
2537 | } |
2538 | |
2539 | MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_SLOT, |
2540 | list_tm / list_pass); |
2541 | MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_SLOT, |
2542 | lru_tm / lru_pass); |
2543 | |
2544 | MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_THREAD, |
2545 | list_tm / (srv_n_page_cleaners * flush_pass)); |
2546 | MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_THREAD, |
2547 | lru_tm / (srv_n_page_cleaners * flush_pass)); |
2548 | MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_TIME_EST, |
2549 | flush_tm * list_tm / flush_pass |
2550 | / (list_tm + lru_tm)); |
2551 | MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_TIME_EST, |
2552 | flush_tm * lru_tm / flush_pass |
2553 | / (list_tm + lru_tm)); |
2554 | MONITOR_SET(MONITOR_FLUSH_AVG_TIME, flush_tm / flush_pass); |
2555 | |
2556 | MONITOR_SET(MONITOR_FLUSH_ADAPTIVE_AVG_PASS, |
2557 | list_pass / page_cleaner.n_slots); |
2558 | MONITOR_SET(MONITOR_LRU_BATCH_FLUSH_AVG_PASS, |
2559 | lru_pass / page_cleaner.n_slots); |
2560 | MONITOR_SET(MONITOR_FLUSH_AVG_PASS, flush_pass); |
2561 | |
2562 | prev_lsn = cur_lsn; |
2563 | prev_time = curr_time; |
2564 | |
2565 | n_iterations = 0; |
2566 | |
2567 | sum_pages = 0; |
2568 | } |
2569 | |
2570 | oldest_lsn = buf_pool_get_oldest_modification(); |
2571 | |
2572 | ut_ad(oldest_lsn <= log_get_lsn()); |
2573 | |
2574 | age = cur_lsn > oldest_lsn ? cur_lsn - oldest_lsn : 0; |
2575 | |
2576 | pct_for_dirty = af_get_pct_for_dirty(); |
2577 | pct_for_lsn = af_get_pct_for_lsn(age); |
2578 | |
2579 | pct_total = ut_max(pct_for_dirty, pct_for_lsn); |
2580 | |
2581 | /* Estimate pages to be flushed for the lsn progress */ |
2582 | ulint sum_pages_for_lsn = 0; |
2583 | lsn_t target_lsn = oldest_lsn |
2584 | + lsn_avg_rate * buf_flush_lsn_scan_factor; |
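	/* That is, look ahead by roughly buf_flush_lsn_scan_factor
	update periods' worth of redo at the recent average
	generation rate. */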
2585 | |
2586 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
2587 | buf_pool_t* buf_pool = buf_pool_from_array(i); |
2588 | ulint pages_for_lsn = 0; |
2589 | |
2590 | buf_flush_list_mutex_enter(buf_pool); |
2591 | for (buf_page_t* b = UT_LIST_GET_LAST(buf_pool->flush_list); |
2592 | b != NULL; |
2593 | b = UT_LIST_GET_PREV(list, b)) { |
2594 | if (b->oldest_modification > target_lsn) { |
2595 | break; |
2596 | } |
2597 | ++pages_for_lsn; |
2598 | } |
2599 | buf_flush_list_mutex_exit(buf_pool); |
2600 | |
2601 | sum_pages_for_lsn += pages_for_lsn; |
2602 | |
2603 | mutex_enter(&page_cleaner.mutex); |
2604 | ut_ad(page_cleaner.slots[i].state |
2605 | == PAGE_CLEANER_STATE_NONE); |
2606 | page_cleaner.slots[i].n_pages_requested |
2607 | = pages_for_lsn / buf_flush_lsn_scan_factor + 1; |
2608 | mutex_exit(&page_cleaner.mutex); |
2609 | } |
2610 | |
2611 | sum_pages_for_lsn /= buf_flush_lsn_scan_factor; |
	if (sum_pages_for_lsn < 1) {
2613 | sum_pages_for_lsn = 1; |
2614 | } |
2615 | |
	/* Cap the maximum IO capacity that we are going to use at
	max_io_capacity. Limit the value to avoid a too quick increase. */
2618 | ulint pages_for_lsn = |
2619 | std::min<ulint>(sum_pages_for_lsn, srv_max_io_capacity * 2); |
2620 | |
2621 | n_pages = (PCT_IO(pct_total) + avg_page_rate + pages_for_lsn) / 3; |
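	/* n_pages averages three signals: the percentage of
	innodb_io_capacity suggested by the dirty page ratio and redo
	age, the observed average flush rate, and the pages needed for
	the LSN target. As a rough illustration, assuming
	innodb_io_capacity == 200, pct_total == 90, avg_page_rate == 150
	and pages_for_lsn == 300, this gives
	(180 + 150 + 300) / 3 = 210 pages, which is then capped by
	srv_max_io_capacity below. */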
2622 | |
2623 | if (n_pages > srv_max_io_capacity) { |
2624 | n_pages = srv_max_io_capacity; |
2625 | } |
2626 | |
2627 | /* Normalize request for each instance */ |
2628 | mutex_enter(&page_cleaner.mutex); |
2629 | ut_ad(page_cleaner.n_slots_requested == 0); |
2630 | ut_ad(page_cleaner.n_slots_flushing == 0); |
2631 | ut_ad(page_cleaner.n_slots_finished == 0); |
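	/* When redo space is tight (pct_for_lsn > 30), each slot is
	given a share of n_pages proportional to how many of its pages
	fall below the target LSN; otherwise the request is simply
	split evenly across the instances. */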
2632 | |
2633 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
		/* If the redo log has enough free space, do not care
		about the age distribution of pages. */
2636 | page_cleaner.slots[i].n_pages_requested = pct_for_lsn > 30 ? |
2637 | page_cleaner.slots[i].n_pages_requested |
2638 | * n_pages / sum_pages_for_lsn + 1 |
2639 | : n_pages / srv_buf_pool_instances; |
2640 | } |
2641 | mutex_exit(&page_cleaner.mutex); |
2642 | |
2643 | MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_REQUESTED, n_pages); |
2644 | |
2645 | MONITOR_SET(MONITOR_FLUSH_N_TO_FLUSH_BY_AGE, sum_pages_for_lsn); |
2646 | |
2647 | MONITOR_SET(MONITOR_FLUSH_AVG_PAGE_RATE, avg_page_rate); |
2648 | MONITOR_SET(MONITOR_FLUSH_LSN_AVG_RATE, lsn_avg_rate); |
2649 | MONITOR_SET(MONITOR_FLUSH_PCT_FOR_DIRTY, pct_for_dirty); |
2650 | MONITOR_SET(MONITOR_FLUSH_PCT_FOR_LSN, pct_for_lsn); |
2651 | |
2652 | *lsn_limit = LSN_MAX; |
2653 | |
2654 | return(n_pages); |
2655 | } |
2656 | |
2657 | /*********************************************************************//** |
Puts the page_cleaner thread to sleep if it has finished work in less
than a second.
@retval 0 woken up because the event was set
@retval OS_SYNC_TIME_EXCEEDED if the timeout was exceeded
2662 | @param next_loop_time time when next loop iteration should start |
2663 | @param sig_count zero or the value returned by previous call of |
2664 | os_event_reset() |
2665 | @param cur_time current time as in ut_time_ms() */ |
2666 | static |
2667 | ulint |
2668 | pc_sleep_if_needed( |
2669 | /*===============*/ |
2670 | ulint next_loop_time, |
2671 | int64_t sig_count, |
2672 | ulint cur_time) |
2673 | { |
2674 | /* No sleep if we are cleaning the buffer pool during the shutdown |
2675 | with everything else finished */ |
2676 | if (srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE) |
2677 | return OS_SYNC_TIME_EXCEEDED; |
2678 | |
2679 | if (next_loop_time > cur_time) { |
2680 | /* Get sleep interval in micro seconds. We use |
2681 | ut_min() to avoid long sleep in case of wrap around. */ |
2682 | ulint sleep_us; |
2683 | |
2684 | sleep_us = ut_min(static_cast<ulint>(1000000), |
2685 | (next_loop_time - cur_time) * 1000); |
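		/* For example, if the next iteration is due in 700 ms
		the sleep is min(1000000, 700 * 1000) = 700000 us. */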
2686 | |
2687 | return(os_event_wait_time_low(buf_flush_event, |
2688 | sleep_us, sig_count)); |
2689 | } |
2690 | |
2691 | return(OS_SYNC_TIME_EXCEEDED); |
2692 | } |
2693 | |
2694 | /******************************************************************//** |
2695 | Initialize page_cleaner. */ |
2696 | void |
2697 | buf_flush_page_cleaner_init(void) |
2698 | /*=============================*/ |
2699 | { |
2700 | ut_ad(!page_cleaner.is_running); |
2701 | |
2702 | mutex_create(LATCH_ID_PAGE_CLEANER, &page_cleaner.mutex); |
2703 | |
	page_cleaner.is_requested = os_event_create("pc_is_requested");
	page_cleaner.is_finished = os_event_create("pc_is_finished");
	page_cleaner.is_started = os_event_create("pc_is_started");
2707 | page_cleaner.n_slots = static_cast<ulint>(srv_buf_pool_instances); |
2708 | |
2709 | ut_d(page_cleaner.n_disabled_debug = 0); |
2710 | |
2711 | page_cleaner.is_running = true; |
2712 | } |
2713 | |
2714 | /** |
Requests all slots to flush all buffer pool instances.
@param min_n	wished minimum number of blocks flushed
(it is not guaranteed that the actual number is that big)
@param lsn_limit	in the case of BUF_FLUSH_LIST all blocks whose
2719 | oldest_modification is smaller than this should be flushed |
2720 | (if their number does not exceed min_n), otherwise ignored |
2721 | */ |
2722 | static |
2723 | void |
2724 | pc_request( |
2725 | ulint min_n, |
2726 | lsn_t lsn_limit) |
2727 | { |
2728 | if (min_n != ULINT_MAX) { |
2729 | /* Ensure that flushing is spread evenly amongst the |
2730 | buffer pool instances. When min_n is ULINT_MAX |
2731 | we need to flush everything up to the lsn limit |
2732 | so no limit here. */ |
2733 | min_n = (min_n + srv_buf_pool_instances - 1) |
2734 | / srv_buf_pool_instances; |
2735 | } |
2736 | |
2737 | mutex_enter(&page_cleaner.mutex); |
2738 | |
2739 | ut_ad(page_cleaner.n_slots_requested == 0); |
2740 | ut_ad(page_cleaner.n_slots_flushing == 0); |
2741 | ut_ad(page_cleaner.n_slots_finished == 0); |
2742 | |
2743 | page_cleaner.requested = (min_n > 0); |
2744 | page_cleaner.lsn_limit = lsn_limit; |
2745 | |
2746 | for (ulint i = 0; i < page_cleaner.n_slots; i++) { |
2747 | page_cleaner_slot_t* slot = &page_cleaner.slots[i]; |
2748 | |
2749 | ut_ad(slot->state == PAGE_CLEANER_STATE_NONE); |
2750 | |
2751 | if (min_n == ULINT_MAX) { |
2752 | slot->n_pages_requested = ULINT_MAX; |
2753 | } else if (min_n == 0) { |
2754 | slot->n_pages_requested = 0; |
2755 | } |
2756 | |
2757 | /* slot->n_pages_requested was already set by |
2758 | page_cleaner_flush_pages_recommendation() */ |
2759 | |
2760 | slot->state = PAGE_CLEANER_STATE_REQUESTED; |
2761 | } |
2762 | |
2763 | page_cleaner.n_slots_requested = page_cleaner.n_slots; |
2764 | page_cleaner.n_slots_flushing = 0; |
2765 | page_cleaner.n_slots_finished = 0; |
2766 | |
2767 | os_event_set(page_cleaner.is_requested); |
2768 | |
2769 | mutex_exit(&page_cleaner.mutex); |
2770 | } |
2771 | |
2772 | /** |
2773 | Do flush for one slot. |
@return the number of slots which have not been treated yet. */
2775 | static |
2776 | ulint |
2777 | pc_flush_slot(void) |
2778 | { |
2779 | ulint lru_tm = 0; |
2780 | ulint list_tm = 0; |
2781 | ulint lru_pass = 0; |
2782 | ulint list_pass = 0; |
2783 | |
2784 | mutex_enter(&page_cleaner.mutex); |
2785 | |
2786 | if (!page_cleaner.n_slots_requested) { |
2787 | os_event_reset(page_cleaner.is_requested); |
2788 | } else { |
2789 | page_cleaner_slot_t* slot = NULL; |
2790 | ulint i; |
2791 | |
2792 | for (i = 0; i < page_cleaner.n_slots; i++) { |
2793 | slot = &page_cleaner.slots[i]; |
2794 | |
2795 | if (slot->state == PAGE_CLEANER_STATE_REQUESTED) { |
2796 | break; |
2797 | } |
2798 | } |
2799 | |
2800 | /* slot should be found because |
2801 | page_cleaner.n_slots_requested > 0 */ |
2802 | ut_a(i < page_cleaner.n_slots); |
2803 | |
2804 | buf_pool_t* buf_pool = buf_pool_from_array(i); |
2805 | |
2806 | page_cleaner.n_slots_requested--; |
2807 | page_cleaner.n_slots_flushing++; |
2808 | slot->state = PAGE_CLEANER_STATE_FLUSHING; |
2809 | |
2810 | if (UNIV_UNLIKELY(!page_cleaner.is_running)) { |
2811 | slot->n_flushed_lru = 0; |
2812 | slot->n_flushed_list = 0; |
2813 | goto finish_mutex; |
2814 | } |
2815 | |
2816 | if (page_cleaner.n_slots_requested == 0) { |
2817 | os_event_reset(page_cleaner.is_requested); |
2818 | } |
2819 | |
2820 | mutex_exit(&page_cleaner.mutex); |
2821 | |
2822 | lru_tm = ut_time_ms(); |
2823 | |
2824 | /* Flush pages from end of LRU if required */ |
2825 | slot->n_flushed_lru = buf_flush_LRU_list(buf_pool); |
2826 | |
2827 | lru_tm = ut_time_ms() - lru_tm; |
2828 | lru_pass++; |
2829 | |
2830 | if (UNIV_UNLIKELY(!page_cleaner.is_running)) { |
2831 | slot->n_flushed_list = 0; |
2832 | goto finish; |
2833 | } |
2834 | |
2835 | /* Flush pages from flush_list if required */ |
2836 | if (page_cleaner.requested) { |
2837 | flush_counters_t n; |
2838 | memset(&n, 0, sizeof(flush_counters_t)); |
2839 | list_tm = ut_time_ms(); |
2840 | |
2841 | slot->succeeded_list = buf_flush_do_batch( |
2842 | buf_pool, BUF_FLUSH_LIST, |
2843 | slot->n_pages_requested, |
2844 | page_cleaner.lsn_limit, |
2845 | &n); |
2846 | |
2847 | slot->n_flushed_list = n.flushed; |
2848 | |
2849 | list_tm = ut_time_ms() - list_tm; |
2850 | list_pass++; |
2851 | } else { |
2852 | slot->n_flushed_list = 0; |
2853 | slot->succeeded_list = true; |
2854 | } |
2855 | finish: |
2856 | mutex_enter(&page_cleaner.mutex); |
2857 | finish_mutex: |
2858 | page_cleaner.n_slots_flushing--; |
2859 | page_cleaner.n_slots_finished++; |
2860 | slot->state = PAGE_CLEANER_STATE_FINISHED; |
2861 | |
2862 | slot->flush_lru_time += lru_tm; |
2863 | slot->flush_list_time += list_tm; |
2864 | slot->flush_lru_pass += lru_pass; |
2865 | slot->flush_list_pass += list_pass; |
2866 | |
2867 | if (page_cleaner.n_slots_requested == 0 |
2868 | && page_cleaner.n_slots_flushing == 0) { |
2869 | os_event_set(page_cleaner.is_finished); |
2870 | } |
2871 | } |
2872 | |
2873 | ulint ret = page_cleaner.n_slots_requested; |
2874 | |
2875 | mutex_exit(&page_cleaner.mutex); |
2876 | |
2877 | return(ret); |
2878 | } |
2879 | |
2880 | /** |
2881 | Wait until all flush requests are finished. |
2882 | @param n_flushed_lru number of pages flushed from the end of the LRU list. |
2883 | @param n_flushed_list number of pages flushed from the end of the |
2884 | flush_list. |
@return true if all flush_list flushing batches were successful. */
2886 | static |
2887 | bool |
2888 | pc_wait_finished( |
2889 | ulint* n_flushed_lru, |
2890 | ulint* n_flushed_list) |
2891 | { |
2892 | bool all_succeeded = true; |
2893 | |
2894 | *n_flushed_lru = 0; |
2895 | *n_flushed_list = 0; |
2896 | |
2897 | os_event_wait(page_cleaner.is_finished); |
2898 | |
2899 | mutex_enter(&page_cleaner.mutex); |
2900 | |
2901 | ut_ad(page_cleaner.n_slots_requested == 0); |
2902 | ut_ad(page_cleaner.n_slots_flushing == 0); |
2903 | ut_ad(page_cleaner.n_slots_finished == page_cleaner.n_slots); |
2904 | |
2905 | for (ulint i = 0; i < page_cleaner.n_slots; i++) { |
2906 | page_cleaner_slot_t* slot = &page_cleaner.slots[i]; |
2907 | |
2908 | ut_ad(slot->state == PAGE_CLEANER_STATE_FINISHED); |
2909 | |
2910 | *n_flushed_lru += slot->n_flushed_lru; |
2911 | *n_flushed_list += slot->n_flushed_list; |
2912 | all_succeeded &= slot->succeeded_list; |
2913 | |
2914 | slot->state = PAGE_CLEANER_STATE_NONE; |
2915 | |
2916 | slot->n_pages_requested = 0; |
2917 | } |
2918 | |
2919 | page_cleaner.n_slots_finished = 0; |
2920 | |
2921 | os_event_reset(page_cleaner.is_finished); |
2922 | |
2923 | mutex_exit(&page_cleaner.mutex); |
2924 | |
2925 | return(all_succeeded); |
2926 | } |
2927 | |
2928 | #ifdef UNIV_LINUX |
2929 | /** |
2930 | Set priority for page_cleaner threads. |
@param[in]	priority	the priority intended to be set
2932 | @return true if set as intended */ |
2933 | static |
2934 | bool |
2935 | buf_flush_page_cleaner_set_priority( |
2936 | int priority) |
2937 | { |
2938 | setpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid), |
2939 | priority); |
2940 | return(getpriority(PRIO_PROCESS, (pid_t)syscall(SYS_gettid)) |
2941 | == priority); |
2942 | } |
2943 | #endif /* UNIV_LINUX */ |
2944 | |
2945 | #ifdef UNIV_DEBUG |
2946 | /** Loop used to disable page cleaner threads. */ |
2947 | static |
2948 | void |
2949 | buf_flush_page_cleaner_disabled_loop(void) |
2950 | { |
2951 | if (!innodb_page_cleaner_disabled_debug) { |
2952 | /* We return to avoid entering and exiting mutex. */ |
2953 | return; |
2954 | } |
2955 | |
2956 | mutex_enter(&page_cleaner.mutex); |
2957 | page_cleaner.n_disabled_debug++; |
2958 | mutex_exit(&page_cleaner.mutex); |
2959 | |
2960 | while (innodb_page_cleaner_disabled_debug |
2961 | && srv_shutdown_state == SRV_SHUTDOWN_NONE |
2962 | && page_cleaner.is_running) { |
2963 | |
2964 | os_thread_sleep(100000); /* [A] */ |
2965 | } |
2966 | |
2967 | /* We need to wait for threads exiting here, otherwise we would |
	encounter a problem when we quickly perform the following steps:
2969 | 1) SET GLOBAL innodb_page_cleaner_disabled_debug = 1; |
2970 | 2) SET GLOBAL innodb_page_cleaner_disabled_debug = 0; |
2971 | 3) SET GLOBAL innodb_page_cleaner_disabled_debug = 1; |
2972 | That's because after step 1 this thread could still be sleeping |
2973 | inside the loop above at [A] and steps 2, 3 could happen before |
	this thread wakes up from [A]. In such a case this thread would
	not re-increment n_disabled_debug and we would be waiting for
	it forever in buf_flush_page_cleaner_disabled_debug_update(...).
2977 | |
2978 | Therefore we are waiting in step 2 for this thread exiting here. */ |
2979 | |
2980 | mutex_enter(&page_cleaner.mutex); |
2981 | page_cleaner.n_disabled_debug--; |
2982 | mutex_exit(&page_cleaner.mutex); |
2983 | } |
2984 | |
2985 | /** Disables page cleaner threads (coordinator and workers). |
2986 | @param[in] save immediate result from check function */ |
2987 | void buf_flush_page_cleaner_disabled_debug_update(THD*, |
2988 | st_mysql_sys_var*, void*, |
2989 | const void* save) |
2990 | { |
2991 | if (!page_cleaner.is_running) { |
2992 | return; |
2993 | } |
2994 | |
2995 | if (!*static_cast<const my_bool*>(save)) { |
2996 | if (!innodb_page_cleaner_disabled_debug) { |
2997 | return; |
2998 | } |
2999 | |
3000 | innodb_page_cleaner_disabled_debug = false; |
3001 | |
3002 | /* Enable page cleaner threads. */ |
3003 | while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { |
3004 | mutex_enter(&page_cleaner.mutex); |
3005 | const ulint n = page_cleaner.n_disabled_debug; |
3006 | mutex_exit(&page_cleaner.mutex); |
3007 | /* Check if all threads have been enabled, to avoid |
3008 | problem when we decide to re-disable them soon. */ |
3009 | if (n == 0) { |
3010 | break; |
3011 | } |
3012 | } |
3013 | return; |
3014 | } |
3015 | |
3016 | if (innodb_page_cleaner_disabled_debug) { |
3017 | return; |
3018 | } |
3019 | |
3020 | innodb_page_cleaner_disabled_debug = true; |
3021 | |
3022 | while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { |
3023 | /* Workers are possibly sleeping on is_requested. |
3024 | |
3025 | We have to wake them, otherwise they could possibly |
3026 | have never noticed, that they should be disabled, |
3027 | and we would wait for them here forever. |
3028 | |
3029 | That's why we have sleep-loop instead of simply |
3030 | waiting on some disabled_debug_event. */ |
3031 | os_event_set(page_cleaner.is_requested); |
3032 | |
3033 | mutex_enter(&page_cleaner.mutex); |
3034 | |
3035 | ut_ad(page_cleaner.n_disabled_debug |
3036 | <= srv_n_page_cleaners); |
3037 | |
3038 | if (page_cleaner.n_disabled_debug |
3039 | == srv_n_page_cleaners) { |
3040 | |
3041 | mutex_exit(&page_cleaner.mutex); |
3042 | break; |
3043 | } |
3044 | |
3045 | mutex_exit(&page_cleaner.mutex); |
3046 | |
3047 | os_thread_sleep(100000); |
3048 | } |
3049 | } |
3050 | #endif /* UNIV_DEBUG */ |
3051 | |
3052 | /******************************************************************//** |
3053 | page_cleaner thread tasked with flushing dirty pages from the buffer |
3054 | pools. As of now we'll have only one coordinator. |
3055 | @return a dummy parameter */ |
3056 | extern "C" |
3057 | os_thread_ret_t |
3058 | DECLARE_THREAD(buf_flush_page_cleaner_coordinator)(void*) |
3059 | { |
3060 | my_thread_init(); |
3061 | #ifdef UNIV_PFS_THREAD |
3062 | pfs_register_thread(page_cleaner_thread_key); |
3063 | #endif /* UNIV_PFS_THREAD */ |
3064 | ut_ad(!srv_read_only_mode); |
3065 | |
3066 | #ifdef UNIV_DEBUG_THREAD_CREATION |
3067 | ib::info() << "page_cleaner thread running, id " |
3068 | << os_thread_pf(os_thread_get_curr_id()); |
3069 | #endif /* UNIV_DEBUG_THREAD_CREATION */ |
3070 | #ifdef UNIV_LINUX |
	/* Linux may be able to use a different priority setting for each
	thread. It is worth trying to set a high priority for the page
	cleaner threads. */
3073 | if (buf_flush_page_cleaner_set_priority( |
3074 | buf_flush_page_cleaner_priority)) { |
3075 | |
3076 | ib::info() << "page_cleaner coordinator priority: " |
3077 | << buf_flush_page_cleaner_priority; |
3078 | } else { |
3079 | ib::info() << "If the mysqld execution user is authorized," |
3080 | " page cleaner thread priority can be changed." |
3081 | " See the man page of setpriority()." ; |
3082 | } |
3083 | /* Signal that setpriority() has been attempted. */ |
3084 | os_event_set(recv_sys->flush_end); |
3085 | #endif /* UNIV_LINUX */ |
3086 | |
3087 | do { |
		/* Handle flushing requests during recovery. */
3089 | ulint n_flushed_lru = 0; |
3090 | ulint n_flushed_list = 0; |
3091 | |
3092 | os_event_wait(recv_sys->flush_start); |
3093 | |
3094 | if (!recv_writer_thread_active) { |
3095 | break; |
3096 | } |
3097 | |
3098 | switch (recv_sys->flush_type) { |
3099 | case BUF_FLUSH_LRU: |
3100 | /* Flush pages from end of LRU if required */ |
3101 | pc_request(0, LSN_MAX); |
3102 | while (pc_flush_slot() > 0) {} |
3103 | pc_wait_finished(&n_flushed_lru, &n_flushed_list); |
3104 | break; |
3105 | |
3106 | case BUF_FLUSH_LIST: |
3107 | /* Flush all pages */ |
3108 | do { |
3109 | pc_request(ULINT_MAX, LSN_MAX); |
3110 | while (pc_flush_slot() > 0) {} |
3111 | } while (!pc_wait_finished(&n_flushed_lru, |
3112 | &n_flushed_list)); |
3113 | break; |
3114 | |
3115 | default: |
3116 | ut_ad(0); |
3117 | } |
3118 | |
3119 | os_event_reset(recv_sys->flush_start); |
3120 | os_event_set(recv_sys->flush_end); |
3121 | } while (recv_writer_thread_active); |
3122 | |
3123 | os_event_wait(buf_flush_event); |
3124 | |
3125 | ulint ret_sleep = 0; |
3126 | ulint n_evicted = 0; |
3127 | ulint n_flushed_last = 0; |
3128 | ulint warn_interval = 1; |
3129 | ulint warn_count = 0; |
3130 | int64_t sig_count = os_event_reset(buf_flush_event); |
3131 | ulint next_loop_time = ut_time_ms() + 1000; |
3132 | ulint n_flushed = 0; |
3133 | ulint last_activity = srv_get_activity_count(); |
3134 | ulint last_pages = 0; |
3135 | |
3136 | while (srv_shutdown_state == SRV_SHUTDOWN_NONE) { |
3137 | ulint curr_time = ut_time_ms(); |
3138 | |
3139 | /* The page_cleaner skips sleep if the server is |
3140 | idle and there are no pending IOs in the buffer pool |
3141 | and there is work to do. */ |
3142 | if (srv_check_activity(last_activity) |
3143 | || buf_get_n_pending_read_ios() |
3144 | || n_flushed == 0) { |
3145 | |
3146 | ret_sleep = pc_sleep_if_needed( |
3147 | next_loop_time, sig_count, curr_time); |
3148 | } else if (curr_time > next_loop_time) { |
3149 | ret_sleep = OS_SYNC_TIME_EXCEEDED; |
3150 | } else { |
3151 | ret_sleep = 0; |
3152 | } |
3153 | |
3154 | if (srv_shutdown_state != SRV_SHUTDOWN_NONE) { |
3155 | break; |
3156 | } |
3157 | |
3158 | sig_count = os_event_reset(buf_flush_event); |
3159 | |
3160 | if (ret_sleep == OS_SYNC_TIME_EXCEEDED) { |
3161 | if (global_system_variables.log_warnings > 2 |
3162 | && curr_time > next_loop_time + 3000 |
3163 | && !(test_flags & TEST_SIGINT)) { |
3164 | if (warn_count == 0) { |
3165 | ib::info() << "page_cleaner: 1000ms" |
3166 | " intended loop took " |
3167 | << 1000 + curr_time |
3168 | - next_loop_time |
3169 | << "ms. The settings might not" |
3170 | " be optimal. (flushed=" |
3171 | << n_flushed_last |
3172 | << " and evicted=" |
3173 | << n_evicted |
3174 | << ", during the time.)" ; |
3175 | if (warn_interval > 300) { |
3176 | warn_interval = 600; |
3177 | } else { |
3178 | warn_interval *= 2; |
3179 | } |
3180 | |
3181 | warn_count = warn_interval; |
3182 | } else { |
3183 | --warn_count; |
3184 | } |
3185 | } else { |
3186 | /* reset counter */ |
3187 | warn_interval = 1; |
3188 | warn_count = 0; |
3189 | } |
3190 | |
3191 | next_loop_time = curr_time + 1000; |
3192 | n_flushed_last = n_evicted = 0; |
3193 | } |
3194 | |
3195 | if (ret_sleep != OS_SYNC_TIME_EXCEEDED |
3196 | && srv_flush_sync |
3197 | && buf_flush_sync_lsn > 0) { |
3198 | /* woke up for flush_sync */ |
3199 | mutex_enter(&page_cleaner.mutex); |
3200 | lsn_t lsn_limit = buf_flush_sync_lsn; |
3201 | buf_flush_sync_lsn = 0; |
3202 | mutex_exit(&page_cleaner.mutex); |
3203 | |
3204 | /* Request flushing for threads */ |
3205 | pc_request(ULINT_MAX, lsn_limit); |
3206 | |
3207 | ulint tm = ut_time_ms(); |
3208 | |
3209 | /* Coordinator also treats requests */ |
3210 | while (pc_flush_slot() > 0) {} |
3211 | |
			/* Only the coordinator is using these counters,
			so there is no need to protect them with a lock. */
3214 | page_cleaner.flush_time += ut_time_ms() - tm; |
3215 | page_cleaner.flush_pass++; |
3216 | |
3217 | /* Wait for all slots to be finished */ |
3218 | ulint n_flushed_lru = 0; |
3219 | ulint n_flushed_list = 0; |
3220 | pc_wait_finished(&n_flushed_lru, &n_flushed_list); |
3221 | |
3222 | if (n_flushed_list > 0 || n_flushed_lru > 0) { |
3223 | buf_flush_stats(n_flushed_list, n_flushed_lru); |
3224 | |
3225 | MONITOR_INC_VALUE_CUMULATIVE( |
3226 | MONITOR_FLUSH_SYNC_TOTAL_PAGE, |
3227 | MONITOR_FLUSH_SYNC_COUNT, |
3228 | MONITOR_FLUSH_SYNC_PAGES, |
3229 | n_flushed_lru + n_flushed_list); |
3230 | } |
3231 | |
3232 | n_flushed = n_flushed_lru + n_flushed_list; |
3233 | |
3234 | } else if (srv_check_activity(last_activity)) { |
3235 | ulint n_to_flush; |
3236 | lsn_t lsn_limit = 0; |
3237 | |
3238 | /* Estimate pages from flush_list to be flushed */ |
3239 | if (ret_sleep == OS_SYNC_TIME_EXCEEDED) { |
3240 | last_activity = srv_get_activity_count(); |
3241 | n_to_flush = |
3242 | page_cleaner_flush_pages_recommendation( |
3243 | &lsn_limit, last_pages); |
3244 | } else { |
3245 | n_to_flush = 0; |
3246 | } |
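/* Outside the once-per-second recomputation the target stays 0,
so pc_request() hands no flush_list quota to the slots and this
pass essentially lets the workers flush only the tails of the
LRU lists. */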
3247 | |
/* Request flushing from the worker threads. */
3249 | pc_request(n_to_flush, lsn_limit); |
3250 | |
3251 | ulint tm = ut_time_ms(); |
3252 | |
/* The coordinator also processes flush requests. */
3254 | while (pc_flush_slot() > 0) { |
3255 | /* No op */ |
3256 | } |
3257 | |
/* Only the coordinator uses these counters,
so there is no need to protect them with a lock. */
3260 | page_cleaner.flush_time += ut_time_ms() - tm; |
page_cleaner.flush_pass++;
3262 | |
3263 | /* Wait for all slots to be finished */ |
3264 | ulint n_flushed_lru = 0; |
3265 | ulint n_flushed_list = 0; |
3266 | |
3267 | pc_wait_finished(&n_flushed_lru, &n_flushed_list); |
3268 | |
3269 | if (n_flushed_list > 0 || n_flushed_lru > 0) { |
3270 | buf_flush_stats(n_flushed_list, n_flushed_lru); |
3271 | } |
3272 | |
3273 | if (ret_sleep == OS_SYNC_TIME_EXCEEDED) { |
3274 | last_pages = n_flushed_list; |
3275 | } |
3276 | |
3277 | n_evicted += n_flushed_lru; |
3278 | n_flushed_last += n_flushed_list; |
3279 | |
3280 | n_flushed = n_flushed_lru + n_flushed_list; |
3281 | |
3282 | if (n_flushed_lru) { |
3283 | MONITOR_INC_VALUE_CUMULATIVE( |
3284 | MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE, |
3285 | MONITOR_LRU_BATCH_FLUSH_COUNT, |
3286 | MONITOR_LRU_BATCH_FLUSH_PAGES, |
3287 | n_flushed_lru); |
3288 | } |
3289 | |
3290 | if (n_flushed_list) { |
3291 | MONITOR_INC_VALUE_CUMULATIVE( |
3292 | MONITOR_FLUSH_ADAPTIVE_TOTAL_PAGE, |
3293 | MONITOR_FLUSH_ADAPTIVE_COUNT, |
3294 | MONITOR_FLUSH_ADAPTIVE_PAGES, |
3295 | n_flushed_list); |
3296 | } |
3297 | |
3298 | } else if (ret_sleep == OS_SYNC_TIME_EXCEEDED) { |
3299 | /* no activity, slept enough */ |
3300 | buf_flush_lists(PCT_IO(100), LSN_MAX, &n_flushed); |
3301 | |
3302 | n_flushed_last += n_flushed; |
3303 | |
3304 | if (n_flushed) { |
3305 | MONITOR_INC_VALUE_CUMULATIVE( |
3306 | MONITOR_FLUSH_BACKGROUND_TOTAL_PAGE, |
3307 | MONITOR_FLUSH_BACKGROUND_COUNT, |
3308 | MONITOR_FLUSH_BACKGROUND_PAGES, |
3309 | n_flushed); |
3310 | |
3311 | } |
3312 | |
3313 | } else { |
3314 | /* no activity, but woken up by event */ |
3315 | n_flushed = 0; |
3316 | } |
3317 | |
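/* Debug hook: lets tests stall the page cleaner
(see buf_flush_page_cleaner_disabled_loop()). */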
3318 | ut_d(buf_flush_page_cleaner_disabled_loop()); |
3319 | } |
3320 | |
3321 | ut_ad(srv_shutdown_state > 0); |
3322 | if (srv_fast_shutdown == 2 |
3323 | || srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { |
/* In a very fast shutdown, or when InnoDB failed to start, we
simulate a crash of the buffer pool. We are not required to do
any flushing. */
3327 | goto thread_exit; |
3328 | } |
3329 | |
/* In case of normal and slow shutdown the page_cleaner thread
must wait for all other activity in the server to die down.
Note that we can start flushing the buffer pool as soon as the
server enters the shutdown phase, but we must stay alive long
enough to ensure that any work done by the master or purge
threads is also flushed.
During shutdown we pass through two stages. In the first stage,
when SRV_SHUTDOWN_CLEANUP is set, other threads such as the
master and the purge threads may still be working. We start
flushing the buffer pool, but we cannot be sure that no new
pages are being dirtied until we enter SRV_SHUTDOWN_FLUSH_PHASE. */
3341 | |
3342 | do { |
3343 | pc_request(ULINT_MAX, LSN_MAX); |
3344 | |
3345 | while (pc_flush_slot() > 0) {} |
3346 | |
3347 | ulint n_flushed_lru = 0; |
3348 | ulint n_flushed_list = 0; |
3349 | pc_wait_finished(&n_flushed_lru, &n_flushed_list); |
3350 | |
3351 | n_flushed = n_flushed_lru + n_flushed_list; |
3352 | |
3353 | /* We sleep only if there are no pages to flush */ |
3354 | if (n_flushed == 0) { |
3355 | os_thread_sleep(100000); |
3356 | } |
3357 | } while (srv_shutdown_state == SRV_SHUTDOWN_CLEANUP); |
3358 | |
3359 | /* At this point all threads including the master and the purge |
3360 | thread must have been suspended. */ |
3361 | ut_a(srv_get_active_thread_type() == SRV_NONE); |
3362 | ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE); |
3363 | |
/* We can now make a final sweep on flushing the buffer pool
and exit after we have cleaned the whole buffer pool.
It is important that we wait for any running batch that has
been triggered by us to finish. Otherwise we could mistake the
end of that batch for the end of our final sweep and come out
of the loop leaving dirty pages behind in the flush_list. */
3371 | buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); |
3372 | buf_flush_wait_LRU_batch_end(); |
3373 | |
3374 | bool success; |
3375 | |
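/* Repeat the sweep until one pass both succeeds in every slot
(pc_wait_finished() returns true) and finds nothing left to flush. */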
3376 | do { |
3377 | pc_request(ULINT_MAX, LSN_MAX); |
3378 | |
3379 | while (pc_flush_slot() > 0) {} |
3380 | |
3381 | ulint n_flushed_lru = 0; |
3382 | ulint n_flushed_list = 0; |
3383 | success = pc_wait_finished(&n_flushed_lru, &n_flushed_list); |
3384 | |
3385 | n_flushed = n_flushed_lru + n_flushed_list; |
3386 | |
3387 | buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); |
3388 | buf_flush_wait_LRU_batch_end(); |
3389 | |
3390 | } while (!success || n_flushed > 0); |
3391 | |
3392 | /* Some sanity checks */ |
3393 | ut_a(srv_get_active_thread_type() == SRV_NONE); |
3394 | ut_a(srv_shutdown_state == SRV_SHUTDOWN_FLUSH_PHASE); |
3395 | |
3396 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
3397 | buf_pool_t* buf_pool = buf_pool_from_array(i); |
3398 | ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == 0); |
3399 | } |
3400 | |
3401 | /* We have lived our life. Time to die. */ |
3402 | |
3403 | thread_exit: |
/* All worker threads are waiting on the event here and no longer
access the page_cleaner structure. Wake the worker threads up just
so that they exit. */
3407 | page_cleaner.is_running = false; |
3408 | |
/* Wait for all worker threads to exit. */
3410 | while (page_cleaner.n_workers) { |
3411 | os_event_set(page_cleaner.is_requested); |
3412 | os_thread_sleep(10000); |
3413 | } |
3414 | |
3415 | mutex_destroy(&page_cleaner.mutex); |
3416 | |
3417 | os_event_destroy(page_cleaner.is_finished); |
3418 | os_event_destroy(page_cleaner.is_requested); |
3419 | os_event_destroy(page_cleaner.is_started); |
3420 | |
3421 | buf_page_cleaner_is_active = false; |
3422 | |
3423 | my_thread_end(); |
/* We count the number of threads in os_thread_exit(). A created
thread should always use that to exit, and not simply return from
the thread function. */
3426 | os_thread_exit(); |
3427 | |
3428 | OS_THREAD_DUMMY_RETURN; |
3429 | } |
3430 | |
3431 | /** Adjust thread count for page cleaner workers. |
3432 | @param[in] new_cnt Number of threads to be used */ |
3433 | void |
3434 | buf_flush_set_page_cleaner_thread_cnt(ulong new_cnt) |
3435 | { |
3436 | mutex_enter(&page_cleaner.mutex); |
3437 | |
3438 | srv_n_page_cleaners = new_cnt; |
3439 | if (new_cnt > page_cleaner.n_workers) { |
3440 | /* User has increased the number of page |
3441 | cleaner threads. */ |
3442 | ulint add = new_cnt - page_cleaner.n_workers; |
3443 | for (ulint i = 0; i < add; i++) { |
3444 | os_thread_id_t cleaner_thread_id; |
3445 | os_thread_create(buf_flush_page_cleaner_worker, NULL, &cleaner_thread_id); |
3446 | } |
3447 | } |
3448 | |
3449 | mutex_exit(&page_cleaner.mutex); |
3450 | |
/* Wait until the requested number of worker threads has started. */
3452 | while (page_cleaner.is_running && |
3453 | page_cleaner.n_workers != (srv_n_page_cleaners - 1)) { |
3454 | os_event_set(page_cleaner.is_requested); |
3455 | os_event_reset(page_cleaner.is_started); |
3456 | os_event_wait_time(page_cleaner.is_started, 1000000); |
3457 | } |
3458 | } |
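/* A minimal usage sketch: this function is meant to be driven by a
runtime change of the page cleaner thread count. The update hook name
below is hypothetical and only illustrates the call; nothing else is
required of the caller.

	static void innodb_page_cleaners_update(ulong new_cnt)
	{
		// Spawns any missing workers and waits until they have
		// started; a decrease is handled lazily by the workers
		// themselves (see buf_flush_page_cleaner_worker()).
		buf_flush_set_page_cleaner_thread_cnt(new_cnt);
	}
*/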
3459 | |
3460 | /******************************************************************//** |
3461 | Worker thread of page_cleaner. |
3462 | @return a dummy parameter */ |
3463 | extern "C" |
3464 | os_thread_ret_t |
3465 | DECLARE_THREAD(buf_flush_page_cleaner_worker)( |
3466 | /*==========================================*/ |
3467 | void* arg MY_ATTRIBUTE((unused))) |
3468 | /*!< in: a dummy parameter required by |
3469 | os_thread_create */ |
3470 | { |
3471 | my_thread_init(); |
3472 | #ifndef DBUG_OFF |
3473 | os_thread_id_t cleaner_thread_id = os_thread_get_curr_id(); |
3474 | #endif |
3475 | |
3476 | mutex_enter(&page_cleaner.mutex); |
3477 | ulint thread_no = page_cleaner.n_workers++; |
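/* thread_no is this worker's ordinal. The coordinator itself counts
as one of the srv_n_page_cleaners threads, so at most
srv_n_page_cleaners - 1 workers are kept; higher-numbered workers
exit when the configured count is reduced (see below). */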
3478 | |
DBUG_LOG("ib_buf", "Thread " << cleaner_thread_id
3480 | << " started; n_workers=" << page_cleaner.n_workers); |
3481 | |
3482 | /* Signal that we have started */ |
3483 | os_event_set(page_cleaner.is_started); |
3484 | mutex_exit(&page_cleaner.mutex); |
3485 | |
3486 | #ifdef UNIV_LINUX |
/* On Linux it is possible to set a different scheduling priority
for each thread, so it is worth trying to give the page cleaner
worker threads a high priority. */
3489 | if (buf_flush_page_cleaner_set_priority( |
3490 | buf_flush_page_cleaner_priority)) { |
3491 | |
3492 | ib::info() << "page_cleaner worker priority: " |
3493 | << buf_flush_page_cleaner_priority; |
3494 | } |
3495 | #endif /* UNIV_LINUX */ |
3496 | |
3497 | while (true) { |
3498 | os_event_wait(page_cleaner.is_requested); |
3499 | |
3500 | ut_d(buf_flush_page_cleaner_disabled_loop()); |
3501 | |
3502 | if (!page_cleaner.is_running) { |
3503 | break; |
3504 | } |
3505 | |
3506 | ut_ad(srv_n_page_cleaners >= 1); |
3507 | |
/* If the number of page cleaner threads has been decreased,
exit the workers that are no longer needed. */
3510 | if (srv_shutdown_state == SRV_SHUTDOWN_NONE && |
3511 | thread_no >= (srv_n_page_cleaners - 1)) { |
DBUG_LOG("ib_buf", "Exiting "
<< thread_no
<< " page cleaner worker thread_id "
<< os_thread_pf(cleaner_thread_id)
<< " total threads " << srv_n_page_cleaners << ".");
3517 | break; |
3518 | } |
3519 | |
3520 | pc_flush_slot(); |
3521 | } |
3522 | |
3523 | mutex_enter(&page_cleaner.mutex); |
3524 | page_cleaner.n_workers--; |
3525 | |
DBUG_LOG("ib_buf", "Thread " << cleaner_thread_id
3527 | << " exiting; n_workers=" << page_cleaner.n_workers); |
3528 | |
3529 | /* Signal that we have stopped */ |
3530 | os_event_set(page_cleaner.is_started); |
3531 | mutex_exit(&page_cleaner.mutex); |
3532 | |
3533 | my_thread_end(); |
3534 | |
3535 | os_thread_exit(); |
3536 | |
3537 | OS_THREAD_DUMMY_RETURN; |
3538 | } |
3539 | |
3540 | /*******************************************************************//** |
3541 | Synchronously flush dirty blocks from the end of the flush list of all buffer |
3542 | pool instances. |
3543 | NOTE: The calling thread is not allowed to own any latches on pages! */ |
3544 | void |
3545 | buf_flush_sync_all_buf_pools(void) |
3546 | /*==============================*/ |
3547 | { |
3548 | bool success; |
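/* buf_flush_lists() reports failure when flushing could not be
started on some buffer pool instance because another flush batch
was already running there; keep retrying, waiting for the running
batches to end, until a complete pass succeeds. */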
3549 | do { |
3550 | success = buf_flush_lists(ULINT_MAX, LSN_MAX, NULL); |
3551 | buf_flush_wait_batch_end(NULL, BUF_FLUSH_LIST); |
3552 | } while (!success); |
3553 | |
3554 | ut_a(success); |
3555 | } |
3556 | |
3557 | /** Request IO burst and wake page_cleaner up. |
3558 | @param[in] lsn_limit upper limit of LSN to be flushed */ |
3559 | void |
3560 | buf_flush_request_force( |
3561 | lsn_t lsn_limit) |
3562 | { |
/* Adjust the target upwards by lsn_avg_rate so that it does not
become stale before the flushing catches up. */
3564 | lsn_t lsn_target = lsn_limit + lsn_avg_rate * 3; |
3565 | |
3566 | mutex_enter(&page_cleaner.mutex); |
3567 | if (lsn_target > buf_flush_sync_lsn) { |
3568 | buf_flush_sync_lsn = lsn_target; |
3569 | } |
3570 | mutex_exit(&page_cleaner.mutex); |
3571 | |
3572 | os_event_set(buf_flush_event); |
3573 | } |
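/* A minimal usage sketch: a caller that needs the flush_list flushed
up to a given LSN requests the burst and then waits for the oldest
modification to pass that LSN. buf_flush_wait_flushed() is assumed to
be the matching wait primitive declared in buf0flu.h:

	buf_flush_request_force(target_lsn);
	buf_flush_wait_flushed(target_lsn);
*/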
3574 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
3575 | |
3576 | /** Functor to validate the flush list. */ |
3577 | struct Check { |
3578 | void operator()(const buf_page_t* elem) |
3579 | { |
3580 | ut_a(elem->in_flush_list); |
3581 | } |
3582 | }; |
3583 | |
3584 | /******************************************************************//** |
3585 | Validates the flush list. |
3586 | @return TRUE if ok */ |
3587 | static |
3588 | ibool |
3589 | buf_flush_validate_low( |
3590 | /*===================*/ |
3591 | buf_pool_t* buf_pool) /*!< in: Buffer pool instance */ |
3592 | { |
3593 | buf_page_t* bpage; |
3594 | const ib_rbt_node_t* rnode = NULL; |
3595 | Check check; |
3596 | |
3597 | ut_ad(buf_flush_list_mutex_own(buf_pool)); |
3598 | |
3599 | ut_list_validate(buf_pool->flush_list, check); |
3600 | |
3601 | bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); |
3602 | |
/* If we are in recovery mode, i.e. flush_rbt != NULL, then
each block in the flush_list must also be present
in the flush_rbt. */
3606 | if (buf_pool->flush_rbt != NULL) { |
3607 | rnode = rbt_first(buf_pool->flush_rbt); |
3608 | } |
3609 | |
3610 | while (bpage != NULL) { |
3611 | const lsn_t om = bpage->oldest_modification; |
3612 | |
3613 | ut_ad(buf_pool_from_bpage(bpage) == buf_pool); |
3614 | |
3615 | ut_ad(bpage->in_flush_list); |
3616 | |
3617 | /* A page in buf_pool->flush_list can be in |
3618 | BUF_BLOCK_REMOVE_HASH state. This happens when a page |
3619 | is in the middle of being relocated. In that case the |
3620 | original descriptor can have this state and still be |
3621 | in the flush list waiting to acquire the |
3622 | buf_pool->flush_list_mutex to complete the relocation. */ |
3623 | ut_a(buf_page_in_file(bpage) |
3624 | || buf_page_get_state(bpage) == BUF_BLOCK_REMOVE_HASH); |
3625 | ut_a(om > 0); |
3626 | |
3627 | if (buf_pool->flush_rbt != NULL) { |
3628 | buf_page_t** prpage; |
3629 | |
3630 | ut_a(rnode != NULL); |
3631 | prpage = rbt_value(buf_page_t*, rnode); |
3632 | |
3633 | ut_a(*prpage != NULL); |
3634 | ut_a(*prpage == bpage); |
3635 | rnode = rbt_next(buf_pool->flush_rbt, rnode); |
3636 | } |
3637 | |
3638 | bpage = UT_LIST_GET_NEXT(list, bpage); |
3639 | |
3640 | ut_a(bpage == NULL || om >= bpage->oldest_modification); |
3641 | } |
3642 | |
3643 | /* By this time we must have exhausted the traversal of |
3644 | flush_rbt (if active) as well. */ |
3645 | ut_a(rnode == NULL); |
3646 | |
3647 | return(TRUE); |
3648 | } |
3649 | |
3650 | /******************************************************************//** |
3651 | Validates the flush list. |
3652 | @return TRUE if ok */ |
3653 | ibool |
3654 | buf_flush_validate( |
3655 | /*===============*/ |
3656 | buf_pool_t* buf_pool) /*!< buffer pool instance */ |
3657 | { |
3658 | ibool ret; |
3659 | |
3660 | buf_flush_list_mutex_enter(buf_pool); |
3661 | |
3662 | ret = buf_flush_validate_low(buf_pool); |
3663 | |
3664 | buf_flush_list_mutex_exit(buf_pool); |
3665 | |
3666 | return(ret); |
3667 | } |
3668 | |
3669 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
3670 | |
3671 | /******************************************************************//** |
3672 | Check if there are any dirty pages that belong to a space id in the flush |
3673 | list in a particular buffer pool. |
3674 | @return number of dirty pages present in a single buffer pool */ |
3675 | ulint |
3676 | buf_pool_get_dirty_pages_count( |
3677 | /*===========================*/ |
3678 | buf_pool_t* buf_pool, /*!< in: buffer pool */ |
3679 | ulint id, /*!< in: space id to check */ |
FlushObserver* observer) /*!< in: flush observer to check */
{
3683 | ulint count = 0; |
3684 | |
3685 | buf_pool_mutex_enter(buf_pool); |
3686 | buf_flush_list_mutex_enter(buf_pool); |
3687 | |
3688 | buf_page_t* bpage; |
3689 | |
3690 | for (bpage = UT_LIST_GET_FIRST(buf_pool->flush_list); |
3691 | bpage != 0; |
3692 | bpage = UT_LIST_GET_NEXT(list, bpage)) { |
3693 | |
3694 | ut_ad(buf_page_in_file(bpage)); |
3695 | ut_ad(bpage->in_flush_list); |
3696 | ut_ad(bpage->oldest_modification > 0); |
3697 | |
3698 | if ((observer != NULL |
3699 | && observer == bpage->flush_observer) |
3700 | || (observer == NULL |
3701 | && id == bpage->id.space())) { |
3702 | ++count; |
3703 | } |
3704 | } |
3705 | |
3706 | buf_flush_list_mutex_exit(buf_pool); |
3707 | buf_pool_mutex_exit(buf_pool); |
3708 | |
3709 | return(count); |
3710 | } |
3711 | |
3712 | /******************************************************************//** |
3713 | Check if there are any dirty pages that belong to a space id in the flush list. |
3714 | @return number of dirty pages present in all the buffer pools */ |
3715 | static |
3716 | ulint |
3717 | buf_flush_get_dirty_pages_count( |
3718 | /*============================*/ |
3719 | ulint id, /*!< in: space id to check */ |
3720 | FlushObserver* observer) /*!< in: flush observer to check */ |
3721 | { |
3722 | ulint count = 0; |
3723 | |
3724 | for (ulint i = 0; i < srv_buf_pool_instances; ++i) { |
3725 | buf_pool_t* buf_pool; |
3726 | |
3727 | buf_pool = buf_pool_from_array(i); |
3728 | |
3729 | count += buf_pool_get_dirty_pages_count(buf_pool, id, observer); |
3730 | } |
3731 | |
3732 | return(count); |
3733 | } |
3734 | |
3735 | /** FlushObserver constructor |
3736 | @param[in] space tablespace |
3737 | @param[in] trx trx instance |
3738 | @param[in] stage performance schema accounting object, |
3739 | used by ALTER TABLE. It is passed to log_preflush_pool_modified_pages() |
3740 | for accounting. */ |
3741 | FlushObserver::FlushObserver( |
3742 | fil_space_t* space, |
3743 | trx_t* trx, |
3744 | ut_stage_alter_t* stage) |
3745 | : |
3746 | m_space(space), |
3747 | m_trx(trx), |
3748 | m_stage(stage), |
3749 | m_interrupted(false) |
3750 | { |
3751 | m_flushed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances)); |
3752 | m_removed = UT_NEW_NOKEY(std::vector<ulint>(srv_buf_pool_instances)); |
3753 | |
3754 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
3755 | m_flushed->at(i) = 0; |
3756 | m_removed->at(i) = 0; |
3757 | } |
3758 | |
DBUG_LOG("flush", "FlushObserver(): trx->id=" << m_trx->id);
3760 | } |
3761 | |
/** FlushObserver destructor */
3763 | FlushObserver::~FlushObserver() |
3764 | { |
3765 | ut_ad(buf_flush_get_dirty_pages_count(m_space->id, this) == 0); |
3766 | |
3767 | UT_DELETE(m_flushed); |
3768 | UT_DELETE(m_removed); |
3769 | |
DBUG_LOG("flush", "~FlushObserver(): trx->id=" << m_trx->id);
3771 | } |
3772 | |
3773 | /** Check whether trx is interrupted |
3774 | @return true if trx is interrupted */ |
3775 | bool |
3776 | FlushObserver::check_interrupted() |
3777 | { |
3778 | if (trx_is_interrupted(m_trx)) { |
3779 | interrupted(); |
3780 | |
3781 | return(true); |
3782 | } |
3783 | |
3784 | return(false); |
3785 | } |
3786 | |
3787 | /** Notify observer of a flush |
3788 | @param[in] buf_pool buffer pool instance |
3789 | @param[in] bpage buffer page to flush */ |
3790 | void |
3791 | FlushObserver::notify_flush( |
3792 | buf_pool_t* buf_pool, |
3793 | buf_page_t* bpage) |
3794 | { |
3795 | ut_ad(buf_pool_mutex_own(buf_pool)); |
3796 | |
3797 | m_flushed->at(buf_pool->instance_no)++; |
3798 | |
3799 | if (m_stage != NULL) { |
3800 | m_stage->inc(); |
3801 | } |
3802 | |
DBUG_LOG("flush", "Flush " << bpage->id);
3804 | } |
3805 | |
3806 | /** Notify observer of a remove |
3807 | @param[in] buf_pool buffer pool instance |
@param[in]	bpage	buffer page being removed
3809 | void |
3810 | FlushObserver::notify_remove( |
3811 | buf_pool_t* buf_pool, |
3812 | buf_page_t* bpage) |
3813 | { |
3814 | ut_ad(buf_pool_mutex_own(buf_pool)); |
3815 | |
3816 | m_removed->at(buf_pool->instance_no)++; |
3817 | |
DBUG_LOG("flush", "Remove " << bpage->id);
3819 | } |
3820 | |
3821 | /** Flush dirty pages and wait. */ |
3822 | void |
3823 | FlushObserver::flush() |
3824 | { |
3825 | ut_ad(m_trx); |
3826 | |
3827 | if (!m_interrupted && m_stage) { |
3828 | m_stage->begin_phase_flush(buf_flush_get_dirty_pages_count( |
3829 | m_space->id, this)); |
3830 | } |
3831 | |
3832 | buf_LRU_flush_or_remove_pages(m_space->id, this); |
3833 | |
/* Wait until all dirty pages have been flushed. */
3835 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
3836 | while (!is_complete(i)) { |
3837 | |
3838 | os_thread_sleep(2000); |
3839 | } |
3840 | } |
3841 | } |
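/* A minimal FlushObserver lifecycle sketch. Only the members defined
in this file are certain; attaching the observer to mini-transactions
(e.g. via mtr_t::set_flush_observer()) is an assumption about the
caller's side, such as an ALTER TABLE bulk load:

	FlushObserver*	observer = UT_NEW_NOKEY(
		FlushObserver(space, trx, stage));
	// ... attach observer to the mini-transactions that dirty
	// pages of this tablespace ...
	observer->flush();	// flush and wait for the observed pages
	UT_DELETE(observer);
*/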
3842 | |