1/*****************************************************************************
2
3Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2017, 2018, MariaDB Corporation.
5
6This program is free software; you can redistribute it and/or modify it under
7the terms of the GNU General Public License as published by the Free Software
8Foundation; version 2 of the License.
9
10This program is distributed in the hope that it will be useful, but WITHOUT
11ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14You should have received a copy of the GNU General Public License along with
15this program; if not, write to the Free Software Foundation, Inc.,
1651 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
17
18*****************************************************************************/
19
20/**************************************************//**
21@file buf/buf0lru.cc
22The database buffer replacement algorithm
23
24Created 11/5/1995 Heikki Tuuri
25*******************************************************/
26
27#include "buf0lru.h"
28#include "ut0byte.h"
29#include "ut0rnd.h"
30#include "sync0rw.h"
31#include "hash0hash.h"
32#include "os0event.h"
33#include "fil0fil.h"
34#include "btr0btr.h"
35#include "buf0buddy.h"
36#include "buf0buf.h"
37#include "buf0dblwr.h"
38#include "buf0flu.h"
39#include "buf0rea.h"
40#include "btr0sea.h"
41#include "ibuf0ibuf.h"
42#include "os0file.h"
43#include "page0zip.h"
44#include "log0recv.h"
45#include "srv0srv.h"
46#include "srv0mon.h"
47#include "lock0lock.h"
48
49/** The number of blocks from the LRU_old pointer onward, including
50the block pointed to, must be buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV
51of the whole LRU list length, except that the tolerance defined below
52is allowed. Note that the tolerance must be small enough such that for
53even the BUF_LRU_OLD_MIN_LEN long LRU list, the LRU_old pointer is not
54allowed to point to either end of the LRU list. */
55
56static const ulint BUF_LRU_OLD_TOLERANCE = 20;
57
58/** The minimum amount of non-old blocks when the LRU_old list exists
59(that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks).
60@see buf_LRU_old_adjust_len */
61#define BUF_LRU_NON_OLD_MIN_LEN 5
62
63/** When dropping the search hash index entries before deleting an ibd
64file, we build a local array of pages belonging to that tablespace
65in the buffer pool. Following is the size of that array.
66We also release buf_pool->mutex after scanning this many pages of the
67flush_list when dropping a table. This is to ensure that other threads
68are not blocked for extended period of time when using very large
69buffer pools. */
70static const ulint BUF_LRU_DROP_SEARCH_SIZE = 1024;
71
72/** We scan these many blocks when looking for a clean page to evict
73during LRU eviction. */
74static const ulint BUF_LRU_SEARCH_SCAN_THRESHOLD = 100;
75
76/** If we switch on the InnoDB monitor because there are too few available
77frames in the buffer pool, we set this to TRUE */
78static bool buf_lru_switched_on_innodb_mon = false;
79
80/** True if diagnostic message about difficult to find free blocks
81in the buffer bool has already printed. */
82static bool buf_lru_free_blocks_error_printed;
83
84/******************************************************************//**
85These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O
86and page_zip_decompress() operations. Based on the statistics,
87buf_LRU_evict_from_unzip_LRU() decides if we want to evict from
88unzip_LRU or the regular LRU. From unzip_LRU, we will only evict the
89uncompressed frame (meaning we can evict dirty blocks as well). From
90the regular LRU, we will evict the entire block (i.e.: both the
91uncompressed and compressed data), which must be clean. */
92
93/* @{ */
94
95/** Number of intervals for which we keep the history of these stats.
96Each interval is 1 second, defined by the rate at which
97srv_error_monitor_thread() calls buf_LRU_stat_update(). */
98static const ulint BUF_LRU_STAT_N_INTERVAL = 50;
99
100/** Co-efficient with which we multiply I/O operations to equate them
101with page_zip_decompress() operations. */
102static const ulint BUF_LRU_IO_TO_UNZIP_FACTOR = 50;
103
104/** Sampled values buf_LRU_stat_cur.
105Not protected by any mutex. Updated by buf_LRU_stat_update(). */
106static buf_LRU_stat_t buf_LRU_stat_arr[BUF_LRU_STAT_N_INTERVAL];
107
108/** Cursor to buf_LRU_stat_arr[] that is updated in a round-robin fashion. */
109static ulint buf_LRU_stat_arr_ind;
110
111/** Current operation counters. Not protected by any mutex. Cleared
112by buf_LRU_stat_update(). */
113buf_LRU_stat_t buf_LRU_stat_cur;
114
115/** Running sum of past values of buf_LRU_stat_cur.
116Updated by buf_LRU_stat_update(). Not Protected by any mutex. */
117buf_LRU_stat_t buf_LRU_stat_sum;
118
119/* @} */
120
121/** @name Heuristics for detecting index scan @{ */
122/** Move blocks to "new" LRU list only if the first access was at
123least this many milliseconds ago. Not protected by any mutex or latch. */
124uint buf_LRU_old_threshold_ms;
125/* @} */
126
127/******************************************************************//**
128Takes a block out of the LRU list and page hash table.
129If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
130the object will be freed.
131
132The caller must hold buf_pool->mutex, the buf_page_get_mutex() mutex
133and the appropriate hash_lock. This function will release the
134buf_page_get_mutex() and the hash_lock.
135
136If a compressed page is freed other compressed pages may be relocated.
137@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
138caller needs to free the page to the free list
139@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
140this case the block is already returned to the buddy allocator. */
141static MY_ATTRIBUTE((warn_unused_result))
142bool
143buf_LRU_block_remove_hashed(
144/*========================*/
145 buf_page_t* bpage, /*!< in: block, must contain a file page and
146 be in a state where it can be freed; there
147 may or may not be a hash index to the page */
148 bool zip); /*!< in: true if should remove also the
149 compressed page of an uncompressed page */
150/******************************************************************//**
151Puts a file page whose has no hash index to the free list. */
152static
153void
154buf_LRU_block_free_hashed_page(
155/*===========================*/
156 buf_block_t* block); /*!< in: block, must contain a file page and
157 be in a state where it can be freed */
158
159/******************************************************************//**
160Increases LRU size in bytes with page size inline function */
161static inline
162void
163incr_LRU_size_in_bytes(
164/*===================*/
165 buf_page_t* bpage, /*!< in: control block */
166 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
167{
168 ut_ad(buf_pool_mutex_own(buf_pool));
169
170 buf_pool->stat.LRU_bytes += bpage->size.physical();
171
172 ut_ad(buf_pool->stat.LRU_bytes <= buf_pool->curr_pool_size);
173}
174
175/******************************************************************//**
176Determines if the unzip_LRU list should be used for evicting a victim
177instead of the general LRU list.
178@return TRUE if should use unzip_LRU */
179ibool
180buf_LRU_evict_from_unzip_LRU(
181/*=========================*/
182 buf_pool_t* buf_pool)
183{
184 ut_ad(buf_pool_mutex_own(buf_pool));
185
186 /* If the unzip_LRU list is empty, we can only use the LRU. */
187 if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0) {
188 return(FALSE);
189 }
190
191 /* If unzip_LRU is at most 10% of the size of the LRU list,
192 then use the LRU. This slack allows us to keep hot
193 decompressed pages in the buffer pool. */
194 if (UT_LIST_GET_LEN(buf_pool->unzip_LRU)
195 <= UT_LIST_GET_LEN(buf_pool->LRU) / 10) {
196 return(FALSE);
197 }
198
199 /* If eviction hasn't started yet, we assume by default
200 that a workload is disk bound. */
201 if (buf_pool->freed_page_clock == 0) {
202 return(TRUE);
203 }
204
205 /* Calculate the average over past intervals, and add the values
206 of the current interval. */
207 ulint io_avg = buf_LRU_stat_sum.io / BUF_LRU_STAT_N_INTERVAL
208 + buf_LRU_stat_cur.io;
209
210 ulint unzip_avg = buf_LRU_stat_sum.unzip / BUF_LRU_STAT_N_INTERVAL
211 + buf_LRU_stat_cur.unzip;
212
213 /* Decide based on our formula. If the load is I/O bound
214 (unzip_avg is smaller than the weighted io_avg), evict an
215 uncompressed frame from unzip_LRU. Otherwise we assume that
216 the load is CPU bound and evict from the regular LRU. */
217 return(unzip_avg <= io_avg * BUF_LRU_IO_TO_UNZIP_FACTOR);
218}
219
220#ifdef BTR_CUR_HASH_ADAPT
221/** Attempts to drop page hash index on a batch of pages belonging to a
222particular space id.
223@param[in] space_id space id
224@param[in] arr array of page_no
225@param[in] count number of entries in array */
226static
227void
228buf_LRU_drop_page_hash_batch(
229 ulint space_id,
230 const ulint* arr,
231 ulint count)
232{
233 ut_ad(count <= BUF_LRU_DROP_SEARCH_SIZE);
234
235 for (ulint i = 0; i < count; ++i, ++arr) {
236 /* While our only caller
237 buf_LRU_drop_page_hash_for_tablespace()
238 is being executed for DROP TABLE or similar,
239 the table cannot be evicted from the buffer pool.
240 Note: this should not be executed for DROP TABLESPACE,
241 because DROP TABLESPACE would be refused if tables existed
242 in the tablespace, and a previous DROP TABLE would have
243 already removed the AHI entries. */
244 btr_search_drop_page_hash_when_freed(
245 page_id_t(space_id, *arr));
246 }
247}
248
249/******************************************************************//**
250When doing a DROP TABLE/DISCARD TABLESPACE we have to drop all page
251hash index entries belonging to that table. This function tries to
252do that in batch. Note that this is a 'best effort' attempt and does
253not guarantee that ALL hash entries will be removed. */
254static
255void
256buf_LRU_drop_page_hash_for_tablespace(
257/*==================================*/
258 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
259 ulint id) /*!< in: space id */
260{
261 ulint* page_arr = static_cast<ulint*>(ut_malloc_nokey(
262 sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE));
263
264 ulint num_entries = 0;
265
266 buf_pool_mutex_enter(buf_pool);
267
268scan_again:
269 for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->LRU);
270 bpage != NULL;
271 /* No op */) {
272
273 buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
274
275 ut_a(buf_page_in_file(bpage));
276
277 if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE
278 || bpage->id.space() != id
279 || bpage->io_fix != BUF_IO_NONE) {
280 /* Compressed pages are never hashed.
281 Skip blocks of other tablespaces.
282 Skip I/O-fixed blocks (to be dealt with later). */
283next_page:
284 bpage = prev_bpage;
285 continue;
286 }
287
288 buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
289
290 mutex_enter(&block->mutex);
291
292 /* This debug check uses a dirty read that could
293 theoretically cause false positives while
294 buf_pool_clear_hash_index() is executing.
295 (Other conflicting access paths to the adaptive hash
296 index should not be possible, because when a
297 tablespace is being discarded or dropped, there must
298 be no concurrect access to the contained tables.) */
299 assert_block_ahi_valid(block);
300
301 bool skip = bpage->buf_fix_count > 0 || !block->index;
302
303 mutex_exit(&block->mutex);
304
305 if (skip) {
306 /* Skip this block, because there are
307 no adaptive hash index entries
308 pointing to it, or because we cannot
309 drop them due to the buffer-fix. */
310 goto next_page;
311 }
312
313 /* Store the page number so that we can drop the hash
314 index in a batch later. */
315 page_arr[num_entries] = bpage->id.page_no();
316 ut_a(num_entries < BUF_LRU_DROP_SEARCH_SIZE);
317 ++num_entries;
318
319 if (num_entries < BUF_LRU_DROP_SEARCH_SIZE) {
320 goto next_page;
321 }
322
323 /* Array full. We release the buf_pool->mutex to obey
324 the latching order. */
325 buf_pool_mutex_exit(buf_pool);
326
327 buf_LRU_drop_page_hash_batch(id, page_arr, num_entries);
328
329 num_entries = 0;
330
331 buf_pool_mutex_enter(buf_pool);
332
333 /* Note that we released the buf_pool mutex above
334 after reading the prev_bpage during processing of a
335 page_hash_batch (i.e.: when the array was full).
336 Because prev_bpage could belong to a compressed-only
337 block, it may have been relocated, and thus the
338 pointer cannot be trusted. Because bpage is of type
339 buf_block_t, it is safe to dereference.
340
341 bpage can change in the LRU list. This is OK because
342 this function is a 'best effort' to drop as many
343 search hash entries as possible and it does not
344 guarantee that ALL such entries will be dropped. */
345
346 /* If, however, bpage has been removed from LRU list
347 to the free list then we should restart the scan.
348 bpage->state is protected by buf_pool mutex. */
349 if (bpage != NULL
350 && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
351
352 goto scan_again;
353 }
354 }
355
356 buf_pool_mutex_exit(buf_pool);
357
358 /* Drop any remaining batch of search hashed pages. */
359 buf_LRU_drop_page_hash_batch(id, page_arr, num_entries);
360 ut_free(page_arr);
361}
362#endif /* BTR_CUR_HASH_ADAPT */
363
364/******************************************************************//**
365While flushing (or removing dirty) pages from a tablespace we don't
366want to hog the CPU and resources. Release the buffer pool and block
367mutex and try to force a context switch. Then reacquire the same mutexes.
368The current page is "fixed" before the release of the mutexes and then
369"unfixed" again once we have reacquired the mutexes. */
370static
371void
372buf_flush_yield(
373/*============*/
374 buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
375 buf_page_t* bpage) /*!< in/out: current page */
376{
377 BPageMutex* block_mutex;
378
379 ut_ad(buf_pool_mutex_own(buf_pool));
380 ut_ad(buf_page_in_file(bpage));
381
382 block_mutex = buf_page_get_mutex(bpage);
383
384 mutex_enter(block_mutex);
385
386 /* "Fix" the block so that the position cannot be
387 changed after we release the buffer pool and
388 block mutexes. */
389 buf_page_set_sticky(bpage);
390
391 /* Now it is safe to release the buf_pool->mutex. */
392 buf_pool_mutex_exit(buf_pool);
393
394 mutex_exit(block_mutex);
395 /* Try and force a context switch. */
396 os_thread_yield();
397
398 buf_pool_mutex_enter(buf_pool);
399
400 mutex_enter(block_mutex);
401
402 /* "Unfix" the block now that we have both the
403 buffer pool and block mutex again. */
404 buf_page_unset_sticky(bpage);
405 mutex_exit(block_mutex);
406}
407
408/******************************************************************//**
409If we have hogged the resources for too long then release the buffer
410pool and flush list mutex and do a thread yield. Set the current page
411to "sticky" so that it is not relocated during the yield.
412@return true if yielded */
413static MY_ATTRIBUTE((warn_unused_result))
414bool
415buf_flush_try_yield(
416/*================*/
417 buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
418 buf_page_t* bpage, /*!< in/out: bpage to remove */
419 ulint processed) /*!< in: number of pages processed */
420{
421 /* Every BUF_LRU_DROP_SEARCH_SIZE iterations in the
422 loop we release buf_pool->mutex to let other threads
423 do their job but only if the block is not IO fixed. This
424 ensures that the block stays in its position in the
425 flush_list. */
426
427 if (bpage != NULL
428 && processed >= BUF_LRU_DROP_SEARCH_SIZE
429 && buf_page_get_io_fix(bpage) == BUF_IO_NONE) {
430
431 buf_flush_list_mutex_exit(buf_pool);
432
433 /* Release the buffer pool and block mutex
434 to give the other threads a go. */
435
436 buf_flush_yield(buf_pool, bpage);
437
438 buf_flush_list_mutex_enter(buf_pool);
439
440 /* Should not have been removed from the flush
441 list during the yield. However, this check is
442 not sufficient to catch a remove -> add. */
443
444 ut_ad(bpage->in_flush_list);
445
446 return(true);
447 }
448
449 return(false);
450}
451
452/******************************************************************//**
453Removes a single page from a given tablespace inside a specific
454buffer pool instance.
455@return true if page was removed. */
456static MY_ATTRIBUTE((warn_unused_result))
457bool
458buf_flush_or_remove_page(
459/*=====================*/
460 buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */
461 buf_page_t* bpage, /*!< in/out: bpage to remove */
462 bool flush) /*!< in: flush to disk if true but
463 don't remove else remove without
464 flushing to disk */
465{
466 ut_ad(buf_pool_mutex_own(buf_pool));
467 ut_ad(buf_flush_list_mutex_own(buf_pool));
468
469 /* bpage->space and bpage->io_fix are protected by
470 buf_pool->mutex and block_mutex. It is safe to check
471 them while holding buf_pool->mutex only. */
472
473 if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
474
475 /* We cannot remove this page during this scan
476 yet; maybe the system is currently reading it
477 in, or flushing the modifications to the file */
478 return(false);
479
480 }
481
482 BPageMutex* block_mutex;
483 bool processed = false;
484
485 block_mutex = buf_page_get_mutex(bpage);
486
487 /* We have to release the flush_list_mutex to obey the
488 latching order. We are however guaranteed that the page
489 will stay in the flush_list and won't be relocated because
490 buf_flush_remove() and buf_flush_relocate_on_flush_list()
491 need buf_pool->mutex as well. */
492
493 buf_flush_list_mutex_exit(buf_pool);
494
495 mutex_enter(block_mutex);
496
497 ut_ad(bpage->oldest_modification != 0);
498
499 if (!flush) {
500
501 buf_flush_remove(bpage);
502
503 mutex_exit(block_mutex);
504
505 processed = true;
506
507 } else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)) {
508
509 /* The following call will release the buffer pool
510 and block mutex. */
511 processed = buf_flush_page(
512 buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, false);
513
514 if (processed) {
515 /* Wake possible simulated aio thread to actually
516 post the writes to the operating system */
517 os_aio_simulated_wake_handler_threads();
518 buf_pool_mutex_enter(buf_pool);
519 } else {
520 mutex_exit(block_mutex);
521 }
522 } else {
523 mutex_exit(block_mutex);
524 }
525
526 buf_flush_list_mutex_enter(buf_pool);
527
528 ut_ad(!mutex_own(block_mutex));
529 ut_ad(buf_pool_mutex_own(buf_pool));
530
531 return(processed);
532}
533
534/** Remove all dirty pages belonging to a given tablespace inside a specific
535buffer pool instance when we are deleting the data file(s) of that
536tablespace. The pages still remain a part of LRU and are evicted from
537the list as they age towards the tail of the LRU.
538@param[in,out] buf_pool buffer pool
539@param[in] id tablespace identifier
540@param[in] observer flush observer (to check for interrupt),
541 or NULL if the files should not be written to
542@return whether all dirty pages were freed */
543static MY_ATTRIBUTE((warn_unused_result))
544bool
545buf_flush_or_remove_pages(
546 buf_pool_t* buf_pool,
547 ulint id,
548 FlushObserver* observer)
549{
550 buf_page_t* prev;
551 buf_page_t* bpage;
552 ulint processed = 0;
553
554 buf_flush_list_mutex_enter(buf_pool);
555
556rescan:
557 bool all_freed = true;
558
559 for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
560 bpage != NULL;
561 bpage = prev) {
562
563 ut_a(buf_page_in_file(bpage));
564
565 /* Save the previous link because once we free the
566 page we can't rely on the links. */
567
568 prev = UT_LIST_GET_PREV(list, bpage);
569
570 /* Flush the pages matching space id,
571 or pages matching the flush observer. */
572 if (observer && observer->is_partial_flush()) {
573 if (observer != bpage->flush_observer) {
574 /* Skip this block. */
575 } else if (!buf_flush_or_remove_page(
576 buf_pool, bpage,
577 !observer->is_interrupted())) {
578 all_freed = false;
579 } else if (!observer->is_interrupted()) {
580 /* The processing was successful. And during the
581 processing we have released the buf_pool mutex
582 when calling buf_page_flush(). We cannot trust
583 prev pointer. */
584 goto rescan;
585 }
586 } else if (id != bpage->id.space()) {
587 /* Skip this block, because it is for a
588 different tablespace. */
589 } else if (!buf_flush_or_remove_page(
590 buf_pool, bpage, observer != NULL)) {
591
592 /* Remove was unsuccessful, we have to try again
593 by scanning the entire list from the end.
594 This also means that we never released the
595 buf_pool mutex. Therefore we can trust the prev
596 pointer.
597 buf_flush_or_remove_page() released the
598 flush list mutex but not the buf_pool mutex.
599 Therefore it is possible that a new page was
600 added to the flush list. For example, in case
601 where we are at the head of the flush list and
602 prev == NULL. That is OK because we have the
603 tablespace quiesced and no new pages for this
604 space-id should enter flush_list. This is
605 because the only callers of this function are
606 DROP TABLE and FLUSH TABLE FOR EXPORT.
607 We know that we'll have to do at least one more
608 scan but we don't break out of loop here and
609 try to do as much work as we can in this
610 iteration. */
611
612 all_freed = false;
613 } else if (observer) {
614
615 /* The processing was successful. And during the
616 processing we have released the buf_pool mutex
617 when calling buf_page_flush(). We cannot trust
618 prev pointer. */
619 goto rescan;
620 }
621
622 ++processed;
623
624 /* Yield if we have hogged the CPU and mutexes for too long. */
625 if (buf_flush_try_yield(buf_pool, prev, processed)) {
626
627 /* Reset the batch size counter if we had to yield. */
628
629 processed = 0;
630 }
631
632 /* The check for trx is interrupted is expensive, we want
633 to check every N iterations. */
634 if (!processed && observer) {
635 observer->check_interrupted();
636 }
637 }
638
639 buf_flush_list_mutex_exit(buf_pool);
640
641 return(all_freed);
642}
643
644/** Remove or flush all the dirty pages that belong to a given tablespace
645inside a specific buffer pool instance. The pages will remain in the LRU
646list and will be evicted from the LRU list as they age and move towards
647the tail of the LRU list.
648@param[in,out] buf_pool buffer pool
649@param[in] id tablespace identifier
650@param[in] observer flush observer,
651 or NULL if the files should not be written to
652*/
653static
654void
655buf_flush_dirty_pages(
656 buf_pool_t* buf_pool,
657 ulint id,
658 FlushObserver* observer)
659{
660 for (;;) {
661 buf_pool_mutex_enter(buf_pool);
662
663 bool freed = buf_flush_or_remove_pages(buf_pool, id, observer);
664
665 buf_pool_mutex_exit(buf_pool);
666
667 ut_ad(buf_flush_validate(buf_pool));
668
669 if (freed) {
670 break;
671 }
672
673 os_thread_sleep(2000);
674 ut_ad(buf_flush_validate(buf_pool));
675 }
676
677 ut_ad((observer && observer->is_interrupted())
678 || buf_pool_get_dirty_pages_count(buf_pool, id, observer) == 0);
679}
680
681/** Empty the flush list for all pages belonging to a tablespace.
682@param[in] id tablespace identifier
683@param[in] observer flush observer,
684 or NULL if nothing is to be written */
685void
686buf_LRU_flush_or_remove_pages(
687 ulint id,
688 FlushObserver* observer
689#ifdef BTR_CUR_HASH_ADAPT
690 , bool drop_ahi /*!< whether to drop the adaptive hash index */
691#endif /* BTR_CUR_HASH_ADAPT */
692 )
693{
694 /* Pages in the system tablespace must never be discarded. */
695 ut_ad(id || observer);
696
697 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
698 buf_pool_t* buf_pool = buf_pool_from_array(i);
699#ifdef BTR_CUR_HASH_ADAPT
700 if (drop_ahi) {
701 buf_LRU_drop_page_hash_for_tablespace(buf_pool, id);
702 }
703#endif /* BTR_CUR_HASH_ADAPT */
704 buf_flush_dirty_pages(buf_pool, id, observer);
705 }
706
707 if (observer && !observer->is_interrupted()) {
708 /* Ensure that all asynchronous IO is completed. */
709 os_aio_wait_until_no_pending_writes();
710 fil_flush(id);
711 }
712}
713
714#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
715/********************************************************************//**
716Insert a compressed block into buf_pool->zip_clean in the LRU order. */
717void
718buf_LRU_insert_zip_clean(
719/*=====================*/
720 buf_page_t* bpage) /*!< in: pointer to the block in question */
721{
722 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
723
724 ut_ad(buf_pool_mutex_own(buf_pool));
725 ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
726
727 /* Find the first successor of bpage in the LRU list
728 that is in the zip_clean list. */
729 buf_page_t* b = bpage;
730
731 do {
732 b = UT_LIST_GET_NEXT(LRU, b);
733 } while (b && buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE);
734
735 /* Insert bpage before b, i.e., after the predecessor of b. */
736 if (b != NULL) {
737 b = UT_LIST_GET_PREV(list, b);
738 }
739
740 if (b != NULL) {
741 UT_LIST_INSERT_AFTER(buf_pool->zip_clean, b, bpage);
742 } else {
743 UT_LIST_ADD_FIRST(buf_pool->zip_clean, bpage);
744 }
745}
746#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
747
748/******************************************************************//**
749Try to free an uncompressed page of a compressed block from the unzip
750LRU list. The compressed page is preserved, and it need not be clean.
751@return true if freed */
752static
753bool
754buf_LRU_free_from_unzip_LRU_list(
755/*=============================*/
756 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
757 bool scan_all) /*!< in: scan whole LRU list
758 if true, otherwise scan only
759 srv_LRU_scan_depth / 2 blocks. */
760{
761 ut_ad(buf_pool_mutex_own(buf_pool));
762
763 if (!buf_LRU_evict_from_unzip_LRU(buf_pool)) {
764 return(false);
765 }
766
767 ulint scanned = 0;
768 bool freed = false;
769
770 for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool->unzip_LRU);
771 block != NULL
772 && !freed
773 && (scan_all || scanned < srv_LRU_scan_depth);
774 ++scanned) {
775
776 buf_block_t* prev_block;
777
778 prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
779
780 ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
781 ut_ad(block->in_unzip_LRU_list);
782 ut_ad(block->page.in_LRU_list);
783
784 freed = buf_LRU_free_page(&block->page, false);
785
786 block = prev_block;
787 }
788
789 if (scanned) {
790 MONITOR_INC_VALUE_CUMULATIVE(
791 MONITOR_LRU_UNZIP_SEARCH_SCANNED,
792 MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL,
793 MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL,
794 scanned);
795 }
796
797 return(freed);
798}
799
800/******************************************************************//**
801Try to free a clean page from the common LRU list.
802@return true if freed */
803static
804bool
805buf_LRU_free_from_common_LRU_list(
806/*==============================*/
807 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
808 bool scan_all) /*!< in: scan whole LRU list
809 if true, otherwise scan only
810 up to BUF_LRU_SEARCH_SCAN_THRESHOLD */
811{
812 ut_ad(buf_pool_mutex_own(buf_pool));
813
814 ulint scanned = 0;
815 bool freed = false;
816
817 for (buf_page_t* bpage = buf_pool->lru_scan_itr.start();
818 bpage != NULL
819 && !freed
820 && (scan_all || scanned < BUF_LRU_SEARCH_SCAN_THRESHOLD);
821 ++scanned, bpage = buf_pool->lru_scan_itr.get()) {
822
823 buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage);
824 BPageMutex* mutex = buf_page_get_mutex(bpage);
825
826 buf_pool->lru_scan_itr.set(prev);
827
828 mutex_enter(mutex);
829
830 ut_ad(buf_page_in_file(bpage));
831 ut_ad(bpage->in_LRU_list);
832
833 unsigned accessed = buf_page_is_accessed(bpage);
834
835 if (buf_flush_ready_for_replace(bpage)) {
836 mutex_exit(mutex);
837 freed = buf_LRU_free_page(bpage, true);
838 } else {
839 mutex_exit(mutex);
840 }
841
842 if (freed && !accessed) {
843 /* Keep track of pages that are evicted without
844 ever being accessed. This gives us a measure of
845 the effectiveness of readahead */
846 ++buf_pool->stat.n_ra_pages_evicted;
847 }
848
849 ut_ad(buf_pool_mutex_own(buf_pool));
850 ut_ad(!mutex_own(mutex));
851 }
852
853 if (scanned) {
854 MONITOR_INC_VALUE_CUMULATIVE(
855 MONITOR_LRU_SEARCH_SCANNED,
856 MONITOR_LRU_SEARCH_SCANNED_NUM_CALL,
857 MONITOR_LRU_SEARCH_SCANNED_PER_CALL,
858 scanned);
859 }
860
861 return(freed);
862}
863
864/******************************************************************//**
865Try to free a replaceable block.
866@return true if found and freed */
867bool
868buf_LRU_scan_and_free_block(
869/*========================*/
870 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
871 bool scan_all) /*!< in: scan whole LRU list
872 if true, otherwise scan only
873 BUF_LRU_SEARCH_SCAN_THRESHOLD
874 blocks. */
875{
876 ut_ad(buf_pool_mutex_own(buf_pool));
877
878 return(buf_LRU_free_from_unzip_LRU_list(buf_pool, scan_all)
879 || buf_LRU_free_from_common_LRU_list(buf_pool, scan_all));
880}
881
882/******************************************************************//**
883Returns TRUE if less than 25 % of the buffer pool in any instance is
884available. This can be used in heuristics to prevent huge transactions
885eating up the whole buffer pool for their locks.
886@return TRUE if less than 25 % of buffer pool left */
887ibool
888buf_LRU_buf_pool_running_out(void)
889/*==============================*/
890{
891 ibool ret = FALSE;
892
893 for (ulint i = 0; i < srv_buf_pool_instances && !ret; i++) {
894 buf_pool_t* buf_pool;
895
896 buf_pool = buf_pool_from_array(i);
897
898 buf_pool_mutex_enter(buf_pool);
899
900 if (!recv_recovery_is_on()
901 && UT_LIST_GET_LEN(buf_pool->free)
902 + UT_LIST_GET_LEN(buf_pool->LRU)
903 < ut_min(buf_pool->curr_size,
904 buf_pool->old_size) / 4) {
905
906 ret = TRUE;
907 }
908
909 buf_pool_mutex_exit(buf_pool);
910 }
911
912 return(ret);
913}
914
915/******************************************************************//**
916Returns a free block from the buf_pool. The block is taken off the
917free list. If it is empty, returns NULL.
918@return a free control block, or NULL if the buf_block->free list is empty */
919buf_block_t*
920buf_LRU_get_free_only(
921/*==================*/
922 buf_pool_t* buf_pool)
923{
924 buf_block_t* block;
925
926 ut_ad(buf_pool_mutex_own(buf_pool));
927
928 block = reinterpret_cast<buf_block_t*>(
929 UT_LIST_GET_FIRST(buf_pool->free));
930
931 while (block != NULL) {
932
933 ut_ad(block->page.in_free_list);
934 ut_d(block->page.in_free_list = FALSE);
935 ut_ad(!block->page.in_flush_list);
936 ut_ad(!block->page.in_LRU_list);
937 ut_a(!buf_page_in_file(&block->page));
938 UT_LIST_REMOVE(buf_pool->free, &block->page);
939
940 if (buf_pool->curr_size >= buf_pool->old_size
941 || UT_LIST_GET_LEN(buf_pool->withdraw)
942 >= buf_pool->withdraw_target
943 || !buf_block_will_withdrawn(buf_pool, block)) {
944 /* found valid free block */
945 buf_page_mutex_enter(block);
946 /* No adaptive hash index entries may point to
947 a free block. */
948 assert_block_ahi_empty(block);
949
950 buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE);
951 UNIV_MEM_ALLOC(block->frame, srv_page_size);
952
953 ut_ad(buf_pool_from_block(block) == buf_pool);
954
955 buf_page_mutex_exit(block);
956 break;
957 }
958
959 /* This should be withdrawn */
960 UT_LIST_ADD_LAST(
961 buf_pool->withdraw,
962 &block->page);
963 ut_d(block->in_withdraw_list = TRUE);
964
965 block = reinterpret_cast<buf_block_t*>(
966 UT_LIST_GET_FIRST(buf_pool->free));
967 }
968
969 return(block);
970}
971
972/******************************************************************//**
973Checks how much of buf_pool is occupied by non-data objects like
974AHI, lock heaps etc. Depending on the size of non-data objects this
975function will either assert or issue a warning and switch on the
976status monitor. */
977static
978void
979buf_LRU_check_size_of_non_data_objects(
980/*===================================*/
981 const buf_pool_t* buf_pool) /*!< in: buffer pool instance */
982{
983 ut_ad(buf_pool_mutex_own(buf_pool));
984
985 if (!recv_recovery_is_on()
986 && buf_pool->curr_size == buf_pool->old_size
987 && UT_LIST_GET_LEN(buf_pool->free)
988 + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) {
989
990 ib::fatal() << "Over 95 percent of the buffer pool is"
991 " occupied by lock heaps"
992#ifdef BTR_CUR_HASH_ADAPT
993 " or the adaptive hash index!"
994#endif /* BTR_CUR_HASH_ADAPT */
995 " Check that your transactions do not set too many"
996 " row locks, or review if"
997 " innodb_buffer_pool_size="
998 << (buf_pool->curr_size >> (20U - srv_page_size_shift))
999 << "M could be bigger.";
1000 } else if (!recv_recovery_is_on()
1001 && buf_pool->curr_size == buf_pool->old_size
1002 && (UT_LIST_GET_LEN(buf_pool->free)
1003 + UT_LIST_GET_LEN(buf_pool->LRU))
1004 < buf_pool->curr_size / 3) {
1005
1006 if (!buf_lru_switched_on_innodb_mon) {
1007
1008 /* Over 67 % of the buffer pool is occupied by lock
1009 heaps or the adaptive hash index. This may be a memory
1010 leak! */
1011
1012 ib::warn() << "Over 67 percent of the buffer pool is"
1013 " occupied by lock heaps"
1014#ifdef BTR_CUR_HASH_ADAPT
1015 " or the adaptive hash index!"
1016#endif /* BTR_CUR_HASH_ADAPT */
1017 " Check that your transactions do not"
1018 " set too many row locks."
1019 " innodb_buffer_pool_size="
1020 << (buf_pool->curr_size >>
1021 (20U - srv_page_size_shift)) << "M."
1022 " Starting the InnoDB Monitor to print"
1023 " diagnostics.";
1024
1025 buf_lru_switched_on_innodb_mon = true;
1026 srv_print_innodb_monitor = TRUE;
1027 os_event_set(srv_monitor_event);
1028 }
1029
1030 } else if (buf_lru_switched_on_innodb_mon) {
1031
1032 /* Switch off the InnoDB Monitor; this is a simple way
1033 to stop the monitor if the situation becomes less urgent,
1034 but may also surprise users if the user also switched on the
1035 monitor! */
1036
1037 buf_lru_switched_on_innodb_mon = false;
1038 srv_print_innodb_monitor = FALSE;
1039 }
1040}
1041
1042/******************************************************************//**
1043Returns a free block from the buf_pool. The block is taken off the
1044free list. If free list is empty, blocks are moved from the end of the
1045LRU list to the free list.
1046This function is called from a user thread when it needs a clean
1047block to read in a page. Note that we only ever get a block from
1048the free list. Even when we flush a page or find a page in LRU scan
1049we put it to free list to be used.
1050* iteration 0:
1051 * get a block from free list, success:done
1052 * if buf_pool->try_LRU_scan is set
1053 * scan LRU up to srv_LRU_scan_depth to find a clean block
1054 * the above will put the block on free list
1055 * success:retry the free list
1056 * flush one dirty page from tail of LRU to disk
1057 * the above will put the block on free list
1058 * success: retry the free list
1059* iteration 1:
1060 * same as iteration 0 except:
1061 * scan whole LRU list
1062 * scan LRU list even if buf_pool->try_LRU_scan is not set
1063* iteration > 1:
1064 * same as iteration 1 but sleep 10ms
1065@return the free control block, in state BUF_BLOCK_READY_FOR_USE */
1066buf_block_t*
1067buf_LRU_get_free_block(
1068/*===================*/
1069 buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */
1070{
1071 buf_block_t* block = NULL;
1072 bool freed = false;
1073 ulint n_iterations = 0;
1074 ulint flush_failures = 0;
1075
1076 MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH);
1077loop:
1078 buf_pool_mutex_enter(buf_pool);
1079
1080 buf_LRU_check_size_of_non_data_objects(buf_pool);
1081
1082 DBUG_EXECUTE_IF("ib_lru_force_no_free_page",
1083 if (!buf_lru_free_blocks_error_printed) {
1084 n_iterations = 21;
1085 goto not_found;});
1086
1087 /* If there is a block in the free list, take it */
1088 block = buf_LRU_get_free_only(buf_pool);
1089
1090 if (block != NULL) {
1091
1092 buf_pool_mutex_exit(buf_pool);
1093 ut_ad(buf_pool_from_block(block) == buf_pool);
1094 memset(&block->page.zip, 0, sizeof block->page.zip);
1095
1096 block->skip_flush_check = false;
1097 block->page.flush_observer = NULL;
1098 return(block);
1099 }
1100
1101 MONITOR_INC( MONITOR_LRU_GET_FREE_LOOPS );
1102 freed = false;
1103 if (buf_pool->try_LRU_scan || n_iterations > 0) {
1104 /* If no block was in the free list, search from the
1105 end of the LRU list and try to free a block there.
1106 If we are doing for the first time we'll scan only
1107 tail of the LRU list otherwise we scan the whole LRU
1108 list. */
1109 freed = buf_LRU_scan_and_free_block(
1110 buf_pool, n_iterations > 0);
1111
1112 if (!freed && n_iterations == 0) {
1113 /* Tell other threads that there is no point
1114 in scanning the LRU list. This flag is set to
1115 TRUE again when we flush a batch from this
1116 buffer pool. */
1117 buf_pool->try_LRU_scan = FALSE;
1118
1119 /* Also tell the page_cleaner thread that
1120 there is work for it to do. */
1121 os_event_set(buf_flush_event);
1122 }
1123 }
1124
1125#ifndef DBUG_OFF
1126not_found:
1127#endif
1128
1129 buf_pool_mutex_exit(buf_pool);
1130
1131 if (freed) {
1132 goto loop;
1133 }
1134
1135 if (n_iterations > 20 && !buf_lru_free_blocks_error_printed
1136 && srv_buf_pool_old_size == srv_buf_pool_size) {
1137
1138 ib::warn() << "Difficult to find free blocks in the buffer pool"
1139 " (" << n_iterations << " search iterations)! "
1140 << flush_failures << " failed attempts to"
1141 " flush a page!"
1142 " Consider increasing innodb_buffer_pool_size."
1143 " Pending flushes (fsync) log: "
1144 << fil_n_pending_log_flushes
1145 << "; buffer pool: "
1146 << fil_n_pending_tablespace_flushes
1147 << ". " << os_n_file_reads << " OS file reads, "
1148 << os_n_file_writes << " OS file writes, "
1149 << os_n_fsyncs
1150 << " OS fsyncs.";
1151
1152 buf_lru_free_blocks_error_printed = true;
1153 }
1154
1155 /* If we have scanned the whole LRU and still are unable to
1156 find a free block then we should sleep here to let the
1157 page_cleaner do an LRU batch for us. */
1158
1159 if (!srv_read_only_mode) {
1160 os_event_set(buf_flush_event);
1161 }
1162
1163 if (n_iterations > 1) {
1164
1165 MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS );
1166 os_thread_sleep(10000);
1167 }
1168
1169 /* No free block was found: try to flush the LRU list.
1170 This call will flush one page from the LRU and put it on the
1171 free list. That means that the free block is up for grabs for
1172 all user threads.
1173
1174 TODO: A more elegant way would have been to return the freed
1175 up block to the caller here but the code that deals with
1176 removing the block from page_hash and LRU_list is fairly
1177 involved (particularly in case of compressed pages). We
1178 can do that in a separate patch sometime in future. */
1179
1180 if (!buf_flush_single_page_from_LRU(buf_pool)) {
1181 MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT);
1182 ++flush_failures;
1183 }
1184
1185 srv_stats.buf_pool_wait_free.inc();
1186
1187 n_iterations++;
1188
1189 goto loop;
1190}
1191
1192/*******************************************************************//**
1193Moves the LRU_old pointer so that the length of the old blocks list
1194is inside the allowed limits. */
1195UNIV_INLINE
1196void
1197buf_LRU_old_adjust_len(
1198/*===================*/
1199 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
1200{
1201 ulint old_len;
1202 ulint new_len;
1203
1204 ut_a(buf_pool->LRU_old);
1205 ut_ad(buf_pool_mutex_own(buf_pool));
1206 ut_ad(buf_pool->LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN);
1207 ut_ad(buf_pool->LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX);
1208 compile_time_assert(BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN
1209 > BUF_LRU_OLD_RATIO_DIV
1210 * (BUF_LRU_OLD_TOLERANCE + 5));
1211 compile_time_assert(BUF_LRU_NON_OLD_MIN_LEN < BUF_LRU_OLD_MIN_LEN);
1212
1213#ifdef UNIV_LRU_DEBUG
1214 /* buf_pool->LRU_old must be the first item in the LRU list
1215 whose "old" flag is set. */
1216 ut_a(buf_pool->LRU_old->old);
1217 ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
1218 || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
1219 ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
1220 || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
1221#endif /* UNIV_LRU_DEBUG */
1222
1223 old_len = buf_pool->LRU_old_len;
1224 new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU)
1225 * buf_pool->LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV,
1226 UT_LIST_GET_LEN(buf_pool->LRU)
1227 - (BUF_LRU_OLD_TOLERANCE
1228 + BUF_LRU_NON_OLD_MIN_LEN));
1229
1230 for (;;) {
1231 buf_page_t* LRU_old = buf_pool->LRU_old;
1232
1233 ut_a(LRU_old);
1234 ut_ad(LRU_old->in_LRU_list);
1235#ifdef UNIV_LRU_DEBUG
1236 ut_a(LRU_old->old);
1237#endif /* UNIV_LRU_DEBUG */
1238
1239 /* Update the LRU_old pointer if necessary */
1240
1241 if (old_len + BUF_LRU_OLD_TOLERANCE < new_len) {
1242
1243 buf_pool->LRU_old = LRU_old = UT_LIST_GET_PREV(
1244 LRU, LRU_old);
1245#ifdef UNIV_LRU_DEBUG
1246 ut_a(!LRU_old->old);
1247#endif /* UNIV_LRU_DEBUG */
1248 old_len = ++buf_pool->LRU_old_len;
1249 buf_page_set_old(LRU_old, TRUE);
1250
1251 } else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) {
1252
1253 buf_pool->LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old);
1254 old_len = --buf_pool->LRU_old_len;
1255 buf_page_set_old(LRU_old, FALSE);
1256 } else {
1257 return;
1258 }
1259 }
1260}
1261
1262/*******************************************************************//**
1263Initializes the old blocks pointer in the LRU list. This function should be
1264called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */
1265static
1266void
1267buf_LRU_old_init(
1268/*=============*/
1269 buf_pool_t* buf_pool)
1270{
1271 ut_ad(buf_pool_mutex_own(buf_pool));
1272 ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN);
1273
1274 /* We first initialize all blocks in the LRU list as old and then use
1275 the adjust function to move the LRU_old pointer to the right
1276 position */
1277
1278 for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->LRU);
1279 bpage != NULL;
1280 bpage = UT_LIST_GET_PREV(LRU, bpage)) {
1281
1282 ut_ad(bpage->in_LRU_list);
1283 ut_ad(buf_page_in_file(bpage));
1284
1285 /* This loop temporarily violates the
1286 assertions of buf_page_set_old(). */
1287 bpage->old = TRUE;
1288 }
1289
1290 buf_pool->LRU_old = UT_LIST_GET_FIRST(buf_pool->LRU);
1291 buf_pool->LRU_old_len = UT_LIST_GET_LEN(buf_pool->LRU);
1292
1293 buf_LRU_old_adjust_len(buf_pool);
1294}
1295
1296/******************************************************************//**
1297Remove a block from the unzip_LRU list if it belonged to the list. */
1298static
1299void
1300buf_unzip_LRU_remove_block_if_needed(
1301/*=================================*/
1302 buf_page_t* bpage) /*!< in/out: control block */
1303{
1304 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1305
1306 ut_ad(buf_page_in_file(bpage));
1307 ut_ad(buf_pool_mutex_own(buf_pool));
1308
1309 if (buf_page_belongs_to_unzip_LRU(bpage)) {
1310 buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage);
1311
1312 ut_ad(block->in_unzip_LRU_list);
1313 ut_d(block->in_unzip_LRU_list = FALSE);
1314
1315 UT_LIST_REMOVE(buf_pool->unzip_LRU, block);
1316 }
1317}
1318
1319/******************************************************************//**
1320Adjust LRU hazard pointers if needed. */
1321void
1322buf_LRU_adjust_hp(
1323/*==============*/
1324 buf_pool_t* buf_pool,/*!< in: buffer pool instance */
1325 const buf_page_t* bpage) /*!< in: control block */
1326{
1327 buf_pool->lru_hp.adjust(bpage);
1328 buf_pool->lru_scan_itr.adjust(bpage);
1329 buf_pool->single_scan_itr.adjust(bpage);
1330}
1331
1332/******************************************************************//**
1333Removes a block from the LRU list. */
1334UNIV_INLINE
1335void
1336buf_LRU_remove_block(
1337/*=================*/
1338 buf_page_t* bpage) /*!< in: control block */
1339{
1340 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1341
1342 ut_ad(buf_pool_mutex_own(buf_pool));
1343
1344 ut_a(buf_page_in_file(bpage));
1345
1346 ut_ad(bpage->in_LRU_list);
1347
1348 /* Important that we adjust the hazard pointers before removing
1349 bpage from the LRU list. */
1350 buf_LRU_adjust_hp(buf_pool, bpage);
1351
1352 /* If the LRU_old pointer is defined and points to just this block,
1353 move it backward one step */
1354
1355 if (bpage == buf_pool->LRU_old) {
1356
1357 /* Below: the previous block is guaranteed to exist,
1358 because the LRU_old pointer is only allowed to differ
1359 by BUF_LRU_OLD_TOLERANCE from strict
1360 buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU
1361 list length. */
1362 buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
1363
1364 ut_a(prev_bpage);
1365#ifdef UNIV_LRU_DEBUG
1366 ut_a(!prev_bpage->old);
1367#endif /* UNIV_LRU_DEBUG */
1368 buf_pool->LRU_old = prev_bpage;
1369 buf_page_set_old(prev_bpage, TRUE);
1370
1371 buf_pool->LRU_old_len++;
1372 }
1373
1374 /* Remove the block from the LRU list */
1375 UT_LIST_REMOVE(buf_pool->LRU, bpage);
1376 ut_d(bpage->in_LRU_list = FALSE);
1377
1378 buf_pool->stat.LRU_bytes -= bpage->size.physical();
1379
1380 buf_unzip_LRU_remove_block_if_needed(bpage);
1381
1382 /* If the LRU list is so short that LRU_old is not defined,
1383 clear the "old" flags and return */
1384 if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) {
1385
1386 for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
1387 bpage != NULL;
1388 bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
1389
1390 /* This loop temporarily violates the
1391 assertions of buf_page_set_old(). */
1392 bpage->old = FALSE;
1393 }
1394
1395 buf_pool->LRU_old = NULL;
1396 buf_pool->LRU_old_len = 0;
1397
1398 return;
1399 }
1400
1401 ut_ad(buf_pool->LRU_old);
1402
1403 /* Update the LRU_old_len field if necessary */
1404 if (buf_page_is_old(bpage)) {
1405
1406 buf_pool->LRU_old_len--;
1407 }
1408
1409 /* Adjust the length of the old block list if necessary */
1410 buf_LRU_old_adjust_len(buf_pool);
1411}
1412
1413/******************************************************************//**
1414Adds a block to the LRU list of decompressed zip pages. */
1415void
1416buf_unzip_LRU_add_block(
1417/*====================*/
1418 buf_block_t* block, /*!< in: control block */
1419 ibool old) /*!< in: TRUE if should be put to the end
1420 of the list, else put to the start */
1421{
1422 buf_pool_t* buf_pool = buf_pool_from_block(block);
1423
1424 ut_ad(buf_pool_mutex_own(buf_pool));
1425
1426 ut_a(buf_page_belongs_to_unzip_LRU(&block->page));
1427
1428 ut_ad(!block->in_unzip_LRU_list);
1429 ut_d(block->in_unzip_LRU_list = TRUE);
1430
1431 if (old) {
1432 UT_LIST_ADD_LAST(buf_pool->unzip_LRU, block);
1433 } else {
1434 UT_LIST_ADD_FIRST(buf_pool->unzip_LRU, block);
1435 }
1436}
1437
1438/******************************************************************//**
1439Adds a block to the LRU list. Please make sure that the page_size is
1440already set when invoking the function, so that we can get correct
1441page_size from the buffer page when adding a block into LRU */
1442UNIV_INLINE
1443void
1444buf_LRU_add_block_low(
1445/*==================*/
1446 buf_page_t* bpage, /*!< in: control block */
1447 ibool old) /*!< in: TRUE if should be put to the old blocks
1448 in the LRU list, else put to the start; if the
1449 LRU list is very short, the block is added to
1450 the start, regardless of this parameter */
1451{
1452 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1453
1454 ut_ad(buf_pool_mutex_own(buf_pool));
1455
1456 ut_a(buf_page_in_file(bpage));
1457 ut_ad(!bpage->in_LRU_list);
1458
1459 if (!old || (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) {
1460
1461 UT_LIST_ADD_FIRST(buf_pool->LRU, bpage);
1462
1463 bpage->freed_page_clock = buf_pool->freed_page_clock;
1464 } else {
1465#ifdef UNIV_LRU_DEBUG
1466 /* buf_pool->LRU_old must be the first item in the LRU list
1467 whose "old" flag is set. */
1468 ut_a(buf_pool->LRU_old->old);
1469 ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
1470 || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
1471 ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
1472 || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
1473#endif /* UNIV_LRU_DEBUG */
1474 UT_LIST_INSERT_AFTER(buf_pool->LRU, buf_pool->LRU_old,
1475 bpage);
1476
1477 buf_pool->LRU_old_len++;
1478 }
1479
1480 ut_d(bpage->in_LRU_list = TRUE);
1481
1482 incr_LRU_size_in_bytes(bpage, buf_pool);
1483
1484 if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) {
1485
1486 ut_ad(buf_pool->LRU_old);
1487
1488 /* Adjust the length of the old block list if necessary */
1489
1490 buf_page_set_old(bpage, old);
1491 buf_LRU_old_adjust_len(buf_pool);
1492
1493 } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) {
1494
1495 /* The LRU list is now long enough for LRU_old to become
1496 defined: init it */
1497
1498 buf_LRU_old_init(buf_pool);
1499 } else {
1500 buf_page_set_old(bpage, buf_pool->LRU_old != NULL);
1501 }
1502
1503 /* If this is a zipped block with decompressed frame as well
1504 then put it on the unzip_LRU list */
1505 if (buf_page_belongs_to_unzip_LRU(bpage)) {
1506 buf_unzip_LRU_add_block((buf_block_t*) bpage, old);
1507 }
1508}
1509
1510/******************************************************************//**
1511Adds a block to the LRU list. Please make sure that the page_size is
1512already set when invoking the function, so that we can get correct
1513page_size from the buffer page when adding a block into LRU */
1514void
1515buf_LRU_add_block(
1516/*==============*/
1517 buf_page_t* bpage, /*!< in: control block */
1518 ibool old) /*!< in: TRUE if should be put to the old
1519 blocks in the LRU list, else put to the start;
1520 if the LRU list is very short, the block is
1521 added to the start, regardless of this
1522 parameter */
1523{
1524 buf_LRU_add_block_low(bpage, old);
1525}
1526
1527/******************************************************************//**
1528Moves a block to the start of the LRU list. */
1529void
1530buf_LRU_make_block_young(
1531/*=====================*/
1532 buf_page_t* bpage) /*!< in: control block */
1533{
1534 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1535
1536 ut_ad(buf_pool_mutex_own(buf_pool));
1537
1538 if (bpage->old) {
1539 buf_pool->stat.n_pages_made_young++;
1540 }
1541
1542 buf_LRU_remove_block(bpage);
1543 buf_LRU_add_block_low(bpage, FALSE);
1544}
1545
1546/******************************************************************//**
1547Try to free a block. If bpage is a descriptor of a compressed-only
1548page, the descriptor object will be freed as well.
1549
1550NOTE: If this function returns true, it will temporarily
1551release buf_pool->mutex. Furthermore, the page frame will no longer be
1552accessible via bpage.
1553
1554The caller must hold buf_pool->mutex and must not hold any
1555buf_page_get_mutex() when calling this function.
1556@return true if freed, false otherwise. */
1557bool
1558buf_LRU_free_page(
1559/*===============*/
1560 buf_page_t* bpage, /*!< in: block to be freed */
1561 bool zip) /*!< in: true if should remove also the
1562 compressed page of an uncompressed page */
1563{
1564 buf_page_t* b = NULL;
1565 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1566
1567 rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, bpage->id);
1568
1569 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
1570
1571 ut_ad(buf_pool_mutex_own(buf_pool));
1572 ut_ad(buf_page_in_file(bpage));
1573 ut_ad(bpage->in_LRU_list);
1574
1575 rw_lock_x_lock(hash_lock);
1576 mutex_enter(block_mutex);
1577
1578 if (!buf_page_can_relocate(bpage)) {
1579
1580 /* Do not free buffer fixed and I/O-fixed blocks. */
1581 goto func_exit;
1582 }
1583
1584#ifdef UNIV_IBUF_COUNT_DEBUG
1585 ut_a(ibuf_count_get(bpage->id) == 0);
1586#endif /* UNIV_IBUF_COUNT_DEBUG */
1587
1588 if (zip || !bpage->zip.data) {
1589 /* This would completely free the block. */
1590 /* Do not completely free dirty blocks. */
1591
1592 if (bpage->oldest_modification) {
1593 goto func_exit;
1594 }
1595 } else if (bpage->oldest_modification > 0
1596 && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) {
1597
1598 ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY);
1599
1600func_exit:
1601 rw_lock_x_unlock(hash_lock);
1602 mutex_exit(block_mutex);
1603 return(false);
1604
1605 } else if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) {
1606 b = buf_page_alloc_descriptor();
1607 ut_a(b);
1608 memcpy(b, bpage, sizeof *b);
1609 }
1610
1611 ut_ad(buf_pool_mutex_own(buf_pool));
1612 ut_ad(buf_page_in_file(bpage));
1613 ut_ad(bpage->in_LRU_list);
1614 ut_ad(!bpage->in_flush_list == !bpage->oldest_modification);
1615
1616 DBUG_PRINT("ib_buf", ("free page %u:%u",
1617 bpage->id.space(), bpage->id.page_no()));
1618
1619 ut_ad(rw_lock_own(hash_lock, RW_LOCK_X));
1620 ut_ad(buf_page_can_relocate(bpage));
1621
1622 if (!buf_LRU_block_remove_hashed(bpage, zip)) {
1623 return(true);
1624 }
1625
1626 /* buf_LRU_block_remove_hashed() releases the hash_lock */
1627 ut_ad(!rw_lock_own(hash_lock, RW_LOCK_X)
1628 && !rw_lock_own(hash_lock, RW_LOCK_S));
1629
1630 /* We have just freed a BUF_BLOCK_FILE_PAGE. If b != NULL
1631 then it was a compressed page with an uncompressed frame and
1632 we are interested in freeing only the uncompressed frame.
1633 Therefore we have to reinsert the compressed page descriptor
1634 into the LRU and page_hash (and possibly flush_list).
1635 if b == NULL then it was a regular page that has been freed */
1636
1637 if (b != NULL) {
1638 buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b);
1639
1640 rw_lock_x_lock(hash_lock);
1641
1642 mutex_enter(block_mutex);
1643
1644 ut_a(!buf_page_hash_get_low(buf_pool, b->id));
1645
1646 b->state = b->oldest_modification
1647 ? BUF_BLOCK_ZIP_DIRTY
1648 : BUF_BLOCK_ZIP_PAGE;
1649
1650 ut_ad(b->size.is_compressed());
1651
1652 UNIV_MEM_DESC(b->zip.data, b->size.physical());
1653
1654 /* The fields in_page_hash and in_LRU_list of
1655 the to-be-freed block descriptor should have
1656 been cleared in
1657 buf_LRU_block_remove_hashed(), which
1658 invokes buf_LRU_remove_block(). */
1659 ut_ad(!bpage->in_page_hash);
1660 ut_ad(!bpage->in_LRU_list);
1661
1662 /* bpage->state was BUF_BLOCK_FILE_PAGE because
1663 b != NULL. The type cast below is thus valid. */
1664 ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list);
1665
1666 /* The fields of bpage were copied to b before
1667 buf_LRU_block_remove_hashed() was invoked. */
1668 ut_ad(!b->in_zip_hash);
1669 ut_ad(b->in_page_hash);
1670 ut_ad(b->in_LRU_list);
1671
1672 HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
1673 b->id.fold(), b);
1674
1675 /* Insert b where bpage was in the LRU list. */
1676 if (prev_b != NULL) {
1677 ulint lru_len;
1678
1679 ut_ad(prev_b->in_LRU_list);
1680 ut_ad(buf_page_in_file(prev_b));
1681
1682 UT_LIST_INSERT_AFTER(buf_pool->LRU, prev_b, b);
1683
1684 incr_LRU_size_in_bytes(b, buf_pool);
1685
1686 if (buf_page_is_old(b)) {
1687 buf_pool->LRU_old_len++;
1688 if (buf_pool->LRU_old
1689 == UT_LIST_GET_NEXT(LRU, b)) {
1690
1691 buf_pool->LRU_old = b;
1692 }
1693 }
1694
1695 lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
1696
1697 if (lru_len > BUF_LRU_OLD_MIN_LEN) {
1698 ut_ad(buf_pool->LRU_old);
1699 /* Adjust the length of the
1700 old block list if necessary */
1701 buf_LRU_old_adjust_len(buf_pool);
1702 } else if (lru_len == BUF_LRU_OLD_MIN_LEN) {
1703 /* The LRU list is now long
1704 enough for LRU_old to become
1705 defined: init it */
1706 buf_LRU_old_init(buf_pool);
1707 }
1708#ifdef UNIV_LRU_DEBUG
1709 /* Check that the "old" flag is consistent
1710 in the block and its neighbours. */
1711 buf_page_set_old(b, buf_page_is_old(b));
1712#endif /* UNIV_LRU_DEBUG */
1713 } else {
1714 ut_d(b->in_LRU_list = FALSE);
1715 buf_LRU_add_block_low(b, buf_page_is_old(b));
1716 }
1717
1718 if (b->state == BUF_BLOCK_ZIP_PAGE) {
1719#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
1720 buf_LRU_insert_zip_clean(b);
1721#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
1722 } else {
1723 /* Relocate on buf_pool->flush_list. */
1724 buf_flush_relocate_on_flush_list(bpage, b);
1725 }
1726
1727 bpage->zip.data = NULL;
1728
1729 page_zip_set_size(&bpage->zip, 0);
1730
1731 bpage->size.copy_from(page_size_t(bpage->size.logical(),
1732 bpage->size.logical(),
1733 false));
1734
1735 mutex_exit(block_mutex);
1736
1737 /* Prevent buf_page_get_gen() from
1738 decompressing the block while we release
1739 buf_pool->mutex and block_mutex. */
1740 block_mutex = buf_page_get_mutex(b);
1741
1742 mutex_enter(block_mutex);
1743
1744 buf_page_set_sticky(b);
1745
1746 mutex_exit(block_mutex);
1747
1748 rw_lock_x_unlock(hash_lock);
1749 }
1750
1751 buf_pool_mutex_exit(buf_pool);
1752
1753 /* Remove possible adaptive hash index on the page.
1754 The page was declared uninitialized by
1755 buf_LRU_block_remove_hashed(). We need to flag
1756 the contents of the page valid (which it still is) in
1757 order to avoid bogus Valgrind warnings.*/
1758
1759 UNIV_MEM_VALID(((buf_block_t*) bpage)->frame,
1760 srv_page_size);
1761 btr_search_drop_page_hash_index((buf_block_t*) bpage);
1762 UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame,
1763 srv_page_size);
1764
1765 if (b != NULL) {
1766
1767 /* Compute and stamp the compressed page
1768 checksum while not holding any mutex. The
1769 block is already half-freed
1770 (BUF_BLOCK_REMOVE_HASH) and removed from
1771 buf_pool->page_hash, thus inaccessible by any
1772 other thread. */
1773
1774 ut_ad(b->size.is_compressed());
1775
1776 const uint32_t checksum = page_zip_calc_checksum(
1777 b->zip.data,
1778 b->size.physical(),
1779 static_cast<srv_checksum_algorithm_t>(
1780 srv_checksum_algorithm));
1781
1782 mach_write_to_4(b->zip.data + FIL_PAGE_SPACE_OR_CHKSUM,
1783 checksum);
1784 }
1785
1786 buf_pool_mutex_enter(buf_pool);
1787
1788 if (b != NULL) {
1789 mutex_enter(block_mutex);
1790
1791 buf_page_unset_sticky(b);
1792
1793 mutex_exit(block_mutex);
1794 }
1795
1796 buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
1797
1798 return(true);
1799}
1800
1801/******************************************************************//**
1802Puts a block back to the free list. */
1803void
1804buf_LRU_block_free_non_file_page(
1805/*=============================*/
1806 buf_block_t* block) /*!< in: block, must not contain a file page */
1807{
1808 void* data;
1809 buf_pool_t* buf_pool = buf_pool_from_block(block);
1810
1811 ut_ad(buf_pool_mutex_own(buf_pool));
1812 ut_ad(buf_page_mutex_own(block));
1813
1814 switch (buf_block_get_state(block)) {
1815 case BUF_BLOCK_MEMORY:
1816 case BUF_BLOCK_READY_FOR_USE:
1817 break;
1818 default:
1819 ut_error;
1820 }
1821
1822 assert_block_ahi_empty(block);
1823 ut_ad(!block->page.in_free_list);
1824 ut_ad(!block->page.in_flush_list);
1825 ut_ad(!block->page.in_LRU_list);
1826
1827 buf_block_set_state(block, BUF_BLOCK_NOT_USED);
1828
1829 UNIV_MEM_ALLOC(block->frame, srv_page_size);
1830#ifdef UNIV_DEBUG
1831 /* Wipe contents of page to reveal possible stale pointers to it */
1832 memset(block->frame, '\0', srv_page_size);
1833#else
1834 /* Wipe page_no and space_id */
1835 memset(block->frame + FIL_PAGE_OFFSET, 0xfe, 4);
1836 memset(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xfe, 4);
1837#endif /* UNIV_DEBUG */
1838 data = block->page.zip.data;
1839
1840 if (data != NULL) {
1841 block->page.zip.data = NULL;
1842 buf_page_mutex_exit(block);
1843 buf_pool_mutex_exit_forbid(buf_pool);
1844
1845 ut_ad(block->page.size.is_compressed());
1846
1847 buf_buddy_free(buf_pool, data, block->page.size.physical());
1848
1849 buf_pool_mutex_exit_allow(buf_pool);
1850 buf_page_mutex_enter(block);
1851
1852 page_zip_set_size(&block->page.zip, 0);
1853
1854 block->page.size.copy_from(
1855 page_size_t(block->page.size.logical(),
1856 block->page.size.logical(),
1857 false));
1858 }
1859
1860 if (buf_pool->curr_size < buf_pool->old_size
1861 && UT_LIST_GET_LEN(buf_pool->withdraw) < buf_pool->withdraw_target
1862 && buf_block_will_withdrawn(buf_pool, block)) {
1863 /* This should be withdrawn */
1864 UT_LIST_ADD_LAST(
1865 buf_pool->withdraw,
1866 &block->page);
1867 ut_d(block->in_withdraw_list = TRUE);
1868 } else {
1869 UT_LIST_ADD_FIRST(buf_pool->free, &block->page);
1870 ut_d(block->page.in_free_list = TRUE);
1871 }
1872
1873 UNIV_MEM_FREE(block->frame, srv_page_size);
1874}
1875
1876/******************************************************************//**
1877Takes a block out of the LRU list and page hash table.
1878If the block is compressed-only (BUF_BLOCK_ZIP_PAGE),
1879the object will be freed.
1880
1881The caller must hold buf_pool->mutex, the buf_page_get_mutex() mutex
1882and the appropriate hash_lock. This function will release the
1883buf_page_get_mutex() and the hash_lock.
1884
If a compressed page is freed, other compressed pages may be relocated.
1886@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
1887caller needs to free the page to the free list
1888@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
1889this case the block is already returned to the buddy allocator. */
1890static
1891bool
1892buf_LRU_block_remove_hashed(
1893/*========================*/
1894 buf_page_t* bpage, /*!< in: block, must contain a file page and
1895 be in a state where it can be freed; there
1896 may or may not be a hash index to the page */
	bool		zip)	/*!< in: true if the compressed page of an
				uncompressed page should also be removed */
1899{
1900 const buf_page_t* hashed_bpage;
1901 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
1902 rw_lock_t* hash_lock;
1903
1904 ut_ad(buf_pool_mutex_own(buf_pool));
1905 ut_ad(mutex_own(buf_page_get_mutex(bpage)));
1906
1907 hash_lock = buf_page_hash_lock_get(buf_pool, bpage->id);
1908
1909 ut_ad(rw_lock_own(hash_lock, RW_LOCK_X));
1910
1911 ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
1912 ut_a(bpage->buf_fix_count == 0);
1913
1914 buf_LRU_remove_block(bpage);
1915
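	/* freed_page_clock counts pages removed from the LRU list;
	buf_page_peek_if_too_old() compares against it to decide
	whether an access should move a block towards the head of
	the LRU list. */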
1916 buf_pool->freed_page_clock += 1;
1917
1918 switch (buf_page_get_state(bpage)) {
1919 case BUF_BLOCK_FILE_PAGE:
1920 UNIV_MEM_ASSERT_W(bpage, sizeof(buf_block_t));
1921 UNIV_MEM_ASSERT_W(((buf_block_t*) bpage)->frame,
1922 srv_page_size);
1923 buf_block_modify_clock_inc((buf_block_t*) bpage);
1924 if (bpage->zip.data) {
1925 const page_t* page = ((buf_block_t*) bpage)->frame;
1926
1927 ut_a(!zip || bpage->oldest_modification == 0);
1928 ut_ad(bpage->size.is_compressed());
1929
1930 switch (fil_page_get_type(page)) {
1931 case FIL_PAGE_TYPE_ALLOCATED:
1932 case FIL_PAGE_INODE:
1933 case FIL_PAGE_IBUF_BITMAP:
1934 case FIL_PAGE_TYPE_FSP_HDR:
1935 case FIL_PAGE_TYPE_XDES:
1936 /* These are essentially uncompressed pages. */
1937 if (!zip) {
1938 /* InnoDB writes the data to the
1939 uncompressed page frame. Copy it
1940 to the compressed page, which will
1941 be preserved. */
1942 memcpy(bpage->zip.data, page,
1943 bpage->size.physical());
1944 }
1945 break;
1946 case FIL_PAGE_TYPE_ZBLOB:
1947 case FIL_PAGE_TYPE_ZBLOB2:
1948 break;
1949 case FIL_PAGE_INDEX:
1950 case FIL_PAGE_RTREE:
1951#if defined UNIV_ZIP_DEBUG && defined BTR_CUR_HASH_ADAPT
1952 ut_a(page_zip_validate(
1953 &bpage->zip, page,
1954 ((buf_block_t*) bpage)->index));
1955#endif /* UNIV_ZIP_DEBUG && BTR_CUR_HASH_ADAPT */
1956 break;
1957 default:
1958 ib::error() << "The compressed page to be"
1959 " evicted seems corrupt:";
1960 ut_print_buf(stderr, page,
1961 bpage->size.logical());
1962
1963 ib::error() << "Possibly older version of"
1964 " the page:";
1965
1966 ut_print_buf(stderr, bpage->zip.data,
1967 bpage->size.physical());
1968 putc('\n', stderr);
1969 ut_error;
1970 }
1971
1972 break;
1973 }
1974 /* fall through */
1975 case BUF_BLOCK_ZIP_PAGE:
1976 ut_a(bpage->oldest_modification == 0);
1977 if (bpage->size.is_compressed()) {
1978 UNIV_MEM_ASSERT_W(bpage->zip.data,
1979 bpage->size.physical());
1980 }
1981 break;
1982 case BUF_BLOCK_POOL_WATCH:
1983 case BUF_BLOCK_ZIP_DIRTY:
1984 case BUF_BLOCK_NOT_USED:
1985 case BUF_BLOCK_READY_FOR_USE:
1986 case BUF_BLOCK_MEMORY:
1987 case BUF_BLOCK_REMOVE_HASH:
1988 ut_error;
1989 break;
1990 }
1991
1992 hashed_bpage = buf_page_hash_get_low(buf_pool, bpage->id);
1993 if (bpage != hashed_bpage) {
1994 ib::error() << "Page " << bpage->id
1995 << " not found in the hash table";
1996
1997#ifdef UNIV_DEBUG
		ib::error()
			<< "in_page_hash:" << bpage->in_page_hash
			<< " in_zip_hash:" << bpage->in_zip_hash
			// << " in_free_list:" << bpage->in_free_list
2004 << " in_flush_list:" << bpage->in_flush_list
2005 << " in_LRU_list:" << bpage->in_LRU_list
2006 << " zip.data:" << bpage->zip.data
2007 << " zip_size:" << bpage->size.logical()
2008 << " page_state:" << buf_page_get_state(bpage);
2009#else
2010 ib::error()
2011 << " zip.data:" << bpage->zip.data
2012 << " zip_size:" << bpage->size.logical()
2013 << " page_state:" << buf_page_get_state(bpage);
2014#endif
2015
2016 if (hashed_bpage) {
2017
			ib::error() << "In the hash table we find block "
2019 << hashed_bpage << " of " << hashed_bpage->id
2020 << " which is not " << bpage;
2021 }
2022
2023#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2024 mutex_exit(buf_page_get_mutex(bpage));
2025 rw_lock_x_unlock(hash_lock);
2026 buf_pool_mutex_exit(buf_pool);
2027 buf_print();
2028 buf_LRU_print();
2029 buf_validate();
2030 buf_LRU_validate();
2031#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2032 ut_error;
2033 }
2034
2035 ut_ad(!bpage->in_zip_hash);
2036 ut_ad(bpage->in_page_hash);
2037 ut_d(bpage->in_page_hash = FALSE);
2038
2039 HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, bpage->id.fold(),
2040 bpage);
2041
2042 switch (buf_page_get_state(bpage)) {
2043 case BUF_BLOCK_ZIP_PAGE:
2044 ut_ad(!bpage->in_free_list);
2045 ut_ad(!bpage->in_flush_list);
2046 ut_ad(!bpage->in_LRU_list);
2047 ut_a(bpage->zip.data);
2048 ut_a(bpage->size.is_compressed());
2049
2050#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2051 UT_LIST_REMOVE(buf_pool->zip_clean, bpage);
2052#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2053
2054 mutex_exit(&buf_pool->zip_mutex);
2055 rw_lock_x_unlock(hash_lock);
2056 buf_pool_mutex_exit_forbid(buf_pool);
2057
2058 buf_buddy_free(buf_pool, bpage->zip.data,
2059 bpage->size.physical());
2060
2061 buf_pool_mutex_exit_allow(buf_pool);
2062 buf_page_free_descriptor(bpage);
2063 return(false);
2064
2065 case BUF_BLOCK_FILE_PAGE:
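		/* Poison the page number and space id in the frame so
		that any stale use of this block before it is reused
		can be detected. */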
2066 memset(((buf_block_t*) bpage)->frame
2067 + FIL_PAGE_OFFSET, 0xff, 4);
2068 memset(((buf_block_t*) bpage)->frame
2069 + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
2070 UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame,
2071 srv_page_size);
2072 buf_page_set_state(bpage, BUF_BLOCK_REMOVE_HASH);
2073
		/* Question: If we release the bpage mutex and the
		hash_lock here, what protects us against:
		1) Some other thread buffer-fixing this page?
		2) Some other thread trying to read this page, not
		finding it in the buffer pool, and attempting to read
		it from disk?
		Answer:
		1) This cannot happen, because the page is no longer
		in the page_hash. The only other possibility is that,
		while invalidating a tablespace, we buffer-fix the
		prev_page in the LRU to avoid relocation during the
		scan. But that is not possible here, because we are
		holding the buf_pool mutex.

		2) This is not possible, because buf_page_init_for_read()
		looks up the page_hash while holding the buf_pool mutex.
		We are holding the buf_pool mutex here, and by the time
		we release it in the caller, we will have inserted the
		compressed-only descriptor into the page_hash. */
2093 rw_lock_x_unlock(hash_lock);
2094 mutex_exit(&((buf_block_t*) bpage)->mutex);
2095
2096 if (zip && bpage->zip.data) {
2097 /* Free the compressed page. */
2098 void* data = bpage->zip.data;
2099 bpage->zip.data = NULL;
2100
2101 ut_ad(!bpage->in_free_list);
2102 ut_ad(!bpage->in_flush_list);
2103 ut_ad(!bpage->in_LRU_list);
2104 buf_pool_mutex_exit_forbid(buf_pool);
2105
2106 buf_buddy_free(buf_pool, data, bpage->size.physical());
2107
2108 buf_pool_mutex_exit_allow(buf_pool);
2109
2110 page_zip_set_size(&bpage->zip, 0);
2111
2112 bpage->size.copy_from(
2113 page_size_t(bpage->size.logical(),
2114 bpage->size.logical(),
2115 false));
2116 }
2117
2118 return(true);
2119
2120 case BUF_BLOCK_POOL_WATCH:
2121 case BUF_BLOCK_ZIP_DIRTY:
2122 case BUF_BLOCK_NOT_USED:
2123 case BUF_BLOCK_READY_FOR_USE:
2124 case BUF_BLOCK_MEMORY:
2125 case BUF_BLOCK_REMOVE_HASH:
2126 break;
2127 }
2128
2129 ut_error;
2130 return(false);
2131}
2132
2133/******************************************************************//**
Puts a file page which has no hash index to the free list.
2135static
2136void
2137buf_LRU_block_free_hashed_page(
2138/*===========================*/
2139 buf_block_t* block) /*!< in: block, must contain a file page and
2140 be in a state where it can be freed */
2141{
2142 buf_pool_t* buf_pool = buf_pool_from_block(block);
2143 ut_ad(buf_pool_mutex_own(buf_pool));
2144
2145 buf_page_mutex_enter(block);
2146
2147 if (buf_pool->flush_rbt == NULL) {
2148 block->page.id.reset();
2149 }
2150
2151 buf_block_set_state(block, BUF_BLOCK_MEMORY);
2152
2153 buf_LRU_block_free_non_file_page(block);
2154 buf_page_mutex_exit(block);
2155}
2156
2157/******************************************************************//**
Removes one page from the LRU list and puts it to the free list.
2159void
2160buf_LRU_free_one_page(
2161/*==================*/
2162 buf_page_t* bpage) /*!< in/out: block, must contain a file page and
2163 be in a state where it can be freed; there
2164 may or may not be a hash index to the page */
2165{
2166 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
2167
2168 rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, bpage->id);
2169 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
2170
2171 ut_ad(buf_pool_mutex_own(buf_pool));
2172
2173 rw_lock_x_lock(hash_lock);
2174 mutex_enter(block_mutex);
2175
2176 if (buf_LRU_block_remove_hashed(bpage, true)) {
2177 buf_LRU_block_free_hashed_page((buf_block_t*) bpage);
2178 }
2179
2180 /* buf_LRU_block_remove_hashed() releases hash_lock and block_mutex */
2181 ut_ad(!rw_lock_own(hash_lock, RW_LOCK_X)
2182 && !rw_lock_own(hash_lock, RW_LOCK_S));
2183
2184 ut_ad(!mutex_own(block_mutex));
2185}
2186
2187/**********************************************************************//**
2188Updates buf_pool->LRU_old_ratio for one buffer pool instance.
2189@return updated old_pct */
2190static
2191uint
2192buf_LRU_old_ratio_update_instance(
2193/*==============================*/
2194 buf_pool_t* buf_pool,/*!< in: buffer pool instance */
2195 uint old_pct,/*!< in: Reserve this percentage of
2196 the buffer pool for "old" blocks. */
2197 ibool adjust) /*!< in: TRUE=adjust the LRU list;
2198 FALSE=just assign buf_pool->LRU_old_ratio
2199 during the initialization of InnoDB */
2200{
2201 uint ratio;
2202
2203 ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100;
2204 if (ratio < BUF_LRU_OLD_RATIO_MIN) {
2205 ratio = BUF_LRU_OLD_RATIO_MIN;
2206 } else if (ratio > BUF_LRU_OLD_RATIO_MAX) {
2207 ratio = BUF_LRU_OLD_RATIO_MAX;
2208 }
2209
2210 if (adjust) {
2211 buf_pool_mutex_enter(buf_pool);
2212
2213 if (ratio != buf_pool->LRU_old_ratio) {
2214 buf_pool->LRU_old_ratio = ratio;
2215
2216 if (UT_LIST_GET_LEN(buf_pool->LRU)
2217 >= BUF_LRU_OLD_MIN_LEN) {
2218
2219 buf_LRU_old_adjust_len(buf_pool);
2220 }
2221 }
2222
2223 buf_pool_mutex_exit(buf_pool);
2224 } else {
2225 buf_pool->LRU_old_ratio = ratio;
2226 }
2227 /* the reverse of
2228 ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */
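	/* Example, assuming BUF_LRU_OLD_RATIO_DIV == 1024:
	old_pct == 37 maps to ratio == 37 * 1024 / 100 == 378, and
	378 * 100 / 1024.0 + 0.5 == 37.4, which truncates back to 37. */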
2229 return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5));
2230}
2231
2232/**********************************************************************//**
2233Updates buf_pool->LRU_old_ratio.
2234@return updated old_pct */
2235uint
2236buf_LRU_old_ratio_update(
2237/*=====================*/
2238 uint old_pct,/*!< in: Reserve this percentage of
2239 the buffer pool for "old" blocks. */
2240 ibool adjust) /*!< in: TRUE=adjust the LRU list;
2241 FALSE=just assign buf_pool->LRU_old_ratio
2242 during the initialization of InnoDB */
2243{
2244 uint new_ratio = 0;
2245
2246 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2247 buf_pool_t* buf_pool;
2248
2249 buf_pool = buf_pool_from_array(i);
2250
2251 new_ratio = buf_LRU_old_ratio_update_instance(
2252 buf_pool, old_pct, adjust);
2253 }
2254
2255 return(new_ratio);
2256}
2257
2258/********************************************************************//**
Updates the historical stats that we collect for the LRU eviction
policy at the end of each interval. */
2261void
2262buf_LRU_stat_update(void)
2263/*=====================*/
2264{
2265 buf_LRU_stat_t* item;
2266 buf_pool_t* buf_pool;
	bool		evict_started = false;
2268 buf_LRU_stat_t cur_stat;
2269
2270 /* If we haven't started eviction yet then don't update stats. */
2271 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2272
2273 buf_pool = buf_pool_from_array(i);
2274
2275 if (buf_pool->freed_page_clock != 0) {
2276 evict_started = true;
2277 break;
2278 }
2279 }
2280
2281 if (!evict_started) {
2282 goto func_exit;
2283 }
2284
2285 /* Update the index. */
2286 item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind];
2287 buf_LRU_stat_arr_ind++;
2288 buf_LRU_stat_arr_ind %= BUF_LRU_STAT_N_INTERVAL;
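	/* buf_LRU_stat_arr is a ring buffer of the last
	BUF_LRU_STAT_N_INTERVAL intervals; buf_LRU_stat_sum keeps a
	moving sum over it by adding the newest entry and subtracting
	the one being overwritten. */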
2289
	/* Add the current value and subtract the obsolete entry.
	Since buf_LRU_stat_cur is not protected by any mutex,
	it can change between being added to buf_LRU_stat_sum
	and being copied to item. Assign it to a local variable to
	make sure that the same value is used for buf_LRU_stat_sum
	and for item. */
2296 cur_stat = buf_LRU_stat_cur;
2297
2298 buf_LRU_stat_sum.io += cur_stat.io - item->io;
2299 buf_LRU_stat_sum.unzip += cur_stat.unzip - item->unzip;
2300
2301 /* Put current entry in the array. */
2302 memcpy(item, &cur_stat, sizeof *item);
2303
2304func_exit:
2305 /* Clear the current entry. */
2306 memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur);
2307}
2308
2309#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2310/**********************************************************************//**
2311Validates the LRU list for one buffer pool instance. */
2312static
2313void
2314buf_LRU_validate_instance(
2315/*======================*/
2316 buf_pool_t* buf_pool)
2317{
2318 ulint old_len;
2319 ulint new_len;
2320
2321 buf_pool_mutex_enter(buf_pool);
2322
2323 if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) {
2324
2325 ut_a(buf_pool->LRU_old);
2326 old_len = buf_pool->LRU_old_len;
2327
2328 new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU)
2329 * buf_pool->LRU_old_ratio
2330 / BUF_LRU_OLD_RATIO_DIV,
2331 UT_LIST_GET_LEN(buf_pool->LRU)
2332 - (BUF_LRU_OLD_TOLERANCE
2333 + BUF_LRU_NON_OLD_MIN_LEN));
2334
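		/* The actual LRU_old length may lag the target computed
		by buf_LRU_old_adjust_len() by at most
		BUF_LRU_OLD_TOLERANCE blocks in either direction. */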
2335 ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE);
2336 ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE);
2337 }
2338
2339 CheckInLRUList::validate(buf_pool);
2340
2341 old_len = 0;
2342
2343 for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
2344 bpage != NULL;
2345 bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
2346
2347 switch (buf_page_get_state(bpage)) {
2348 case BUF_BLOCK_POOL_WATCH:
2349 case BUF_BLOCK_NOT_USED:
2350 case BUF_BLOCK_READY_FOR_USE:
2351 case BUF_BLOCK_MEMORY:
2352 case BUF_BLOCK_REMOVE_HASH:
2353 ut_error;
2354 break;
2355 case BUF_BLOCK_FILE_PAGE:
2356 ut_ad(((buf_block_t*) bpage)->in_unzip_LRU_list
2357 == buf_page_belongs_to_unzip_LRU(bpage));
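			/* fall through */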
2358 case BUF_BLOCK_ZIP_PAGE:
2359 case BUF_BLOCK_ZIP_DIRTY:
2360 break;
2361 }
2362
2363 if (buf_page_is_old(bpage)) {
2364 const buf_page_t* prev
2365 = UT_LIST_GET_PREV(LRU, bpage);
2366 const buf_page_t* next
2367 = UT_LIST_GET_NEXT(LRU, bpage);
2368
2369 if (!old_len++) {
2370 ut_a(buf_pool->LRU_old == bpage);
2371 } else {
2372 ut_a(!prev || buf_page_is_old(prev));
2373 }
2374
2375 ut_a(!next || buf_page_is_old(next));
2376 }
2377 }
2378
2379 ut_a(buf_pool->LRU_old_len == old_len);
2380
2381 CheckInFreeList::validate(buf_pool);
2382
2383 for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool->free);
2384 bpage != NULL;
2385 bpage = UT_LIST_GET_NEXT(list, bpage)) {
2386
2387 ut_a(buf_page_get_state(bpage) == BUF_BLOCK_NOT_USED);
2388 }
2389
2390 CheckUnzipLRUAndLRUList::validate(buf_pool);
2391
2392 for (buf_block_t* block = UT_LIST_GET_FIRST(buf_pool->unzip_LRU);
2393 block != NULL;
2394 block = UT_LIST_GET_NEXT(unzip_LRU, block)) {
2395
2396 ut_ad(block->in_unzip_LRU_list);
2397 ut_ad(block->page.in_LRU_list);
2398 ut_a(buf_page_belongs_to_unzip_LRU(&block->page));
2399 }
2400
2401 buf_pool_mutex_exit(buf_pool);
2402}
2403
2404/**********************************************************************//**
2405Validates the LRU list.
2406@return TRUE */
2407ibool
2408buf_LRU_validate(void)
2409/*==================*/
2410{
2411 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2412 buf_pool_t* buf_pool;
2413
2414 buf_pool = buf_pool_from_array(i);
2415 buf_LRU_validate_instance(buf_pool);
2416 }
2417
2418 return(TRUE);
2419}
2420#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
2421
2422#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
2423/**********************************************************************//**
2424Prints the LRU list for one buffer pool instance. */
2425static
2426void
2427buf_LRU_print_instance(
2428/*===================*/
2429 buf_pool_t* buf_pool)
2430{
2431 buf_pool_mutex_enter(buf_pool);
2432
2433 for (const buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
2434 bpage != NULL;
2435 bpage = UT_LIST_GET_NEXT(LRU, bpage)) {
2436
2437 mutex_enter(buf_page_get_mutex(bpage));
2438
2439 fprintf(stderr, "BLOCK space %u page %u ",
2440 bpage->id.space(), bpage->id.page_no());
2441
2442 if (buf_page_is_old(bpage)) {
2443 fputs("old ", stderr);
2444 }
2445
2446 if (bpage->buf_fix_count) {
			fprintf(stderr, "buf_fix count %u ",
2448 bpage->buf_fix_count);
2449 }
2450
2451 if (buf_page_get_io_fix(bpage)) {
2452 fprintf(stderr, "io_fix %d ",
2453 buf_page_get_io_fix(bpage));
2454 }
2455
2456 if (bpage->oldest_modification) {
2457 fputs("modif. ", stderr);
2458 }
2459
2460 switch (buf_page_get_state(bpage)) {
2461 const byte* frame;
2462 case BUF_BLOCK_FILE_PAGE:
2463 frame = buf_block_get_frame((buf_block_t*) bpage);
2464 fprintf(stderr, "\ntype %u index id " IB_ID_FMT "\n",
2465 fil_page_get_type(frame),
2466 btr_page_get_index_id(frame));
2467 break;
2468 case BUF_BLOCK_ZIP_PAGE:
2469 frame = bpage->zip.data;
2470 fprintf(stderr, "\ntype %u size " ULINTPF
2471 " index id " IB_ID_FMT "\n",
2472 fil_page_get_type(frame),
2473 bpage->size.physical(),
2474 btr_page_get_index_id(frame));
2475 break;
2476
2477 default:
2478 fprintf(stderr, "\n!state %d!\n",
2479 buf_page_get_state(bpage));
2480 break;
2481 }
2482
2483 mutex_exit(buf_page_get_mutex(bpage));
2484 }
2485
2486 buf_pool_mutex_exit(buf_pool);
2487}
2488
2489/**********************************************************************//**
2490Prints the LRU list. */
2491void
2492buf_LRU_print(void)
2493/*===============*/
2494{
2495 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
2496 buf_pool_t* buf_pool;
2497
2498 buf_pool = buf_pool_from_array(i);
2499 buf_LRU_print_instance(buf_pool);
2500 }
2501}
2502#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
2503