| 1 | /***************************************************************************** |
| 2 | |
| 3 | Copyright (c) 1995, 2016, Oracle and/or its affiliates. All Rights Reserved. |
| 4 | Copyright (c) 2017, 2018, MariaDB Corporation. |
| 5 | |
| 6 | This program is free software; you can redistribute it and/or modify it under |
| 7 | the terms of the GNU General Public License as published by the Free Software |
| 8 | Foundation; version 2 of the License. |
| 9 | |
| 10 | This program is distributed in the hope that it will be useful, but WITHOUT |
| 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
| 13 | |
| 14 | You should have received a copy of the GNU General Public License along with |
| 15 | this program; if not, write to the Free Software Foundation, Inc., |
| 16 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
| 17 | |
| 18 | *****************************************************************************/ |
| 19 | |
| 20 | /**************************************************//** |
| 21 | @file buf/buf0lru.cc |
| 22 | The database buffer replacement algorithm |
| 23 | |
| 24 | Created 11/5/1995 Heikki Tuuri |
| 25 | *******************************************************/ |
| 26 | |
| 27 | #include "buf0lru.h" |
| 28 | #include "ut0byte.h" |
| 29 | #include "ut0rnd.h" |
| 30 | #include "sync0rw.h" |
| 31 | #include "hash0hash.h" |
| 32 | #include "os0event.h" |
| 33 | #include "fil0fil.h" |
| 34 | #include "btr0btr.h" |
| 35 | #include "buf0buddy.h" |
| 36 | #include "buf0buf.h" |
| 37 | #include "buf0dblwr.h" |
| 38 | #include "buf0flu.h" |
| 39 | #include "buf0rea.h" |
| 40 | #include "btr0sea.h" |
| 41 | #include "ibuf0ibuf.h" |
| 42 | #include "os0file.h" |
| 43 | #include "page0zip.h" |
| 44 | #include "log0recv.h" |
| 45 | #include "srv0srv.h" |
| 46 | #include "srv0mon.h" |
| 47 | #include "lock0lock.h" |
| 48 | |
| 49 | /** The number of blocks from the LRU_old pointer onward, including |
| 50 | the block pointed to, must be buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV |
| 51 | of the whole LRU list length, except that the tolerance defined below |
is allowed. Note that the tolerance must be small enough that even
for an LRU list of the minimum length BUF_LRU_OLD_MIN_LEN, the
LRU_old pointer is not allowed to point to either end of the LRU list. */
| 55 | |
| 56 | static const ulint BUF_LRU_OLD_TOLERANCE = 20; |
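
/* For example (a sketch, assuming the default innodb_old_blocks_pct
of 37, from which buf_pool->LRU_old_ratio is derived): in an LRU list
of 1000 blocks, the LRU_old pointer should sit about 370 blocks from
the tail of the list, give or take BUF_LRU_OLD_TOLERANCE blocks. */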
| 57 | |
| 58 | /** The minimum amount of non-old blocks when the LRU_old list exists |
| 59 | (that is, when there are more than BUF_LRU_OLD_MIN_LEN blocks). |
| 60 | @see buf_LRU_old_adjust_len */ |
| 61 | #define BUF_LRU_NON_OLD_MIN_LEN 5 |
| 62 | |
| 63 | /** When dropping the search hash index entries before deleting an ibd |
| 64 | file, we build a local array of pages belonging to that tablespace |
| 65 | in the buffer pool. Following is the size of that array. |
| 66 | We also release buf_pool->mutex after scanning this many pages of the |
| 67 | flush_list when dropping a table. This is to ensure that other threads |
are not blocked for an extended period of time when using very large
| 69 | buffer pools. */ |
| 70 | static const ulint BUF_LRU_DROP_SEARCH_SIZE = 1024; |
| 71 | |
/** We scan this many blocks when looking for a clean page to evict
| 73 | during LRU eviction. */ |
| 74 | static const ulint BUF_LRU_SEARCH_SCAN_THRESHOLD = 100; |
| 75 | |
| 76 | /** If we switch on the InnoDB monitor because there are too few available |
| 77 | frames in the buffer pool, we set this to TRUE */ |
| 78 | static bool buf_lru_switched_on_innodb_mon = false; |
| 79 | |
/** True if the diagnostic message about difficulty in finding free
blocks in the buffer pool has already been printed. */
| 82 | static bool buf_lru_free_blocks_error_printed; |
| 83 | |
| 84 | /******************************************************************//** |
| 85 | These statistics are not 'of' LRU but 'for' LRU. We keep count of I/O |
| 86 | and page_zip_decompress() operations. Based on the statistics, |
| 87 | buf_LRU_evict_from_unzip_LRU() decides if we want to evict from |
| 88 | unzip_LRU or the regular LRU. From unzip_LRU, we will only evict the |
| 89 | uncompressed frame (meaning we can evict dirty blocks as well). From |
| 90 | the regular LRU, we will evict the entire block (i.e.: both the |
| 91 | uncompressed and compressed data), which must be clean. */ |
| 92 | |
| 93 | /* @{ */ |
| 94 | |
| 95 | /** Number of intervals for which we keep the history of these stats. |
| 96 | Each interval is 1 second, defined by the rate at which |
| 97 | srv_error_monitor_thread() calls buf_LRU_stat_update(). */ |
| 98 | static const ulint BUF_LRU_STAT_N_INTERVAL = 50; |
| 99 | |
/** Coefficient with which we multiply I/O operations to equate them
| 101 | with page_zip_decompress() operations. */ |
| 102 | static const ulint BUF_LRU_IO_TO_UNZIP_FACTOR = 50; |
| 103 | |
/** Sampled values of buf_LRU_stat_cur.
| 105 | Not protected by any mutex. Updated by buf_LRU_stat_update(). */ |
| 106 | static buf_LRU_stat_t buf_LRU_stat_arr[BUF_LRU_STAT_N_INTERVAL]; |
| 107 | |
| 108 | /** Cursor to buf_LRU_stat_arr[] that is updated in a round-robin fashion. */ |
| 109 | static ulint buf_LRU_stat_arr_ind; |
| 110 | |
| 111 | /** Current operation counters. Not protected by any mutex. Cleared |
| 112 | by buf_LRU_stat_update(). */ |
| 113 | buf_LRU_stat_t buf_LRU_stat_cur; |
| 114 | |
| 115 | /** Running sum of past values of buf_LRU_stat_cur. |
Updated by buf_LRU_stat_update(). Not protected by any mutex. */
| 117 | buf_LRU_stat_t buf_LRU_stat_sum; |
| 118 | |
| 119 | /* @} */ |
| 120 | |
| 121 | /** @name Heuristics for detecting index scan @{ */ |
| 122 | /** Move blocks to "new" LRU list only if the first access was at |
| 123 | least this many milliseconds ago. Not protected by any mutex or latch. */ |
| 124 | uint buf_LRU_old_threshold_ms; |
| 125 | /* @} */ |
| 126 | |
| 127 | /******************************************************************//** |
| 128 | Takes a block out of the LRU list and page hash table. |
| 129 | If the block is compressed-only (BUF_BLOCK_ZIP_PAGE), |
| 130 | the object will be freed. |
| 131 | |
| 132 | The caller must hold buf_pool->mutex, the buf_page_get_mutex() mutex |
| 133 | and the appropriate hash_lock. This function will release the |
| 134 | buf_page_get_mutex() and the hash_lock. |
| 135 | |
If a compressed page is freed, other compressed pages may be relocated.
| 137 | @retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The |
| 138 | caller needs to free the page to the free list |
| 139 | @retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In |
| 140 | this case the block is already returned to the buddy allocator. */ |
| 141 | static MY_ATTRIBUTE((warn_unused_result)) |
| 142 | bool |
| 143 | buf_LRU_block_remove_hashed( |
| 144 | /*========================*/ |
| 145 | buf_page_t* bpage, /*!< in: block, must contain a file page and |
| 146 | be in a state where it can be freed; there |
| 147 | may or may not be a hash index to the page */ |
| 148 | bool zip); /*!< in: true if should remove also the |
| 149 | compressed page of an uncompressed page */ |
| 150 | /******************************************************************//** |
Puts a file page that has no hash index to the free list. */
| 152 | static |
| 153 | void |
| 154 | buf_LRU_block_free_hashed_page( |
| 155 | /*===========================*/ |
| 156 | buf_block_t* block); /*!< in: block, must contain a file page and |
| 157 | be in a state where it can be freed */ |
| 158 | |
| 159 | /******************************************************************//** |
Increases the LRU size in bytes by the page size. */
| 161 | static inline |
| 162 | void |
| 163 | incr_LRU_size_in_bytes( |
| 164 | /*===================*/ |
| 165 | buf_page_t* bpage, /*!< in: control block */ |
| 166 | buf_pool_t* buf_pool) /*!< in: buffer pool instance */ |
| 167 | { |
| 168 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 169 | |
| 170 | buf_pool->stat.LRU_bytes += bpage->size.physical(); |
| 171 | |
| 172 | ut_ad(buf_pool->stat.LRU_bytes <= buf_pool->curr_pool_size); |
| 173 | } |
| 174 | |
| 175 | /******************************************************************//** |
| 176 | Determines if the unzip_LRU list should be used for evicting a victim |
| 177 | instead of the general LRU list. |
| 178 | @return TRUE if should use unzip_LRU */ |
| 179 | ibool |
| 180 | buf_LRU_evict_from_unzip_LRU( |
| 181 | /*=========================*/ |
| 182 | buf_pool_t* buf_pool) |
| 183 | { |
| 184 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 185 | |
| 186 | /* If the unzip_LRU list is empty, we can only use the LRU. */ |
| 187 | if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0) { |
| 188 | return(FALSE); |
| 189 | } |
| 190 | |
| 191 | /* If unzip_LRU is at most 10% of the size of the LRU list, |
| 192 | then use the LRU. This slack allows us to keep hot |
| 193 | decompressed pages in the buffer pool. */ |
| 194 | if (UT_LIST_GET_LEN(buf_pool->unzip_LRU) |
| 195 | <= UT_LIST_GET_LEN(buf_pool->LRU) / 10) { |
| 196 | return(FALSE); |
| 197 | } |
| 198 | |
	/* If eviction hasn't started yet, we assume by default
	that the workload is disk bound. */
| 201 | if (buf_pool->freed_page_clock == 0) { |
| 202 | return(TRUE); |
| 203 | } |
| 204 | |
| 205 | /* Calculate the average over past intervals, and add the values |
| 206 | of the current interval. */ |
| 207 | ulint io_avg = buf_LRU_stat_sum.io / BUF_LRU_STAT_N_INTERVAL |
| 208 | + buf_LRU_stat_cur.io; |
| 209 | |
| 210 | ulint unzip_avg = buf_LRU_stat_sum.unzip / BUF_LRU_STAT_N_INTERVAL |
| 211 | + buf_LRU_stat_cur.unzip; |
| 212 | |
| 213 | /* Decide based on our formula. If the load is I/O bound |
| 214 | (unzip_avg is smaller than the weighted io_avg), evict an |
| 215 | uncompressed frame from unzip_LRU. Otherwise we assume that |
| 216 | the load is CPU bound and evict from the regular LRU. */ |
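
	/* A numeric sketch of this formula, using the constants above:
	with BUF_LRU_IO_TO_UNZIP_FACTOR = 50, io_avg = 10 and
	unzip_avg = 400 satisfy 400 <= 10 * 50, so the load is treated
	as I/O bound and we evict from unzip_LRU; unzip_avg = 600 would
	exceed 500, and we would evict from the regular LRU instead. */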
| 217 | return(unzip_avg <= io_avg * BUF_LRU_IO_TO_UNZIP_FACTOR); |
| 218 | } |
| 219 | |
| 220 | #ifdef BTR_CUR_HASH_ADAPT |
| 221 | /** Attempts to drop page hash index on a batch of pages belonging to a |
| 222 | particular space id. |
| 223 | @param[in] space_id space id |
| 224 | @param[in] arr array of page_no |
| 225 | @param[in] count number of entries in array */ |
| 226 | static |
| 227 | void |
| 228 | buf_LRU_drop_page_hash_batch( |
| 229 | ulint space_id, |
| 230 | const ulint* arr, |
| 231 | ulint count) |
| 232 | { |
| 233 | ut_ad(count <= BUF_LRU_DROP_SEARCH_SIZE); |
| 234 | |
| 235 | for (ulint i = 0; i < count; ++i, ++arr) { |
| 236 | /* While our only caller |
| 237 | buf_LRU_drop_page_hash_for_tablespace() |
| 238 | is being executed for DROP TABLE or similar, |
| 239 | the table cannot be evicted from the buffer pool. |
| 240 | Note: this should not be executed for DROP TABLESPACE, |
| 241 | because DROP TABLESPACE would be refused if tables existed |
| 242 | in the tablespace, and a previous DROP TABLE would have |
| 243 | already removed the AHI entries. */ |
| 244 | btr_search_drop_page_hash_when_freed( |
| 245 | page_id_t(space_id, *arr)); |
| 246 | } |
| 247 | } |
| 248 | |
| 249 | /******************************************************************//** |
| 250 | When doing a DROP TABLE/DISCARD TABLESPACE we have to drop all page |
| 251 | hash index entries belonging to that table. This function tries to |
| 252 | do that in batch. Note that this is a 'best effort' attempt and does |
| 253 | not guarantee that ALL hash entries will be removed. */ |
| 254 | static |
| 255 | void |
| 256 | buf_LRU_drop_page_hash_for_tablespace( |
| 257 | /*==================================*/ |
| 258 | buf_pool_t* buf_pool, /*!< in: buffer pool instance */ |
| 259 | ulint id) /*!< in: space id */ |
| 260 | { |
| 261 | ulint* page_arr = static_cast<ulint*>(ut_malloc_nokey( |
| 262 | sizeof(ulint) * BUF_LRU_DROP_SEARCH_SIZE)); |
| 263 | |
| 264 | ulint num_entries = 0; |
| 265 | |
| 266 | buf_pool_mutex_enter(buf_pool); |
| 267 | |
| 268 | scan_again: |
| 269 | for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->LRU); |
| 270 | bpage != NULL; |
| 271 | /* No op */) { |
| 272 | |
| 273 | buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, bpage); |
| 274 | |
| 275 | ut_a(buf_page_in_file(bpage)); |
| 276 | |
| 277 | if (buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE |
| 278 | || bpage->id.space() != id |
| 279 | || bpage->io_fix != BUF_IO_NONE) { |
| 280 | /* Compressed pages are never hashed. |
| 281 | Skip blocks of other tablespaces. |
| 282 | Skip I/O-fixed blocks (to be dealt with later). */ |
| 283 | next_page: |
| 284 | bpage = prev_bpage; |
| 285 | continue; |
| 286 | } |
| 287 | |
| 288 | buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage); |
| 289 | |
| 290 | mutex_enter(&block->mutex); |
| 291 | |
| 292 | /* This debug check uses a dirty read that could |
| 293 | theoretically cause false positives while |
| 294 | buf_pool_clear_hash_index() is executing. |
| 295 | (Other conflicting access paths to the adaptive hash |
| 296 | index should not be possible, because when a |
| 297 | tablespace is being discarded or dropped, there must |
	be no concurrent access to the contained tables.) */
| 299 | assert_block_ahi_valid(block); |
| 300 | |
| 301 | bool skip = bpage->buf_fix_count > 0 || !block->index; |
| 302 | |
| 303 | mutex_exit(&block->mutex); |
| 304 | |
| 305 | if (skip) { |
| 306 | /* Skip this block, because there are |
| 307 | no adaptive hash index entries |
| 308 | pointing to it, or because we cannot |
| 309 | drop them due to the buffer-fix. */ |
| 310 | goto next_page; |
| 311 | } |
| 312 | |
| 313 | /* Store the page number so that we can drop the hash |
| 314 | index in a batch later. */ |
| 315 | page_arr[num_entries] = bpage->id.page_no(); |
| 316 | ut_a(num_entries < BUF_LRU_DROP_SEARCH_SIZE); |
| 317 | ++num_entries; |
| 318 | |
| 319 | if (num_entries < BUF_LRU_DROP_SEARCH_SIZE) { |
| 320 | goto next_page; |
| 321 | } |
| 322 | |
| 323 | /* Array full. We release the buf_pool->mutex to obey |
| 324 | the latching order. */ |
| 325 | buf_pool_mutex_exit(buf_pool); |
| 326 | |
| 327 | buf_LRU_drop_page_hash_batch(id, page_arr, num_entries); |
| 328 | |
| 329 | num_entries = 0; |
| 330 | |
| 331 | buf_pool_mutex_enter(buf_pool); |
| 332 | |
| 333 | /* Note that we released the buf_pool mutex above |
| 334 | after reading the prev_bpage during processing of a |
| 335 | page_hash_batch (i.e.: when the array was full). |
| 336 | Because prev_bpage could belong to a compressed-only |
| 337 | block, it may have been relocated, and thus the |
| 338 | pointer cannot be trusted. Because bpage is of type |
| 339 | buf_block_t, it is safe to dereference. |
| 340 | |
| 341 | bpage can change in the LRU list. This is OK because |
| 342 | this function is a 'best effort' to drop as many |
| 343 | search hash entries as possible and it does not |
| 344 | guarantee that ALL such entries will be dropped. */ |
| 345 | |
| 346 | /* If, however, bpage has been removed from LRU list |
| 347 | to the free list then we should restart the scan. |
| 348 | bpage->state is protected by buf_pool mutex. */ |
| 349 | if (bpage != NULL |
| 350 | && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { |
| 351 | |
| 352 | goto scan_again; |
| 353 | } |
| 354 | } |
| 355 | |
| 356 | buf_pool_mutex_exit(buf_pool); |
| 357 | |
| 358 | /* Drop any remaining batch of search hashed pages. */ |
| 359 | buf_LRU_drop_page_hash_batch(id, page_arr, num_entries); |
| 360 | ut_free(page_arr); |
| 361 | } |
| 362 | #endif /* BTR_CUR_HASH_ADAPT */ |
| 363 | |
| 364 | /******************************************************************//** |
| 365 | While flushing (or removing dirty) pages from a tablespace we don't |
| 366 | want to hog the CPU and resources. Release the buffer pool and block |
| 367 | mutex and try to force a context switch. Then reacquire the same mutexes. |
| 368 | The current page is "fixed" before the release of the mutexes and then |
| 369 | "unfixed" again once we have reacquired the mutexes. */ |
| 370 | static |
| 371 | void |
| 372 | buf_flush_yield( |
| 373 | /*============*/ |
| 374 | buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ |
| 375 | buf_page_t* bpage) /*!< in/out: current page */ |
| 376 | { |
| 377 | BPageMutex* block_mutex; |
| 378 | |
| 379 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 380 | ut_ad(buf_page_in_file(bpage)); |
| 381 | |
| 382 | block_mutex = buf_page_get_mutex(bpage); |
| 383 | |
| 384 | mutex_enter(block_mutex); |
| 385 | |
| 386 | /* "Fix" the block so that the position cannot be |
| 387 | changed after we release the buffer pool and |
| 388 | block mutexes. */ |
| 389 | buf_page_set_sticky(bpage); |
| 390 | |
| 391 | /* Now it is safe to release the buf_pool->mutex. */ |
| 392 | buf_pool_mutex_exit(buf_pool); |
| 393 | |
| 394 | mutex_exit(block_mutex); |
| 395 | /* Try and force a context switch. */ |
| 396 | os_thread_yield(); |
| 397 | |
| 398 | buf_pool_mutex_enter(buf_pool); |
| 399 | |
| 400 | mutex_enter(block_mutex); |
| 401 | |
| 402 | /* "Unfix" the block now that we have both the |
| 403 | buffer pool and block mutex again. */ |
| 404 | buf_page_unset_sticky(bpage); |
| 405 | mutex_exit(block_mutex); |
| 406 | } |
| 407 | |
| 408 | /******************************************************************//** |
| 409 | If we have hogged the resources for too long then release the buffer |
| 410 | pool and flush list mutex and do a thread yield. Set the current page |
| 411 | to "sticky" so that it is not relocated during the yield. |
| 412 | @return true if yielded */ |
| 413 | static MY_ATTRIBUTE((warn_unused_result)) |
| 414 | bool |
| 415 | buf_flush_try_yield( |
| 416 | /*================*/ |
| 417 | buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ |
| 418 | buf_page_t* bpage, /*!< in/out: bpage to remove */ |
| 419 | ulint processed) /*!< in: number of pages processed */ |
| 420 | { |
	/* Every BUF_LRU_DROP_SEARCH_SIZE iterations of the
	loop we release buf_pool->mutex to let other threads
	do their job, but only if the block is not I/O-fixed.
	This ensures that the block stays in its position in
	the flush_list. */
| 426 | |
| 427 | if (bpage != NULL |
| 428 | && processed >= BUF_LRU_DROP_SEARCH_SIZE |
| 429 | && buf_page_get_io_fix(bpage) == BUF_IO_NONE) { |
| 430 | |
| 431 | buf_flush_list_mutex_exit(buf_pool); |
| 432 | |
| 433 | /* Release the buffer pool and block mutex |
| 434 | to give the other threads a go. */ |
| 435 | |
| 436 | buf_flush_yield(buf_pool, bpage); |
| 437 | |
| 438 | buf_flush_list_mutex_enter(buf_pool); |
| 439 | |
| 440 | /* Should not have been removed from the flush |
| 441 | list during the yield. However, this check is |
| 442 | not sufficient to catch a remove -> add. */ |
| 443 | |
| 444 | ut_ad(bpage->in_flush_list); |
| 445 | |
| 446 | return(true); |
| 447 | } |
| 448 | |
| 449 | return(false); |
| 450 | } |
| 451 | |
| 452 | /******************************************************************//** |
| 453 | Removes a single page from a given tablespace inside a specific |
| 454 | buffer pool instance. |
| 455 | @return true if page was removed. */ |
| 456 | static MY_ATTRIBUTE((warn_unused_result)) |
| 457 | bool |
| 458 | buf_flush_or_remove_page( |
| 459 | /*=====================*/ |
| 460 | buf_pool_t* buf_pool, /*!< in/out: buffer pool instance */ |
| 461 | buf_page_t* bpage, /*!< in/out: bpage to remove */ |
	bool		flush)		/*!< in: if true, flush to disk
					but do not remove; otherwise
					remove without flushing to disk */
| 465 | { |
| 466 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 467 | ut_ad(buf_flush_list_mutex_own(buf_pool)); |
| 468 | |
| 469 | /* bpage->space and bpage->io_fix are protected by |
| 470 | buf_pool->mutex and block_mutex. It is safe to check |
| 471 | them while holding buf_pool->mutex only. */ |
| 472 | |
| 473 | if (buf_page_get_io_fix(bpage) != BUF_IO_NONE) { |
| 474 | |
| 475 | /* We cannot remove this page during this scan |
| 476 | yet; maybe the system is currently reading it |
| 477 | in, or flushing the modifications to the file */ |
| 478 | return(false); |
| 479 | |
| 480 | } |
| 481 | |
| 482 | BPageMutex* block_mutex; |
| 483 | bool processed = false; |
| 484 | |
| 485 | block_mutex = buf_page_get_mutex(bpage); |
| 486 | |
| 487 | /* We have to release the flush_list_mutex to obey the |
| 488 | latching order. We are however guaranteed that the page |
| 489 | will stay in the flush_list and won't be relocated because |
| 490 | buf_flush_remove() and buf_flush_relocate_on_flush_list() |
| 491 | need buf_pool->mutex as well. */ |
| 492 | |
| 493 | buf_flush_list_mutex_exit(buf_pool); |
| 494 | |
| 495 | mutex_enter(block_mutex); |
| 496 | |
| 497 | ut_ad(bpage->oldest_modification != 0); |
| 498 | |
| 499 | if (!flush) { |
| 500 | |
| 501 | buf_flush_remove(bpage); |
| 502 | |
| 503 | mutex_exit(block_mutex); |
| 504 | |
| 505 | processed = true; |
| 506 | |
| 507 | } else if (buf_flush_ready_for_flush(bpage, BUF_FLUSH_SINGLE_PAGE)) { |
| 508 | |
| 509 | /* The following call will release the buffer pool |
| 510 | and block mutex. */ |
| 511 | processed = buf_flush_page( |
| 512 | buf_pool, bpage, BUF_FLUSH_SINGLE_PAGE, false); |
| 513 | |
| 514 | if (processed) { |
| 515 | /* Wake possible simulated aio thread to actually |
| 516 | post the writes to the operating system */ |
| 517 | os_aio_simulated_wake_handler_threads(); |
| 518 | buf_pool_mutex_enter(buf_pool); |
| 519 | } else { |
| 520 | mutex_exit(block_mutex); |
| 521 | } |
| 522 | } else { |
| 523 | mutex_exit(block_mutex); |
| 524 | } |
| 525 | |
| 526 | buf_flush_list_mutex_enter(buf_pool); |
| 527 | |
| 528 | ut_ad(!mutex_own(block_mutex)); |
| 529 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 530 | |
| 531 | return(processed); |
| 532 | } |
| 533 | |
| 534 | /** Remove all dirty pages belonging to a given tablespace inside a specific |
| 535 | buffer pool instance when we are deleting the data file(s) of that |
| 536 | tablespace. The pages still remain a part of LRU and are evicted from |
| 537 | the list as they age towards the tail of the LRU. |
| 538 | @param[in,out] buf_pool buffer pool |
| 539 | @param[in] id tablespace identifier |
| 540 | @param[in] observer flush observer (to check for interrupt), |
| 541 | or NULL if the files should not be written to |
| 542 | @return whether all dirty pages were freed */ |
| 543 | static MY_ATTRIBUTE((warn_unused_result)) |
| 544 | bool |
| 545 | buf_flush_or_remove_pages( |
| 546 | buf_pool_t* buf_pool, |
| 547 | ulint id, |
| 548 | FlushObserver* observer) |
| 549 | { |
| 550 | buf_page_t* prev; |
| 551 | buf_page_t* bpage; |
| 552 | ulint processed = 0; |
| 553 | |
| 554 | buf_flush_list_mutex_enter(buf_pool); |
| 555 | |
| 556 | rescan: |
| 557 | bool all_freed = true; |
| 558 | |
| 559 | for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list); |
| 560 | bpage != NULL; |
| 561 | bpage = prev) { |
| 562 | |
| 563 | ut_a(buf_page_in_file(bpage)); |
| 564 | |
| 565 | /* Save the previous link because once we free the |
| 566 | page we can't rely on the links. */ |
| 567 | |
| 568 | prev = UT_LIST_GET_PREV(list, bpage); |
| 569 | |
| 570 | /* Flush the pages matching space id, |
| 571 | or pages matching the flush observer. */ |
| 572 | if (observer && observer->is_partial_flush()) { |
| 573 | if (observer != bpage->flush_observer) { |
| 574 | /* Skip this block. */ |
| 575 | } else if (!buf_flush_or_remove_page( |
| 576 | buf_pool, bpage, |
| 577 | !observer->is_interrupted())) { |
| 578 | all_freed = false; |
| 579 | } else if (!observer->is_interrupted()) { |
| 580 | /* The processing was successful. And during the |
| 581 | processing we have released the buf_pool mutex |
			when calling buf_flush_page(). We cannot trust the
| 583 | prev pointer. */ |
| 584 | goto rescan; |
| 585 | } |
| 586 | } else if (id != bpage->id.space()) { |
| 587 | /* Skip this block, because it is for a |
| 588 | different tablespace. */ |
| 589 | } else if (!buf_flush_or_remove_page( |
| 590 | buf_pool, bpage, observer != NULL)) { |
| 591 | |
| 592 | /* Remove was unsuccessful, we have to try again |
| 593 | by scanning the entire list from the end. |
| 594 | This also means that we never released the |
| 595 | buf_pool mutex. Therefore we can trust the prev |
| 596 | pointer. |
| 597 | buf_flush_or_remove_page() released the |
| 598 | flush list mutex but not the buf_pool mutex. |
| 599 | Therefore it is possible that a new page was |
| 600 | added to the flush list. For example, in case |
| 601 | where we are at the head of the flush list and |
| 602 | prev == NULL. That is OK because we have the |
| 603 | tablespace quiesced and no new pages for this |
| 604 | space-id should enter flush_list. This is |
| 605 | because the only callers of this function are |
			DROP TABLE and FLUSH TABLES ... FOR EXPORT.
| 607 | We know that we'll have to do at least one more |
| 608 | scan but we don't break out of loop here and |
| 609 | try to do as much work as we can in this |
| 610 | iteration. */ |
| 611 | |
| 612 | all_freed = false; |
| 613 | } else if (observer) { |
| 614 | |
| 615 | /* The processing was successful. And during the |
| 616 | processing we have released the buf_pool mutex |
			when calling buf_flush_page(). We cannot trust the
| 618 | prev pointer. */ |
| 619 | goto rescan; |
| 620 | } |
| 621 | |
| 622 | ++processed; |
| 623 | |
| 624 | /* Yield if we have hogged the CPU and mutexes for too long. */ |
| 625 | if (buf_flush_try_yield(buf_pool, prev, processed)) { |
| 626 | |
| 627 | /* Reset the batch size counter if we had to yield. */ |
| 628 | |
| 629 | processed = 0; |
| 630 | } |
| 631 | |
		/* The check whether the trx is interrupted is expensive,
		so we only want to perform it every N iterations. */
| 634 | if (!processed && observer) { |
| 635 | observer->check_interrupted(); |
| 636 | } |
| 637 | } |
| 638 | |
| 639 | buf_flush_list_mutex_exit(buf_pool); |
| 640 | |
| 641 | return(all_freed); |
| 642 | } |
| 643 | |
| 644 | /** Remove or flush all the dirty pages that belong to a given tablespace |
| 645 | inside a specific buffer pool instance. The pages will remain in the LRU |
| 646 | list and will be evicted from the LRU list as they age and move towards |
| 647 | the tail of the LRU list. |
| 648 | @param[in,out] buf_pool buffer pool |
| 649 | @param[in] id tablespace identifier |
| 650 | @param[in] observer flush observer, |
| 651 | or NULL if the files should not be written to |
| 652 | */ |
| 653 | static |
| 654 | void |
| 655 | buf_flush_dirty_pages( |
| 656 | buf_pool_t* buf_pool, |
| 657 | ulint id, |
| 658 | FlushObserver* observer) |
| 659 | { |
| 660 | for (;;) { |
| 661 | buf_pool_mutex_enter(buf_pool); |
| 662 | |
| 663 | bool freed = buf_flush_or_remove_pages(buf_pool, id, observer); |
| 664 | |
| 665 | buf_pool_mutex_exit(buf_pool); |
| 666 | |
| 667 | ut_ad(buf_flush_validate(buf_pool)); |
| 668 | |
| 669 | if (freed) { |
| 670 | break; |
| 671 | } |
| 672 | |
| 673 | os_thread_sleep(2000); |
| 674 | ut_ad(buf_flush_validate(buf_pool)); |
| 675 | } |
| 676 | |
| 677 | ut_ad((observer && observer->is_interrupted()) |
| 678 | || buf_pool_get_dirty_pages_count(buf_pool, id, observer) == 0); |
| 679 | } |
| 680 | |
| 681 | /** Empty the flush list for all pages belonging to a tablespace. |
| 682 | @param[in] id tablespace identifier |
| 683 | @param[in] observer flush observer, |
| 684 | or NULL if nothing is to be written */ |
| 685 | void |
| 686 | buf_LRU_flush_or_remove_pages( |
| 687 | ulint id, |
| 688 | FlushObserver* observer |
| 689 | #ifdef BTR_CUR_HASH_ADAPT |
| 690 | , bool drop_ahi /*!< whether to drop the adaptive hash index */ |
| 691 | #endif /* BTR_CUR_HASH_ADAPT */ |
| 692 | ) |
| 693 | { |
| 694 | /* Pages in the system tablespace must never be discarded. */ |
| 695 | ut_ad(id || observer); |
| 696 | |
| 697 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
| 698 | buf_pool_t* buf_pool = buf_pool_from_array(i); |
| 699 | #ifdef BTR_CUR_HASH_ADAPT |
| 700 | if (drop_ahi) { |
| 701 | buf_LRU_drop_page_hash_for_tablespace(buf_pool, id); |
| 702 | } |
| 703 | #endif /* BTR_CUR_HASH_ADAPT */ |
| 704 | buf_flush_dirty_pages(buf_pool, id, observer); |
| 705 | } |
| 706 | |
| 707 | if (observer && !observer->is_interrupted()) { |
| 708 | /* Ensure that all asynchronous IO is completed. */ |
| 709 | os_aio_wait_until_no_pending_writes(); |
| 710 | fil_flush(id); |
| 711 | } |
| 712 | } |
| 713 | |
| 714 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
| 715 | /********************************************************************//** |
| 716 | Insert a compressed block into buf_pool->zip_clean in the LRU order. */ |
| 717 | void |
| 718 | buf_LRU_insert_zip_clean( |
| 719 | /*=====================*/ |
| 720 | buf_page_t* bpage) /*!< in: pointer to the block in question */ |
| 721 | { |
| 722 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 723 | |
| 724 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 725 | ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE); |
| 726 | |
| 727 | /* Find the first successor of bpage in the LRU list |
| 728 | that is in the zip_clean list. */ |
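
	/* For example (a sketch): if the blocks after bpage in the LRU
	list are B (a BUF_BLOCK_FILE_PAGE) followed by C (a
	BUF_BLOCK_ZIP_PAGE), the loop below skips B, stops at C, and
	bpage ends up in zip_clean immediately before C, that is,
	inserted after C's predecessor in the zip_clean list. */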
| 729 | buf_page_t* b = bpage; |
| 730 | |
| 731 | do { |
| 732 | b = UT_LIST_GET_NEXT(LRU, b); |
| 733 | } while (b && buf_page_get_state(b) != BUF_BLOCK_ZIP_PAGE); |
| 734 | |
| 735 | /* Insert bpage before b, i.e., after the predecessor of b. */ |
| 736 | if (b != NULL) { |
| 737 | b = UT_LIST_GET_PREV(list, b); |
| 738 | } |
| 739 | |
| 740 | if (b != NULL) { |
| 741 | UT_LIST_INSERT_AFTER(buf_pool->zip_clean, b, bpage); |
| 742 | } else { |
| 743 | UT_LIST_ADD_FIRST(buf_pool->zip_clean, bpage); |
| 744 | } |
| 745 | } |
| 746 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
| 747 | |
| 748 | /******************************************************************//** |
| 749 | Try to free an uncompressed page of a compressed block from the unzip |
| 750 | LRU list. The compressed page is preserved, and it need not be clean. |
| 751 | @return true if freed */ |
| 752 | static |
| 753 | bool |
| 754 | buf_LRU_free_from_unzip_LRU_list( |
| 755 | /*=============================*/ |
| 756 | buf_pool_t* buf_pool, /*!< in: buffer pool instance */ |
| 757 | bool scan_all) /*!< in: scan whole LRU list |
| 758 | if true, otherwise scan only |
					up to srv_LRU_scan_depth blocks. */
| 760 | { |
| 761 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 762 | |
| 763 | if (!buf_LRU_evict_from_unzip_LRU(buf_pool)) { |
| 764 | return(false); |
| 765 | } |
| 766 | |
| 767 | ulint scanned = 0; |
| 768 | bool freed = false; |
| 769 | |
| 770 | for (buf_block_t* block = UT_LIST_GET_LAST(buf_pool->unzip_LRU); |
| 771 | block != NULL |
| 772 | && !freed |
| 773 | && (scan_all || scanned < srv_LRU_scan_depth); |
| 774 | ++scanned) { |
| 775 | |
| 776 | buf_block_t* prev_block; |
| 777 | |
| 778 | prev_block = UT_LIST_GET_PREV(unzip_LRU, block); |
| 779 | |
| 780 | ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE); |
| 781 | ut_ad(block->in_unzip_LRU_list); |
| 782 | ut_ad(block->page.in_LRU_list); |
| 783 | |
| 784 | freed = buf_LRU_free_page(&block->page, false); |
| 785 | |
| 786 | block = prev_block; |
| 787 | } |
| 788 | |
| 789 | if (scanned) { |
| 790 | MONITOR_INC_VALUE_CUMULATIVE( |
| 791 | MONITOR_LRU_UNZIP_SEARCH_SCANNED, |
| 792 | MONITOR_LRU_UNZIP_SEARCH_SCANNED_NUM_CALL, |
| 793 | MONITOR_LRU_UNZIP_SEARCH_SCANNED_PER_CALL, |
| 794 | scanned); |
| 795 | } |
| 796 | |
| 797 | return(freed); |
| 798 | } |
| 799 | |
| 800 | /******************************************************************//** |
| 801 | Try to free a clean page from the common LRU list. |
| 802 | @return true if freed */ |
| 803 | static |
| 804 | bool |
| 805 | buf_LRU_free_from_common_LRU_list( |
| 806 | /*==============================*/ |
| 807 | buf_pool_t* buf_pool, /*!< in: buffer pool instance */ |
| 808 | bool scan_all) /*!< in: scan whole LRU list |
| 809 | if true, otherwise scan only |
| 810 | up to BUF_LRU_SEARCH_SCAN_THRESHOLD */ |
| 811 | { |
| 812 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 813 | |
| 814 | ulint scanned = 0; |
| 815 | bool freed = false; |
| 816 | |
| 817 | for (buf_page_t* bpage = buf_pool->lru_scan_itr.start(); |
| 818 | bpage != NULL |
| 819 | && !freed |
| 820 | && (scan_all || scanned < BUF_LRU_SEARCH_SCAN_THRESHOLD); |
| 821 | ++scanned, bpage = buf_pool->lru_scan_itr.get()) { |
| 822 | |
| 823 | buf_page_t* prev = UT_LIST_GET_PREV(LRU, bpage); |
| 824 | BPageMutex* mutex = buf_page_get_mutex(bpage); |
| 825 | |
| 826 | buf_pool->lru_scan_itr.set(prev); |
| 827 | |
| 828 | mutex_enter(mutex); |
| 829 | |
| 830 | ut_ad(buf_page_in_file(bpage)); |
| 831 | ut_ad(bpage->in_LRU_list); |
| 832 | |
| 833 | unsigned accessed = buf_page_is_accessed(bpage); |
| 834 | |
| 835 | if (buf_flush_ready_for_replace(bpage)) { |
| 836 | mutex_exit(mutex); |
| 837 | freed = buf_LRU_free_page(bpage, true); |
| 838 | } else { |
| 839 | mutex_exit(mutex); |
| 840 | } |
| 841 | |
| 842 | if (freed && !accessed) { |
| 843 | /* Keep track of pages that are evicted without |
| 844 | ever being accessed. This gives us a measure of |
| 845 | the effectiveness of readahead */ |
| 846 | ++buf_pool->stat.n_ra_pages_evicted; |
| 847 | } |
| 848 | |
| 849 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 850 | ut_ad(!mutex_own(mutex)); |
| 851 | } |
| 852 | |
| 853 | if (scanned) { |
| 854 | MONITOR_INC_VALUE_CUMULATIVE( |
| 855 | MONITOR_LRU_SEARCH_SCANNED, |
| 856 | MONITOR_LRU_SEARCH_SCANNED_NUM_CALL, |
| 857 | MONITOR_LRU_SEARCH_SCANNED_PER_CALL, |
| 858 | scanned); |
| 859 | } |
| 860 | |
| 861 | return(freed); |
| 862 | } |
| 863 | |
| 864 | /******************************************************************//** |
| 865 | Try to free a replaceable block. |
| 866 | @return true if found and freed */ |
| 867 | bool |
| 868 | buf_LRU_scan_and_free_block( |
| 869 | /*========================*/ |
| 870 | buf_pool_t* buf_pool, /*!< in: buffer pool instance */ |
| 871 | bool scan_all) /*!< in: scan whole LRU list |
| 872 | if true, otherwise scan only |
| 873 | BUF_LRU_SEARCH_SCAN_THRESHOLD |
| 874 | blocks. */ |
| 875 | { |
| 876 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 877 | |
| 878 | return(buf_LRU_free_from_unzip_LRU_list(buf_pool, scan_all) |
| 879 | || buf_LRU_free_from_common_LRU_list(buf_pool, scan_all)); |
| 880 | } |
| 881 | |
| 882 | /******************************************************************//** |
| 883 | Returns TRUE if less than 25 % of the buffer pool in any instance is |
| 884 | available. This can be used in heuristics to prevent huge transactions |
| 885 | eating up the whole buffer pool for their locks. |
| 886 | @return TRUE if less than 25 % of buffer pool left */ |
| 887 | ibool |
| 888 | buf_LRU_buf_pool_running_out(void) |
| 889 | /*==============================*/ |
| 890 | { |
| 891 | ibool ret = FALSE; |
| 892 | |
| 893 | for (ulint i = 0; i < srv_buf_pool_instances && !ret; i++) { |
| 894 | buf_pool_t* buf_pool; |
| 895 | |
| 896 | buf_pool = buf_pool_from_array(i); |
| 897 | |
| 898 | buf_pool_mutex_enter(buf_pool); |
| 899 | |
| 900 | if (!recv_recovery_is_on() |
| 901 | && UT_LIST_GET_LEN(buf_pool->free) |
| 902 | + UT_LIST_GET_LEN(buf_pool->LRU) |
| 903 | < ut_min(buf_pool->curr_size, |
| 904 | buf_pool->old_size) / 4) { |
| 905 | |
| 906 | ret = TRUE; |
| 907 | } |
| 908 | |
| 909 | buf_pool_mutex_exit(buf_pool); |
| 910 | } |
| 911 | |
| 912 | return(ret); |
| 913 | } |
| 914 | |
| 915 | /******************************************************************//** |
Returns a free block from the buf_pool. The block is taken off the
free list. If the free list is empty, returns NULL.
@return a free control block, or NULL if the buf_pool->free list is empty */
| 919 | buf_block_t* |
| 920 | buf_LRU_get_free_only( |
| 921 | /*==================*/ |
| 922 | buf_pool_t* buf_pool) |
| 923 | { |
| 924 | buf_block_t* block; |
| 925 | |
| 926 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 927 | |
| 928 | block = reinterpret_cast<buf_block_t*>( |
| 929 | UT_LIST_GET_FIRST(buf_pool->free)); |
| 930 | |
| 931 | while (block != NULL) { |
| 932 | |
| 933 | ut_ad(block->page.in_free_list); |
| 934 | ut_d(block->page.in_free_list = FALSE); |
| 935 | ut_ad(!block->page.in_flush_list); |
| 936 | ut_ad(!block->page.in_LRU_list); |
| 937 | ut_a(!buf_page_in_file(&block->page)); |
| 938 | UT_LIST_REMOVE(buf_pool->free, &block->page); |
| 939 | |
| 940 | if (buf_pool->curr_size >= buf_pool->old_size |
| 941 | || UT_LIST_GET_LEN(buf_pool->withdraw) |
| 942 | >= buf_pool->withdraw_target |
| 943 | || !buf_block_will_withdrawn(buf_pool, block)) { |
| 944 | /* found valid free block */ |
| 945 | buf_page_mutex_enter(block); |
| 946 | /* No adaptive hash index entries may point to |
| 947 | a free block. */ |
| 948 | assert_block_ahi_empty(block); |
| 949 | |
| 950 | buf_block_set_state(block, BUF_BLOCK_READY_FOR_USE); |
| 951 | UNIV_MEM_ALLOC(block->frame, srv_page_size); |
| 952 | |
| 953 | ut_ad(buf_pool_from_block(block) == buf_pool); |
| 954 | |
| 955 | buf_page_mutex_exit(block); |
| 956 | break; |
| 957 | } |
| 958 | |
| 959 | /* This should be withdrawn */ |
| 960 | UT_LIST_ADD_LAST( |
| 961 | buf_pool->withdraw, |
| 962 | &block->page); |
| 963 | ut_d(block->in_withdraw_list = TRUE); |
| 964 | |
| 965 | block = reinterpret_cast<buf_block_t*>( |
| 966 | UT_LIST_GET_FIRST(buf_pool->free)); |
| 967 | } |
| 968 | |
| 969 | return(block); |
| 970 | } |
| 971 | |
| 972 | /******************************************************************//** |
| 973 | Checks how much of buf_pool is occupied by non-data objects like |
| 974 | AHI, lock heaps etc. Depending on the size of non-data objects this |
function will either report a fatal error or issue a warning and
switch on the status monitor. */
| 977 | static |
| 978 | void |
| 979 | buf_LRU_check_size_of_non_data_objects( |
| 980 | /*===================================*/ |
| 981 | const buf_pool_t* buf_pool) /*!< in: buffer pool instance */ |
| 982 | { |
| 983 | ut_ad(buf_pool_mutex_own(buf_pool)); |
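
	/* Note on the MiB figures reported below: buf_pool->curr_size
	is a count of pages, so shifting it right by
	(20U - srv_page_size_shift) converts pages to MiB. For example,
	with 16KiB pages (srv_page_size_shift = 14) the shift is 6,
	i.e. division by the 64 pages that make up one MiB. */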
| 984 | |
| 985 | if (!recv_recovery_is_on() |
| 986 | && buf_pool->curr_size == buf_pool->old_size |
| 987 | && UT_LIST_GET_LEN(buf_pool->free) |
| 988 | + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->curr_size / 20) { |
| 989 | |
| 990 | ib::fatal() << "Over 95 percent of the buffer pool is" |
| 991 | " occupied by lock heaps" |
| 992 | #ifdef BTR_CUR_HASH_ADAPT |
| 993 | " or the adaptive hash index!" |
| 994 | #endif /* BTR_CUR_HASH_ADAPT */ |
| 995 | " Check that your transactions do not set too many" |
| 996 | " row locks, or review if" |
| 997 | " innodb_buffer_pool_size=" |
| 998 | << (buf_pool->curr_size >> (20U - srv_page_size_shift)) |
| 999 | << "M could be bigger." ; |
| 1000 | } else if (!recv_recovery_is_on() |
| 1001 | && buf_pool->curr_size == buf_pool->old_size |
| 1002 | && (UT_LIST_GET_LEN(buf_pool->free) |
| 1003 | + UT_LIST_GET_LEN(buf_pool->LRU)) |
| 1004 | < buf_pool->curr_size / 3) { |
| 1005 | |
| 1006 | if (!buf_lru_switched_on_innodb_mon) { |
| 1007 | |
| 1008 | /* Over 67 % of the buffer pool is occupied by lock |
| 1009 | heaps or the adaptive hash index. This may be a memory |
| 1010 | leak! */ |
| 1011 | |
| 1012 | ib::warn() << "Over 67 percent of the buffer pool is" |
| 1013 | " occupied by lock heaps" |
| 1014 | #ifdef BTR_CUR_HASH_ADAPT |
| 1015 | " or the adaptive hash index!" |
| 1016 | #endif /* BTR_CUR_HASH_ADAPT */ |
| 1017 | " Check that your transactions do not" |
| 1018 | " set too many row locks." |
| 1019 | " innodb_buffer_pool_size=" |
| 1020 | << (buf_pool->curr_size >> |
| 1021 | (20U - srv_page_size_shift)) << "M." |
| 1022 | " Starting the InnoDB Monitor to print" |
| 1023 | " diagnostics." ; |
| 1024 | |
| 1025 | buf_lru_switched_on_innodb_mon = true; |
| 1026 | srv_print_innodb_monitor = TRUE; |
| 1027 | os_event_set(srv_monitor_event); |
| 1028 | } |
| 1029 | |
| 1030 | } else if (buf_lru_switched_on_innodb_mon) { |
| 1031 | |
| 1032 | /* Switch off the InnoDB Monitor; this is a simple way |
| 1033 | to stop the monitor if the situation becomes less urgent, |
| 1034 | but may also surprise users if the user also switched on the |
| 1035 | monitor! */ |
| 1036 | |
| 1037 | buf_lru_switched_on_innodb_mon = false; |
| 1038 | srv_print_innodb_monitor = FALSE; |
| 1039 | } |
| 1040 | } |
| 1041 | |
| 1042 | /******************************************************************//** |
| 1043 | Returns a free block from the buf_pool. The block is taken off the |
| 1044 | free list. If free list is empty, blocks are moved from the end of the |
| 1045 | LRU list to the free list. |
| 1046 | This function is called from a user thread when it needs a clean |
| 1047 | block to read in a page. Note that we only ever get a block from |
the free list. Even when we flush a page or find a page in the LRU
scan, we put it on the free list to be used.
| 1050 | * iteration 0: |
| 1051 | * get a block from free list, success:done |
| 1052 | * if buf_pool->try_LRU_scan is set |
| 1053 | * scan LRU up to srv_LRU_scan_depth to find a clean block |
| 1054 | * the above will put the block on free list |
| 1055 | * success:retry the free list |
| 1056 | * flush one dirty page from tail of LRU to disk |
| 1057 | * the above will put the block on free list |
| 1058 | * success: retry the free list |
| 1059 | * iteration 1: |
| 1060 | * same as iteration 0 except: |
| 1061 | * scan whole LRU list |
| 1062 | * scan LRU list even if buf_pool->try_LRU_scan is not set |
| 1063 | * iteration > 1: |
| 1064 | * same as iteration 1 but sleep 10ms |
| 1065 | @return the free control block, in state BUF_BLOCK_READY_FOR_USE */ |
| 1066 | buf_block_t* |
| 1067 | buf_LRU_get_free_block( |
| 1068 | /*===================*/ |
| 1069 | buf_pool_t* buf_pool) /*!< in/out: buffer pool instance */ |
| 1070 | { |
| 1071 | buf_block_t* block = NULL; |
| 1072 | bool freed = false; |
| 1073 | ulint n_iterations = 0; |
| 1074 | ulint flush_failures = 0; |
| 1075 | |
| 1076 | MONITOR_INC(MONITOR_LRU_GET_FREE_SEARCH); |
| 1077 | loop: |
| 1078 | buf_pool_mutex_enter(buf_pool); |
| 1079 | |
| 1080 | buf_LRU_check_size_of_non_data_objects(buf_pool); |
| 1081 | |
| 1082 | DBUG_EXECUTE_IF("ib_lru_force_no_free_page" , |
| 1083 | if (!buf_lru_free_blocks_error_printed) { |
| 1084 | n_iterations = 21; |
| 1085 | goto not_found;}); |
| 1086 | |
| 1087 | /* If there is a block in the free list, take it */ |
| 1088 | block = buf_LRU_get_free_only(buf_pool); |
| 1089 | |
| 1090 | if (block != NULL) { |
| 1091 | |
| 1092 | buf_pool_mutex_exit(buf_pool); |
| 1093 | ut_ad(buf_pool_from_block(block) == buf_pool); |
| 1094 | memset(&block->page.zip, 0, sizeof block->page.zip); |
| 1095 | |
| 1096 | block->skip_flush_check = false; |
| 1097 | block->page.flush_observer = NULL; |
| 1098 | return(block); |
| 1099 | } |
| 1100 | |
| 1101 | MONITOR_INC( MONITOR_LRU_GET_FREE_LOOPS ); |
| 1102 | freed = false; |
| 1103 | if (buf_pool->try_LRU_scan || n_iterations > 0) { |
| 1104 | /* If no block was in the free list, search from the |
| 1105 | end of the LRU list and try to free a block there. |
		If we are doing this for the first time, we'll scan only
		the tail of the LRU list; otherwise we scan the whole LRU
| 1108 | list. */ |
| 1109 | freed = buf_LRU_scan_and_free_block( |
| 1110 | buf_pool, n_iterations > 0); |
| 1111 | |
| 1112 | if (!freed && n_iterations == 0) { |
| 1113 | /* Tell other threads that there is no point |
| 1114 | in scanning the LRU list. This flag is set to |
| 1115 | TRUE again when we flush a batch from this |
| 1116 | buffer pool. */ |
| 1117 | buf_pool->try_LRU_scan = FALSE; |
| 1118 | |
| 1119 | /* Also tell the page_cleaner thread that |
| 1120 | there is work for it to do. */ |
| 1121 | os_event_set(buf_flush_event); |
| 1122 | } |
| 1123 | } |
| 1124 | |
| 1125 | #ifndef DBUG_OFF |
| 1126 | not_found: |
| 1127 | #endif |
| 1128 | |
| 1129 | buf_pool_mutex_exit(buf_pool); |
| 1130 | |
| 1131 | if (freed) { |
| 1132 | goto loop; |
| 1133 | } |
| 1134 | |
| 1135 | if (n_iterations > 20 && !buf_lru_free_blocks_error_printed |
| 1136 | && srv_buf_pool_old_size == srv_buf_pool_size) { |
| 1137 | |
| 1138 | ib::warn() << "Difficult to find free blocks in the buffer pool" |
| 1139 | " (" << n_iterations << " search iterations)! " |
| 1140 | << flush_failures << " failed attempts to" |
| 1141 | " flush a page!" |
| 1142 | " Consider increasing innodb_buffer_pool_size." |
| 1143 | " Pending flushes (fsync) log: " |
| 1144 | << fil_n_pending_log_flushes |
| 1145 | << "; buffer pool: " |
| 1146 | << fil_n_pending_tablespace_flushes |
| 1147 | << ". " << os_n_file_reads << " OS file reads, " |
| 1148 | << os_n_file_writes << " OS file writes, " |
| 1149 | << os_n_fsyncs |
| 1150 | << " OS fsyncs." ; |
| 1151 | |
| 1152 | buf_lru_free_blocks_error_printed = true; |
| 1153 | } |
| 1154 | |
| 1155 | /* If we have scanned the whole LRU and still are unable to |
| 1156 | find a free block then we should sleep here to let the |
| 1157 | page_cleaner do an LRU batch for us. */ |
| 1158 | |
| 1159 | if (!srv_read_only_mode) { |
| 1160 | os_event_set(buf_flush_event); |
| 1161 | } |
| 1162 | |
| 1163 | if (n_iterations > 1) { |
| 1164 | |
| 1165 | MONITOR_INC( MONITOR_LRU_GET_FREE_WAITS ); |
| 1166 | os_thread_sleep(10000); |
| 1167 | } |
| 1168 | |
| 1169 | /* No free block was found: try to flush the LRU list. |
| 1170 | This call will flush one page from the LRU and put it on the |
| 1171 | free list. That means that the free block is up for grabs for |
| 1172 | all user threads. |
| 1173 | |
| 1174 | TODO: A more elegant way would have been to return the freed |
| 1175 | up block to the caller here but the code that deals with |
| 1176 | removing the block from page_hash and LRU_list is fairly |
| 1177 | involved (particularly in case of compressed pages). We |
| 1178 | can do that in a separate patch sometime in future. */ |
| 1179 | |
| 1180 | if (!buf_flush_single_page_from_LRU(buf_pool)) { |
| 1181 | MONITOR_INC(MONITOR_LRU_SINGLE_FLUSH_FAILURE_COUNT); |
| 1182 | ++flush_failures; |
| 1183 | } |
| 1184 | |
| 1185 | srv_stats.buf_pool_wait_free.inc(); |
| 1186 | |
| 1187 | n_iterations++; |
| 1188 | |
| 1189 | goto loop; |
| 1190 | } |
| 1191 | |
| 1192 | /*******************************************************************//** |
| 1193 | Moves the LRU_old pointer so that the length of the old blocks list |
| 1194 | is inside the allowed limits. */ |
| 1195 | UNIV_INLINE |
| 1196 | void |
| 1197 | buf_LRU_old_adjust_len( |
| 1198 | /*===================*/ |
| 1199 | buf_pool_t* buf_pool) /*!< in: buffer pool instance */ |
| 1200 | { |
| 1201 | ulint old_len; |
| 1202 | ulint new_len; |
| 1203 | |
| 1204 | ut_a(buf_pool->LRU_old); |
| 1205 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1206 | ut_ad(buf_pool->LRU_old_ratio >= BUF_LRU_OLD_RATIO_MIN); |
| 1207 | ut_ad(buf_pool->LRU_old_ratio <= BUF_LRU_OLD_RATIO_MAX); |
| 1208 | compile_time_assert(BUF_LRU_OLD_RATIO_MIN * BUF_LRU_OLD_MIN_LEN |
| 1209 | > BUF_LRU_OLD_RATIO_DIV |
| 1210 | * (BUF_LRU_OLD_TOLERANCE + 5)); |
| 1211 | compile_time_assert(BUF_LRU_NON_OLD_MIN_LEN < BUF_LRU_OLD_MIN_LEN); |
| 1212 | |
| 1213 | #ifdef UNIV_LRU_DEBUG |
| 1214 | /* buf_pool->LRU_old must be the first item in the LRU list |
| 1215 | whose "old" flag is set. */ |
| 1216 | ut_a(buf_pool->LRU_old->old); |
| 1217 | ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old) |
| 1218 | || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old); |
| 1219 | ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old) |
| 1220 | || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old); |
| 1221 | #endif /* UNIV_LRU_DEBUG */ |
| 1222 | |
| 1223 | old_len = buf_pool->LRU_old_len; |
| 1224 | new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU) |
| 1225 | * buf_pool->LRU_old_ratio / BUF_LRU_OLD_RATIO_DIV, |
| 1226 | UT_LIST_GET_LEN(buf_pool->LRU) |
| 1227 | - (BUF_LRU_OLD_TOLERANCE |
| 1228 | + BUF_LRU_NON_OLD_MIN_LEN)); |
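
	/* A numeric sketch of the computation above: with an LRU list
	of 10000 blocks and the default innodb_old_blocks_pct of 37
	(from which LRU_old_ratio is derived), the first operand is
	about 3700, well below the cap of
	10000 - (BUF_LRU_OLD_TOLERANCE + BUF_LRU_NON_OLD_MIN_LEN)
	= 10000 - 25 = 9975, so new_len would be about 3700. */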
| 1229 | |
| 1230 | for (;;) { |
| 1231 | buf_page_t* LRU_old = buf_pool->LRU_old; |
| 1232 | |
| 1233 | ut_a(LRU_old); |
| 1234 | ut_ad(LRU_old->in_LRU_list); |
| 1235 | #ifdef UNIV_LRU_DEBUG |
| 1236 | ut_a(LRU_old->old); |
| 1237 | #endif /* UNIV_LRU_DEBUG */ |
| 1238 | |
| 1239 | /* Update the LRU_old pointer if necessary */ |
| 1240 | |
| 1241 | if (old_len + BUF_LRU_OLD_TOLERANCE < new_len) { |
| 1242 | |
| 1243 | buf_pool->LRU_old = LRU_old = UT_LIST_GET_PREV( |
| 1244 | LRU, LRU_old); |
| 1245 | #ifdef UNIV_LRU_DEBUG |
| 1246 | ut_a(!LRU_old->old); |
| 1247 | #endif /* UNIV_LRU_DEBUG */ |
| 1248 | old_len = ++buf_pool->LRU_old_len; |
| 1249 | buf_page_set_old(LRU_old, TRUE); |
| 1250 | |
| 1251 | } else if (old_len > new_len + BUF_LRU_OLD_TOLERANCE) { |
| 1252 | |
| 1253 | buf_pool->LRU_old = UT_LIST_GET_NEXT(LRU, LRU_old); |
| 1254 | old_len = --buf_pool->LRU_old_len; |
| 1255 | buf_page_set_old(LRU_old, FALSE); |
| 1256 | } else { |
| 1257 | return; |
| 1258 | } |
| 1259 | } |
| 1260 | } |
| 1261 | |
| 1262 | /*******************************************************************//** |
| 1263 | Initializes the old blocks pointer in the LRU list. This function should be |
| 1264 | called when the LRU list grows to BUF_LRU_OLD_MIN_LEN length. */ |
| 1265 | static |
| 1266 | void |
| 1267 | buf_LRU_old_init( |
| 1268 | /*=============*/ |
| 1269 | buf_pool_t* buf_pool) |
| 1270 | { |
| 1271 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1272 | ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN); |
| 1273 | |
| 1274 | /* We first initialize all blocks in the LRU list as old and then use |
| 1275 | the adjust function to move the LRU_old pointer to the right |
| 1276 | position */ |
| 1277 | |
| 1278 | for (buf_page_t* bpage = UT_LIST_GET_LAST(buf_pool->LRU); |
| 1279 | bpage != NULL; |
| 1280 | bpage = UT_LIST_GET_PREV(LRU, bpage)) { |
| 1281 | |
| 1282 | ut_ad(bpage->in_LRU_list); |
| 1283 | ut_ad(buf_page_in_file(bpage)); |
| 1284 | |
| 1285 | /* This loop temporarily violates the |
| 1286 | assertions of buf_page_set_old(). */ |
| 1287 | bpage->old = TRUE; |
| 1288 | } |
| 1289 | |
| 1290 | buf_pool->LRU_old = UT_LIST_GET_FIRST(buf_pool->LRU); |
| 1291 | buf_pool->LRU_old_len = UT_LIST_GET_LEN(buf_pool->LRU); |
| 1292 | |
| 1293 | buf_LRU_old_adjust_len(buf_pool); |
| 1294 | } |
| 1295 | |
| 1296 | /******************************************************************//** |
| 1297 | Remove a block from the unzip_LRU list if it belonged to the list. */ |
| 1298 | static |
| 1299 | void |
| 1300 | buf_unzip_LRU_remove_block_if_needed( |
| 1301 | /*=================================*/ |
| 1302 | buf_page_t* bpage) /*!< in/out: control block */ |
| 1303 | { |
| 1304 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 1305 | |
| 1306 | ut_ad(buf_page_in_file(bpage)); |
| 1307 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1308 | |
| 1309 | if (buf_page_belongs_to_unzip_LRU(bpage)) { |
| 1310 | buf_block_t* block = reinterpret_cast<buf_block_t*>(bpage); |
| 1311 | |
| 1312 | ut_ad(block->in_unzip_LRU_list); |
| 1313 | ut_d(block->in_unzip_LRU_list = FALSE); |
| 1314 | |
| 1315 | UT_LIST_REMOVE(buf_pool->unzip_LRU, block); |
| 1316 | } |
| 1317 | } |
| 1318 | |
| 1319 | /******************************************************************//** |
| 1320 | Adjust LRU hazard pointers if needed. */ |
| 1321 | void |
| 1322 | buf_LRU_adjust_hp( |
| 1323 | /*==============*/ |
| 1324 | buf_pool_t* buf_pool,/*!< in: buffer pool instance */ |
| 1325 | const buf_page_t* bpage) /*!< in: control block */ |
| 1326 | { |
| 1327 | buf_pool->lru_hp.adjust(bpage); |
| 1328 | buf_pool->lru_scan_itr.adjust(bpage); |
| 1329 | buf_pool->single_scan_itr.adjust(bpage); |
| 1330 | } |
| 1331 | |
| 1332 | /******************************************************************//** |
| 1333 | Removes a block from the LRU list. */ |
| 1334 | UNIV_INLINE |
| 1335 | void |
| 1336 | buf_LRU_remove_block( |
| 1337 | /*=================*/ |
| 1338 | buf_page_t* bpage) /*!< in: control block */ |
| 1339 | { |
| 1340 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 1341 | |
| 1342 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1343 | |
| 1344 | ut_a(buf_page_in_file(bpage)); |
| 1345 | |
| 1346 | ut_ad(bpage->in_LRU_list); |
| 1347 | |
| 1348 | /* Important that we adjust the hazard pointers before removing |
| 1349 | bpage from the LRU list. */ |
| 1350 | buf_LRU_adjust_hp(buf_pool, bpage); |
| 1351 | |
| 1352 | /* If the LRU_old pointer is defined and points to just this block, |
| 1353 | move it backward one step */ |
| 1354 | |
| 1355 | if (bpage == buf_pool->LRU_old) { |
| 1356 | |
| 1357 | /* Below: the previous block is guaranteed to exist, |
| 1358 | because the LRU_old pointer is only allowed to differ |
| 1359 | by BUF_LRU_OLD_TOLERANCE from strict |
| 1360 | buf_pool->LRU_old_ratio/BUF_LRU_OLD_RATIO_DIV of the LRU |
| 1361 | list length. */ |
| 1362 | buf_page_t* prev_bpage = UT_LIST_GET_PREV(LRU, bpage); |
| 1363 | |
| 1364 | ut_a(prev_bpage); |
| 1365 | #ifdef UNIV_LRU_DEBUG |
| 1366 | ut_a(!prev_bpage->old); |
| 1367 | #endif /* UNIV_LRU_DEBUG */ |
| 1368 | buf_pool->LRU_old = prev_bpage; |
| 1369 | buf_page_set_old(prev_bpage, TRUE); |
| 1370 | |
| 1371 | buf_pool->LRU_old_len++; |
| 1372 | } |
| 1373 | |
| 1374 | /* Remove the block from the LRU list */ |
| 1375 | UT_LIST_REMOVE(buf_pool->LRU, bpage); |
| 1376 | ut_d(bpage->in_LRU_list = FALSE); |
| 1377 | |
| 1378 | buf_pool->stat.LRU_bytes -= bpage->size.physical(); |
| 1379 | |
| 1380 | buf_unzip_LRU_remove_block_if_needed(bpage); |
| 1381 | |
| 1382 | /* If the LRU list is so short that LRU_old is not defined, |
| 1383 | clear the "old" flags and return */ |
| 1384 | if (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN) { |
| 1385 | |
| 1386 | for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool->LRU); |
| 1387 | bpage != NULL; |
| 1388 | bpage = UT_LIST_GET_NEXT(LRU, bpage)) { |
| 1389 | |
| 1390 | /* This loop temporarily violates the |
| 1391 | assertions of buf_page_set_old(). */ |
| 1392 | bpage->old = FALSE; |
| 1393 | } |
| 1394 | |
| 1395 | buf_pool->LRU_old = NULL; |
| 1396 | buf_pool->LRU_old_len = 0; |
| 1397 | |
| 1398 | return; |
| 1399 | } |
| 1400 | |
| 1401 | ut_ad(buf_pool->LRU_old); |
| 1402 | |
| 1403 | /* Update the LRU_old_len field if necessary */ |
| 1404 | if (buf_page_is_old(bpage)) { |
| 1405 | |
| 1406 | buf_pool->LRU_old_len--; |
| 1407 | } |
| 1408 | |
| 1409 | /* Adjust the length of the old block list if necessary */ |
| 1410 | buf_LRU_old_adjust_len(buf_pool); |
| 1411 | } |
| 1412 | |
| 1413 | /******************************************************************//** |
| 1414 | Adds a block to the LRU list of decompressed zip pages. */ |
| 1415 | void |
| 1416 | buf_unzip_LRU_add_block( |
| 1417 | /*====================*/ |
| 1418 | buf_block_t* block, /*!< in: control block */ |
| 1419 | ibool old) /*!< in: TRUE if should be put to the end |
| 1420 | of the list, else put to the start */ |
| 1421 | { |
| 1422 | buf_pool_t* buf_pool = buf_pool_from_block(block); |
| 1423 | |
| 1424 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1425 | |
| 1426 | ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); |
| 1427 | |
| 1428 | ut_ad(!block->in_unzip_LRU_list); |
| 1429 | ut_d(block->in_unzip_LRU_list = TRUE); |
| 1430 | |
| 1431 | if (old) { |
| 1432 | UT_LIST_ADD_LAST(buf_pool->unzip_LRU, block); |
| 1433 | } else { |
| 1434 | UT_LIST_ADD_FIRST(buf_pool->unzip_LRU, block); |
| 1435 | } |
| 1436 | } |
| 1437 | |
| 1438 | /******************************************************************//** |
| 1439 | Adds a block to the LRU list. Please make sure that the page_size is |
already set when invoking the function, so that we can get the correct
page_size from the buffer page when adding the block to the LRU. */
| 1442 | UNIV_INLINE |
| 1443 | void |
| 1444 | buf_LRU_add_block_low( |
| 1445 | /*==================*/ |
| 1446 | buf_page_t* bpage, /*!< in: control block */ |
| 1447 | ibool old) /*!< in: TRUE if should be put to the old blocks |
| 1448 | in the LRU list, else put to the start; if the |
| 1449 | LRU list is very short, the block is added to |
| 1450 | the start, regardless of this parameter */ |
| 1451 | { |
| 1452 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 1453 | |
| 1454 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1455 | |
| 1456 | ut_a(buf_page_in_file(bpage)); |
| 1457 | ut_ad(!bpage->in_LRU_list); |
| 1458 | |
| 1459 | if (!old || (UT_LIST_GET_LEN(buf_pool->LRU) < BUF_LRU_OLD_MIN_LEN)) { |
| 1460 | |
| 1461 | UT_LIST_ADD_FIRST(buf_pool->LRU, bpage); |
| 1462 | |
| 1463 | bpage->freed_page_clock = buf_pool->freed_page_clock; |
| 1464 | } else { |
| 1465 | #ifdef UNIV_LRU_DEBUG |
| 1466 | /* buf_pool->LRU_old must be the first item in the LRU list |
| 1467 | whose "old" flag is set. */ |
| 1468 | ut_a(buf_pool->LRU_old->old); |
| 1469 | ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old) |
| 1470 | || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old); |
| 1471 | ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old) |
| 1472 | || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old); |
| 1473 | #endif /* UNIV_LRU_DEBUG */ |
| 1474 | UT_LIST_INSERT_AFTER(buf_pool->LRU, buf_pool->LRU_old, |
| 1475 | bpage); |
| 1476 | |
| 1477 | buf_pool->LRU_old_len++; |
| 1478 | } |
| 1479 | |
| 1480 | ut_d(bpage->in_LRU_list = TRUE); |
| 1481 | |
| 1482 | incr_LRU_size_in_bytes(bpage, buf_pool); |
| 1483 | |
| 1484 | if (UT_LIST_GET_LEN(buf_pool->LRU) > BUF_LRU_OLD_MIN_LEN) { |
| 1485 | |
| 1486 | ut_ad(buf_pool->LRU_old); |
| 1487 | |
| 1488 | /* Adjust the length of the old block list if necessary */ |
| 1489 | |
| 1490 | buf_page_set_old(bpage, old); |
| 1491 | buf_LRU_old_adjust_len(buf_pool); |
| 1492 | |
| 1493 | } else if (UT_LIST_GET_LEN(buf_pool->LRU) == BUF_LRU_OLD_MIN_LEN) { |
| 1494 | |
| 1495 | /* The LRU list is now long enough for LRU_old to become |
| 1496 | defined: init it */ |
| 1497 | |
| 1498 | buf_LRU_old_init(buf_pool); |
| 1499 | } else { |
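		/* The LRU list is still shorter than BUF_LRU_OLD_MIN_LEN:
		LRU_old is normally not defined here, and the "old" flag
		simply mirrors whether it is. */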
| 1500 | buf_page_set_old(bpage, buf_pool->LRU_old != NULL); |
| 1501 | } |
| 1502 | |
	/* If this is a compressed block that also has an uncompressed
	frame, put it on the unzip_LRU list as well. */
| 1505 | if (buf_page_belongs_to_unzip_LRU(bpage)) { |
| 1506 | buf_unzip_LRU_add_block((buf_block_t*) bpage, old); |
| 1507 | } |
| 1508 | } |
| 1509 | |
| 1510 | /******************************************************************//** |
Adds a block to the LRU list. The page_size of the block must already
be set when this function is invoked, so that the correct page_size
can be read from the buffer page when adding the block to the LRU. */
| 1514 | void |
| 1515 | buf_LRU_add_block( |
| 1516 | /*==============*/ |
| 1517 | buf_page_t* bpage, /*!< in: control block */ |
| 1518 | ibool old) /*!< in: TRUE if should be put to the old |
| 1519 | blocks in the LRU list, else put to the start; |
| 1520 | if the LRU list is very short, the block is |
| 1521 | added to the start, regardless of this |
| 1522 | parameter */ |
| 1523 | { |
| 1524 | buf_LRU_add_block_low(bpage, old); |
| 1525 | } |
| 1526 | |
| 1527 | /******************************************************************//** |
| 1528 | Moves a block to the start of the LRU list. */ |
| 1529 | void |
| 1530 | buf_LRU_make_block_young( |
| 1531 | /*=====================*/ |
| 1532 | buf_page_t* bpage) /*!< in: control block */ |
| 1533 | { |
| 1534 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 1535 | |
| 1536 | ut_ad(buf_pool_mutex_own(buf_pool)); |
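
	/* The counter incremented below is reported as "Pages made
	young" in the buffer pool section of SHOW ENGINE INNODB
	STATUS. */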
| 1537 | |
| 1538 | if (bpage->old) { |
| 1539 | buf_pool->stat.n_pages_made_young++; |
| 1540 | } |
| 1541 | |
| 1542 | buf_LRU_remove_block(bpage); |
| 1543 | buf_LRU_add_block_low(bpage, FALSE); |
| 1544 | } |
| 1545 | |
| 1546 | /******************************************************************//** |
Tries to free a block. If bpage is a descriptor of a compressed-only
page, the descriptor object will be freed as well.
| 1549 | |
| 1550 | NOTE: If this function returns true, it will temporarily |
| 1551 | release buf_pool->mutex. Furthermore, the page frame will no longer be |
| 1552 | accessible via bpage. |
| 1553 | |
| 1554 | The caller must hold buf_pool->mutex and must not hold any |
| 1555 | buf_page_get_mutex() when calling this function. |
| 1556 | @return true if freed, false otherwise. */ |
| 1557 | bool |
| 1558 | buf_LRU_free_page( |
| 1559 | /*===============*/ |
| 1560 | buf_page_t* bpage, /*!< in: block to be freed */ |
| 1561 | bool zip) /*!< in: true if should remove also the |
| 1562 | compressed page of an uncompressed page */ |
| 1563 | { |
| 1564 | buf_page_t* b = NULL; |
| 1565 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 1566 | |
| 1567 | rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, bpage->id); |
| 1568 | |
| 1569 | BPageMutex* block_mutex = buf_page_get_mutex(bpage); |
| 1570 | |
| 1571 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1572 | ut_ad(buf_page_in_file(bpage)); |
| 1573 | ut_ad(bpage->in_LRU_list); |
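
	/* Latching order: the caller already holds buf_pool->mutex;
	acquire the page_hash X-latch and then the block mutex before
	inspecting the page state. */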
| 1574 | |
| 1575 | rw_lock_x_lock(hash_lock); |
| 1576 | mutex_enter(block_mutex); |
| 1577 | |
| 1578 | if (!buf_page_can_relocate(bpage)) { |
| 1579 | |
| 1580 | /* Do not free buffer fixed and I/O-fixed blocks. */ |
| 1581 | goto func_exit; |
| 1582 | } |
| 1583 | |
| 1584 | #ifdef UNIV_IBUF_COUNT_DEBUG |
| 1585 | ut_a(ibuf_count_get(bpage->id) == 0); |
| 1586 | #endif /* UNIV_IBUF_COUNT_DEBUG */ |
| 1587 | |
| 1588 | if (zip || !bpage->zip.data) { |
| 1589 | /* This would completely free the block. */ |
| 1590 | /* Do not completely free dirty blocks. */ |
| 1591 | |
| 1592 | if (bpage->oldest_modification) { |
| 1593 | goto func_exit; |
| 1594 | } |
| 1595 | } else if (bpage->oldest_modification > 0 |
| 1596 | && buf_page_get_state(bpage) != BUF_BLOCK_FILE_PAGE) { |
| 1597 | |
| 1598 | ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_DIRTY); |
| 1599 | |
| 1600 | func_exit: |
| 1601 | rw_lock_x_unlock(hash_lock); |
| 1602 | mutex_exit(block_mutex); |
| 1603 | return(false); |
| 1604 | |
| 1605 | } else if (buf_page_get_state(bpage) == BUF_BLOCK_FILE_PAGE) { |
| 1606 | b = buf_page_alloc_descriptor(); |
| 1607 | ut_a(b); |
| 1608 | memcpy(b, bpage, sizeof *b); |
| 1609 | } |
| 1610 | |
| 1611 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1612 | ut_ad(buf_page_in_file(bpage)); |
| 1613 | ut_ad(bpage->in_LRU_list); |
| 1614 | ut_ad(!bpage->in_flush_list == !bpage->oldest_modification); |
| 1615 | |
	DBUG_PRINT("ib_buf", ("free page %u:%u",
			      bpage->id.space(), bpage->id.page_no()));
| 1618 | |
| 1619 | ut_ad(rw_lock_own(hash_lock, RW_LOCK_X)); |
| 1620 | ut_ad(buf_page_can_relocate(bpage)); |
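
	/* If buf_LRU_block_remove_hashed() returns false, bpage was a
	compressed-only page: its descriptor and compressed frame have
	already been freed, so we are done. */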
| 1621 | |
| 1622 | if (!buf_LRU_block_remove_hashed(bpage, zip)) { |
| 1623 | return(true); |
| 1624 | } |
| 1625 | |
| 1626 | /* buf_LRU_block_remove_hashed() releases the hash_lock */ |
| 1627 | ut_ad(!rw_lock_own(hash_lock, RW_LOCK_X) |
| 1628 | && !rw_lock_own(hash_lock, RW_LOCK_S)); |
| 1629 | |
	/* We have just freed a BUF_BLOCK_FILE_PAGE. If b != NULL
	then it was a compressed page with an uncompressed frame and
	we are interested in freeing only the uncompressed frame.
	Therefore we have to reinsert the compressed page descriptor
	into the LRU and page_hash (and possibly flush_list).
	If b == NULL, then it was a regular page that has been freed. */
| 1636 | |
| 1637 | if (b != NULL) { |
| 1638 | buf_page_t* prev_b = UT_LIST_GET_PREV(LRU, b); |
| 1639 | |
| 1640 | rw_lock_x_lock(hash_lock); |
| 1641 | |
| 1642 | mutex_enter(block_mutex); |
| 1643 | |
| 1644 | ut_a(!buf_page_hash_get_low(buf_pool, b->id)); |
| 1645 | |
| 1646 | b->state = b->oldest_modification |
| 1647 | ? BUF_BLOCK_ZIP_DIRTY |
| 1648 | : BUF_BLOCK_ZIP_PAGE; |
| 1649 | |
| 1650 | ut_ad(b->size.is_compressed()); |
| 1651 | |
| 1652 | UNIV_MEM_DESC(b->zip.data, b->size.physical()); |
| 1653 | |
| 1654 | /* The fields in_page_hash and in_LRU_list of |
| 1655 | the to-be-freed block descriptor should have |
| 1656 | been cleared in |
| 1657 | buf_LRU_block_remove_hashed(), which |
| 1658 | invokes buf_LRU_remove_block(). */ |
| 1659 | ut_ad(!bpage->in_page_hash); |
| 1660 | ut_ad(!bpage->in_LRU_list); |
| 1661 | |
| 1662 | /* bpage->state was BUF_BLOCK_FILE_PAGE because |
| 1663 | b != NULL. The type cast below is thus valid. */ |
| 1664 | ut_ad(!((buf_block_t*) bpage)->in_unzip_LRU_list); |
| 1665 | |
| 1666 | /* The fields of bpage were copied to b before |
| 1667 | buf_LRU_block_remove_hashed() was invoked. */ |
| 1668 | ut_ad(!b->in_zip_hash); |
| 1669 | ut_ad(b->in_page_hash); |
| 1670 | ut_ad(b->in_LRU_list); |
| 1671 | |
| 1672 | HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, |
| 1673 | b->id.fold(), b); |
| 1674 | |
| 1675 | /* Insert b where bpage was in the LRU list. */ |
| 1676 | if (prev_b != NULL) { |
| 1677 | ulint lru_len; |
| 1678 | |
| 1679 | ut_ad(prev_b->in_LRU_list); |
| 1680 | ut_ad(buf_page_in_file(prev_b)); |
| 1681 | |
| 1682 | UT_LIST_INSERT_AFTER(buf_pool->LRU, prev_b, b); |
| 1683 | |
| 1684 | incr_LRU_size_in_bytes(b, buf_pool); |
| 1685 | |
| 1686 | if (buf_page_is_old(b)) { |
| 1687 | buf_pool->LRU_old_len++; |
| 1688 | if (buf_pool->LRU_old |
| 1689 | == UT_LIST_GET_NEXT(LRU, b)) { |
| 1690 | |
| 1691 | buf_pool->LRU_old = b; |
| 1692 | } |
| 1693 | } |
| 1694 | |
| 1695 | lru_len = UT_LIST_GET_LEN(buf_pool->LRU); |
| 1696 | |
| 1697 | if (lru_len > BUF_LRU_OLD_MIN_LEN) { |
| 1698 | ut_ad(buf_pool->LRU_old); |
| 1699 | /* Adjust the length of the |
| 1700 | old block list if necessary */ |
| 1701 | buf_LRU_old_adjust_len(buf_pool); |
| 1702 | } else if (lru_len == BUF_LRU_OLD_MIN_LEN) { |
| 1703 | /* The LRU list is now long |
| 1704 | enough for LRU_old to become |
| 1705 | defined: init it */ |
| 1706 | buf_LRU_old_init(buf_pool); |
| 1707 | } |
| 1708 | #ifdef UNIV_LRU_DEBUG |
| 1709 | /* Check that the "old" flag is consistent |
| 1710 | in the block and its neighbours. */ |
| 1711 | buf_page_set_old(b, buf_page_is_old(b)); |
| 1712 | #endif /* UNIV_LRU_DEBUG */ |
| 1713 | } else { |
| 1714 | ut_d(b->in_LRU_list = FALSE); |
| 1715 | buf_LRU_add_block_low(b, buf_page_is_old(b)); |
| 1716 | } |
| 1717 | |
| 1718 | if (b->state == BUF_BLOCK_ZIP_PAGE) { |
| 1719 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
| 1720 | buf_LRU_insert_zip_clean(b); |
| 1721 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
| 1722 | } else { |
| 1723 | /* Relocate on buf_pool->flush_list. */ |
| 1724 | buf_flush_relocate_on_flush_list(bpage, b); |
| 1725 | } |
| 1726 | |
| 1727 | bpage->zip.data = NULL; |
| 1728 | |
| 1729 | page_zip_set_size(&bpage->zip, 0); |
| 1730 | |
| 1731 | bpage->size.copy_from(page_size_t(bpage->size.logical(), |
| 1732 | bpage->size.logical(), |
| 1733 | false)); |
| 1734 | |
| 1735 | mutex_exit(block_mutex); |
| 1736 | |
| 1737 | /* Prevent buf_page_get_gen() from |
| 1738 | decompressing the block while we release |
| 1739 | buf_pool->mutex and block_mutex. */ |
| 1740 | block_mutex = buf_page_get_mutex(b); |
| 1741 | |
| 1742 | mutex_enter(block_mutex); |
| 1743 | |
| 1744 | buf_page_set_sticky(b); |
| 1745 | |
| 1746 | mutex_exit(block_mutex); |
| 1747 | |
| 1748 | rw_lock_x_unlock(hash_lock); |
| 1749 | } |
| 1750 | |
| 1751 | buf_pool_mutex_exit(buf_pool); |
| 1752 | |
	/* Remove any possible adaptive hash index on the page.
	The page was declared uninitialized by
	buf_LRU_block_remove_hashed(). We need to flag
	the contents of the page as valid (which they still are) in
	order to avoid bogus Valgrind warnings. */
| 1758 | |
| 1759 | UNIV_MEM_VALID(((buf_block_t*) bpage)->frame, |
| 1760 | srv_page_size); |
| 1761 | btr_search_drop_page_hash_index((buf_block_t*) bpage); |
| 1762 | UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame, |
| 1763 | srv_page_size); |
| 1764 | |
| 1765 | if (b != NULL) { |
| 1766 | |
| 1767 | /* Compute and stamp the compressed page |
| 1768 | checksum while not holding any mutex. The |
| 1769 | block is already half-freed |
| 1770 | (BUF_BLOCK_REMOVE_HASH) and removed from |
| 1771 | buf_pool->page_hash, thus inaccessible by any |
| 1772 | other thread. */ |
| 1773 | |
| 1774 | ut_ad(b->size.is_compressed()); |
| 1775 | |
| 1776 | const uint32_t checksum = page_zip_calc_checksum( |
| 1777 | b->zip.data, |
| 1778 | b->size.physical(), |
| 1779 | static_cast<srv_checksum_algorithm_t>( |
| 1780 | srv_checksum_algorithm)); |
| 1781 | |
| 1782 | mach_write_to_4(b->zip.data + FIL_PAGE_SPACE_OR_CHKSUM, |
| 1783 | checksum); |
| 1784 | } |
| 1785 | |
| 1786 | buf_pool_mutex_enter(buf_pool); |
| 1787 | |
| 1788 | if (b != NULL) { |
| 1789 | mutex_enter(block_mutex); |
| 1790 | |
| 1791 | buf_page_unset_sticky(b); |
| 1792 | |
| 1793 | mutex_exit(block_mutex); |
| 1794 | } |
| 1795 | |
| 1796 | buf_LRU_block_free_hashed_page((buf_block_t*) bpage); |
| 1797 | |
| 1798 | return(true); |
| 1799 | } |
| 1800 | |
| 1801 | /******************************************************************//** |
Puts a block back onto the free list. */
| 1803 | void |
| 1804 | buf_LRU_block_free_non_file_page( |
| 1805 | /*=============================*/ |
| 1806 | buf_block_t* block) /*!< in: block, must not contain a file page */ |
| 1807 | { |
| 1808 | void* data; |
| 1809 | buf_pool_t* buf_pool = buf_pool_from_block(block); |
| 1810 | |
| 1811 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1812 | ut_ad(buf_page_mutex_own(block)); |
| 1813 | |
| 1814 | switch (buf_block_get_state(block)) { |
| 1815 | case BUF_BLOCK_MEMORY: |
| 1816 | case BUF_BLOCK_READY_FOR_USE: |
| 1817 | break; |
| 1818 | default: |
| 1819 | ut_error; |
| 1820 | } |
| 1821 | |
| 1822 | assert_block_ahi_empty(block); |
| 1823 | ut_ad(!block->page.in_free_list); |
| 1824 | ut_ad(!block->page.in_flush_list); |
| 1825 | ut_ad(!block->page.in_LRU_list); |
| 1826 | |
| 1827 | buf_block_set_state(block, BUF_BLOCK_NOT_USED); |
| 1828 | |
| 1829 | UNIV_MEM_ALLOC(block->frame, srv_page_size); |
| 1830 | #ifdef UNIV_DEBUG |
| 1831 | /* Wipe contents of page to reveal possible stale pointers to it */ |
| 1832 | memset(block->frame, '\0', srv_page_size); |
| 1833 | #else |
| 1834 | /* Wipe page_no and space_id */ |
| 1835 | memset(block->frame + FIL_PAGE_OFFSET, 0xfe, 4); |
| 1836 | memset(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xfe, 4); |
| 1837 | #endif /* UNIV_DEBUG */ |
| 1838 | data = block->page.zip.data; |
| 1839 | |
| 1840 | if (data != NULL) { |
| 1841 | block->page.zip.data = NULL; |
| 1842 | buf_page_mutex_exit(block); |
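
		/* In debug builds, forbid releasing buf_pool->mutex while
		the buddy allocator frees the compressed frame, since
		buf_buddy_free() must complete under the mutex. */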
| 1843 | buf_pool_mutex_exit_forbid(buf_pool); |
| 1844 | |
| 1845 | ut_ad(block->page.size.is_compressed()); |
| 1846 | |
| 1847 | buf_buddy_free(buf_pool, data, block->page.size.physical()); |
| 1848 | |
| 1849 | buf_pool_mutex_exit_allow(buf_pool); |
| 1850 | buf_page_mutex_enter(block); |
| 1851 | |
| 1852 | page_zip_set_size(&block->page.zip, 0); |
| 1853 | |
| 1854 | block->page.size.copy_from( |
| 1855 | page_size_t(block->page.size.logical(), |
| 1856 | block->page.size.logical(), |
| 1857 | false)); |
| 1858 | } |
| 1859 | |
| 1860 | if (buf_pool->curr_size < buf_pool->old_size |
| 1861 | && UT_LIST_GET_LEN(buf_pool->withdraw) < buf_pool->withdraw_target |
| 1862 | && buf_block_will_withdrawn(buf_pool, block)) { |
| 1863 | /* This should be withdrawn */ |
| 1864 | UT_LIST_ADD_LAST( |
| 1865 | buf_pool->withdraw, |
| 1866 | &block->page); |
| 1867 | ut_d(block->in_withdraw_list = TRUE); |
| 1868 | } else { |
| 1869 | UT_LIST_ADD_FIRST(buf_pool->free, &block->page); |
| 1870 | ut_d(block->page.in_free_list = TRUE); |
| 1871 | } |
| 1872 | |
| 1873 | UNIV_MEM_FREE(block->frame, srv_page_size); |
| 1874 | } |
| 1875 | |
| 1876 | /******************************************************************//** |
| 1877 | Takes a block out of the LRU list and page hash table. |
| 1878 | If the block is compressed-only (BUF_BLOCK_ZIP_PAGE), |
| 1879 | the object will be freed. |
| 1880 | |
| 1881 | The caller must hold buf_pool->mutex, the buf_page_get_mutex() mutex |
| 1882 | and the appropriate hash_lock. This function will release the |
| 1883 | buf_page_get_mutex() and the hash_lock. |
| 1884 | |
| 1885 | If a compressed page is freed other compressed pages may be relocated. |
@retval true if BUF_BLOCK_FILE_PAGE was removed from page_hash. The
caller needs to return the page to the free list.
@retval false if BUF_BLOCK_ZIP_PAGE was removed from page_hash. In
this case the block has already been returned to the buddy allocator. */
| 1890 | static |
| 1891 | bool |
| 1892 | buf_LRU_block_remove_hashed( |
| 1893 | /*========================*/ |
| 1894 | buf_page_t* bpage, /*!< in: block, must contain a file page and |
| 1895 | be in a state where it can be freed; there |
| 1896 | may or may not be a hash index to the page */ |
| 1897 | bool zip) /*!< in: true if should remove also the |
| 1898 | compressed page of an uncompressed page */ |
| 1899 | { |
| 1900 | const buf_page_t* hashed_bpage; |
| 1901 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 1902 | rw_lock_t* hash_lock; |
| 1903 | |
| 1904 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 1905 | ut_ad(mutex_own(buf_page_get_mutex(bpage))); |
| 1906 | |
| 1907 | hash_lock = buf_page_hash_lock_get(buf_pool, bpage->id); |
| 1908 | |
| 1909 | ut_ad(rw_lock_own(hash_lock, RW_LOCK_X)); |
| 1910 | |
| 1911 | ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE); |
| 1912 | ut_a(bpage->buf_fix_count == 0); |
| 1913 | |
| 1914 | buf_LRU_remove_block(bpage); |
| 1915 | |
| 1916 | buf_pool->freed_page_clock += 1; |
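
	/* buf_pool->freed_page_clock is a monotonically increasing
	eviction counter; buf_page_peek_if_too_old() compares a page's
	stamp of this clock against the current value to estimate how
	recently the page was accessed. */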
| 1917 | |
| 1918 | switch (buf_page_get_state(bpage)) { |
| 1919 | case BUF_BLOCK_FILE_PAGE: |
| 1920 | UNIV_MEM_ASSERT_W(bpage, sizeof(buf_block_t)); |
| 1921 | UNIV_MEM_ASSERT_W(((buf_block_t*) bpage)->frame, |
| 1922 | srv_page_size); |
| 1923 | buf_block_modify_clock_inc((buf_block_t*) bpage); |
| 1924 | if (bpage->zip.data) { |
| 1925 | const page_t* page = ((buf_block_t*) bpage)->frame; |
| 1926 | |
| 1927 | ut_a(!zip || bpage->oldest_modification == 0); |
| 1928 | ut_ad(bpage->size.is_compressed()); |
| 1929 | |
| 1930 | switch (fil_page_get_type(page)) { |
| 1931 | case FIL_PAGE_TYPE_ALLOCATED: |
| 1932 | case FIL_PAGE_INODE: |
| 1933 | case FIL_PAGE_IBUF_BITMAP: |
| 1934 | case FIL_PAGE_TYPE_FSP_HDR: |
| 1935 | case FIL_PAGE_TYPE_XDES: |
| 1936 | /* These are essentially uncompressed pages. */ |
| 1937 | if (!zip) { |
| 1938 | /* InnoDB writes the data to the |
| 1939 | uncompressed page frame. Copy it |
| 1940 | to the compressed page, which will |
| 1941 | be preserved. */ |
| 1942 | memcpy(bpage->zip.data, page, |
| 1943 | bpage->size.physical()); |
| 1944 | } |
| 1945 | break; |
| 1946 | case FIL_PAGE_TYPE_ZBLOB: |
| 1947 | case FIL_PAGE_TYPE_ZBLOB2: |
| 1948 | break; |
| 1949 | case FIL_PAGE_INDEX: |
| 1950 | case FIL_PAGE_RTREE: |
| 1951 | #if defined UNIV_ZIP_DEBUG && defined BTR_CUR_HASH_ADAPT |
| 1952 | ut_a(page_zip_validate( |
| 1953 | &bpage->zip, page, |
| 1954 | ((buf_block_t*) bpage)->index)); |
| 1955 | #endif /* UNIV_ZIP_DEBUG && BTR_CUR_HASH_ADAPT */ |
| 1956 | break; |
| 1957 | default: |
			ib::error() << "The compressed page to be"
				" evicted seems corrupt:";
| 1960 | ut_print_buf(stderr, page, |
| 1961 | bpage->size.logical()); |
| 1962 | |
			ib::error() << "Possibly older version of"
				" the page:";
| 1965 | |
| 1966 | ut_print_buf(stderr, bpage->zip.data, |
| 1967 | bpage->size.physical()); |
| 1968 | putc('\n', stderr); |
| 1969 | ut_error; |
| 1970 | } |
| 1971 | |
| 1972 | break; |
| 1973 | } |
| 1974 | /* fall through */ |
| 1975 | case BUF_BLOCK_ZIP_PAGE: |
| 1976 | ut_a(bpage->oldest_modification == 0); |
| 1977 | if (bpage->size.is_compressed()) { |
| 1978 | UNIV_MEM_ASSERT_W(bpage->zip.data, |
| 1979 | bpage->size.physical()); |
| 1980 | } |
| 1981 | break; |
| 1982 | case BUF_BLOCK_POOL_WATCH: |
| 1983 | case BUF_BLOCK_ZIP_DIRTY: |
| 1984 | case BUF_BLOCK_NOT_USED: |
| 1985 | case BUF_BLOCK_READY_FOR_USE: |
| 1986 | case BUF_BLOCK_MEMORY: |
| 1987 | case BUF_BLOCK_REMOVE_HASH: |
| 1988 | ut_error; |
| 1989 | break; |
| 1990 | } |
| 1991 | |
| 1992 | hashed_bpage = buf_page_hash_get_low(buf_pool, bpage->id); |
| 1993 | if (bpage != hashed_bpage) { |
| 1994 | ib::error() << "Page " << bpage->id |
| 1995 | << " not found in the hash table" ; |
| 1996 | |
#ifdef UNIV_DEBUG
		ib::error()
			<< "in_page_hash:" << bpage->in_page_hash
			<< " in_zip_hash:" << bpage->in_zip_hash
			// << " in_free_list:" << bpage->in_free_list
| 2004 | << " in_flush_list:" << bpage->in_flush_list |
| 2005 | << " in_LRU_list:" << bpage->in_LRU_list |
| 2006 | << " zip.data:" << bpage->zip.data |
| 2007 | << " zip_size:" << bpage->size.logical() |
| 2008 | << " page_state:" << buf_page_get_state(bpage); |
| 2009 | #else |
| 2010 | ib::error() |
| 2011 | << " zip.data:" << bpage->zip.data |
| 2012 | << " zip_size:" << bpage->size.logical() |
| 2013 | << " page_state:" << buf_page_get_state(bpage); |
| 2014 | #endif |
| 2015 | |
| 2016 | if (hashed_bpage) { |
| 2017 | |
			ib::error() << "In the hash table we find block "
| 2019 | << hashed_bpage << " of " << hashed_bpage->id |
| 2020 | << " which is not " << bpage; |
| 2021 | } |
| 2022 | |
| 2023 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
| 2024 | mutex_exit(buf_page_get_mutex(bpage)); |
| 2025 | rw_lock_x_unlock(hash_lock); |
| 2026 | buf_pool_mutex_exit(buf_pool); |
| 2027 | buf_print(); |
| 2028 | buf_LRU_print(); |
| 2029 | buf_validate(); |
| 2030 | buf_LRU_validate(); |
| 2031 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
| 2032 | ut_error; |
| 2033 | } |
| 2034 | |
| 2035 | ut_ad(!bpage->in_zip_hash); |
| 2036 | ut_ad(bpage->in_page_hash); |
| 2037 | ut_d(bpage->in_page_hash = FALSE); |
| 2038 | |
| 2039 | HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, bpage->id.fold(), |
| 2040 | bpage); |
| 2041 | |
| 2042 | switch (buf_page_get_state(bpage)) { |
| 2043 | case BUF_BLOCK_ZIP_PAGE: |
| 2044 | ut_ad(!bpage->in_free_list); |
| 2045 | ut_ad(!bpage->in_flush_list); |
| 2046 | ut_ad(!bpage->in_LRU_list); |
| 2047 | ut_a(bpage->zip.data); |
| 2048 | ut_a(bpage->size.is_compressed()); |
| 2049 | |
| 2050 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
| 2051 | UT_LIST_REMOVE(buf_pool->zip_clean, bpage); |
| 2052 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
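
		/* A compressed-only descriptor owns its compressed frame:
		release the zip_mutex and the hash lock, return the frame
		to the buddy allocator, and free the descriptor itself. */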
| 2053 | |
| 2054 | mutex_exit(&buf_pool->zip_mutex); |
| 2055 | rw_lock_x_unlock(hash_lock); |
| 2056 | buf_pool_mutex_exit_forbid(buf_pool); |
| 2057 | |
| 2058 | buf_buddy_free(buf_pool, bpage->zip.data, |
| 2059 | bpage->size.physical()); |
| 2060 | |
| 2061 | buf_pool_mutex_exit_allow(buf_pool); |
| 2062 | buf_page_free_descriptor(bpage); |
| 2063 | return(false); |
| 2064 | |
| 2065 | case BUF_BLOCK_FILE_PAGE: |
| 2066 | memset(((buf_block_t*) bpage)->frame |
| 2067 | + FIL_PAGE_OFFSET, 0xff, 4); |
| 2068 | memset(((buf_block_t*) bpage)->frame |
| 2069 | + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4); |
| 2070 | UNIV_MEM_INVALID(((buf_block_t*) bpage)->frame, |
| 2071 | srv_page_size); |
| 2072 | buf_page_set_state(bpage, BUF_BLOCK_REMOVE_HASH); |
| 2073 | |
		/* Question: If we release the bpage mutex and the hash
		lock here, what protects us against:
		1) Some other thread buffer-fixing this page?
		2) Some other thread trying to read this page, not
		finding it in the buffer pool, and attempting to read
		it from disk?
		Answer:
		1) This cannot happen, because the page is no longer in
		the page_hash. The only other way a thread could hold a
		reference is when, while invalidating a tablespace, we
		buffer-fix the prev_page in the LRU to avoid relocation
		during the scan; but that is not possible here, because
		we are holding the buf_pool mutex.

		2) This is not possible, because buf_page_init_for_read()
		looks up the page_hash while holding the buf_pool mutex;
		since we are holding the buf_pool mutex here, by the time
		we release it in the caller we will have inserted the
		compressed-only descriptor into the page_hash. */
| 2093 | rw_lock_x_unlock(hash_lock); |
| 2094 | mutex_exit(&((buf_block_t*) bpage)->mutex); |
| 2095 | |
| 2096 | if (zip && bpage->zip.data) { |
| 2097 | /* Free the compressed page. */ |
| 2098 | void* data = bpage->zip.data; |
| 2099 | bpage->zip.data = NULL; |
| 2100 | |
| 2101 | ut_ad(!bpage->in_free_list); |
| 2102 | ut_ad(!bpage->in_flush_list); |
| 2103 | ut_ad(!bpage->in_LRU_list); |
| 2104 | buf_pool_mutex_exit_forbid(buf_pool); |
| 2105 | |
| 2106 | buf_buddy_free(buf_pool, data, bpage->size.physical()); |
| 2107 | |
| 2108 | buf_pool_mutex_exit_allow(buf_pool); |
| 2109 | |
| 2110 | page_zip_set_size(&bpage->zip, 0); |
| 2111 | |
| 2112 | bpage->size.copy_from( |
| 2113 | page_size_t(bpage->size.logical(), |
| 2114 | bpage->size.logical(), |
| 2115 | false)); |
| 2116 | } |
| 2117 | |
| 2118 | return(true); |
| 2119 | |
| 2120 | case BUF_BLOCK_POOL_WATCH: |
| 2121 | case BUF_BLOCK_ZIP_DIRTY: |
| 2122 | case BUF_BLOCK_NOT_USED: |
| 2123 | case BUF_BLOCK_READY_FOR_USE: |
| 2124 | case BUF_BLOCK_MEMORY: |
| 2125 | case BUF_BLOCK_REMOVE_HASH: |
| 2126 | break; |
| 2127 | } |
| 2128 | |
| 2129 | ut_error; |
| 2130 | return(false); |
| 2131 | } |
| 2132 | |
| 2133 | /******************************************************************//** |
Puts a file page that has no hash index onto the free list. */
| 2135 | static |
| 2136 | void |
| 2137 | buf_LRU_block_free_hashed_page( |
| 2138 | /*===========================*/ |
| 2139 | buf_block_t* block) /*!< in: block, must contain a file page and |
| 2140 | be in a state where it can be freed */ |
| 2141 | { |
| 2142 | buf_pool_t* buf_pool = buf_pool_from_block(block); |
| 2143 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 2144 | |
| 2145 | buf_page_mutex_enter(block); |
| 2146 | |
| 2147 | if (buf_pool->flush_rbt == NULL) { |
| 2148 | block->page.id.reset(); |
| 2149 | } |
| 2150 | |
| 2151 | buf_block_set_state(block, BUF_BLOCK_MEMORY); |
| 2152 | |
| 2153 | buf_LRU_block_free_non_file_page(block); |
| 2154 | buf_page_mutex_exit(block); |
| 2155 | } |
| 2156 | |
| 2157 | /******************************************************************//** |
Removes one page from the LRU list and puts it on the free list. */
| 2159 | void |
| 2160 | buf_LRU_free_one_page( |
| 2161 | /*==================*/ |
| 2162 | buf_page_t* bpage) /*!< in/out: block, must contain a file page and |
| 2163 | be in a state where it can be freed; there |
| 2164 | may or may not be a hash index to the page */ |
| 2165 | { |
| 2166 | buf_pool_t* buf_pool = buf_pool_from_bpage(bpage); |
| 2167 | |
| 2168 | rw_lock_t* hash_lock = buf_page_hash_lock_get(buf_pool, bpage->id); |
| 2169 | BPageMutex* block_mutex = buf_page_get_mutex(bpage); |
| 2170 | |
| 2171 | ut_ad(buf_pool_mutex_own(buf_pool)); |
| 2172 | |
| 2173 | rw_lock_x_lock(hash_lock); |
| 2174 | mutex_enter(block_mutex); |
| 2175 | |
| 2176 | if (buf_LRU_block_remove_hashed(bpage, true)) { |
| 2177 | buf_LRU_block_free_hashed_page((buf_block_t*) bpage); |
| 2178 | } |
| 2179 | |
| 2180 | /* buf_LRU_block_remove_hashed() releases hash_lock and block_mutex */ |
| 2181 | ut_ad(!rw_lock_own(hash_lock, RW_LOCK_X) |
| 2182 | && !rw_lock_own(hash_lock, RW_LOCK_S)); |
| 2183 | |
| 2184 | ut_ad(!mutex_own(block_mutex)); |
| 2185 | } |
| 2186 | |
| 2187 | /**********************************************************************//** |
| 2188 | Updates buf_pool->LRU_old_ratio for one buffer pool instance. |
| 2189 | @return updated old_pct */ |
| 2190 | static |
| 2191 | uint |
| 2192 | buf_LRU_old_ratio_update_instance( |
| 2193 | /*==============================*/ |
| 2194 | buf_pool_t* buf_pool,/*!< in: buffer pool instance */ |
| 2195 | uint old_pct,/*!< in: Reserve this percentage of |
| 2196 | the buffer pool for "old" blocks. */ |
| 2197 | ibool adjust) /*!< in: TRUE=adjust the LRU list; |
| 2198 | FALSE=just assign buf_pool->LRU_old_ratio |
| 2199 | during the initialization of InnoDB */ |
| 2200 | { |
| 2201 | uint ratio; |
| 2202 | |
| 2203 | ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100; |
| 2204 | if (ratio < BUF_LRU_OLD_RATIO_MIN) { |
| 2205 | ratio = BUF_LRU_OLD_RATIO_MIN; |
| 2206 | } else if (ratio > BUF_LRU_OLD_RATIO_MAX) { |
| 2207 | ratio = BUF_LRU_OLD_RATIO_MAX; |
| 2208 | } |
| 2209 | |
| 2210 | if (adjust) { |
| 2211 | buf_pool_mutex_enter(buf_pool); |
| 2212 | |
| 2213 | if (ratio != buf_pool->LRU_old_ratio) { |
| 2214 | buf_pool->LRU_old_ratio = ratio; |
| 2215 | |
| 2216 | if (UT_LIST_GET_LEN(buf_pool->LRU) |
| 2217 | >= BUF_LRU_OLD_MIN_LEN) { |
| 2218 | |
| 2219 | buf_LRU_old_adjust_len(buf_pool); |
| 2220 | } |
| 2221 | } |
| 2222 | |
| 2223 | buf_pool_mutex_exit(buf_pool); |
| 2224 | } else { |
| 2225 | buf_pool->LRU_old_ratio = ratio; |
| 2226 | } |
| 2227 | /* the reverse of |
| 2228 | ratio = old_pct * BUF_LRU_OLD_RATIO_DIV / 100 */ |
| 2229 | return((uint) (ratio * 100 / (double) BUF_LRU_OLD_RATIO_DIV + 0.5)); |
| 2230 | } |
| 2231 | |
| 2232 | /**********************************************************************//** |
| 2233 | Updates buf_pool->LRU_old_ratio. |
| 2234 | @return updated old_pct */ |
| 2235 | uint |
| 2236 | buf_LRU_old_ratio_update( |
| 2237 | /*=====================*/ |
| 2238 | uint old_pct,/*!< in: Reserve this percentage of |
| 2239 | the buffer pool for "old" blocks. */ |
| 2240 | ibool adjust) /*!< in: TRUE=adjust the LRU list; |
| 2241 | FALSE=just assign buf_pool->LRU_old_ratio |
| 2242 | during the initialization of InnoDB */ |
| 2243 | { |
| 2244 | uint new_ratio = 0; |
| 2245 | |
| 2246 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
| 2247 | buf_pool_t* buf_pool; |
| 2248 | |
| 2249 | buf_pool = buf_pool_from_array(i); |
| 2250 | |
| 2251 | new_ratio = buf_LRU_old_ratio_update_instance( |
| 2252 | buf_pool, old_pct, adjust); |
| 2253 | } |
| 2254 | |
| 2255 | return(new_ratio); |
| 2256 | } |
| 2257 | |
| 2258 | /********************************************************************//** |
Updates, at the end of each interval, the historical stats that we
collect for the LRU eviction policy. */
| 2261 | void |
| 2262 | buf_LRU_stat_update(void) |
| 2263 | /*=====================*/ |
| 2264 | { |
| 2265 | buf_LRU_stat_t* item; |
| 2266 | buf_pool_t* buf_pool; |
	bool		evict_started = false;
| 2268 | buf_LRU_stat_t cur_stat; |
| 2269 | |
| 2270 | /* If we haven't started eviction yet then don't update stats. */ |
| 2271 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
| 2272 | |
| 2273 | buf_pool = buf_pool_from_array(i); |
| 2274 | |
| 2275 | if (buf_pool->freed_page_clock != 0) { |
| 2276 | evict_started = true; |
| 2277 | break; |
| 2278 | } |
| 2279 | } |
| 2280 | |
| 2281 | if (!evict_started) { |
| 2282 | goto func_exit; |
| 2283 | } |
| 2284 | |
| 2285 | /* Update the index. */ |
| 2286 | item = &buf_LRU_stat_arr[buf_LRU_stat_arr_ind]; |
| 2287 | buf_LRU_stat_arr_ind++; |
| 2288 | buf_LRU_stat_arr_ind %= BUF_LRU_STAT_N_INTERVAL; |
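
	/* buf_LRU_stat_arr is treated as a circular buffer of the last
	BUF_LRU_STAT_N_INTERVAL intervals; item points at the oldest
	entry, which is about to be overwritten. */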
| 2289 | |
	/* Add the current value and subtract the obsolete entry.
	Since buf_LRU_stat_cur is not protected by any mutex,
	it can change between being added to buf_LRU_stat_sum
	and being copied to item. Assign it to a local variable
	to make sure that the same value is used for both
	buf_LRU_stat_sum and item. */
| 2296 | cur_stat = buf_LRU_stat_cur; |
| 2297 | |
| 2298 | buf_LRU_stat_sum.io += cur_stat.io - item->io; |
| 2299 | buf_LRU_stat_sum.unzip += cur_stat.unzip - item->unzip; |
| 2300 | |
| 2301 | /* Put current entry in the array. */ |
| 2302 | memcpy(item, &cur_stat, sizeof *item); |
| 2303 | |
| 2304 | func_exit: |
| 2305 | /* Clear the current entry. */ |
| 2306 | memset(&buf_LRU_stat_cur, 0, sizeof buf_LRU_stat_cur); |
| 2307 | } |
| 2308 | |
| 2309 | #if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
| 2310 | /**********************************************************************//** |
| 2311 | Validates the LRU list for one buffer pool instance. */ |
| 2312 | static |
| 2313 | void |
| 2314 | buf_LRU_validate_instance( |
| 2315 | /*======================*/ |
| 2316 | buf_pool_t* buf_pool) |
| 2317 | { |
| 2318 | ulint old_len; |
| 2319 | ulint new_len; |
| 2320 | |
| 2321 | buf_pool_mutex_enter(buf_pool); |
| 2322 | |
| 2323 | if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) { |
| 2324 | |
| 2325 | ut_a(buf_pool->LRU_old); |
| 2326 | old_len = buf_pool->LRU_old_len; |
| 2327 | |
| 2328 | new_len = ut_min(UT_LIST_GET_LEN(buf_pool->LRU) |
| 2329 | * buf_pool->LRU_old_ratio |
| 2330 | / BUF_LRU_OLD_RATIO_DIV, |
| 2331 | UT_LIST_GET_LEN(buf_pool->LRU) |
| 2332 | - (BUF_LRU_OLD_TOLERANCE |
| 2333 | + BUF_LRU_NON_OLD_MIN_LEN)); |
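
		/* The actual length of the old sublist may deviate from
		the target computed above by at most BUF_LRU_OLD_TOLERANCE
		blocks; buf_LRU_old_adjust_len() maintains this invariant. */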
| 2334 | |
| 2335 | ut_a(old_len >= new_len - BUF_LRU_OLD_TOLERANCE); |
| 2336 | ut_a(old_len <= new_len + BUF_LRU_OLD_TOLERANCE); |
| 2337 | } |
| 2338 | |
| 2339 | CheckInLRUList::validate(buf_pool); |
| 2340 | |
| 2341 | old_len = 0; |
| 2342 | |
| 2343 | for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool->LRU); |
| 2344 | bpage != NULL; |
| 2345 | bpage = UT_LIST_GET_NEXT(LRU, bpage)) { |
| 2346 | |
| 2347 | switch (buf_page_get_state(bpage)) { |
| 2348 | case BUF_BLOCK_POOL_WATCH: |
| 2349 | case BUF_BLOCK_NOT_USED: |
| 2350 | case BUF_BLOCK_READY_FOR_USE: |
| 2351 | case BUF_BLOCK_MEMORY: |
| 2352 | case BUF_BLOCK_REMOVE_HASH: |
| 2353 | ut_error; |
| 2354 | break; |
		case BUF_BLOCK_FILE_PAGE:
			ut_ad(((buf_block_t*) bpage)->in_unzip_LRU_list
			      == buf_page_belongs_to_unzip_LRU(bpage));
			/* fall through */
| 2358 | case BUF_BLOCK_ZIP_PAGE: |
| 2359 | case BUF_BLOCK_ZIP_DIRTY: |
| 2360 | break; |
| 2361 | } |
| 2362 | |
| 2363 | if (buf_page_is_old(bpage)) { |
| 2364 | const buf_page_t* prev |
| 2365 | = UT_LIST_GET_PREV(LRU, bpage); |
| 2366 | const buf_page_t* next |
| 2367 | = UT_LIST_GET_NEXT(LRU, bpage); |
| 2368 | |
| 2369 | if (!old_len++) { |
| 2370 | ut_a(buf_pool->LRU_old == bpage); |
| 2371 | } else { |
| 2372 | ut_a(!prev || buf_page_is_old(prev)); |
| 2373 | } |
| 2374 | |
| 2375 | ut_a(!next || buf_page_is_old(next)); |
| 2376 | } |
| 2377 | } |
| 2378 | |
| 2379 | ut_a(buf_pool->LRU_old_len == old_len); |
| 2380 | |
| 2381 | CheckInFreeList::validate(buf_pool); |
| 2382 | |
| 2383 | for (buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool->free); |
| 2384 | bpage != NULL; |
| 2385 | bpage = UT_LIST_GET_NEXT(list, bpage)) { |
| 2386 | |
| 2387 | ut_a(buf_page_get_state(bpage) == BUF_BLOCK_NOT_USED); |
| 2388 | } |
| 2389 | |
| 2390 | CheckUnzipLRUAndLRUList::validate(buf_pool); |
| 2391 | |
| 2392 | for (buf_block_t* block = UT_LIST_GET_FIRST(buf_pool->unzip_LRU); |
| 2393 | block != NULL; |
| 2394 | block = UT_LIST_GET_NEXT(unzip_LRU, block)) { |
| 2395 | |
| 2396 | ut_ad(block->in_unzip_LRU_list); |
| 2397 | ut_ad(block->page.in_LRU_list); |
| 2398 | ut_a(buf_page_belongs_to_unzip_LRU(&block->page)); |
| 2399 | } |
| 2400 | |
| 2401 | buf_pool_mutex_exit(buf_pool); |
| 2402 | } |
| 2403 | |
| 2404 | /**********************************************************************//** |
| 2405 | Validates the LRU list. |
| 2406 | @return TRUE */ |
| 2407 | ibool |
| 2408 | buf_LRU_validate(void) |
| 2409 | /*==================*/ |
| 2410 | { |
| 2411 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
| 2412 | buf_pool_t* buf_pool; |
| 2413 | |
| 2414 | buf_pool = buf_pool_from_array(i); |
| 2415 | buf_LRU_validate_instance(buf_pool); |
| 2416 | } |
| 2417 | |
| 2418 | return(TRUE); |
| 2419 | } |
| 2420 | #endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */ |
| 2421 | |
| 2422 | #if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG |
| 2423 | /**********************************************************************//** |
| 2424 | Prints the LRU list for one buffer pool instance. */ |
| 2425 | static |
| 2426 | void |
| 2427 | buf_LRU_print_instance( |
| 2428 | /*===================*/ |
| 2429 | buf_pool_t* buf_pool) |
| 2430 | { |
| 2431 | buf_pool_mutex_enter(buf_pool); |
| 2432 | |
| 2433 | for (const buf_page_t* bpage = UT_LIST_GET_FIRST(buf_pool->LRU); |
| 2434 | bpage != NULL; |
| 2435 | bpage = UT_LIST_GET_NEXT(LRU, bpage)) { |
| 2436 | |
| 2437 | mutex_enter(buf_page_get_mutex(bpage)); |
| 2438 | |
		fprintf(stderr, "BLOCK space %u page %u ",
| 2440 | bpage->id.space(), bpage->id.page_no()); |
| 2441 | |
| 2442 | if (buf_page_is_old(bpage)) { |
			fputs("old ", stderr);
| 2444 | } |
| 2445 | |
| 2446 | if (bpage->buf_fix_count) { |
			fprintf(stderr, "buf_fix count %u ",
| 2448 | bpage->buf_fix_count); |
| 2449 | } |
| 2450 | |
| 2451 | if (buf_page_get_io_fix(bpage)) { |
			fprintf(stderr, "io_fix %d ",
| 2453 | buf_page_get_io_fix(bpage)); |
| 2454 | } |
| 2455 | |
| 2456 | if (bpage->oldest_modification) { |
			fputs("modif. ", stderr);
| 2458 | } |
| 2459 | |
| 2460 | switch (buf_page_get_state(bpage)) { |
| 2461 | const byte* frame; |
| 2462 | case BUF_BLOCK_FILE_PAGE: |
| 2463 | frame = buf_block_get_frame((buf_block_t*) bpage); |
			fprintf(stderr, "\ntype %u index id " IB_ID_FMT "\n",
| 2465 | fil_page_get_type(frame), |
| 2466 | btr_page_get_index_id(frame)); |
| 2467 | break; |
| 2468 | case BUF_BLOCK_ZIP_PAGE: |
| 2469 | frame = bpage->zip.data; |
			fprintf(stderr, "\ntype %u size " ULINTPF
				" index id " IB_ID_FMT "\n",
| 2472 | fil_page_get_type(frame), |
| 2473 | bpage->size.physical(), |
| 2474 | btr_page_get_index_id(frame)); |
| 2475 | break; |
| 2476 | |
| 2477 | default: |
			fprintf(stderr, "\n!state %d!\n",
| 2479 | buf_page_get_state(bpage)); |
| 2480 | break; |
| 2481 | } |
| 2482 | |
| 2483 | mutex_exit(buf_page_get_mutex(bpage)); |
| 2484 | } |
| 2485 | |
| 2486 | buf_pool_mutex_exit(buf_pool); |
| 2487 | } |
| 2488 | |
| 2489 | /**********************************************************************//** |
| 2490 | Prints the LRU list. */ |
| 2491 | void |
| 2492 | buf_LRU_print(void) |
| 2493 | /*===============*/ |
| 2494 | { |
| 2495 | for (ulint i = 0; i < srv_buf_pool_instances; i++) { |
| 2496 | buf_pool_t* buf_pool; |
| 2497 | |
| 2498 | buf_pool = buf_pool_from_array(i); |
| 2499 | buf_LRU_print_instance(buf_pool); |
| 2500 | } |
| 2501 | } |
| 2502 | #endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */ |
| 2503 | |