/*****************************************************************************

Copyright (c) 1995, 2018, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2008, Google Inc.
Copyright (c) 2013, 2018, MariaDB Corporation.

Portions of this file contain modifications contributed and copyrighted by
Google, Inc. Those modifications are gratefully acknowledged and are described
briefly in the InnoDB documentation. The contributions by Google are
incorporated with their permission, and subject to the conditions contained in
the file COPYING.Google.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file buf/buf0buf.cc
The database buffer buf_pool

Created 11/5/1995 Heikki Tuuri
*******************************************************/

#include "univ.i"
#include "mtr0types.h"
#include "mach0data.h"
#include "page0size.h"
#include "buf0buf.h"
#include <string.h>

#ifdef UNIV_NONINL
#include "buf0buf.ic"
#endif

#ifndef UNIV_INNOCHECKSUM
#include "mem0mem.h"
#include "btr0btr.h"
#include "fil0fil.h"
#include "fil0crypt.h"
#include "fsp0sysspace.h"
#include "buf0buddy.h"
#include "lock0lock.h"
#include "sync0rw.h"
#include "btr0sea.h"
#include "ibuf0ibuf.h"
#include "trx0undo.h"
#include "trx0purge.h"
#include "log0log.h"
#include "dict0stats_bg.h"
#include "srv0srv.h"
#include "srv0start.h"
#include "dict0dict.h"
#include "log0recv.h"
#include "srv0mon.h"
65#include "fsp0sysspace.h"
#endif /* !UNIV_INNOCHECKSUM */
#include "page0zip.h"
#include "sync0sync.h"
#include "buf0dump.h"
#include "ut0new.h"
#include <new>
#include <map>
#include <sstream>
#ifndef UNIV_INNOCHECKSUM
#include "fil0pagecompress.h"
#include "fsp0pagecompress.h"
#endif
#include "ha_prototypes.h"
#include "ut0byte.h"

#ifdef UNIV_LINUX
#include <stdlib.h>
#endif

#ifdef HAVE_LZO
#include "lzo/lzo1x.h"
#endif

#ifdef HAVE_LIBNUMA
#include <numa.h>
#include <numaif.h>
struct set_numa_interleave_t
{
	set_numa_interleave_t()
	{
		if (srv_numa_interleave) {

			struct bitmask *numa_mems_allowed = numa_get_mems_allowed();
			ib::info() << "Setting NUMA memory policy to"
				" MPOL_INTERLEAVE";
			if (set_mempolicy(MPOL_INTERLEAVE,
					  numa_mems_allowed->maskp,
					  numa_mems_allowed->size) != 0) {

				ib::warn() << "Failed to set NUMA memory"
					" policy to MPOL_INTERLEAVE: "
					<< strerror(errno);
			}
		}
	}

	~set_numa_interleave_t()
	{
		if (srv_numa_interleave) {

			ib::info() << "Setting NUMA memory policy to"
				" MPOL_DEFAULT";
			if (set_mempolicy(MPOL_DEFAULT, NULL, 0) != 0) {
				ib::warn() << "Failed to set NUMA memory"
					" policy to MPOL_DEFAULT: "
					<< strerror(errno);
			}
		}
	}
};

#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE set_numa_interleave_t scoped_numa
#else
#define NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE
#endif /* HAVE_LIBNUMA */

#ifdef HAVE_SNAPPY
#include "snappy-c.h"
#endif

inline void* aligned_malloc(size_t size, size_t align) {
	void *result;
#ifdef _MSC_VER
	result = _aligned_malloc(size, align);
#elif defined (HAVE_POSIX_MEMALIGN)
	if (posix_memalign(&result, align, size)) {
		result = 0;
	}
#else
	/* Use unaligned malloc as fallback */
	result = malloc(size);
#endif
	return result;
}

inline void aligned_free(void *ptr) {
#ifdef _MSC_VER
	_aligned_free(ptr);
#else
	free(ptr);
#endif
}
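
/* A minimal usage sketch for the two helpers above (illustrative
only; the size and alignment are made-up values):

	void*	buf = aligned_malloc(16384, 4096);
	if (buf != NULL) {
		memset(buf, 0, 16384);
		aligned_free(buf);
	}

Note that the fallback branch above returns plain malloc() memory,
so callers must not depend on the alignment being honoured on every
platform. */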

/*
		IMPLEMENTATION OF THE BUFFER POOL
		=================================

Performance improvement:
------------------------
Thread scheduling in NT may be so slow that the OS wait mechanism should
not be used even in waiting for disk reads to complete.
Rather, we should put waiting query threads to the queue of
waiting jobs, and let the OS thread do something useful while the i/o
is processed. In this way we could remove most OS thread switches in
an i/o-intensive benchmark like TPC-C.

A possibility is to put a user space thread library between the database
and NT. User space thread libraries might be very fast.

SQL Server 7.0 can be configured to use 'fibers' which are lightweight
threads in NT. These should be studied.

		Buffer frames and blocks
		------------------------
Following the terminology of Gray and Reuter, we call the memory
blocks where file pages are loaded buffer frames. For each buffer
frame there is a control block, or shortly, a block, in the buffer
control array. The control info which does not need to be stored
in the file along with the file page, resides in the control block.

		Buffer pool struct
		------------------
The buffer buf_pool contains a single mutex which protects all the
control data structures of the buf_pool. The content of a buffer frame is
protected by a separate read-write lock in its control block, though.
These locks can be locked and unlocked without owning the buf_pool->mutex.
The OS events in the buf_pool struct can be waited for without owning the
buf_pool->mutex.

The buf_pool->mutex is a hot-spot in main memory, causing a lot of
memory bus traffic on multiprocessor systems when processors
alternately access the mutex. On our Pentium, the mutex is accessed
maybe every 10 microseconds. We gave up the solution to have mutexes
for each control block, for instance, because it seemed to be
complicated.

A solution to reduce mutex contention of the buf_pool->mutex is to
create a separate mutex for the page hash table. On Pentium,
accessing the hash table takes 2 microseconds, about half
of the total buf_pool->mutex hold time.

		Control blocks
		--------------

The control block contains, for instance, the bufferfix count
which is incremented when a thread wants a file page to be fixed
in a buffer frame. The bufferfix operation does not lock the
contents of the frame, however. For this purpose, the control
block contains a read-write lock.

The buffer frames have to be aligned so that the start memory
address of a frame is divisible by the universal page size, which
is a power of two.

We intend to make the buffer buf_pool size on-line reconfigurable,
that is, the buf_pool size can be changed without closing the database.
Then the database administrator may adjust it to be bigger
at night, for example. The control block array must
contain enough control blocks for the maximum buffer buf_pool size
which is used in the particular database.
If the buf_pool size is cut, we exploit the virtual memory mechanism of
the OS, and just refrain from using frames at high addresses. Then the OS
can swap them to disk.

The control blocks containing file pages are put to a hash table
according to the file address of the page.
We could speed up the access to an individual page by using
"pointer swizzling": we could replace the page references on
non-leaf index pages by direct pointers to the page, if it exists
in the buf_pool. We could make a separate hash table where we could
chain all the page references in non-leaf pages residing in the buf_pool,
using the page reference as the hash key,
and at the time of reading of a page update the pointers accordingly.
Drawbacks of this solution are added complexity and,
possibly, extra space required on non-leaf pages for memory pointers.
A simpler solution is just to speed up the hash table mechanism
in the database, using tables whose size is a power of 2.
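
For example, with a power-of-2 table size the hash cell can be picked
with a bitwise AND instead of a modulo operation (an illustrative
sketch only, not the actual fold function):

	fold = (space_id << 20) + space_id + page_no;
	cell = fold & (table_size - 1);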

		Lists of blocks
		---------------

There are several lists of control blocks.

The free list (buf_pool->free) contains blocks which are currently not
used.

The common LRU list contains all the blocks holding a file page
except those for which the bufferfix count is non-zero.
The pages are in the LRU list roughly in the order of the last
access to the page, so that the oldest pages are at the end of the
list. We also keep a pointer to near the end of the LRU list,
which we can use when we want to artificially age a page in the
buf_pool. This is used if we know that some page is not needed
again for some time: we insert the block right after the pointer,
causing it to be replaced sooner than would normally be the case.
Currently this aging mechanism is used for the read-ahead mechanism
of pages, and it can also be used when there is a scan of a full
table which cannot fit in the memory. Putting the pages near the
end of the LRU list, we make sure that most of the buf_pool stays
in the main memory, undisturbed.
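
In pseudocode, inserting a block at the aging pointer looks roughly
like this (an illustrative sketch; the real logic lives in
buf_LRU_add_block() in buf0lru.cc):

	if (!old) {
		UT_LIST_ADD_FIRST(buf_pool->LRU, bpage);
	} else {
		insert bpage just behind buf_pool->LRU_old, so that it
		must prove itself useful before reaching the young end;
	}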

The unzip_LRU list contains a subset of the common LRU list. The
blocks on the unzip_LRU list hold a compressed file page and the
corresponding uncompressed page frame. A block is in unzip_LRU if and
only if the predicate buf_page_belongs_to_unzip_LRU(&block->page)
holds. The blocks in unzip_LRU will be in same order as they are in
the common LRU list. That is, each manipulation of the common LRU
list will result in the same manipulation of the unzip_LRU list.

The chain of modified blocks (buf_pool->flush_list) contains the blocks
holding file pages that have been modified in the memory
but not written to disk yet. The block with the oldest modification
which has not yet been written to disk is at the end of the chain.
The access to this list is protected by buf_pool->flush_list_mutex.

The chain of unmodified compressed blocks (buf_pool->zip_clean)
contains the control blocks (buf_page_t) of those compressed pages
that are not in buf_pool->flush_list and for which no uncompressed
page has been allocated in the buffer pool. The control blocks for
uncompressed pages are accessible via buf_block_t objects that are
reachable via buf_pool->chunks[].

The chains of free memory blocks (buf_pool->zip_free[]) are used by
the buddy allocator (buf0buddy.cc) to keep track of currently unused
memory blocks of size sizeof(buf_page_t)..srv_page_size / 2. These
blocks are inside the srv_page_size-sized memory blocks of type
BUF_BLOCK_MEMORY that the buddy allocator requests from the buffer
pool. The buddy allocator is solely used for allocating control
blocks for compressed pages (buf_page_t) and compressed page frames.

		Loading a file page
		-------------------

First, a victim block for replacement has to be found in the
buf_pool. It is taken from the free list or searched for from the
end of the LRU-list. An exclusive lock is reserved for the frame,
the io_fix field is set in the block fixing the block in buf_pool,
and the io-operation for loading the page is queued. The io-handler thread
releases the X-lock on the frame and resets the io_fix field
when the io operation completes.

A thread may request the above operation using the function
buf_page_get(). It may then continue to request a lock on the frame.
The lock is granted when the io-handler releases the x-lock.

		Read-ahead
		----------

The read-ahead mechanism is intended to be intelligent and
isolated from the semantically higher levels of the database
index management. From the higher level we only need the
information if a file page has a natural successor or
predecessor page. On the leaf level of a B-tree index,
these are the next and previous pages in the natural
order of the pages.

Let us first explain the read-ahead mechanism when the leaves
of a B-tree are scanned in an ascending or descending order.
When a page is referenced in the buf_pool for the first time,
the buffer manager checks if it is at the border of a so-called
linear read-ahead area. The tablespace is divided into these
areas of size 64 blocks, for example. So if the page is at the
border of such an area, the read-ahead mechanism checks if
all the other blocks in the area have been accessed in an
ascending or descending order. If this is the case, the system
looks at the natural successor or predecessor of the page,
checks if that is at the border of another area, and in this case
issues read-requests for all the pages in that area. Maybe
we could relax the condition that all the pages in the area
have to be accessed: if data is deleted from a table, there may
appear holes of unused pages in the area.
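
In pseudocode, the border test looks roughly like this (an
illustrative sketch; the real implementation is
buf_read_ahead_linear() in buf0rea.cc):

	low  = (page_no / 64) * 64;	the start of the area
	high = low + 64;		one past the end of the area
	if (page_no == low || page_no == high - 1) {
		if all pages in [low, high) were accessed in order,
		issue asynchronous reads for the next area;
	}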

A different read-ahead mechanism is used when there appears
to be a random access pattern to a file.
If a new page is referenced in the buf_pool, and several pages
of its random access area (for instance, 32 consecutive pages
in a tablespace) have recently been referenced, we may predict
that the whole area may be needed in the near future, and issue
the read requests for the whole area.
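
Again in pseudocode (an illustrative sketch; see
buf_read_ahead_random() in buf0rea.cc for the real implementation):

	low = (page_no / 32) * 32;
	if (enough of the 32 pages in [low, low + 32) have been
	    accessed recently) {
		issue asynchronous reads for all pages in the area;
	}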
*/

#ifndef UNIV_INNOCHECKSUM
/** Value in microseconds */
static const int	WAIT_FOR_READ = 100;
static const int	WAIT_FOR_WRITE = 100;
/** Number of attempts made to read in a page in the buffer pool */
static const ulint	BUF_PAGE_READ_MAX_RETRIES = 100;
/** Number of pages to read ahead */
static const ulint	BUF_READ_AHEAD_PAGES = 64;
/** The maximum portion of the buffer pool that can be used for the
read-ahead buffer. (Divide buf_pool size by this amount) */
static const ulint	BUF_READ_AHEAD_PORTION = 32;

/** The buffer pools of the database */
buf_pool_t*	buf_pool_ptr;

/** true when resizing buffer pool is in the critical path. */
volatile bool	buf_pool_resizing;

/** true when withdrawing buffer pool pages might cause page relocation */
volatile bool	buf_pool_withdrawing;

/** The clock is incremented every time a pointer to a page may become
obsolete; if the withdraw clock has not changed, the pointer is still
valid in the buffer pool. If it has changed, the pointer might not be
in the buffer pool any more. */
volatile ulint	buf_withdraw_clock;
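
/* A typical (illustrative) use of the clock: remember its value when
caching a pointer to a block, and before dereferencing the pointer
later, compare the remembered value with the current one (see
buf_pool_is_obsolete()):

	ulint	clock = buf_withdraw_clock;
	...
	if (clock != buf_withdraw_clock) {
		the block may have been withdrawn or relocated;
		look up the page again instead of using the pointer
	}
*/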

/** Map of buffer pool chunks by their first frame address.
This is rebuilt on buffer pool initialization and by buf_resize_thread.
Currently, updates do not need mutex protection. */
typedef std::map<
	const byte*,
	buf_chunk_t*,
	std::less<const byte*>,
	ut_allocator<std::pair<const byte* const, buf_chunk_t*> > >
	buf_pool_chunk_map_t;

static buf_pool_chunk_map_t*	buf_chunk_map_reg;

/** Chunk map to be used for lookups.
The map pointed to by this must not be updated. */
static buf_pool_chunk_map_t*	buf_chunk_map_ref = NULL;

#ifdef UNIV_DEBUG
/** Disable resizing of the buffer pool so that assertion code does not
become expensive. */
my_bool	buf_disable_resize_buffer_pool_debug = TRUE;
#endif /* UNIV_DEBUG */

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
/** This is used to insert validation operations in execution
in the debug version */
static ulint	buf_dbg_counter	= 0;
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

#if defined UNIV_PFS_MUTEX || defined UNIV_PFS_RWLOCK
# ifndef PFS_SKIP_BUFFER_MUTEX_RWLOCK

/* Buffer block mutexes and rwlocks can be registered
in one group rather than individually. If PFS_GROUP_BUFFER_SYNC
is defined, register buffer block mutex and rwlock
in one group after their initialization. */
#  define PFS_GROUP_BUFFER_SYNC

/* This define caps the number of mutexes/rwlocks that can
be registered with performance schema. Developers can
modify this define if necessary. Please note, this would
be effective only if PFS_GROUP_BUFFER_SYNC is defined. */
#  define PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER	ULINT_MAX

# endif /* !PFS_SKIP_BUFFER_MUTEX_RWLOCK */
#endif /* UNIV_PFS_MUTEX || UNIV_PFS_RWLOCK */

/** Macro to determine whether the read or write counter is used, depending
on the io_type */
#define MONITOR_RW_COUNTER(io_type, counter)		\
	((io_type == BUF_IO_READ)			\
	 ? (counter##_READ)				\
	 : (counter##_WRITTEN))
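
/* For example (illustrative only): MONITOR_RW_COUNTER(io_type,
MONITOR_INDEX_IBUF) expands to a run-time choice between
MONITOR_INDEX_IBUF_READ and MONITOR_INDEX_IBUF_WRITTEN, depending on
whether io_type is BUF_IO_READ; the token pasting happens at compile
time, the selection at run time. */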

/** Registers a chunk to buf_pool_chunk_map
@param[in]	chunk	chunk of buffers */
static
void
buf_pool_register_chunk(
	buf_chunk_t*	chunk)
{
	buf_chunk_map_reg->insert(buf_pool_chunk_map_t::value_type(
		chunk->blocks->frame, chunk));
}

/** Decrypt a page.
@param[in,out]	bpage	Page control block
@param[in,out]	space	tablespace
@return whether the operation was successful */
static
bool
buf_page_decrypt_after_read(buf_page_t* bpage, fil_space_t* space)
	MY_ATTRIBUTE((nonnull));

/********************************************************************//**
Gets the smallest oldest_modification lsn for any page in the pool. Returns
zero if all modified pages have been flushed to disk.
@return oldest modification in pool, zero if none */
lsn_t
buf_pool_get_oldest_modification(void)
/*==================================*/
{
	lsn_t	lsn = 0;
	lsn_t	oldest_lsn = 0;

	/* When we traverse all the flush lists we don't want another
	thread to add a dirty page to any flush list. */
	log_flush_order_mutex_enter();

	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		buf_flush_list_mutex_enter(buf_pool);

		buf_page_t*	bpage;

		/* We don't let a log checkpoint stall just because pages
		from the system temporary tablespace have not been flushed
		to disk yet: objects residing in the system temporary
		tablespace do not generate redo log anyway. */
		for (bpage = UT_LIST_GET_LAST(buf_pool->flush_list);
		     bpage != NULL
			     && fsp_is_system_temporary(bpage->id.space());
		     bpage = UT_LIST_GET_PREV(list, bpage)) {
			/* Do nothing. */
		}

		if (bpage != NULL) {
			ut_ad(bpage->in_flush_list);
			lsn = bpage->oldest_modification;
		}

		buf_flush_list_mutex_exit(buf_pool);

		if (!oldest_lsn || oldest_lsn > lsn) {
			oldest_lsn = lsn;
		}
	}

	log_flush_order_mutex_exit();

	/* The returned answer may be out of date: the flush_list can
	change after the mutex has been released. */

	return(oldest_lsn);
}

/********************************************************************//**
Get total buffer pool statistics. */
void
buf_get_total_list_len(
/*===================*/
	ulint*	LRU_len,	/*!< out: length of all LRU lists */
	ulint*	free_len,	/*!< out: length of all free lists */
	ulint*	flush_list_len)	/*!< out: length of all flush lists */
{
	ulint	i;

	*LRU_len = 0;
	*free_len = 0;
	*flush_list_len = 0;

	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		*LRU_len += UT_LIST_GET_LEN(buf_pool->LRU);
		*free_len += UT_LIST_GET_LEN(buf_pool->free);
		*flush_list_len += UT_LIST_GET_LEN(buf_pool->flush_list);
	}
}

/********************************************************************//**
Get total list size in bytes from all buffer pools. */
void
buf_get_total_list_size_in_bytes(
/*=============================*/
	buf_pools_list_size_t*	buf_pools_list_size)	/*!< out: list sizes
							in all buffer pools */
{
	ut_ad(buf_pools_list_size);
	memset(buf_pools_list_size, 0, sizeof(*buf_pools_list_size));

	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);
		/* We don't need mutex protection since this is
		for statistics purposes only */
		buf_pools_list_size->LRU_bytes += buf_pool->stat.LRU_bytes;
		buf_pools_list_size->unzip_LRU_bytes +=
			UT_LIST_GET_LEN(buf_pool->unzip_LRU)
			<< srv_page_size_shift;
		buf_pools_list_size->flush_list_bytes +=
			buf_pool->stat.flush_list_bytes;
	}
}

/********************************************************************//**
Get total buffer pool statistics. */
void
buf_get_total_stat(
/*===============*/
	buf_pool_stat_t*	tot_stat)	/*!< out: buffer pool stats */
{
	ulint	i;

	memset(tot_stat, 0, sizeof(*tot_stat));

	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_stat_t* buf_stat;
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);

		buf_stat = &buf_pool->stat;
		tot_stat->n_page_gets += buf_stat->n_page_gets;
		tot_stat->n_pages_read += buf_stat->n_pages_read;
		tot_stat->n_pages_written += buf_stat->n_pages_written;
		tot_stat->n_pages_created += buf_stat->n_pages_created;
		tot_stat->n_ra_pages_read_rnd += buf_stat->n_ra_pages_read_rnd;
		tot_stat->n_ra_pages_read += buf_stat->n_ra_pages_read;
		tot_stat->n_ra_pages_evicted += buf_stat->n_ra_pages_evicted;
		tot_stat->n_pages_made_young += buf_stat->n_pages_made_young;

		tot_stat->n_pages_not_made_young +=
			buf_stat->n_pages_not_made_young;
	}
}

/********************************************************************//**
Allocates a buffer block.
@return own: the allocated block, in state BUF_BLOCK_MEMORY */
buf_block_t*
buf_block_alloc(
/*============*/
	buf_pool_t*	buf_pool)	/*!< in/out: buffer pool instance,
					or NULL for round-robin selection
					of the buffer pool */
{
	buf_block_t*	block;
	ulint		index;
	static ulint	buf_pool_index;

	if (buf_pool == NULL) {
		/* We are allocating memory from any buffer pool; ensure
		we spread the load across all buffer pool instances. */
		index = buf_pool_index++ % srv_buf_pool_instances;
		buf_pool = buf_pool_from_array(index);
	}

	block = buf_LRU_get_free_block(buf_pool);

	buf_block_set_state(block, BUF_BLOCK_MEMORY);

	return(block);
}
#endif /* !UNIV_INNOCHECKSUM */

/** Checks if a page contains only zeroes.
@param[in]	read_buf	database page
@param[in]	page_size	page size
@return true if page is filled with zeroes */
bool
buf_page_is_zeroes(
	const byte*		read_buf,
	const page_size_t&	page_size)
{
	for (ulint i = 0; i < page_size.logical(); i++) {
		if (read_buf[i] != 0) {
			return(false);
		}
	}
	return(true);
}

/** Checks if the page is in crc32 checksum format.
@param[in]	read_buf		database page
@param[in]	checksum_field1		new checksum field
@param[in]	checksum_field2		old checksum field
@param[in]	use_legacy_big_endian	use legacy big endian algorithm
@return true if the page is in crc32 checksum format. */
bool
buf_page_is_checksum_valid_crc32(
	const byte*	read_buf,
	ulint		checksum_field1,
	ulint		checksum_field2,
	bool		use_legacy_big_endian)
{
	const uint32_t	crc32 = buf_calc_page_crc32(read_buf,
						    use_legacy_big_endian);

#ifdef UNIV_INNOCHECKSUM
	if (log_file
	    && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
		fprintf(log_file, "page::%llu;"
			" crc32 calculated = %u;"
			" recorded checksum field1 = " ULINTPF " recorded"
			" checksum field2 = " ULINTPF "\n", cur_page_num,
			crc32, checksum_field1, checksum_field2);
	}
#endif /* UNIV_INNOCHECKSUM */

	if (checksum_field1 != checksum_field2) {
		goto invalid;
	}

	if (checksum_field1 == crc32) {
		return(true);
	} else {
		const uint32_t	crc32_legacy = buf_calc_page_crc32(read_buf, true);

		if (checksum_field1 == crc32_legacy) {
			return(true);
		}
	}

invalid:
	DBUG_LOG("checksum", "Page checksum crc32 not valid"
		 << " field1 " << checksum_field1
		 << " field2 " << checksum_field2
		 << " crc32 " << crc32);
	return(false);
}

/** Checks if the page is in innodb checksum format.
@param[in]	read_buf	database page
@param[in]	checksum_field1	new checksum field
@param[in]	checksum_field2	old checksum field
@return true if the page is in innodb checksum format. */
bool
buf_page_is_checksum_valid_innodb(
	const byte*	read_buf,
	ulint		checksum_field1,
	ulint		checksum_field2)
{
	/* There are 2 valid formulas for
	checksum_field2 (old checksum field) which algo=innodb could have
	written to the page:

	1. Very old versions of InnoDB only stored 8 byte lsn to the
	start and the end of the page.

	2. Newer InnoDB versions store the old formula checksum
	(buf_calc_page_old_checksum()). */

	ulint	old_checksum = buf_calc_page_old_checksum(read_buf);
	ulint	new_checksum = buf_calc_page_new_checksum(read_buf);

#ifdef UNIV_INNOCHECKSUM
	if (log_file
	    && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_INNODB) {
		fprintf(log_file, "page::%llu;"
			" old style: calculated ="
			" " ULINTPF "; recorded = " ULINTPF "\n",
			cur_page_num, old_checksum,
			checksum_field2);
		fprintf(log_file, "page::%llu;"
			" new style: calculated ="
			" " ULINTPF "; crc32 = %u; recorded = " ULINTPF "\n",
			cur_page_num, new_checksum,
			buf_calc_page_crc32(read_buf), checksum_field1);
	}

	if (log_file
	    && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB) {
		fprintf(log_file, "page::%llu;"
			" old style: calculated ="
			" " ULINTPF "; recorded checksum = " ULINTPF "\n",
			cur_page_num, old_checksum,
			checksum_field2);
		fprintf(log_file, "page::%llu;"
			" new style: calculated ="
			" " ULINTPF "; recorded checksum = " ULINTPF "\n",
			cur_page_num, new_checksum,
			checksum_field1);
	}
#endif /* UNIV_INNOCHECKSUM */

	if (checksum_field2 != mach_read_from_4(read_buf + FIL_PAGE_LSN)
	    && checksum_field2 != old_checksum) {
		DBUG_LOG("checksum",
			 "Page checksum innodb (old style) not valid"
			 << " field1 " << checksum_field1
			 << " field2 " << checksum_field2
			 << " calculated " << old_checksum
			 << " lsn " << mach_read_from_4(
				 read_buf + FIL_PAGE_LSN));
		return(false);
	}

	/* The old field is fine; check the new field. */

	/* InnoDB versions < 4.0.14 and < 4.1.1 stored the space id
	(which was always 0) in FIL_PAGE_SPACE_OR_CHKSUM. */

	if (checksum_field1 != 0 && checksum_field1 != new_checksum) {
		DBUG_LOG("checksum",
			 "Page checksum innodb (new style) not valid"
			 << " field1 " << checksum_field1
			 << " field2 " << checksum_field2
			 << " calculated " << new_checksum
			 << " lsn " << mach_read_from_4(
				 read_buf + FIL_PAGE_LSN));
		return(false);
	}

	return(true);
}

/** Checks if the page is in none checksum format.
@param[in]	read_buf	database page
@param[in]	checksum_field1	new checksum field
@param[in]	checksum_field2	old checksum field
@return true if the page is in none checksum format. */
bool
buf_page_is_checksum_valid_none(
	const byte*	read_buf,
	ulint		checksum_field1,
	ulint		checksum_field2)
{
#ifndef DBUG_OFF
	if (checksum_field1 != checksum_field2
	    && checksum_field1 != BUF_NO_CHECKSUM_MAGIC) {
		DBUG_LOG("checksum",
			 "Page checksum none not valid"
			 << " field1 " << checksum_field1
			 << " field2 " << checksum_field2
			 << " magic " << BUF_NO_CHECKSUM_MAGIC
			 << " lsn " << mach_read_from_4(read_buf
							+ FIL_PAGE_LSN));
	}
#endif /* DBUG_OFF */

#ifdef UNIV_INNOCHECKSUM
	if (log_file
	    && srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_STRICT_NONE) {
		fprintf(log_file,
			"page::%llu; none checksum: calculated"
			" = %lu; recorded checksum_field1 = " ULINTPF
			" recorded checksum_field2 = " ULINTPF "\n",
			cur_page_num, BUF_NO_CHECKSUM_MAGIC,
			checksum_field1, checksum_field2);
	}
#endif /* UNIV_INNOCHECKSUM */

	return(checksum_field1 == checksum_field2
	       && checksum_field1 == BUF_NO_CHECKSUM_MAGIC);
}

/** Check if a page is corrupt.
@param[in]	check_lsn	whether the LSN should be checked
@param[in]	read_buf	database page
@param[in]	page_size	page size
@param[in]	space		tablespace
@return whether the page is corrupted */
bool
buf_page_is_corrupted(
	bool			check_lsn,
	const byte*		read_buf,
	const page_size_t&	page_size,
#ifndef UNIV_INNOCHECKSUM
	const fil_space_t*	space)
#else
	const void*		space)
#endif
{
	size_t		checksum_field1 = 0;
	size_t		checksum_field2 = 0;
#ifndef UNIV_INNOCHECKSUM
	DBUG_EXECUTE_IF("buf_page_import_corrupt_failure", return(true); );
#endif
	ulint		page_type = mach_read_from_2(read_buf + FIL_PAGE_TYPE);

	/* We can trust the page type if page compression is set in the
	tablespace flags, because the page compression flag means the
	file must have been created with 10.1 (later than the 5.5 code
	base). In 10.1, page compressed tables do not contain a
	post-compression checksum, and the FIL_PAGE_END_LSN_OLD_CHKSUM
	field is not stored. Note that space can be null if we are in
	fil_check_first_page() and the first page is not compressed or
	encrypted. The page checksum is verified after decompression
	(i.e. normally pages are already decompressed at this stage). */
	if ((page_type == FIL_PAGE_PAGE_COMPRESSED ||
	     page_type == FIL_PAGE_PAGE_COMPRESSED_ENCRYPTED)
#ifndef UNIV_INNOCHECKSUM
	    && space && FSP_FLAGS_HAS_PAGE_COMPRESSION(space->flags)
#endif
	    ) {
		return(false);
	}

	if (!page_size.is_compressed()
	    && memcmp(read_buf + FIL_PAGE_LSN + 4,
		      read_buf + page_size.logical()
		      - FIL_PAGE_END_LSN_OLD_CHKSUM + 4, 4)) {

		/* Stored log sequence numbers at the start and the end
		of page do not match */
#ifndef UNIV_INNOCHECKSUM
		ib::info() << "Log sequence number at the start "
			<< mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
			<< " and the end "
			<< mach_read_from_4(read_buf + srv_page_size
					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)
			<< " do not match";
#endif /* UNIV_INNOCHECKSUM */
		return(true);
	}

#ifndef UNIV_INNOCHECKSUM
	if (check_lsn && recv_lsn_checks_on) {
		lsn_t		current_lsn;
		const lsn_t	page_lsn
			= mach_read_from_8(read_buf + FIL_PAGE_LSN);

		/* Since we are going to reset the page LSN during the import
		phase it makes no sense to spam the log with error messages. */

		if (log_peek_lsn(&current_lsn) && current_lsn < page_lsn) {

			const ulint	space_id = mach_read_from_4(
				read_buf + FIL_PAGE_SPACE_ID);
			const ulint	page_no = mach_read_from_4(
				read_buf + FIL_PAGE_OFFSET);

			ib::error() << "Page " << page_id_t(space_id, page_no)
				<< " log sequence number " << page_lsn
				<< " is in the future! Current system"
				<< " log sequence number "
				<< current_lsn << ".";

			ib::error() << "Your database may be corrupt or"
				" you may have copied the InnoDB"
				" tablespace but not the InnoDB"
				" log files. "
				<< FORCE_RECOVERY_MSG;

		}
	}
#endif /* !UNIV_INNOCHECKSUM */

	/* Check whether the checksum fields have correct values */

	if (srv_checksum_algorithm == SRV_CHECKSUM_ALGORITHM_NONE) {
		return(false);
	}

	if (page_size.is_compressed()) {
		return(!page_zip_verify_checksum(read_buf,
						 page_size.physical()));
	}

	checksum_field1 = mach_read_from_4(
		read_buf + FIL_PAGE_SPACE_OR_CHKSUM);

	checksum_field2 = mach_read_from_4(
		read_buf + page_size.logical() - FIL_PAGE_END_LSN_OLD_CHKSUM);

	compile_time_assert(!(FIL_PAGE_LSN % 8));

	/* declare empty pages non-corrupted */
	if (checksum_field1 == 0
	    && checksum_field2 == 0
	    && *reinterpret_cast<const ib_uint64_t*>(
		    read_buf + FIL_PAGE_LSN) == 0) {

		ulint	i;

		/* make sure that the page is really empty */
		for (i = 0; i < page_size.logical(); ++i) {

			/* The FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID has been
			repurposed for page compression. It can be
			set for uncompressed empty pages. */

			if ((i < FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION
			     || i >= FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID)
			    && read_buf[i] != 0) {

#ifndef UNIV_INNOCHECKSUM
				ib::info() << "Checksum fields zero but"
					" page is not empty.";
#endif

				break;
			}
		}
#ifdef UNIV_INNOCHECKSUM
		if (i >= page_size.logical()) {
			if (log_file) {
				fprintf(log_file, "Page::%llu"
					" is empty and uncorrupted\n",
					cur_page_num);
			}
			return(false);
		}
#else
		return(i < page_size.logical());
#endif /* UNIV_INNOCHECKSUM */
	}

#ifndef UNIV_INNOCHECKSUM
	const page_id_t	page_id(mach_read_from_4(
					read_buf + FIL_PAGE_SPACE_ID),
				mach_read_from_4(
					read_buf + FIL_PAGE_OFFSET));
#endif /* UNIV_INNOCHECKSUM */

	const srv_checksum_algorithm_t	curr_algo =
		static_cast<srv_checksum_algorithm_t>(srv_checksum_algorithm);

	bool	legacy_checksum_checked = false;

	switch (curr_algo) {
	case SRV_CHECKSUM_ALGORITHM_CRC32:
	case SRV_CHECKSUM_ALGORITHM_STRICT_CRC32:

		if (buf_page_is_checksum_valid_crc32(read_buf,
			checksum_field1, checksum_field2, false)) {
			return(false);
		}

		if (buf_page_is_checksum_valid_none(read_buf,
			checksum_field1, checksum_field2)) {
			if (curr_algo
			    == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
#ifndef UNIV_INNOCHECKSUM
				page_warn_strict_checksum(
					curr_algo,
					SRV_CHECKSUM_ALGORITHM_NONE,
					page_id);
#endif /* !UNIV_INNOCHECKSUM */
			}

#ifdef UNIV_INNOCHECKSUM
			if (log_file) {
				fprintf(log_file, "page::%llu;"
					" old style: calculated = %u;"
					" recorded = " ULINTPF ";\n",
					cur_page_num,
					buf_calc_page_old_checksum(read_buf),
					checksum_field2);
				fprintf(log_file, "page::%llu;"
					" new style: calculated = %u;"
					" crc32 = %u; recorded = " ULINTPF ";\n",
					cur_page_num,
					buf_calc_page_new_checksum(read_buf),
					buf_calc_page_crc32(read_buf),
					checksum_field1);
			}
#endif /* UNIV_INNOCHECKSUM */

			return(false);
		}

		/* We need to check whether the stored checksum matches the
		legacy big-endian checksum or the InnoDB checksum. We
		optimize the order based on earlier results: if we have
		previously found pages matching the legacy big-endian
		checksum, we try to match it first; otherwise we check
		the InnoDB checksum first. */
		if (legacy_big_endian_checksum) {
			if (buf_page_is_checksum_valid_crc32(read_buf,
				checksum_field1, checksum_field2, true)) {

				return(false);
			}
			legacy_checksum_checked = true;
		}

		if (buf_page_is_checksum_valid_innodb(read_buf,
			checksum_field1, checksum_field2)) {
			if (curr_algo
			    == SRV_CHECKSUM_ALGORITHM_STRICT_CRC32) {
#ifndef UNIV_INNOCHECKSUM
				page_warn_strict_checksum(
					curr_algo,
					SRV_CHECKSUM_ALGORITHM_INNODB,
					page_id);
#endif
			}

			return(false);
		}

		/* If legacy checksum is not checked, do it now. */
		if (!legacy_checksum_checked && buf_page_is_checksum_valid_crc32(
			read_buf, checksum_field1, checksum_field2, true)) {

			legacy_big_endian_checksum = true;
			return(false);
		}

#ifdef UNIV_INNOCHECKSUM
		if (log_file) {
			fprintf(log_file, "Fail; page::%llu;"
				" invalid (fails crc32 checksum)\n",
				cur_page_num);
		}
#endif /* UNIV_INNOCHECKSUM */
		return(true);

	case SRV_CHECKSUM_ALGORITHM_INNODB:
	case SRV_CHECKSUM_ALGORITHM_STRICT_INNODB:

		if (buf_page_is_checksum_valid_innodb(read_buf,
			checksum_field1, checksum_field2)) {
			return(false);
		}

		if (buf_page_is_checksum_valid_none(read_buf,
			checksum_field1, checksum_field2)) {
			if (curr_algo
			    == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB) {
#ifndef UNIV_INNOCHECKSUM
				page_warn_strict_checksum(
					curr_algo,
					SRV_CHECKSUM_ALGORITHM_NONE,
					page_id);
#endif
			}
#ifdef UNIV_INNOCHECKSUM
			if (log_file) {
				fprintf(log_file, "page::%llu;"
					" old style: calculated = %u;"
					" recorded = %zu;\n", cur_page_num,
					buf_calc_page_old_checksum(read_buf),
					checksum_field2);
				fprintf(log_file, "page::%llu;"
					" new style: calculated = %u;"
					" crc32 = %u; recorded = %zu;\n",
					cur_page_num,
					buf_calc_page_new_checksum(read_buf),
					buf_calc_page_crc32(read_buf),
					checksum_field1);
			}
#endif /* UNIV_INNOCHECKSUM */

			return(false);
		}

		if (buf_page_is_checksum_valid_crc32(read_buf,
			checksum_field1, checksum_field2, false)
		    || buf_page_is_checksum_valid_crc32(read_buf,
			checksum_field1, checksum_field2, true)) {

			if (curr_algo
			    == SRV_CHECKSUM_ALGORITHM_STRICT_INNODB) {
#ifndef UNIV_INNOCHECKSUM
				page_warn_strict_checksum(
					curr_algo,
					SRV_CHECKSUM_ALGORITHM_CRC32,
					page_id);
#endif
			}

			return(false);
		}

#ifdef UNIV_INNOCHECKSUM
		if (log_file) {
			fprintf(log_file, "Fail; page::%llu;"
				" invalid (fails innodb checksum)\n",
				cur_page_num);
		}
#endif /* UNIV_INNOCHECKSUM */

		return(true);

	case SRV_CHECKSUM_ALGORITHM_STRICT_NONE:

		if (buf_page_is_checksum_valid_none(read_buf,
			checksum_field1, checksum_field2)) {
			return(false);
		}

		if (buf_page_is_checksum_valid_crc32(read_buf,
			checksum_field1, checksum_field2, false)
		    || buf_page_is_checksum_valid_crc32(read_buf,
			checksum_field1, checksum_field2, true)) {
#ifndef UNIV_INNOCHECKSUM
			page_warn_strict_checksum(
				curr_algo,
				SRV_CHECKSUM_ALGORITHM_CRC32,
				page_id);
#endif /* !UNIV_INNOCHECKSUM */
			return(false);
		}

		if (buf_page_is_checksum_valid_innodb(read_buf,
			checksum_field1, checksum_field2)) {
#ifndef UNIV_INNOCHECKSUM
			page_warn_strict_checksum(
				curr_algo,
				SRV_CHECKSUM_ALGORITHM_INNODB,
				page_id);
#endif /* !UNIV_INNOCHECKSUM */
			return(false);
		}

#ifdef UNIV_INNOCHECKSUM
		if (log_file) {
			fprintf(log_file, "Fail; page::%llu;"
				" invalid (fails none checksum)\n",
				cur_page_num);
		}
#endif /* UNIV_INNOCHECKSUM */

		return(true);

	case SRV_CHECKSUM_ALGORITHM_NONE:
		/* should have returned false earlier */
		break;
	/* no default so the compiler will emit a warning if new enum
	is added and not handled here */
	}

	ut_error;
	return(false);
}

#ifndef UNIV_INNOCHECKSUM

#if defined(DBUG_OFF) && defined(HAVE_MADVISE) && defined(MADV_DODUMP)
/** Enable buffers to be dumped to core files.

A convenience function, not called anywhere directly, however
it is left available for gdb or any debugger to call
in the event that you want all of the memory to be dumped
to a core file.

Returns the number of errors found in madvise() calls. */
int
buf_madvise_do_dump()
{
	int		ret = 0;
	buf_pool_t*	buf_pool;
	buf_chunk_t*	chunk;

	/* mirrors allocation in log_t::create() */
	if (log_sys.buf) {
		ret += madvise(log_sys.first_in_use
			       ? log_sys.buf
			       : log_sys.buf - srv_log_buffer_size,
			       srv_log_buffer_size * 2,
			       MADV_DODUMP);
	}
	/* mirrors recv_sys_init() */
	if (recv_sys->buf) {
		ret += madvise(recv_sys->buf, recv_sys->len, MADV_DODUMP);
	}

	buf_pool_mutex_enter_all();

	for (ulong i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool = buf_pool_from_array(i);
		chunk = buf_pool->chunks;

		for (int n = buf_pool->n_chunks; n--; chunk++) {
			ret += madvise(chunk->mem, chunk->mem_size(),
				       MADV_DODUMP);
		}
	}

	buf_pool_mutex_exit_all();

	return ret;
}
#endif

/** Dump a page to stderr.
@param[in]	read_buf	database page
@param[in]	page_size	page size */
UNIV_INTERN
void
buf_page_print(const byte* read_buf, const page_size_t& page_size)
{
	dict_index_t*	index;

	ib::info() << "Page dump in ascii and hex ("
		<< page_size.physical() << " bytes):";

	ut_print_buf(stderr, read_buf, page_size.physical());
	fputs("\nInnoDB: End of page dump\n", stderr);

	if (page_size.is_compressed()) {
		/* Print compressed page. */
		ib::info() << "Compressed page type ("
			<< fil_page_get_type(read_buf)
			<< "); stored checksum in field1 "
			<< mach_read_from_4(
				read_buf + FIL_PAGE_SPACE_OR_CHKSUM)
			<< "; calculated checksums for field1: "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_CRC32)
			<< " "
			<< page_zip_calc_checksum(
				read_buf, page_size.physical(),
				SRV_CHECKSUM_ALGORITHM_CRC32)
			<< "/"
			<< page_zip_calc_checksum(
				read_buf, page_size.physical(),
				SRV_CHECKSUM_ALGORITHM_CRC32, true)
			<< ", "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_INNODB)
			<< " "
			<< page_zip_calc_checksum(
				read_buf, page_size.physical(),
				SRV_CHECKSUM_ALGORITHM_INNODB)
			<< ", "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_NONE)
			<< " "
			<< page_zip_calc_checksum(
				read_buf, page_size.physical(),
				SRV_CHECKSUM_ALGORITHM_NONE)
			<< "; page LSN "
			<< mach_read_from_8(read_buf + FIL_PAGE_LSN)
			<< "; page number (if stored to page"
			<< " already) "
			<< mach_read_from_4(read_buf + FIL_PAGE_OFFSET)
			<< "; space id (if stored to page already) "
			<< mach_read_from_4(
				read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);

	} else {
		const uint32_t	crc32 = buf_calc_page_crc32(read_buf);

		const uint32_t	crc32_legacy = buf_calc_page_crc32(read_buf,
								   true);
		ulint		page_type = fil_page_get_type(read_buf);

		ib::info() << "Uncompressed page, stored checksum in field1 "
			<< mach_read_from_4(
				read_buf + FIL_PAGE_SPACE_OR_CHKSUM)
			<< ", calculated checksums for field1: "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_CRC32) << " "
			<< crc32 << "/" << crc32_legacy
			<< ", "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_INNODB) << " "
			<< buf_calc_page_new_checksum(read_buf)
			<< ", "
			<< " page type " << page_type << " == "
			<< fil_get_page_type_name(page_type) << "."
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_NONE) << " "
			<< BUF_NO_CHECKSUM_MAGIC
			<< ", stored checksum in field2 "
			<< mach_read_from_4(read_buf + page_size.logical()
					    - FIL_PAGE_END_LSN_OLD_CHKSUM)
			<< ", calculated checksums for field2: "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_CRC32) << " "
			<< crc32 << "/" << crc32_legacy
			<< ", "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_INNODB) << " "
			<< buf_calc_page_old_checksum(read_buf)
			<< ", "
			<< buf_checksum_algorithm_name(
				SRV_CHECKSUM_ALGORITHM_NONE) << " "
			<< BUF_NO_CHECKSUM_MAGIC
			<< ", page LSN "
			<< mach_read_from_4(read_buf + FIL_PAGE_LSN)
			<< " "
			<< mach_read_from_4(read_buf + FIL_PAGE_LSN + 4)
			<< ", low 4 bytes of LSN at page end "
			<< mach_read_from_4(read_buf + page_size.logical()
					    - FIL_PAGE_END_LSN_OLD_CHKSUM + 4)
			<< ", page number (if stored to page already) "
			<< mach_read_from_4(read_buf + FIL_PAGE_OFFSET)
			<< ", space id (if created with >= MySQL-4.1.1"
			   " and stored already) "
			<< mach_read_from_4(
				read_buf + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
	}

	switch (fil_page_get_type(read_buf)) {
		index_id_t	index_id;
	case FIL_PAGE_INDEX:
	case FIL_PAGE_TYPE_INSTANT:
	case FIL_PAGE_RTREE:
		index_id = btr_page_get_index_id(read_buf);
		ib::info() << "Page may be an index page where"
			" index id is " << index_id;

		index = dict_index_find_on_id_low(index_id);
		if (index) {
			ib::info()
				<< "Index " << index_id
				<< " is " << index->name
				<< " in table " << index->table->name;
		}
		break;
	case FIL_PAGE_UNDO_LOG:
		fputs("InnoDB: Page may be an undo log page\n", stderr);
		break;
	case FIL_PAGE_INODE:
		fputs("InnoDB: Page may be an 'inode' page\n", stderr);
		break;
	case FIL_PAGE_IBUF_FREE_LIST:
		fputs("InnoDB: Page may be an insert buffer free list page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_ALLOCATED:
		fputs("InnoDB: Page may be a freshly allocated page\n",
		      stderr);
		break;
	case FIL_PAGE_IBUF_BITMAP:
		fputs("InnoDB: Page may be an insert buffer bitmap page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_SYS:
		fputs("InnoDB: Page may be a system page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_TRX_SYS:
		fputs("InnoDB: Page may be a transaction system page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_FSP_HDR:
		fputs("InnoDB: Page may be a file space header page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_XDES:
		fputs("InnoDB: Page may be an extent descriptor page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_BLOB:
		fputs("InnoDB: Page may be a BLOB page\n",
		      stderr);
		break;
	case FIL_PAGE_TYPE_ZBLOB:
	case FIL_PAGE_TYPE_ZBLOB2:
		fputs("InnoDB: Page may be a compressed BLOB page\n",
		      stderr);
		break;
	}
}

# ifdef PFS_GROUP_BUFFER_SYNC
extern mysql_pfs_key_t	buffer_block_mutex_key;

/********************************************************************//**
This function registers mutexes and rwlocks in buffer blocks with
performance schema. If PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER is
defined to be a value less than chunk->size, then only mutexes
and rwlocks in the first PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER
blocks are registered. */
static
void
pfs_register_buffer_block(
/*======================*/
	buf_chunk_t*	chunk)		/*!< in/out: chunk of buffers */
{
	buf_block_t*	block;
	ulint		num_to_register;

	block = chunk->blocks;

	num_to_register = ut_min(
		chunk->size, PFS_MAX_BUFFER_MUTEX_LOCK_REGISTER);

	for (ulint i = 0; i < num_to_register; i++) {
# ifdef UNIV_PFS_MUTEX
		BPageMutex*	mutex;

		mutex = &block->mutex;
		mutex->pfs_add(buffer_block_mutex_key);
# endif /* UNIV_PFS_MUTEX */

		rw_lock_t*	rwlock;

# ifdef UNIV_PFS_RWLOCK
		rwlock = &block->lock;
		ut_a(!rwlock->pfs_psi);
		rwlock->pfs_psi = (PSI_server)
			? PSI_server->init_rwlock(buf_block_lock_key, rwlock)
			: NULL;

#  ifdef UNIV_DEBUG
		rwlock = &block->debug_latch;
		ut_a(!rwlock->pfs_psi);
		rwlock->pfs_psi = (PSI_server)
			? PSI_server->init_rwlock(buf_block_debug_latch_key,
						  rwlock)
			: NULL;
#  endif /* UNIV_DEBUG */

# endif /* UNIV_PFS_RWLOCK */
		block++;
	}
}
# endif /* PFS_GROUP_BUFFER_SYNC */

/********************************************************************//**
Initializes a buffer control block when the buf_pool is created. */
static
void
buf_block_init(
/*===========*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	buf_block_t*	block,		/*!< in: pointer to control block */
	byte*		frame)		/*!< in: pointer to buffer frame */
{
	UNIV_MEM_DESC(frame, srv_page_size);

	/* This function should only be executed at database startup or by
	buf_pool_resize(). Either way, adaptive hash index must not exist. */
	assert_block_ahi_empty_on_init(block);

	block->frame = frame;

	block->page.buf_pool_index = buf_pool_index(buf_pool);
	block->page.flush_type = BUF_FLUSH_LRU;
	block->page.state = BUF_BLOCK_NOT_USED;
	block->page.buf_fix_count = 0;
	block->page.io_fix = BUF_IO_NONE;
	block->page.flush_observer = NULL;
	block->page.encrypted = false;
	block->page.real_size = 0;
	block->page.write_size = 0;
	block->modify_clock = 0;
	block->page.slot = NULL;

	ut_d(block->page.file_page_was_freed = FALSE);

#ifdef BTR_CUR_HASH_ADAPT
	block->index = NULL;
#endif /* BTR_CUR_HASH_ADAPT */
	block->skip_flush_check = false;

	ut_d(block->page.in_page_hash = FALSE);
	ut_d(block->page.in_zip_hash = FALSE);
	ut_d(block->page.in_flush_list = FALSE);
	ut_d(block->page.in_free_list = FALSE);
	ut_d(block->page.in_LRU_list = FALSE);
	ut_d(block->in_unzip_LRU_list = FALSE);
	ut_d(block->in_withdraw_list = FALSE);

	page_zip_des_init(&block->page.zip);

	mutex_create(LATCH_ID_BUF_BLOCK_MUTEX, &block->mutex);

#if defined PFS_SKIP_BUFFER_MUTEX_RWLOCK || defined PFS_GROUP_BUFFER_SYNC
	/* If PFS_SKIP_BUFFER_MUTEX_RWLOCK is defined, skip registration
	of buffer block rwlock with performance schema.

	If PFS_GROUP_BUFFER_SYNC is defined, skip the registration
	since buffer block rwlock will be registered later in
	pfs_register_buffer_block(). */

	rw_lock_create(PFS_NOT_INSTRUMENTED, &block->lock, SYNC_LEVEL_VARYING);

	ut_d(rw_lock_create(PFS_NOT_INSTRUMENTED, &block->debug_latch,
			    SYNC_LEVEL_VARYING));

#else /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */

	rw_lock_create(buf_block_lock_key, &block->lock, SYNC_LEVEL_VARYING);

	ut_d(rw_lock_create(buf_block_debug_latch_key,
			    &block->debug_latch, SYNC_LEVEL_VARYING));

#endif /* PFS_SKIP_BUFFER_MUTEX_RWLOCK || PFS_GROUP_BUFFER_SYNC */

	block->lock.is_block_lock = 1;

	ut_ad(rw_lock_validate(&(block->lock)));
}

/********************************************************************//**
Allocates a chunk of buffer frames.
@return chunk, or NULL on failure */
static
buf_chunk_t*
buf_chunk_init(
/*===========*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	buf_chunk_t*	chunk,		/*!< out: chunk of buffers */
	ulint		mem_size)	/*!< in: requested size in bytes */
{
	buf_block_t*	block;
	byte*		frame;
	ulint		i;

	/* Round down to a multiple of page size,
	although it already should be. */
	mem_size = ut_2pow_round(mem_size, ulint(srv_page_size));
	/* Reserve space for the block descriptors. */
	mem_size += ut_2pow_round((mem_size >> srv_page_size_shift)
				  * (sizeof *block)
				  + (srv_page_size - 1),
				  ulint(srv_page_size));
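
	/* An illustrative example with made-up numbers: with 16 KiB
	pages, a 128 MiB chunk covers 8192 frames, so the statement
	above reserves room for 8192 block descriptors, rounded up to
	a whole page; at a few hundred bytes per descriptor this adds
	a few MiB on top of the requested size. */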

	DBUG_EXECUTE_IF("ib_buf_chunk_init_fails", return(NULL););

	chunk->mem = buf_pool->allocator.allocate_large(mem_size,
							&chunk->mem_pfx, true);

	if (UNIV_UNLIKELY(chunk->mem == NULL)) {

		return(NULL);
	}

#ifdef HAVE_LIBNUMA
	if (srv_numa_interleave) {
		struct bitmask *numa_mems_allowed = numa_get_mems_allowed();
		int	st = mbind(chunk->mem, chunk->mem_size(),
				   MPOL_INTERLEAVE,
				   numa_mems_allowed->maskp,
				   numa_mems_allowed->size,
				   MPOL_MF_MOVE);
		if (st != 0) {
			ib::warn() << "Failed to set NUMA memory policy of"
				" buffer pool page frames to MPOL_INTERLEAVE"
				" (error: " << strerror(errno) << ").";
		}
	}
#endif /* HAVE_LIBNUMA */

	/* Allocate the block descriptors from
	the start of the memory block. */
	chunk->blocks = (buf_block_t*) chunk->mem;

	/* Align a pointer to the first frame. Note that when
	os_large_page_size is smaller than srv_page_size,
	we may allocate one fewer block than requested. When
	it is bigger, we may allocate more blocks than requested. */

	frame = (byte*) ut_align(chunk->mem, srv_page_size);
	chunk->size = (chunk->mem_pfx.m_size >> srv_page_size_shift)
		- (frame != chunk->mem);

	/* Subtract the space needed for block descriptors. */
	{
		ulint	size = chunk->size;

		while (frame < (byte*) (chunk->blocks + size)) {
			frame += srv_page_size;
			size--;
		}

		chunk->size = size;
	}

	/* Init block structs and assign frames for them. Then we
	assign the frames to the first blocks (we already mapped the
	memory above). */

	block = chunk->blocks;

	for (i = chunk->size; i--; ) {

		buf_block_init(buf_pool, block, frame);
		UNIV_MEM_INVALID(block->frame, srv_page_size);

		/* Add the block to the free list */
		UT_LIST_ADD_LAST(buf_pool->free, &block->page);

		ut_d(block->page.in_free_list = TRUE);
		ut_ad(buf_pool_from_block(block) == buf_pool);

		block++;
		frame += srv_page_size;
	}

	buf_pool_register_chunk(chunk);

#ifdef PFS_GROUP_BUFFER_SYNC
	pfs_register_buffer_block(chunk);
#endif /* PFS_GROUP_BUFFER_SYNC */
	return(chunk);
}

#ifdef UNIV_DEBUG
/*********************************************************************//**
Finds a block in the given buffer chunk that points to a
given compressed page.
@return buffer block pointing to the compressed page, or NULL */
static
buf_block_t*
buf_chunk_contains_zip(
/*===================*/
	buf_chunk_t*	chunk,	/*!< in: chunk being checked */
	const void*	data)	/*!< in: pointer to compressed page */
{
	buf_block_t*	block;
	ulint		i;

	block = chunk->blocks;

	for (i = chunk->size; i--; block++) {
		if (block->page.zip.data == data) {

			return(block);
		}
	}

	return(NULL);
}

/*********************************************************************//**
Finds a block in the buffer pool that points to a
given compressed page.
@return buffer block pointing to the compressed page, or NULL */
buf_block_t*
buf_pool_contains_zip(
/*==================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	const void*	data)		/*!< in: pointer to compressed page */
{
	ulint		n;
	buf_chunk_t*	chunk = buf_pool->chunks;

	ut_ad(buf_pool);
	ut_ad(buf_pool_mutex_own(buf_pool));
	for (n = buf_pool->n_chunks; n--; chunk++) {

		buf_block_t*	block = buf_chunk_contains_zip(chunk, data);

		if (block) {
			return(block);
		}
	}

	return(NULL);
}
#endif /* UNIV_DEBUG */

/*********************************************************************//**
Checks that all file pages in the buffer chunk are in a replaceable state.
@return address of a non-free block, or NULL if all freed */
static
const buf_block_t*
buf_chunk_not_freed(
/*================*/
	buf_chunk_t*	chunk)	/*!< in: chunk being checked */
{
	buf_block_t*	block;
	ulint		i;

	block = chunk->blocks;

	for (i = chunk->size; i--; block++) {
		ibool	ready;

		switch (buf_block_get_state(block)) {
		case BUF_BLOCK_POOL_WATCH:
		case BUF_BLOCK_ZIP_PAGE:
		case BUF_BLOCK_ZIP_DIRTY:
			/* The uncompressed buffer pool should never
			contain compressed block descriptors. */
			ut_error;
			break;
		case BUF_BLOCK_NOT_USED:
		case BUF_BLOCK_READY_FOR_USE:
		case BUF_BLOCK_MEMORY:
		case BUF_BLOCK_REMOVE_HASH:
			/* Skip blocks that are not being used for
			file pages. */
			break;
		case BUF_BLOCK_FILE_PAGE:
			if (srv_read_only_mode) {
				/* The page cleaner is disabled in
				read-only mode. No pages can be
				dirtied, so all of them must be clean. */
				ut_ad(block->page.oldest_modification
				      == block->page.newest_modification);
				ut_ad(block->page.oldest_modification == 0
				      || block->page.oldest_modification
				      == recv_sys->recovered_lsn
				      || srv_force_recovery
				      == SRV_FORCE_NO_LOG_REDO);
				ut_ad(block->page.buf_fix_count == 0);
				ut_ad(block->page.io_fix == BUF_IO_NONE);
				break;
			}

			buf_page_mutex_enter(block);
			ready = buf_flush_ready_for_replace(&block->page);
			buf_page_mutex_exit(block);

			if (!ready) {
				return(block);
			}

			break;
		}
	}

	return(NULL);
}

/********************************************************************//**
Set buffer pool size variables after resizing it */
static
void
buf_pool_set_sizes(void)
/*====================*/
{
	ulint	i;
	ulint	curr_size = 0;

	buf_pool_mutex_enter_all();

	for (i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool_t*	buf_pool;

		buf_pool = buf_pool_from_array(i);
		curr_size += buf_pool->curr_pool_size;
	}

	srv_buf_pool_curr_size = curr_size;
	srv_buf_pool_old_size = srv_buf_pool_size;
	srv_buf_pool_base_size = srv_buf_pool_size;

	buf_pool_mutex_exit_all();
}
1778
1779/********************************************************************//**
1780Initialize a buffer pool instance.
1781@return DB_SUCCESS if all goes well. */
1782static
1783ulint
1784buf_pool_init_instance(
1785/*===================*/
1786 buf_pool_t* buf_pool, /*!< in: buffer pool instance */
1787 ulint buf_pool_size, /*!< in: size in bytes */
1788 ulint instance_no) /*!< in: id of the instance */
1789{
1790 ulint i;
1791 ulint chunk_size;
1792 buf_chunk_t* chunk;
1793
1794 ut_ad(buf_pool_size % srv_buf_pool_chunk_unit == 0);
1795
1796 /* 1. Initialize general fields
1797 ------------------------------- */
1798 mutex_create(LATCH_ID_BUF_POOL, &buf_pool->mutex);
1799
1800 mutex_create(LATCH_ID_BUF_POOL_ZIP, &buf_pool->zip_mutex);
1801
1802 new(&buf_pool->allocator)
1803 ut_allocator<unsigned char>(mem_key_buf_buf_pool);
1804
1805 buf_pool_mutex_enter(buf_pool);
1806
1807 if (buf_pool_size > 0) {
1808 buf_pool->n_chunks
1809 = buf_pool_size / srv_buf_pool_chunk_unit;
1810 chunk_size = srv_buf_pool_chunk_unit;
1811
1812 buf_pool->chunks =
1813 reinterpret_cast<buf_chunk_t*>(ut_zalloc_nokey(
1814 buf_pool->n_chunks * sizeof(*chunk)));
1815 buf_pool->chunks_old = NULL;
1816
1817 UT_LIST_INIT(buf_pool->LRU, &buf_page_t::LRU);
1818 UT_LIST_INIT(buf_pool->free, &buf_page_t::list);
1819 UT_LIST_INIT(buf_pool->withdraw, &buf_page_t::list);
1820 buf_pool->withdraw_target = 0;
1821 UT_LIST_INIT(buf_pool->flush_list, &buf_page_t::list);
1822 UT_LIST_INIT(buf_pool->unzip_LRU, &buf_block_t::unzip_LRU);
1823
1824#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
1825 UT_LIST_INIT(buf_pool->zip_clean, &buf_page_t::list);
1826#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
1827
1828 for (i = 0; i < UT_ARR_SIZE(buf_pool->zip_free); ++i) {
1829 UT_LIST_INIT(
1830 buf_pool->zip_free[i], &buf_buddy_free_t::list);
1831 }
1832
1833 buf_pool->curr_size = 0;
1834 chunk = buf_pool->chunks;
1835
1836 do {
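			/* Allocate the chunks one at a time. If an
			allocation fails, undo everything allocated so
			far: free each block's mutex, rw-lock and debug
			latch, release the chunk memory, and return
			DB_ERROR. */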
			if (!buf_chunk_init(buf_pool, chunk, chunk_size)) {
				while (--chunk >= buf_pool->chunks) {
					buf_block_t*	block = chunk->blocks;

					for (i = chunk->size; i--; block++) {
						mutex_free(&block->mutex);
						rw_lock_free(&block->lock);

						ut_d(rw_lock_free(
							&block->debug_latch));
					}

					buf_pool->allocator.deallocate_large(
						chunk->mem, &chunk->mem_pfx,
						true);
				}
				ut_free(buf_pool->chunks);
				buf_pool_mutex_exit(buf_pool);

				return(DB_ERROR);
			}

			buf_pool->curr_size += chunk->size;
		} while (++chunk < buf_pool->chunks + buf_pool->n_chunks);

		buf_pool->instance_no = instance_no;
		buf_pool->read_ahead_area =
			ut_min(BUF_READ_AHEAD_PAGES,
			       ut_2_power_up(buf_pool->curr_size /
					     BUF_READ_AHEAD_PORTION));
		buf_pool->curr_pool_size = buf_pool->curr_size
			<< srv_page_size_shift;

		buf_pool->old_size = buf_pool->curr_size;
		buf_pool->n_chunks_new = buf_pool->n_chunks;

		/* Number of locks protecting page_hash must be a
		power of two */
		srv_n_page_hash_locks = static_cast<ulong>(
			ut_2_power_up(srv_n_page_hash_locks));
		ut_a(srv_n_page_hash_locks != 0);
		ut_a(srv_n_page_hash_locks <= MAX_PAGE_HASH_LOCKS);

		buf_pool->page_hash = ib_create(
			2 * buf_pool->curr_size,
			LATCH_ID_HASH_TABLE_RW_LOCK,
			srv_n_page_hash_locks, MEM_HEAP_FOR_PAGE_HASH);

		buf_pool->page_hash_old = NULL;

		buf_pool->zip_hash = hash_create(2 * buf_pool->curr_size);

		buf_pool->last_printout_time = ut_time();
	}
	/* 2. Initialize flushing fields
	-------------------------------- */

	mutex_create(LATCH_ID_FLUSH_LIST, &buf_pool->flush_list_mutex);

	for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
		buf_pool->no_flush[i] = os_event_create(0);
	}

	buf_pool->watch = (buf_page_t*) ut_zalloc_nokey(
		sizeof(*buf_pool->watch) * BUF_POOL_WATCH_SIZE);
	for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
		buf_pool->watch[i].buf_pool_index
			= unsigned(buf_pool->instance_no);
	}

	/* All fields are initialized by ut_zalloc_nokey(). */

	buf_pool->try_LRU_scan = TRUE;

	/* Initialize the hazard pointer for flush_list batches */
	new(&buf_pool->flush_hp)
		FlushHp(buf_pool, &buf_pool->flush_list_mutex);

	/* Initialize the hazard pointer for LRU batches */
	new(&buf_pool->lru_hp) LRUHp(buf_pool, &buf_pool->mutex);

	/* Initialize the iterator for LRU scan search */
	new(&buf_pool->lru_scan_itr) LRUItr(buf_pool, &buf_pool->mutex);

	/* Initialize the iterator for single page scan search */
	new(&buf_pool->single_scan_itr) LRUItr(buf_pool, &buf_pool->mutex);

	/* Initialize the temporary memory array and slots */
	buf_pool->tmp_arr = (buf_tmp_array_t*) ut_malloc_nokey(
		sizeof(buf_tmp_array_t));
	memset(buf_pool->tmp_arr, 0, sizeof(buf_tmp_array_t));
	ulint n_slots = (srv_n_read_io_threads + srv_n_write_io_threads)
		* (8 * OS_AIO_N_PENDING_IOS_PER_THREAD);
	buf_pool->tmp_arr->n_slots = n_slots;
	buf_pool->tmp_arr->slots = (buf_tmp_buffer_t*) ut_malloc_nokey(
		sizeof(buf_tmp_buffer_t) * n_slots);
	memset(buf_pool->tmp_arr->slots, 0,
	       sizeof(buf_tmp_buffer_t) * n_slots);
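
	/* The slot count gives each read and write I/O thread room for
	8 * OS_AIO_N_PENDING_IOS_PER_THREAD buffers, so a free slot
	should always be available for the encryption or page
	compression of a pending I/O. */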

	buf_pool_mutex_exit(buf_pool);

	DBUG_EXECUTE_IF("buf_pool_init_instance_force_oom",
		return(DB_ERROR); );

	return(DB_SUCCESS);
}

/********************************************************************//**
Free one buffer pool instance. */
static
void
buf_pool_free_instance(
/*===================*/
	buf_pool_t*	buf_pool)	/*!< in, own: buffer pool instance
					to free */
{
	buf_chunk_t*	chunk;
	buf_chunk_t*	chunks;
	buf_page_t*	bpage;
	buf_page_t*	prev_bpage = 0;

	mutex_free(&buf_pool->mutex);
	mutex_free(&buf_pool->zip_mutex);
	mutex_free(&buf_pool->flush_list_mutex);

	if (buf_pool->flush_rbt) {
		rbt_free(buf_pool->flush_rbt);
		buf_pool->flush_rbt = NULL;
	}
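
	/* Walk the LRU list and free the descriptors of compressed-only
	pages; the buf_block_t descriptors are freed together with the
	chunks below. */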

	for (bpage = UT_LIST_GET_LAST(buf_pool->LRU);
	     bpage != NULL;
	     bpage = prev_bpage) {

		prev_bpage = UT_LIST_GET_PREV(LRU, bpage);
		buf_page_state	state = buf_page_get_state(bpage);

		ut_ad(buf_page_in_file(bpage));
		ut_ad(bpage->in_LRU_list);

		if (state != BUF_BLOCK_FILE_PAGE) {
			/* We must not have any dirty block except
			when doing a fast shutdown. */
			ut_ad(state == BUF_BLOCK_ZIP_PAGE
			      || srv_fast_shutdown == 2);
			buf_page_free_descriptor(bpage);
		}
	}

	ut_free(buf_pool->watch);
	buf_pool->watch = NULL;

	chunks = buf_pool->chunks;
	chunk = chunks + buf_pool->n_chunks;

	while (--chunk >= chunks) {
		buf_block_t*	block = chunk->blocks;

		for (ulint i = chunk->size; i--; block++) {
			mutex_free(&block->mutex);
			rw_lock_free(&block->lock);

			ut_d(rw_lock_free(&block->debug_latch));
		}

		buf_pool->allocator.deallocate_large(
			chunk->mem, &chunk->mem_pfx, true);
	}

	for (ulint i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; ++i) {
		os_event_destroy(buf_pool->no_flush[i]);
	}

	ut_free(buf_pool->chunks);
	ha_clear(buf_pool->page_hash);
	hash_table_free(buf_pool->page_hash);
	hash_table_free(buf_pool->zip_hash);

	/* Free all used temporary slots */
	if (buf_pool->tmp_arr) {
		for (ulint i = 0; i < buf_pool->tmp_arr->n_slots; i++) {
			buf_tmp_buffer_t* slot = &(buf_pool->tmp_arr->slots[i]);
			if (slot && slot->crypt_buf) {
				aligned_free(slot->crypt_buf);
				slot->crypt_buf = NULL;
			}

			if (slot && slot->comp_buf) {
				aligned_free(slot->comp_buf);
				slot->comp_buf = NULL;
			}
		}

		ut_free(buf_pool->tmp_arr->slots);
		ut_free(buf_pool->tmp_arr);
		buf_pool->tmp_arr = NULL;
	}

	buf_pool->allocator.~ut_allocator();
}

/********************************************************************//**
Creates the buffer pool.
@return DB_SUCCESS on success, DB_ERROR if out of memory or on failure */
dberr_t
buf_pool_init(
/*==========*/
	ulint	total_size,	/*!< in: size of the total pool in bytes */
	ulint	n_instances)	/*!< in: number of instances */
{
	ulint		i;
	const ulint	size	= total_size / n_instances;

	ut_ad(n_instances > 0);
	ut_ad(n_instances <= MAX_BUFFER_POOLS);
	ut_ad(n_instances == srv_buf_pool_instances);

	NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;

	buf_pool_resizing = false;
	buf_pool_withdrawing = false;
	buf_withdraw_clock = 0;

	buf_pool_ptr = (buf_pool_t*) ut_zalloc_nokey(
		n_instances * sizeof *buf_pool_ptr);

	buf_chunk_map_reg = UT_NEW_NOKEY(buf_pool_chunk_map_t());
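
	/* buf_chunk_map_reg maps frame addresses to their chunks;
	buf_chunk_map_ref is the copy that lookups use, and it is
	republished once resizing has completed. */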

	for (i = 0; i < n_instances; i++) {
		buf_pool_t*	ptr	= &buf_pool_ptr[i];

		if (buf_pool_init_instance(ptr, size, i) != DB_SUCCESS) {

			/* Free all the instances created so far. */
			buf_pool_free(i);

			return(DB_ERROR);
		}
	}

	buf_chunk_map_ref = buf_chunk_map_reg;

	buf_pool_set_sizes();
	buf_LRU_old_ratio_update(100 * 3 / 8, FALSE);

	btr_search_sys_create(buf_pool_get_curr_size() / sizeof(void*) / 64);

	return(DB_SUCCESS);
}

/********************************************************************//**
Frees the buffer pool at shutdown. This must not be invoked before
freeing all mutexes. */
void
buf_pool_free(
/*==========*/
	ulint	n_instances)	/*!< in: number of instances to free */
{
	for (ulint i = 0; i < n_instances; i++) {
		buf_pool_free_instance(buf_pool_from_array(i));
	}

	UT_DELETE(buf_chunk_map_reg);
	buf_chunk_map_reg = buf_chunk_map_ref = NULL;

	ut_free(buf_pool_ptr);
	buf_pool_ptr = NULL;
}

/** Reallocate a control block.
@param[in]	buf_pool	buffer pool instance
@param[in]	block		pointer to control block
@retval true if a free block was available and relocation was attempted
@retval false if failed because of no free blocks */
static
bool
buf_page_realloc(
	buf_pool_t*	buf_pool,
	buf_block_t*	block)
{
	buf_block_t*	new_block;

	ut_ad(buf_pool_withdrawing);
	ut_ad(buf_pool_mutex_own(buf_pool));
	ut_ad(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

	new_block = buf_LRU_get_free_only(buf_pool);

	if (new_block == NULL) {
		return(false); /* free_list was not enough */
	}

	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, block->page.id);

	rw_lock_x_lock(hash_lock);
	mutex_enter(&block->mutex);

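	/* Relocation is possible only if nobody has buffer-fixed the
	block and no I/O is pending on it; buf_page_can_relocate()
	checks this while we hold the page_hash X-latch and the
	block mutex. */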
	if (buf_page_can_relocate(&block->page)) {
		mutex_enter(&new_block->mutex);

		memcpy(new_block->frame, block->frame, srv_page_size);
		memcpy(&new_block->page, &block->page, sizeof block->page);

		/* relocate LRU list */
		ut_ad(block->page.in_LRU_list);
		ut_ad(!block->page.in_zip_hash);
		ut_d(block->page.in_LRU_list = FALSE);

		buf_LRU_adjust_hp(buf_pool, &block->page);

		buf_page_t*	prev_b = UT_LIST_GET_PREV(LRU, &block->page);
		UT_LIST_REMOVE(buf_pool->LRU, &block->page);

		if (prev_b != NULL) {
			UT_LIST_INSERT_AFTER(buf_pool->LRU, prev_b, &new_block->page);
		} else {
			UT_LIST_ADD_FIRST(buf_pool->LRU, &new_block->page);
		}

		if (buf_pool->LRU_old == &block->page) {
			buf_pool->LRU_old = &new_block->page;
		}

		ut_ad(new_block->page.in_LRU_list);

		/* relocate unzip_LRU list */
		if (block->page.zip.data != NULL) {
			ut_ad(block->in_unzip_LRU_list);
			ut_d(new_block->in_unzip_LRU_list = TRUE);
			UNIV_MEM_DESC(&new_block->page.zip.data,
				      page_zip_get_size(&new_block->page.zip));

			buf_block_t*	prev_block = UT_LIST_GET_PREV(unzip_LRU, block);
			UT_LIST_REMOVE(buf_pool->unzip_LRU, block);

			ut_d(block->in_unzip_LRU_list = FALSE);
			block->page.zip.data = NULL;
			page_zip_set_size(&block->page.zip, 0);

			if (prev_block != NULL) {
				UT_LIST_INSERT_AFTER(buf_pool->unzip_LRU, prev_block, new_block);
			} else {
				UT_LIST_ADD_FIRST(buf_pool->unzip_LRU, new_block);
			}
		} else {
			ut_ad(!block->in_unzip_LRU_list);
			ut_d(new_block->in_unzip_LRU_list = FALSE);
		}

		/* relocate buf_pool->page_hash */
		ut_ad(block->page.in_page_hash);
		ut_ad(&block->page == buf_page_hash_get_low(buf_pool,
							    block->page.id));
		ut_d(block->page.in_page_hash = FALSE);
		ulint	fold = block->page.id.fold();
		ut_ad(fold == new_block->page.id.fold());
		HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, (&block->page));
		HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, (&new_block->page));

		ut_ad(new_block->page.in_page_hash);

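		/* Invalidate the old copy of the page: bump its modify
		clock and poison the page number and space id fields in
		the old frame so that any stale access is detectable. */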
		buf_block_modify_clock_inc(block);
		memset(block->frame + FIL_PAGE_OFFSET, 0xff, 4);
		memset(block->frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID, 0xff, 4);
		UNIV_MEM_INVALID(block->frame, srv_page_size);
		buf_block_set_state(block, BUF_BLOCK_REMOVE_HASH);
		block->page.id.reset();

		/* Relocate buf_pool->flush_list. */
		if (block->page.oldest_modification) {
			buf_flush_relocate_on_flush_list(
				&block->page, &new_block->page);
		}

		/* set other flags of buf_block_t */

#ifdef BTR_CUR_HASH_ADAPT
		/* This code should only be executed by buf_pool_resize(),
		while the adaptive hash index is disabled. */
		assert_block_ahi_empty(block);
		assert_block_ahi_empty_on_init(new_block);
		ut_ad(!block->index);
		new_block->index	= NULL;
		new_block->n_hash_helps	= 0;
		new_block->n_fields	= 1;
		new_block->left_side	= TRUE;
#endif /* BTR_CUR_HASH_ADAPT */

		new_block->lock_hash_val = block->lock_hash_val;
		ut_ad(new_block->lock_hash_val == lock_rec_hash(
			new_block->page.id.space(),
			new_block->page.id.page_no()));

		rw_lock_x_unlock(hash_lock);
		mutex_exit(&new_block->mutex);

		/* free block */
		buf_block_set_state(block, BUF_BLOCK_MEMORY);
		buf_LRU_block_free_non_file_page(block);

		mutex_exit(&block->mutex);
	} else {
		rw_lock_x_unlock(hash_lock);
		mutex_exit(&block->mutex);

		/* free new_block */
		mutex_enter(&new_block->mutex);
		buf_LRU_block_free_non_file_page(new_block);
		mutex_exit(&new_block->mutex);
	}

	return(true); /* free_list was enough */
}

/** Sets the global variable that feeds MySQL's innodb_buffer_pool_resize_status
to the specified string. The format and the following parameters are the
same as the ones used for printf(3).
@param[in]	fmt	format
@param[in]	...	extra parameters according to fmt */
static
void
buf_resize_status(
	const char*	fmt,
	...)
{
	va_list	ap;

	va_start(ap, fmt);

	vsnprintf(
		export_vars.innodb_buffer_pool_resize_status,
		sizeof(export_vars.innodb_buffer_pool_resize_status),
		fmt, ap);

	va_end(ap);

	ib::info() << export_vars.innodb_buffer_pool_resize_status;
}

/** Determines if a block is intended to be withdrawn.
@param[in]	buf_pool	buffer pool instance
@param[in]	block		pointer to control block
@retval true if the block will be withdrawn */
bool
buf_block_will_withdrawn(
	buf_pool_t*		buf_pool,
	const buf_block_t*	block)
{
	ut_ad(buf_pool->curr_size < buf_pool->old_size);
	ut_ad(!buf_pool_resizing || buf_pool_mutex_own(buf_pool));

	const buf_chunk_t*	chunk
		= buf_pool->chunks + buf_pool->n_chunks_new;
	const buf_chunk_t*	echunk
		= buf_pool->chunks + buf_pool->n_chunks;

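	/* The chunks at indexes [n_chunks_new, n_chunks) form the area
	being withdrawn; check whether the block lies in any of them. */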
	while (chunk < echunk) {
		if (block >= chunk->blocks
		    && block < chunk->blocks + chunk->size) {
			return(true);
		}
		++chunk;
	}

	return(false);
}

/** Determines if a frame is intended to be withdrawn.
@param[in]	buf_pool	buffer pool instance
@param[in]	ptr		pointer to a frame
@retval true if the frame will be withdrawn */
bool
buf_frame_will_withdrawn(
	buf_pool_t*	buf_pool,
	const byte*	ptr)
{
	ut_ad(buf_pool->curr_size < buf_pool->old_size);
	ut_ad(!buf_pool_resizing || buf_pool_mutex_own(buf_pool));

	const buf_chunk_t*	chunk
		= buf_pool->chunks + buf_pool->n_chunks_new;
	const buf_chunk_t*	echunk
		= buf_pool->chunks + buf_pool->n_chunks;

	while (chunk < echunk) {
		if (ptr >= chunk->blocks->frame
		    && ptr < (chunk->blocks + chunk->size - 1)->frame
		    + srv_page_size) {
			return(true);
		}
		++chunk;
	}

	return(false);
}

/** Withdraw blocks from the end of the buffer pool instance until
buf_pool->withdraw_target blocks have been withdrawn.
@param[in]	buf_pool	buffer pool instance
@retval true if retry is needed */
static
bool
buf_pool_withdraw_blocks(
	buf_pool_t*	buf_pool)
{
	buf_block_t*	block;
	ulint		loop_count = 0;
	ulint		i = buf_pool_index(buf_pool);

	ib::info() << "buffer pool " << i
		<< " : start to withdraw the last "
		<< buf_pool->withdraw_target << " blocks.";

	/* Minimize buf_pool->zip_free[i] lists */
	buf_pool_mutex_enter(buf_pool);
	buf_buddy_condense_free(buf_pool);
	buf_pool_mutex_exit(buf_pool);

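	/* Each iteration of the loop below does three things:
	(1) move free blocks that reside in the withdrawn area to the
	withdraw list, (2) flush LRU pages to replenish the free list,
	and (3) relocate in-use blocks and compressed pages out of the
	withdrawn area. After 10 iterations we give up and let the
	caller retry once the user threads have been paused. */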
	while (UT_LIST_GET_LEN(buf_pool->withdraw)
	       < buf_pool->withdraw_target) {

		/* try to withdraw from free_list */
		ulint	count1 = 0;

		buf_pool_mutex_enter(buf_pool);
		block = reinterpret_cast<buf_block_t*>(
			UT_LIST_GET_FIRST(buf_pool->free));
		while (block != NULL
		       && UT_LIST_GET_LEN(buf_pool->withdraw)
		       < buf_pool->withdraw_target) {
			ut_ad(block->page.in_free_list);
			ut_ad(!block->page.in_flush_list);
			ut_ad(!block->page.in_LRU_list);
			ut_a(!buf_page_in_file(&block->page));

			buf_block_t*	next_block;
			next_block = reinterpret_cast<buf_block_t*>(
				UT_LIST_GET_NEXT(
					list, &block->page));

			if (buf_block_will_withdrawn(buf_pool, block)) {
				/* This should be withdrawn */
				UT_LIST_REMOVE(
					buf_pool->free,
					&block->page);
				UT_LIST_ADD_LAST(
					buf_pool->withdraw,
					&block->page);
				ut_d(block->in_withdraw_list = TRUE);
				count1++;
			}

			block = next_block;
		}
		buf_pool_mutex_exit(buf_pool);

		/* reserve free_list length by flushing LRU pages */
		if (UT_LIST_GET_LEN(buf_pool->withdraw)
		    < buf_pool->withdraw_target) {
			ulint		scan_depth;
			flush_counters_t n;

			/* cap scan_depth with current LRU size. */
			buf_pool_mutex_enter(buf_pool);
			scan_depth = UT_LIST_GET_LEN(buf_pool->LRU);
			buf_pool_mutex_exit(buf_pool);

			scan_depth = ut_min(
				ut_max(buf_pool->withdraw_target
				       - UT_LIST_GET_LEN(buf_pool->withdraw),
				       static_cast<ulint>(srv_LRU_scan_depth)),
				scan_depth);

			buf_flush_do_batch(buf_pool, BUF_FLUSH_LRU,
				scan_depth, 0, &n);
			buf_flush_wait_batch_end(buf_pool, BUF_FLUSH_LRU);

			if (n.flushed) {
				MONITOR_INC_VALUE_CUMULATIVE(
					MONITOR_LRU_BATCH_FLUSH_TOTAL_PAGE,
					MONITOR_LRU_BATCH_FLUSH_COUNT,
					MONITOR_LRU_BATCH_FLUSH_PAGES,
					n.flushed);
			}
		}

		/* relocate blocks/buddies in withdrawn area */
		ulint	count2 = 0;

		buf_pool_mutex_enter(buf_pool);
		buf_page_t*	bpage;
		bpage = UT_LIST_GET_FIRST(buf_pool->LRU);
		while (bpage != NULL) {
			BPageMutex*	block_mutex;
			buf_page_t*	next_bpage;

			block_mutex = buf_page_get_mutex(bpage);
			mutex_enter(block_mutex);

			next_bpage = UT_LIST_GET_NEXT(LRU, bpage);

			if (bpage->zip.data != NULL
			    && buf_frame_will_withdrawn(
				    buf_pool,
				    static_cast<byte*>(bpage->zip.data))) {

				if (buf_page_can_relocate(bpage)) {
					mutex_exit(block_mutex);
					buf_pool_mutex_exit_forbid(buf_pool);
					if (!buf_buddy_realloc(
						    buf_pool, bpage->zip.data,
						    page_zip_get_size(
							    &bpage->zip))) {

						/* failed to allocate block */
						buf_pool_mutex_exit_allow(
							buf_pool);
						break;
					}
					buf_pool_mutex_exit_allow(buf_pool);
					mutex_enter(block_mutex);
					count2++;
				}
				/* NOTE: if the page is in use,
				it is not relocated yet */
			}

			if (buf_page_get_state(bpage)
			    == BUF_BLOCK_FILE_PAGE
			    && buf_block_will_withdrawn(
				    buf_pool,
				    reinterpret_cast<buf_block_t*>(bpage))) {

				if (buf_page_can_relocate(bpage)) {
					mutex_exit(block_mutex);
					buf_pool_mutex_exit_forbid(buf_pool);
					if (!buf_page_realloc(
						    buf_pool,
						    reinterpret_cast<buf_block_t*>(
							    bpage))) {
						/* failed to allocate block */
						buf_pool_mutex_exit_allow(
							buf_pool);
						break;
					}
					buf_pool_mutex_exit_allow(buf_pool);
					count2++;
				} else {
					mutex_exit(block_mutex);
				}
				/* NOTE: if the page is in use,
				it is not relocated yet */
			} else {
				mutex_exit(block_mutex);
			}

			bpage = next_bpage;
		}
		buf_pool_mutex_exit(buf_pool);

		buf_resize_status(
			"buffer pool %lu : withdrawing blocks. (%lu/%lu)",
			i, UT_LIST_GET_LEN(buf_pool->withdraw),
			buf_pool->withdraw_target);

		ib::info() << "buffer pool " << i << " : withdrew "
			<< count1 << " blocks from free list."
			<< " Tried to relocate " << count2 << " pages ("
			<< UT_LIST_GET_LEN(buf_pool->withdraw) << "/"
			<< buf_pool->withdraw_target << ").";

		if (++loop_count >= 10) {
			/* Give up for now; this will be retried
			after the user threads have been paused. */

			ib::info() << "buffer pool " << i
				<< " : will retry to withdraw later.";

			/* need retry later */
			return(true);
		}
	}

	/* confirm withdrawn enough */
	const buf_chunk_t*	chunk
		= buf_pool->chunks + buf_pool->n_chunks_new;
	const buf_chunk_t*	echunk
		= buf_pool->chunks + buf_pool->n_chunks;

	while (chunk < echunk) {
		block = chunk->blocks;
		for (ulint j = chunk->size; j--; block++) {
			/* If a block in the withdrawn area is not in
			the BUF_BLOCK_NOT_USED state, something is
			corrupted. */
			ut_a(buf_block_get_state(block)
			     == BUF_BLOCK_NOT_USED);
			ut_ad(block->in_withdraw_list);
		}
		++chunk;
	}

	ib::info() << "buffer pool " << i << " : withdrawn target "
		<< UT_LIST_GET_LEN(buf_pool->withdraw) << " blocks.";

	/* retry is not needed */
	++buf_withdraw_clock;

	return(false);
}

/** Resize page_hash and zip_hash for a buffer pool instance.
@param[in]	buf_pool	buffer pool instance */
static
void
buf_pool_resize_hash(
	buf_pool_t*	buf_pool)
{
	hash_table_t*	new_hash_table;

	ut_ad(buf_pool->page_hash_old == NULL);

	/* recreate page_hash */
	new_hash_table = ib_recreate(
		buf_pool->page_hash, 2 * buf_pool->curr_size);

	for (ulint i = 0; i < hash_get_n_cells(buf_pool->page_hash); i++) {
		buf_page_t*	bpage;

		bpage = static_cast<buf_page_t*>(
			HASH_GET_FIRST(
				buf_pool->page_hash, i));

		while (bpage) {
			buf_page_t*	prev_bpage = bpage;
			ulint		fold;

			bpage = static_cast<buf_page_t*>(
				HASH_GET_NEXT(
					hash, prev_bpage));

			fold = prev_bpage->id.fold();

			HASH_DELETE(buf_page_t, hash,
				buf_pool->page_hash, fold,
				prev_bpage);

			HASH_INSERT(buf_page_t, hash,
				new_hash_table, fold,
				prev_bpage);
		}
	}

	buf_pool->page_hash_old = buf_pool->page_hash;
	buf_pool->page_hash = new_hash_table;
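
	/* The old page_hash cannot be freed here: readers may still
	hold its rw-locks. It is parked in page_hash_old and freed by
	buf_pool_resize() after all latches have been released. */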

	/* recreate zip_hash */
	new_hash_table = hash_create(2 * buf_pool->curr_size);

	for (ulint i = 0; i < hash_get_n_cells(buf_pool->zip_hash); i++) {
		buf_page_t*	bpage;

		bpage = static_cast<buf_page_t*>(
			HASH_GET_FIRST(buf_pool->zip_hash, i));

		while (bpage) {
			buf_page_t*	prev_bpage = bpage;
			ulint		fold;

			bpage = static_cast<buf_page_t*>(
				HASH_GET_NEXT(
					hash, prev_bpage));

			fold = BUF_POOL_ZIP_FOLD(
				reinterpret_cast<buf_block_t*>(
					prev_bpage));

			HASH_DELETE(buf_page_t, hash,
				buf_pool->zip_hash, fold,
				prev_bpage);

			HASH_INSERT(buf_page_t, hash,
				new_hash_table, fold,
				prev_bpage);
		}
	}

	hash_table_free(buf_pool->zip_hash);
	buf_pool->zip_hash = new_hash_table;
}

#ifndef DBUG_OFF
/** This is a debug routine to inject a memory allocation failure. */
static
void
buf_pool_resize_chunk_make_null(buf_chunk_t** new_chunks)
{
	static int count = 0;

	if (count == 1) {
		ut_free(*new_chunks);
		*new_chunks = NULL;
	}

	count++;
}
#endif // DBUG_OFF

/** Resize the buffer pool from srv_buf_pool_old_size to
srv_buf_pool_size. */
static
void
buf_pool_resize()
{
	buf_pool_t*	buf_pool;
	ulint		new_instance_size;
	bool		warning = false;

	NUMA_MEMPOLICY_INTERLEAVE_IN_SCOPE;

	ut_ad(!buf_pool_resizing);
	ut_ad(!buf_pool_withdrawing);
	ut_ad(srv_buf_pool_chunk_unit > 0);

	new_instance_size = srv_buf_pool_size / srv_buf_pool_instances;
	new_instance_size >>= srv_page_size_shift;

	buf_resize_status("Resizing buffer pool from " ULINTPF " to "
			  ULINTPF " (unit=" ULINTPF ").",
			  srv_buf_pool_old_size, srv_buf_pool_size,
			  srv_buf_pool_chunk_unit);

	/* set the new size limit for all buffer pool instances */
	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool = buf_pool_from_array(i);
		buf_pool_mutex_enter(buf_pool);

		ut_ad(buf_pool->curr_size == buf_pool->old_size);
		ut_ad(buf_pool->n_chunks_new == buf_pool->n_chunks);
		ut_ad(UT_LIST_GET_LEN(buf_pool->withdraw) == 0);
		ut_ad(buf_pool->flush_rbt == NULL);

		buf_pool->curr_size = new_instance_size;

		buf_pool->n_chunks_new =
			(new_instance_size << srv_page_size_shift)
			/ srv_buf_pool_chunk_unit;

		buf_pool_mutex_exit(buf_pool);
	}
#ifdef BTR_CUR_HASH_ADAPT
	/* disable AHI if needed */
	bool	btr_search_disabled = false;

	buf_resize_status("Disabling adaptive hash index.");

	btr_search_s_lock_all();
	if (btr_search_enabled) {
		btr_search_s_unlock_all();
		btr_search_disabled = true;
	} else {
		btr_search_s_unlock_all();
	}

	btr_search_disable(true);

	if (btr_search_disabled) {
		ib::info() << "disabled adaptive hash index.";
	}
#endif /* BTR_CUR_HASH_ADAPT */

	/* set withdraw target */
	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool = buf_pool_from_array(i);
		if (buf_pool->curr_size < buf_pool->old_size) {
			ulint	withdraw_target = 0;

			const buf_chunk_t*	chunk
				= buf_pool->chunks + buf_pool->n_chunks_new;
			const buf_chunk_t*	echunk
				= buf_pool->chunks + buf_pool->n_chunks;

			while (chunk < echunk) {
				withdraw_target += chunk->size;
				++chunk;
			}

			ut_ad(buf_pool->withdraw_target == 0);
			buf_pool->withdraw_target = withdraw_target;
			buf_pool_withdrawing = true;
		}
	}

	buf_resize_status("Withdrawing blocks to be shrunken.");

	ib_time_t	withdraw_started = ut_time();
	ulint		message_interval = 60;
	ulint		retry_interval = 1;
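
	/* Progress and blocking transactions are reported when a
	withdraw pass is still incomplete after message_interval
	seconds (starting at 60 and doubling up to a cap of 1800);
	each retry is delayed by retry_interval seconds (starting at
	1 and doubling up to a cap of 10). */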

withdraw_retry:
	bool	should_retry_withdraw = false;

	/* withdraw blocks until the number of blocks fits the new size
	(if needed) */
	for (ulint i = 0; i < srv_buf_pool_instances; i++) {
		buf_pool = buf_pool_from_array(i);
		if (buf_pool->curr_size < buf_pool->old_size) {

			should_retry_withdraw |=
				buf_pool_withdraw_blocks(buf_pool);
		}
	}

	if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
		/* abort to resize for shutdown. */
		buf_pool_withdrawing = false;
		return;
	}

	/* abort buffer pool load */
	buf_load_abort();

	if (should_retry_withdraw
	    && ut_difftime(ut_time(), withdraw_started) >= message_interval) {

		if (message_interval > 900) {
			message_interval = 1800;
		} else {
			message_interval *= 2;
		}

		lock_mutex_enter();
		mutex_enter(&trx_sys.mutex);
		bool	found = false;
		for (trx_t* trx = UT_LIST_GET_FIRST(trx_sys.trx_list);
		     trx != NULL;
		     trx = UT_LIST_GET_NEXT(trx_list, trx)) {
			if (trx->state != TRX_STATE_NOT_STARTED
			    && trx->mysql_thd != NULL
			    && ut_difftime(withdraw_started,
					   trx->start_time) > 0) {
				if (!found) {
					ib::warn() <<
						"The following trx might hold"
						" the blocks in buffer pool to"
						" be withdrawn. Buffer pool"
						" resizing can complete only"
						" after all the transactions"
						" below release the blocks.";
					found = true;
				}

				lock_trx_print_wait_and_mvcc_state(
					stderr, trx);
			}
		}
		mutex_exit(&trx_sys.mutex);
		lock_mutex_exit();

		withdraw_started = ut_time();
	}

	if (should_retry_withdraw) {
		ib::info() << "Will retry to withdraw " << retry_interval
			<< " seconds later.";
		os_thread_sleep(retry_interval * 1000000);

		if (retry_interval > 5) {
			retry_interval = 10;
		} else {
			retry_interval *= 2;
		}

		goto withdraw_retry;
	}

	buf_pool_withdrawing = false;

	buf_resize_status("Latching the whole buffer pool.");

#ifndef DBUG_OFF
	{
		bool	should_wait = true;

		while (should_wait) {
			should_wait = false;
			DBUG_EXECUTE_IF(
				"ib_buf_pool_resize_wait_before_resize",
				should_wait = true; os_thread_sleep(10000););
		}
	}
#endif /* !DBUG_OFF */

	if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
		return;
	}

	/* Indicate critical path */
	buf_pool_resizing = true;

	/* Acquire all buf_pool_mutex/hash_lock */
	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
		buf_pool_t*	buf_pool = buf_pool_from_array(i);

		buf_pool_mutex_enter(buf_pool);
	}
	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
		buf_pool_t*	buf_pool = buf_pool_from_array(i);

		hash_lock_x_all(buf_pool->page_hash);
	}

	buf_chunk_map_reg = UT_NEW_NOKEY(buf_pool_chunk_map_t());

	/* add/delete chunks */
	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
		buf_pool_t*	buf_pool = buf_pool_from_array(i);
		buf_chunk_t*	chunk;
		buf_chunk_t*	echunk;

		buf_resize_status("buffer pool %lu :"
			" resizing with chunks %lu to %lu.",
			i, buf_pool->n_chunks, buf_pool->n_chunks_new);

		if (buf_pool->n_chunks_new < buf_pool->n_chunks) {
			/* delete chunks */
			chunk = buf_pool->chunks
				+ buf_pool->n_chunks_new;
			echunk = buf_pool->chunks + buf_pool->n_chunks;

			ulint	sum_freed = 0;

			while (chunk < echunk) {
				buf_block_t*	block = chunk->blocks;

				for (ulint j = chunk->size;
				     j--; block++) {
					mutex_free(&block->mutex);
					rw_lock_free(&block->lock);

					ut_d(rw_lock_free(
						&block->debug_latch));
				}

				buf_pool->allocator.deallocate_large(
					chunk->mem, &chunk->mem_pfx, true);

				sum_freed += chunk->size;

				++chunk;
			}

			/* discard withdraw list */
			UT_LIST_INIT(buf_pool->withdraw,
				     &buf_page_t::list);
			buf_pool->withdraw_target = 0;

			ib::info() << "buffer pool " << i << " : "
				<< buf_pool->n_chunks - buf_pool->n_chunks_new
				<< " chunks (" << sum_freed
				<< " blocks) were freed.";

			buf_pool->n_chunks = buf_pool->n_chunks_new;
		}

		{
			/* reallocate buf_pool->chunks */
			const ulint	new_chunks_size
				= buf_pool->n_chunks_new * sizeof(*chunk);

			buf_chunk_t*	new_chunks
				= reinterpret_cast<buf_chunk_t*>(
					ut_zalloc_nokey_nofatal(new_chunks_size));

			DBUG_EXECUTE_IF("buf_pool_resize_chunk_null",
				buf_pool_resize_chunk_make_null(&new_chunks););

			if (new_chunks == NULL) {
				ib::error() << "buffer pool " << i
					<< " : failed to allocate"
					" the chunk array.";
				buf_pool->n_chunks_new
					= buf_pool->n_chunks;
				warning = true;
				buf_pool->chunks_old = NULL;
				for (ulint j = 0; j < buf_pool->n_chunks_new; j++) {
					buf_pool_register_chunk(&(buf_pool->chunks[j]));
				}
				goto calc_buf_pool_size;
			}

			ulint	n_chunks_copy = ut_min(buf_pool->n_chunks_new,
						       buf_pool->n_chunks);

			memcpy(new_chunks, buf_pool->chunks,
			       n_chunks_copy * sizeof(*chunk));

			for (ulint j = 0; j < n_chunks_copy; j++) {
				buf_pool_register_chunk(&new_chunks[j]);
			}

			buf_pool->chunks_old = buf_pool->chunks;
			buf_pool->chunks = new_chunks;
		}

		if (buf_pool->n_chunks_new > buf_pool->n_chunks) {
			/* add chunks */
			chunk = buf_pool->chunks + buf_pool->n_chunks;
			echunk = buf_pool->chunks
				+ buf_pool->n_chunks_new;

			ulint	sum_added = 0;
			ulint	n_chunks = buf_pool->n_chunks;

			while (chunk < echunk) {
				ulong	unit = srv_buf_pool_chunk_unit;

				if (!buf_chunk_init(buf_pool, chunk, unit)) {

					ib::error() << "buffer pool " << i
						<< " : failed to allocate"
						" new memory.";

					warning = true;

					buf_pool->n_chunks_new
						= n_chunks;

					break;
				}

				sum_added += chunk->size;

				++n_chunks;
				++chunk;
			}

			ib::info() << "buffer pool " << i << " : "
				<< buf_pool->n_chunks_new - buf_pool->n_chunks
				<< " chunks (" << sum_added
				<< " blocks) were added.";

			buf_pool->n_chunks = n_chunks;
		}
calc_buf_pool_size:

		/* recalc buf_pool->curr_size */
		ulint	new_size = 0;

		chunk = buf_pool->chunks;
		do {
			new_size += chunk->size;
		} while (++chunk < buf_pool->chunks
			 + buf_pool->n_chunks);

		buf_pool->curr_size = new_size;
		buf_pool->n_chunks_new = buf_pool->n_chunks;

		if (buf_pool->chunks_old) {
			ut_free(buf_pool->chunks_old);
			buf_pool->chunks_old = NULL;
		}
	}

	buf_pool_chunk_map_t*	chunk_map_old = buf_chunk_map_ref;
	buf_chunk_map_ref = buf_chunk_map_reg;

	/* set instance sizes */
	{
		ulint	curr_size = 0;

		for (ulint i = 0; i < srv_buf_pool_instances; i++) {
			buf_pool = buf_pool_from_array(i);

			ut_ad(UT_LIST_GET_LEN(buf_pool->withdraw) == 0);

			buf_pool->read_ahead_area =
				ut_min(BUF_READ_AHEAD_PAGES,
				       ut_2_power_up(buf_pool->curr_size /
						     BUF_READ_AHEAD_PORTION));
			buf_pool->curr_pool_size
				= buf_pool->curr_size << srv_page_size_shift;
			curr_size += buf_pool->curr_pool_size;
			buf_pool->old_size = buf_pool->curr_size;
		}
		srv_buf_pool_curr_size = curr_size;
		innodb_set_buf_pool_size(buf_pool_size_align(curr_size));
	}

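	/* Only rebuild the hash tables when the pool has grown or
	shrunk by more than a factor of two since they were last
	sized. */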
	const bool	new_size_too_diff
		= srv_buf_pool_base_size > srv_buf_pool_size * 2
		|| srv_buf_pool_base_size * 2 < srv_buf_pool_size;

	/* Normalize page_hash and zip_hash,
	if the new size is too different */
	if (!warning && new_size_too_diff) {

		buf_resize_status("Resizing hash tables.");

		for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
			buf_pool_t*	buf_pool = buf_pool_from_array(i);

			buf_pool_resize_hash(buf_pool);

			ib::info() << "buffer pool " << i
				<< " : hash tables were resized.";
		}
	}

	/* Release all buf_pool_mutex/page_hash */
	for (ulint i = 0; i < srv_buf_pool_instances; ++i) {
		buf_pool_t*	buf_pool = buf_pool_from_array(i);

		hash_unlock_x_all(buf_pool->page_hash);
		buf_pool_mutex_exit(buf_pool);

		if (buf_pool->page_hash_old != NULL) {
			hash_table_free(buf_pool->page_hash_old);
			buf_pool->page_hash_old = NULL;
		}
	}

	UT_DELETE(chunk_map_old);

	buf_pool_resizing = false;

	/* Normalize other components, if the new size is too different */
	if (!warning && new_size_too_diff) {
		srv_buf_pool_base_size = srv_buf_pool_size;

		buf_resize_status("Resizing also other hash tables.");

		/* normalize lock_sys */
		srv_lock_table_size = 5
			* (srv_buf_pool_size >> srv_page_size_shift);
		lock_sys.resize(srv_lock_table_size);

		/* normalize btr_search_sys */
		btr_search_sys_resize(
			buf_pool_get_curr_size() / sizeof(void*) / 64);

		/* normalize dict_sys */
		dict_resize();

		ib::info() << "Resized hash tables at lock_sys,"
#ifdef BTR_CUR_HASH_ADAPT
			" adaptive hash index,"
#endif /* BTR_CUR_HASH_ADAPT */
			" dictionary.";
	}

	/* normalize ibuf->max_size */
	ibuf_max_size_update(srv_change_buffer_max_size);

	if (srv_buf_pool_old_size != srv_buf_pool_size) {

		ib::info() << "Completed resizing buffer pool from "
			<< srv_buf_pool_old_size
			<< " to " << srv_buf_pool_size << ".";
		srv_buf_pool_old_size = srv_buf_pool_size;
	}

#ifdef BTR_CUR_HASH_ADAPT
	/* enable AHI if needed */
	if (btr_search_disabled) {
		btr_search_enable();
		ib::info() << "Re-enabled adaptive hash index.";
	}
#endif /* BTR_CUR_HASH_ADAPT */

	char	now[32];

	ut_sprintf_timestamp(now);
	if (!warning) {
		buf_resize_status("Completed resizing buffer pool at %s.",
			now);
	} else {
		buf_resize_status("Resizing buffer pool failed,"
			" finished resizing at %s.", now);
	}

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_validate());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	return;
}

/** This is the thread for resizing the buffer pool. It waits for an
event and, when woken up, performs the resize and sleeps again.
@return this function does not return; it calls os_thread_exit() */
extern "C"
os_thread_ret_t
DECLARE_THREAD(buf_resize_thread)(void*)
{
	my_thread_init();

	while (srv_shutdown_state == SRV_SHUTDOWN_NONE) {
		os_event_wait(srv_buf_resize_event);
		os_event_reset(srv_buf_resize_event);

		if (srv_shutdown_state != SRV_SHUTDOWN_NONE) {
			break;
		}

		buf_pool_mutex_enter_all();
		if (srv_buf_pool_old_size == srv_buf_pool_size) {
			buf_pool_mutex_exit_all();
			std::ostringstream sout;
			sout << "Size did not change (old size = new size = "
				<< srv_buf_pool_size << "). Nothing to do.";
			buf_resize_status(sout.str().c_str());

			/* nothing to do */
			continue;
		}
		buf_pool_mutex_exit_all();

		buf_pool_resize();
	}

	srv_buf_resize_thread_active = false;

	my_thread_end();
	os_thread_exit();

	OS_THREAD_DUMMY_RETURN;
}

#ifdef BTR_CUR_HASH_ADAPT
/** Clear the adaptive hash index on all pages in the buffer pool. */
void
buf_pool_clear_hash_index()
{
	ulint	p;

	ut_ad(btr_search_own_all(RW_LOCK_X));
	ut_ad(!buf_pool_resizing);
	ut_ad(!btr_search_enabled);

	for (p = 0; p < srv_buf_pool_instances; p++) {
		buf_pool_t*	buf_pool = buf_pool_from_array(p);
		buf_chunk_t*	chunks = buf_pool->chunks;
		buf_chunk_t*	chunk = chunks + buf_pool->n_chunks;

		while (--chunk >= chunks) {
			buf_block_t*	block = chunk->blocks;
			ulint		i = chunk->size;

			for (; i--; block++) {
				dict_index_t*	index = block->index;
				assert_block_ahi_valid(block);

				/* We can set block->index = NULL
				and block->n_pointers = 0
				when btr_search_own_all(RW_LOCK_X);
				see the comments in buf0buf.h */

				if (!index) {
# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
					ut_a(!block->n_pointers);
# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
					continue;
				}

				ut_d(buf_page_state	state
					= buf_block_get_state(block));
				/* Another thread may have set the
				state to BUF_BLOCK_REMOVE_HASH in
				buf_LRU_block_remove_hashed().

				The state change in buf_page_realloc()
				is not observable here, because in
				that case we would have !block->index.

				In the end, the entire adaptive hash
				index will be removed. */
				ut_ad(state == BUF_BLOCK_FILE_PAGE
				      || state == BUF_BLOCK_REMOVE_HASH);
# if defined UNIV_AHI_DEBUG || defined UNIV_DEBUG
				block->n_pointers = 0;
# endif /* UNIV_AHI_DEBUG || UNIV_DEBUG */
				block->index = NULL;
			}
		}
	}
}
#endif /* BTR_CUR_HASH_ADAPT */

/********************************************************************//**
Relocate a buffer control block. Relocates the block on the LRU list
and in buf_pool->page_hash. Does not relocate bpage->list.
The caller must take care of relocating bpage->list. */
static
void
buf_relocate(
/*=========*/
	buf_page_t*	bpage,	/*!< in/out: control block being relocated;
				buf_page_get_state(bpage) must be
				BUF_BLOCK_ZIP_DIRTY or BUF_BLOCK_ZIP_PAGE */
	buf_page_t*	dpage)	/*!< in/out: destination control block */
{
	buf_page_t*	b;
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);

	ut_ad(buf_pool_mutex_own(buf_pool));
	ut_ad(buf_page_hash_lock_held_x(buf_pool, bpage));
	ut_ad(mutex_own(buf_page_get_mutex(bpage)));
	ut_a(buf_page_get_io_fix(bpage) == BUF_IO_NONE);
	ut_a(bpage->buf_fix_count == 0);
	ut_ad(bpage->in_LRU_list);
	ut_ad(!bpage->in_zip_hash);
	ut_ad(bpage->in_page_hash);
	ut_ad(bpage == buf_page_hash_get_low(buf_pool, bpage->id));

	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
#ifdef UNIV_DEBUG
	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_POOL_WATCH:
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_FILE_PAGE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
		ut_error;
	case BUF_BLOCK_ZIP_DIRTY:
	case BUF_BLOCK_ZIP_PAGE:
		break;
	}
#endif /* UNIV_DEBUG */
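
	/* Copy the whole control block, then repair what the shallow
	copy cannot preserve: the LRU list linkage and the page_hash
	entry. */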

	memcpy(dpage, bpage, sizeof *dpage);

	/* Important that we adjust the hazard pointer before
	removing bpage from LRU list. */
	buf_LRU_adjust_hp(buf_pool, bpage);

	ut_d(bpage->in_LRU_list = FALSE);
	ut_d(bpage->in_page_hash = FALSE);

	/* relocate buf_pool->LRU */
	b = UT_LIST_GET_PREV(LRU, bpage);
	UT_LIST_REMOVE(buf_pool->LRU, bpage);

	if (b != NULL) {
		UT_LIST_INSERT_AFTER(buf_pool->LRU, b, dpage);
	} else {
		UT_LIST_ADD_FIRST(buf_pool->LRU, dpage);
	}

	if (UNIV_UNLIKELY(buf_pool->LRU_old == bpage)) {
		buf_pool->LRU_old = dpage;
#ifdef UNIV_LRU_DEBUG
		/* buf_pool->LRU_old must be the first item in the LRU list
		whose "old" flag is set. */
		ut_a(buf_pool->LRU_old->old);
		ut_a(!UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)
		     || !UT_LIST_GET_PREV(LRU, buf_pool->LRU_old)->old);
		ut_a(!UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)
		     || UT_LIST_GET_NEXT(LRU, buf_pool->LRU_old)->old);
	} else {
		/* Check that the "old" flag is consistent in
		the block and its neighbours. */
		buf_page_set_old(dpage, buf_page_is_old(dpage));
#endif /* UNIV_LRU_DEBUG */
	}

	ut_d(CheckInLRUList::validate(buf_pool));

	/* relocate buf_pool->page_hash */
	ulint	fold = bpage->id.fold();
	ut_ad(fold == dpage->id.fold());
	HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, fold, bpage);
	HASH_INSERT(buf_page_t, hash, buf_pool->page_hash, fold, dpage);
}

/** Hazard Pointer implementation. */

/** Set current value
@param bpage	buffer block to be set as hp */
void
HazardPointer::set(buf_page_t* bpage)
{
	ut_ad(mutex_own(m_mutex));
	ut_ad(!bpage || buf_pool_from_bpage(bpage) == m_buf_pool);
	ut_ad(!bpage || buf_page_in_file(bpage));

	m_hp = bpage;
}

/** Checks if a bpage is the hp
@param bpage	buffer block to be compared
@return true if it is hp */

bool
HazardPointer::is_hp(const buf_page_t* bpage)
{
	ut_ad(mutex_own(m_mutex));
	ut_ad(!m_hp || buf_pool_from_bpage(m_hp) == m_buf_pool);
	ut_ad(!bpage || buf_pool_from_bpage(bpage) == m_buf_pool);

	return(bpage == m_hp);
}

/** Adjust the value of hp. This happens when some other thread working
on the same list attempts to remove the hp from the list.
@param bpage	buffer block to be compared */

void
FlushHp::adjust(const buf_page_t* bpage)
{
	ut_ad(bpage != NULL);

	/** We only support reverse traversal for now. */
	if (is_hp(bpage)) {
		m_hp = UT_LIST_GET_PREV(list, m_hp);
	}

	ut_ad(!m_hp || m_hp->in_flush_list);
}

/** Adjust the value of hp. This happens when some other thread working
on the same list attempts to remove the hp from the list.
@param bpage	buffer block to be compared */

void
LRUHp::adjust(const buf_page_t* bpage)
{
	ut_ad(bpage);

	/** We only support reverse traversal for now. */
	if (is_hp(bpage)) {
		m_hp = UT_LIST_GET_PREV(LRU, m_hp);
	}

	ut_ad(!m_hp || m_hp->in_LRU_list);
}

/** Selects from where to start a scan. If we have scanned too deep into
the LRU list, the scan is restarted from the tail of the LRU list.
@return buf_page_t from where to start the scan */

buf_page_t*
LRUItr::start()
{
	ut_ad(mutex_own(m_mutex));

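	/* Restart from the tail of the LRU list if the hazard pointer
	is unset or has moved into the old sublist. */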
	if (!m_hp || m_hp->old) {
		m_hp = UT_LIST_GET_LAST(m_buf_pool->LRU);
	}

	return(m_hp);
}

/** Determine if a block is a sentinel for a buffer pool watch.
@param[in]	buf_pool	buffer pool instance
@param[in]	bpage		block
@return TRUE if a sentinel for a buffer pool watch, FALSE if not */
ibool
buf_pool_watch_is_sentinel(
	const buf_pool_t*	buf_pool,
	const buf_page_t*	bpage)
{
	/* We must also own the appropriate hash lock. */
	ut_ad(buf_page_hash_lock_held_s_or_x(buf_pool, bpage));
	ut_ad(buf_page_in_file(bpage));

	if (bpage < &buf_pool->watch[0]
	    || bpage >= &buf_pool->watch[BUF_POOL_WATCH_SIZE]) {

		ut_ad(buf_page_get_state(bpage) != BUF_BLOCK_ZIP_PAGE
		      || bpage->zip.data != NULL);

		return(FALSE);
	}

	ut_ad(buf_page_get_state(bpage) == BUF_BLOCK_ZIP_PAGE);
	ut_ad(!bpage->in_zip_hash);
	ut_ad(bpage->in_page_hash);
	ut_ad(bpage->zip.data == NULL);
	return(TRUE);
}

/** Add watch for the given page to be read in. Caller must have
appropriate hash_lock for the bpage. This function may release the
hash_lock and reacquire it.
@param[in]	page_id		page id
@param[in,out]	hash_lock	hash_lock currently latched
@return NULL if watch set, block if the page is in the buffer pool */
static
buf_page_t*
buf_pool_watch_set(
	const page_id_t&	page_id,
	rw_lock_t**		hash_lock)
{
	buf_page_t*	bpage;
	ulint		i;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);

	ut_ad(*hash_lock == buf_page_hash_lock_get(buf_pool, page_id));

	ut_ad(rw_lock_own(*hash_lock, RW_LOCK_X));

	bpage = buf_page_hash_get_low(buf_pool, page_id);

	if (bpage != NULL) {
page_found:
		if (!buf_pool_watch_is_sentinel(buf_pool, bpage)) {
			/* The page was loaded meanwhile. */
			return(bpage);
		}

		/* Add to an existing watch. */
		buf_block_fix(bpage);
		return(NULL);
	}

	/* From this point this function becomes fairly heavy in terms
	of latching. We acquire the buf_pool mutex as well as all the
	hash_locks. buf_pool mutex is needed because any changes to
	the page_hash must be covered by it and hash_locks are needed
	because we don't want to read any stale information in
	buf_pool->watch[]. However, it is not in the critical code path
	as this function will be called only by the purge thread. */

	/* To obey latching order first release the hash_lock. */
	rw_lock_x_unlock(*hash_lock);

	buf_pool_mutex_enter(buf_pool);
	hash_lock_x_all(buf_pool->page_hash);

	/* page_hash could have been modified while we did not hold
	buf_pool->mutex: look up the hash_lock again. */
	*hash_lock = buf_page_hash_lock_get(buf_pool, page_id);

	/* We have to recheck that the page was not loaded and that a
	watch was not set by some other purge thread, because of the
	small time window between releasing the hash_lock and
	acquiring the buf_pool mutex above. */

	bpage = buf_page_hash_get_low(buf_pool, page_id);
	if (UNIV_LIKELY_NULL(bpage)) {
		buf_pool_mutex_exit(buf_pool);
		hash_unlock_x_all_but(buf_pool->page_hash, *hash_lock);
		goto page_found;
	}

	/* The maximum number of purge threads should never exceed
	BUF_POOL_WATCH_SIZE. So there is no way for purge thread
	instance to hold a watch when setting another watch. */
	for (i = 0; i < BUF_POOL_WATCH_SIZE; i++) {
		bpage = &buf_pool->watch[i];

		ut_ad(bpage->access_time == 0);
		ut_ad(bpage->newest_modification == 0);
		ut_ad(bpage->oldest_modification == 0);
		ut_ad(bpage->zip.data == NULL);
		ut_ad(!bpage->in_zip_hash);

		switch (bpage->state) {
		case BUF_BLOCK_POOL_WATCH:
			ut_ad(!bpage->in_page_hash);
			ut_ad(bpage->buf_fix_count == 0);

			/* bpage is pointing to buf_pool->watch[],
			which is protected by buf_pool->mutex.
			Normally, buf_page_t objects are protected by
			buf_block_t::mutex or buf_pool->zip_mutex or both. */

			bpage->state = BUF_BLOCK_ZIP_PAGE;
			bpage->id.copy_from(page_id);
			bpage->buf_fix_count = 1;

			ut_d(bpage->in_page_hash = TRUE);
			HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
				    page_id.fold(), bpage);

			buf_pool_mutex_exit(buf_pool);
			/* Once the sentinel is in the page_hash we can
			safely release all locks except the relevant
			hash_lock */
			hash_unlock_x_all_but(buf_pool->page_hash,
					      *hash_lock);

			return(NULL);
		case BUF_BLOCK_ZIP_PAGE:
			ut_ad(bpage->in_page_hash);
			ut_ad(bpage->buf_fix_count > 0);
			break;
		default:
			ut_error;
		}
	}

	/* Allocation failed. Either the maximum number of purge
	threads should never exceed BUF_POOL_WATCH_SIZE, or this code
	should be modified to return a special non-NULL value and the
	caller should purge the record directly. */
	ut_error;

	/* Fix compiler warning */
	return(NULL);
}

/** Remove the sentinel block for the watch before replacing it with a
real block. buf_page_watch_clear() or buf_page_watch_occurred() will notice
that the block has been replaced with the real block.
@param[in,out]	buf_pool	buffer pool instance
@param[in,out]	watch		sentinel for watch */
static
void
buf_pool_watch_remove(
	buf_pool_t*	buf_pool,
	buf_page_t*	watch)
{
#ifdef UNIV_DEBUG
	/* We must also own the appropriate hash_bucket mutex. */
	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, watch->id);
	ut_ad(rw_lock_own(hash_lock, RW_LOCK_X));
#endif /* UNIV_DEBUG */

	ut_ad(buf_pool_mutex_own(buf_pool));

	HASH_DELETE(buf_page_t, hash, buf_pool->page_hash, watch->id.fold(),
		    watch);
	ut_d(watch->in_page_hash = FALSE);
	watch->buf_fix_count = 0;
	watch->state = BUF_BLOCK_POOL_WATCH;
}

/** Stop watching if the page has been read in.
buf_pool_watch_set(same_page_id) must have returned NULL before.
@param[in]	page_id	page id */
void
buf_pool_watch_unset(
	const page_id_t&	page_id)
{
	buf_page_t*	bpage;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);

	/* We only need the buf_pool mutex in the case where we end up
	calling buf_pool_watch_remove, but to obey the latching order
	we acquire it here before acquiring the hash_lock. This should
	not cause too much grief as this function is only ever called
	from the purge thread. */
	buf_pool_mutex_enter(buf_pool);

	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
	rw_lock_x_lock(hash_lock);

	/* The page must exist because buf_pool_watch_set()
	increments buf_fix_count. */
	bpage = buf_page_hash_get_low(buf_pool, page_id);

	if (buf_block_unfix(bpage) == 0
	    && buf_pool_watch_is_sentinel(buf_pool, bpage)) {
		buf_pool_watch_remove(buf_pool, bpage);
	}

	buf_pool_mutex_exit(buf_pool);
	rw_lock_x_unlock(hash_lock);
}

/** Check if the page has been read in.
This may only be called after buf_pool_watch_set(same_page_id)
has returned NULL and before invoking buf_pool_watch_unset(same_page_id).
@param[in]	page_id	page id
@return FALSE if the given page was not read in, TRUE if it was */
ibool
buf_pool_watch_occurred(
	const page_id_t&	page_id)
{
	ibool		ret;
	buf_page_t*	bpage;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);
	rw_lock_t*	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);

	rw_lock_s_lock(hash_lock);

	/* page_hash can be modified when buf_pool->mutex is not held:
	confirm that the hash_lock still matches the page id. */
	hash_lock = buf_page_hash_lock_s_confirm(hash_lock, buf_pool, page_id);

	/* The page must exist because buf_pool_watch_set()
	increments buf_fix_count. */
	bpage = buf_page_hash_get_low(buf_pool, page_id);

	ret = !buf_pool_watch_is_sentinel(buf_pool, bpage);
	rw_lock_s_unlock(hash_lock);

	return(ret);
}

/********************************************************************//**
Moves a page to the start of the buffer pool LRU list. This high-level
function can be used to prevent an important page from slipping out of
the buffer pool. */
void
buf_page_make_young(
/*================*/
	buf_page_t*	bpage)	/*!< in: buffer block of a file page */
{
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);

	buf_pool_mutex_enter(buf_pool);

	ut_a(buf_page_in_file(bpage));

	buf_LRU_make_block_young(bpage);

	buf_pool_mutex_exit(buf_pool);
}

/********************************************************************//**
Moves a page to the start of the buffer pool LRU list if it is too old.
This high-level function can be used to prevent an important page from
slipping out of the buffer pool. */
static
void
buf_page_make_young_if_needed(
/*==========================*/
	buf_page_t*	bpage)		/*!< in/out: buffer block of a
					file page */
{
#ifdef UNIV_DEBUG
	buf_pool_t*	buf_pool = buf_pool_from_bpage(bpage);
	ut_ad(!buf_pool_mutex_own(buf_pool));
#endif /* UNIV_DEBUG */
	ut_a(buf_page_in_file(bpage));

	if (buf_page_peek_if_too_old(bpage)) {
		buf_page_make_young(bpage);
	}
}

#ifdef UNIV_DEBUG

/** Sets file_page_was_freed TRUE if the page is found in the buffer pool.
This function should be called when we free a file page and want the
debug version to check that it is not accessed any more unless
reallocated.
@param[in]	page_id	page id
@return control block if found in page hash table, otherwise NULL */
buf_page_t*
buf_page_set_file_page_was_freed(
	const page_id_t&	page_id)
{
	buf_page_t*	bpage;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);
	rw_lock_t*	hash_lock;

	bpage = buf_page_hash_get_s_locked(buf_pool, page_id, &hash_lock);

	if (bpage) {
		BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
		mutex_enter(block_mutex);
		rw_lock_s_unlock(hash_lock);
		/* bpage->file_page_was_freed may already be set
		when this code is invoked from dict_drop_index_tree() */
3693 bpage->file_page_was_freed = TRUE;
3694 mutex_exit(block_mutex);
3695 }
3696
3697 return(bpage);
3698}
3699
/** Sets file_page_was_freed FALSE if the page is found in the buffer pool.
This function should be called when we reallocate a previously freed
file page, so that the debug version no longer flags accesses to it.
@param[in]	page_id	page id
@return control block if found in page hash table, otherwise NULL */
buf_page_t*
buf_page_reset_file_page_was_freed(
	const page_id_t&	page_id)
{
	buf_page_t*	bpage;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);
	rw_lock_t*	hash_lock;

	bpage = buf_page_hash_get_s_locked(buf_pool, page_id, &hash_lock);
	if (bpage) {
		BPageMutex*	block_mutex = buf_page_get_mutex(bpage);
		ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
		mutex_enter(block_mutex);
		rw_lock_s_unlock(hash_lock);
		bpage->file_page_was_freed = FALSE;
		mutex_exit(block_mutex);
	}

	return(bpage);
}
#endif /* UNIV_DEBUG */

/** Attempts to discard the uncompressed frame of a compressed page.
The caller should not be holding any mutexes when this function is called.
Note that this function returns nothing; it merely attempts the discard.
@param[in]	page_id	page id */
static
void
buf_block_try_discard_uncompressed(
	const page_id_t&	page_id)
{
	buf_page_t*	bpage;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);

	/* Since we need to acquire the buf_pool mutex to discard
	the uncompressed frame, and because the page_hash mutex
	resides below the buf_pool mutex in the sync ordering, we
	must first release the page_hash mutex. This means that the
	block in question can move out of page_hash. Therefore
	we need to check again if the block is still in page_hash. */
	buf_pool_mutex_enter(buf_pool);

	bpage = buf_page_hash_get(buf_pool, page_id);

	if (bpage) {
		buf_LRU_free_page(bpage, false);
	}

	buf_pool_mutex_exit(buf_pool);
}
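
/* A minimal sketch of the latching-order constraint described above:
a caller holding the page_hash lock must release it before taking
buf_pool->mutex, and then re-validate its lookup, because the block
may have been relocated in the window:

	rw_lock_s_unlock(hash_lock);		// drop page_hash lock first
	buf_pool_mutex_enter(buf_pool);		// then take buf_pool->mutex
	bpage = buf_page_hash_get(buf_pool, page_id);	// re-validate
*/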

/** Get read access to a compressed page (usually of type
FIL_PAGE_TYPE_ZBLOB or FIL_PAGE_TYPE_ZBLOB2).
The page must be released with buf_page_release_zip().
NOTE: the page is not protected by any latch. Mutual exclusion has to
be implemented at a higher level. In other words, all possible
accesses to a given page through this function must be protected by
the same set of mutexes or latches.
@param[in]	page_id		page id
@param[in]	page_size	page size
@return pointer to the block */
buf_page_t*
buf_page_get_zip(
	const page_id_t&	page_id,
	const page_size_t&	page_size)
{
	buf_page_t*	bpage;
	BPageMutex*	block_mutex;
	rw_lock_t*	hash_lock;
	ibool		discard_attempted = FALSE;
	ibool		must_read;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);

	buf_pool->stat.n_page_gets++;

	for (;;) {
lookup:

		/* The following call will also grab the page_hash
		mutex if the page is found. */
		bpage = buf_page_hash_get_s_locked(buf_pool, page_id,
						   &hash_lock);
		if (bpage) {
			ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));
			break;
		}

		/* Page not in buf_pool: needs to be read from file */

		ut_ad(!hash_lock);
		dberr_t	err = buf_read_page(page_id, page_size);

		if (err != DB_SUCCESS) {
			ib::error() << "Reading compressed page " << page_id
				<< " failed with error: " << ut_strerr(err);

			goto err_exit;
		}

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
		ut_a(++buf_dbg_counter % 5771 || buf_validate());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
	}

	ut_ad(buf_page_hash_lock_held_s(buf_pool, bpage));

	if (!bpage->zip.data) {
		/* There is no compressed page. */
err_exit:
		rw_lock_s_unlock(hash_lock);
		return(NULL);
	}

	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, bpage));

	switch (buf_page_get_state(bpage)) {
	case BUF_BLOCK_POOL_WATCH:
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
		ut_error;

	case BUF_BLOCK_ZIP_PAGE:
	case BUF_BLOCK_ZIP_DIRTY:
		buf_block_fix(bpage);
		block_mutex = &buf_pool->zip_mutex;
		mutex_enter(block_mutex);
		goto got_block;
	case BUF_BLOCK_FILE_PAGE:
		/* Discard the uncompressed page frame if possible. */
		if (!discard_attempted) {
			rw_lock_s_unlock(hash_lock);
			buf_block_try_discard_uncompressed(page_id);
			discard_attempted = TRUE;
			goto lookup;
		}

		buf_block_buf_fix_inc((buf_block_t*) bpage,
				      __FILE__, __LINE__);

		block_mutex = &((buf_block_t*) bpage)->mutex;

		mutex_enter(block_mutex);

		goto got_block;
	}

	ut_error;
	goto err_exit;

got_block:
	must_read = buf_page_get_io_fix(bpage) == BUF_IO_READ;

	rw_lock_s_unlock(hash_lock);

	ut_ad(!bpage->file_page_was_freed);

	buf_page_set_accessed(bpage);

	mutex_exit(block_mutex);

	buf_page_make_young_if_needed(bpage);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(++buf_dbg_counter % 5771 || buf_validate());
	ut_a(bpage->buf_fix_count > 0);
	ut_a(buf_page_in_file(bpage));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	if (must_read) {
		/* Let us wait until the read operation
		completes */

		for (;;) {
			enum buf_io_fix	io_fix;

			mutex_enter(block_mutex);
			io_fix = buf_page_get_io_fix(bpage);
			mutex_exit(block_mutex);

			if (io_fix == BUF_IO_READ) {

				os_thread_sleep(WAIT_FOR_READ);
			} else {
				break;
			}
		}
	}

#ifdef UNIV_IBUF_COUNT_DEBUG
	ut_a(ibuf_count_get(page_id) == 0);
#endif /* UNIV_IBUF_COUNT_DEBUG */

	return(bpage);
}
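
/* A minimal usage sketch for buf_page_get_zip(), assuming the caller
provides its own mutual exclusion as the contract above requires; the
space id, page number and ROW_FORMAT=COMPRESSED page size are
illustrative only:

	const page_id_t		id(space_id, page_no);
	const page_size_t	size(zip_size, srv_page_size, true);

	if (buf_page_t* bpage = buf_page_get_zip(id, size)) {
		// ... read from bpage->zip.data ...
		buf_page_release_zip(bpage);	// mandatory release
	}
*/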

/********************************************************************//**
Initialize some fields of a control block. */
UNIV_INLINE
void
buf_block_init_low(
/*===============*/
	buf_block_t*	block)	/*!< in: block to init */
{
	block->skip_flush_check = false;
#ifdef BTR_CUR_HASH_ADAPT
	/* No adaptive hash index entries may point to a previously
	unused (and now freshly allocated) block. */
	assert_block_ahi_empty_on_init(block);
	block->index		= NULL;

	block->n_hash_helps	= 0;
	block->n_fields		= 1;
	block->n_bytes		= 0;
	block->left_side	= TRUE;
#endif /* BTR_CUR_HASH_ADAPT */
}

/********************************************************************//**
Decompress a block.
@return TRUE if successful */
ibool
buf_zip_decompress(
/*===============*/
	buf_block_t*	block,	/*!< in/out: block */
	ibool		check)	/*!< in: TRUE=verify the page checksum */
{
	const byte*	frame = block->page.zip.data;
	ulint		size = page_zip_get_size(&block->page.zip);
	/* The tablespace will not be found if this function is called
	during IMPORT. */
	fil_space_t* space = fil_space_acquire_for_io(block->page.id.space());
	const unsigned key_version = mach_read_from_4(
		frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
	fil_space_crypt_t* crypt_data = space ? space->crypt_data : NULL;
	const bool encrypted = crypt_data
		&& crypt_data->type != CRYPT_SCHEME_UNENCRYPTED
		&& (!crypt_data->is_default_encryption()
		    || srv_encrypt_tables);

	ut_ad(block->page.size.is_compressed());
	ut_a(block->page.id.space() != 0);

	if (UNIV_UNLIKELY(check && !page_zip_verify_checksum(frame, size))) {

		ib::error() << "Compressed page checksum mismatch for "
			<< (space ? space->chain.start->name : "")
			<< block->page.id << ": stored: "
			<< mach_read_from_4(frame + FIL_PAGE_SPACE_OR_CHKSUM)
			<< ", crc32: "
			<< page_zip_calc_checksum(
				frame, size, SRV_CHECKSUM_ALGORITHM_CRC32)
			<< "/"
			<< page_zip_calc_checksum(
				frame, size, SRV_CHECKSUM_ALGORITHM_CRC32,
				true)
			<< " innodb: "
			<< page_zip_calc_checksum(
				frame, size, SRV_CHECKSUM_ALGORITHM_INNODB)
			<< ", none: "
			<< page_zip_calc_checksum(
				frame, size, SRV_CHECKSUM_ALGORITHM_NONE);

		goto err_exit;
	}

	switch (fil_page_get_type(frame)) {
	case FIL_PAGE_INDEX:
	case FIL_PAGE_RTREE:
		if (page_zip_decompress(&block->page.zip,
					block->frame, TRUE)) {
			if (space) {
				space->release_for_io();
			}
			return(TRUE);
		}

		ib::error() << "Unable to decompress "
			<< (space ? space->chain.start->name : "")
			<< block->page.id;
		goto err_exit;
	case FIL_PAGE_TYPE_ALLOCATED:
	case FIL_PAGE_INODE:
	case FIL_PAGE_IBUF_BITMAP:
	case FIL_PAGE_TYPE_FSP_HDR:
	case FIL_PAGE_TYPE_XDES:
	case FIL_PAGE_TYPE_ZBLOB:
	case FIL_PAGE_TYPE_ZBLOB2:
		/* Copy to uncompressed storage. */
		memcpy(block->frame, frame, block->page.size.physical());
		if (space) {
			space->release_for_io();
		}

		return(TRUE);
	}

	ib::error() << "Unknown compressed page type "
		<< fil_page_get_type(frame)
		<< " in " << (space ? space->chain.start->name : "")
		<< block->page.id;

err_exit:
	if (encrypted) {
		ib::info() << "Row compressed page could be encrypted"
			" with key_version " << key_version;
		block->page.encrypted = true;
	}

	if (space) {
		if (encrypted) {
			dict_set_encrypted_by_space(space);
		} else {
			dict_set_corrupted_by_space(space);
		}

		space->release_for_io();
	}

	return(FALSE);
}
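
/* A minimal sketch of how buf_zip_decompress() is driven by
buf_page_get_gen(), assuming the caller owns the block exclusively
(buffer-fixed, I/O-fixed and X-latched) for the duration of the call:

	if (!buf_zip_decompress(block, TRUE)) {
		// checksum mismatch, unknown page type, or a page that
		// is probably still encrypted: the space has been
		// flagged, and the caller must fail the page request
	}
*/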

#ifdef BTR_CUR_HASH_ADAPT
/** Get a buffer block from an adaptive hash index pointer.
This function does not return if the block is not identified.
@param[in]	ptr	pointer to within a page frame
@return pointer to block, never NULL */
buf_block_t*
buf_block_from_ahi(const byte* ptr)
{
	buf_pool_chunk_map_t::iterator it;

	buf_pool_chunk_map_t*	chunk_map = buf_chunk_map_ref;
	ut_ad(buf_chunk_map_ref == buf_chunk_map_reg);
	ut_ad(!buf_pool_resizing);

	buf_chunk_t*	chunk;
	it = chunk_map->upper_bound(ptr);

	ut_a(it != chunk_map->begin());

	if (it == chunk_map->end()) {
		chunk = chunk_map->rbegin()->second;
	} else {
		chunk = (--it)->second;
	}

	ulint		offs = ulint(ptr - chunk->blocks->frame);

	offs >>= srv_page_size_shift;

	ut_a(offs < chunk->size);

	buf_block_t*	block = &chunk->blocks[offs];

	/* The function buf_chunk_init() invokes buf_block_init() so that
	block[n].frame == block->frame + n * srv_page_size. Check it. */
	ut_ad(block->frame == page_align(ptr));
	/* Read the state of the block without holding a mutex.
	A state transition from BUF_BLOCK_FILE_PAGE to
	BUF_BLOCK_REMOVE_HASH is possible during this execution. */
	ut_d(const buf_page_state state = buf_block_get_state(block));
	ut_ad(state == BUF_BLOCK_FILE_PAGE || state == BUF_BLOCK_REMOVE_HASH);
	return(block);
}
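
/* A small worked example of the chunk arithmetic above, assuming
16KiB pages (srv_page_size_shift == 14): for a pointer that lies
3 pages plus 100 bytes past chunk->blocks->frame,
offs = (3 * 16384 + 100) >> 14 == 3, so the block is
&chunk->blocks[3], whose frame contains the pointer. */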
#endif /* BTR_CUR_HASH_ADAPT */

/********************************************************************//**
Find out if a pointer belongs to a buf_block_t. It can be a pointer to
the buf_block_t itself or a member of it. This function checks one of
the buffer pool instances.
@return TRUE if ptr belongs to a buf_block_t struct */
static
ibool
buf_pointer_is_block_field_instance(
/*================================*/
	buf_pool_t*	buf_pool,	/*!< in: buffer pool instance */
	const void*	ptr)		/*!< in: pointer not dereferenced */
{
	const buf_chunk_t*		chunk	= buf_pool->chunks;
	const buf_chunk_t* const	echunk	= chunk + ut_min(
		buf_pool->n_chunks, buf_pool->n_chunks_new);

	/* TODO: protect buf_pool->chunks with a mutex (the older pointer
	will currently remain valid during buf_pool_resize()) */
	while (chunk < echunk) {
		if (ptr >= (void*) chunk->blocks
		    && ptr < (void*) (chunk->blocks + chunk->size)) {

			return(TRUE);
		}

		chunk++;
	}

	return(FALSE);
}

/********************************************************************//**
Find out if a pointer belongs to a buf_block_t. It can be a pointer to
the buf_block_t itself or a member of it
@return TRUE if ptr belongs to a buf_block_t struct */
ibool
buf_pointer_is_block_field(
/*=======================*/
	const void*	ptr)	/*!< in: pointer not dereferenced */
{
	ulint	i;

	for (i = 0; i < srv_buf_pool_instances; i++) {
		ibool	found;

		found = buf_pointer_is_block_field_instance(
			buf_pool_from_array(i), ptr);
		if (found) {
			return(TRUE);
		}
	}

	return(FALSE);
}

/********************************************************************//**
Find out if a buffer block was created by buf_chunk_init().
@return TRUE if "block" has been added to buf_pool->free by buf_chunk_init() */
static
ibool
buf_block_is_uncompressed(
/*======================*/
	buf_pool_t*		buf_pool,	/*!< in: buffer pool instance */
	const buf_block_t*	block)		/*!< in: pointer to block,
						not dereferenced */
{
	if ((((ulint) block) % sizeof *block) != 0) {
		/* The pointer should be aligned. */
		return(FALSE);
	}

	return(buf_pointer_is_block_field_instance(buf_pool, (void*) block));
}

#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
/********************************************************************//**
Return true if probe is enabled.
@return true if probe enabled. */
static
bool
buf_debug_execute_is_force_flush()
/*==============================*/
{
	DBUG_EXECUTE_IF("ib_buf_force_flush", return(true); );

	/* This is used during quiesce testing; we want to ensure maximum
	buffering by the change buffer. */

	if (srv_ibuf_disable_background_merge) {
		return(true);
	}

	return(false);
}
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */

/** Wait for the block to be read in.
@param[in]	block	The block to check */
static
void
buf_wait_for_read(
	buf_block_t*	block)
{
	/* Note:

	We are using the block->lock to check for IO state (and a dirty read).
	We set the IO_READ state under the protection of the hash_lock
	(and block->mutex). This is safe because another thread can only
	access the block (and check for IO state) after the block has been
	added to the page hashtable. */

	if (buf_block_get_io_fix(block) == BUF_IO_READ) {

		/* Wait until the read operation completes */

		BPageMutex*	mutex = buf_page_get_mutex(&block->page);

		for (;;) {
			buf_io_fix	io_fix;

			mutex_enter(mutex);

			io_fix = buf_block_get_io_fix(block);

			mutex_exit(mutex);

			if (io_fix == BUF_IO_READ) {
				/* Wait by temporarily s-latching */
				rw_lock_s_lock(&block->lock);
				rw_lock_s_unlock(&block->lock);
			} else {
				break;
			}
		}
	}
}
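
/* The s-latch wait above works because the read path keeps block->lock
X-latched for the whole I/O: buf_page_init_for_read() takes
rw_lock_x_lock_gen(&block->lock, BUF_IO_READ), and the io-handler
releases it on completion. A minimal sketch of the waiting side:

	rw_lock_s_lock(&block->lock);	// blocks until the io-handler
	rw_lock_s_unlock(&block->lock);	// has released the X-latch
*/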

/** This is the general function used to get access to a database page.
@param[in]	page_id		page id
@param[in]	page_size	page size
@param[in]	rw_latch	RW_S_LATCH, RW_X_LATCH, RW_SX_LATCH,
				or RW_NO_LATCH
@param[in]	guess		guessed block or NULL
@param[in]	mode		BUF_GET, BUF_GET_IF_IN_POOL,
BUF_PEEK_IF_IN_POOL, BUF_GET_NO_LATCH, BUF_GET_IF_IN_POOL_OR_WATCH,
BUF_GET_POSSIBLY_FREED, or BUF_EVICT_IF_IN_POOL
@param[in]	file		file name
@param[in]	line		line where called
@param[in]	mtr		mini-transaction
@param[out]	err		DB_SUCCESS or error code; may be NULL
@return pointer to the block or NULL */
buf_block_t*
buf_page_get_gen(
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	ulint			rw_latch,
	buf_block_t*		guess,
	ulint			mode,
	const char*		file,
	unsigned		line,
	mtr_t*			mtr,
	dberr_t*		err)
{
	buf_block_t*	block;
	unsigned	access_time;
	rw_lock_t*	hash_lock;
	buf_block_t*	fix_block;
	ulint		retries = 0;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);

	ut_ad((mtr == NULL) == (mode == BUF_EVICT_IF_IN_POOL));
	ut_ad(!mtr || mtr->is_active());
	ut_ad((rw_latch == RW_S_LATCH)
	      || (rw_latch == RW_X_LATCH)
	      || (rw_latch == RW_SX_LATCH)
	      || (rw_latch == RW_NO_LATCH));

	if (err) {
		*err = DB_SUCCESS;
	}

#ifdef UNIV_DEBUG
	switch (mode) {
	case BUF_EVICT_IF_IN_POOL:
		/* After DISCARD TABLESPACE, the tablespace would not exist,
		but in IMPORT TABLESPACE, PageConverter::operator() must
		replace any old pages, which were not evicted during DISCARD.
		Skip the assertion on space_page_size. */
		break;
	case BUF_PEEK_IF_IN_POOL:
		/* In this mode, the caller may pass a dummy page size,
		because it does not really matter. */
		break;
	default:
		ut_error;
	case BUF_GET_NO_LATCH:
		ut_ad(rw_latch == RW_NO_LATCH);
		/* fall through */
	case BUF_GET:
	case BUF_GET_IF_IN_POOL:
	case BUF_GET_IF_IN_POOL_OR_WATCH:
	case BUF_GET_POSSIBLY_FREED:
		bool	found;
		const page_size_t&	space_page_size
			= fil_space_get_page_size(page_id.space(), &found);
		ut_ad(found);
		ut_ad(page_size.equals_to(space_page_size));
	}
#endif /* UNIV_DEBUG */

	ut_ad(!mtr || !ibuf_inside(mtr)
	      || ibuf_page_low(page_id, page_size, FALSE, file, line, NULL));

	buf_pool->stat.n_page_gets++;
	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
loop:
	block = guess;

	rw_lock_s_lock(hash_lock);

	/* If not own buf_pool_mutex, page_hash can be changed. */
	hash_lock = buf_page_hash_lock_s_confirm(hash_lock, buf_pool, page_id);

	if (block != NULL) {

		/* If the guess is a compressed page descriptor that
		has been allocated by buf_page_alloc_descriptor(),
		it may have been freed by buf_relocate(). */

		if (!buf_block_is_uncompressed(buf_pool, block)
		    || !page_id.equals_to(block->page.id)
		    || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {

			/* Our guess was bogus or things have changed
			since. */
			block = guess = NULL;
		} else {
			ut_ad(!block->page.in_zip_hash);
		}
	}

	if (block == NULL) {
		block = (buf_block_t*) buf_page_hash_get_low(buf_pool, page_id);
	}

	if (!block || buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
		rw_lock_s_unlock(hash_lock);
		block = NULL;
	}

	if (block == NULL) {

		/* Page not in buf_pool: needs to be read from file */

		if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
			rw_lock_x_lock(hash_lock);

			/* If not own buf_pool_mutex,
			page_hash can be changed. */
			hash_lock = buf_page_hash_lock_x_confirm(
				hash_lock, buf_pool, page_id);

			block = (buf_block_t*) buf_pool_watch_set(
				page_id, &hash_lock);

			if (block) {
				/* We can release hash_lock after we
				increment the fix count to make
				sure that no state change takes place. */
				fix_block = block;

				if (fsp_is_system_temporary(page_id.space())) {
					/* For temporary tablespace,
					the mutex is being used for
					synchronization between user
					thread and flush thread,
					instead of block->lock. See
					buf_flush_page() for the flush
					thread counterpart. */

					BPageMutex*	fix_mutex
						= buf_page_get_mutex(
							&fix_block->page);
					mutex_enter(fix_mutex);
					buf_block_fix(fix_block);
					mutex_exit(fix_mutex);
				} else {
					buf_block_fix(fix_block);
				}

				/* Now safe to release page_hash mutex */
				rw_lock_x_unlock(hash_lock);
				goto got_block;
			}

			rw_lock_x_unlock(hash_lock);
		}

		switch (mode) {
		case BUF_GET_IF_IN_POOL:
		case BUF_GET_IF_IN_POOL_OR_WATCH:
		case BUF_PEEK_IF_IN_POOL:
		case BUF_EVICT_IF_IN_POOL:
			ut_ad(!rw_lock_own(hash_lock, RW_LOCK_X));
			ut_ad(!rw_lock_own(hash_lock, RW_LOCK_S));
			return(NULL);
		}

		/* The call path is buf_read_page() ->
		buf_read_page_low() (fil_io()) ->
		buf_page_io_complete() ->
		buf_decrypt_after_read(), which uses the fil_space_t*
		to decrypt, and then buf_page_check_corrupt(), where
		page checksums are compared. Decryption, decompression
		as well as error handling take place at a lower level.
		Here we only need to know whether the page really is
		corrupted, or if an encrypted page with a valid
		checksum cannot be decrypted. */

		dberr_t local_err = buf_read_page(page_id, page_size);

		if (local_err == DB_SUCCESS) {
			buf_read_ahead_random(page_id, page_size,
					      ibuf_inside(mtr));

			retries = 0;
		} else if (mode == BUF_GET_POSSIBLY_FREED) {
			if (err) {
				*err = local_err;
			}
			return NULL;
		} else if (retries < BUF_PAGE_READ_MAX_RETRIES) {
			++retries;

			DBUG_EXECUTE_IF(
				"innodb_page_corruption_retries",
				retries = BUF_PAGE_READ_MAX_RETRIES;
			);
		} else {
			if (err) {
				*err = local_err;
			}

			/* Pages whose encryption key is unavailable, or
			whose key, encryption algorithm or encryption
			method is incorrect, are marked as encrypted in
			buf_page_check_corrupt(). An unencrypted page
			could be corrupted in a way where the key_id
			field is nonzero. There is no checksum on the
			field FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION. */
			if (local_err == DB_DECRYPTION_FAILED) {
				return (NULL);
			}

			/* Try to set table as corrupted instead of
			asserting. */
			if (page_id.space() == TRX_SYS_SPACE) {
			} else if (page_id.space() == SRV_TMP_SPACE_ID) {
			} else if (fil_space_t* space
				   = fil_space_acquire_for_io(
					   page_id.space())) {
				bool set = dict_set_corrupted_by_space(space);
				space->release_for_io();
				if (set) {
					return NULL;
				}
			}

			ib::fatal() << "Unable to read page " << page_id
				<< " into the buffer pool after "
				<< BUF_PAGE_READ_MAX_RETRIES
				<< " attempts. The most probable cause"
				" of this error may be that the"
				" table has been corrupted."
				" See https://mariadb.com/kb/en/library/xtradbinnodb-recovery-modes/";
		}

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
		ut_a(++buf_dbg_counter % 5771 || buf_validate());
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
		goto loop;
	} else {
		fix_block = block;
	}

	if (fsp_is_system_temporary(page_id.space())) {
		/* For temporary tablespace, the mutex is being used
		for synchronization between user thread and flush
		thread, instead of block->lock. See buf_flush_page()
		for the flush thread counterpart. */
		BPageMutex*	fix_mutex = buf_page_get_mutex(
			&fix_block->page);
		mutex_enter(fix_mutex);
		buf_block_fix(fix_block);
		mutex_exit(fix_mutex);
	} else {
		buf_block_fix(fix_block);
	}

	/* Now safe to release page_hash mutex */
	rw_lock_s_unlock(hash_lock);

got_block:

	switch (mode) {
	case BUF_GET_IF_IN_POOL:
	case BUF_PEEK_IF_IN_POOL:
	case BUF_EVICT_IF_IN_POOL:
		buf_page_t*	fix_page = &fix_block->page;
		BPageMutex*	fix_mutex = buf_page_get_mutex(fix_page);
		mutex_enter(fix_mutex);
		const bool	must_read
			= (buf_page_get_io_fix(fix_page) == BUF_IO_READ);
		mutex_exit(fix_mutex);

		if (must_read) {
			/* The page is being read to buffer pool,
			but we cannot wait around for the read to
			complete. */
			buf_block_unfix(fix_block);

			return(NULL);
		}
	}

	switch (buf_block_get_state(fix_block)) {
		buf_page_t*	bpage;

	case BUF_BLOCK_FILE_PAGE:
		bpage = &block->page;
		if (fsp_is_system_temporary(page_id.space())
		    && buf_page_get_io_fix(bpage) != BUF_IO_NONE) {
			/* This suggests that the page is being flushed.
			Avoid returning reference to this page.
			Instead wait for the flush action to complete. */
			buf_block_unfix(fix_block);
			os_thread_sleep(WAIT_FOR_WRITE);
			goto loop;
		}

		if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
evict_from_pool:
			ut_ad(!fix_block->page.oldest_modification);
			buf_pool_mutex_enter(buf_pool);
			buf_block_unfix(fix_block);

			if (!buf_LRU_free_page(&fix_block->page, true)) {
				ut_ad(0);
			}

			buf_pool_mutex_exit(buf_pool);
			return(NULL);
		}
		break;

	case BUF_BLOCK_ZIP_PAGE:
	case BUF_BLOCK_ZIP_DIRTY:
		if (mode == BUF_PEEK_IF_IN_POOL) {
			/* This mode is only used for dropping an
			adaptive hash index. There cannot be an
			adaptive hash index for a compressed-only
			page, so do not bother decompressing the page. */
			buf_block_unfix(fix_block);

			return(NULL);
		}

		bpage = &block->page;

		/* Note: We have already buffer fixed this block. */
		if (bpage->buf_fix_count > 1
		    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {

			/* This condition often occurs when the buffer
			is not buffer-fixed, but I/O-fixed by
			buf_page_init_for_read(). */
			buf_block_unfix(fix_block);

			/* The block is buffer-fixed or I/O-fixed.
			Try again later. */
			os_thread_sleep(WAIT_FOR_READ);

			goto loop;
		}

		if (UNIV_UNLIKELY(mode == BUF_EVICT_IF_IN_POOL)) {
			goto evict_from_pool;
		}

		/* Buffer-fix the block so that it cannot be evicted
		or relocated while we are attempting to allocate an
		uncompressed page. */

		block = buf_LRU_get_free_block(buf_pool);

		buf_pool_mutex_enter(buf_pool);

		/* If not own buf_pool_mutex, page_hash can be changed. */
		hash_lock = buf_page_hash_lock_get(buf_pool, page_id);

		rw_lock_x_lock(hash_lock);

		/* Buffer-fixing prevents the page_hash from changing. */
		ut_ad(bpage == buf_page_hash_get_low(buf_pool, page_id));

		buf_block_unfix(fix_block);

		buf_page_mutex_enter(block);
		mutex_enter(&buf_pool->zip_mutex);

		fix_block = block;

		if (bpage->buf_fix_count > 0
		    || buf_page_get_io_fix(bpage) != BUF_IO_NONE) {

			mutex_exit(&buf_pool->zip_mutex);
			/* The block was buffer-fixed or I/O-fixed while
			buf_pool->mutex was not held by this thread.
			Free the block that was allocated and retry.
			This should be extremely unlikely, for example,
			if buf_page_get_zip() was invoked. */

			buf_LRU_block_free_non_file_page(block);
			buf_pool_mutex_exit(buf_pool);
			rw_lock_x_unlock(hash_lock);
			buf_page_mutex_exit(block);

			/* Try again */
			goto loop;
		}

		/* Move the compressed page from bpage to block,
		and uncompress it. */

		/* Note: this is the uncompressed block and it is not
		accessible by other threads yet because it is not in
		any list or hash table */
		buf_relocate(bpage, &block->page);

		buf_block_init_low(block);

		/* Set after buf_relocate(). */
		block->page.buf_fix_count = 1;

		block->lock_hash_val = lock_rec_hash(page_id.space(),
						     page_id.page_no());

		UNIV_MEM_DESC(&block->page.zip.data,
			      page_zip_get_size(&block->page.zip));

		if (buf_page_get_state(&block->page) == BUF_BLOCK_ZIP_PAGE) {
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
			UT_LIST_REMOVE(buf_pool->zip_clean, &block->page);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
			ut_ad(!block->page.in_flush_list);
		} else {
			/* Relocate buf_pool->flush_list. */
			buf_flush_relocate_on_flush_list(bpage, &block->page);
		}

		/* Buffer-fix, I/O-fix, and X-latch the block
		for the duration of the decompression.
		Also add the block to the unzip_LRU list. */
		block->page.state = BUF_BLOCK_FILE_PAGE;

		/* Insert at the front of unzip_LRU list */
		buf_unzip_LRU_add_block(block, FALSE);

		buf_block_set_io_fix(block, BUF_IO_READ);
		rw_lock_x_lock_inline(&block->lock, 0, file, line);

		UNIV_MEM_INVALID(bpage, sizeof *bpage);

		rw_lock_x_unlock(hash_lock);
		buf_pool->n_pend_unzip++;
		mutex_exit(&buf_pool->zip_mutex);
		buf_pool_mutex_exit(buf_pool);

		access_time = buf_page_is_accessed(&block->page);

		buf_page_mutex_exit(block);

		buf_page_free_descriptor(bpage);

		/* Decompress the page while not holding
		buf_pool->mutex or block->mutex. */

		{
			bool	success = buf_zip_decompress(block, TRUE);

			if (!success) {
				buf_pool_mutex_enter(buf_pool);
				buf_page_mutex_enter(fix_block);
				buf_block_set_io_fix(fix_block, BUF_IO_NONE);
				buf_page_mutex_exit(fix_block);

				--buf_pool->n_pend_unzip;
				buf_block_unfix(fix_block);
				buf_pool_mutex_exit(buf_pool);
				rw_lock_x_unlock(&fix_block->lock);

				if (err) {
					*err = DB_PAGE_CORRUPTED;
				}
				return NULL;
			}
		}

		if (!recv_no_ibuf_operations) {
			if (access_time) {
#ifdef UNIV_IBUF_COUNT_DEBUG
				ut_a(ibuf_count_get(page_id) == 0);
#endif /* UNIV_IBUF_COUNT_DEBUG */
			} else {
				ibuf_merge_or_delete_for_page(
					block, page_id, &page_size, TRUE);
			}
		}

		buf_pool_mutex_enter(buf_pool);

		buf_page_mutex_enter(fix_block);

		buf_block_set_io_fix(fix_block, BUF_IO_NONE);

		buf_page_mutex_exit(fix_block);

		--buf_pool->n_pend_unzip;

		buf_pool_mutex_exit(buf_pool);

		rw_lock_x_unlock(&block->lock);

		break;

	case BUF_BLOCK_POOL_WATCH:
	case BUF_BLOCK_NOT_USED:
	case BUF_BLOCK_READY_FOR_USE:
	case BUF_BLOCK_MEMORY:
	case BUF_BLOCK_REMOVE_HASH:
		ut_error;
		break;
	}

	ut_ad(block == fix_block);
	ut_ad(fix_block->page.buf_fix_count > 0);

	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_X));
	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_S));

	ut_ad(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE);

#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG

	if ((mode == BUF_GET_IF_IN_POOL || mode == BUF_GET_IF_IN_POOL_OR_WATCH)
	    && (ibuf_debug || buf_debug_execute_is_force_flush())) {

		/* Try to evict the block from the buffer pool, to use the
		insert buffer (change buffer) as much as possible. */

		buf_pool_mutex_enter(buf_pool);

		buf_block_unfix(fix_block);

		/* Now we are only holding the buf_pool->mutex,
		not block->mutex or hash_lock. Blocks cannot be
		relocated or enter or exit the buf_pool while we
		are holding the buf_pool->mutex. */

		if (buf_LRU_free_page(&fix_block->page, true)) {

			buf_pool_mutex_exit(buf_pool);

			/* If not own buf_pool_mutex,
			page_hash can be changed. */
			hash_lock = buf_page_hash_lock_get(buf_pool, page_id);

			rw_lock_x_lock(hash_lock);

			/* If not own buf_pool_mutex,
			page_hash can be changed. */
			hash_lock = buf_page_hash_lock_x_confirm(
				hash_lock, buf_pool, page_id);

			if (mode == BUF_GET_IF_IN_POOL_OR_WATCH) {
				/* Set the watch, as it would have
				been set if the page were not in the
				buffer pool in the first place. */
				block = (buf_block_t*) buf_pool_watch_set(
					page_id, &hash_lock);
			} else {
				block = (buf_block_t*) buf_page_hash_get_low(
					buf_pool, page_id);
			}

			rw_lock_x_unlock(hash_lock);

			if (block != NULL) {
				/* Either the page has been read in, or
				a watch was set on it in the window
				after we released the buf_pool->mutex
				and before we acquired the hash_lock
				above. Try again. */
				guess = block;

				goto loop;
			}

			return(NULL);
		}

		buf_page_mutex_enter(fix_block);

		if (buf_flush_page_try(buf_pool, fix_block)) {
			guess = fix_block;

			goto loop;
		}

		buf_page_mutex_exit(fix_block);

		buf_block_fix(fix_block);

		/* Failed to evict the page; change it directly */

		buf_pool_mutex_exit(buf_pool);
	}
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */

	ut_ad(fix_block->page.buf_fix_count > 0);

#ifdef UNIV_DEBUG
	/* We have already buffer fixed the page, and we are committed to
	returning this page to the caller. Register for debugging.
	Avoid debug latching if page/block belongs to system temporary
	tablespace (Not much needed for table with single threaded access.). */
	if (!fsp_is_system_temporary(page_id.space())) {
		ibool	ret;
		ret = rw_lock_s_lock_nowait(
			&fix_block->debug_latch, file, line);
		ut_a(ret);
	}
#endif /* UNIV_DEBUG */

	/* While a tablespace is being reinitialized, its indexes have
	already been freed, but blocks belonging to them may still reside
	in the buffer pool. Trying to remove such blocks from the buffer
	pool would trigger removal of the AHI entries associated with
	them, and the logic removing an AHI entry would try to load a
	block that is already in the freed state. Handle this case with
	mode = BUF_PEEK_IF_IN_POOL, which is used from
	btr_search_drop_page_hash_when_freed(). */
	ut_ad(mode == BUF_GET_POSSIBLY_FREED
	      || mode == BUF_PEEK_IF_IN_POOL
	      || !fix_block->page.file_page_was_freed);

	/* Check if this is the first access to the page */
	access_time = buf_page_is_accessed(&fix_block->page);

	/* This is a heuristic and we don't care about ordering issues. */
	if (access_time == 0) {
		buf_page_mutex_enter(fix_block);

		buf_page_set_accessed(&fix_block->page);

		buf_page_mutex_exit(fix_block);
	}

	if (mode != BUF_PEEK_IF_IN_POOL) {
		buf_page_make_young_if_needed(&fix_block->page);
	}

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(++buf_dbg_counter % 5771 || buf_validate());
	ut_a(buf_block_get_state(fix_block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	/* We have to wait here because the IO_READ state was set
	under the protection of the hash_lock and not the block->mutex
	and block->lock. */
	buf_wait_for_read(fix_block);

	mtr_memo_type_t	fix_type;

	switch (rw_latch) {
	case RW_NO_LATCH:

		fix_type = MTR_MEMO_BUF_FIX;
		break;

	case RW_S_LATCH:
		rw_lock_s_lock_inline(&fix_block->lock, 0, file, line);

		fix_type = MTR_MEMO_PAGE_S_FIX;
		break;

	case RW_SX_LATCH:
		rw_lock_sx_lock_inline(&fix_block->lock, 0, file, line);

		fix_type = MTR_MEMO_PAGE_SX_FIX;
		break;

	default:
		ut_ad(rw_latch == RW_X_LATCH);
		rw_lock_x_lock_inline(&fix_block->lock, 0, file, line);

		fix_type = MTR_MEMO_PAGE_X_FIX;
		break;
	}

	mtr_memo_push(mtr, fix_block, fix_type);

	if (mode != BUF_PEEK_IF_IN_POOL && !access_time) {
		/* In the case of a first access, try to apply linear
		read-ahead */

		buf_read_ahead_linear(page_id, page_size, ibuf_inside(mtr));
	}

#ifdef UNIV_IBUF_COUNT_DEBUG
	ut_a(ibuf_count_get(fix_block->page.id) == 0);
#endif

	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_X));
	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_S));

	return(fix_block);
}

/********************************************************************//**
This is the general function used to get optimistic access to a database
page.
@return TRUE if success */
ibool
buf_page_optimistic_get(
/*====================*/
	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
	buf_block_t*	block,	/*!< in: guessed buffer block */
	ib_uint64_t	modify_clock,/*!< in: modify clock value */
	const char*	file,	/*!< in: file name */
	unsigned	line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mini-transaction */
{
	buf_pool_t*	buf_pool;
	unsigned	access_time;
	ibool		success;

	ut_ad(block);
	ut_ad(mtr);
	ut_ad(mtr->is_active());
	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));

	buf_page_mutex_enter(block);

	if (UNIV_UNLIKELY(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE)) {

		buf_page_mutex_exit(block);

		return(FALSE);
	}

	buf_block_buf_fix_inc(block, file, line);

	access_time = buf_page_is_accessed(&block->page);

	buf_page_set_accessed(&block->page);

	buf_page_mutex_exit(block);

	buf_page_make_young_if_needed(&block->page);

	ut_ad(!ibuf_inside(mtr)
	      || ibuf_page(block->page.id, block->page.size, NULL));

	mtr_memo_type_t	fix_type;

	switch (rw_latch) {
	case RW_S_LATCH:
		success = rw_lock_s_lock_nowait(&block->lock, file, line);

		fix_type = MTR_MEMO_PAGE_S_FIX;
		break;
	case RW_X_LATCH:
		success = rw_lock_x_lock_func_nowait_inline(
			&block->lock, file, line);

		fix_type = MTR_MEMO_PAGE_X_FIX;
		break;
	default:
		ut_error; /* RW_SX_LATCH is not implemented yet */
	}

	if (!success) {
		buf_page_mutex_enter(block);
		buf_block_buf_fix_dec(block);
		buf_page_mutex_exit(block);

		return(FALSE);
	}

	if (modify_clock != block->modify_clock) {

		buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);

		if (rw_latch == RW_S_LATCH) {
			rw_lock_s_unlock(&block->lock);
		} else {
			rw_lock_x_unlock(&block->lock);
		}

		buf_page_mutex_enter(block);
		buf_block_buf_fix_dec(block);
		buf_page_mutex_exit(block);

		return(FALSE);
	}

	mtr_memo_push(mtr, block, fix_type);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(++buf_dbg_counter % 5771 || buf_validate());
	ut_a(block->page.buf_fix_count > 0);
	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	ut_d(buf_page_mutex_enter(block));
	ut_ad(!block->page.file_page_was_freed);
	ut_d(buf_page_mutex_exit(block));

	if (!access_time) {
		/* In the case of a first access, try to apply linear
		read-ahead */
		buf_read_ahead_linear(block->page.id, block->page.size,
				      ibuf_inside(mtr));
	}

#ifdef UNIV_IBUF_COUNT_DEBUG
	ut_a(ibuf_count_get(block->page.id) == 0);
#endif /* UNIV_IBUF_COUNT_DEBUG */

	buf_pool = buf_pool_from_block(block);
	buf_pool->stat.n_page_gets++;

	return(TRUE);
}
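
/* A minimal sketch of the optimistic pattern served by the function
above: remember the block and its modify clock while latched, release,
and revalidate later; the intervening work is elided:

	ib_uint64_t	clock = block->modify_clock;	// under latch
	//	... release the latch, do other work ...
	mtr_t	mtr;
	mtr.start();
	if (buf_page_optimistic_get(RW_S_LATCH, block, clock,
				    __FILE__, __LINE__, &mtr)) {
		// guess still valid: the page is resident and was not
		// modified since the clock value was read
	}
	mtr.commit();
*/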

/********************************************************************//**
This is used to get access to a known database page, when no waiting can be
done. For example, if a search in an adaptive hash index leads us to this
frame.
@return TRUE if success */
ibool
buf_page_get_known_nowait(
/*======================*/
	ulint		rw_latch,/*!< in: RW_S_LATCH, RW_X_LATCH */
	buf_block_t*	block,	/*!< in: the known page */
	ulint		mode,	/*!< in: BUF_MAKE_YOUNG or BUF_KEEP_OLD */
	const char*	file,	/*!< in: file name */
	unsigned	line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mini-transaction */
{
	buf_pool_t*	buf_pool;
	ibool		success;

	ut_ad(mtr->is_active());
	ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));

	buf_page_mutex_enter(block);

	if (buf_block_get_state(block) == BUF_BLOCK_REMOVE_HASH) {
		/* Another thread is just freeing the block from the LRU list
		of the buffer pool: do not try to access this page; this
		attempt to access the page can only come through the hash
		index because when the buffer block state is ..._REMOVE_HASH,
		we have already removed it from the page address hash table
		of the buffer pool. */

		buf_page_mutex_exit(block);

		return(FALSE);
	}

	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);

	buf_block_buf_fix_inc(block, file, line);

	buf_page_set_accessed(&block->page);

	buf_page_mutex_exit(block);

	buf_pool = buf_pool_from_block(block);

	if (mode == BUF_MAKE_YOUNG) {
		buf_page_make_young_if_needed(&block->page);
	}

	ut_ad(!ibuf_inside(mtr) || mode == BUF_KEEP_OLD);

	mtr_memo_type_t	fix_type;

	switch (rw_latch) {
	case RW_S_LATCH:
		success = rw_lock_s_lock_nowait(&block->lock, file, line);
		fix_type = MTR_MEMO_PAGE_S_FIX;
		break;
	case RW_X_LATCH:
		success = rw_lock_x_lock_func_nowait_inline(
			&block->lock, file, line);

		fix_type = MTR_MEMO_PAGE_X_FIX;
		break;
	default:
		ut_error; /* RW_SX_LATCH is not implemented yet */
	}

	if (!success) {
		buf_page_mutex_enter(block);
		buf_block_buf_fix_dec(block);
		buf_page_mutex_exit(block);

		return(FALSE);
	}

	mtr_memo_push(mtr, block, fix_type);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(++buf_dbg_counter % 5771 || buf_validate());
	ut_a(block->page.buf_fix_count > 0);
	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

#ifdef UNIV_DEBUG
	if (mode != BUF_KEEP_OLD) {
		/* If mode == BUF_KEEP_OLD, we are executing an I/O
		completion routine. Avoid a bogus assertion failure
		when ibuf_merge_or_delete_for_page() is processing a
		page that was just freed due to DROP INDEX, or
		deleting a record from SYS_INDEXES. This check will be
		skipped in recv_recover_page() as well. */

		buf_page_mutex_enter(block);
		ut_a(!block->page.file_page_was_freed);
		buf_page_mutex_exit(block);
	}
#endif /* UNIV_DEBUG */

#ifdef UNIV_IBUF_COUNT_DEBUG
	ut_a((mode == BUF_KEEP_OLD) || ibuf_count_get(block->page.id) == 0);
#endif
	buf_pool->stat.n_page_gets++;

	return(TRUE);
}

/** Given a tablespace id and page number, tries to get that page. If the
page is not in the buffer pool it is not loaded and NULL is returned.
Suitable for use when holding the lock_sys_t::mutex.
@param[in]	page_id	page id
@param[in]	file	file name
@param[in]	line	line where called
@param[in]	mtr	mini-transaction
@return pointer to a page or NULL */
buf_block_t*
buf_page_try_get_func(
	const page_id_t&	page_id,
	const char*		file,
	unsigned		line,
	mtr_t*			mtr)
{
	buf_block_t*	block;
	ibool		success;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);
	rw_lock_t*	hash_lock;

	ut_ad(mtr);
	ut_ad(mtr->is_active());

	block = buf_block_hash_get_s_locked(buf_pool, page_id, &hash_lock);

	if (!block || buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE) {
		if (block) {
			rw_lock_s_unlock(hash_lock);
		}
		return(NULL);
	}

	ut_ad(!buf_pool_watch_is_sentinel(buf_pool, &block->page));

	buf_page_mutex_enter(block);
	rw_lock_s_unlock(hash_lock);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
	ut_a(page_id.equals_to(block->page.id));
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	buf_block_buf_fix_inc(block, file, line);
	buf_page_mutex_exit(block);

	mtr_memo_type_t	fix_type = MTR_MEMO_PAGE_S_FIX;
	success = rw_lock_s_lock_nowait(&block->lock, file, line);

	if (!success) {
		/* Let us try to get an X-latch. If the current thread
		is holding an X-latch on the page, we cannot get an
		S-latch. */

		fix_type = MTR_MEMO_PAGE_X_FIX;
		success = rw_lock_x_lock_func_nowait_inline(&block->lock,
							    file, line);
	}

	if (!success) {
		buf_page_mutex_enter(block);
		buf_block_buf_fix_dec(block);
		buf_page_mutex_exit(block);

		return(NULL);
	}

	mtr_memo_push(mtr, block, fix_type);

#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
	ut_a(++buf_dbg_counter % 5771 || buf_validate());
	ut_a(block->page.buf_fix_count > 0);
	ut_a(buf_block_get_state(block) == BUF_BLOCK_FILE_PAGE);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

	ut_d(buf_page_mutex_enter(block));
	ut_d(ut_a(!block->page.file_page_was_freed));
	ut_d(buf_page_mutex_exit(block));

	buf_block_dbg_add_level(block, SYNC_NO_ORDER_CHECK);

	buf_pool->stat.n_page_gets++;

#ifdef UNIV_IBUF_COUNT_DEBUG
	ut_a(ibuf_count_get(block->page.id) == 0);
#endif /* UNIV_IBUF_COUNT_DEBUG */

	return(block);
}

/********************************************************************//**
Initialize some fields of a control block. */
UNIV_INLINE
void
buf_page_init_low(
/*==============*/
	buf_page_t*	bpage)	/*!< in: block to init */
{
	bpage->flush_type = BUF_FLUSH_LRU;
	bpage->io_fix = BUF_IO_NONE;
	bpage->buf_fix_count = 0;
	bpage->old = 0;
	bpage->freed_page_clock = 0;
	bpage->access_time = 0;
	bpage->newest_modification = 0;
	bpage->oldest_modification = 0;
	bpage->write_size = 0;
	bpage->encrypted = false;
	bpage->real_size = 0;
	bpage->slot = NULL;

	HASH_INVALIDATE(bpage, hash);

	ut_d(bpage->file_page_was_freed = FALSE);
}

/** Initializes a page in the buffer pool.
@param[in,out]	buf_pool	buffer pool
@param[in]	page_id		page id
@param[in]	page_size	page size
@param[in,out]	block		block to init */
static
void
buf_page_init(
	buf_pool_t*		buf_pool,
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	buf_block_t*		block)
{
	buf_page_t*	hash_page;

	ut_ad(buf_pool == buf_pool_get(page_id));
	ut_ad(buf_pool_mutex_own(buf_pool));

	ut_ad(buf_page_mutex_own(block));
	ut_a(buf_block_get_state(block) != BUF_BLOCK_FILE_PAGE);

	ut_ad(rw_lock_own(buf_page_hash_lock_get(buf_pool, page_id),
			  RW_LOCK_X));

	/* Set the state of the block */
	buf_block_set_file_page(block, page_id);

#ifdef UNIV_DEBUG_VALGRIND
	if (is_system_tablespace(page_id.space())) {
		/* Silence valid Valgrind warnings about uninitialized
		data being written to data files. There are some unused
		bytes on some pages that InnoDB does not initialize. */
		UNIV_MEM_VALID(block->frame, srv_page_size);
	}
#endif /* UNIV_DEBUG_VALGRIND */

	buf_block_init_low(block);

	block->lock_hash_val = lock_rec_hash(page_id.space(),
					     page_id.page_no());

	buf_page_init_low(&block->page);

	/* Insert into the hash table of file pages */

	hash_page = buf_page_hash_get_low(buf_pool, page_id);

	if (hash_page == NULL) {
		/* Block not found in hash table */
	} else if (buf_pool_watch_is_sentinel(buf_pool, hash_page)) {
		/* Preserve the reference count. */
		ib_uint32_t	buf_fix_count = hash_page->buf_fix_count;

		ut_a(buf_fix_count > 0);

		my_atomic_add32((int32*) &block->page.buf_fix_count, buf_fix_count);

		buf_pool_watch_remove(buf_pool, hash_page);
	} else {

		ib::error() << "Page " << page_id
			<< " already found in the hash table: "
			<< hash_page << ", " << block;

		ut_d(buf_page_mutex_exit(block));
		ut_d(buf_pool_mutex_exit(buf_pool));
		ut_d(buf_print());
		ut_d(buf_LRU_print());
		ut_d(buf_validate());
		ut_d(buf_LRU_validate());
		ut_error;
	}

	ut_ad(!block->page.in_zip_hash);
	ut_ad(!block->page.in_page_hash);
	ut_d(block->page.in_page_hash = TRUE);

	block->page.id.copy_from(page_id);
	block->page.size.copy_from(page_size);

	HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
		    page_id.fold(), &block->page);

	if (page_size.is_compressed()) {
		page_zip_set_size(&block->page.zip, page_size.physical());
	}
}

/** Initialize a page for read to the buffer buf_pool. If the page is
(1) already in buf_pool, or
(2) if we specify to read only ibuf pages and the page is not an ibuf page, or
(3) if the space is deleted or being deleted,
then this function does nothing.
Sets the io_fix flag to BUF_IO_READ and sets a non-recursive exclusive lock
on the buffer frame. The io-handler must take care that the flag is cleared
and the lock released later.
@param[out]	err		DB_SUCCESS or DB_TABLESPACE_DELETED
@param[in]	mode		BUF_READ_IBUF_PAGES_ONLY, ...
@param[in]	page_id		page id
@param[in]	page_size	page size
@param[in]	unzip		whether the uncompressed page is
				requested (for ROW_FORMAT=COMPRESSED)
@return pointer to the block
@retval	NULL	in case of an error */
buf_page_t*
buf_page_init_for_read(
	dberr_t*		err,
	ulint			mode,
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	bool			unzip)
{
	buf_block_t*	block;
	buf_page_t*	bpage	= NULL;
	buf_page_t*	watch_page;
	rw_lock_t*	hash_lock;
	mtr_t		mtr;
	bool		lru	= false;
	void*		data;
	buf_pool_t*	buf_pool = buf_pool_get(page_id);

	ut_ad(buf_pool);

	*err = DB_SUCCESS;

	if (mode == BUF_READ_IBUF_PAGES_ONLY) {
		/* It is a read-ahead within an ibuf routine */

		ut_ad(!ibuf_bitmap_page(page_id, page_size));

		ibuf_mtr_start(&mtr);

		if (!recv_no_ibuf_operations &&
		    !ibuf_page(page_id, page_size, &mtr)) {

			ibuf_mtr_commit(&mtr);

			return(NULL);
		}
	} else {
		ut_ad(mode == BUF_READ_ANY_PAGE);
	}

	if (page_size.is_compressed() && !unzip && !recv_recovery_is_on()) {
		block = NULL;
	} else {
		block = buf_LRU_get_free_block(buf_pool);
		ut_ad(block);
		ut_ad(buf_pool_from_block(block) == buf_pool);
	}

	buf_pool_mutex_enter(buf_pool);

	hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
	rw_lock_x_lock(hash_lock);

	watch_page = buf_page_hash_get_low(buf_pool, page_id);
	if (watch_page && !buf_pool_watch_is_sentinel(buf_pool, watch_page)) {
		/* The page is already in the buffer pool. */
		watch_page = NULL;
		rw_lock_x_unlock(hash_lock);
		if (block) {
			buf_page_mutex_enter(block);
			buf_LRU_block_free_non_file_page(block);
			buf_page_mutex_exit(block);
		}

		bpage = NULL;
		goto func_exit;
	}

	if (block) {
		bpage = &block->page;

		buf_page_mutex_enter(block);

		ut_ad(buf_pool_from_bpage(bpage) == buf_pool);

		buf_page_init(buf_pool, page_id, page_size, block);

		/* Note: We are using the hash_lock for protection. This is
		safe because no other thread can lookup the block from the
		page hashtable yet. */

		buf_page_set_io_fix(bpage, BUF_IO_READ);

		rw_lock_x_unlock(hash_lock);

		/* The block must be put to the LRU list, to the old blocks */
		buf_LRU_add_block(bpage, TRUE/* to old blocks */);

		/* We set a pass-type x-lock on the frame because then
		the same thread which called for the read operation
		(and is running now at this point of code) can wait
		for the read to complete by waiting for the x-lock on
		the frame; if the x-lock were recursive, the same
		thread would illegally get the x-lock before the page
		read is completed. The x-lock is cleared by the
		io-handler thread. */

		rw_lock_x_lock_gen(&block->lock, BUF_IO_READ);

		if (page_size.is_compressed()) {
			/* buf_pool->mutex may be released and
			reacquired by buf_buddy_alloc(). Thus, we
			must release block->mutex in order not to
			break the latching order in the reacquisition
			of buf_pool->mutex. We also must defer this
			operation until after the block descriptor has
			been added to buf_pool->LRU and
			buf_pool->page_hash. */
			buf_page_mutex_exit(block);
			data = buf_buddy_alloc(buf_pool, page_size.physical(),
					       &lru);
			buf_page_mutex_enter(block);
			block->page.zip.data = (page_zip_t*) data;

			/* To maintain the invariant
			block->in_unzip_LRU_list
			== buf_page_belongs_to_unzip_LRU(&block->page)
			we have to add this block to unzip_LRU
			after block->page.zip.data is set. */
			ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
			buf_unzip_LRU_add_block(block, TRUE);
		}

		buf_page_mutex_exit(block);
	} else {
		rw_lock_x_unlock(hash_lock);

		/* The compressed page must be allocated before the
		control block (bpage), in order to avoid the
		invocation of buf_buddy_relocate_block() on
		uninitialized data. */
		data = buf_buddy_alloc(buf_pool, page_size.physical(), &lru);

		rw_lock_x_lock(hash_lock);

		/* If buf_buddy_alloc() allocated storage from the LRU list,
		it released and reacquired buf_pool->mutex. Thus, we must
		check the page_hash again, as it may have been modified. */
		if (UNIV_UNLIKELY(lru)) {

			watch_page = buf_page_hash_get_low(buf_pool, page_id);

			if (UNIV_UNLIKELY(watch_page
					  && !buf_pool_watch_is_sentinel(buf_pool,
									 watch_page))) {

				/* The block was added by some other thread. */
				rw_lock_x_unlock(hash_lock);
				watch_page = NULL;
				buf_buddy_free(buf_pool, data,
					       page_size.physical());

				bpage = NULL;
				goto func_exit;
			}
		}

		bpage = buf_page_alloc_descriptor();

		/* Initialize the buf_pool pointer. */
		bpage->buf_pool_index = buf_pool_index(buf_pool);

		page_zip_des_init(&bpage->zip);
		page_zip_set_size(&bpage->zip, page_size.physical());
		bpage->zip.data = (page_zip_t*) data;

		bpage->size.copy_from(page_size);

		mutex_enter(&buf_pool->zip_mutex);
		UNIV_MEM_DESC(bpage->zip.data, bpage->size.physical());

		buf_page_init_low(bpage);

		bpage->state = BUF_BLOCK_ZIP_PAGE;
		bpage->id.copy_from(page_id);
		bpage->flush_observer = NULL;

		ut_d(bpage->in_page_hash = FALSE);
		ut_d(bpage->in_zip_hash = FALSE);
		ut_d(bpage->in_flush_list = FALSE);
		ut_d(bpage->in_free_list = FALSE);
		ut_d(bpage->in_LRU_list = FALSE);

		ut_d(bpage->in_page_hash = TRUE);

		if (watch_page != NULL) {

			/* Preserve the reference count. */
			ib_uint32_t	buf_fix_count;

			buf_fix_count = watch_page->buf_fix_count;

			ut_a(buf_fix_count > 0);

			my_atomic_add32((int32*) &bpage->buf_fix_count, buf_fix_count);

			ut_ad(buf_pool_watch_is_sentinel(buf_pool, watch_page));
			buf_pool_watch_remove(buf_pool, watch_page);
		}

		HASH_INSERT(buf_page_t, hash, buf_pool->page_hash,
			    bpage->id.fold(), bpage);

		rw_lock_x_unlock(hash_lock);

		/* The block must be put to the LRU list, to the old blocks.
		The zip size is already set into the page zip */
		buf_LRU_add_block(bpage, TRUE/* to old blocks */);
#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
		buf_LRU_insert_zip_clean(bpage);
#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */

		buf_page_set_io_fix(bpage, BUF_IO_READ);

		mutex_exit(&buf_pool->zip_mutex);
	}

	buf_pool->n_pend_reads++;
func_exit:
	buf_pool_mutex_exit(buf_pool);

	if (mode == BUF_READ_IBUF_PAGES_ONLY) {

		ibuf_mtr_commit(&mtr);
	}

	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_X));
	ut_ad(!rw_lock_own(hash_lock, RW_LOCK_S));
	ut_ad(!bpage || buf_page_in_file(bpage));

	return(bpage);
}

/** Initializes a page in the buffer pool. The page is usually not read
from a file, even when it cannot be found in the buffer pool. This is one
of the functions which perform the state transition NOT_USED =>
FILE_PAGE on a block (the other is buf_page_get_gen).
@param[in]	page_id		page id
@param[in]	page_size	page size
@param[in]	mtr		mini-transaction
@return pointer to the block, page buffer-fixed */
5578buf_block_t*
5579buf_page_create(
5580 const page_id_t& page_id,
5581 const page_size_t& page_size,
5582 mtr_t* mtr)
5583{
5584 buf_frame_t* frame;
5585 buf_block_t* block;
5586 buf_block_t* free_block = NULL;
5587 buf_pool_t* buf_pool = buf_pool_get(page_id);
5588 rw_lock_t* hash_lock;
5589
5590 ut_ad(mtr->is_active());
5591 ut_ad(page_id.space() != 0 || !page_size.is_compressed());
5592
5593 free_block = buf_LRU_get_free_block(buf_pool);
5594
5595 buf_pool_mutex_enter(buf_pool);
5596
5597 hash_lock = buf_page_hash_lock_get(buf_pool, page_id);
5598 rw_lock_x_lock(hash_lock);
5599
5600 block = (buf_block_t*) buf_page_hash_get_low(buf_pool, page_id);
5601
5602 if (block
5603 && buf_page_in_file(&block->page)
5604 && !buf_pool_watch_is_sentinel(buf_pool, &block->page)) {
5605
5606#ifdef UNIV_IBUF_COUNT_DEBUG
5607 ut_a(ibuf_count_get(page_id) == 0);
5608#endif /* UNIV_IBUF_COUNT_DEBUG */
5609
5610 ut_d(block->page.file_page_was_freed = FALSE);
5611
5612 /* Page can be found in buf_pool */
5613 buf_pool_mutex_exit(buf_pool);
5614 rw_lock_x_unlock(hash_lock);
5615
5616 buf_block_free(free_block);
5617
5618 return(buf_page_get_with_no_latch(page_id, page_size, mtr));
5619 }
5620
5621 /* If we get here, the page was not in buf_pool: init it there */
5622
5623 DBUG_PRINT("ib_buf", ("create page %u:%u",
5624 page_id.space(), page_id.page_no()));
5625
5626 block = free_block;
5627
5628 buf_page_mutex_enter(block);
5629
5630 buf_page_init(buf_pool, page_id, page_size, block);
5631
5632 rw_lock_x_unlock(hash_lock);
5633
5634 /* The block must be put to the LRU list */
5635 buf_LRU_add_block(&block->page, FALSE);
5636
5637 buf_block_buf_fix_inc(block, __FILE__, __LINE__);
5638 buf_pool->stat.n_pages_created++;
5639
5640 if (page_size.is_compressed()) {
5641 void* data;
5642 bool lru;
5643
5644 /* Prevent race conditions during buf_buddy_alloc(),
5645 which may release and reacquire buf_pool->mutex,
5646 by IO-fixing and X-latching the block. */
5647
5648 buf_page_set_io_fix(&block->page, BUF_IO_READ);
5649 rw_lock_x_lock(&block->lock);
5650
5651 buf_page_mutex_exit(block);
5652 /* buf_pool->mutex may be released and reacquired by
5653 buf_buddy_alloc(). Thus, we must release block->mutex
5654 in order not to break the latching order in
5655 the reacquisition of buf_pool->mutex. We also must
5656 defer this operation until after the block descriptor
5657 has been added to buf_pool->LRU and buf_pool->page_hash. */
5658 data = buf_buddy_alloc(buf_pool, page_size.physical(), &lru);
5659 buf_page_mutex_enter(block);
5660 block->page.zip.data = (page_zip_t*) data;
5661
5662 /* To maintain the invariant
5663 block->in_unzip_LRU_list
5664 == buf_page_belongs_to_unzip_LRU(&block->page)
5665 we have to add this block to unzip_LRU after
5666 block->page.zip.data is set. */
5667 ut_ad(buf_page_belongs_to_unzip_LRU(&block->page));
5668 buf_unzip_LRU_add_block(block, FALSE);
5669
5670 buf_page_set_io_fix(&block->page, BUF_IO_NONE);
5671 rw_lock_x_unlock(&block->lock);
5672 }
5673
5674 buf_pool_mutex_exit(buf_pool);
5675
5676 mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);
5677
5678 buf_page_set_accessed(&block->page);
5679
5680 buf_page_mutex_exit(block);
5681
5682 /* Delete possible entries for the page from the insert buffer:
5683 such can exist if the page belonged to an index which was dropped */
5684 ibuf_merge_or_delete_for_page(NULL, page_id, &page_size, TRUE);
5685
5686 frame = block->frame;
5687
5688 memset(frame + FIL_PAGE_PREV, 0xff, 4);
5689 memset(frame + FIL_PAGE_NEXT, 0xff, 4);
5690 mach_write_to_2(frame + FIL_PAGE_TYPE, FIL_PAGE_TYPE_ALLOCATED);
5691
5692 /* FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION is only used on the
5693 following pages:
5694 (1) The first page of the InnoDB system tablespace (page 0:0)
5695 (2) FIL_RTREE_SPLIT_SEQ_NUM on R-tree pages
5696 (3) key_version on encrypted pages (not page 0:0) */
5697
5698 memset(frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
5699
5700#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
5701 ut_a(++buf_dbg_counter % 5771 || buf_validate());
5702#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
5703#ifdef UNIV_IBUF_COUNT_DEBUG
5704 ut_a(ibuf_count_get(block->page.id) == 0);
5705#endif
5706 return(block);
5707}
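
/* A minimal usage sketch of buf_page_create() (illustrative only, not
code from this file):

	mtr_t	mtr;
	mtr.start();
	buf_block_t*	block = buf_page_create(page_id, page_size, &mtr);
	// ... format block->frame as the desired page type ...
	mtr.commit();

The function only buffer-fixes the returned block; any latch that the
subsequent formatting needs must be acquired by the caller. */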
5708
5709/********************************************************************//**
5710Monitor the buffer page read/write activity, and increment the
5711corresponding counter if the MONITOR_MODULE_BUF_PAGE (module_buf_page)
5712module is enabled. */
5713static
5714void
5715buf_page_monitor(
5716/*=============*/
5717 const buf_page_t* bpage, /*!< in: pointer to the block */
5718 enum buf_io_fix io_type)/*!< in: io_fix types */
5719{
5720 const byte* frame;
5721 monitor_id_t counter;
5722
5723 /* If the counter module is not turned on, just return */
5724 if (!MONITOR_IS_ON(MONITOR_MODULE_BUF_PAGE)) {
5725 return;
5726 }
5727
5728 ut_a(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
5729
5730 frame = bpage->zip.data
5731 ? bpage->zip.data
5732 : ((buf_block_t*) bpage)->frame;
5733
5734 switch (fil_page_get_type(frame)) {
5735 ulint level;
5736 case FIL_PAGE_TYPE_INSTANT:
5737 case FIL_PAGE_INDEX:
5738 case FIL_PAGE_RTREE:
5739 level = btr_page_get_level(frame);
5740
5741 /* Check if it is an index page for insert buffer */
5742 if (fil_page_get_type(frame) == FIL_PAGE_INDEX
5743 && btr_page_get_index_id(frame)
5744 == (index_id_t)(DICT_IBUF_ID_MIN + IBUF_SPACE_ID)) {
5745 if (level == 0) {
5746 counter = MONITOR_RW_COUNTER(
5747 io_type, MONITOR_INDEX_IBUF_LEAF_PAGE);
5748 } else {
5749 counter = MONITOR_RW_COUNTER(
5750 io_type,
5751 MONITOR_INDEX_IBUF_NON_LEAF_PAGE);
5752 }
5753 } else {
5754 if (level == 0) {
5755 counter = MONITOR_RW_COUNTER(
5756 io_type, MONITOR_INDEX_LEAF_PAGE);
5757 } else {
5758 counter = MONITOR_RW_COUNTER(
5759 io_type, MONITOR_INDEX_NON_LEAF_PAGE);
5760 }
5761 }
5762 break;
5763
5764 case FIL_PAGE_UNDO_LOG:
5765 counter = MONITOR_RW_COUNTER(io_type, MONITOR_UNDO_LOG_PAGE);
5766 break;
5767
5768 case FIL_PAGE_INODE:
5769 counter = MONITOR_RW_COUNTER(io_type, MONITOR_INODE_PAGE);
5770 break;
5771
5772 case FIL_PAGE_IBUF_FREE_LIST:
5773 counter = MONITOR_RW_COUNTER(io_type,
5774 MONITOR_IBUF_FREELIST_PAGE);
5775 break;
5776
5777 case FIL_PAGE_IBUF_BITMAP:
5778 counter = MONITOR_RW_COUNTER(io_type,
5779 MONITOR_IBUF_BITMAP_PAGE);
5780 break;
5781
5782 case FIL_PAGE_TYPE_SYS:
5783 counter = MONITOR_RW_COUNTER(io_type, MONITOR_SYSTEM_PAGE);
5784 break;
5785
5786 case FIL_PAGE_TYPE_TRX_SYS:
5787 counter = MONITOR_RW_COUNTER(io_type, MONITOR_TRX_SYSTEM_PAGE);
5788 break;
5789
5790 case FIL_PAGE_TYPE_FSP_HDR:
5791 counter = MONITOR_RW_COUNTER(io_type, MONITOR_FSP_HDR_PAGE);
5792 break;
5793
5794 case FIL_PAGE_TYPE_XDES:
5795 counter = MONITOR_RW_COUNTER(io_type, MONITOR_XDES_PAGE);
5796 break;
5797
5798 case FIL_PAGE_TYPE_BLOB:
5799 counter = MONITOR_RW_COUNTER(io_type, MONITOR_BLOB_PAGE);
5800 break;
5801
5802 case FIL_PAGE_TYPE_ZBLOB:
5803 counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB_PAGE);
5804 break;
5805
5806 case FIL_PAGE_TYPE_ZBLOB2:
5807 counter = MONITOR_RW_COUNTER(io_type, MONITOR_ZBLOB2_PAGE);
5808 break;
5809
5810 default:
5811 counter = MONITOR_RW_COUNTER(io_type, MONITOR_OTHER_PAGE);
5812 }
5813
5814 MONITOR_INC_NOCHECK(counter);
5815}
5816
5817/** Mark a table corrupted.
5818Also remove the bpage from the LRU list. */
5819static
5820void
5821buf_mark_space_corrupt(buf_page_t* bpage, const fil_space_t* space)
5822{
5823 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
5824 const ibool uncompressed = (buf_page_get_state(bpage)
5825 == BUF_BLOCK_FILE_PAGE);
5826
5827 /* First unfix and release lock on the bpage */
5828 buf_pool_mutex_enter(buf_pool);
5829 mutex_enter(buf_page_get_mutex(bpage));
5830 ut_ad(buf_page_get_io_fix(bpage) == BUF_IO_READ);
5831 ut_ad(bpage->buf_fix_count == 0);
5832 ut_ad(bpage->id.space() == space->id);
5833
5834 /* Set BUF_IO_NONE before we remove the block from LRU list */
5835 buf_page_set_io_fix(bpage, BUF_IO_NONE);
5836
5837 if (uncompressed) {
5838 rw_lock_x_unlock_gen(
5839 &((buf_block_t*) bpage)->lock,
5840 BUF_IO_READ);
5841 }
5842
5843 mutex_exit(buf_page_get_mutex(bpage));
5844
5845	/* If the block is not encrypted, find the table with the
5846	specified space id and mark it corrupted. Encrypted tables
5847	are marked unusable later, e.g. in ::open(). */
5848 if (!bpage->encrypted) {
5849 dict_set_corrupted_by_space(space);
5850 } else {
5851 dict_set_encrypted_by_space(space);
5852 }
5853
5854 /* After this point bpage can't be referenced. */
5855 buf_LRU_free_one_page(bpage);
5856
5857 ut_ad(buf_pool->n_pend_reads > 0);
5858 buf_pool->n_pend_reads--;
5859
5860 buf_pool_mutex_exit(buf_pool);
5861}
5862
5863/** Check whether a page is maybe compressed, encrypted or both when we
5864encounter an apparently corrupted page. Note that we cannot be 100% sure
5865whether the page is corrupted or decryption/decompression just failed.
5866@param[in,out] bpage page
5867@param[in,out] space tablespace from fil_space_acquire_for_io()
5868@return whether the operation succeeded
5869@retval DB_SUCCESS if page has been read and is not corrupted
5870@retval DB_PAGE_CORRUPTED if page based on checksum check is corrupted
5871@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
5872after decryption normal page checksum does not match.
5873@retval DB_TABLESPACE_DELETED if accessed tablespace is not found */
5874static
5875dberr_t
5876buf_page_check_corrupt(buf_page_t* bpage, fil_space_t* space)
5877{
5878 ut_ad(space->pending_io());
5879
5880 byte* dst_frame = (bpage->zip.data) ? bpage->zip.data :
5881 ((buf_block_t*) bpage)->frame;
5882 bool still_encrypted = false;
5883 dberr_t err = DB_SUCCESS;
5884 bool corrupted = false;
5885 fil_space_crypt_t* crypt_data = space->crypt_data;
5886
5887	/* In buf_page_decrypt_after_read() the page was decrypted if the
5888	post-encryption checksum matched and the used key_id was found
5889	by the encryption plugin. If the checksum did not match, the page
5890	was left undecrypted; it may be encrypted and corrupted, merely
5891	corrupted, or a good page. Even if we decrypted, the page could
5892	still be corrupted if the key used does not match. */
5893 still_encrypted = crypt_data
5894 && crypt_data->type != CRYPT_SCHEME_UNENCRYPTED
5895 && !bpage->encrypted
5896 && fil_space_verify_crypt_checksum(
5897 dst_frame, bpage->size,
5898 bpage->id.space(), bpage->id.page_no());
5899
5900 if (!still_encrypted) {
5901		/* If the traditional checksums match, we assume that the
5902		page is no longer encrypted. */
5903 corrupted = buf_page_is_corrupted(
5904 true, dst_frame, bpage->size, space);
5905
5906 if (!corrupted) {
5907 bpage->encrypted = false;
5908 } else {
5909 err = DB_PAGE_CORRUPTED;
5910 }
5911 }
5912
5913 /* Pages that we think are unencrypted but do not match the checksum
5914 checks could be corrupted or encrypted or both. */
5915 if (corrupted && !bpage->encrypted) {
5916 /* An error will be reported by
5917 buf_page_io_complete(). */
5918 } else if (still_encrypted || (bpage->encrypted && corrupted)) {
5919 bpage->encrypted = true;
5920 err = DB_DECRYPTION_FAILED;
5921
5922 ib::error()
5923 << "The page " << bpage->id << " in file '"
5924 << space->chain.start->name
5925 << "' cannot be decrypted.";
5926
5927		ib::info()
5928			<< "However, the key management plugin or the used key_version "
5929			<< mach_read_from_4(dst_frame
5930					    + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION)
5931			<< " is not found, or the used encryption"
5932			" algorithm or method does not match.";
5933
5934 if (bpage->id.space() != TRX_SYS_SPACE) {
5935 ib::info()
5936 << "Marking tablespace as missing."
5937 " You may drop this table or"
5938 " install correct key management plugin"
5939 " and key file.";
5940 }
5941 }
5942
5943 return (err);
5944}
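
/* Summary of the outcomes above, as a reading aid:
	still_encrypted == true                 -> DB_DECRYPTION_FAILED
	!still_encrypted, checksums match       -> DB_SUCCESS
	!still_encrypted, mismatch, !encrypted  -> DB_PAGE_CORRUPTED
	!still_encrypted, mismatch, encrypted   -> DB_DECRYPTION_FAILED
(DB_TABLESPACE_DELETED is returned by the caller, not here.) */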
5945
5946/** Complete a read or write request of a file page to or from the buffer pool.
5947@param[in,out] bpage page to complete
5948@param[in] dblwr whether the doublewrite buffer was used (on write)
5949@param[in] evict whether or not to evict the page from LRU list
5950@return whether the operation succeeded
5951@retval DB_SUCCESS always when writing, or if a read page was OK
5952@retval DB_TABLESPACE_DELETED if the tablespace does not exist
5953@retval DB_PAGE_CORRUPTED if the checksum fails on a page read
5954@retval DB_DECRYPTION_FAILED if page post encryption checksum matches but
5955 after decryption normal page checksum does
5956 not match */
5957UNIV_INTERN
5958dberr_t
5959buf_page_io_complete(buf_page_t* bpage, bool dblwr, bool evict)
5960{
5961 enum buf_io_fix io_type;
5962 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
5963 const bool uncompressed = (buf_page_get_state(bpage)
5964 == BUF_BLOCK_FILE_PAGE);
5965 ut_a(buf_page_in_file(bpage));
5966
5967	/* We do not need to protect io_fix here with a mutex to read
5968	it, because this is the only function where we can change the value
5969 from BUF_IO_READ or BUF_IO_WRITE to some other value, and our code
5970 ensures that this is the only thread that handles the i/o for this
5971 block. */
5972
5973 io_type = buf_page_get_io_fix(bpage);
5974 ut_ad(io_type == BUF_IO_READ || io_type == BUF_IO_WRITE);
5975 ut_ad(bpage->size.is_compressed() == (bpage->zip.data != NULL));
5976 ut_ad(uncompressed || bpage->zip.data);
5977
5978 if (io_type == BUF_IO_READ) {
5979 ulint read_page_no = 0;
5980 ulint read_space_id = 0;
5981 uint key_version = 0;
5982
5983 ut_ad(bpage->zip.data != NULL || ((buf_block_t*)bpage)->frame != NULL);
5984 fil_space_t* space = fil_space_acquire_for_io(
5985 bpage->id.space());
5986 if (!space) {
5987 return DB_TABLESPACE_DELETED;
5988 }
5989
5990 buf_page_decrypt_after_read(bpage, space);
5991
5992 byte* frame = bpage->zip.data
5993 ? bpage->zip.data
5994 : reinterpret_cast<buf_block_t*>(bpage)->frame;
5995 dberr_t err;
5996
5997 if (bpage->zip.data && uncompressed) {
5998 my_atomic_addlint(&buf_pool->n_pend_unzip, 1);
5999 ibool ok = buf_zip_decompress((buf_block_t*) bpage,
6000 FALSE);
6001 my_atomic_addlint(&buf_pool->n_pend_unzip, ulint(-1));
6002
6003 if (!ok) {
6004 ib::info() << "Page "
6005 << bpage->id
6006 << " zip_decompress failure.";
6007
6008 err = DB_PAGE_CORRUPTED;
6009 goto database_corrupted;
6010 }
6011 }
6012
6013		/* If this page is initialized and not in the
6014		doublewrite buffer, then the page number and space id
6015		should be the same as in the block. */
6016 read_page_no = mach_read_from_4(frame + FIL_PAGE_OFFSET);
6017 read_space_id = mach_read_from_4(
6018 frame + FIL_PAGE_ARCH_LOG_NO_OR_SPACE_ID);
6019 key_version = mach_read_from_4(
6020 frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
6021
6022 if (bpage->id.space() == TRX_SYS_SPACE
6023 && buf_dblwr_page_inside(bpage->id.page_no())) {
6024
6025 ib::error() << "Reading page " << bpage->id
6026 << ", which is in the doublewrite buffer!";
6027
6028 } else if (read_space_id == 0 && read_page_no == 0) {
6029 /* This is likely an uninitialized page. */
6030 } else if ((bpage->id.space() != TRX_SYS_SPACE
6031 && bpage->id.space() != read_space_id)
6032 || bpage->id.page_no() != read_page_no) {
6033 /* We did not compare space_id to read_space_id
6034 in the system tablespace, because the field
6035 was written as garbage before MySQL 4.1.1,
6036 which did not support innodb_file_per_table. */
6037
6038			ib::error() << "Space id and page number stored in"
6039				" the page read in are "
6040				<< page_id_t(read_space_id, read_page_no)
6041				<< ", should be " << bpage->id;
6042 }
6043
6044 err = buf_page_check_corrupt(bpage, space);
6045
6046database_corrupted:
6047
6048 if (err != DB_SUCCESS) {
6049 /* Not a real corruption if it was triggered by
6050 error injection */
6051 DBUG_EXECUTE_IF(
6052 "buf_page_import_corrupt_failure",
6053 if (!is_predefined_tablespace(
6054 bpage->id.space())) {
6055 buf_mark_space_corrupt(bpage, space);
6056 ib::info() << "Simulated IMPORT "
6057 "corruption";
6058 space->release_for_io();
6059 return(err);
6060 }
6061 err = DB_SUCCESS;
6062 goto page_not_corrupt;
6063 );
6064
6065 if (err == DB_PAGE_CORRUPTED) {
6066 ib::error()
6067 << "Database page corruption on disk"
6068 " or a failed file read of tablespace "
6069 << space->name << " page " << bpage->id
6070 << ". You may have to recover from "
6071 << "a backup.";
6072
6073 buf_page_print(frame, bpage->size);
6074
6075 ib::info()
6076 << "It is also possible that your"
6077 " operating system has corrupted"
6078 " its own file cache and rebooting"
6079 " your computer removes the error."
6080				" If the corrupt page is an index page,"
6081				" you can also try to fix the"
6082 " corruption by dumping, dropping,"
6083 " and reimporting the corrupt table."
6084 " You can use CHECK TABLE to scan"
6085 " your table for corruption. "
6086 << FORCE_RECOVERY_MSG;
6087 }
6088
6089 if (srv_force_recovery < SRV_FORCE_IGNORE_CORRUPT) {
6090
6091 /* If page space id is larger than TRX_SYS_SPACE
6092 (0), we will attempt to mark the corresponding
6093 table as corrupted instead of crashing server */
6094 if (bpage->id.space() == TRX_SYS_SPACE) {
6095 ib::fatal() << "Aborting because of"
6096 " a corrupt database page.";
6097 }
6098
6099 buf_mark_space_corrupt(bpage, space);
6100 space->release_for_io();
6101 return(err);
6102 }
6103 }
6104
6105 DBUG_EXECUTE_IF("buf_page_import_corrupt_failure",
6106 page_not_corrupt: bpage = bpage; );
6107
6108 if (recv_recovery_is_on()) {
6109 /* Pages must be uncompressed for crash recovery. */
6110 ut_a(uncompressed);
6111 recv_recover_page(TRUE, (buf_block_t*) bpage);
6112 }
6113
6114 /* If space is being truncated then avoid ibuf operation.
6115 During re-init we have already freed ibuf entries. */
6116 if (uncompressed
6117 && !recv_no_ibuf_operations
6118 && (bpage->id.space() == 0
6119 || !is_predefined_tablespace(bpage->id.space()))
6120 && !srv_is_tablespace_truncated(bpage->id.space())
6121 && fil_page_get_type(frame) == FIL_PAGE_INDEX
6122 && page_is_leaf(frame)) {
6123
6124 if (bpage->encrypted) {
6125				ib::warn()
6126					<< "Table in tablespace "
6127					<< bpage->id.space()
6128					<< " is encrypted. However, the key"
6129					" management plugin or the used"
6130					<< " key_version " << key_version
6131					<< " is not found, or the used encryption"
6132					" algorithm or method does not match."
6133					" Cannot continue opening the table.";
6134 } else {
6135
6136 ibuf_merge_or_delete_for_page(
6137 (buf_block_t*) bpage, bpage->id,
6138 &bpage->size, TRUE);
6139 }
6140
6141 }
6142
6143 space->release_for_io();
6144 } else {
6145 /* io_type == BUF_IO_WRITE */
6146 if (bpage->slot) {
6147 /* Mark slot free */
6148 bpage->slot->reserved = false;
6149 bpage->slot = NULL;
6150 }
6151 }
6152
6153 BPageMutex* block_mutex = buf_page_get_mutex(bpage);
6154 buf_pool_mutex_enter(buf_pool);
6155 mutex_enter(block_mutex);
6156
6157#ifdef UNIV_IBUF_COUNT_DEBUG
6158 if (io_type == BUF_IO_WRITE || uncompressed) {
6159 /* For BUF_IO_READ of compressed-only blocks, the
6160 buffered operations will be merged by buf_page_get_gen()
6161 after the block has been uncompressed. */
6162 ut_a(ibuf_count_get(bpage->id) == 0);
6163 }
6164#endif
6165	/* Because the thread that does the unlocking is not the same one
6166	that did the locking, we use a pass value != 0 in unlock, which
6167	simply removes the newest lock debug record, without checking the
6168	thread id. */
6169
6170 buf_page_set_io_fix(bpage, BUF_IO_NONE);
6171 buf_page_monitor(bpage, io_type);
6172
6173 if (io_type == BUF_IO_READ) {
6174 /* NOTE that the call to ibuf may have moved the ownership of
6175 the x-latch to this OS thread: do not let this confuse you in
6176 debugging! */
6177
6178 ut_ad(buf_pool->n_pend_reads > 0);
6179 buf_pool->n_pend_reads--;
6180 buf_pool->stat.n_pages_read++;
6181
6182 if (uncompressed) {
6183 rw_lock_x_unlock_gen(&((buf_block_t*) bpage)->lock,
6184 BUF_IO_READ);
6185 }
6186
6187 mutex_exit(block_mutex);
6188 } else {
6189 /* Write means a flush operation: call the completion
6190 routine in the flush system */
6191
6192 buf_flush_write_complete(bpage, dblwr);
6193
6194 if (uncompressed) {
6195 rw_lock_sx_unlock_gen(&((buf_block_t*) bpage)->lock,
6196 BUF_IO_WRITE);
6197 }
6198
6199 buf_pool->stat.n_pages_written++;
6200
6201 /* We decide whether or not to evict the page from the
6202 LRU list based on the flush_type.
6203 * BUF_FLUSH_LIST: don't evict
6204 * BUF_FLUSH_LRU: always evict
6205 * BUF_FLUSH_SINGLE_PAGE: eviction preference is passed
6206 by the caller explicitly. */
6207 if (buf_page_get_flush_type(bpage) == BUF_FLUSH_LRU) {
6208 evict = true;
6209 }
6210
6211 mutex_exit(block_mutex);
6212
6213 if (evict) {
6214 buf_LRU_free_page(bpage, true);
6215 }
6216 }
6217
6218 DBUG_PRINT("ib_buf", ("%s page %u:%u",
6219 io_type == BUF_IO_READ ? "read" : "wrote",
6220 bpage->id.space(), bpage->id.page_no()));
6221
6222 buf_pool_mutex_exit(buf_pool);
6223
6224 return DB_SUCCESS;
6225}
6226
6227/*********************************************************************//**
6228Asserts that all file pages in the buffer are in a replaceable state.
6229@return TRUE */
6230static
6231ibool
6232buf_all_freed_instance(
6233/*===================*/
6234	buf_pool_t*	buf_pool)	/*!< in: buffer pool instance */
6235{
6236 ulint i;
6237 buf_chunk_t* chunk;
6238
6239 ut_ad(buf_pool);
6240
6241 buf_pool_mutex_enter(buf_pool);
6242
6243 chunk = buf_pool->chunks;
6244
6245 for (i = buf_pool->n_chunks; i--; chunk++) {
6246
6247 if (const buf_block_t* block = buf_chunk_not_freed(chunk)) {
6248 ib::fatal() << "Page " << block->page.id
6249 << " still fixed or dirty";
6250 }
6251 }
6252
6253 buf_pool_mutex_exit(buf_pool);
6254
6255 return(TRUE);
6256}
6257
6258/** Refreshes the statistics used to print per-second averages.
6259@param[in,out] buf_pool buffer pool instance */
6260static
6261void
6262buf_refresh_io_stats(
6263 buf_pool_t* buf_pool)
6264{
6265 buf_pool->last_printout_time = ut_time();
6266 buf_pool->old_stat = buf_pool->stat;
6267}
6268
6269/*********************************************************************//**
6270Invalidates file pages in one buffer pool instance */
6271static
6272void
6273buf_pool_invalidate_instance(
6274/*=========================*/
6275 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
6276{
6277 ulint i;
6278
6279 buf_pool_mutex_enter(buf_pool);
6280
6281 for (i = BUF_FLUSH_LRU; i < BUF_FLUSH_N_TYPES; i++) {
6282
6283		/* As this function is called during startup and
6284		during the redo log application phase of recovery,
6285		InnoDB is single-threaded (apart from I/O helper
6286		threads) at this stage. No new write batch can be in
6287		the initialization stage at this point. */
6288 ut_ad(buf_pool->init_flush[i] == FALSE);
6289
6290 /* However, it is possible that a write batch that has
6291 been posted earlier is still not complete. For buffer
6292 pool invalidation to proceed we must ensure there is NO
6293 write activity happening. */
6294 if (buf_pool->n_flush[i] > 0) {
6295 buf_flush_t type = static_cast<buf_flush_t>(i);
6296
6297 buf_pool_mutex_exit(buf_pool);
6298 buf_flush_wait_batch_end(buf_pool, type);
6299 buf_pool_mutex_enter(buf_pool);
6300 }
6301 }
6302
6303 buf_pool_mutex_exit(buf_pool);
6304
6305 ut_ad(buf_all_freed_instance(buf_pool));
6306
6307 buf_pool_mutex_enter(buf_pool);
6308
6309 while (buf_LRU_scan_and_free_block(buf_pool, true)) {
6310 }
6311
6312 ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
6313 ut_ad(UT_LIST_GET_LEN(buf_pool->unzip_LRU) == 0);
6314
6315 buf_pool->freed_page_clock = 0;
6316 buf_pool->LRU_old = NULL;
6317 buf_pool->LRU_old_len = 0;
6318
6319 memset(&buf_pool->stat, 0x00, sizeof(buf_pool->stat));
6320 buf_refresh_io_stats(buf_pool);
6321
6322 buf_pool_mutex_exit(buf_pool);
6323}
6324
6325/*********************************************************************//**
6326Invalidates the file pages in the buffer pool when an archive recovery is
6327completed. All the file pages buffered must be in a replaceable state when
6328this function is called: not latched and not modified. */
6329void
6330buf_pool_invalidate(void)
6331/*=====================*/
6332{
6333 ulint i;
6334
6335 for (i = 0; i < srv_buf_pool_instances; i++) {
6336 buf_pool_invalidate_instance(buf_pool_from_array(i));
6337 }
6338}
6339
6340#if defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
6341/*********************************************************************//**
6342Validates data in one buffer pool instance
6343@return TRUE */
6344static
6345ibool
6346buf_pool_validate_instance(
6347/*=======================*/
6348 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
6349{
6350 buf_page_t* b;
6351 buf_chunk_t* chunk;
6352 ulint i;
6353 ulint n_lru_flush = 0;
6354 ulint n_page_flush = 0;
6355 ulint n_list_flush = 0;
6356 ulint n_lru = 0;
6357 ulint n_flush = 0;
6358 ulint n_free = 0;
6359 ulint n_zip = 0;
6360
6361 ut_ad(buf_pool);
6362
6363 buf_pool_mutex_enter(buf_pool);
6364 hash_lock_x_all(buf_pool->page_hash);
6365
6366 chunk = buf_pool->chunks;
6367
6368 /* Check the uncompressed blocks. */
6369
6370 for (i = buf_pool->n_chunks; i--; chunk++) {
6371
6372 ulint j;
6373 buf_block_t* block = chunk->blocks;
6374
6375 for (j = chunk->size; j--; block++) {
6376
6377 buf_page_mutex_enter(block);
6378
6379 switch (buf_block_get_state(block)) {
6380 case BUF_BLOCK_POOL_WATCH:
6381 case BUF_BLOCK_ZIP_PAGE:
6382 case BUF_BLOCK_ZIP_DIRTY:
6383 /* These should only occur on
6384 zip_clean, zip_free[], or flush_list. */
6385 ut_error;
6386 break;
6387
6388 case BUF_BLOCK_FILE_PAGE:
6389 ut_a(buf_page_hash_get_low(
6390 buf_pool, block->page.id)
6391 == &block->page);
6392
6393#ifdef UNIV_IBUF_COUNT_DEBUG
6394 ut_a(buf_page_get_io_fix(&block->page)
6395 == BUF_IO_READ
6396 || !ibuf_count_get(block->page.id));
6397#endif
6398 switch (buf_page_get_io_fix(&block->page)) {
6399 case BUF_IO_NONE:
6400 break;
6401
6402 case BUF_IO_WRITE:
6403 switch (buf_page_get_flush_type(
6404 &block->page)) {
6405 case BUF_FLUSH_LRU:
6406 n_lru_flush++;
6407 goto assert_s_latched;
6408 case BUF_FLUSH_SINGLE_PAGE:
6409 n_page_flush++;
6410assert_s_latched:
6411 ut_a(rw_lock_is_locked(
6412 &block->lock,
6413 RW_LOCK_S)
6414 || rw_lock_is_locked(
6415 &block->lock,
6416 RW_LOCK_SX));
6417 break;
6418 case BUF_FLUSH_LIST:
6419 n_list_flush++;
6420 break;
6421 default:
6422 ut_error;
6423 }
6424
6425 break;
6426
6427 case BUF_IO_READ:
6428
6429 ut_a(rw_lock_is_locked(&block->lock,
6430 RW_LOCK_X));
6431 break;
6432
6433 case BUF_IO_PIN:
6434 break;
6435 }
6436
6437 n_lru++;
6438 break;
6439
6440 case BUF_BLOCK_NOT_USED:
6441 n_free++;
6442 break;
6443
6444 case BUF_BLOCK_READY_FOR_USE:
6445 case BUF_BLOCK_MEMORY:
6446 case BUF_BLOCK_REMOVE_HASH:
6447 /* do nothing */
6448 break;
6449 }
6450
6451 buf_page_mutex_exit(block);
6452 }
6453 }
6454
6455 mutex_enter(&buf_pool->zip_mutex);
6456
6457 /* Check clean compressed-only blocks. */
6458
6459 for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
6460 b = UT_LIST_GET_NEXT(list, b)) {
6461 ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
6462 switch (buf_page_get_io_fix(b)) {
6463 case BUF_IO_NONE:
6464 case BUF_IO_PIN:
6465 /* All clean blocks should be I/O-unfixed. */
6466 break;
6467 case BUF_IO_READ:
6468 /* In buf_LRU_free_page(), we temporarily set
6469 b->io_fix = BUF_IO_READ for a newly allocated
6470 control block in order to prevent
6471 buf_page_get_gen() from decompressing the block. */
6472 break;
6473 default:
6474 ut_error;
6475 break;
6476 }
6477
6478 /* It is OK to read oldest_modification here because
6479 we have acquired buf_pool->zip_mutex above which acts
6480 as the 'block->mutex' for these bpages. */
6481 ut_a(!b->oldest_modification);
6482 ut_a(buf_page_hash_get_low(buf_pool, b->id) == b);
6483 n_lru++;
6484 n_zip++;
6485 }
6486
6487 /* Check dirty blocks. */
6488
6489 buf_flush_list_mutex_enter(buf_pool);
6490 for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
6491 b = UT_LIST_GET_NEXT(list, b)) {
6492 ut_ad(b->in_flush_list);
6493 ut_a(b->oldest_modification);
6494 n_flush++;
6495
6496 switch (buf_page_get_state(b)) {
6497 case BUF_BLOCK_ZIP_DIRTY:
6498 n_lru++;
6499 n_zip++;
6500 switch (buf_page_get_io_fix(b)) {
6501 case BUF_IO_NONE:
6502 case BUF_IO_READ:
6503 case BUF_IO_PIN:
6504 break;
6505 case BUF_IO_WRITE:
6506 switch (buf_page_get_flush_type(b)) {
6507 case BUF_FLUSH_LRU:
6508 n_lru_flush++;
6509 break;
6510 case BUF_FLUSH_SINGLE_PAGE:
6511 n_page_flush++;
6512 break;
6513 case BUF_FLUSH_LIST:
6514 n_list_flush++;
6515 break;
6516 default:
6517 ut_error;
6518 }
6519 break;
6520 }
6521 break;
6522 case BUF_BLOCK_FILE_PAGE:
6523 /* uncompressed page */
6524 break;
6525 case BUF_BLOCK_POOL_WATCH:
6526 case BUF_BLOCK_ZIP_PAGE:
6527 case BUF_BLOCK_NOT_USED:
6528 case BUF_BLOCK_READY_FOR_USE:
6529 case BUF_BLOCK_MEMORY:
6530 case BUF_BLOCK_REMOVE_HASH:
6531 ut_error;
6532 break;
6533 }
6534 ut_a(buf_page_hash_get_low(buf_pool, b->id) == b);
6535 }
6536
6537 ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
6538
6539 hash_unlock_x_all(buf_pool->page_hash);
6540 buf_flush_list_mutex_exit(buf_pool);
6541
6542 mutex_exit(&buf_pool->zip_mutex);
6543
6544 if (buf_pool->curr_size == buf_pool->old_size
6545 && n_lru + n_free > buf_pool->curr_size + n_zip) {
6546
6547 ib::fatal() << "n_LRU " << n_lru << ", n_free " << n_free
6548 << ", pool " << buf_pool->curr_size
6549 << " zip " << n_zip << ". Aborting...";
6550 }
6551
6552 ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
6553 if (buf_pool->curr_size == buf_pool->old_size
6554 && UT_LIST_GET_LEN(buf_pool->free) != n_free) {
6555
6556 ib::fatal() << "Free list len "
6557 << UT_LIST_GET_LEN(buf_pool->free)
6558 << ", free blocks " << n_free << ". Aborting...";
6559 }
6560
6561 ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
6562 ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
6563 ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_page_flush);
6564
6565 buf_pool_mutex_exit(buf_pool);
6566
6567 ut_a(buf_LRU_validate());
6568 ut_a(buf_flush_validate(buf_pool));
6569
6570 return(TRUE);
6571}
6572
6573/*********************************************************************//**
6574Validates the buffer buf_pool data structure.
6575@return TRUE */
6576ibool
6577buf_validate(void)
6578/*==============*/
6579{
6580 ulint i;
6581
6582 for (i = 0; i < srv_buf_pool_instances; i++) {
6583 buf_pool_t* buf_pool;
6584
6585 buf_pool = buf_pool_from_array(i);
6586
6587 buf_pool_validate_instance(buf_pool);
6588 }
6589 return(TRUE);
6590}
6591
6592#endif /* UNIV_DEBUG || UNIV_BUF_DEBUG */
6593
6594#if defined UNIV_DEBUG_PRINT || defined UNIV_DEBUG || defined UNIV_BUF_DEBUG
6595/*********************************************************************//**
6596Prints info of the buffer buf_pool data structure for one instance. */
6597static
6598void
6599buf_print_instance(
6600/*===============*/
6601 buf_pool_t* buf_pool)
6602{
6603 index_id_t* index_ids;
6604 ulint* counts;
6605 ulint size;
6606 ulint i;
6607 ulint j;
6608 index_id_t id;
6609 ulint n_found;
6610 buf_chunk_t* chunk;
6611 dict_index_t* index;
6612
6613 ut_ad(buf_pool);
6614
6615 size = buf_pool->curr_size;
6616
6617 index_ids = static_cast<index_id_t*>(
6618 ut_malloc_nokey(size * sizeof *index_ids));
6619
6620 counts = static_cast<ulint*>(ut_malloc_nokey(sizeof(ulint) * size));
6621
6622 buf_pool_mutex_enter(buf_pool);
6623 buf_flush_list_mutex_enter(buf_pool);
6624
6625 ib::info() << *buf_pool;
6626
6627 buf_flush_list_mutex_exit(buf_pool);
6628
6629 /* Count the number of blocks belonging to each index in the buffer */
6630
6631 n_found = 0;
6632
6633 chunk = buf_pool->chunks;
6634
6635 for (i = buf_pool->n_chunks; i--; chunk++) {
6636 buf_block_t* block = chunk->blocks;
6637 ulint n_blocks = chunk->size;
6638
6639 for (; n_blocks--; block++) {
6640 const buf_frame_t* frame = block->frame;
6641
6642 if (fil_page_index_page_check(frame)) {
6643
6644 id = btr_page_get_index_id(frame);
6645
6646 /* Look for the id in the index_ids array */
6647 j = 0;
6648
6649 while (j < n_found) {
6650
6651 if (index_ids[j] == id) {
6652 counts[j]++;
6653
6654 break;
6655 }
6656 j++;
6657 }
6658
6659 if (j == n_found) {
6660 n_found++;
6661 index_ids[j] = id;
6662 counts[j] = 1;
6663 }
6664 }
6665 }
6666 }
6667
6668 buf_pool_mutex_exit(buf_pool);
6669
6670 for (i = 0; i < n_found; i++) {
6671 index = dict_index_get_if_in_cache(index_ids[i]);
6672
6673 if (!index) {
6674 ib::info() << "Block count for index "
6675 << index_ids[i] << " in buffer is about "
6676 << counts[i];
6677 } else {
6678 ib::info() << "Block count for index " << index_ids[i]
6679 << " in buffer is about " << counts[i]
6680 << ", index " << index->name
6681 << " of table " << index->table->name;
6682 }
6683 }
6684
6685 ut_free(index_ids);
6686 ut_free(counts);
6687
6688 ut_a(buf_pool_validate_instance(buf_pool));
6689}
6690
6691/*********************************************************************//**
6692Prints info of the buffer buf_pool data structure. */
6693void
6694buf_print(void)
6695/*===========*/
6696{
6697 ulint i;
6698
6699 for (i = 0; i < srv_buf_pool_instances; i++) {
6700 buf_pool_t* buf_pool;
6701
6702 buf_pool = buf_pool_from_array(i);
6703 buf_print_instance(buf_pool);
6704 }
6705}
6706#endif /* UNIV_DEBUG_PRINT || UNIV_DEBUG || UNIV_BUF_DEBUG */
6707
6708#ifdef UNIV_DEBUG
6709/*********************************************************************//**
6710Returns the number of latched pages in the buffer pool.
6711@return number of latched pages */
6712static
6713ulint
6714buf_get_latched_pages_number_instance(
6715/*==================================*/
6716 buf_pool_t* buf_pool) /*!< in: buffer pool instance */
6717{
6718 buf_page_t* b;
6719 ulint i;
6720 buf_chunk_t* chunk;
6721 ulint fixed_pages_number = 0;
6722
6723 buf_pool_mutex_enter(buf_pool);
6724
6725 chunk = buf_pool->chunks;
6726
6727 for (i = buf_pool->n_chunks; i--; chunk++) {
6728 buf_block_t* block;
6729 ulint j;
6730
6731 block = chunk->blocks;
6732
6733 for (j = chunk->size; j--; block++) {
6734 if (buf_block_get_state(block)
6735 != BUF_BLOCK_FILE_PAGE) {
6736
6737 continue;
6738 }
6739
6740 buf_page_mutex_enter(block);
6741
6742 if (block->page.buf_fix_count != 0
6743 || buf_page_get_io_fix(&block->page)
6744 != BUF_IO_NONE) {
6745 fixed_pages_number++;
6746 }
6747
6748 buf_page_mutex_exit(block);
6749 }
6750 }
6751
6752 mutex_enter(&buf_pool->zip_mutex);
6753
6754 /* Traverse the lists of clean and dirty compressed-only blocks. */
6755
6756 for (b = UT_LIST_GET_FIRST(buf_pool->zip_clean); b;
6757 b = UT_LIST_GET_NEXT(list, b)) {
6758 ut_a(buf_page_get_state(b) == BUF_BLOCK_ZIP_PAGE);
6759 ut_a(buf_page_get_io_fix(b) != BUF_IO_WRITE);
6760
6761 if (b->buf_fix_count != 0
6762 || buf_page_get_io_fix(b) != BUF_IO_NONE) {
6763 fixed_pages_number++;
6764 }
6765 }
6766
6767 buf_flush_list_mutex_enter(buf_pool);
6768 for (b = UT_LIST_GET_FIRST(buf_pool->flush_list); b;
6769 b = UT_LIST_GET_NEXT(list, b)) {
6770 ut_ad(b->in_flush_list);
6771
6772 switch (buf_page_get_state(b)) {
6773 case BUF_BLOCK_ZIP_DIRTY:
6774 if (b->buf_fix_count != 0
6775 || buf_page_get_io_fix(b) != BUF_IO_NONE) {
6776 fixed_pages_number++;
6777 }
6778 break;
6779 case BUF_BLOCK_FILE_PAGE:
6780 /* uncompressed page */
6781 break;
6782 case BUF_BLOCK_POOL_WATCH:
6783 case BUF_BLOCK_ZIP_PAGE:
6784 case BUF_BLOCK_NOT_USED:
6785 case BUF_BLOCK_READY_FOR_USE:
6786 case BUF_BLOCK_MEMORY:
6787 case BUF_BLOCK_REMOVE_HASH:
6788 ut_error;
6789 break;
6790 }
6791 }
6792
6793 buf_flush_list_mutex_exit(buf_pool);
6794 mutex_exit(&buf_pool->zip_mutex);
6795 buf_pool_mutex_exit(buf_pool);
6796
6797 return(fixed_pages_number);
6798}
6799
6800/*********************************************************************//**
6801Returns the number of latched pages in all the buffer pools.
6802@return number of latched pages */
6803ulint
6804buf_get_latched_pages_number(void)
6805/*==============================*/
6806{
6807 ulint i;
6808 ulint total_latched_pages = 0;
6809
6810 for (i = 0; i < srv_buf_pool_instances; i++) {
6811 buf_pool_t* buf_pool;
6812
6813 buf_pool = buf_pool_from_array(i);
6814
6815 total_latched_pages += buf_get_latched_pages_number_instance(
6816 buf_pool);
6817 }
6818
6819 return(total_latched_pages);
6820}
6821
6822#endif /* UNIV_DEBUG */
6823
6824/*********************************************************************//**
6825Returns the number of pending buffer pool read I/Os.
6826@return number of pending read I/O operations */
6827ulint
6828buf_get_n_pending_read_ios(void)
6829/*============================*/
6830{
6831 ulint pend_ios = 0;
6832
6833 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
6834 pend_ios += buf_pool_from_array(i)->n_pend_reads;
6835 }
6836
6837 return(pend_ios);
6838}
6839
6840/*********************************************************************//**
6841Returns the ratio, as a percentage, of modified pages to all database
6842pages in the buffer pool.
6843@return modified page percentage ratio */
6844double
6845buf_get_modified_ratio_pct(void)
6846/*============================*/
6847{
6848 double ratio;
6849 ulint lru_len = 0;
6850 ulint free_len = 0;
6851 ulint flush_list_len = 0;
6852
6853 buf_get_total_list_len(&lru_len, &free_len, &flush_list_len);
6854
6855 ratio = static_cast<double>(100 * flush_list_len)
6856 / (1 + lru_len + free_len);
6857
6858 /* 1 + is there to avoid division by zero */
6859
6860 return(ratio);
6861}
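
/* Worked example with hypothetical list lengths: with 200 pages on the
flush list, 900 on the LRU list and 99 on the free list, the function
returns 100 * 200 / (1 + 900 + 99) = 20.0 per cent. */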
6862
6863/*******************************************************************//**
6864Aggregates one buffer pool's stats information into the total buffer pool stats */
6865static
6866void
6867buf_stats_aggregate_pool_info(
6868/*==========================*/
6869 buf_pool_info_t* total_info, /*!< in/out: the buffer pool
6870 info to store aggregated
6871 result */
6872 const buf_pool_info_t* pool_info) /*!< in: individual buffer pool
6873 stats info */
6874{
6875 ut_a(total_info && pool_info);
6876
6877 /* Nothing to copy if total_info is the same as pool_info */
6878 if (total_info == pool_info) {
6879 return;
6880 }
6881
6882 total_info->pool_size += pool_info->pool_size;
6883 total_info->lru_len += pool_info->lru_len;
6884 total_info->old_lru_len += pool_info->old_lru_len;
6885 total_info->free_list_len += pool_info->free_list_len;
6886 total_info->flush_list_len += pool_info->flush_list_len;
6887 total_info->n_pend_unzip += pool_info->n_pend_unzip;
6888 total_info->n_pend_reads += pool_info->n_pend_reads;
6889 total_info->n_pending_flush_lru += pool_info->n_pending_flush_lru;
6890 total_info->n_pending_flush_list += pool_info->n_pending_flush_list;
6891 total_info->n_pages_made_young += pool_info->n_pages_made_young;
6892 total_info->n_pages_not_made_young += pool_info->n_pages_not_made_young;
6893 total_info->n_pages_read += pool_info->n_pages_read;
6894 total_info->n_pages_created += pool_info->n_pages_created;
6895 total_info->n_pages_written += pool_info->n_pages_written;
6896 total_info->n_page_gets += pool_info->n_page_gets;
6897 total_info->n_ra_pages_read_rnd += pool_info->n_ra_pages_read_rnd;
6898 total_info->n_ra_pages_read += pool_info->n_ra_pages_read;
6899 total_info->n_ra_pages_evicted += pool_info->n_ra_pages_evicted;
6900 total_info->page_made_young_rate += pool_info->page_made_young_rate;
6901 total_info->page_not_made_young_rate +=
6902 pool_info->page_not_made_young_rate;
6903 total_info->pages_read_rate += pool_info->pages_read_rate;
6904 total_info->pages_created_rate += pool_info->pages_created_rate;
6905 total_info->pages_written_rate += pool_info->pages_written_rate;
6906 total_info->n_page_get_delta += pool_info->n_page_get_delta;
6907 total_info->page_read_delta += pool_info->page_read_delta;
6908 total_info->young_making_delta += pool_info->young_making_delta;
6909 total_info->not_young_making_delta += pool_info->not_young_making_delta;
6910 total_info->pages_readahead_rnd_rate += pool_info->pages_readahead_rnd_rate;
6911 total_info->pages_readahead_rate += pool_info->pages_readahead_rate;
6912 total_info->pages_evicted_rate += pool_info->pages_evicted_rate;
6913 total_info->unzip_lru_len += pool_info->unzip_lru_len;
6914 total_info->io_sum += pool_info->io_sum;
6915 total_info->io_cur += pool_info->io_cur;
6916 total_info->unzip_sum += pool_info->unzip_sum;
6917 total_info->unzip_cur += pool_info->unzip_cur;
6918}
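
/* Note that the per-second *_rate fields are summed as well, so in the
aggregated entry they are server-wide totals across all buffer pool
instances, not averages per instance. */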
6919/*******************************************************************//**
6920Collect buffer pool stats information for a buffer pool. Also
6921record aggregated stats if there is more than one buffer pool
6922in the server */
6923void
6924buf_stats_get_pool_info(
6925/*====================*/
6926 buf_pool_t* buf_pool, /*!< in: buffer pool */
6927 ulint pool_id, /*!< in: buffer pool ID */
6928 buf_pool_info_t* all_pool_info) /*!< in/out: buffer pool info
6929 to fill */
6930{
6931 buf_pool_info_t* pool_info;
6932 time_t current_time;
6933 double time_elapsed;
6934
6935 /* Find appropriate pool_info to store stats for this buffer pool */
6936 pool_info = &all_pool_info[pool_id];
6937
6938 buf_pool_mutex_enter(buf_pool);
6939 buf_flush_list_mutex_enter(buf_pool);
6940
6941 pool_info->pool_unique_id = pool_id;
6942
6943 pool_info->pool_size = buf_pool->curr_size;
6944
6945 pool_info->lru_len = UT_LIST_GET_LEN(buf_pool->LRU);
6946
6947 pool_info->old_lru_len = buf_pool->LRU_old_len;
6948
6949 pool_info->free_list_len = UT_LIST_GET_LEN(buf_pool->free);
6950
6951 pool_info->flush_list_len = UT_LIST_GET_LEN(buf_pool->flush_list);
6952
6953 pool_info->n_pend_unzip = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
6954
6955 pool_info->n_pend_reads = buf_pool->n_pend_reads;
6956
6957 pool_info->n_pending_flush_lru =
6958 (buf_pool->n_flush[BUF_FLUSH_LRU]
6959 + buf_pool->init_flush[BUF_FLUSH_LRU]);
6960
6961 pool_info->n_pending_flush_list =
6962 (buf_pool->n_flush[BUF_FLUSH_LIST]
6963 + buf_pool->init_flush[BUF_FLUSH_LIST]);
6964
6965 pool_info->n_pending_flush_single_page =
6966 (buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
6967 + buf_pool->init_flush[BUF_FLUSH_SINGLE_PAGE]);
6968
6969 buf_flush_list_mutex_exit(buf_pool);
6970
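	/* The 0.001 below guards against division by zero in the
	per-second rate calculations when this function is called twice
	within the same wall-clock second. */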
6971 current_time = time(NULL);
6972 time_elapsed = 0.001 + difftime(current_time,
6973 buf_pool->last_printout_time);
6974
6975 pool_info->n_pages_made_young = buf_pool->stat.n_pages_made_young;
6976
6977 pool_info->n_pages_not_made_young =
6978 buf_pool->stat.n_pages_not_made_young;
6979
6980 pool_info->n_pages_read = buf_pool->stat.n_pages_read;
6981
6982 pool_info->n_pages_created = buf_pool->stat.n_pages_created;
6983
6984 pool_info->n_pages_written = buf_pool->stat.n_pages_written;
6985
6986 pool_info->n_page_gets = buf_pool->stat.n_page_gets;
6987
6988 pool_info->n_ra_pages_read_rnd = buf_pool->stat.n_ra_pages_read_rnd;
6989 pool_info->n_ra_pages_read = buf_pool->stat.n_ra_pages_read;
6990
6991 pool_info->n_ra_pages_evicted = buf_pool->stat.n_ra_pages_evicted;
6992
6993 pool_info->page_made_young_rate =
6994 (buf_pool->stat.n_pages_made_young
6995 - buf_pool->old_stat.n_pages_made_young) / time_elapsed;
6996
6997 pool_info->page_not_made_young_rate =
6998 (buf_pool->stat.n_pages_not_made_young
6999 - buf_pool->old_stat.n_pages_not_made_young) / time_elapsed;
7000
7001 pool_info->pages_read_rate =
7002 (buf_pool->stat.n_pages_read
7003 - buf_pool->old_stat.n_pages_read) / time_elapsed;
7004
7005 pool_info->pages_created_rate =
7006 (buf_pool->stat.n_pages_created
7007 - buf_pool->old_stat.n_pages_created) / time_elapsed;
7008
7009 pool_info->pages_written_rate =
7010 (buf_pool->stat.n_pages_written
7011 - buf_pool->old_stat.n_pages_written) / time_elapsed;
7012
7013 pool_info->n_page_get_delta = buf_pool->stat.n_page_gets
7014 - buf_pool->old_stat.n_page_gets;
7015
7016 if (pool_info->n_page_get_delta) {
7017 pool_info->page_read_delta = buf_pool->stat.n_pages_read
7018 - buf_pool->old_stat.n_pages_read;
7019
7020 pool_info->young_making_delta =
7021 buf_pool->stat.n_pages_made_young
7022 - buf_pool->old_stat.n_pages_made_young;
7023
7024 pool_info->not_young_making_delta =
7025 buf_pool->stat.n_pages_not_made_young
7026 - buf_pool->old_stat.n_pages_not_made_young;
7027 }
7028 pool_info->pages_readahead_rnd_rate =
7029 (buf_pool->stat.n_ra_pages_read_rnd
7030 - buf_pool->old_stat.n_ra_pages_read_rnd) / time_elapsed;
7031
7032
7033 pool_info->pages_readahead_rate =
7034 (buf_pool->stat.n_ra_pages_read
7035 - buf_pool->old_stat.n_ra_pages_read) / time_elapsed;
7036
7037 pool_info->pages_evicted_rate =
7038 (buf_pool->stat.n_ra_pages_evicted
7039 - buf_pool->old_stat.n_ra_pages_evicted) / time_elapsed;
7040
7041 pool_info->unzip_lru_len = UT_LIST_GET_LEN(buf_pool->unzip_LRU);
7042
7043 pool_info->io_sum = buf_LRU_stat_sum.io;
7044
7045 pool_info->io_cur = buf_LRU_stat_cur.io;
7046
7047 pool_info->unzip_sum = buf_LRU_stat_sum.unzip;
7048
7049 pool_info->unzip_cur = buf_LRU_stat_cur.unzip;
7050
7051 buf_refresh_io_stats(buf_pool);
7052 buf_pool_mutex_exit(buf_pool);
7053}
7054
7055/*********************************************************************//**
7056Prints info of the buffer i/o. */
7057static
7058void
7059buf_print_io_instance(
7060/*==================*/
7061 buf_pool_info_t*pool_info, /*!< in: buffer pool info */
7062 FILE* file) /*!< in/out: buffer where to print */
7063{
7064 ut_ad(pool_info);
7065
7066 fprintf(file,
7067 "Buffer pool size " ULINTPF "\n"
7068 "Free buffers " ULINTPF "\n"
7069 "Database pages " ULINTPF "\n"
7070 "Old database pages " ULINTPF "\n"
7071 "Modified db pages " ULINTPF "\n"
7072		"Percent of dirty pages (LRU & free pages): %.3f\n"
7073 "Max dirty pages percent: %.3f\n"
7074 "Pending reads " ULINTPF "\n"
7075 "Pending writes: LRU " ULINTPF ", flush list " ULINTPF
7076 ", single page " ULINTPF "\n",
7077 pool_info->pool_size,
7078 pool_info->free_list_len,
7079 pool_info->lru_len,
7080 pool_info->old_lru_len,
7081 pool_info->flush_list_len,
7082 (((double) pool_info->flush_list_len) /
7083 (pool_info->lru_len + pool_info->free_list_len + 1.0)) * 100.0,
7084 srv_max_buf_pool_modified_pct,
7085 pool_info->n_pend_reads,
7086 pool_info->n_pending_flush_lru,
7087 pool_info->n_pending_flush_list,
7088 pool_info->n_pending_flush_single_page);
7089
7090 fprintf(file,
7091 "Pages made young " ULINTPF ", not young " ULINTPF "\n"
7092 "%.2f youngs/s, %.2f non-youngs/s\n"
7093 "Pages read " ULINTPF ", created " ULINTPF
7094 ", written " ULINTPF "\n"
7095 "%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
7096 pool_info->n_pages_made_young,
7097 pool_info->n_pages_not_made_young,
7098 pool_info->page_made_young_rate,
7099 pool_info->page_not_made_young_rate,
7100 pool_info->n_pages_read,
7101 pool_info->n_pages_created,
7102 pool_info->n_pages_written,
7103 pool_info->pages_read_rate,
7104 pool_info->pages_created_rate,
7105 pool_info->pages_written_rate);
7106
7107 if (pool_info->n_page_get_delta) {
7108 double hit_rate = double(pool_info->page_read_delta)
7109 / pool_info->n_page_get_delta;
7110
7111 if (hit_rate > 1) {
7112 hit_rate = 1;
7113 }
7114
7115 fprintf(file,
7116 "Buffer pool hit rate " ULINTPF " / 1000,"
7117 " young-making rate " ULINTPF " / 1000 not "
7118 ULINTPF " / 1000\n",
7119 ulint(1000 * (1 - hit_rate)),
7120 ulint(1000 * double(pool_info->young_making_delta)
7121 / pool_info->n_page_get_delta),
7122 ulint(1000 * double(pool_info->not_young_making_delta)
7123 / pool_info->n_page_get_delta));
7124 } else {
7125 fputs("No buffer pool page gets since the last printout\n",
7126 file);
7127 }
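
	/* Worked example with hypothetical deltas: with 1000 page gets
	and 50 physical page reads since the last printout, hit_rate is
	50 / 1000 = 0.05 and the printed hit rate is
	1000 * (1 - 0.05) = 950 / 1000. */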
7128
7129 /* Statistics about read ahead algorithm */
7130 fprintf(file, "Pages read ahead %.2f/s,"
7131 " evicted without access %.2f/s,"
7132 " Random read ahead %.2f/s\n",
7133
7134 pool_info->pages_readahead_rate,
7135 pool_info->pages_evicted_rate,
7136 pool_info->pages_readahead_rnd_rate);
7137
7138	/* Print some values to help us visualize what is
7139	happening with LRU eviction. */
7140 fprintf(file,
7141 "LRU len: " ULINTPF ", unzip_LRU len: " ULINTPF "\n"
7142 "I/O sum[" ULINTPF "]:cur[" ULINTPF "], "
7143 "unzip sum[" ULINTPF "]:cur[" ULINTPF "]\n",
7144 pool_info->lru_len, pool_info->unzip_lru_len,
7145 pool_info->io_sum, pool_info->io_cur,
7146 pool_info->unzip_sum, pool_info->unzip_cur);
7147}
7148
7149/*********************************************************************//**
7150Prints info of the buffer i/o. */
7151void
7152buf_print_io(
7153/*=========*/
7154 FILE* file) /*!< in/out: buffer where to print */
7155{
7156 ulint i;
7157 buf_pool_info_t* pool_info;
7158 buf_pool_info_t* pool_info_total;
7159
7160	/* If srv_buf_pool_instances is greater than 1, allocate
7161	one extra buf_pool_info_t; the last one stores
7162	aggregated/total values from all pools. */
7163 if (srv_buf_pool_instances > 1) {
7164 pool_info = (buf_pool_info_t*) ut_zalloc_nokey((
7165 srv_buf_pool_instances + 1) * sizeof *pool_info);
7166
7167 pool_info_total = &pool_info[srv_buf_pool_instances];
7168 } else {
7169 ut_a(srv_buf_pool_instances == 1);
7170
7171 pool_info_total = pool_info =
7172 static_cast<buf_pool_info_t*>(
7173 ut_zalloc_nokey(sizeof *pool_info));
7174 }
7175
7176 for (i = 0; i < srv_buf_pool_instances; i++) {
7177 buf_pool_t* buf_pool;
7178
7179 buf_pool = buf_pool_from_array(i);
7180
7181 /* Fetch individual buffer pool info and calculate
7182 aggregated stats along the way */
7183 buf_stats_get_pool_info(buf_pool, i, pool_info);
7184
7185 /* If we have more than one buffer pool, store
7186 the aggregated stats */
7187 if (srv_buf_pool_instances > 1) {
7188 buf_stats_aggregate_pool_info(pool_info_total,
7189 &pool_info[i]);
7190 }
7191 }
7192
7193	/* Print the aggregate buffer pool info */
7194 buf_print_io_instance(pool_info_total, file);
7195
7196	/* If there is more than one buffer pool, print each individual
7197	pool's info */
7198 if (srv_buf_pool_instances > 1) {
7199 fputs("----------------------\n"
7200 "INDIVIDUAL BUFFER POOL INFO\n"
7201 "----------------------\n", file);
7202
7203 for (i = 0; i < srv_buf_pool_instances; i++) {
7204 fprintf(file, "---BUFFER POOL " ULINTPF "\n", i);
7205 buf_print_io_instance(&pool_info[i], file);
7206 }
7207 }
7208
7209 ut_free(pool_info);
7210}
7211
7212/**********************************************************************//**
7213Refreshes the statistics used to print per-second averages. */
7214void
7215buf_refresh_io_stats_all(void)
7216/*==========================*/
7217{
7218 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
7219 buf_pool_t* buf_pool;
7220
7221 buf_pool = buf_pool_from_array(i);
7222
7223 buf_refresh_io_stats(buf_pool);
7224 }
7225}
7226
7227/**********************************************************************//**
7228Check if all pages in all buffer pools are in a replaceable state.
7229@return FALSE if not */
7230ibool
7231buf_all_freed(void)
7232/*===============*/
7233{
7234 for (ulint i = 0; i < srv_buf_pool_instances; i++) {
7235 buf_pool_t* buf_pool;
7236
7237 buf_pool = buf_pool_from_array(i);
7238
7239 if (!buf_all_freed_instance(buf_pool)) {
7240 return(FALSE);
7241 }
7242 }
7243
7244 return(TRUE);
7245}
7246
7247/*********************************************************************//**
7248Counts the i/o operations that are currently pending for the buffer
7249pool.
7250@return number of pending i/o */
7251ulint
7252buf_pool_check_no_pending_io(void)
7253/*==============================*/
7254{
7255 ulint i;
7256 ulint pending_io = 0;
7257
7258 buf_pool_mutex_enter_all();
7259
7260 for (i = 0; i < srv_buf_pool_instances; i++) {
7261 const buf_pool_t* buf_pool;
7262
7263 buf_pool = buf_pool_from_array(i);
7264
7265 pending_io += buf_pool->n_pend_reads
7266 + buf_pool->n_flush[BUF_FLUSH_LRU]
7267 + buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]
7268 + buf_pool->n_flush[BUF_FLUSH_LIST];
7269
7270 }
7271
7272 buf_pool_mutex_exit_all();
7273
7274 return(pending_io);
7275}
7276
7277/** Print the given page_id_t object.
7278@param[in,out] out the output stream
7279@param[in] page_id the page_id_t object to be printed
7280@return the output stream */
7281std::ostream&
7282operator<<(
7283 std::ostream& out,
7284 const page_id_t& page_id)
7285{
7286 out << "[page id: space=" << page_id.m_space
7287 << ", page number=" << page_id.m_page_no << "]";
7288 return(out);
7289}
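
/* Example output (hypothetical values): page 42 of tablespace 5 prints
as "[page id: space=5, page number=42]". */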
7290
7291/** Print the given buf_pool_t object.
7292@param[in,out] out the output stream
7293@param[in] buf_pool the buf_pool_t object to be printed
7294@return the output stream */
7295std::ostream&
7296operator<<(
7297 std::ostream& out,
7298 const buf_pool_t& buf_pool)
7299{
7300 out << "[buffer pool instance: "
7301 << "buf_pool size=" << buf_pool.curr_size
7302 << ", database pages=" << UT_LIST_GET_LEN(buf_pool.LRU)
7303 << ", free pages=" << UT_LIST_GET_LEN(buf_pool.free)
7304 << ", modified database pages="
7305 << UT_LIST_GET_LEN(buf_pool.flush_list)
7306 << ", n pending decompressions=" << buf_pool.n_pend_unzip
7307 << ", n pending reads=" << buf_pool.n_pend_reads
7308 << ", n pending flush LRU=" << buf_pool.n_flush[BUF_FLUSH_LRU]
7309 << " list=" << buf_pool.n_flush[BUF_FLUSH_LIST]
7310 << " single page=" << buf_pool.n_flush[BUF_FLUSH_SINGLE_PAGE]
7311 << ", pages made young=" << buf_pool.stat.n_pages_made_young
7312 << ", not young=" << buf_pool.stat.n_pages_not_made_young
7313 << ", pages read=" << buf_pool.stat.n_pages_read
7314 << ", created=" << buf_pool.stat.n_pages_created
7315 << ", written=" << buf_pool.stat.n_pages_written << "]";
7316 return(out);
7317}
7318
7319/********************************************************************//**
7320Reserve an unused slot from the temporary memory array, and allocate the
7321necessary temporary memory if it has not yet been allocated.
7322@return reserved slot */
7323UNIV_INTERN
7324buf_tmp_buffer_t*
7325buf_pool_reserve_tmp_slot(
7326/*======================*/
7327 buf_pool_t* buf_pool, /*!< in: buffer pool where to
7328 reserve */
7329 bool compressed) /*!< in: is file space compressed */
7330{
7331 buf_tmp_buffer_t *free_slot=NULL;
7332
7333 /* Array is protected by buf_pool mutex */
7334 buf_pool_mutex_enter(buf_pool);
7335
7336 for(ulint i = 0; i < buf_pool->tmp_arr->n_slots; i++) {
7337 buf_tmp_buffer_t *slot = &buf_pool->tmp_arr->slots[i];
7338
7339 if(slot->reserved == false) {
7340 free_slot = slot;
7341 break;
7342 }
7343 }
7344
7345	/* We assume that a free slot is always found */
7346 ut_a(free_slot != NULL);
7347 free_slot->reserved = true;
7348 /* Now that we have reserved this slot we can release
7349 buf_pool mutex */
7350 buf_pool_mutex_exit(buf_pool);
7351
7352 /* Allocate temporary memory for encryption/decryption */
7353 if (free_slot->crypt_buf == NULL) {
7354 free_slot->crypt_buf = static_cast<byte*>(aligned_malloc(srv_page_size, srv_page_size));
7355 memset(free_slot->crypt_buf, 0, srv_page_size);
7356 }
7357
7358 /* For page compressed tables allocate temporary memory for
7359 compression/decompression */
7360 if (compressed && free_slot->comp_buf == NULL) {
7361 ulint size = srv_page_size;
7362
7363		/* Both the snappy and lzo compression methods require that
7364		the output buffer used for compression be larger than the
7365		input buffer. Increase the allocated buffer size accordingly. */
7366#if defined(HAVE_SNAPPY)
7367 size = snappy_max_compressed_length(size);
7368#endif
7369#if defined(HAVE_LZO)
7370 size += LZO1X_1_15_MEM_COMPRESS;
7371#endif
7372 free_slot->comp_buf = static_cast<byte*>(aligned_malloc(size, srv_page_size));
7373 memset(free_slot->comp_buf, 0, size);
7374 }
7375
7376 return (free_slot);
7377}
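
/* A minimal sketch of the reserve/release protocol (illustrative only):

	buf_tmp_buffer_t* slot = buf_pool_reserve_tmp_slot(buf_pool, false);
	// ... use slot->crypt_buf (or slot->comp_buf) for one I/O ...
	slot->reserved = false;

Once reserved, the slot is owned by a single I/O, so releasing it is a
plain store of the reserved flag (as done in buf_page_io_complete() and
buf_page_decrypt_after_read()). */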
7378
7379/** Encryption and page_compression hook that is called just before
7380a page is written to disk.
7381@param[in,out] space tablespace
7382@param[in,out] bpage buffer page
7383@param[in] src_frame physical page frame that is being encrypted
7384@return page frame to be written to file
7385(may be src_frame or an encrypted/compressed copy of it) */
7386UNIV_INTERN
7387byte*
7388buf_page_encrypt_before_write(
7389 fil_space_t* space,
7390 buf_page_t* bpage,
7391 byte* src_frame)
7392{
7393 ut_ad(space->id == bpage->id.space());
7394 bpage->real_size = srv_page_size;
7395
7396 fil_page_type_validate(src_frame);
7397
7398 switch (bpage->id.page_no()) {
7399 case 0:
7400 /* Page 0 of a tablespace is not encrypted/compressed */
7401 return src_frame;
7402 case TRX_SYS_PAGE_NO:
7403 if (bpage->id.space() == TRX_SYS_SPACE) {
7404			/* do not encrypt/compress the page, as it contains
7405			the address of the doublewrite buffer */
7406 return src_frame;
7407 }
7408 }
7409
7410 fil_space_crypt_t* crypt_data = space->crypt_data;
7411
7412 const bool encrypted = crypt_data
7413 && !crypt_data->not_encrypted()
7414 && crypt_data->type != CRYPT_SCHEME_UNENCRYPTED
7415 && (!crypt_data->is_default_encryption()
7416 || srv_encrypt_tables);
7417
7418 bool page_compressed = FSP_FLAGS_HAS_PAGE_COMPRESSION(space->flags);
7419
7420 if (!encrypted && !page_compressed) {
7421 /* No need to encrypt or page compress the page.
7422 Clear key-version & crypt-checksum. */
7423 memset(src_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION, 0, 8);
7424 return src_frame;
7425 }
7426
7427 buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
7428 /* Find free slot from temporary memory array */
7429 buf_tmp_buffer_t* slot = buf_pool_reserve_tmp_slot(buf_pool, page_compressed);
7430 slot->out_buf = NULL;
7431 bpage->slot = slot;

	byte*	dst_frame = slot->crypt_buf;

	if (!page_compressed) {
		/* Encrypt the page content. */
		byte*	tmp = fil_space_encrypt(space,
						bpage->id.page_no(),
						bpage->newest_modification,
						src_frame,
						dst_frame);

		slot->out_buf = dst_frame = tmp;

		ut_d(fil_page_type_validate(tmp));
	} else {
		/* First compress the page content. */
		ulint	out_len = 0;

		byte*	tmp = fil_compress_page(
			space,
			(byte*) src_frame,
			slot->comp_buf,
			srv_page_size,
			fsp_flags_get_page_compression_level(space->flags),
			fil_space_get_block_size(space, bpage->id.page_no()),
			encrypted,
			&out_len);

		bpage->real_size = out_len;

		/* Workaround for MDEV-15527: zero out the unused tail
		of the compressed page. */
		memset(tmp + out_len, 0, srv_page_size - out_len);
		ut_d(fil_page_type_validate(tmp));

		if (encrypted) {
			/* Then encrypt the compressed page content. */
			tmp = fil_space_encrypt(space,
						bpage->id.page_no(),
						bpage->newest_modification,
						tmp,
						dst_frame);
		}

		slot->out_buf = dst_frame = tmp;
	}

	ut_d(fil_page_type_validate(dst_frame));

	/* Return the frame that will be written to the data file. */
	return dst_frame;
}
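
/* Illustrative call flow (a sketch, not part of the build): the page
flush code is expected to use this hook roughly as

	byte*	frame = buf_page_encrypt_before_write(space, bpage,
						       src_frame);

where frame is either src_frame itself (page 0, the TRX_SYS page, or
nothing to do), or an encrypted and/or compressed copy held in the
reserved buf_tmp_buffer_t slot, with bpage->real_size giving the
payload length actually produced. */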

/** Decrypt a page.
@param[in,out]	bpage	Page control block
@param[in,out]	space	tablespace
@return whether the operation was successful */
static
bool
buf_page_decrypt_after_read(buf_page_t* bpage, fil_space_t* space)
{
	ut_ad(space->pending_io());
	ut_ad(space->id == bpage->id.space());

	bool compressed = bpage->size.is_compressed();
	const page_size_t& size = bpage->size;
	byte* dst_frame = compressed ? bpage->zip.data :
		((buf_block_t*) bpage)->frame;
	unsigned key_version = mach_read_from_4(
		dst_frame + FIL_PAGE_FILE_FLUSH_LSN_OR_KEY_VERSION);
	bool page_compressed = fil_page_is_compressed(dst_frame);
	bool page_compressed_encrypted =
		fil_page_is_compressed_encrypted(dst_frame);
	buf_pool_t* buf_pool = buf_pool_from_bpage(bpage);
	bool success = true;

	if (bpage->id.page_no() == 0) {
		/* File header pages are not encrypted/compressed. */
		return (true);
	}

	/* The page is encrypted if encryption information is present
	in the tablespace and the page contains a nonzero key_version.
	This also holds for pages that were first compressed and then
	encrypted. */
	if (!space->crypt_data) {
		key_version = 0;
	}
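
	/* Three cases remain below: a page that is only page
	compressed, a page that is only encrypted, and a page that was
	compressed first and then encrypted. The first case is
	decompressed directly; the other two are decrypted first, and
	the last one is decompressed afterwards. */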

	if (page_compressed) {
		/* The page we read is unencrypted. Find a free slot
		from the temporary memory array. */
		buf_tmp_buffer_t* slot = buf_pool_reserve_tmp_slot(
			buf_pool, page_compressed);

		ut_d(fil_page_type_validate(dst_frame));

		/* Decompress using comp_buf to dst_frame. */
		fil_decompress_page(slot->comp_buf,
				    dst_frame,
				    ulong(size.logical()),
				    &bpage->write_size);

		/* Mark this slot as free. */
		slot->reserved = false;
		key_version = 0;

		ut_d(fil_page_type_validate(dst_frame));
	} else {
		buf_tmp_buffer_t* slot = NULL;

		if (key_version) {
			/* Verify the encryption checksum before we
			even try to decrypt. */
			if (!fil_space_verify_crypt_checksum(
				    dst_frame, size,
				    bpage->id.space(), bpage->id.page_no())) {
				if (space->crypt_data->type
				    != CRYPT_SCHEME_UNENCRYPTED) {
					bpage->encrypted = true;
				}
				return (false);
			}

			/* Find a free slot from the temporary memory
			array. */
			slot = buf_pool_reserve_tmp_slot(
				buf_pool, page_compressed);

			ut_d(fil_page_type_validate(dst_frame));

			/* Decrypt using crypt_buf to dst_frame. */
			if (!fil_space_decrypt(space, slot->crypt_buf,
					       dst_frame, &bpage->encrypted)) {
				success = false;
			}

			ut_d(fil_page_type_validate(dst_frame));
		}

		if (page_compressed_encrypted && success) {
			if (!slot) {
				slot = buf_pool_reserve_tmp_slot(
					buf_pool, page_compressed);
			}

			ut_d(fil_page_type_validate(dst_frame));
			/* Decompress using comp_buf to dst_frame. */
			fil_decompress_page(slot->comp_buf,
					    dst_frame,
					    ulong(size.logical()),
					    &bpage->write_size);
			ut_d(fil_page_type_validate(dst_frame));
		}

		/* Mark this slot as free. */
		if (slot) {
			slot->reserved = false;
		}
	}

	ut_ad(space->pending_io());
	return success;
}
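
/* Illustrative read-side flow (a sketch, not part of the build): the
I/O completion code is expected to call this roughly as

	if (!buf_page_decrypt_after_read(bpage, space)) {
		// bpage->encrypted may have been set above to flag a
		// page that looked encrypted but whose crypt checksum
		// did not match, or that failed to decrypt
	}

and report a decryption failure to the user in the error branch. */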

/**
Should we punch a hole to deallocate the unused portion of the page?
@param[in]	bpage	Page control block
@return true if a punch hole should be used, false if not */
bool
buf_page_should_punch_hole(
	const buf_page_t*	bpage)
{
	return (bpage->real_size != bpage->size.physical());
}

/**
Calculate the length of the trim (punch_hole) operation.
@param[in]	bpage		Page control block
@param[in]	write_length	Write length
@return length of the trim, or zero */
ulint
buf_page_get_trim_length(
	const buf_page_t*	bpage,
	ulint			write_length)
{
	return (bpage->size.physical() - write_length);
}
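
/* Worked example (a sketch with made-up numbers): on a tablespace
with a 16KiB physical page size, if page compression produced only
4096 bytes of payload (bpage->real_size == 4096), then
buf_page_should_punch_hole() returns true and
buf_page_get_trim_length(bpage, 4096) == 12288, i.e. the tail of the
page that the I/O layer may deallocate with a punch-hole (trim)
call. */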


#endif /* !UNIV_INNOCHECKSUM */