1/*****************************************************************************
2
3Copyright (c) 1997, 2016, Oracle and/or its affiliates. All Rights Reserved.
4Copyright (c) 2016, 2018, MariaDB Corporation.
5
6This program is free software; you can redistribute it and/or modify it under
7the terms of the GNU General Public License as published by the Free Software
8Foundation; version 2 of the License.
9
10This program is distributed in the hope that it will be useful, but WITHOUT
11ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14You should have received a copy of the GNU General Public License along with
15this program; if not, write to the Free Software Foundation, Inc.,
1651 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
17
18*****************************************************************************/
19
20/**************************************************//**
21@file ibuf/ibuf0ibuf.cc
22Insert buffer
23
24Created 7/19/1997 Heikki Tuuri
25*******************************************************/
26
27#include "ha_prototypes.h"
28
29#include "ibuf0ibuf.h"
30#include "sync0sync.h"
31#include "btr0sea.h"
32
33#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
34my_bool srv_ibuf_disable_background_merge;
35#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
36
37/** Number of bits describing a single page */
38#define IBUF_BITS_PER_PAGE 4
39/** The start address for an insert buffer bitmap page bitmap */
40#define IBUF_BITMAP PAGE_DATA
41
42#include "buf0buf.h"
43#include "buf0rea.h"
44#include "fsp0fsp.h"
45#include "trx0sys.h"
46#include "fil0fil.h"
47#include "rem0rec.h"
48#include "btr0cur.h"
49#include "btr0pcur.h"
50#include "btr0btr.h"
51#include "row0upd.h"
52#include "dict0boot.h"
53#include "fut0lst.h"
54#include "lock0lock.h"
55#include "log0recv.h"
56#include "que0que.h"
57#include "srv0start.h" /* srv_shutdown_state */
58#include "fsp0sysspace.h"
59#include "rem0cmp.h"
60
61/* STRUCTURE OF AN INSERT BUFFER RECORD
62
63In versions < 4.1.x:
64
651. The first field is the page number.
662. The second field is an array which stores type info for each subsequent
67 field. We store the information which affects the ordering of records, and
68 also the physical storage size of an SQL NULL value. E.g., for CHAR(10) it
69 is 10 bytes.
703. Next we have the fields of the actual index record.
71
72In versions >= 4.1.x:
73
Note that contrary to what we planned in the 1990's, there will only be one
75insert buffer tree, and that is in the system tablespace of InnoDB.
76
771. The first field is the space id.
782. The second field is a one-byte marker (0) which differentiates records from
79 the < 4.1.x storage format.
803. The third field is the page number.
814. The fourth field contains the type info, where we have also added 2 bytes to
82 store the charset. In the compressed table format of 5.0.x we must add more
83 information here so that we can build a dummy 'index' struct which 5.0.x
84 can use in the binary search on the index page in the ibuf merge phase.
855. The rest of the fields contain the fields of the actual index record.
86
87In versions >= 5.0.3:
88
89The first byte of the fourth field is an additional marker (0) if the record
90is in the compact format. The presence of this marker can be detected by
91looking at the length of the field modulo DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE.
92
93The high-order bit of the character set field in the type info is the
94"nullable" flag for the field.
95
96In versions >= 5.5:
97
98The optional marker byte at the start of the fourth field is replaced by
99mandatory 3 fields, totaling 4 bytes:
100
101 1. 2 bytes: Counter field, used to sort records within a (space id, page
102 no) in the order they were added. This is needed so that for example the
103 sequence of operations "INSERT x, DEL MARK x, INSERT x" is handled
104 correctly.
105
106 2. 1 byte: Operation type (see ibuf_op_t).
107
108 3. 1 byte: Flags. Currently only one flag exists, IBUF_REC_COMPACT.
109
110To ensure older records, which do not have counters to enforce correct
111sorting, are merged before any new records, ibuf_insert checks if we're
112trying to insert to a position that contains old-style records, and if so,
113refuses the insert. Thus, ibuf pages are gradually converted to the new
114format as their corresponding buffer pool pages are read into memory.
115*/
116
117
118/* PREVENTING DEADLOCKS IN THE INSERT BUFFER SYSTEM
119
120If an OS thread performs any operation that brings in disk pages from
121non-system tablespaces into the buffer pool, or creates such a page there,
122then the operation may have as a side effect an insert buffer index tree
123compression. Thus, the tree latch of the insert buffer tree may be acquired
124in the x-mode, and also the file space latch of the system tablespace may
125be acquired in the x-mode.
126
127Also, an insert to an index in a non-system tablespace can have the same
128effect. How do we know this cannot lead to a deadlock of OS threads? There
is a problem with the i/o-handler threads: they break the latching order
130because they own x-latches to pages which are on a lower level than the
131insert buffer tree latch, its page latches, and the tablespace latch an
132insert buffer operation can reserve.
133
134The solution is the following: Let all the tree and page latches connected
135with the insert buffer be later in the latching order than the fsp latch and
136fsp page latches.
137
138Insert buffer pages must be such that the insert buffer is never invoked
139when these pages are accessed as this would result in a recursion violating
140the latching order. We let a special i/o-handler thread take care of i/o to
141the insert buffer pages and the ibuf bitmap pages, as well as the fsp bitmap
142pages and the first inode page, which contains the inode of the ibuf tree: let
143us call all these ibuf pages. To prevent deadlocks, we do not let a read-ahead
144access both non-ibuf and ibuf pages.
145
146Then an i/o-handler for the insert buffer never needs to access recursively the
147insert buffer tree and thus obeys the latching order. On the other hand, other
148i/o-handlers for other tablespaces may require access to the insert buffer,
149but because all kinds of latches they need to access there are later in the
150latching order, no violation of the latching order occurs in this case,
151either.
152
153A problem is how to grow and contract an insert buffer tree. As it is later
154in the latching order than the fsp management, we have to reserve the fsp
155latch first, before adding or removing pages from the insert buffer tree.
156We let the insert buffer tree have its own file space management: a free
157list of pages linked to the tree root. To prevent recursive using of the
158insert buffer when adding pages to the tree, we must first load these pages
159to memory, obtaining a latch on them, and only after that add them to the
160free list of the insert buffer tree. More difficult is removing of pages
161from the free list. If there is an excess of pages in the free list of the
162ibuf tree, they might be needed if some thread reserves the fsp latch,
163intending to allocate more file space. So we do the following: if a thread
164reserves the fsp latch, we check the writer count field of the latch. If
165this field has value 1, it means that the thread did not own the latch
166before entering the fsp system, and the mtr of the thread contains no
167modifications to the fsp pages. Now we are free to reserve the ibuf latch,
168and check if there is an excess of pages in the free list. We can then, in a
169separate mini-transaction, take them out of the free list and free them to
170the fsp system.
171
172To avoid deadlocks in the ibuf system, we divide file pages into three levels:
173
174(1) non-ibuf pages,
175(2) ibuf tree pages and the pages in the ibuf tree free list, and
176(3) ibuf bitmap pages.
177
178No OS thread is allowed to access higher level pages if it has latches to
179lower level pages; even if the thread owns a B-tree latch it must not access
180the B-tree non-leaf pages if it has latches on lower level pages. Read-ahead
181is only allowed for level 1 and 2 pages. Dedicated i/o-handler threads handle
182exclusively level 1 i/o. A dedicated i/o handler thread handles exclusively
183level 2 i/o. However, if an OS thread does the i/o handling for itself, i.e.,
184it uses synchronous aio, it can access any pages, as long as it obeys the
185access order rules. */
186
187/** Operations that can currently be buffered. */
188ibuf_use_t ibuf_use = IBUF_USE_ALL;
189
190#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
191/** Flag to control insert buffer debugging. */
192uint ibuf_debug;
193#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
194
195/** The insert buffer control structure */
196ibuf_t* ibuf = NULL;
197
198#ifdef UNIV_IBUF_COUNT_DEBUG
199/** Number of tablespaces in the ibuf_counts array */
200#define IBUF_COUNT_N_SPACES 4
201/** Number of pages within each tablespace in the ibuf_counts array */
202#define IBUF_COUNT_N_PAGES 130000
203
204/** Buffered entry counts for file pages, used in debugging */
205static ulint ibuf_counts[IBUF_COUNT_N_SPACES][IBUF_COUNT_N_PAGES];
206
207/** Checks that the indexes to ibuf_counts[][] are within limits.
208@param[in] page_id page id */
209UNIV_INLINE
210void
211ibuf_count_check(
212 const page_id_t& page_id)
213{
214 if (page_id.space() < IBUF_COUNT_N_SPACES
215 && page_id.page_no() < IBUF_COUNT_N_PAGES) {
216 return;
217 }
218
219 ib::fatal() << "UNIV_IBUF_COUNT_DEBUG limits space_id and page_no"
220 " and breaks crash recovery. space_id=" << page_id.space()
221 << ", should be 0<=space_id<" << IBUF_COUNT_N_SPACES
222 << ". page_no=" << page_id.page_no()
223 << ", should be 0<=page_no<" << IBUF_COUNT_N_PAGES;
224}
225#endif
226
227/** @name Offsets to the per-page bits in the insert buffer bitmap */
228/* @{ */
229#define IBUF_BITMAP_FREE 0 /*!< Bits indicating the
230 amount of free space */
231#define IBUF_BITMAP_BUFFERED 2 /*!< TRUE if there are buffered
232 changes for the page */
233#define IBUF_BITMAP_IBUF 3 /*!< TRUE if page is a part of
234 the ibuf tree, excluding the
235 root page, or is in the free
236 list of the ibuf */
237/* @} */
238
239#define IBUF_REC_FIELD_SPACE 0 /*!< in the pre-4.1 format,
240 the page number. later, the space_id */
241#define IBUF_REC_FIELD_MARKER 1 /*!< starting with 4.1, a marker
242 consisting of 1 byte that is 0 */
243#define IBUF_REC_FIELD_PAGE 2 /*!< starting with 4.1, the
244 page number */
245#define IBUF_REC_FIELD_METADATA 3 /* the metadata field */
246#define IBUF_REC_FIELD_USER 4 /* first user field */
247
248/* Various constants for checking the type of an ibuf record and extracting
249data from it. For details, see the description of the record format at the
250top of this file. */
251
252/** @name Format of the IBUF_REC_FIELD_METADATA of an insert buffer record
253The fourth column in the MySQL 5.5 format contains an operation
254type, counter, and some flags. */
255/* @{ */
256#define IBUF_REC_INFO_SIZE 4 /*!< Combined size of info fields at
257 the beginning of the fourth field */
258
259/* Offsets for the fields at the beginning of the fourth field */
260#define IBUF_REC_OFFSET_COUNTER 0 /*!< Operation counter */
261#define IBUF_REC_OFFSET_TYPE 2 /*!< Type of operation */
262#define IBUF_REC_OFFSET_FLAGS 3 /*!< Additional flags */
263
264/* Record flag masks */
265#define IBUF_REC_COMPACT 0x1 /*!< Set in
266 IBUF_REC_OFFSET_FLAGS if the
267 user index is in COMPACT
268 format or later */
269
270
271/** The mutex used to block pessimistic inserts to ibuf trees */
272static ib_mutex_t ibuf_pessimistic_insert_mutex;
273
274/** The mutex protecting the insert buffer structs */
275static ib_mutex_t ibuf_mutex;
276
277/** The mutex protecting the insert buffer bitmaps */
278static ib_mutex_t ibuf_bitmap_mutex;
279
280/** The area in pages from which contract looks for page numbers for merge */
281const ulint IBUF_MERGE_AREA = 8;
282
283/** Inside the merge area, pages which have at most 1 per this number less
buffered entries compared to maximum volume that can be buffered for a single
285page are merged along with the page whose buffer became full */
286const ulint IBUF_MERGE_THRESHOLD = 4;
287
288/** In ibuf_contract at most this number of pages is read to memory in one
289batch, in order to merge the entries for them in the insert buffer */
290const ulint IBUF_MAX_N_PAGES_MERGED = IBUF_MERGE_AREA;
291
292/** If the combined size of the ibuf trees exceeds ibuf->max_size by this
293many pages, we start to contract it in connection to inserts there, using
294non-synchronous contract */
295const ulint IBUF_CONTRACT_ON_INSERT_NON_SYNC = 0;
296
297/** If the combined size of the ibuf trees exceeds ibuf->max_size by this
298many pages, we start to contract it in connection to inserts there, using
299synchronous contract */
300const ulint IBUF_CONTRACT_ON_INSERT_SYNC = 5;
301
302/** If the combined size of the ibuf trees exceeds ibuf->max_size by
this many pages, we start to contract it using synchronous contract, but do
304not insert */
305const ulint IBUF_CONTRACT_DO_NOT_INSERT = 10;
306
307/* TODO: how to cope with drop table if there are records in the insert
308buffer for the indexes of the table? Is there actually any problem,
309because ibuf merge is done to a page when it is read in, and it is
310still physically like the index page even if the index would have been
311dropped! So, there seems to be no problem. */
312
313/******************************************************************//**
314Sets the flag in the current mini-transaction record indicating we're
315inside an insert buffer routine. */
316UNIV_INLINE
317void
318ibuf_enter(
319/*=======*/
320 mtr_t* mtr) /*!< in/out: mini-transaction */
321{
322 ut_ad(!mtr->is_inside_ibuf());
323 mtr->enter_ibuf();
324}
325
326/******************************************************************//**
327Sets the flag in the current mini-transaction record indicating we're
328exiting an insert buffer routine. */
329UNIV_INLINE
330void
331ibuf_exit(
332/*======*/
333 mtr_t* mtr) /*!< in/out: mini-transaction */
334{
335 ut_ad(mtr->is_inside_ibuf());
336 mtr->exit_ibuf();
337}
338
339/**************************************************************//**
340Commits an insert buffer mini-transaction and sets the persistent
341cursor latch mode to BTR_NO_LATCHES, that is, detaches the cursor. */
342UNIV_INLINE
343void
344ibuf_btr_pcur_commit_specify_mtr(
345/*=============================*/
346 btr_pcur_t* pcur, /*!< in/out: persistent cursor */
347 mtr_t* mtr) /*!< in/out: mini-transaction */
348{
349 ut_d(ibuf_exit(mtr));
350 btr_pcur_commit_specify_mtr(pcur, mtr);
351}
352
353/******************************************************************//**
354Gets the ibuf header page and x-latches it.
355@return insert buffer header page */
356static
357page_t*
358ibuf_header_page_get(
359/*=================*/
360 mtr_t* mtr) /*!< in/out: mini-transaction */
361{
362 buf_block_t* block;
363
364 ut_ad(!ibuf_inside(mtr));
365 page_t* page = NULL;
366
367 block = buf_page_get(
368 page_id_t(IBUF_SPACE_ID, FSP_IBUF_HEADER_PAGE_NO),
369 univ_page_size, RW_X_LATCH, mtr);
370
371
372 if (!block->page.encrypted) {
373 buf_block_dbg_add_level(block, SYNC_IBUF_HEADER);
374
375 page = buf_block_get_frame(block);
376 }
377
378 return page;
379}
380
381/******************************************************************//**
382Gets the root page and sx-latches it.
383@return insert buffer tree root page */
384static
385page_t*
386ibuf_tree_root_get(
387/*===============*/
388 mtr_t* mtr) /*!< in: mtr */
389{
390 buf_block_t* block;
391 page_t* root;
392
393 ut_ad(ibuf_inside(mtr));
394 ut_ad(mutex_own(&ibuf_mutex));
395
396 mtr_sx_lock(dict_index_get_lock(ibuf->index), mtr);
397
398 /* only segment list access is exclusive each other */
399 block = buf_page_get(
400 page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO),
401 univ_page_size, RW_SX_LATCH, mtr);
402
403 buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
404
405 root = buf_block_get_frame(block);
406
407 ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
408 ut_ad(page_get_page_no(root) == FSP_IBUF_TREE_ROOT_PAGE_NO);
409 ut_ad(ibuf->empty == page_is_empty(root));
410
411 return(root);
412}
413
414#ifdef UNIV_IBUF_COUNT_DEBUG
415
416/** Gets the ibuf count for a given page.
417@param[in] page_id page id
418@return number of entries in the insert buffer currently buffered for
419this page */
420ulint
421ibuf_count_get(
422 const page_id_t& page_id)
423{
424 ibuf_count_check(page_id);
425
426 return(ibuf_counts[page_id.space()][page_id.page_no()]);
427}
428
429/** Sets the ibuf count for a given page.
430@param[in] page_id page id
431@param[in] val value to set */
432static
433void
434ibuf_count_set(
435 const page_id_t& page_id,
436 ulint val)
437{
438 ibuf_count_check(page_id);
439 ut_a(val < srv_page_size);
440
441 ibuf_counts[page_id.space()][page_id.page_no()] = val;
442}
443#endif
444
445/******************************************************************//**
446Closes insert buffer and frees the data structures. */
447void
448ibuf_close(void)
449/*============*/
450{
451 if (ibuf == NULL) {
452 return;
453 }
454
455 mutex_free(&ibuf_pessimistic_insert_mutex);
456
457 mutex_free(&ibuf_mutex);
458
459 mutex_free(&ibuf_bitmap_mutex);
460
461 dict_table_t* ibuf_table = ibuf->index->table;
462 rw_lock_free(&ibuf->index->lock);
463 dict_mem_index_free(ibuf->index);
464 dict_mem_table_free(ibuf_table);
465
466 ut_free(ibuf);
467 ibuf = NULL;
468}
469
470/******************************************************************//**
471Updates the size information of the ibuf, assuming the segment size has not
472changed. */
473static
474void
475ibuf_size_update(
476/*=============*/
477 const page_t* root) /*!< in: ibuf tree root */
478{
479 ut_ad(mutex_own(&ibuf_mutex));
480
481 ibuf->free_list_len = flst_get_len(root + PAGE_HEADER
482 + PAGE_BTR_IBUF_FREE_LIST);
483
484 ibuf->height = 1 + btr_page_get_level(root);
485
486 /* the '1 +' is the ibuf header page */
487 ibuf->size = ibuf->seg_size - (1 + ibuf->free_list_len);
488}
489
490/******************************************************************//**
491Creates the insert buffer data structure at a database startup and initializes
492the data structures for the insert buffer.
493@return DB_SUCCESS or failure */
494dberr_t
495ibuf_init_at_db_start(void)
496/*=======================*/
497{
498 page_t* root;
499 mtr_t mtr;
500 ulint n_used;
501 page_t* header_page;
502 dberr_t error= DB_SUCCESS;
503
504 ibuf = static_cast<ibuf_t*>(ut_zalloc_nokey(sizeof(ibuf_t)));
505
506 /* At startup we intialize ibuf to have a maximum of
507 CHANGE_BUFFER_DEFAULT_SIZE in terms of percentage of the
508 buffer pool size. Once ibuf struct is initialized this
509 value is updated with the user supplied size by calling
510 ibuf_max_size_update(). */
511 ibuf->max_size = ((buf_pool_get_curr_size() >> srv_page_size_shift)
512 * CHANGE_BUFFER_DEFAULT_SIZE) / 100;
513
514 mutex_create(LATCH_ID_IBUF, &ibuf_mutex);
515
516 mutex_create(LATCH_ID_IBUF_BITMAP, &ibuf_bitmap_mutex);
517
518 mutex_create(LATCH_ID_IBUF_PESSIMISTIC_INSERT,
519 &ibuf_pessimistic_insert_mutex);
520
521 mtr_start(&mtr);
522
523 compile_time_assert(IBUF_SPACE_ID == TRX_SYS_SPACE);
524 compile_time_assert(IBUF_SPACE_ID == 0);
525 mtr_x_lock(&fil_system.sys_space->latch, &mtr);
526
527 mutex_enter(&ibuf_mutex);
528
529 header_page = ibuf_header_page_get(&mtr);
530
531 if (!header_page) {
532 return (DB_DECRYPTION_FAILED);
533 }
534
535 fseg_n_reserved_pages(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
536 &n_used, &mtr);
537
538 ut_ad(n_used >= 2);
539
540 ibuf->seg_size = n_used;
541
542 {
543 buf_block_t* block;
544
545 block = buf_page_get(
546 page_id_t(IBUF_SPACE_ID, FSP_IBUF_TREE_ROOT_PAGE_NO),
547 univ_page_size, RW_X_LATCH, &mtr);
548
549 buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
550
551 root = buf_block_get_frame(block);
552 }
553
554 ibuf_size_update(root);
555 mutex_exit(&ibuf_mutex);
556
557 ibuf->empty = page_is_empty(root);
558 mtr.commit();
559
560 ibuf->index = dict_mem_index_create(
561 dict_mem_table_create("innodb_change_buffer",
562 fil_system.sys_space, 1, 0, 0, 0),
563 "CLUST_IND",
564 DICT_CLUSTERED | DICT_IBUF, 1);
565 ibuf->index->id = DICT_IBUF_ID_MIN + IBUF_SPACE_ID;
566 ibuf->index->n_uniq = REC_MAX_N_FIELDS;
567 rw_lock_create(index_tree_rw_lock_key, &ibuf->index->lock,
568 SYNC_IBUF_INDEX_TREE);
569#ifdef BTR_CUR_ADAPT
570 ibuf->index->search_info = btr_search_info_create(ibuf->index->heap);
571#endif /* BTR_CUR_ADAPT */
572 ibuf->index->page = FSP_IBUF_TREE_ROOT_PAGE_NO;
573 ut_d(ibuf->index->cached = TRUE);
574 return (error);
575}
576
577/*********************************************************************//**
578Updates the max_size value for ibuf. */
579void
580ibuf_max_size_update(
581/*=================*/
582 ulint new_val) /*!< in: new value in terms of
583 percentage of the buffer pool size */
584{
585 ulint new_size = ((buf_pool_get_curr_size() >> srv_page_size_shift)
586 * new_val) / 100;
587 mutex_enter(&ibuf_mutex);
588 ibuf->max_size = new_size;
589 mutex_exit(&ibuf_mutex);
590}
591
592
593/*********************************************************************//**
594Initializes an ibuf bitmap page. */
595void
596ibuf_bitmap_page_init(
597/*==================*/
598 buf_block_t* block, /*!< in: bitmap page */
599 mtr_t* mtr) /*!< in: mtr */
600{
601 page_t* page;
602 ulint byte_offset;
603
604 page = buf_block_get_frame(block);
605 fil_page_set_type(page, FIL_PAGE_IBUF_BITMAP);
606
607 /* Write all zeros to the bitmap */
608 compile_time_assert(!(IBUF_BITS_PER_PAGE % 2));
609
610 byte_offset = UT_BITS_IN_BYTES(block->page.size.physical()
611 * IBUF_BITS_PER_PAGE);
612
613 memset(page + IBUF_BITMAP, 0, byte_offset);
614
615 /* The remaining area (up to the page trailer) is uninitialized. */
616 mlog_write_initial_log_record(page, MLOG_IBUF_BITMAP_INIT, mtr);
617}
618
619/*********************************************************************//**
620Parses a redo log record of an ibuf bitmap page init.
621@return end of log record or NULL */
622byte*
623ibuf_parse_bitmap_init(
624/*===================*/
625 byte* ptr, /*!< in: buffer */
626 byte* end_ptr MY_ATTRIBUTE((unused)), /*!< in: buffer end */
627 buf_block_t* block, /*!< in: block or NULL */
628 mtr_t* mtr) /*!< in: mtr or NULL */
629{
630 ut_ad(ptr != NULL);
631 ut_ad(end_ptr != NULL);
632
633 if (block) {
634 ibuf_bitmap_page_init(block, mtr);
635 }
636
637 return(ptr);
638}
639
640# ifdef UNIV_DEBUG
641/** Gets the desired bits for a given page from a bitmap page.
642@param[in] page bitmap page
643@param[in] page_id page id whose bits to get
@param[in]	page_size	page size
645@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
646@param[in,out] mtr mini-transaction holding an x-latch on the
647bitmap page
648@return value of bits */
649# define ibuf_bitmap_page_get_bits(page, page_id, page_size, bit, mtr) \
650 ibuf_bitmap_page_get_bits_low(page, page_id, page_size, \
651 MTR_MEMO_PAGE_X_FIX, mtr, bit)
652# else /* UNIV_DEBUG */
653/** Gets the desired bits for a given page from a bitmap page.
654@param[in] page bitmap page
655@param[in] page_id page id whose bits to get
@param[in]	page_size	page size
657@param[in] bit IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
658@param[in,out] mtr mini-transaction holding an x-latch on the
659bitmap page
660@return value of bits */
661# define ibuf_bitmap_page_get_bits(page, page_id, page_size, bit, mtr) \
662 ibuf_bitmap_page_get_bits_low(page, page_id, page_size, bit)
663# endif /* UNIV_DEBUG */
664
/** Gets the desired bits for a given page from a bitmap page.
@param[in]	page		bitmap page
@param[in]	page_id		page id whose bits to get
@param[in]	page_size	page size
@param[in]	latch_type	MTR_MEMO_PAGE_X_FIX, MTR_MEMO_BUF_FIX, ...
@param[in,out]	mtr		mini-transaction holding latch_type on the
bitmap page
@param[in]	bit		IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
@return value of bits: 0..3 for IBUF_BITMAP_FREE (two adjacent bits),
otherwise 0 or 1 */
UNIV_INLINE
ulint
ibuf_bitmap_page_get_bits_low(
	const page_t*		page,
	const page_id_t&	page_id,
	const page_size_t&	page_size,
#ifdef UNIV_DEBUG
	ulint			latch_type,
	mtr_t*			mtr,
#endif /* UNIV_DEBUG */
	ulint			bit)
{
	ulint	byte_offset;
	ulint	bit_offset;
	ulint	map_byte;
	ulint	value;

	ut_ad(bit < IBUF_BITS_PER_PAGE);
	compile_time_assert(!(IBUF_BITS_PER_PAGE % 2));
	ut_ad(mtr_memo_contains_page(mtr, page, latch_type));

	/* Each page is described by IBUF_BITS_PER_PAGE bits; find the
	position of this page's run of bits within the bitmap page. */
	bit_offset = (page_id.page_no() % page_size.physical())
		* IBUF_BITS_PER_PAGE + bit;

	byte_offset = bit_offset / 8;
	bit_offset = bit_offset % 8;

	ut_ad(byte_offset + IBUF_BITMAP < srv_page_size);

	map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset);

	value = ut_bit_get_nth(map_byte, bit_offset);

	if (bit == IBUF_BITMAP_FREE) {
		/* The free-space measure spans two adjacent bits;
		combine them into a 2-bit value. */
		ut_ad(bit_offset + 1 < 8);

		value = value * 2 + ut_bit_get_nth(map_byte, bit_offset + 1);
	}

	return(value);
}
715
/** Sets the desired bit for a given page in a bitmap page.
@param[in,out]	page		bitmap page
@param[in]	page_id		page id whose bits to set
@param[in]	page_size	page size
@param[in]	bit		IBUF_BITMAP_FREE, IBUF_BITMAP_BUFFERED, ...
@param[in]	val		value to set: 0..3 for IBUF_BITMAP_FREE
(stored in two adjacent bits), otherwise 0 or 1
@param[in,out]	mtr		mtr containing an x-latch to the bitmap page */
static
void
ibuf_bitmap_page_set_bits(
	page_t*			page,
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	ulint			bit,
	ulint			val,
	mtr_t*			mtr)
{
	ulint	byte_offset;
	ulint	bit_offset;
	ulint	map_byte;

	ut_ad(bit < IBUF_BITS_PER_PAGE);
	compile_time_assert(!(IBUF_BITS_PER_PAGE % 2));
	ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
	ut_ad(mtr->is_named_space(page_id.space()));
#ifdef UNIV_IBUF_COUNT_DEBUG
	/* Clearing the 'buffered' bit is only valid when no entries
	remain buffered for the page. */
	ut_a((bit != IBUF_BITMAP_BUFFERED) || (val != FALSE)
	     || (0 == ibuf_count_get(page_id)));
#endif

	/* Each page is described by IBUF_BITS_PER_PAGE bits; find the
	position of this page's run of bits within the bitmap page. */
	bit_offset = (page_id.page_no() % page_size.physical())
		* IBUF_BITS_PER_PAGE + bit;

	byte_offset = bit_offset / 8;
	bit_offset = bit_offset % 8;

	ut_ad(byte_offset + IBUF_BITMAP < srv_page_size);

	map_byte = mach_read_from_1(page + IBUF_BITMAP + byte_offset);

	if (bit == IBUF_BITMAP_FREE) {
		/* The free-space measure spans two adjacent bits. */
		ut_ad(bit_offset + 1 < 8);
		ut_ad(val <= 3);

		map_byte = ut_bit_set_nth(map_byte, bit_offset, val / 2);
		map_byte = ut_bit_set_nth(map_byte, bit_offset + 1, val % 2);
	} else {
		ut_ad(val <= 1);
		map_byte = ut_bit_set_nth(map_byte, bit_offset, val);
	}

	/* Write the updated byte back with redo logging. */
	mlog_write_ulint(page + IBUF_BITMAP + byte_offset, map_byte,
			 MLOG_1BYTE, mtr);
}
770
771/** Calculates the bitmap page number for a given page number.
772@param[in] page_id page id
773@param[in] page_size page size
774@return the bitmap page id where the file page is mapped */
775UNIV_INLINE
776const page_id_t
777ibuf_bitmap_page_no_calc(
778 const page_id_t& page_id,
779 const page_size_t& page_size)
780{
781 ulint bitmap_page_no;
782
783 bitmap_page_no = FSP_IBUF_BITMAP_OFFSET
784 + (page_id.page_no() & ~(page_size.physical() - 1));
785
786 return(page_id_t(page_id.space(), bitmap_page_no));
787}
788
789/** Gets the ibuf bitmap page where the bits describing a given file page are
790stored.
791@param[in] page_id page id of the file page
792@param[in] page_size page size of the file page
793@param[in] file file name
794@param[in] line line where called
795@param[in,out] mtr mini-transaction
796@return bitmap page where the file page is mapped, that is, the bitmap
797page containing the descriptor bits for the file page; the bitmap page
798is x-latched */
799static
800page_t*
801ibuf_bitmap_get_map_page_func(
802 const page_id_t& page_id,
803 const page_size_t& page_size,
804 const char* file,
805 unsigned line,
806 mtr_t* mtr)
807{
808 buf_block_t* block = NULL;
809 dberr_t err = DB_SUCCESS;
810
811 block = buf_page_get_gen(ibuf_bitmap_page_no_calc(page_id, page_size),
812 page_size, RW_X_LATCH, NULL, BUF_GET,
813 file, line, mtr, &err);
814
815 if (err != DB_SUCCESS) {
816 return NULL;
817 }
818
819
820 buf_block_dbg_add_level(block, SYNC_IBUF_BITMAP);
821
822 return(buf_block_get_frame(block));
823}
824
825/** Gets the ibuf bitmap page where the bits describing a given file page are
826stored.
827@param[in] page_id page id of the file page
828@param[in] page_size page size of the file page
829@param[in,out] mtr mini-transaction
830@return bitmap page where the file page is mapped, that is, the bitmap
831page containing the descriptor bits for the file page; the bitmap page
832is x-latched */
833#define ibuf_bitmap_get_map_page(page_id, page_size, mtr) \
834 ibuf_bitmap_get_map_page_func(page_id, page_size, \
835 __FILE__, __LINE__, mtr)
836
837/************************************************************************//**
838Sets the free bits of the page in the ibuf bitmap. This is done in a separate
839mini-transaction, hence this operation does not restrict further work to only
840ibuf bitmap operations, which would result if the latch to the bitmap page
841were kept. */
842UNIV_INLINE
843void
844ibuf_set_free_bits_low(
845/*===================*/
846 const buf_block_t* block, /*!< in: index page; free bits are set if
847 the index is non-clustered and page
848 level is 0 */
849 ulint val, /*!< in: value to set: < 4 */
850 mtr_t* mtr) /*!< in/out: mtr */
851{
852 page_t* bitmap_page;
853 buf_frame_t* frame;
854
855 ut_ad(mtr->is_named_space(block->page.id.space()));
856
857 if (!block) {
858 return;
859 }
860
861 frame = buf_block_get_frame(block);
862
863 if (!frame || !page_is_leaf(frame)) {
864 return;
865 }
866
867 bitmap_page = ibuf_bitmap_get_map_page(block->page.id,
868 block->page.size, mtr);
869
870#ifdef UNIV_IBUF_DEBUG
871 ut_a(val <= ibuf_index_page_calc_free(block));
872#endif /* UNIV_IBUF_DEBUG */
873
874 ibuf_bitmap_page_set_bits(
875 bitmap_page, block->page.id, block->page.size,
876 IBUF_BITMAP_FREE, val, mtr);
877}
878
879/************************************************************************//**
880Sets the free bit of the page in the ibuf bitmap. This is done in a separate
881mini-transaction, hence this operation does not restrict further work to only
882ibuf bitmap operations, which would result if the latch to the bitmap page
883were kept. */
884void
885ibuf_set_free_bits_func(
886/*====================*/
887 buf_block_t* block, /*!< in: index page of a non-clustered index;
888 free bit is reset if page level is 0 */
889#ifdef UNIV_IBUF_DEBUG
890 ulint max_val,/*!< in: ULINT_UNDEFINED or a maximum
891 value which the bits must have before
892 setting; this is for debugging */
893#endif /* UNIV_IBUF_DEBUG */
894 ulint val) /*!< in: value to set: < 4 */
895{
896 mtr_t mtr;
897 page_t* page;
898 page_t* bitmap_page;
899
900 page = buf_block_get_frame(block);
901
902 if (!page_is_leaf(page)) {
903
904 return;
905 }
906
907 mtr_start(&mtr);
908 const fil_space_t* space = mtr.set_named_space_id(
909 block->page.id.space());
910
911 bitmap_page = ibuf_bitmap_get_map_page(block->page.id,
912 block->page.size, &mtr);
913
914 switch (space->purpose) {
915 case FIL_TYPE_LOG:
916 ut_ad(0);
917 break;
918 case FIL_TYPE_TABLESPACE:
919 /* Avoid logging while fixing up truncate of table. */
920 if (!srv_is_tablespace_truncated(block->page.id.space())) {
921 break;
922 }
923 /* fall through */
924 case FIL_TYPE_TEMPORARY:
925 case FIL_TYPE_IMPORT:
926 mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);
927 }
928
929#ifdef UNIV_IBUF_DEBUG
930 if (max_val != ULINT_UNDEFINED) {
931 ulint old_val;
932
933 old_val = ibuf_bitmap_page_get_bits(
934 bitmap_page, block->page.id,
935 IBUF_BITMAP_FREE, &mtr);
936# if 0
937 if (old_val != max_val) {
938 fprintf(stderr,
939 "Ibuf: page %lu old val %lu max val %lu\n",
940 page_get_page_no(page),
941 old_val, max_val);
942 }
943# endif
944
945 ut_a(old_val <= max_val);
946 }
947# if 0
948 fprintf(stderr, "Setting page no %lu free bits to %lu should be %lu\n",
949 page_get_page_no(page), val,
950 ibuf_index_page_calc_free(block));
951# endif
952
953 ut_a(val <= ibuf_index_page_calc_free(block));
954#endif /* UNIV_IBUF_DEBUG */
955
956 ibuf_bitmap_page_set_bits(
957 bitmap_page, block->page.id, block->page.size,
958 IBUF_BITMAP_FREE, val, &mtr);
959
960 mtr_commit(&mtr);
961}
962
963/************************************************************************//**
964Resets the free bits of the page in the ibuf bitmap. This is done in a
965separate mini-transaction, hence this operation does not restrict
966further work to only ibuf bitmap operations, which would result if the
967latch to the bitmap page were kept. NOTE: The free bits in the insert
968buffer bitmap must never exceed the free space on a page. It is safe
969to decrement or reset the bits in the bitmap in a mini-transaction
970that is committed before the mini-transaction that affects the free
971space. */
972void
973ibuf_reset_free_bits(
974/*=================*/
975 buf_block_t* block) /*!< in: index page; free bits are set to 0
976 if the index is a non-clustered
977 non-unique, and page level is 0 */
978{
979 ibuf_set_free_bits(block, 0, ULINT_UNDEFINED);
980}
981
982/**********************************************************************//**
983Updates the free bits for an uncompressed page to reflect the present
984state. Does this in the mtr given, which means that the latching
985order rules virtually prevent any further operations for this OS
986thread until mtr is committed. NOTE: The free bits in the insert
987buffer bitmap must never exceed the free space on a page. It is safe
988to set the free bits in the same mini-transaction that updated the
989page. */
990void
991ibuf_update_free_bits_low(
992/*======================*/
993 const buf_block_t* block, /*!< in: index page */
994 ulint max_ins_size, /*!< in: value of
995 maximum insert size
996 with reorganize before
997 the latest operation
998 performed to the page */
999 mtr_t* mtr) /*!< in/out: mtr */
1000{
1001 ulint before;
1002 ulint after;
1003
1004 ut_a(!buf_block_get_page_zip(block));
1005 ut_ad(mtr->is_named_space(block->page.id.space()));
1006
1007 before = ibuf_index_page_calc_free_bits(block->page.size.logical(),
1008 max_ins_size);
1009
1010 after = ibuf_index_page_calc_free(block);
1011
1012 /* This approach cannot be used on compressed pages, since the
1013 computed value of "before" often does not match the current
1014 state of the bitmap. This is because the free space may
1015 increase or decrease when a compressed page is reorganized. */
1016 if (before != after) {
1017 ibuf_set_free_bits_low(block, after, mtr);
1018 }
1019}
1020
1021/**********************************************************************//**
1022Updates the free bits for a compressed page to reflect the present
1023state. Does this in the mtr given, which means that the latching
1024order rules virtually prevent any further operations for this OS
1025thread until mtr is committed. NOTE: The free bits in the insert
1026buffer bitmap must never exceed the free space on a page. It is safe
1027to set the free bits in the same mini-transaction that updated the
1028page. */
1029void
1030ibuf_update_free_bits_zip(
1031/*======================*/
1032 buf_block_t* block, /*!< in/out: index page */
1033 mtr_t* mtr) /*!< in/out: mtr */
1034{
1035 page_t* bitmap_page;
1036 ulint after;
1037
1038 ut_a(block);
1039 buf_frame_t* frame = buf_block_get_frame(block);
1040 ut_a(frame);
1041 ut_a(page_is_leaf(frame));
1042 ut_a(block->page.size.is_compressed());
1043
1044 bitmap_page = ibuf_bitmap_get_map_page(block->page.id,
1045 block->page.size, mtr);
1046
1047 after = ibuf_index_page_calc_free_zip(block);
1048
1049 if (after == 0) {
1050 /* We move the page to the front of the buffer pool LRU list:
1051 the purpose of this is to prevent those pages to which we
1052 cannot make inserts using the insert buffer from slipping
1053 out of the buffer pool */
1054
1055 buf_page_make_young(&block->page);
1056 }
1057
1058 ibuf_bitmap_page_set_bits(
1059 bitmap_page, block->page.id, block->page.size,
1060 IBUF_BITMAP_FREE, after, mtr);
1061}
1062
1063/**********************************************************************//**
1064Updates the free bits for the two pages to reflect the present state.
1065Does this in the mtr given, which means that the latching order rules
1066virtually prevent any further operations until mtr is committed.
1067NOTE: The free bits in the insert buffer bitmap must never exceed the
1068free space on a page. It is safe to set the free bits in the same
1069mini-transaction that updated the pages. */
1070void
1071ibuf_update_free_bits_for_two_pages_low(
1072/*====================================*/
1073 buf_block_t* block1, /*!< in: index page */
1074 buf_block_t* block2, /*!< in: index page */
1075 mtr_t* mtr) /*!< in: mtr */
1076{
1077 ulint state;
1078
1079 ut_ad(mtr->is_named_space(block1->page.id.space()));
1080 ut_ad(block1->page.id.space() == block2->page.id.space());
1081
1082 /* As we have to x-latch two random bitmap pages, we have to acquire
1083 the bitmap mutex to prevent a deadlock with a similar operation
1084 performed by another OS thread. */
1085
1086 mutex_enter(&ibuf_bitmap_mutex);
1087
1088 state = ibuf_index_page_calc_free(block1);
1089
1090 ibuf_set_free_bits_low(block1, state, mtr);
1091
1092 state = ibuf_index_page_calc_free(block2);
1093
1094 ibuf_set_free_bits_low(block2, state, mtr);
1095
1096 mutex_exit(&ibuf_bitmap_mutex);
1097}
1098
1099/** Returns TRUE if the page is one of the fixed address ibuf pages.
1100@param[in] page_id page id
1101@param[in] page_size page size
1102@return TRUE if a fixed address ibuf i/o page */
1103UNIV_INLINE
1104ibool
1105ibuf_fixed_addr_page(
1106 const page_id_t& page_id,
1107 const page_size_t& page_size)
1108{
1109 return((page_id.space() == IBUF_SPACE_ID
1110 && page_id.page_no() == IBUF_TREE_ROOT_PAGE_NO)
1111 || ibuf_bitmap_page(page_id, page_size));
1112}
1113
/** Checks if a page is a level 2 or 3 page in the ibuf hierarchy of pages.
Must not be called when recv_no_ibuf_operations==true.
@param[in]	page_id		page id
@param[in]	page_size	page size
@param[in]	x_latch		FALSE if relaxed check (avoid latching the
bitmap page)
@param[in]	file		file name
@param[in]	line		line where called
@param[in,out]	mtr		mtr which will contain an x-latch to the
bitmap page if the page is not one of the fixed address ibuf pages, or NULL,
in which case a new mini-transaction is created and committed internally.
@return TRUE if level 2 or level 3 page */
ibool
ibuf_page_low(
	const page_id_t&	page_id,
	const page_size_t&	page_size,
#ifdef UNIV_DEBUG
	ibool			x_latch,
#endif /* UNIV_DEBUG */
	const char*		file,
	unsigned		line,
	mtr_t*			mtr)
{
	ibool	ret;
	mtr_t	local_mtr;
	page_t*	bitmap_page;

	ut_ad(!recv_no_ibuf_operations);
	ut_ad(x_latch || mtr == NULL);

	/* Fixed-address ibuf pages can be recognized without consulting
	the bitmap at all. */
	if (ibuf_fixed_addr_page(page_id, page_size)) {

		return(TRUE);
	} else if (page_id.space() != IBUF_SPACE_ID) {
		/* The change buffer tree only lives in the system
		tablespace. */
		return(FALSE);
	}

	compile_time_assert(IBUF_SPACE_ID == 0);
	ut_ad(fil_system.sys_space->purpose == FIL_TYPE_TABLESPACE);

#ifdef UNIV_DEBUG
	if (!x_latch) {
		mtr_start(&local_mtr);

		/* Get the bitmap page without a page latch, so that
		we will not be violating the latching order when
		another bitmap page has already been latched by this
		thread. The page will be buffer-fixed, and thus it
		cannot be removed or relocated while we are looking at
		it. The contents of the page could change, but the
		IBUF_BITMAP_IBUF bit that we are interested in should
		not be modified by any other thread. Nobody should be
		calling ibuf_add_free_page() or ibuf_remove_free_page()
		while the page is linked to the insert buffer b-tree. */
		dberr_t err = DB_SUCCESS;

		buf_block_t* block = buf_page_get_gen(
			ibuf_bitmap_page_no_calc(page_id, page_size),
			page_size, RW_NO_LATCH, NULL, BUF_GET_NO_LATCH,
			file, line, &local_mtr, &err);

		bitmap_page = buf_block_get_frame(block);

		ret = ibuf_bitmap_page_get_bits_low(
			bitmap_page, page_id, page_size,
			MTR_MEMO_BUF_FIX, &local_mtr, IBUF_BITMAP_IBUF);

		mtr_commit(&local_mtr);
		return(ret);
	}
#endif /* UNIV_DEBUG */

	/* If the caller did not supply a mini-transaction, use a local
	one that is committed before returning. */
	if (mtr == NULL) {
		mtr = &local_mtr;
		mtr_start(mtr);
	}

	bitmap_page = ibuf_bitmap_get_map_page_func(page_id, page_size,
						    file, line, mtr);

	ret = ibuf_bitmap_page_get_bits(bitmap_page, page_id, page_size,
					IBUF_BITMAP_IBUF, mtr);

	if (mtr == &local_mtr) {
		mtr_commit(mtr);
	}

	return(ret);
}
1204
1205#ifdef UNIV_DEBUG
1206# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(mtr,rec)
1207#else /* UNIV_DEBUG */
1208# define ibuf_rec_get_page_no(mtr,rec) ibuf_rec_get_page_no_func(rec)
1209#endif /* UNIV_DEBUG */
1210
1211/********************************************************************//**
1212Returns the page number field of an ibuf record.
1213@return page number */
1214static
1215ulint
1216ibuf_rec_get_page_no_func(
1217/*======================*/
1218#ifdef UNIV_DEBUG
1219 mtr_t* mtr, /*!< in: mini-transaction owning rec */
1220#endif /* UNIV_DEBUG */
1221 const rec_t* rec) /*!< in: ibuf record */
1222{
1223 const byte* field;
1224 ulint len;
1225
1226 ut_ad(mtr_memo_contains_page_flagged(mtr, rec,
1227 MTR_MEMO_PAGE_X_FIX
1228 | MTR_MEMO_PAGE_S_FIX));
1229 ut_ad(ibuf_inside(mtr));
1230 ut_ad(rec_get_n_fields_old(rec) > 2);
1231
1232 field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
1233
1234 ut_a(len == 1);
1235
1236 field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len);
1237
1238 ut_a(len == 4);
1239
1240 return(mach_read_from_4(field));
1241}
1242
1243#ifdef UNIV_DEBUG
1244# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(mtr,rec)
1245#else /* UNIV_DEBUG */
1246# define ibuf_rec_get_space(mtr,rec) ibuf_rec_get_space_func(rec)
1247#endif /* UNIV_DEBUG */
1248
1249/********************************************************************//**
1250Returns the space id field of an ibuf record. For < 4.1.x format records
1251returns 0.
1252@return space id */
1253static
1254ulint
1255ibuf_rec_get_space_func(
1256/*====================*/
1257#ifdef UNIV_DEBUG
1258 mtr_t* mtr, /*!< in: mini-transaction owning rec */
1259#endif /* UNIV_DEBUG */
1260 const rec_t* rec) /*!< in: ibuf record */
1261{
1262 const byte* field;
1263 ulint len;
1264
1265 ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX
1266 | MTR_MEMO_PAGE_S_FIX));
1267 ut_ad(ibuf_inside(mtr));
1268 ut_ad(rec_get_n_fields_old(rec) > 2);
1269
1270 field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
1271
1272 ut_a(len == 1);
1273
1274 field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
1275
1276 ut_a(len == 4);
1277
1278 return(mach_read_from_4(field));
1279}
1280
1281#ifdef UNIV_DEBUG
1282# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \
1283 ibuf_rec_get_info_func(mtr,rec,op,comp,info_len,counter)
1284#else /* UNIV_DEBUG */
1285# define ibuf_rec_get_info(mtr,rec,op,comp,info_len,counter) \
1286 ibuf_rec_get_info_func(rec,op,comp,info_len,counter)
1287#endif
1288/****************************************************************//**
1289Get various information about an ibuf record in >= 4.1.x format. */
1290static
1291void
1292ibuf_rec_get_info_func(
1293/*===================*/
1294#ifdef UNIV_DEBUG
1295 mtr_t* mtr, /*!< in: mini-transaction owning rec */
1296#endif /* UNIV_DEBUG */
1297 const rec_t* rec, /*!< in: ibuf record */
1298 ibuf_op_t* op, /*!< out: operation type, or NULL */
1299 ibool* comp, /*!< out: compact flag, or NULL */
1300 ulint* info_len, /*!< out: length of info fields at the
1301 start of the fourth field, or
1302 NULL */
1303 ulint* counter) /*!< in: counter value, or NULL */
1304{
1305 const byte* types;
1306 ulint fields;
1307 ulint len;
1308
1309 /* Local variables to shadow arguments. */
1310 ibuf_op_t op_local;
1311 ibool comp_local;
1312 ulint info_len_local;
1313 ulint counter_local;
1314
1315 ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX
1316 | MTR_MEMO_PAGE_S_FIX));
1317 ut_ad(ibuf_inside(mtr));
1318 fields = rec_get_n_fields_old(rec);
1319 ut_a(fields > IBUF_REC_FIELD_USER);
1320
1321 types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
1322
1323 info_len_local = len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
1324 compile_time_assert(IBUF_REC_INFO_SIZE
1325 < DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
1326
1327 switch (info_len_local) {
1328 case 0:
1329 case 1:
1330 op_local = IBUF_OP_INSERT;
1331 comp_local = info_len_local;
1332 ut_ad(!counter);
1333 counter_local = ULINT_UNDEFINED;
1334 break;
1335
1336 case IBUF_REC_INFO_SIZE:
1337 op_local = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE];
1338 comp_local = types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT;
1339 counter_local = mach_read_from_2(
1340 types + IBUF_REC_OFFSET_COUNTER);
1341 break;
1342
1343 default:
1344 ut_error;
1345 }
1346
1347 ut_a(op_local < IBUF_OP_COUNT);
1348 ut_a((len - info_len_local) ==
1349 (fields - IBUF_REC_FIELD_USER)
1350 * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
1351
1352 if (op) {
1353 *op = op_local;
1354 }
1355
1356 if (comp) {
1357 *comp = comp_local;
1358 }
1359
1360 if (info_len) {
1361 *info_len = info_len_local;
1362 }
1363
1364 if (counter) {
1365 *counter = counter_local;
1366 }
1367}
1368
1369#ifdef UNIV_DEBUG
1370# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(mtr,rec)
1371#else /* UNIV_DEBUG */
1372# define ibuf_rec_get_op_type(mtr,rec) ibuf_rec_get_op_type_func(rec)
1373#endif
1374
1375/****************************************************************//**
1376Returns the operation type field of an ibuf record.
1377@return operation type */
1378static
1379ibuf_op_t
1380ibuf_rec_get_op_type_func(
1381/*======================*/
1382#ifdef UNIV_DEBUG
1383 mtr_t* mtr, /*!< in: mini-transaction owning rec */
1384#endif /* UNIV_DEBUG */
1385 const rec_t* rec) /*!< in: ibuf record */
1386{
1387 ulint len;
1388
1389 ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX
1390 | MTR_MEMO_PAGE_S_FIX));
1391 ut_ad(ibuf_inside(mtr));
1392 ut_ad(rec_get_n_fields_old(rec) > 2);
1393
1394 (void) rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
1395
1396 if (len > 1) {
1397 /* This is a < 4.1.x format record */
1398
1399 return(IBUF_OP_INSERT);
1400 } else {
1401 ibuf_op_t op;
1402
1403 ibuf_rec_get_info(mtr, rec, &op, NULL, NULL, NULL);
1404
1405 return(op);
1406 }
1407}
1408
1409/****************************************************************//**
1410Read the first two bytes from a record's fourth field (counter field in new
1411records; something else in older records).
1412@return "counter" field, or ULINT_UNDEFINED if for some reason it
1413can't be read */
1414ulint
1415ibuf_rec_get_counter(
1416/*=================*/
1417 const rec_t* rec) /*!< in: ibuf record */
1418{
1419 const byte* ptr;
1420 ulint len;
1421
1422 if (rec_get_n_fields_old(rec) <= IBUF_REC_FIELD_METADATA) {
1423
1424 return(ULINT_UNDEFINED);
1425 }
1426
1427 ptr = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
1428
1429 if (len >= 2) {
1430
1431 return(mach_read_from_2(ptr));
1432 } else {
1433
1434 return(ULINT_UNDEFINED);
1435 }
1436}
1437
1438/****************************************************************//**
1439Add accumulated operation counts to a permanent array. Both arrays must be
1440of size IBUF_OP_COUNT. */
1441static
1442void
1443ibuf_add_ops(
1444/*=========*/
1445 ulint* arr, /*!< in/out: array to modify */
1446 const ulint* ops) /*!< in: operation counts */
1447
1448{
1449 ulint i;
1450
1451 for (i = 0; i < IBUF_OP_COUNT; i++) {
1452 my_atomic_addlint(&arr[i], ops[i]);
1453 }
1454}
1455
1456/****************************************************************//**
1457Print operation counts. The array must be of size IBUF_OP_COUNT. */
1458static
1459void
1460ibuf_print_ops(
1461/*===========*/
1462 const ulint* ops, /*!< in: operation counts */
1463 FILE* file) /*!< in: file where to print */
1464{
1465 static const char* op_names[] = {
1466 "insert",
1467 "delete mark",
1468 "delete"
1469 };
1470 ulint i;
1471
1472 ut_a(UT_ARR_SIZE(op_names) == IBUF_OP_COUNT);
1473
1474 for (i = 0; i < IBUF_OP_COUNT; i++) {
1475 fprintf(file, "%s " ULINTPF "%s", op_names[i],
1476 ops[i], (i < (IBUF_OP_COUNT - 1)) ? ", " : "");
1477 }
1478
1479 putc('\n', file);
1480}
1481
1482/********************************************************************//**
1483Creates a dummy index for inserting a record to a non-clustered index.
1484@return dummy index */
1485static
1486dict_index_t*
1487ibuf_dummy_index_create(
1488/*====================*/
1489 ulint n, /*!< in: number of fields */
1490 ibool comp) /*!< in: TRUE=use compact record format */
1491{
1492 dict_table_t* table;
1493 dict_index_t* index;
1494
1495 table = dict_mem_table_create("IBUF_DUMMY", NULL, n, 0,
1496 comp ? DICT_TF_COMPACT : 0, 0);
1497
1498 index = dict_mem_index_create(table, "IBUF_DUMMY", 0, n);
1499
1500 /* avoid ut_ad(index->cached) in dict_index_get_n_unique_in_tree */
1501 index->cached = TRUE;
1502 ut_d(index->is_dummy = true);
1503
1504 return(index);
1505}
1506/********************************************************************//**
1507Add a column to the dummy index */
1508static
1509void
1510ibuf_dummy_index_add_col(
1511/*=====================*/
1512 dict_index_t* index, /*!< in: dummy index */
1513 const dtype_t* type, /*!< in: the data type of the column */
1514 ulint len) /*!< in: length of the column */
1515{
1516 ulint i = index->table->n_def;
1517 dict_mem_table_add_col(index->table, NULL, NULL,
1518 dtype_get_mtype(type),
1519 dtype_get_prtype(type),
1520 dtype_get_len(type));
1521 dict_index_add_col(index, index->table,
1522 dict_table_get_nth_col(index->table, i), len);
1523}
1524/********************************************************************//**
1525Deallocates a dummy index for inserting a record to a non-clustered index. */
1526static
1527void
1528ibuf_dummy_index_free(
1529/*==================*/
1530 dict_index_t* index) /*!< in, own: dummy index */
1531{
1532 dict_table_t* table = index->table;
1533
1534 dict_mem_index_free(index);
1535 dict_mem_table_free(table);
1536}
1537
1538#ifdef UNIV_DEBUG
1539# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex) \
1540 ibuf_build_entry_from_ibuf_rec_func(mtr,ibuf_rec,heap,pindex)
1541#else /* UNIV_DEBUG */
1542# define ibuf_build_entry_from_ibuf_rec(mtr,ibuf_rec,heap,pindex) \
1543 ibuf_build_entry_from_ibuf_rec_func(ibuf_rec,heap,pindex)
1544#endif
1545
1546/*********************************************************************//**
1547Builds the entry used to
1548
15491) IBUF_OP_INSERT: insert into a non-clustered index
1550
15512) IBUF_OP_DELETE_MARK: find the record whose delete-mark flag we need to
1552 activate
1553
15543) IBUF_OP_DELETE: find the record we need to delete
1555
1556when we have the corresponding record in an ibuf index.
1557
1558NOTE that as we copy pointers to fields in ibuf_rec, the caller must
1559hold a latch to the ibuf_rec page as long as the entry is used!
1560
1561@return own: entry to insert to a non-clustered index */
1562static
1563dtuple_t*
1564ibuf_build_entry_from_ibuf_rec_func(
1565/*================================*/
1566#ifdef UNIV_DEBUG
1567 mtr_t* mtr, /*!< in: mini-transaction owning rec */
1568#endif /* UNIV_DEBUG */
1569 const rec_t* ibuf_rec, /*!< in: record in an insert buffer */
1570 mem_heap_t* heap, /*!< in: heap where built */
1571 dict_index_t** pindex) /*!< out, own: dummy index that
1572 describes the entry */
1573{
1574 dtuple_t* tuple;
1575 dfield_t* field;
1576 ulint n_fields;
1577 const byte* types;
1578 const byte* data;
1579 ulint len;
1580 ulint info_len;
1581 ulint i;
1582 ulint comp;
1583 dict_index_t* index;
1584
1585 ut_ad(mtr_memo_contains_page_flagged(mtr, ibuf_rec, MTR_MEMO_PAGE_X_FIX
1586 | MTR_MEMO_PAGE_S_FIX));
1587 ut_ad(ibuf_inside(mtr));
1588
1589 data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len);
1590
1591 ut_a(len == 1);
1592 ut_a(*data == 0);
1593 ut_a(rec_get_n_fields_old(ibuf_rec) > IBUF_REC_FIELD_USER);
1594
1595 n_fields = rec_get_n_fields_old(ibuf_rec) - IBUF_REC_FIELD_USER;
1596
1597 tuple = dtuple_create(heap, n_fields);
1598
1599 types = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_METADATA, &len);
1600
1601 ibuf_rec_get_info(mtr, ibuf_rec, NULL, &comp, &info_len, NULL);
1602
1603 index = ibuf_dummy_index_create(n_fields, comp);
1604
1605 len -= info_len;
1606 types += info_len;
1607
1608 ut_a(len == n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
1609
1610 for (i = 0; i < n_fields; i++) {
1611 field = dtuple_get_nth_field(tuple, i);
1612
1613 data = rec_get_nth_field_old(
1614 ibuf_rec, i + IBUF_REC_FIELD_USER, &len);
1615
1616 dfield_set_data(field, data, len);
1617
1618 dtype_new_read_for_order_and_null_size(
1619 dfield_get_type(field),
1620 types + i * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE);
1621
1622 ibuf_dummy_index_add_col(index, dfield_get_type(field), len);
1623 }
1624
1625 index->n_core_null_bytes
1626 = UT_BITS_IN_BYTES(unsigned(index->n_nullable));
1627
1628 /* Prevent an ut_ad() failure in page_zip_write_rec() by
1629 adding system columns to the dummy table pointed to by the
1630 dummy secondary index. The insert buffer is only used for
1631 secondary indexes, whose records never contain any system
1632 columns, such as DB_TRX_ID. */
1633 ut_d(dict_table_add_system_columns(index->table, index->table->heap));
1634
1635 *pindex = index;
1636
1637 return(tuple);
1638}
1639
1640/******************************************************************//**
1641Get the data size.
1642@return size of fields */
1643UNIV_INLINE
1644ulint
1645ibuf_rec_get_size(
1646/*==============*/
1647 const rec_t* rec, /*!< in: ibuf record */
1648 const byte* types, /*!< in: fields */
1649 ulint n_fields, /*!< in: number of fields */
1650 ulint comp) /*!< in: 0=ROW_FORMAT=REDUNDANT,
1651 nonzero=ROW_FORMAT=COMPACT */
1652{
1653 ulint i;
1654 ulint field_offset;
1655 ulint types_offset;
1656 ulint size = 0;
1657
1658 field_offset = IBUF_REC_FIELD_USER;
1659 types_offset = DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
1660
1661 for (i = 0; i < n_fields; i++) {
1662 ulint len;
1663 dtype_t dtype;
1664
1665 rec_get_nth_field_offs_old(rec, i + field_offset, &len);
1666
1667 if (len != UNIV_SQL_NULL) {
1668 size += len;
1669 } else {
1670 dtype_new_read_for_order_and_null_size(&dtype, types);
1671
1672 size += dtype_get_sql_null_size(&dtype, comp);
1673 }
1674
1675 types += types_offset;
1676 }
1677
1678 return(size);
1679}
1680
1681#ifdef UNIV_DEBUG
1682# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(mtr,rec)
1683#else /* UNIV_DEBUG */
1684# define ibuf_rec_get_volume(mtr,rec) ibuf_rec_get_volume_func(rec)
1685#endif
1686
1687/********************************************************************//**
1688Returns the space taken by a stored non-clustered index entry if converted to
1689an index record.
1690@return size of index record in bytes + an upper limit of the space
1691taken in the page directory */
1692static
1693ulint
1694ibuf_rec_get_volume_func(
1695/*=====================*/
1696#ifdef UNIV_DEBUG
1697 mtr_t* mtr, /*!< in: mini-transaction owning rec */
1698#endif /* UNIV_DEBUG */
1699 const rec_t* ibuf_rec)/*!< in: ibuf record */
1700{
1701 ulint len;
1702 const byte* data;
1703 const byte* types;
1704 ulint n_fields;
1705 ulint data_size;
1706 ulint comp;
1707 ibuf_op_t op;
1708 ulint info_len;
1709
1710 ut_ad(mtr_memo_contains_page_flagged(mtr, ibuf_rec, MTR_MEMO_PAGE_X_FIX
1711 | MTR_MEMO_PAGE_S_FIX));
1712 ut_ad(ibuf_inside(mtr));
1713 ut_ad(rec_get_n_fields_old(ibuf_rec) > 2);
1714
1715 data = rec_get_nth_field_old(ibuf_rec, IBUF_REC_FIELD_MARKER, &len);
1716 ut_a(len == 1);
1717 ut_a(*data == 0);
1718
1719 types = rec_get_nth_field_old(
1720 ibuf_rec, IBUF_REC_FIELD_METADATA, &len);
1721
1722 ibuf_rec_get_info(mtr, ibuf_rec, &op, &comp, &info_len, NULL);
1723
1724 if (op == IBUF_OP_DELETE_MARK || op == IBUF_OP_DELETE) {
1725 /* Delete-marking a record doesn't take any
1726 additional space, and while deleting a record
1727 actually frees up space, we have to play it safe and
1728 pretend it takes no additional space (the record
1729 might not exist, etc.). */
1730
1731 return(0);
1732 } else if (comp) {
1733 dtuple_t* entry;
1734 ulint volume;
1735 dict_index_t* dummy_index;
1736 mem_heap_t* heap = mem_heap_create(500);
1737
1738 entry = ibuf_build_entry_from_ibuf_rec(mtr, ibuf_rec,
1739 heap, &dummy_index);
1740
1741 volume = rec_get_converted_size(dummy_index, entry, 0);
1742
1743 ibuf_dummy_index_free(dummy_index);
1744 mem_heap_free(heap);
1745
1746 return(volume + page_dir_calc_reserved_space(1));
1747 }
1748
1749 types += info_len;
1750 n_fields = rec_get_n_fields_old(ibuf_rec)
1751 - IBUF_REC_FIELD_USER;
1752
1753 data_size = ibuf_rec_get_size(ibuf_rec, types, n_fields, comp);
1754
1755 return(data_size + rec_get_converted_extra_size(data_size, n_fields, 0)
1756 + page_dir_calc_reserved_space(1));
1757}
1758
1759/*********************************************************************//**
1760Builds the tuple to insert to an ibuf tree when we have an entry for a
1761non-clustered index.
1762
1763NOTE that the original entry must be kept because we copy pointers to
1764its fields.
1765
1766@return own: entry to insert into an ibuf index tree */
1767static
1768dtuple_t*
1769ibuf_entry_build(
1770/*=============*/
1771 ibuf_op_t op, /*!< in: operation type */
1772 dict_index_t* index, /*!< in: non-clustered index */
1773 const dtuple_t* entry, /*!< in: entry for a non-clustered index */
1774 ulint space, /*!< in: space id */
1775 ulint page_no,/*!< in: index page number where entry should
1776 be inserted */
1777 ulint counter,/*!< in: counter value;
1778 ULINT_UNDEFINED=not used */
1779 mem_heap_t* heap) /*!< in: heap into which to build */
1780{
1781 dtuple_t* tuple;
1782 dfield_t* field;
1783 const dfield_t* entry_field;
1784 ulint n_fields;
1785 byte* buf;
1786 byte* ti;
1787 byte* type_info;
1788 ulint i;
1789
1790 ut_ad(counter != ULINT_UNDEFINED || op == IBUF_OP_INSERT);
1791 ut_ad(counter == ULINT_UNDEFINED || counter <= 0xFFFF);
1792 ut_ad(op < IBUF_OP_COUNT);
1793
1794 /* We have to build a tuple with the following fields:
1795
1796 1-4) These are described at the top of this file.
1797
1798 5) The rest of the fields are copied from the entry.
1799
1800 All fields in the tuple are ordered like the type binary in our
1801 insert buffer tree. */
1802
1803 n_fields = dtuple_get_n_fields(entry);
1804
1805 tuple = dtuple_create(heap, n_fields + IBUF_REC_FIELD_USER);
1806
1807 /* 1) Space Id */
1808
1809 field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
1810
1811 buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
1812
1813 mach_write_to_4(buf, space);
1814
1815 dfield_set_data(field, buf, 4);
1816
1817 /* 2) Marker byte */
1818
1819 field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
1820
1821 buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
1822
1823 /* We set the marker byte zero */
1824
1825 mach_write_to_1(buf, 0);
1826
1827 dfield_set_data(field, buf, 1);
1828
1829 /* 3) Page number */
1830
1831 field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
1832
1833 buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
1834
1835 mach_write_to_4(buf, page_no);
1836
1837 dfield_set_data(field, buf, 4);
1838
1839 /* 4) Type info, part #1 */
1840
1841 if (counter == ULINT_UNDEFINED) {
1842 i = dict_table_is_comp(index->table) ? 1 : 0;
1843 } else {
1844 ut_ad(counter <= 0xFFFF);
1845 i = IBUF_REC_INFO_SIZE;
1846 }
1847
1848 ti = type_info = static_cast<byte*>(
1849 mem_heap_alloc(
1850 heap,
1851 i + n_fields * DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE));
1852
1853 switch (i) {
1854 default:
1855 ut_error;
1856 break;
1857 case 1:
1858 /* set the flag for ROW_FORMAT=COMPACT */
1859 *ti++ = 0;
1860 /* fall through */
1861 case 0:
1862 /* the old format does not allow delete buffering */
1863 ut_ad(op == IBUF_OP_INSERT);
1864 break;
1865 case IBUF_REC_INFO_SIZE:
1866 mach_write_to_2(ti + IBUF_REC_OFFSET_COUNTER, counter);
1867
1868 ti[IBUF_REC_OFFSET_TYPE] = (byte) op;
1869 ti[IBUF_REC_OFFSET_FLAGS] = dict_table_is_comp(index->table)
1870 ? IBUF_REC_COMPACT : 0;
1871 ti += IBUF_REC_INFO_SIZE;
1872 break;
1873 }
1874
1875 /* 5+) Fields from the entry */
1876
1877 for (i = 0; i < n_fields; i++) {
1878 ulint fixed_len;
1879 const dict_field_t* ifield;
1880
1881 field = dtuple_get_nth_field(tuple, i + IBUF_REC_FIELD_USER);
1882 entry_field = dtuple_get_nth_field(entry, i);
1883 dfield_copy(field, entry_field);
1884
1885 ifield = dict_index_get_nth_field(index, i);
1886 /* Prefix index columns of fixed-length columns are of
1887 fixed length. However, in the function call below,
1888 dfield_get_type(entry_field) contains the fixed length
1889 of the column in the clustered index. Replace it with
1890 the fixed length of the secondary index column. */
1891 fixed_len = ifield->fixed_len;
1892
1893#ifdef UNIV_DEBUG
1894 if (fixed_len) {
1895 /* dict_index_add_col() should guarantee these */
1896 ut_ad(fixed_len <= (ulint)
1897 dfield_get_type(entry_field)->len);
1898 if (ifield->prefix_len) {
1899 ut_ad(ifield->prefix_len == fixed_len);
1900 } else {
1901 ut_ad(fixed_len == (ulint)
1902 dfield_get_type(entry_field)->len);
1903 }
1904 }
1905#endif /* UNIV_DEBUG */
1906
1907 dtype_new_store_for_order_and_null_size(
1908 ti, dfield_get_type(entry_field), fixed_len);
1909 ti += DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE;
1910 }
1911
1912 /* 4) Type info, part #2 */
1913
1914 field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_METADATA);
1915
1916 dfield_set_data(field, type_info, ulint(ti - type_info));
1917
1918 /* Set all the types in the new tuple binary */
1919
1920 dtuple_set_types_binary(tuple, n_fields + IBUF_REC_FIELD_USER);
1921
1922 return(tuple);
1923}
1924
1925/*********************************************************************//**
1926Builds a search tuple used to search buffered inserts for an index page.
1927This is for >= 4.1.x format records.
1928@return own: search tuple */
1929static
1930dtuple_t*
1931ibuf_search_tuple_build(
1932/*====================*/
1933 ulint space, /*!< in: space id */
1934 ulint page_no,/*!< in: index page number */
1935 mem_heap_t* heap) /*!< in: heap into which to build */
1936{
1937 dtuple_t* tuple;
1938 dfield_t* field;
1939 byte* buf;
1940
1941 tuple = dtuple_create(heap, IBUF_REC_FIELD_METADATA);
1942
1943 /* Store the space id in tuple */
1944
1945 field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_SPACE);
1946
1947 buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
1948
1949 mach_write_to_4(buf, space);
1950
1951 dfield_set_data(field, buf, 4);
1952
1953 /* Store the new format record marker byte */
1954
1955 field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_MARKER);
1956
1957 buf = static_cast<byte*>(mem_heap_alloc(heap, 1));
1958
1959 mach_write_to_1(buf, 0);
1960
1961 dfield_set_data(field, buf, 1);
1962
1963 /* Store the page number in tuple */
1964
1965 field = dtuple_get_nth_field(tuple, IBUF_REC_FIELD_PAGE);
1966
1967 buf = static_cast<byte*>(mem_heap_alloc(heap, 4));
1968
1969 mach_write_to_4(buf, page_no);
1970
1971 dfield_set_data(field, buf, 4);
1972
1973 dtuple_set_types_binary(tuple, IBUF_REC_FIELD_METADATA);
1974
1975 return(tuple);
1976}
1977
1978/*********************************************************************//**
1979Checks if there are enough pages in the free list of the ibuf tree that we
1980dare to start a pessimistic insert to the insert buffer.
1981@return whether enough free pages in list */
1982static inline bool ibuf_data_enough_free_for_insert()
1983{
1984 ut_ad(mutex_own(&ibuf_mutex));
1985
1986 /* We want a big margin of free pages, because a B-tree can sometimes
1987 grow in size also if records are deleted from it, as the node pointers
1988 can change, and we must make sure that we are able to delete the
1989 inserts buffered for pages that we read to the buffer pool, without
1990 any risk of running out of free space in the insert buffer. */
1991
1992 return(ibuf->free_list_len >= (ibuf->size / 2) + 3 * ibuf->height);
1993}
1994
1995/*********************************************************************//**
1996Checks if there are enough pages in the free list of the ibuf tree that we
1997should remove them and free to the file space management.
1998@return TRUE if enough free pages in list */
1999UNIV_INLINE
2000ibool
2001ibuf_data_too_much_free(void)
2002/*=========================*/
2003{
2004 ut_ad(mutex_own(&ibuf_mutex));
2005
2006 return(ibuf->free_list_len >= 3 + (ibuf->size / 2) + 3 * ibuf->height);
2007}
2008
/*********************************************************************//**
Allocates a new page from the ibuf file segment and adds it to the free
list.
@return TRUE on success, FALSE if no space left */
static
ibool
ibuf_add_free_page(void)
/*====================*/
{
	mtr_t	mtr;
	page_t*	header_page;
	buf_block_t*	block;
	page_t*	page;
	page_t*	root;
	page_t*	bitmap_page;

	mtr_start(&mtr);
	/* Acquire the fsp latch before the ibuf header, obeying the latching
	order */
	mtr_x_lock(&fil_system.sys_space->latch, &mtr);
	header_page = ibuf_header_page_get(&mtr);

	/* Allocate a new page: NOTE that if the page has been a part of a
	non-clustered index which has subsequently been dropped, then the
	page may have buffered inserts in the insert buffer, and these
	should be deleted from there. These get deleted when the page
	allocation creates the page in buffer. Thus the call below may end
	up calling the insert buffer routines and, as we yet have no latches
	to insert buffer tree pages, these routines can run without a risk
	of a deadlock. This is the reason why we created a special ibuf
	header page apart from the ibuf tree. */

	block = fseg_alloc_free_page(
		header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER, 0, FSP_UP,
		&mtr);

	if (block == NULL) {
		/* The tablespace is full: the free list cannot grow. */
		mtr_commit(&mtr);

		return(FALSE);
	}

	/* The newly allocated page must be exclusively latched by the
	allocation above. */
	ut_ad(rw_lock_get_x_lock_count(&block->lock) == 1);
	ibuf_enter(&mtr);
	mutex_enter(&ibuf_mutex);
	root = ibuf_tree_root_get(&mtr);

	buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);
	page = buf_block_get_frame(block);

	/* Add the page to the free list and update the ibuf size data */

	flst_add_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
		      page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);

	/* Tag the page so that recovery and debug checks can recognize
	it as an ibuf free-list page. */
	mlog_write_ulint(page + FIL_PAGE_TYPE, FIL_PAGE_IBUF_FREE_LIST,
			 MLOG_2BYTES, &mtr);

	ibuf->seg_size++;
	ibuf->free_list_len++;

	/* Set the bit indicating that this page is now an ibuf tree page
	(level 2 page) */

	const page_id_t	page_id(IBUF_SPACE_ID, block->page.id.page_no());
	bitmap_page = ibuf_bitmap_get_map_page(page_id, univ_page_size, &mtr);

	/* The bitmap update below does not touch the ibuf struct, so the
	mutex can be released first. */
	mutex_exit(&ibuf_mutex);

	ibuf_bitmap_page_set_bits(bitmap_page, page_id, univ_page_size,
				  IBUF_BITMAP_IBUF, TRUE, &mtr);

	ibuf_mtr_commit(&mtr);

	return(TRUE);
}
2085
/*********************************************************************//**
Removes a page from the free list and frees it to the fsp system. */
static
void
ibuf_remove_free_page(void)
/*=======================*/
{
	mtr_t	mtr;
	mtr_t	mtr2;
	page_t*	header_page;
	ulint	page_no;
	page_t*	page;
	page_t*	root;
	page_t*	bitmap_page;

	/* Make sure the redo log has room before starting the mtr. */
	log_free_check();

	mtr_start(&mtr);
	/* Acquire the fsp latch before the ibuf header, obeying the latching
	order */

	mtr_x_lock(&fil_system.sys_space->latch, &mtr);
	header_page = ibuf_header_page_get(&mtr);

	/* Prevent pessimistic inserts to insert buffer trees for a while */
	ibuf_enter(&mtr);
	mutex_enter(&ibuf_pessimistic_insert_mutex);
	mutex_enter(&ibuf_mutex);

	if (!ibuf_data_too_much_free()) {
		/* Nothing to shrink; release everything in reverse order
		of acquisition. */

		mutex_exit(&ibuf_mutex);
		mutex_exit(&ibuf_pessimistic_insert_mutex);

		ibuf_mtr_commit(&mtr);

		return;
	}

	/* Read the last page of the free list in a separate
	mini-transaction, so that the tree root latch can be released
	before freeing the page below. */
	ibuf_mtr_start(&mtr2);

	root = ibuf_tree_root_get(&mtr2);

	mutex_exit(&ibuf_mutex);

	page_no = flst_get_last(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
				&mtr2).page;

	/* NOTE that we must release the latch on the ibuf tree root
	because in fseg_free_page we access level 1 pages, and the root
	is a level 2 page. */

	ibuf_mtr_commit(&mtr2);
	ibuf_exit(&mtr);

	/* Since pessimistic inserts were prevented, we know that the
	page is still in the free list. NOTE that also deletes may take
	pages from the free list, but they take them from the start, and
	the free list was so long that they cannot have taken the last
	page from it. */

	fseg_free_page(header_page + IBUF_HEADER + IBUF_TREE_SEG_HEADER,
		       IBUF_SPACE_ID, page_no, false, &mtr);

	const page_id_t	page_id(IBUF_SPACE_ID, page_no);

	/* We are about to re-latch the page we just freed; clear the
	debug flag so the access is not flagged as use-after-free. */
	ut_d(buf_page_reset_file_page_was_freed(page_id));

	ibuf_enter(&mtr);

	mutex_enter(&ibuf_mutex);

	root = ibuf_tree_root_get(&mtr);

	/* The freed page must still be the last one on the free list;
	see the note about pessimistic inserts above. */
	ut_ad(page_no == flst_get_last(root + PAGE_HEADER
				       + PAGE_BTR_IBUF_FREE_LIST, &mtr).page);

	{
		buf_block_t*	block;

		block = buf_page_get(page_id, univ_page_size, RW_X_LATCH, &mtr);

		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);

		page = buf_block_get_frame(block);
	}

	/* Remove the page from the free list and update the ibuf size data */

	flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
		    page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, &mtr);

	mutex_exit(&ibuf_pessimistic_insert_mutex);

	ibuf->seg_size--;
	ibuf->free_list_len--;

	/* Set the bit indicating that this page is no more an ibuf tree page
	(level 2 page) */

	bitmap_page = ibuf_bitmap_get_map_page(page_id, univ_page_size, &mtr);

	mutex_exit(&ibuf_mutex);

	ibuf_bitmap_page_set_bits(
		bitmap_page, page_id, univ_page_size, IBUF_BITMAP_IBUF, FALSE,
		&mtr);

	/* Mark the page as freed again for debug checks. */
	ut_d(buf_page_set_file_page_was_freed(page_id));

	ibuf_mtr_commit(&mtr);
}
2198
2199/***********************************************************************//**
2200Frees excess pages from the ibuf free list. This function is called when an OS
2201thread calls fsp services to allocate a new file segment, or a new page to a
2202file segment, and the thread did not own the fsp latch before this call. */
2203void
2204ibuf_free_excess_pages(void)
2205/*========================*/
2206{
2207 if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE) {
2208 return;
2209 }
2210
2211 /* Free at most a few pages at a time, so that we do not delay the
2212 requested service too much */
2213
2214 for (ulint i = 0; i < 4; i++) {
2215
2216 ibool too_much_free;
2217
2218 mutex_enter(&ibuf_mutex);
2219 too_much_free = ibuf_data_too_much_free();
2220 mutex_exit(&ibuf_mutex);
2221
2222 if (!too_much_free) {
2223 return;
2224 }
2225
2226 ibuf_remove_free_page();
2227 }
2228}
2229
2230#ifdef UNIV_DEBUG
2231# define ibuf_get_merge_page_nos(contract,rec,mtr,ids,pages,n_stored) \
2232 ibuf_get_merge_page_nos_func(contract,rec,mtr,ids,pages,n_stored)
2233#else /* UNIV_DEBUG */
2234# define ibuf_get_merge_page_nos(contract,rec,mtr,ids,pages,n_stored) \
2235 ibuf_get_merge_page_nos_func(contract,rec,ids,pages,n_stored)
2236#endif /* UNIV_DEBUG */
2237
2238/*********************************************************************//**
2239Reads page numbers from a leaf in an ibuf tree.
2240@return a lower limit for the combined volume of records which will be
2241merged */
2242static
2243ulint
2244ibuf_get_merge_page_nos_func(
2245/*=========================*/
2246 ibool contract,/*!< in: TRUE if this function is called to
2247 contract the tree, FALSE if this is called
2248 when a single page becomes full and we look
2249 if it pays to read also nearby pages */
2250 const rec_t* rec, /*!< in: insert buffer record */
2251#ifdef UNIV_DEBUG
2252 mtr_t* mtr, /*!< in: mini-transaction holding rec */
2253#endif /* UNIV_DEBUG */
2254 ulint* space_ids,/*!< in/out: space id's of the pages */
2255 ulint* page_nos,/*!< in/out: buffer for at least
2256 IBUF_MAX_N_PAGES_MERGED many page numbers;
2257 the page numbers are in an ascending order */
2258 ulint* n_stored)/*!< out: number of page numbers stored to
2259 page_nos in this function */
2260{
2261 ulint prev_page_no;
2262 ulint prev_space_id;
2263 ulint first_page_no;
2264 ulint first_space_id;
2265 ulint rec_page_no;
2266 ulint rec_space_id;
2267 ulint sum_volumes;
2268 ulint volume_for_page;
2269 ulint rec_volume;
2270 ulint limit;
2271 ulint n_pages;
2272
2273 ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX
2274 | MTR_MEMO_PAGE_S_FIX));
2275 ut_ad(ibuf_inside(mtr));
2276
2277 *n_stored = 0;
2278
2279 limit = ut_min(IBUF_MAX_N_PAGES_MERGED,
2280 buf_pool_get_curr_size() / 4);
2281
2282 if (page_rec_is_supremum(rec)) {
2283
2284 rec = page_rec_get_prev_const(rec);
2285 }
2286
2287 if (page_rec_is_infimum(rec)) {
2288
2289 rec = page_rec_get_next_const(rec);
2290 }
2291
2292 if (page_rec_is_supremum(rec)) {
2293
2294 return(0);
2295 }
2296
2297 first_page_no = ibuf_rec_get_page_no(mtr, rec);
2298 first_space_id = ibuf_rec_get_space(mtr, rec);
2299 n_pages = 0;
2300 prev_page_no = 0;
2301 prev_space_id = 0;
2302
2303 /* Go backwards from the first rec until we reach the border of the
2304 'merge area', or the page start or the limit of storeable pages is
2305 reached */
2306
2307 while (!page_rec_is_infimum(rec) && UNIV_LIKELY(n_pages < limit)) {
2308
2309 rec_page_no = ibuf_rec_get_page_no(mtr, rec);
2310 rec_space_id = ibuf_rec_get_space(mtr, rec);
2311
2312 if (rec_space_id != first_space_id
2313 || (rec_page_no / IBUF_MERGE_AREA)
2314 != (first_page_no / IBUF_MERGE_AREA)) {
2315
2316 break;
2317 }
2318
2319 if (rec_page_no != prev_page_no
2320 || rec_space_id != prev_space_id) {
2321 n_pages++;
2322 }
2323
2324 prev_page_no = rec_page_no;
2325 prev_space_id = rec_space_id;
2326
2327 rec = page_rec_get_prev_const(rec);
2328 }
2329
2330 rec = page_rec_get_next_const(rec);
2331
2332 /* At the loop start there is no prev page; we mark this with a pair
2333 of space id, page no (0, 0) for which there can never be entries in
2334 the insert buffer */
2335
2336 prev_page_no = 0;
2337 prev_space_id = 0;
2338 sum_volumes = 0;
2339 volume_for_page = 0;
2340
2341 while (*n_stored < limit) {
2342 if (page_rec_is_supremum(rec)) {
2343 /* When no more records available, mark this with
2344 another 'impossible' pair of space id, page no */
2345 rec_page_no = 1;
2346 rec_space_id = 0;
2347 } else {
2348 rec_page_no = ibuf_rec_get_page_no(mtr, rec);
2349 rec_space_id = ibuf_rec_get_space(mtr, rec);
2350 /* In the system tablespace the smallest
2351 possible secondary index leaf page number is
2352 bigger than FSP_DICT_HDR_PAGE_NO (7).
2353 In all tablespaces, pages 0 and 1 are reserved
2354 for the allocation bitmap and the change
2355 buffer bitmap. In file-per-table tablespaces,
2356 a file segment inode page will be created at
2357 page 2 and the clustered index tree is created
2358 at page 3. So for file-per-table tablespaces,
2359 page 4 is the smallest possible secondary
2360 index leaf page. CREATE TABLESPACE also initially
2361 uses pages 2 and 3 for the first created table,
2362 but that table may be dropped, allowing page 2
2363 to be reused for a secondary index leaf page.
2364 To keep this assertion simple, just
2365 make sure the page is >= 2. */
2366 ut_ad(rec_page_no >= FSP_FIRST_INODE_PAGE_NO);
2367 }
2368
2369#ifdef UNIV_IBUF_DEBUG
2370 ut_a(*n_stored < IBUF_MAX_N_PAGES_MERGED);
2371#endif
2372 if ((rec_space_id != prev_space_id
2373 || rec_page_no != prev_page_no)
2374 && (prev_space_id != 0 || prev_page_no != 0)) {
2375
2376 if (contract
2377 || (prev_page_no == first_page_no
2378 && prev_space_id == first_space_id)
2379 || (volume_for_page
2380 > ((IBUF_MERGE_THRESHOLD - 1)
2381 * 4U << srv_page_size_shift
2382 / IBUF_PAGE_SIZE_PER_FREE_SPACE)
2383 / IBUF_MERGE_THRESHOLD)) {
2384
2385 space_ids[*n_stored] = prev_space_id;
2386 page_nos[*n_stored] = prev_page_no;
2387
2388 (*n_stored)++;
2389
2390 sum_volumes += volume_for_page;
2391 }
2392
2393 if (rec_space_id != first_space_id
2394 || rec_page_no / IBUF_MERGE_AREA
2395 != first_page_no / IBUF_MERGE_AREA) {
2396
2397 break;
2398 }
2399
2400 volume_for_page = 0;
2401 }
2402
2403 if (rec_page_no == 1 && rec_space_id == 0) {
2404 /* Supremum record */
2405
2406 break;
2407 }
2408
2409 rec_volume = ibuf_rec_get_volume(mtr, rec);
2410
2411 volume_for_page += rec_volume;
2412
2413 prev_page_no = rec_page_no;
2414 prev_space_id = rec_space_id;
2415
2416 rec = page_rec_get_next_const(rec);
2417 }
2418
2419#ifdef UNIV_IBUF_DEBUG
2420 ut_a(*n_stored <= IBUF_MAX_N_PAGES_MERGED);
2421#endif
2422#if 0
2423 fprintf(stderr, "Ibuf merge batch %lu pages %lu volume\n",
2424 *n_stored, sum_volumes);
2425#endif
2426 return(sum_volumes);
2427}
2428
2429/*******************************************************************//**
2430Get the matching records for space id.
2431@return current rec or NULL */
2432static MY_ATTRIBUTE((nonnull, warn_unused_result))
2433const rec_t*
2434ibuf_get_user_rec(
2435/*===============*/
2436 btr_pcur_t* pcur, /*!< in: the current cursor */
2437 mtr_t* mtr) /*!< in: mini transaction */
2438{
2439 do {
2440 const rec_t* rec = btr_pcur_get_rec(pcur);
2441
2442 if (page_rec_is_user_rec(rec)) {
2443 return(rec);
2444 }
2445 } while (btr_pcur_move_to_next(pcur, mtr));
2446
2447 return(NULL);
2448}
2449
2450/*********************************************************************//**
2451Reads page numbers for a space id from an ibuf tree.
2452@return a lower limit for the combined volume of records which will be
2453merged */
2454static MY_ATTRIBUTE((nonnull, warn_unused_result))
2455ulint
2456ibuf_get_merge_pages(
2457/*=================*/
2458 btr_pcur_t* pcur, /*!< in/out: cursor */
2459 ulint space, /*!< in: space for which to merge */
2460 ulint limit, /*!< in: max page numbers to read */
2461 ulint* pages, /*!< out: pages read */
2462 ulint* spaces, /*!< out: spaces read */
2463 ulint* n_pages,/*!< out: number of pages read */
2464 mtr_t* mtr) /*!< in: mini transaction */
2465{
2466 const rec_t* rec;
2467 ulint volume = 0;
2468
2469 ut_a(space != ULINT_UNDEFINED);
2470
2471 *n_pages = 0;
2472
2473 while ((rec = ibuf_get_user_rec(pcur, mtr)) != 0
2474 && ibuf_rec_get_space(mtr, rec) == space
2475 && *n_pages < limit) {
2476
2477 ulint page_no = ibuf_rec_get_page_no(mtr, rec);
2478
2479 if (*n_pages == 0 || pages[*n_pages - 1] != page_no) {
2480 spaces[*n_pages] = space;
2481 pages[*n_pages] = page_no;
2482 ++*n_pages;
2483 }
2484
2485 volume += ibuf_rec_get_volume(mtr, rec);
2486
2487 btr_pcur_move_to_next(pcur, mtr);
2488 }
2489
2490 return(volume);
2491}
2492
/*********************************************************************//**
Contracts insert buffer trees by reading pages to the buffer pool.
@return a lower limit for the combined size in bytes of entries which
will be merged from ibuf trees to the pages read, 0 if ibuf is
empty */
static
ulint
ibuf_merge_pages(
/*=============*/
	ulint*	n_pages,	/*!< out: number of pages to which merged */
	bool	sync)		/*!< in: true if the caller wants to wait for
				the issued read with the highest tablespace
				address to complete */
{
	mtr_t		mtr;
	btr_pcur_t	pcur;
	ulint		sum_sizes;
	ulint		page_nos[IBUF_MAX_N_PAGES_MERGED];
	ulint		space_ids[IBUF_MAX_N_PAGES_MERGED];

	*n_pages = 0;

	ibuf_mtr_start(&mtr);

	/* Open a cursor to a randomly chosen leaf of the tree, at a random
	position within the leaf */
	bool available;

	available = btr_pcur_open_at_rnd_pos(ibuf->index, BTR_SEARCH_LEAF,
					     &pcur, &mtr);
	/* No one should make this index unavailable when server is running */
	ut_a(available);

	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));

	if (page_is_empty(btr_pcur_get_page(&pcur))) {
		/* If a B-tree page is empty, it must be the root page
		and the whole B-tree must be empty. InnoDB does not
		allow empty B-tree pages other than the root. */
		ut_ad(ibuf->empty);
		ut_ad(page_get_space_id(btr_pcur_get_page(&pcur))
		      == IBUF_SPACE_ID);
		ut_ad(page_get_page_no(btr_pcur_get_page(&pcur))
		      == FSP_IBUF_TREE_ROOT_PAGE_NO);

		/* Nothing buffered: release resources and report an
		empty merge. */
		ibuf_mtr_commit(&mtr);
		btr_pcur_close(&pcur);

		return(0);
	}

	/* Collect the page numbers around the cursor position that are
	worth merging (contract==TRUE: collect unconditionally). */
	sum_sizes = ibuf_get_merge_page_nos(TRUE,
					    btr_pcur_get_rec(&pcur), &mtr,
					    space_ids,
					    page_nos, n_pages);
#if 0 /* defined UNIV_IBUF_DEBUG */
	fprintf(stderr, "Ibuf contract sync %lu pages %lu volume %lu\n",
		sync, *n_pages, sum_sizes);
#endif
	/* The latches must be released before issuing the reads; the
	merge itself happens when the reads complete. */
	ibuf_mtr_commit(&mtr);
	btr_pcur_close(&pcur);

	buf_read_ibuf_merge_pages(
		sync, space_ids, page_nos, *n_pages);

	return(sum_sizes + 1);
}
2560
/*********************************************************************//**
Contracts insert buffer trees by reading pages referring to space_id
to the buffer pool.
@returns number of pages merged.*/
ulint
ibuf_merge_space(
/*=============*/
	ulint	space)	/*!< in: tablespace id to merge */
{
	mtr_t		mtr;
	btr_pcur_t	pcur;
	mem_heap_t*	heap = mem_heap_create(512);
	/* Search for the first record of the space: page_no 0 sorts
	before any buffered entry for the space. */
	dtuple_t*	tuple = ibuf_search_tuple_build(space, 0, heap);
	ulint		n_pages = 0;

	ut_ad(space < SRV_LOG_SPACE_FIRST_ID);

	ibuf_mtr_start(&mtr);

	/* Position the cursor on the first matching record. */

	btr_pcur_open(
		ibuf->index, tuple, PAGE_CUR_GE, BTR_SEARCH_LEAF, &pcur,
		&mtr);

	/* The search tuple is no longer needed once the cursor is
	positioned. */
	mem_heap_free(heap);

	ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));

	ulint		sum_sizes = 0;
	ulint		pages[IBUF_MAX_N_PAGES_MERGED];
	ulint		spaces[IBUF_MAX_N_PAGES_MERGED];

	if (page_is_empty(btr_pcur_get_page(&pcur))) {
		/* If a B-tree page is empty, it must be the root page
		and the whole B-tree must be empty. InnoDB does not
		allow empty B-tree pages other than the root. */
		ut_ad(ibuf->empty);
		ut_ad(page_get_space_id(btr_pcur_get_page(&pcur))
		      == IBUF_SPACE_ID);
		ut_ad(page_get_page_no(btr_pcur_get_page(&pcur))
		      == FSP_IBUF_TREE_ROOT_PAGE_NO);

	} else {

		sum_sizes = ibuf_get_merge_pages(
			&pcur, space, IBUF_MAX_N_PAGES_MERGED,
			&pages[0], &spaces[0], &n_pages,
			&mtr);
		ib::info() << "Size of pages merged " << sum_sizes;
	}

	/* Release latches before issuing the merge reads. */
	ibuf_mtr_commit(&mtr);

	btr_pcur_close(&pcur);

	if (n_pages > 0) {
		ut_ad(n_pages <= UT_ARR_SIZE(pages));

#ifdef UNIV_DEBUG
		/* Only pages of the requested space may be collected. */
		for (ulint i = 0; i < n_pages; ++i) {
			ut_ad(spaces[i] == space);
		}
#endif /* UNIV_DEBUG */

		/* Issue synchronous reads; the buffered changes are
		applied as the pages enter the buffer pool. */
		buf_read_ibuf_merge_pages(
			true, spaces, pages, n_pages);
	}

	return(n_pages);
}
2632
/** Contract the change buffer by reading pages to the buffer pool.
@param[out]	n_pages		number of pages merged
@param[in]	sync		whether the caller waits for
the issued reads to complete
@return a lower limit for the combined size in bytes of entries which
will be merged from ibuf trees to the pages read, 0 if ibuf is
empty */
static MY_ATTRIBUTE((warn_unused_result))
ulint
ibuf_merge(
	ulint*	n_pages,
	bool	sync)
{
	*n_pages = 0;

	/* We perform a dirty read of ibuf->empty, without latching
	the insert buffer root page. We trust this dirty read except
	when a slow shutdown is being executed. During a slow
	shutdown, the insert buffer merge must be completed. */

	if (ibuf->empty && !srv_shutdown_state) {
		return(0);
#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
	} else if (ibuf_debug) {
		/* Debug knob: skip merging entirely. */
		return(0);
#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
	} else {
		return(ibuf_merge_pages(n_pages, sync));
	}
}
2663
2664/** Contract the change buffer by reading pages to the buffer pool.
2665@param[in] sync whether the caller waits for
2666the issued reads to complete
2667@return a lower limit for the combined size in bytes of entries which
2668will be merged from ibuf trees to the pages read, 0 if ibuf is empty */
2669static
2670ulint
2671ibuf_contract(
2672 bool sync)
2673{
2674 ulint n_pages;
2675
2676 return(ibuf_merge_pages(&n_pages, sync));
2677}
2678
2679/** Contract the change buffer by reading pages to the buffer pool.
2680@param[in] full If true, do a full contraction based
2681on PCT_IO(100). If false, the size of contract batch is determined
2682based on the current size of the change buffer.
2683@return a lower limit for the combined size in bytes of entries which
2684will be merged from ibuf trees to the pages read, 0 if ibuf is
2685empty */
2686ulint
2687ibuf_merge_in_background(
2688 bool full)
2689{
2690 ulint sum_bytes = 0;
2691 ulint sum_pages = 0;
2692 ulint n_pag2;
2693 ulint n_pages;
2694
2695#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
2696 if (srv_ibuf_disable_background_merge) {
2697 return(0);
2698 }
2699#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
2700
2701 if (full) {
2702 /* Caller has requested a full batch */
2703 n_pages = PCT_IO(100);
2704 } else {
2705 /* By default we do a batch of 5% of the io_capacity */
2706 n_pages = PCT_IO(5);
2707
2708 mutex_enter(&ibuf_mutex);
2709
2710 /* If the ibuf->size is more than half the max_size
2711 then we make more agreesive contraction.
2712 +1 is to avoid division by zero. */
2713 if (ibuf->size > ibuf->max_size / 2) {
2714 ulint diff = ibuf->size - ibuf->max_size / 2;
2715 n_pages += PCT_IO((diff * 100)
2716 / (ibuf->max_size + 1));
2717 }
2718
2719 mutex_exit(&ibuf_mutex);
2720 }
2721
2722#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
2723 if (ibuf_debug) {
2724 return(0);
2725 }
2726#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
2727
2728 while (sum_pages < n_pages) {
2729 ulint n_bytes;
2730
2731 n_bytes = ibuf_merge(&n_pag2, false);
2732
2733 if (n_bytes == 0) {
2734 return(sum_bytes);
2735 }
2736
2737 sum_bytes += n_bytes;
2738 sum_pages += n_pag2;
2739 }
2740
2741 return(sum_bytes);
2742}
2743
2744/*********************************************************************//**
2745Contract insert buffer trees after insert if they are too big. */
2746UNIV_INLINE
2747void
2748ibuf_contract_after_insert(
2749/*=======================*/
2750 ulint entry_size) /*!< in: size of a record which was inserted
2751 into an ibuf tree */
2752{
2753 ibool sync;
2754 ulint sum_sizes;
2755 ulint size;
2756 ulint max_size;
2757
2758 /* Perform dirty reads of ibuf->size and ibuf->max_size, to
2759 reduce ibuf_mutex contention. ibuf->max_size remains constant
2760 after ibuf_init_at_db_start(), but ibuf->size should be
2761 protected by ibuf_mutex. Given that ibuf->size fits in a
2762 machine word, this should be OK; at worst we are doing some
2763 excessive ibuf_contract() or occasionally skipping a
2764 ibuf_contract(). */
2765 size = ibuf->size;
2766 max_size = ibuf->max_size;
2767
2768 if (size < max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
2769 return;
2770 }
2771
2772 sync = (size >= max_size + IBUF_CONTRACT_ON_INSERT_SYNC);
2773
2774 /* Contract at least entry_size many bytes */
2775 sum_sizes = 0;
2776 size = 1;
2777
2778 do {
2779
2780 size = ibuf_contract(sync);
2781 sum_sizes += size;
2782 } while (size > 0 && sum_sizes < entry_size);
2783}
2784
2785/*********************************************************************//**
2786Determine if an insert buffer record has been encountered already.
2787@return TRUE if a new record, FALSE if possible duplicate */
2788static
2789ibool
2790ibuf_get_volume_buffered_hash(
2791/*==========================*/
2792 const rec_t* rec, /*!< in: ibuf record in post-4.1 format */
2793 const byte* types, /*!< in: fields */
2794 const byte* data, /*!< in: start of user record data */
2795 ulint comp, /*!< in: 0=ROW_FORMAT=REDUNDANT,
2796 nonzero=ROW_FORMAT=COMPACT */
2797 ulint* hash, /*!< in/out: hash array */
2798 ulint size) /*!< in: number of elements in hash array */
2799{
2800 ulint len;
2801 ulint fold;
2802 ulint bitmask;
2803
2804 len = ibuf_rec_get_size(
2805 rec, types,
2806 rec_get_n_fields_old(rec) - IBUF_REC_FIELD_USER, comp);
2807 fold = ut_fold_binary(data, len);
2808
2809 hash += (fold / (CHAR_BIT * sizeof *hash)) % size;
2810 bitmask = static_cast<ulint>(1) << (fold % (CHAR_BIT * sizeof(*hash)));
2811
2812 if (*hash & bitmask) {
2813
2814 return(FALSE);
2815 }
2816
2817 /* We have not seen this record yet. Insert it. */
2818 *hash |= bitmask;
2819
2820 return(TRUE);
2821}
2822
#ifdef UNIV_DEBUG
# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \
	ibuf_get_volume_buffered_count_func(mtr,rec,hash,size,n_recs)
#else /* UNIV_DEBUG */
# define ibuf_get_volume_buffered_count(mtr,rec,hash,size,n_recs) \
	ibuf_get_volume_buffered_count_func(rec,hash,size,n_recs)
#endif /* UNIV_DEBUG */

/*********************************************************************//**
Update the estimate of the number of records on a page, and
get the space taken by merging the buffered record to the index page.
@return size of index record in bytes + an upper limit of the space
taken in the page directory */
static
ulint
ibuf_get_volume_buffered_count_func(
/*================================*/
#ifdef UNIV_DEBUG
	mtr_t*		mtr,	/*!< in: mini-transaction owning rec */
#endif /* UNIV_DEBUG */
	const rec_t*	rec,	/*!< in: insert buffer record */
	ulint*		hash,	/*!< in/out: hash array */
	ulint		size,	/*!< in: number of elements in hash array */
	lint*		n_recs)	/*!< in/out: estimated number of records
				on the page that rec points to */
{
	ulint		len;
	ibuf_op_t	ibuf_op;
	const byte*	types;
	ulint		n_fields;

	ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX
					     | MTR_MEMO_PAGE_S_FIX));
	ut_ad(ibuf_inside(mtr));

	/* Number of user-data fields, excluding the ibuf system fields
	(space, marker, page number, metadata). */
	n_fields = rec_get_n_fields_old(rec);
	ut_ad(n_fields > IBUF_REC_FIELD_USER);
	n_fields -= IBUF_REC_FIELD_USER;

	/* Field 1 is the record-format marker byte. */
	rec_get_nth_field_offs_old(rec, 1, &len);
	/* This function is only invoked when buffering new
	operations. All pre-4.1 records should have been merged
	when the database was started up. */
	ut_a(len == 1);

	if (rec_get_deleted_flag(rec, 0)) {
		/* This record has been merged already,
		but apparently the system crashed before
		the change was discarded from the buffer.
		Pretend that the record does not exist. */
		return(0);
	}

	types = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);

	/* The remainder of the metadata field length modulo the
	per-field type buffer size identifies the record flavor. */
	switch (UNIV_EXPECT(int(len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE),
			    IBUF_REC_INFO_SIZE)) {
	default:
		ut_error;
	case 0:
		/* This ROW_TYPE=REDUNDANT record does not include an
		operation counter. Exclude it from the *n_recs,
		because deletes cannot be buffered if there are
		old-style inserts buffered for the page. */

		len = ibuf_rec_get_size(rec, types, n_fields, 0);

		return(len
		       + rec_get_converted_extra_size(len, n_fields, 0)
		       + page_dir_calc_reserved_space(1));
	case 1:
		/* This ROW_TYPE=COMPACT record does not include an
		operation counter. Exclude it from the *n_recs,
		because deletes cannot be buffered if there are
		old-style inserts buffered for the page. */
		goto get_volume_comp;

	case IBUF_REC_INFO_SIZE:
		/* New-style record: the operation type is stored in
		the metadata field. */
		ibuf_op = (ibuf_op_t) types[IBUF_REC_OFFSET_TYPE];
		break;
	}

	switch (ibuf_op) {
	case IBUF_OP_INSERT:
		/* Inserts can be done by updating a delete-marked record.
		Because delete-mark and insert operations can be pointing to
		the same records, we must not count duplicates. */
	case IBUF_OP_DELETE_MARK:
		/* There must be a record to delete-mark.
		See if this record has been already buffered. */
		if (n_recs && ibuf_get_volume_buffered_hash(
			    rec, types + IBUF_REC_INFO_SIZE,
			    types + len,
			    types[IBUF_REC_OFFSET_FLAGS] & IBUF_REC_COMPACT,
			    hash, size)) {
			(*n_recs)++;
		}

		if (ibuf_op == IBUF_OP_DELETE_MARK) {
			/* Setting the delete-mark flag does not
			affect the available space on the page. */
			return(0);
		}
		break;
	case IBUF_OP_DELETE:
		/* A record will be removed from the page. */
		if (n_recs) {
			(*n_recs)--;
		}
		/* While deleting a record actually frees up space,
		we have to play it safe and pretend that it takes no
		additional space (the record might not exist, etc.). */
		return(0);
	default:
		ut_error;
	}

	ut_ad(ibuf_op == IBUF_OP_INSERT);

get_volume_comp:
	{
		/* Estimate the volume by rebuilding the entry and
		computing its converted size for the index page. */
		dtuple_t*	entry;
		ulint		volume;
		dict_index_t*	dummy_index;
		mem_heap_t*	heap = mem_heap_create(500);

		entry = ibuf_build_entry_from_ibuf_rec(
			mtr, rec, heap, &dummy_index);

		volume = rec_get_converted_size(dummy_index, entry, 0);

		ibuf_dummy_index_free(dummy_index);
		mem_heap_free(heap);

		return(volume + page_dir_calc_reserved_space(1));
	}
}
2960
2961/*********************************************************************//**
2962Gets an upper limit for the combined size of entries buffered in the insert
2963buffer for a given page.
2964@return upper limit for the volume of buffered inserts for the index
2965page, in bytes; srv_page_size, if the entries for the index page span
2966several pages in the insert buffer */
2967static
2968ulint
2969ibuf_get_volume_buffered(
2970/*=====================*/
2971 const btr_pcur_t*pcur, /*!< in: pcur positioned at a place in an
2972 insert buffer tree where we would insert an
2973 entry for the index page whose number is
2974 page_no, latch mode has to be BTR_MODIFY_PREV
2975 or BTR_MODIFY_TREE */
2976 ulint space, /*!< in: space id */
2977 ulint page_no,/*!< in: page number of an index page */
2978 lint* n_recs, /*!< in/out: minimum number of records on the
2979 page after the buffered changes have been
2980 applied, or NULL to disable the counting */
2981 mtr_t* mtr) /*!< in: mini-transaction of pcur */
2982{
2983 ulint volume;
2984 const rec_t* rec;
2985 const page_t* page;
2986 ulint prev_page_no;
2987 const page_t* prev_page;
2988 ulint next_page_no;
2989 const page_t* next_page;
2990 /* bitmap of buffered recs */
2991 ulint hash_bitmap[128 / sizeof(ulint)];
2992
2993 ut_ad((pcur->latch_mode == BTR_MODIFY_PREV)
2994 || (pcur->latch_mode == BTR_MODIFY_TREE));
2995
2996 /* Count the volume of inserts earlier in the alphabetical order than
2997 pcur */
2998
2999 volume = 0;
3000
3001 if (n_recs) {
3002 memset(hash_bitmap, 0, sizeof hash_bitmap);
3003 }
3004
3005 rec = btr_pcur_get_rec(pcur);
3006 page = page_align(rec);
3007 ut_ad(page_validate(page, ibuf->index));
3008
3009 if (page_rec_is_supremum(rec)) {
3010 rec = page_rec_get_prev_const(rec);
3011 }
3012
3013 for (; !page_rec_is_infimum(rec);
3014 rec = page_rec_get_prev_const(rec)) {
3015 ut_ad(page_align(rec) == page);
3016
3017 if (page_no != ibuf_rec_get_page_no(mtr, rec)
3018 || space != ibuf_rec_get_space(mtr, rec)) {
3019
3020 goto count_later;
3021 }
3022
3023 volume += ibuf_get_volume_buffered_count(
3024 mtr, rec,
3025 hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
3026 }
3027
3028 /* Look at the previous page */
3029
3030 prev_page_no = btr_page_get_prev(page, mtr);
3031
3032 if (prev_page_no == FIL_NULL) {
3033
3034 goto count_later;
3035 }
3036
3037 {
3038 buf_block_t* block;
3039
3040 block = buf_page_get(
3041 page_id_t(IBUF_SPACE_ID, prev_page_no),
3042 univ_page_size, RW_X_LATCH, mtr);
3043
3044 buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
3045
3046 prev_page = buf_block_get_frame(block);
3047 ut_ad(page_validate(prev_page, ibuf->index));
3048 }
3049
3050#ifdef UNIV_BTR_DEBUG
3051 ut_a(btr_page_get_next(prev_page, mtr) == page_get_page_no(page));
3052#endif /* UNIV_BTR_DEBUG */
3053
3054 rec = page_get_supremum_rec(prev_page);
3055 rec = page_rec_get_prev_const(rec);
3056
3057 for (;; rec = page_rec_get_prev_const(rec)) {
3058 ut_ad(page_align(rec) == prev_page);
3059
3060 if (page_rec_is_infimum(rec)) {
3061
3062 /* We cannot go to yet a previous page, because we
3063 do not have the x-latch on it, and cannot acquire one
3064 because of the latching order: we have to give up */
3065
3066 return(srv_page_size);
3067 }
3068
3069 if (page_no != ibuf_rec_get_page_no(mtr, rec)
3070 || space != ibuf_rec_get_space(mtr, rec)) {
3071
3072 goto count_later;
3073 }
3074
3075 volume += ibuf_get_volume_buffered_count(
3076 mtr, rec,
3077 hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
3078 }
3079
3080count_later:
3081 rec = btr_pcur_get_rec(pcur);
3082
3083 if (!page_rec_is_supremum(rec)) {
3084 rec = page_rec_get_next_const(rec);
3085 }
3086
3087 for (; !page_rec_is_supremum(rec);
3088 rec = page_rec_get_next_const(rec)) {
3089 if (page_no != ibuf_rec_get_page_no(mtr, rec)
3090 || space != ibuf_rec_get_space(mtr, rec)) {
3091
3092 return(volume);
3093 }
3094
3095 volume += ibuf_get_volume_buffered_count(
3096 mtr, rec,
3097 hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
3098 }
3099
3100 /* Look at the next page */
3101
3102 next_page_no = btr_page_get_next(page, mtr);
3103
3104 if (next_page_no == FIL_NULL) {
3105
3106 return(volume);
3107 }
3108
3109 {
3110 buf_block_t* block;
3111
3112 block = buf_page_get(
3113 page_id_t(IBUF_SPACE_ID, next_page_no),
3114 univ_page_size, RW_X_LATCH, mtr);
3115
3116 buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
3117
3118 next_page = buf_block_get_frame(block);
3119 ut_ad(page_validate(next_page, ibuf->index));
3120 }
3121
3122#ifdef UNIV_BTR_DEBUG
3123 ut_a(btr_page_get_prev(next_page, mtr) == page_get_page_no(page));
3124#endif /* UNIV_BTR_DEBUG */
3125
3126 rec = page_get_infimum_rec(next_page);
3127 rec = page_rec_get_next_const(rec);
3128
3129 for (;; rec = page_rec_get_next_const(rec)) {
3130 ut_ad(page_align(rec) == next_page);
3131
3132 if (page_rec_is_supremum(rec)) {
3133
3134 /* We give up */
3135
3136 return(srv_page_size);
3137 }
3138
3139 if (page_no != ibuf_rec_get_page_no(mtr, rec)
3140 || space != ibuf_rec_get_space(mtr, rec)) {
3141
3142 return(volume);
3143 }
3144
3145 volume += ibuf_get_volume_buffered_count(
3146 mtr, rec,
3147 hash_bitmap, UT_ARR_SIZE(hash_bitmap), n_recs);
3148 }
3149}
3150
3151/*********************************************************************//**
3152Reads the biggest tablespace id from the high end of the insert buffer
3153tree and updates the counter in fil_system. */
3154void
3155ibuf_update_max_tablespace_id(void)
3156/*===============================*/
3157{
3158 ulint max_space_id;
3159 const rec_t* rec;
3160 const byte* field;
3161 ulint len;
3162 btr_pcur_t pcur;
3163 mtr_t mtr;
3164
3165 ut_a(!dict_table_is_comp(ibuf->index->table));
3166
3167 ibuf_mtr_start(&mtr);
3168
3169 btr_pcur_open_at_index_side(
3170 false, ibuf->index, BTR_SEARCH_LEAF, &pcur, true, 0, &mtr);
3171
3172 ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
3173
3174 btr_pcur_move_to_prev(&pcur, &mtr);
3175
3176 if (btr_pcur_is_before_first_on_page(&pcur)) {
3177 /* The tree is empty */
3178
3179 max_space_id = 0;
3180 } else {
3181 rec = btr_pcur_get_rec(&pcur);
3182
3183 field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
3184
3185 ut_a(len == 4);
3186
3187 max_space_id = mach_read_from_4(field);
3188 }
3189
3190 ibuf_mtr_commit(&mtr);
3191
3192 /* printf("Maximum space id in insert buffer %lu\n", max_space_id); */
3193
3194 fil_set_max_space_id_if_bigger(max_space_id);
3195}
3196
3197#ifdef UNIV_DEBUG
3198# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \
3199 ibuf_get_entry_counter_low_func(mtr,rec,space,page_no)
3200#else /* UNIV_DEBUG */
3201# define ibuf_get_entry_counter_low(mtr,rec,space,page_no) \
3202 ibuf_get_entry_counter_low_func(rec,space,page_no)
3203#endif
3204/****************************************************************//**
3205Helper function for ibuf_get_entry_counter_func. Checks if rec is for
3206(space, page_no), and if so, reads counter value from it and returns
3207that + 1.
3208@retval ULINT_UNDEFINED if the record does not contain any counter
3209@retval 0 if the record is not for (space, page_no)
3210@retval 1 + previous counter value, otherwise */
3211static
3212ulint
3213ibuf_get_entry_counter_low_func(
3214/*============================*/
3215#ifdef UNIV_DEBUG
3216 mtr_t* mtr, /*!< in: mini-transaction of rec */
3217#endif /* UNIV_DEBUG */
3218 const rec_t* rec, /*!< in: insert buffer record */
3219 ulint space, /*!< in: space id */
3220 ulint page_no) /*!< in: page number */
3221{
3222 ulint counter;
3223 const byte* field;
3224 ulint len;
3225
3226 ut_ad(ibuf_inside(mtr));
3227 ut_ad(mtr_memo_contains_page_flagged(mtr, rec, MTR_MEMO_PAGE_X_FIX
3228 | MTR_MEMO_PAGE_S_FIX));
3229 ut_ad(rec_get_n_fields_old(rec) > 2);
3230
3231 field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_MARKER, &len);
3232
3233 ut_a(len == 1);
3234
3235 /* Check the tablespace identifier. */
3236 field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_SPACE, &len);
3237
3238 ut_a(len == 4);
3239
3240 if (mach_read_from_4(field) != space) {
3241
3242 return(0);
3243 }
3244
3245 /* Check the page offset. */
3246 field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_PAGE, &len);
3247 ut_a(len == 4);
3248
3249 if (mach_read_from_4(field) != page_no) {
3250
3251 return(0);
3252 }
3253
3254 /* Check if the record contains a counter field. */
3255 field = rec_get_nth_field_old(rec, IBUF_REC_FIELD_METADATA, &len);
3256
3257 switch (len % DATA_NEW_ORDER_NULL_TYPE_BUF_SIZE) {
3258 default:
3259 ut_error;
3260 case 0: /* ROW_FORMAT=REDUNDANT */
3261 case 1: /* ROW_FORMAT=COMPACT */
3262 return(ULINT_UNDEFINED);
3263
3264 case IBUF_REC_INFO_SIZE:
3265 counter = mach_read_from_2(field + IBUF_REC_OFFSET_COUNTER);
3266 ut_a(counter < 0xFFFF);
3267 return(counter + 1);
3268 }
3269}
3270
3271#ifdef UNIV_DEBUG
3272# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \
3273 ibuf_get_entry_counter_func(space,page_no,rec,mtr,exact_leaf)
3274#else /* UNIV_DEBUG */
3275# define ibuf_get_entry_counter(space,page_no,rec,mtr,exact_leaf) \
3276 ibuf_get_entry_counter_func(space,page_no,rec,exact_leaf)
3277#endif /* UNIV_DEBUG */
3278
3279/****************************************************************//**
3280Calculate the counter field for an entry based on the current
3281last record in ibuf for (space, page_no).
3282@return the counter field, or ULINT_UNDEFINED
3283if we should abort this insertion to ibuf */
3284static
3285ulint
3286ibuf_get_entry_counter_func(
3287/*========================*/
3288 ulint space, /*!< in: space id of entry */
3289 ulint page_no, /*!< in: page number of entry */
3290 const rec_t* rec, /*!< in: the record preceding the
3291 insertion point */
3292#ifdef UNIV_DEBUG
3293 mtr_t* mtr, /*!< in: mini-transaction */
3294#endif /* UNIV_DEBUG */
3295 ibool only_leaf) /*!< in: TRUE if this is the only
3296 leaf page that can contain entries
3297 for (space,page_no), that is, there
3298 was no exact match for (space,page_no)
3299 in the node pointer */
3300{
3301 ut_ad(ibuf_inside(mtr));
3302 ut_ad(mtr_memo_contains_page(mtr, rec, MTR_MEMO_PAGE_X_FIX));
3303 ut_ad(page_validate(page_align(rec), ibuf->index));
3304
3305 if (page_rec_is_supremum(rec)) {
3306 /* This is just for safety. The record should be a
3307 page infimum or a user record. */
3308 ut_ad(0);
3309 return(ULINT_UNDEFINED);
3310 } else if (!page_rec_is_infimum(rec)) {
3311 return(ibuf_get_entry_counter_low(mtr, rec, space, page_no));
3312 } else if (only_leaf || !page_has_prev(page_align(rec))) {
3313 /* The parent node pointer did not contain the
3314 searched for (space, page_no), which means that the
3315 search ended on the correct page regardless of the
3316 counter value, and since we're at the infimum record,
3317 there are no existing records. */
3318 return(0);
3319 } else {
3320 /* We used to read the previous page here. It would
3321 break the latching order, because the caller has
3322 buffer-fixed an insert buffer bitmap page. */
3323 return(ULINT_UNDEFINED);
3324 }
3325}
3326
3327/** Buffer an operation in the insert/delete buffer, instead of doing it
3328directly to the disk page, if this is possible.
3329@param[in] mode BTR_MODIFY_PREV or BTR_MODIFY_TREE
3330@param[in] op operation type
3331@param[in] no_counter TRUE=use 5.0.3 format; FALSE=allow delete
3332buffering
3333@param[in] entry index entry to insert
3334@param[in] entry_size rec_get_converted_size(index, entry)
3335@param[in,out] index index where to insert; must not be unique
3336or clustered
3337@param[in] page_id page id where to insert
3338@param[in] page_size page size
3339@param[in,out] thr query thread
3340@return DB_SUCCESS, DB_STRONG_FAIL or other error */
3341static MY_ATTRIBUTE((warn_unused_result))
3342dberr_t
3343ibuf_insert_low(
3344 ulint mode,
3345 ibuf_op_t op,
3346 ibool no_counter,
3347 const dtuple_t* entry,
3348 ulint entry_size,
3349 dict_index_t* index,
3350 const page_id_t& page_id,
3351 const page_size_t& page_size,
3352 que_thr_t* thr)
3353{
3354 big_rec_t* dummy_big_rec;
3355 btr_pcur_t pcur;
3356 btr_cur_t* cursor;
3357 dtuple_t* ibuf_entry;
3358 mem_heap_t* offsets_heap = NULL;
3359 mem_heap_t* heap;
3360 ulint* offsets = NULL;
3361 ulint buffered;
3362 lint min_n_recs;
3363 rec_t* ins_rec;
3364 ibool old_bit_value;
3365 page_t* bitmap_page;
3366 buf_block_t* block;
3367 page_t* root;
3368 dberr_t err;
3369 ibool do_merge;
3370 ulint space_ids[IBUF_MAX_N_PAGES_MERGED];
3371 ulint page_nos[IBUF_MAX_N_PAGES_MERGED];
3372 ulint n_stored;
3373 mtr_t mtr;
3374 mtr_t bitmap_mtr;
3375
3376 ut_a(!dict_index_is_clust(index));
3377 ut_ad(!dict_index_is_spatial(index));
3378 ut_ad(dtuple_check_typed(entry));
3379 ut_ad(!no_counter || op == IBUF_OP_INSERT);
3380 ut_ad(page_id.space() == index->table->space->id);
3381 ut_a(op < IBUF_OP_COUNT);
3382
3383 do_merge = FALSE;
3384
3385 /* Perform dirty reads of ibuf->size and ibuf->max_size, to
3386 reduce ibuf_mutex contention. Given that ibuf->max_size and
3387 ibuf->size fit in a machine word, this should be OK; at worst
3388 we are doing some excessive ibuf_contract() or occasionally
3389 skipping an ibuf_contract(). */
3390 if (ibuf->max_size == 0) {
3391 return(DB_STRONG_FAIL);
3392 }
3393
3394 if (ibuf->size >= ibuf->max_size + IBUF_CONTRACT_DO_NOT_INSERT) {
3395 /* Insert buffer is now too big, contract it but do not try
3396 to insert */
3397
3398
3399#ifdef UNIV_IBUF_DEBUG
3400 fputs("Ibuf too big\n", stderr);
3401#endif
3402 ibuf_contract(true);
3403
3404 return(DB_STRONG_FAIL);
3405 }
3406
3407 heap = mem_heap_create(1024);
3408
3409 /* Build the entry which contains the space id and the page number
3410 as the first fields and the type information for other fields, and
3411 which will be inserted to the insert buffer. Using a counter value
3412 of 0xFFFF we find the last record for (space, page_no), from which
3413 we can then read the counter value N and use N + 1 in the record we
3414 insert. (We patch the ibuf_entry's counter field to the correct
3415 value just before actually inserting the entry.) */
3416
3417 ibuf_entry = ibuf_entry_build(
3418 op, index, entry, page_id.space(), page_id.page_no(),
3419 no_counter ? ULINT_UNDEFINED : 0xFFFF, heap);
3420
3421 /* Open a cursor to the insert buffer tree to calculate if we can add
3422 the new entry to it without exceeding the free space limit for the
3423 page. */
3424
3425 if (BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE) {
3426 for (;;) {
3427 mutex_enter(&ibuf_pessimistic_insert_mutex);
3428 mutex_enter(&ibuf_mutex);
3429
3430 if (UNIV_LIKELY(ibuf_data_enough_free_for_insert())) {
3431
3432 break;
3433 }
3434
3435 mutex_exit(&ibuf_mutex);
3436 mutex_exit(&ibuf_pessimistic_insert_mutex);
3437
3438 if (!ibuf_add_free_page()) {
3439
3440 mem_heap_free(heap);
3441 return(DB_STRONG_FAIL);
3442 }
3443 }
3444 }
3445
3446 ibuf_mtr_start(&mtr);
3447
3448 btr_pcur_open(ibuf->index, ibuf_entry, PAGE_CUR_LE, mode, &pcur, &mtr);
3449 ut_ad(page_validate(btr_pcur_get_page(&pcur), ibuf->index));
3450
3451 /* Find out the volume of already buffered inserts for the same index
3452 page */
3453 min_n_recs = 0;
3454 buffered = ibuf_get_volume_buffered(&pcur,
3455 page_id.space(),
3456 page_id.page_no(),
3457 op == IBUF_OP_DELETE
3458 ? &min_n_recs
3459 : NULL, &mtr);
3460
3461 if (op == IBUF_OP_DELETE
3462 && (min_n_recs < 2 || buf_pool_watch_occurred(page_id))) {
3463 /* The page could become empty after the record is
3464 deleted, or the page has been read in to the buffer
3465 pool. Refuse to buffer the operation. */
3466
3467 /* The buffer pool watch is needed for IBUF_OP_DELETE
3468 because of latching order considerations. We can
3469 check buf_pool_watch_occurred() only after latching
3470 the insert buffer B-tree pages that contain buffered
3471 changes for the page. We never buffer IBUF_OP_DELETE,
3472 unless some IBUF_OP_INSERT or IBUF_OP_DELETE_MARK have
3473 been previously buffered for the page. Because there
3474 are buffered operations for the page, the insert
3475 buffer B-tree page latches held by mtr will guarantee
3476 that no changes for the user page will be merged
3477 before mtr_commit(&mtr). We must not mtr_commit(&mtr)
3478 until after the IBUF_OP_DELETE has been buffered. */
3479
3480fail_exit:
3481 if (BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE) {
3482 mutex_exit(&ibuf_mutex);
3483 mutex_exit(&ibuf_pessimistic_insert_mutex);
3484 }
3485
3486 err = DB_STRONG_FAIL;
3487 goto func_exit;
3488 }
3489
3490 /* After this point, the page could still be loaded to the
3491 buffer pool, but we do not have to care about it, since we are
3492 holding a latch on the insert buffer leaf page that contains
3493 buffered changes for (space, page_no). If the page enters the
3494 buffer pool, buf_page_io_complete() for (space, page_no) will
3495 have to acquire a latch on the same insert buffer leaf page,
3496 which it cannot do until we have buffered the IBUF_OP_DELETE
3497 and done mtr_commit(&mtr) to release the latch. */
3498
3499#ifdef UNIV_IBUF_COUNT_DEBUG
3500 ut_a((buffered == 0) || ibuf_count_get(page_id));
3501#endif
3502 ibuf_mtr_start(&bitmap_mtr);
3503 index->set_modified(bitmap_mtr);
3504
3505 bitmap_page = ibuf_bitmap_get_map_page(page_id, page_size,
3506 &bitmap_mtr);
3507
3508 /* We check if the index page is suitable for buffered entries */
3509
3510 if (buf_page_peek(page_id)
3511 || lock_rec_expl_exist_on_page(page_id.space(),
3512 page_id.page_no())) {
3513
3514 ibuf_mtr_commit(&bitmap_mtr);
3515 goto fail_exit;
3516 }
3517
3518 if (op == IBUF_OP_INSERT) {
3519 ulint bits = ibuf_bitmap_page_get_bits(
3520 bitmap_page, page_id, page_size, IBUF_BITMAP_FREE,
3521 &bitmap_mtr);
3522
3523 if (buffered + entry_size + page_dir_calc_reserved_space(1)
3524 > ibuf_index_page_calc_free_from_bits(page_size, bits)) {
3525 /* Release the bitmap page latch early. */
3526 ibuf_mtr_commit(&bitmap_mtr);
3527
3528 /* It may not fit */
3529 do_merge = TRUE;
3530
3531 ibuf_get_merge_page_nos(FALSE,
3532 btr_pcur_get_rec(&pcur), &mtr,
3533 space_ids,
3534 page_nos, &n_stored);
3535
3536 goto fail_exit;
3537 }
3538 }
3539
3540 if (!no_counter) {
3541 /* Patch correct counter value to the entry to
3542 insert. This can change the insert position, which can
3543 result in the need to abort in some cases. */
3544 ulint counter = ibuf_get_entry_counter(
3545 page_id.space(), page_id.page_no(),
3546 btr_pcur_get_rec(&pcur), &mtr,
3547 btr_pcur_get_btr_cur(&pcur)->low_match
3548 < IBUF_REC_FIELD_METADATA);
3549 dfield_t* field;
3550
3551 if (counter == ULINT_UNDEFINED) {
3552 ibuf_mtr_commit(&bitmap_mtr);
3553 goto fail_exit;
3554 }
3555
3556 field = dtuple_get_nth_field(
3557 ibuf_entry, IBUF_REC_FIELD_METADATA);
3558 mach_write_to_2(
3559 (byte*) dfield_get_data(field)
3560 + IBUF_REC_OFFSET_COUNTER, counter);
3561 }
3562
3563 /* Set the bitmap bit denoting that the insert buffer contains
3564 buffered entries for this index page, if the bit is not set yet */
3565
3566 old_bit_value = ibuf_bitmap_page_get_bits(
3567 bitmap_page, page_id, page_size,
3568 IBUF_BITMAP_BUFFERED, &bitmap_mtr);
3569
3570 if (!old_bit_value) {
3571 ibuf_bitmap_page_set_bits(bitmap_page, page_id, page_size,
3572 IBUF_BITMAP_BUFFERED, TRUE,
3573 &bitmap_mtr);
3574 }
3575
3576 ibuf_mtr_commit(&bitmap_mtr);
3577
3578 cursor = btr_pcur_get_btr_cur(&pcur);
3579
3580 if (mode == BTR_MODIFY_PREV) {
3581 err = btr_cur_optimistic_insert(
3582 BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
3583 cursor, &offsets, &offsets_heap,
3584 ibuf_entry, &ins_rec,
3585 &dummy_big_rec, 0, thr, &mtr);
3586 block = btr_cur_get_block(cursor);
3587 ut_ad(block->page.id.space() == IBUF_SPACE_ID);
3588
3589 /* If this is the root page, update ibuf->empty. */
3590 if (block->page.id.page_no() == FSP_IBUF_TREE_ROOT_PAGE_NO) {
3591 const page_t* root = buf_block_get_frame(block);
3592
3593 ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
3594 ut_ad(page_get_page_no(root)
3595 == FSP_IBUF_TREE_ROOT_PAGE_NO);
3596
3597 ibuf->empty = page_is_empty(root);
3598 }
3599 } else {
3600 ut_ad(BTR_LATCH_MODE_WITHOUT_INTENTION(mode)
3601 == BTR_MODIFY_TREE);
3602
3603 /* We acquire an sx-latch to the root page before the insert,
3604 because a pessimistic insert releases the tree x-latch,
3605 which would cause the sx-latching of the root after that to
3606 break the latching order. */
3607
3608 root = ibuf_tree_root_get(&mtr);
3609
3610 err = btr_cur_optimistic_insert(
3611 BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
3612 cursor, &offsets, &offsets_heap,
3613 ibuf_entry, &ins_rec,
3614 &dummy_big_rec, 0, thr, &mtr);
3615
3616 if (err == DB_FAIL) {
3617 err = btr_cur_pessimistic_insert(
3618 BTR_NO_LOCKING_FLAG | BTR_NO_UNDO_LOG_FLAG,
3619 cursor, &offsets, &offsets_heap,
3620 ibuf_entry, &ins_rec,
3621 &dummy_big_rec, 0, thr, &mtr);
3622 }
3623
3624 mutex_exit(&ibuf_pessimistic_insert_mutex);
3625 ibuf_size_update(root);
3626 mutex_exit(&ibuf_mutex);
3627 ibuf->empty = page_is_empty(root);
3628
3629 block = btr_cur_get_block(cursor);
3630 ut_ad(block->page.id.space() == IBUF_SPACE_ID);
3631 }
3632
3633 if (offsets_heap) {
3634 mem_heap_free(offsets_heap);
3635 }
3636
3637 if (err == DB_SUCCESS && op != IBUF_OP_DELETE) {
3638 /* Update the page max trx id field */
3639 page_update_max_trx_id(block, NULL,
3640 thr_get_trx(thr)->id, &mtr);
3641 }
3642
3643func_exit:
3644#ifdef UNIV_IBUF_COUNT_DEBUG
3645 if (err == DB_SUCCESS) {
3646
3647 ib::info() << "Incrementing ibuf count of page " << page_id
3648 << " from " << ibuf_count_get(space, page_no)
3649 << " by 1";
3650
3651 ibuf_count_set(page_id, ibuf_count_get(page_id) + 1);
3652 }
3653#endif
3654
3655 ibuf_mtr_commit(&mtr);
3656 btr_pcur_close(&pcur);
3657
3658 mem_heap_free(heap);
3659
3660 if (err == DB_SUCCESS
3661 && BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE) {
3662 ibuf_contract_after_insert(entry_size);
3663 }
3664
3665 if (do_merge) {
3666#ifdef UNIV_IBUF_DEBUG
3667 ut_a(n_stored <= IBUF_MAX_N_PAGES_MERGED);
3668#endif
3669 buf_read_ibuf_merge_pages(false, space_ids,
3670 page_nos, n_stored);
3671 }
3672
3673 return(err);
3674}
3675
/** Buffer an operation in the insert/delete buffer, instead of doing it
directly to the disk page, if this is possible. Does not do it if the index
is clustered or unique.
@param[in]	op		operation type
@param[in]	entry		index entry to insert
@param[in,out]	index		index where to insert
@param[in]	page_id		page id where to insert
@param[in]	page_size	page size
@param[in,out]	thr		query thread
@return TRUE if success */
ibool
ibuf_insert(
	ibuf_op_t		op,
	const dtuple_t*		entry,
	dict_index_t*		index,
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	que_thr_t*		thr)
{
	dberr_t		err;
	ulint		entry_size;
	ibool		no_counter;
	/* Read the settable global variable ibuf_use only once in
	this function, so that we will have a consistent view of it. */
	ibuf_use_t	use		= ibuf_use;
	DBUG_ENTER("ibuf_insert");

	DBUG_PRINT("ibuf", ("op: %d, space: " UINT32PF ", page_no: " UINT32PF,
			    op, page_id.space(), page_id.page_no()));

	ut_ad(dtuple_check_typed(entry));
	ut_ad(page_id.space() != SRV_TMP_SPACE_ID);

	ut_a(!dict_index_is_clust(index));
	ut_ad(!index->table->is_temporary());

	/* When only inserts are buffered (or buffering is off), use the
	old record format without the counter field (the "5.0.3 format"
	mentioned in ibuf_insert_low()). */
	no_counter = use <= IBUF_USE_INSERT;

	/* Decide, based on the operation and the configured ibuf_use
	mode, whether to refuse buffering, to check the buffer pool
	watch first, or to proceed directly. */
	switch (op) {
	case IBUF_OP_INSERT:
		switch (use) {
		case IBUF_USE_NONE:
		case IBUF_USE_DELETE:
		case IBUF_USE_DELETE_MARK:
			DBUG_RETURN(FALSE);
		case IBUF_USE_INSERT:
		case IBUF_USE_INSERT_DELETE_MARK:
		case IBUF_USE_ALL:
			goto check_watch;
		}
		break;
	case IBUF_OP_DELETE_MARK:
		switch (use) {
		case IBUF_USE_NONE:
		case IBUF_USE_INSERT:
			DBUG_RETURN(FALSE);
		case IBUF_USE_DELETE_MARK:
		case IBUF_USE_DELETE:
		case IBUF_USE_INSERT_DELETE_MARK:
		case IBUF_USE_ALL:
			ut_ad(!no_counter);
			goto check_watch;
		}
		break;
	case IBUF_OP_DELETE:
		switch (use) {
		case IBUF_USE_NONE:
		case IBUF_USE_INSERT:
		case IBUF_USE_INSERT_DELETE_MARK:
			DBUG_RETURN(FALSE);
		case IBUF_USE_DELETE_MARK:
		case IBUF_USE_DELETE:
		case IBUF_USE_ALL:
			ut_ad(!no_counter);
			/* IBUF_OP_DELETE skips the watch check; see the
			comment at check_watch below. */
			goto skip_watch;
		}
		break;
	case IBUF_OP_COUNT:
		break;
	}

	/* unknown op or use */
	ut_error;

check_watch:
	/* If a thread attempts to buffer an insert on a page while a
	purge is in progress on the same page, the purge must not be
	buffered, because it could remove a record that was
	re-inserted later.  For simplicity, we block the buffering of
	all operations on a page that has a purge pending.

	We do not check this in the IBUF_OP_DELETE case, because that
	would always trigger the buffer pool watch during purge and
	thus prevent the buffering of delete operations.  We assume
	that the issuer of IBUF_OP_DELETE has called
	buf_pool_watch_set(space, page_no). */

	{
		buf_pool_t*	buf_pool = buf_pool_get(page_id);
		buf_page_t*	bpage
			= buf_page_get_also_watch(buf_pool, page_id);

		if (bpage != NULL) {
			/* A buffer pool watch has been set or the
			page has been read into the buffer pool.
			Do not buffer the request.  If a purge operation
			is being buffered, have this request executed
			directly on the page in the buffer pool after the
			buffered entries for this page have been merged. */
			DBUG_RETURN(FALSE);
		}
	}

skip_watch:
	entry_size = rec_get_converted_size(index, entry, 0);

	/* Refuse to buffer entries that would not fit on an index page
	together with at least one other entry. */
	if (entry_size
	    >= page_get_free_space_of_empty(dict_table_is_comp(index->table))
	    / 2) {

		DBUG_RETURN(FALSE);
	}

	/* Try an optimistic (single leaf page) insert first; retry
	pessimistically, possibly extending the ibuf tree, on DB_FAIL. */
	err = ibuf_insert_low(BTR_MODIFY_PREV, op, no_counter,
			      entry, entry_size,
			      index, page_id, page_size, thr);
	if (err == DB_FAIL) {
		err = ibuf_insert_low(BTR_MODIFY_TREE | BTR_LATCH_FOR_INSERT,
				      op, no_counter, entry, entry_size,
				      index, page_id, page_size, thr);
	}

	if (err == DB_SUCCESS) {
#ifdef UNIV_IBUF_DEBUG
		/* fprintf(stderr, "Ibuf insert for page no %lu of index %s\n",
		page_no, index->name); */
#endif
		DBUG_RETURN(TRUE);

	} else {
		ut_a(err == DB_STRONG_FAIL || err == DB_TOO_BIG_RECORD);

		DBUG_RETURN(FALSE);
	}
}
3821
3822/********************************************************************//**
3823During merge, inserts to an index page a secondary index entry extracted
3824from the insert buffer.
3825@return newly inserted record */
3826static MY_ATTRIBUTE((nonnull))
3827rec_t*
3828ibuf_insert_to_index_page_low(
3829/*==========================*/
3830 const dtuple_t* entry, /*!< in: buffered entry to insert */
3831 buf_block_t* block, /*!< in/out: index page where the buffered
3832 entry should be placed */
3833 dict_index_t* index, /*!< in: record descriptor */
3834 ulint** offsets,/*!< out: offsets on *rec */
3835 mem_heap_t* heap, /*!< in/out: memory heap */
3836 mtr_t* mtr, /*!< in/out: mtr */
3837 page_cur_t* page_cur)/*!< in/out: cursor positioned on the record
3838 after which to insert the buffered entry */
3839{
3840 const page_t* page;
3841 const page_t* bitmap_page;
3842 ulint old_bits;
3843 rec_t* rec;
3844 DBUG_ENTER("ibuf_insert_to_index_page_low");
3845
3846 rec = page_cur_tuple_insert(page_cur, entry, index,
3847 offsets, &heap, 0, mtr);
3848 if (rec != NULL) {
3849 DBUG_RETURN(rec);
3850 }
3851
3852 /* Page reorganization or recompression should already have
3853 been attempted by page_cur_tuple_insert(). Besides, per
3854 ibuf_index_page_calc_free_zip() the page should not have been
3855 recompressed or reorganized. */
3856 ut_ad(!buf_block_get_page_zip(block));
3857
3858 /* If the record did not fit, reorganize */
3859
3860 btr_page_reorganize(page_cur, index, mtr);
3861
3862 /* This time the record must fit */
3863
3864 rec = page_cur_tuple_insert(page_cur, entry, index,
3865 offsets, &heap, 0, mtr);
3866 if (rec != NULL) {
3867 DBUG_RETURN(rec);
3868 }
3869
3870 page = buf_block_get_frame(block);
3871
3872 ib::error() << "Insert buffer insert fails; page free "
3873 << page_get_max_insert_size(page, 1) << ", dtuple size "
3874 << rec_get_converted_size(index, entry, 0);
3875
3876 fputs("InnoDB: Cannot insert index record ", stderr);
3877 dtuple_print(stderr, entry);
3878 fputs("\nInnoDB: The table where this index record belongs\n"
3879 "InnoDB: is now probably corrupt. Please run CHECK TABLE on\n"
3880 "InnoDB: that table.\n", stderr);
3881
3882 bitmap_page = ibuf_bitmap_get_map_page(block->page.id,
3883 block->page.size, mtr);
3884 old_bits = ibuf_bitmap_page_get_bits(
3885 bitmap_page, block->page.id, block->page.size,
3886 IBUF_BITMAP_FREE, mtr);
3887
3888 ib::error() << "page " << block->page.id << ", size "
3889 << block->page.size.physical() << ", bitmap bits " << old_bits;
3890
3891 ib::error() << BUG_REPORT_MSG;
3892
3893 ut_ad(0);
3894 DBUG_RETURN(NULL);
3895}
3896
3897/************************************************************************
3898During merge, inserts to an index page a secondary index entry extracted
3899from the insert buffer. */
3900static
3901void
3902ibuf_insert_to_index_page(
3903/*======================*/
3904 const dtuple_t* entry, /*!< in: buffered entry to insert */
3905 buf_block_t* block, /*!< in/out: index page where the buffered entry
3906 should be placed */
3907 dict_index_t* index, /*!< in: record descriptor */
3908 mtr_t* mtr) /*!< in: mtr */
3909{
3910 page_cur_t page_cur;
3911 ulint low_match;
3912 page_t* page = buf_block_get_frame(block);
3913 rec_t* rec;
3914 ulint* offsets;
3915 mem_heap_t* heap;
3916
3917 DBUG_ENTER("ibuf_insert_to_index_page");
3918
3919 DBUG_PRINT("ibuf", ("page " UINT32PF ":" UINT32PF,
3920 block->page.id.space(),
3921 block->page.id.page_no()));
3922
3923 ut_ad(!dict_index_is_online_ddl(index));// this is an ibuf_dummy index
3924 ut_ad(ibuf_inside(mtr));
3925 ut_ad(dtuple_check_typed(entry));
3926#ifdef BTR_CUR_HASH_ADAPT
3927 /* A change buffer merge must occur before users are granted
3928 any access to the page. No adaptive hash index entries may
3929 point to a freshly read page. */
3930 ut_ad(!block->index);
3931 assert_block_ahi_empty(block);
3932#endif /* BTR_CUR_HASH_ADAPT */
3933 ut_ad(mtr->is_named_space(block->page.id.space()));
3934
3935 if (UNIV_UNLIKELY(dict_table_is_comp(index->table)
3936 != (ibool)!!page_is_comp(page))) {
3937 ib::warn() << "Trying to insert a record from the insert"
3938 " buffer to an index page but the 'compact' flag does"
3939 " not match!";
3940 goto dump;
3941 }
3942
3943 rec = page_rec_get_next(page_get_infimum_rec(page));
3944
3945 if (page_rec_is_supremum(rec)) {
3946 ib::warn() << "Trying to insert a record from the insert"
3947 " buffer to an index page but the index page"
3948 " is empty!";
3949 goto dump;
3950 }
3951
3952 if (!rec_n_fields_is_sane(index, rec, entry)) {
3953 ib::warn() << "Trying to insert a record from the insert"
3954 " buffer to an index page but the number of fields"
3955 " does not match!";
3956 rec_print(stderr, rec, index);
3957dump:
3958 dtuple_print(stderr, entry);
3959 ut_ad(0);
3960
3961 ib::warn() << "The table where this index record belongs"
3962 " is now probably corrupt. Please run CHECK TABLE on"
3963 " your tables. " << BUG_REPORT_MSG;
3964
3965 DBUG_VOID_RETURN;
3966 }
3967
3968 low_match = page_cur_search(block, index, entry, &page_cur);
3969
3970 heap = mem_heap_create(
3971 sizeof(upd_t)
3972 + REC_OFFS_HEADER_SIZE * sizeof(*offsets)
3973 + dtuple_get_n_fields(entry)
3974 * (sizeof(upd_field_t) + sizeof *offsets));
3975
3976 if (UNIV_UNLIKELY(low_match == dtuple_get_n_fields(entry))) {
3977 upd_t* update;
3978 page_zip_des_t* page_zip;
3979
3980 rec = page_cur_get_rec(&page_cur);
3981
3982 /* This is based on
3983 row_ins_sec_index_entry_by_modify(BTR_MODIFY_LEAF). */
3984 ut_ad(rec_get_deleted_flag(rec, page_is_comp(page)));
3985
3986 offsets = rec_get_offsets(rec, index, NULL, true,
3987 ULINT_UNDEFINED, &heap);
3988 update = row_upd_build_sec_rec_difference_binary(
3989 rec, index, offsets, entry, heap);
3990
3991 page_zip = buf_block_get_page_zip(block);
3992
3993 if (update->n_fields == 0) {
3994 /* The records only differ in the delete-mark.
3995 Clear the delete-mark, like we did before
3996 Bug #56680 was fixed. */
3997 btr_cur_set_deleted_flag_for_ibuf(
3998 rec, page_zip, FALSE, mtr);
3999 goto updated_in_place;
4000 }
4001
4002 /* Copy the info bits. Clear the delete-mark. */
4003 update->info_bits = rec_get_info_bits(rec, page_is_comp(page));
4004 update->info_bits &= ~REC_INFO_DELETED_FLAG;
4005
4006 /* We cannot invoke btr_cur_optimistic_update() here,
4007 because we do not have a btr_cur_t or que_thr_t,
4008 as the insert buffer merge occurs at a very low level. */
4009 if (!row_upd_changes_field_size_or_external(index, offsets,
4010 update)
4011 && (!page_zip || btr_cur_update_alloc_zip(
4012 page_zip, &page_cur, index, offsets,
4013 rec_offs_size(offsets), false, mtr))) {
4014 /* This is the easy case. Do something similar
4015 to btr_cur_update_in_place(). */
4016 rec = page_cur_get_rec(&page_cur);
4017 row_upd_rec_in_place(rec, index, offsets,
4018 update, page_zip);
4019
4020 /* Log the update in place operation. During recovery
4021 MLOG_COMP_REC_UPDATE_IN_PLACE/MLOG_REC_UPDATE_IN_PLACE
4022 expects trx_id, roll_ptr for secondary indexes. So we
4023 just write dummy trx_id(0), roll_ptr(0) */
4024 btr_cur_update_in_place_log(BTR_KEEP_SYS_FLAG, rec,
4025 index, update, 0, 0, mtr);
4026
4027 DBUG_EXECUTE_IF(
4028 "crash_after_log_ibuf_upd_inplace",
4029 log_buffer_flush_to_disk();
4030 ib::info() << "Wrote log record for ibuf"
4031 " update in place operation";
4032 DBUG_SUICIDE();
4033 );
4034
4035 goto updated_in_place;
4036 }
4037
4038 /* btr_cur_update_alloc_zip() may have changed this */
4039 rec = page_cur_get_rec(&page_cur);
4040
4041 /* A collation may identify values that differ in
4042 storage length.
4043 Some examples (1 or 2 bytes):
4044 utf8_turkish_ci: I = U+0131 LATIN SMALL LETTER DOTLESS I
4045 utf8_general_ci: S = U+00DF LATIN SMALL LETTER SHARP S
4046 utf8_general_ci: A = U+00E4 LATIN SMALL LETTER A WITH DIAERESIS
4047
4048 latin1_german2_ci: SS = U+00DF LATIN SMALL LETTER SHARP S
4049
4050 Examples of a character (3-byte UTF-8 sequence)
4051 identified with 2 or 4 characters (1-byte UTF-8 sequences):
4052
4053 utf8_unicode_ci: 'II' = U+2171 SMALL ROMAN NUMERAL TWO
4054 utf8_unicode_ci: '(10)' = U+247D PARENTHESIZED NUMBER TEN
4055 */
4056
4057 /* Delete the different-length record, and insert the
4058 buffered one. */
4059
4060 lock_rec_store_on_page_infimum(block, rec);
4061 page_cur_delete_rec(&page_cur, index, offsets, mtr);
4062 page_cur_move_to_prev(&page_cur);
4063 rec = ibuf_insert_to_index_page_low(entry, block, index,
4064 &offsets, heap, mtr,
4065 &page_cur);
4066
4067 ut_ad(!cmp_dtuple_rec(entry, rec, offsets));
4068 lock_rec_restore_from_page_infimum(block, rec, block);
4069 } else {
4070 offsets = NULL;
4071 ibuf_insert_to_index_page_low(entry, block, index,
4072 &offsets, heap, mtr,
4073 &page_cur);
4074 }
4075updated_in_place:
4076 mem_heap_free(heap);
4077
4078 DBUG_VOID_RETURN;
4079}
4080
4081/****************************************************************//**
4082During merge, sets the delete mark on a record for a secondary index
4083entry. */
4084static
4085void
4086ibuf_set_del_mark(
4087/*==============*/
4088 const dtuple_t* entry, /*!< in: entry */
4089 buf_block_t* block, /*!< in/out: block */
4090 const dict_index_t* index, /*!< in: record descriptor */
4091 mtr_t* mtr) /*!< in: mtr */
4092{
4093 page_cur_t page_cur;
4094 ulint low_match;
4095
4096 ut_ad(ibuf_inside(mtr));
4097 ut_ad(dtuple_check_typed(entry));
4098
4099 low_match = page_cur_search(block, index, entry, &page_cur);
4100
4101 if (low_match == dtuple_get_n_fields(entry)) {
4102 rec_t* rec;
4103 page_zip_des_t* page_zip;
4104
4105 rec = page_cur_get_rec(&page_cur);
4106 page_zip = page_cur_get_page_zip(&page_cur);
4107
4108 /* Delete mark the old index record. According to a
4109 comment in row_upd_sec_index_entry(), it can already
4110 have been delete marked if a lock wait occurred in
4111 row_ins_sec_index_entry() in a previous invocation of
4112 row_upd_sec_index_entry(). */
4113
4114 if (UNIV_LIKELY
4115 (!rec_get_deleted_flag(
4116 rec, dict_table_is_comp(index->table)))) {
4117 btr_cur_set_deleted_flag_for_ibuf(rec, page_zip,
4118 TRUE, mtr);
4119 }
4120 } else {
4121 const page_t* page
4122 = page_cur_get_page(&page_cur);
4123 const buf_block_t* block
4124 = page_cur_get_block(&page_cur);
4125
4126 ib::error() << "Unable to find a record to delete-mark";
4127 fputs("InnoDB: tuple ", stderr);
4128 dtuple_print(stderr, entry);
4129 fputs("\n"
4130 "InnoDB: record ", stderr);
4131 rec_print(stderr, page_cur_get_rec(&page_cur), index);
4132
4133 ib::error() << "page " << block->page.id << " ("
4134 << page_get_n_recs(page) << " records, index id "
4135 << btr_page_get_index_id(page) << ").";
4136
4137 ib::error() << BUG_REPORT_MSG;
4138 ut_ad(0);
4139 }
4140}
4141
4142/****************************************************************//**
4143During merge, delete a record for a secondary index entry. */
4144static
4145void
4146ibuf_delete(
4147/*========*/
4148 const dtuple_t* entry, /*!< in: entry */
4149 buf_block_t* block, /*!< in/out: block */
4150 dict_index_t* index, /*!< in: record descriptor */
4151 mtr_t* mtr) /*!< in/out: mtr; must be committed
4152 before latching any further pages */
4153{
4154 page_cur_t page_cur;
4155 ulint low_match;
4156
4157 ut_ad(ibuf_inside(mtr));
4158 ut_ad(dtuple_check_typed(entry));
4159 ut_ad(!dict_index_is_spatial(index));
4160
4161 low_match = page_cur_search(block, index, entry, &page_cur);
4162
4163 if (low_match == dtuple_get_n_fields(entry)) {
4164 page_zip_des_t* page_zip= buf_block_get_page_zip(block);
4165 page_t* page = buf_block_get_frame(block);
4166 rec_t* rec = page_cur_get_rec(&page_cur);
4167
4168 /* TODO: the below should probably be a separate function,
4169 it's a bastardized version of btr_cur_optimistic_delete. */
4170
4171 ulint offsets_[REC_OFFS_NORMAL_SIZE];
4172 ulint* offsets = offsets_;
4173 mem_heap_t* heap = NULL;
4174 ulint max_ins_size = 0;
4175
4176 rec_offs_init(offsets_);
4177
4178 offsets = rec_get_offsets(
4179 rec, index, offsets, true, ULINT_UNDEFINED, &heap);
4180
4181 if (page_get_n_recs(page) <= 1
4182 || !(REC_INFO_DELETED_FLAG
4183 & rec_get_info_bits(rec, page_is_comp(page)))) {
4184 /* Refuse to purge the last record or a
4185 record that has not been marked for deletion. */
4186 ib::error() << "Unable to purge a record";
4187 fputs("InnoDB: tuple ", stderr);
4188 dtuple_print(stderr, entry);
4189 fputs("\n"
4190 "InnoDB: record ", stderr);
4191 rec_print_new(stderr, rec, offsets);
4192 fprintf(stderr, "\nspace " UINT32PF " offset " UINT32PF
4193 " (%u records, index id %llu)\n"
4194 "InnoDB: Submit a detailed bug report"
4195 " to https://jira.mariadb.org/\n",
4196 block->page.id.space(),
4197 block->page.id.page_no(),
4198 (unsigned) page_get_n_recs(page),
4199 (ulonglong) btr_page_get_index_id(page));
4200
4201 ut_ad(0);
4202 return;
4203 }
4204
4205 lock_update_delete(block, rec);
4206
4207 if (!page_zip) {
4208 max_ins_size
4209 = page_get_max_insert_size_after_reorganize(
4210 page, 1);
4211 }
4212#ifdef UNIV_ZIP_DEBUG
4213 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4214#endif /* UNIV_ZIP_DEBUG */
4215 page_cur_delete_rec(&page_cur, index, offsets, mtr);
4216#ifdef UNIV_ZIP_DEBUG
4217 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4218#endif /* UNIV_ZIP_DEBUG */
4219
4220 if (page_zip) {
4221 ibuf_update_free_bits_zip(block, mtr);
4222 } else {
4223 ibuf_update_free_bits_low(block, max_ins_size, mtr);
4224 }
4225
4226 if (UNIV_LIKELY_NULL(heap)) {
4227 mem_heap_free(heap);
4228 }
4229 } else {
4230 /* The record must have been purged already. */
4231 }
4232}
4233
4234/*********************************************************************//**
4235Restores insert buffer tree cursor position
4236@return TRUE if the position was restored; FALSE if not */
4237static MY_ATTRIBUTE((nonnull))
4238ibool
4239ibuf_restore_pos(
4240/*=============*/
4241 ulint space, /*!< in: space id */
4242 ulint page_no,/*!< in: index page number where the record
4243 should belong */
4244 const dtuple_t* search_tuple,
4245 /*!< in: search tuple for entries of page_no */
4246 ulint mode, /*!< in: BTR_MODIFY_LEAF or BTR_MODIFY_TREE */
4247 btr_pcur_t* pcur, /*!< in/out: persistent cursor whose
4248 position is to be restored */
4249 mtr_t* mtr) /*!< in/out: mini-transaction */
4250{
4251 ut_ad(mode == BTR_MODIFY_LEAF
4252 || BTR_LATCH_MODE_WITHOUT_INTENTION(mode) == BTR_MODIFY_TREE);
4253
4254 if (btr_pcur_restore_position(mode, pcur, mtr)) {
4255
4256 return(TRUE);
4257 }
4258
4259 if (fil_space_t* s = fil_space_acquire_silent(space)) {
4260 ib::error() << "ibuf cursor restoration fails!"
4261 " ibuf record inserted to page "
4262 << space << ":" << page_no
4263 << " in file " << s->chain.start->name;
4264 s->release();
4265
4266 ib::error() << BUG_REPORT_MSG;
4267
4268 rec_print_old(stderr, btr_pcur_get_rec(pcur));
4269 rec_print_old(stderr, pcur->old_rec);
4270 dtuple_print(stderr, search_tuple);
4271
4272 rec_print_old(stderr,
4273 page_rec_get_next(btr_pcur_get_rec(pcur)));
4274 }
4275
4276 ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
4277 return(FALSE);
4278}
4279
4280/*********************************************************************//**
4281Deletes from ibuf the record on which pcur is positioned. If we have to
4282resort to a pessimistic delete, this function commits mtr and closes
4283the cursor.
4284@return TRUE if mtr was committed and pcur closed in this operation */
4285static MY_ATTRIBUTE((warn_unused_result))
4286ibool
4287ibuf_delete_rec(
4288/*============*/
4289 ulint space, /*!< in: space id */
4290 ulint page_no,/*!< in: index page number that the record
4291 should belong to */
4292 btr_pcur_t* pcur, /*!< in: pcur positioned on the record to
4293 delete, having latch mode BTR_MODIFY_LEAF */
4294 const dtuple_t* search_tuple,
4295 /*!< in: search tuple for entries of page_no */
4296 mtr_t* mtr) /*!< in: mtr */
4297{
4298 ibool success;
4299 page_t* root;
4300 dberr_t err;
4301
4302 ut_ad(ibuf_inside(mtr));
4303 ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur)));
4304 ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur)) == page_no);
4305 ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur)) == space);
4306
4307#if defined UNIV_DEBUG || defined UNIV_IBUF_DEBUG
4308 if (ibuf_debug == 2) {
4309 /* Inject a fault (crash). We do this before trying
4310 optimistic delete, because a pessimistic delete in the
4311 change buffer would require a larger test case. */
4312
4313 /* Flag the buffered record as processed, to avoid
4314 an assertion failure after crash recovery. */
4315 btr_cur_set_deleted_flag_for_ibuf(
4316 btr_pcur_get_rec(pcur), NULL, TRUE, mtr);
4317
4318 ibuf_mtr_commit(mtr);
4319 log_write_up_to(LSN_MAX, true);
4320 DBUG_SUICIDE();
4321 }
4322#endif /* UNIV_DEBUG || UNIV_IBUF_DEBUG */
4323
4324 success = btr_cur_optimistic_delete(btr_pcur_get_btr_cur(pcur),
4325 0, mtr);
4326
4327 const page_id_t page_id(space, page_no);
4328
4329 if (success) {
4330 if (page_is_empty(btr_pcur_get_page(pcur))) {
4331 /* If a B-tree page is empty, it must be the root page
4332 and the whole B-tree must be empty. InnoDB does not
4333 allow empty B-tree pages other than the root. */
4334 root = btr_pcur_get_page(pcur);
4335
4336 ut_ad(page_get_space_id(root) == IBUF_SPACE_ID);
4337 ut_ad(page_get_page_no(root)
4338 == FSP_IBUF_TREE_ROOT_PAGE_NO);
4339
4340 /* ibuf->empty is protected by the root page latch.
4341 Before the deletion, it had to be FALSE. */
4342 ut_ad(!ibuf->empty);
4343 ibuf->empty = true;
4344 }
4345
4346#ifdef UNIV_IBUF_COUNT_DEBUG
4347 ib::info() << "Decrementing ibuf count of space " << space
4348 << " page " << page_no << " from "
4349 << ibuf_count_get(page_id) << " by 1";
4350
4351 ibuf_count_set(page_id, ibuf_count_get(page_id) - 1);
4352#endif /* UNIV_IBUF_COUNT_DEBUG */
4353
4354 return(FALSE);
4355 }
4356
4357 ut_ad(page_rec_is_user_rec(btr_pcur_get_rec(pcur)));
4358 ut_ad(ibuf_rec_get_page_no(mtr, btr_pcur_get_rec(pcur)) == page_no);
4359 ut_ad(ibuf_rec_get_space(mtr, btr_pcur_get_rec(pcur)) == space);
4360
4361 /* We have to resort to a pessimistic delete from ibuf.
4362 Delete-mark the record so that it will not be applied again,
4363 in case the server crashes before the pessimistic delete is
4364 made persistent. */
4365 btr_cur_set_deleted_flag_for_ibuf(
4366 btr_pcur_get_rec(pcur), NULL, TRUE, mtr);
4367
4368 btr_pcur_store_position(pcur, mtr);
4369 ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
4370
4371 ibuf_mtr_start(mtr);
4372 mutex_enter(&ibuf_mutex);
4373
4374 if (!ibuf_restore_pos(space, page_no, search_tuple,
4375 BTR_MODIFY_TREE | BTR_LATCH_FOR_DELETE,
4376 pcur, mtr)) {
4377
4378 mutex_exit(&ibuf_mutex);
4379 ut_ad(mtr->has_committed());
4380 goto func_exit;
4381 }
4382
4383 root = ibuf_tree_root_get(mtr);
4384
4385 btr_cur_pessimistic_delete(&err, TRUE, btr_pcur_get_btr_cur(pcur), 0,
4386 false, mtr);
4387 ut_a(err == DB_SUCCESS);
4388
4389#ifdef UNIV_IBUF_COUNT_DEBUG
4390 ibuf_count_set(page_id, ibuf_count_get(page_id) - 1);
4391#endif /* UNIV_IBUF_COUNT_DEBUG */
4392
4393 ibuf_size_update(root);
4394 mutex_exit(&ibuf_mutex);
4395
4396 ibuf->empty = page_is_empty(root);
4397 ibuf_btr_pcur_commit_specify_mtr(pcur, mtr);
4398
4399func_exit:
4400 ut_ad(mtr->has_committed());
4401 btr_pcur_close(pcur);
4402
4403 return(TRUE);
4404}
4405
/** When an index page is read from a disk to the buffer pool, this function
applies any buffered operations to the page and deletes the entries from the
insert buffer. If the page is not read, but created in the buffer pool, this
function deletes its buffered entries from the insert buffer; there can
exist entries for such a page if the page belonged to an index which
subsequently was dropped.
@param[in,out]	block	if page has been read from disk,
pointer to the page x-latched, else NULL
@param[in]	page_id	page id of the index page
@param[in]	page_size	page size, or NULL when it is unknown
(when discarding changes for a dropped tablespace)
@param[in]	update_ibuf_bitmap	normally this is set to TRUE, but
if we have deleted or are deleting the tablespace, then we naturally do not
want to update a non-existent bitmap page */
void
ibuf_merge_or_delete_for_page(
	buf_block_t*		block,
	const page_id_t&	page_id,
	const page_size_t*	page_size,
	ibool			update_ibuf_bitmap)
{
	mem_heap_t*	heap;
	btr_pcur_t	pcur;
	dtuple_t*	search_tuple;
#ifdef UNIV_IBUF_DEBUG
	ulint		volume			= 0;
#endif /* UNIV_IBUF_DEBUG */
	page_zip_des_t*	page_zip		= NULL;
	bool		corruption_noticed	= false;
	mtr_t		mtr;

	/* Counts for merged & discarded operations. */
	ulint		mops[IBUF_OP_COUNT];
	ulint		dops[IBUF_OP_COUNT];

	ut_ad(block == NULL || page_id.equals_to(block->page.id));
	ut_ad(block == NULL || buf_block_get_io_fix(block) == BUF_IO_READ);

	/* Nothing to do in crash-recovery-only modes, for the
	transaction system header page, or for temporary tablespaces
	(which never have buffered changes). */
	if (srv_force_recovery >= SRV_FORCE_NO_IBUF_MERGE
	    || trx_sys_hdr_page(page_id)
	    || fsp_is_system_temporary(page_id.space())) {
		return;
	}

	/* We cannot refer to page_size in the following, because it is passed
	as NULL (it is unknown) when buf_read_ibuf_merge_pages() is merging
	(discarding) changes for a dropped tablespace. When block != NULL or
	update_ibuf_bitmap is specified, then page_size must be known.
	That is why we will repeat the check below, with page_size in
	place of univ_page_size. Passing univ_page_size assumes that the
	uncompressed page size always is a power-of-2 multiple of the
	compressed page size. */

	if (ibuf_fixed_addr_page(page_id, univ_page_size)
	    || fsp_descr_page(page_id, univ_page_size)) {
		return;
	}

	fil_space_t*	space;

	if (update_ibuf_bitmap) {

		ut_ad(page_size != NULL);

		if (ibuf_fixed_addr_page(page_id, *page_size)
		    || fsp_descr_page(page_id, *page_size)) {
			return;
		}

		space = fil_space_acquire_silent(page_id.space());

		if (UNIV_UNLIKELY(!space)) {
			/* Do not try to read the bitmap page from the
			non-existent tablespace, delete the ibuf records */
			block = NULL;
			update_ibuf_bitmap = FALSE;
		} else {
			page_t*	bitmap_page = NULL;
			ulint	bitmap_bits = 0;

			ibuf_mtr_start(&mtr);

			/* Consult the change buffer bitmap: if the
			"buffered" bit is clear, there is nothing to
			merge and we can return early. */
			bitmap_page = ibuf_bitmap_get_map_page(
				page_id, *page_size, &mtr);

			if (bitmap_page &&
			    fil_page_get_type(bitmap_page) != FIL_PAGE_TYPE_ALLOCATED) {
				bitmap_bits = ibuf_bitmap_page_get_bits(
					bitmap_page, page_id, *page_size,
					IBUF_BITMAP_BUFFERED, &mtr);
			}

			ibuf_mtr_commit(&mtr);

			if (!bitmap_bits) {
				/* No inserts buffered for this page */

				space->release();
				return;
			}
		}
	} else if (block != NULL
		   && (ibuf_fixed_addr_page(page_id, *page_size)
		       || fsp_descr_page(page_id, *page_size))) {

		return;
	} else {
		space = NULL;
	}

	heap = mem_heap_create(512);

	/* Build the search tuple that positions us at the first ibuf
	entry for this (space, page_no). */
	search_tuple = ibuf_search_tuple_build(
		page_id.space(), page_id.page_no(), heap);

	if (block != NULL) {
		/* Move the ownership of the x-latch on the page to this OS
		thread, so that we can acquire a second x-latch on it. This
		is needed for the insert operations to the index page to pass
		the debug checks. */

		rw_lock_x_lock_move_ownership(&(block->lock));
		page_zip = buf_block_get_page_zip(block);

		if (!fil_page_index_page_check(block->frame)
		    || !page_is_leaf(block->frame)) {

			corruption_noticed = true;

			ib::error() << "Corruption in the tablespace. Bitmap"
				" shows insert buffer records to page "
				<< page_id << " though the page type is "
				<< fil_page_get_type(block->frame)
				<< ", which is not an index leaf page. We try"
				" to resolve the problem by skipping the"
				" insert buffer merge for this page. Please"
				" run CHECK TABLE on your tables to determine"
				" if they are corrupt after this.";
			ut_ad(0);
		}
	}

	memset(mops, 0, sizeof(mops));
	memset(dops, 0, sizeof(dops));

loop:
	ibuf_mtr_start(&mtr);

	/* Position pcur in the insert buffer at the first entry for this
	index page */
	btr_pcur_open_on_user_rec(
		ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
		&pcur, &mtr);

	if (block != NULL) {
		ibool success;

		mtr.set_named_space(space);

		/* Re-latch the target page inside the new mtr; this
		must succeed because the block is still io-fixed. */
		success = buf_page_get_known_nowait(
			RW_X_LATCH, block,
			BUF_KEEP_OLD, __FILE__, __LINE__, &mtr);

		ut_a(success);

		/* This is a user page (secondary index leaf page),
		but we pretend that it is a change buffer page in
		order to obey the latching order. This should be OK,
		because buffered changes are applied immediately while
		the block is io-fixed. Other threads must not try to
		latch an io-fixed block. */
		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE);
	} else if (update_ibuf_bitmap) {
		mtr.set_named_space(space);
	}

	if (!btr_pcur_is_on_user_rec(&pcur)) {
		ut_ad(btr_pcur_is_after_last_in_tree(&pcur));
		goto reset_bit;
	}

	for (;;) {
		rec_t*	rec;

		ut_ad(btr_pcur_is_on_user_rec(&pcur));

		rec = btr_pcur_get_rec(&pcur);

		/* Check if the entry is for this index page */
		if (ibuf_rec_get_page_no(&mtr, rec) != page_id.page_no()
		    || ibuf_rec_get_space(&mtr, rec) != page_id.space()) {

			if (block != NULL) {
				page_header_reset_last_insert(
					block->frame, page_zip, &mtr);
			}

			goto reset_bit;
		}

		if (corruption_noticed) {
			/* The target page was corrupt: discard the
			buffered record without applying it. */
			fputs("InnoDB: Discarding record\n ", stderr);
			rec_print_old(stderr, rec);
			fputs("\nInnoDB: from the insert buffer!\n\n", stderr);
		} else if (block != NULL && !rec_get_deleted_flag(rec, 0)) {
			/* Now we have at pcur a record which should be
			applied on the index page; NOTE that the call below
			copies pointers to fields in rec, and we must
			keep the latch to the rec page until the
			insertion is finished! */
			dtuple_t*	entry;
			trx_id_t	max_trx_id;
			dict_index_t*	dummy_index;
			ibuf_op_t	op = ibuf_rec_get_op_type(&mtr, rec);

			max_trx_id = page_get_max_trx_id(page_align(rec));
			page_update_max_trx_id(block, page_zip, max_trx_id,
					       &mtr);

			ut_ad(page_validate(page_align(rec), ibuf->index));

			entry = ibuf_build_entry_from_ibuf_rec(
				&mtr, rec, heap, &dummy_index);
			ut_ad(!dummy_index->table->space);
			dummy_index->table->space = space;
			dummy_index->table->space_id = space->id;

			ut_ad(page_validate(block->frame, dummy_index));

			switch (op) {
				ibool	success;
			case IBUF_OP_INSERT:
#ifdef UNIV_IBUF_DEBUG
				volume += rec_get_converted_size(
					dummy_index, entry, 0);

				volume += page_dir_calc_reserved_space(1);

				ut_a(volume <= (4U << srv_page_size_shift)
				     / IBUF_PAGE_SIZE_PER_FREE_SPACE);
#endif
				ibuf_insert_to_index_page(
					entry, block, dummy_index, &mtr);
				break;

			case IBUF_OP_DELETE_MARK:
				ibuf_set_del_mark(
					entry, block, dummy_index, &mtr);
				break;

			case IBUF_OP_DELETE:
				ibuf_delete(entry, block, dummy_index, &mtr);
				/* Because ibuf_delete() will latch an
				insert buffer bitmap page, commit mtr
				before latching any further pages.
				Store and restore the cursor position. */
				ut_ad(rec == btr_pcur_get_rec(&pcur));
				ut_ad(page_rec_is_user_rec(rec));
				ut_ad(ibuf_rec_get_page_no(&mtr, rec)
				      == page_id.page_no());
				ut_ad(ibuf_rec_get_space(&mtr, rec)
				      == page_id.space());

				/* Mark the change buffer record processed,
				so that it will not be merged again in case
				the server crashes between the following
				mtr_commit() and the subsequent mtr_commit()
				of deleting the change buffer record. */

				btr_cur_set_deleted_flag_for_ibuf(
					btr_pcur_get_rec(&pcur), NULL,
					TRUE, &mtr);

				btr_pcur_store_position(&pcur, &mtr);
				ibuf_btr_pcur_commit_specify_mtr(&pcur, &mtr);

				ibuf_mtr_start(&mtr);
				mtr.set_named_space(space);

				success = buf_page_get_known_nowait(
					RW_X_LATCH, block,
					BUF_KEEP_OLD,
					__FILE__, __LINE__, &mtr);
				ut_a(success);

				/* This is a user page (secondary
				index leaf page), but it should be OK
				to use too low latching order for it,
				as the block is io-fixed. */
				buf_block_dbg_add_level(
					block, SYNC_IBUF_TREE_NODE);

				if (!ibuf_restore_pos(page_id.space(),
						      page_id.page_no(),
						      search_tuple,
						      BTR_MODIFY_LEAF,
						      &pcur, &mtr)) {

					/* Restoration failed: mtr was
					committed; restart the scan. */
					ut_ad(mtr.has_committed());
					mops[op]++;
					ibuf_dummy_index_free(dummy_index);
					goto loop;
				}

				break;
			default:
				ut_error;
			}

			mops[op]++;

			ibuf_dummy_index_free(dummy_index);
		} else {
			/* block == NULL (merge-discard) or the record
			was already marked processed: just count it as
			discarded. */
			dops[ibuf_rec_get_op_type(&mtr, rec)]++;
		}

		/* Delete the record from ibuf */
		if (ibuf_delete_rec(page_id.space(), page_id.page_no(),
				    &pcur, search_tuple, &mtr)) {
			/* Deletion was pessimistic and mtr was committed:
			we start from the beginning again */

			ut_ad(mtr.has_committed());
			goto loop;
		} else if (btr_pcur_is_after_last_on_page(&pcur)) {
			ibuf_mtr_commit(&mtr);
			btr_pcur_close(&pcur);

			goto loop;
		}
	}

reset_bit:
	if (update_ibuf_bitmap) {
		page_t*	bitmap_page;

		/* Clear the "buffered" bit, and refresh the free-space
		bits if they have become stale. */
		bitmap_page = ibuf_bitmap_get_map_page(page_id, *page_size,
						       &mtr);

		ibuf_bitmap_page_set_bits(
			bitmap_page, page_id, *page_size,
			IBUF_BITMAP_BUFFERED, FALSE, &mtr);

		if (block != NULL) {
			ulint old_bits = ibuf_bitmap_page_get_bits(
				bitmap_page, page_id, *page_size,
				IBUF_BITMAP_FREE, &mtr);

			ulint new_bits = ibuf_index_page_calc_free(block);

			if (old_bits != new_bits) {
				ibuf_bitmap_page_set_bits(
					bitmap_page, page_id, *page_size,
					IBUF_BITMAP_FREE, new_bits, &mtr);
			}
		}
	}

	ibuf_mtr_commit(&mtr);

	if (space) {
		space->release();
	}

	btr_pcur_close(&pcur);
	mem_heap_free(heap);

	/* Publish merge statistics. */
	my_atomic_addlint(&ibuf->n_merges, 1);
	ibuf_add_ops(ibuf->n_merged_ops, mops);
	ibuf_add_ops(ibuf->n_discarded_ops, dops);

#ifdef UNIV_IBUF_COUNT_DEBUG
	ut_a(ibuf_count_get(page_id) == 0);
#endif
}
4779
4780/*********************************************************************//**
4781Deletes all entries in the insert buffer for a given space id. This is used
4782in DISCARD TABLESPACE, IMPORT TABLESPACE and TRUNCATE TABLESPACE.
4783NOTE: this does not update the page free bitmaps in the space. The space will
4784become CORRUPT when you call this function! */
4785void
4786ibuf_delete_for_discarded_space(
4787/*============================*/
4788 ulint space) /*!< in: space id */
4789{
4790 mem_heap_t* heap;
4791 btr_pcur_t pcur;
4792 dtuple_t* search_tuple;
4793 const rec_t* ibuf_rec;
4794 ulint page_no;
4795 mtr_t mtr;
4796
4797 /* Counts for discarded operations. */
4798 ulint dops[IBUF_OP_COUNT];
4799
4800 heap = mem_heap_create(512);
4801
4802 /* Use page number 0 to build the search tuple so that we get the
4803 cursor positioned at the first entry for this space id */
4804
4805 search_tuple = ibuf_search_tuple_build(space, 0, heap);
4806
4807 memset(dops, 0, sizeof(dops));
4808loop:
4809 ibuf_mtr_start(&mtr);
4810
4811 /* Position pcur in the insert buffer at the first entry for the
4812 space */
4813 btr_pcur_open_on_user_rec(
4814 ibuf->index, search_tuple, PAGE_CUR_GE, BTR_MODIFY_LEAF,
4815 &pcur, &mtr);
4816
4817 if (!btr_pcur_is_on_user_rec(&pcur)) {
4818 ut_ad(btr_pcur_is_after_last_in_tree(&pcur));
4819 goto leave_loop;
4820 }
4821
4822 for (;;) {
4823 ut_ad(btr_pcur_is_on_user_rec(&pcur));
4824
4825 ibuf_rec = btr_pcur_get_rec(&pcur);
4826
4827 /* Check if the entry is for this space */
4828 if (ibuf_rec_get_space(&mtr, ibuf_rec) != space) {
4829
4830 goto leave_loop;
4831 }
4832
4833 page_no = ibuf_rec_get_page_no(&mtr, ibuf_rec);
4834
4835 dops[ibuf_rec_get_op_type(&mtr, ibuf_rec)]++;
4836
4837 /* Delete the record from ibuf */
4838 if (ibuf_delete_rec(space, page_no, &pcur, search_tuple,
4839 &mtr)) {
4840 /* Deletion was pessimistic and mtr was committed:
4841 we start from the beginning again */
4842
4843 ut_ad(mtr.has_committed());
4844 goto loop;
4845 }
4846
4847 if (btr_pcur_is_after_last_on_page(&pcur)) {
4848 ibuf_mtr_commit(&mtr);
4849 btr_pcur_close(&pcur);
4850
4851 goto loop;
4852 }
4853 }
4854
4855leave_loop:
4856 ibuf_mtr_commit(&mtr);
4857 btr_pcur_close(&pcur);
4858
4859 ibuf_add_ops(ibuf->n_discarded_ops, dops);
4860
4861 mem_heap_free(heap);
4862}
4863
4864/******************************************************************//**
4865Looks if the insert buffer is empty.
4866@return true if empty */
4867bool
4868ibuf_is_empty(void)
4869/*===============*/
4870{
4871 bool is_empty;
4872 const page_t* root;
4873 mtr_t mtr;
4874
4875 ibuf_mtr_start(&mtr);
4876
4877 mutex_enter(&ibuf_mutex);
4878 root = ibuf_tree_root_get(&mtr);
4879 mutex_exit(&ibuf_mutex);
4880
4881 is_empty = page_is_empty(root);
4882 ut_a(is_empty == ibuf->empty);
4883 ibuf_mtr_commit(&mtr);
4884
4885 return(is_empty);
4886}
4887
4888/******************************************************************//**
4889Prints info of ibuf. */
4890void
4891ibuf_print(
4892/*=======*/
4893 FILE* file) /*!< in: file where to print */
4894{
4895#ifdef UNIV_IBUF_COUNT_DEBUG
4896 ulint i;
4897 ulint j;
4898#endif
4899
4900 mutex_enter(&ibuf_mutex);
4901
4902 fprintf(file,
4903 "Ibuf: size " ULINTPF ", free list len " ULINTPF ","
4904 " seg size " ULINTPF ", " ULINTPF " merges\n",
4905 ibuf->size,
4906 ibuf->free_list_len,
4907 ibuf->seg_size,
4908 ibuf->n_merges);
4909
4910 fputs("merged operations:\n ", file);
4911 ibuf_print_ops(ibuf->n_merged_ops, file);
4912
4913 fputs("discarded operations:\n ", file);
4914 ibuf_print_ops(ibuf->n_discarded_ops, file);
4915
4916#ifdef UNIV_IBUF_COUNT_DEBUG
4917 for (i = 0; i < IBUF_COUNT_N_SPACES; i++) {
4918 for (j = 0; j < IBUF_COUNT_N_PAGES; j++) {
4919 ulint count = ibuf_count_get(page_id_t(i, j, 0));
4920
4921 if (count > 0) {
4922 fprintf(stderr,
4923 "Ibuf count for page "
4924 ULINTPF ":" ULINTPF ""
4925 " is " ULINTPF "\n",
4926 i, j, count);
4927 }
4928 }
4929 }
4930#endif /* UNIV_IBUF_COUNT_DEBUG */
4931
4932 mutex_exit(&ibuf_mutex);
4933}
4934
/** Check the insert buffer bitmaps on IMPORT TABLESPACE.
Scans every change buffer bitmap page in the imported tablespace and
verifies that no page is flagged as belonging to the insert buffer;
stale "buffered" bits are tolerated and cleared.
@param[in]	trx	transaction
@param[in,out]	space	tablespace being imported
@return DB_SUCCESS or error code */
dberr_t ibuf_check_bitmap_on_import(const trx_t* trx, fil_space_t* space)
{
	ulint	page_no;
	ut_ad(trx->mysql_thd);
	ut_ad(space->purpose == FIL_TYPE_IMPORT);
	const page_size_t	page_size(space->flags);
	/* fil_space_t::size and fil_space_t::free_limit would still be 0
	at this point. So, we will have to read page 0. */
	ut_ad(!space->free_limit);
	ut_ad(!space->size);

	mtr_t	mtr;
	ulint	size;
	mtr.start();
	/* Determine how many pages to scan from the FSP header on page 0. */
	if (buf_block_t* sp = buf_page_get(page_id_t(space->id, 0), page_size,
					   RW_S_LATCH, &mtr)) {
		size = std::min(
			mach_read_from_4(FSP_HEADER_OFFSET + FSP_FREE_LIMIT
					 + sp->frame),
			mach_read_from_4(FSP_HEADER_OFFSET + FSP_SIZE
					 + sp->frame));
	} else {
		size = 0;
	}
	mtr.commit();

	if (size == 0) {
		return(DB_TABLE_NOT_FOUND);
	}

	mutex_enter(&ibuf_mutex);

	/* The two bitmap pages (allocation bitmap and ibuf bitmap) repeat
	every page_size pages. For example if page_size is 16 KiB, then the
	two bitmap pages repeat every 16 KiB * 16384 = 256 MiB. In the loop
	below page_no is measured in number of pages since the beginning of
	the space, as usual. */

	for (page_no = 0; page_no < size; page_no += page_size.physical()) {
		page_t*	bitmap_page;
		ulint	i;

		if (trx_is_interrupted(trx)) {
			mutex_exit(&ibuf_mutex);
			return(DB_INTERRUPTED);
		}

		mtr_start(&mtr);

		/* No redo logging: the import is not yet committed. */
		mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);

		ibuf_enter(&mtr);

		bitmap_page = ibuf_bitmap_get_map_page(
			page_id_t(space->id, page_no), page_size, &mtr);

		if (buf_page_is_zeroes(bitmap_page, page_size)) {
			/* This means we got all-zero page instead of
			ibuf bitmap page. The subsequent page should be
			all-zero pages. */
#ifdef UNIV_DEBUG
			for (ulint curr_page = page_no + 1;
			     curr_page < page_size.physical(); curr_page++) {

				buf_block_t* block = buf_page_get(
					page_id_t(space->id, curr_page),
					page_size, RW_S_LATCH, &mtr);
				page_t*	page = buf_block_get_frame(block);
				ut_ad(buf_page_is_zeroes(page, page_size));
			}
#endif /* UNIV_DEBUG */
			ibuf_exit(&mtr);
			mtr_commit(&mtr);
			continue;
		}

		/* Check every page covered by this bitmap page. */
		for (i = FSP_IBUF_BITMAP_OFFSET + 1;
		     i < page_size.physical();
		     i++) {

			const ulint	offset = page_no + i;

			const page_id_t	cur_page_id(space->id, offset);

			if (ibuf_bitmap_page_get_bits(
				    bitmap_page, cur_page_id, page_size,
				    IBUF_BITMAP_IBUF, &mtr)) {

				/* A page wrongly flagged as belonging
				to the insert buffer: hard error. */
				mutex_exit(&ibuf_mutex);
				ibuf_exit(&mtr);
				mtr_commit(&mtr);

				ib_errf(trx->mysql_thd,
					IB_LOG_LEVEL_ERROR,
					 ER_INNODB_INDEX_CORRUPT,
					 "File %s page " ULINTPF
					 " is wrongly flagged to belong to the"
					 " insert buffer",
					 space->chain.start->name, offset);
				return(DB_CORRUPTION);
			}

			if (ibuf_bitmap_page_get_bits(
				    bitmap_page, cur_page_id, page_size,
				    IBUF_BITMAP_BUFFERED, &mtr)) {

				ib_errf(trx->mysql_thd,
					IB_LOG_LEVEL_WARN,
					ER_INNODB_INDEX_CORRUPT,
					"Buffered changes"
					" for file %s page " ULINTPF
					" are lost",
					space->chain.start->name, offset);

				/* Tolerate this error, so that
				slightly corrupted tables can be
				imported and dumped.  Clear the bit. */
				ibuf_bitmap_page_set_bits(
					bitmap_page, cur_page_id, page_size,
					IBUF_BITMAP_BUFFERED, FALSE, &mtr);
			}
		}

		ibuf_exit(&mtr);
		mtr_commit(&mtr);
	}

	mutex_exit(&ibuf_mutex);
	return(DB_SUCCESS);
}
5069
5070/** Updates free bits and buffered bits for bulk loaded page.
5071@param[in] block index page
5072@param[in] reset flag if reset free val */
5073void
5074ibuf_set_bitmap_for_bulk_load(
5075 buf_block_t* block,
5076 bool reset)
5077{
5078 page_t* bitmap_page;
5079 mtr_t mtr;
5080 ulint free_val;
5081
5082 ut_a(page_is_leaf(buf_block_get_frame(block)));
5083
5084 free_val = ibuf_index_page_calc_free(block);
5085
5086 mtr_start(&mtr);
5087 mtr.set_named_space_id(block->page.id.space());
5088
5089 bitmap_page = ibuf_bitmap_get_map_page(block->page.id,
5090 block->page.size, &mtr);
5091
5092 free_val = reset ? 0 : ibuf_index_page_calc_free(block);
5093 ibuf_bitmap_page_set_bits(
5094 bitmap_page, block->page.id, block->page.size,
5095 IBUF_BITMAP_FREE, free_val, &mtr);
5096
5097 ibuf_bitmap_page_set_bits(
5098 bitmap_page, block->page.id, block->page.size,
5099 IBUF_BITMAP_BUFFERED, FALSE, &mtr);
5100
5101 mtr_commit(&mtr);
5102}
5103