/*****************************************************************************

Copyright (c) 1994, 2016, Oracle and/or its affiliates. All Rights Reserved.
Copyright (c) 2012, Facebook Inc.
Copyright (c) 2014, 2018, MariaDB Corporation.

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; version 2 of the License.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along with
this program; if not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA

*****************************************************************************/

/**************************************************//**
@file btr/btr0btr.cc
The B-tree

Created 6/2/1994 Heikki Tuuri
*******************************************************/

#include "btr0btr.h"
#include "ha_prototypes.h"

#include "fsp0sysspace.h"
#include "page0page.h"
#include "page0zip.h"
#include "gis0rtree.h"

#include "btr0cur.h"
#include "btr0sea.h"
#include "btr0pcur.h"
#include "btr0defragment.h"
#include "rem0cmp.h"
#include "lock0lock.h"
#include "ibuf0ibuf.h"
#include "trx0trx.h"
#include "srv0mon.h"
#include "gis0geo.h"
#include "ut0new.h"
#include "dict0boot.h"
#include "row0sel.h" /* row_search_max_autoinc() */
/**************************************************************//**
Checks if the page in the cursor can be merged with given page.
If necessary, re-organize the merge_page.
@return true if possible to merge. */
static
bool
btr_can_merge_with_page(
/*====================*/
	btr_cur_t*	cursor,		/*!< in: cursor on the page to merge */
	ulint		page_no,	/*!< in: a sibling page */
	buf_block_t**	merge_block,	/*!< out: the merge block */
	mtr_t*		mtr);		/*!< in: mini-transaction */

/**************************************************************//**
Report that an index page is corrupted. */
void
btr_corruption_report(
/*==================*/
	const buf_block_t*	block,	/*!< in: corrupted block */
	const dict_index_t*	index)	/*!< in: index tree */
{
	ib::error()
		<< "Flag mismatch in page " << block->page.id
		<< " index " << index->name
		<< " of table " << index->table->name;
}

/*
Latching strategy of the InnoDB B-tree
--------------------------------------

Acquisition of node pointer page latches is protected by the index->lock
latch.

Before MariaDB 10.2.2, all node pointer pages were protected by index->lock
either in S (shared) or X (exclusive) mode and block->lock was not acquired on
node pointer pages.

After MariaDB 10.2.2, a block->lock S-latch or X-latch is used to protect
node pointer pages, and the acquisition of node pointer page latches is
protected by index->lock.

(0) Definition: B-tree level.

(0.1) The leaf pages of the B-tree are at level 0.

(0.2) The parent of a page at level L has level L+1. (The level of the
root page is equal to the tree height.)

(0.3) The B-tree lock (index->lock) is the parent of the root page and
has a level = tree height + 1.

Index->lock has 3 possible locking modes:

(1) S-latch:

(1.1) All latches for pages must be obtained in descending order of tree level.

(1.2) Before obtaining the first node pointer page latch at a given B-tree
level, the parent latch must be held (at level + 1).

(1.3) If a node pointer page is already latched at a given level,
the only other page latch that may be obtained at that level is on its
right sibling page.

(1.4) Release of the node pointer page latches must be done in
child-to-parent order. (This prevents deadlocks when index->lock is
obtained in SX mode.)

(1.4.1) A level L node pointer page latch can be released only when
no latches at children levels, i.e. levels < L, are held.

(1.4.2) All node pointer page latches must be released together,
without obtaining other latches in between.

(1.5) [implied by (1.1), (1.2)] The root page latch must be the first node
pointer latch obtained.

(2) SX-latch:

In this case rules (1.2) and (1.3) from the S-latch case are relaxed and
merged into (2.2), and rule (1.4) is removed. Thus, latch acquisition
can be skipped at some tree levels and latches can be obtained in
a less restricted order.

(2.1) [identical to (1.1)]: All latches for pages must be obtained in descending
order of tree level.

(2.2) When a node pointer latch at level L is obtained,
the left sibling page latch at the same level or some ancestor
page latch (at level > L) must be held.

(2.3) [implied by (2.1), (2.2)] The first node pointer page latch obtained can
be on any node pointer page.

(3) X-latch:

Node pointer latches can be obtained in any order.

NOTE: The new rules after MariaDB 10.2.2 do not affect the latching rules of
leaf pages:

An index->lock S-latch is needed for node pointer traversal during reads. When
the leaf level is reached, index->lock can be released (and, with the MariaDB
10.2.2 changes, all node pointer latches). A left-to-right index traversal at
the leaf level can be done safely by obtaining the right sibling leaf page
latch and then releasing the old page latch.

Single leaf page modifications (BTR_MODIFY_LEAF) are protected by index->lock
S-latch.

B-tree operations involving page splits or merges (BTR_MODIFY_TREE) and page
allocations are protected by index->lock X-latch.
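
Illustrative sketch (not part of the original description): a read descent
that obeys the S-latch rules above. The real logic lives in
btr_cur_search_to_nth_level(); this outline only uses names declared in
this file, and the cursor positioning on each node pointer record is
abbreviated:

	mtr_t	mtr;
	mtr_start(&mtr);
	(S-latch index->lock; rule (0.3))
	buf_block_t*	block = btr_root_block_get(index, RW_S_LATCH, &mtr);
	while (block && !page_is_leaf(buf_block_get_frame(block))) {
		(position a cursor on the matching node pointer record,
		then latch the child page, e.g. via btr_block_get() with
		RW_S_LATCH; rules (1.1), (1.2))
	}
	(leaf level reached: index->lock and all node pointer page
	latches may now be released before scanning the leaf level)
	mtr_commit(&mtr);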

Node pointers
-------------
Leaf pages of a B-tree contain the index records stored in the
tree. On levels n > 0 we store 'node pointers' to pages on level
n - 1. For each page there is exactly one node pointer stored:
thus our tree is an ordinary B-tree, not a B-link tree.

A node pointer contains a prefix P of an index record. The prefix
is long enough so that it determines an index record uniquely.
The file page number of the child page is added as the last
field. In the child page we can store node pointers or index records
which are >= P in the alphabetical order, but < P1, where P1 is the
prefix of the next node pointer on the level, if one exists.

If a node pointer with a prefix P points to a non-leaf child,
then the leftmost record in the child must have the same
prefix P. If it points to a leaf node, the child is not required
to contain any record with a prefix equal to P. The leaf case
is decided this way to allow arbitrary deletions in a leaf node
without touching upper levels of the tree.

We have predefined a special minimum record which we
define as the smallest record in any alphabetical order.
A minimum record is denoted by setting a bit in the record
header. A minimum record acts as the prefix of a node pointer
which points to a leftmost node on any level of the tree.
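
For example, suppose a non-leaf page holds two node pointers with
prefixes P = "karen" (child page 10) and P1 = "peter" (child page 11).
Page 10 may then hold records r with "karen" <= r < "peter", and page 11
records >= "peter". If page 10 is a non-leaf page, its leftmost record
must carry the prefix "karen"; if it is a leaf page, its smallest record
may be anything in that range. (The page numbers here are made up for
illustration.)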

File page allocation
--------------------
In the root node of a B-tree there are two file segment headers.
The leaf pages of a tree are allocated from one file segment, to
make them consecutive on disk if possible. From the other file segment
we allocate pages for the non-leaf levels of the tree.
*/
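
/* Illustrative sketch: the two segment headers mentioned above live in
the root page header and are accessed in this file as

	fseg_header_t*	leaf_hdr = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
	fseg_header_t*	top_hdr	 = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;

where PAGE_BTR_SEG_LEAF describes the segment holding the level-0 (leaf)
pages and PAGE_BTR_SEG_TOP the segment holding all non-leaf pages; see
btr_page_alloc_low() and btr_page_free_low() below. */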

#ifdef UNIV_BTR_DEBUG
/**************************************************************//**
Checks a file segment header within a B-tree root page.
@return TRUE if valid */
static
ibool
btr_root_fseg_validate(
/*===================*/
	const fseg_header_t*	seg_header,	/*!< in: segment header */
	ulint			space)		/*!< in: tablespace identifier */
{
	ulint	offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);

	ut_a(mach_read_from_4(seg_header + FSEG_HDR_SPACE) == space);
	ut_a(offset >= FIL_PAGE_DATA);
	ut_a(offset <= srv_page_size - FIL_PAGE_DATA_END);
	return(TRUE);
}
#endif /* UNIV_BTR_DEBUG */

/**************************************************************//**
Gets the root node of a tree and x- or s-latches it.
@return root page, x- or s-latched */
buf_block_t*
btr_root_block_get(
/*===============*/
	const dict_index_t*	index,	/*!< in: index tree */
	ulint			mode,	/*!< in: either RW_S_LATCH
					or RW_X_LATCH */
	mtr_t*			mtr)	/*!< in: mtr */
{
	if (!index->table || !index->table->space) {
		return NULL;
	}

	buf_block_t*	block = btr_block_get(
		page_id_t(index->table->space->id, index->page),
		page_size_t(index->table->space->flags), mode,
		index, mtr);

	if (!block) {
		index->table->file_unreadable = true;

		ib_push_warning(
			static_cast<THD*>(NULL), DB_DECRYPTION_FAILED,
			"Table %s in file %s is encrypted but encryption"
			" service or used key_id is not available."
			" Can't continue reading table.",
			index->table->name,
			UT_LIST_GET_FIRST(index->table->space->chain)->name);

		return NULL;
	}

	btr_assert_not_corrupted(block, index);

#ifdef UNIV_BTR_DEBUG
	if (!dict_index_is_ibuf(index)) {
		const page_t*	root = buf_block_get_frame(block);

		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
					    + root, index->table->space->id));
		ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
					    + root, index->table->space->id));
	}
#endif /* UNIV_BTR_DEBUG */

	return(block);
}

/**************************************************************//**
Gets the root node of a tree and sx-latches it for segment access.
@return root page, sx-latched */
page_t*
btr_root_get(
/*=========*/
	const dict_index_t*	index,	/*!< in: index tree */
	mtr_t*			mtr)	/*!< in: mtr */
{
	/* Intended to be used for segment list access.
	An SX latch does not block other threads from reading user data,
	but it does block segment list access by other threads. */
	buf_block_t*	root = btr_root_block_get(index, RW_SX_LATCH,
						  mtr);

	if (root && root->page.encrypted) {
		root = NULL;
	}

	return(root ? buf_block_get_frame(root) : NULL);
}

/**************************************************************//**
Gets the height of the B-tree (the level of the root, when the leaf
level is assumed to be 0). The caller must hold an S or X latch on
the index.
@return tree height (level of the root) */
ulint
btr_height_get(
/*===========*/
	dict_index_t*	index,	/*!< in: index tree */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	ulint		height = 0;
	buf_block_t*	root_block;

	ut_ad(srv_read_only_mode
	      || mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
					   MTR_MEMO_S_LOCK
					   | MTR_MEMO_X_LOCK
					   | MTR_MEMO_SX_LOCK));

	/* S latches the page */
	root_block = btr_root_block_get(index, RW_S_LATCH, mtr);

	if (root_block) {
		height = btr_page_get_level(buf_block_get_frame(root_block));

		/* Release the S latch on the root page. */
		mtr->memo_release(root_block, MTR_MEMO_PAGE_S_FIX);

		ut_d(sync_check_unlock(&root_block->lock));
	}

	return(height);
}

/**************************************************************//**
Checks a file segment header within a B-tree root page and updates
the segment header space id.
@return TRUE if valid */
static
bool
btr_root_fseg_adjust_on_import(
/*===========================*/
	fseg_header_t*	seg_header,	/*!< in/out: segment header */
	page_zip_des_t*	page_zip,	/*!< in/out: compressed page,
					or NULL */
	ulint		space,		/*!< in: tablespace identifier */
	mtr_t*		mtr)		/*!< in/out: mini-transaction */
{
	ulint	offset = mach_read_from_2(seg_header + FSEG_HDR_OFFSET);

	if (offset < FIL_PAGE_DATA
	    || offset > srv_page_size - FIL_PAGE_DATA_END) {

		return(FALSE);

	} else if (page_zip) {
		mach_write_to_4(seg_header + FSEG_HDR_SPACE, space);
		page_zip_write_header(page_zip, seg_header + FSEG_HDR_SPACE,
				      4, mtr);
	} else {
		mlog_write_ulint(seg_header + FSEG_HDR_SPACE,
				 space, MLOG_4BYTES, mtr);
	}

	return(TRUE);
}

/**************************************************************//**
Checks and adjusts the root node of a tree during IMPORT TABLESPACE.
@return error code, or DB_SUCCESS */
dberr_t
btr_root_adjust_on_import(
/*======================*/
	const dict_index_t*	index)	/*!< in: index tree */
{
	dberr_t			err;
	mtr_t			mtr;
	page_t*			page;
	buf_block_t*		block;
	page_zip_des_t*		page_zip;
	dict_table_t*		table = index->table;
	const page_id_t		page_id(table->space->id, index->page);
	const page_size_t	page_size(table->space->flags);

	DBUG_EXECUTE_IF("ib_import_trigger_corruption_3",
			return(DB_CORRUPTION););

	mtr_start(&mtr);

	mtr_set_log_mode(&mtr, MTR_LOG_NO_REDO);

	block = btr_block_get(page_id, page_size, RW_X_LATCH, index, &mtr);

	page = buf_block_get_frame(block);
	page_zip = buf_block_get_page_zip(block);

	if (!page_is_root(page)) {

		err = DB_CORRUPTION;

	} else if (dict_index_is_clust(index)) {
		bool	page_is_compact_format;

		page_is_compact_format = page_is_comp(page) > 0;

		/* Check if the page format and table format agree. */
		if (page_is_compact_format != dict_table_is_comp(table)) {
			err = DB_CORRUPTION;
		} else {
			/* Check that the table flags and the tablespace
			flags match. */
			err = (dict_tf_to_fsp_flags(table->flags)
			       == table->space->flags)
				? DB_SUCCESS : DB_CORRUPTION;
		}
	} else {
		err = DB_SUCCESS;
	}

	/* Check and adjust the file segment headers, if all OK so far. */
	if (err == DB_SUCCESS
	    && (!btr_root_fseg_adjust_on_import(
			FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
			+ page, page_zip, table->space->id, &mtr)
		|| !btr_root_fseg_adjust_on_import(
			FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
			+ page, page_zip, table->space->id, &mtr))) {

		err = DB_CORRUPTION;
	}

	mtr_commit(&mtr);

	return(err);
}

/**************************************************************//**
Creates a new index page (not the root, and also not
used in page reorganization). @see btr_page_empty(). */
void
btr_page_create(
/*============*/
	buf_block_t*	block,	/*!< in/out: page to be created */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page, or NULL */
	dict_index_t*	index,	/*!< in: index */
	ulint		level,	/*!< in: the B-tree level of the page */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_t*	page = buf_block_get_frame(block);

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));

	if (page_zip) {
		page_create_zip(block, index, level, 0, NULL, mtr);
	} else {
		page_create(block, mtr, dict_table_is_comp(index->table),
			    dict_index_is_spatial(index));
		/* Set the level of the new index page */
		btr_page_set_level(page, NULL, level, mtr);
	}

	/* For Spatial Index, initialize the Split Sequence Number */
	if (dict_index_is_spatial(index)) {
		page_set_ssn_id(block, page_zip, 0, mtr);
	}

	btr_page_set_index_id(page, page_zip, index->id, mtr);
}

/**************************************************************//**
Allocates a new file page to be used in an ibuf tree. Takes the page from
the free list of the tree, which must contain pages!
@return new allocated block, x-latched */
static
buf_block_t*
btr_page_alloc_for_ibuf(
/*====================*/
	dict_index_t*	index,	/*!< in: index tree */
	mtr_t*		mtr)	/*!< in: mtr */
{
	fil_addr_t	node_addr;
	page_t*		root;
	page_t*		new_page;
	buf_block_t*	new_block;

	root = btr_root_get(index, mtr);

	node_addr = flst_get_first(root + PAGE_HEADER
				   + PAGE_BTR_IBUF_FREE_LIST, mtr);
	ut_a(node_addr.page != FIL_NULL);

	new_block = buf_page_get(
		page_id_t(index->table->space->id, node_addr.page),
		page_size_t(index->table->space->flags),
		RW_X_LATCH, mtr);

	new_page = buf_block_get_frame(new_block);
	buf_block_dbg_add_level(new_block, SYNC_IBUF_TREE_NODE_NEW);

	flst_remove(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
		    new_page + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE,
		    mtr);
	ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
			    mtr));

	return(new_block);
}

/**************************************************************//**
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents!
@retval NULL if no page could be allocated
@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
(init_mtr == mtr, or the page was not previously freed in mtr)
@retval block (not allocated or initialized) otherwise */
static MY_ATTRIBUTE((nonnull, warn_unused_result))
buf_block_t*
btr_page_alloc_low(
/*===============*/
	dict_index_t*	index,		/*!< in: index */
	ulint		hint_page_no,	/*!< in: hint of a good page */
	byte		file_direction,	/*!< in: direction where a possible
					page split is made */
	ulint		level,		/*!< in: level where the page is placed
					in the tree */
	mtr_t*		mtr,		/*!< in/out: mini-transaction
					for the allocation */
	mtr_t*		init_mtr)	/*!< in/out: mtr or another
					mini-transaction in which the
					page should be initialized.
					If init_mtr != mtr, but the page
					is already X-latched in mtr, do
					not initialize the page. */
{
	fseg_header_t*	seg_header;
	page_t*		root;

	root = btr_root_get(index, mtr);

	if (level == 0) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
	} else {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
	}

	/* Parameter TRUE below states that the caller has made the
	reservation for free extents, and thus we know that a page can
	be allocated: */

	buf_block_t*	block = fseg_alloc_free_page_general(
		seg_header, hint_page_no, file_direction,
		TRUE, mtr, init_mtr);

#ifdef UNIV_DEBUG_SCRUBBING
	if (block != NULL) {
		fprintf(stderr,
			"alloc %lu:%lu to index: %lu root: %lu\n",
			buf_block_get_page_no(block),
			buf_block_get_space(block),
			index->id,
			dict_index_get_page(index));
	} else {
		fprintf(stderr,
			"failed alloc index: %lu root: %lu\n",
			index->id,
			dict_index_get_page(index));
	}
#endif /* UNIV_DEBUG_SCRUBBING */

	return block;
}

/**************************************************************//**
Allocates a new file page to be used in an index tree. NOTE: we assume
that the caller has made the reservation for free extents!
@retval NULL if no page could be allocated
@retval block, rw_lock_x_lock_count(&block->lock) == 1 if allocation succeeded
(init_mtr == mtr, or the page was not previously freed in mtr)
@retval block (not allocated or initialized) otherwise */
buf_block_t*
btr_page_alloc(
/*===========*/
	dict_index_t*	index,		/*!< in: index */
	ulint		hint_page_no,	/*!< in: hint of a good page */
	byte		file_direction,	/*!< in: direction where a possible
					page split is made */
	ulint		level,		/*!< in: level where the page is placed
					in the tree */
	mtr_t*		mtr,		/*!< in/out: mini-transaction
					for the allocation */
	mtr_t*		init_mtr)	/*!< in/out: mini-transaction
					for x-latching and initializing
					the page */
{
	buf_block_t*	new_block;

	if (dict_index_is_ibuf(index)) {

		return(btr_page_alloc_for_ibuf(index, mtr));
	}

	new_block = btr_page_alloc_low(
		index, hint_page_no, file_direction, level, mtr, init_mtr);

	if (new_block) {
		buf_block_dbg_add_level(new_block, SYNC_TREE_NODE_NEW);
	}

	return(new_block);
}
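
/* Illustrative sketch of the caller-side protocol that the NOTE above
assumes: extents are reserved before allocation and released afterwards.
The fsp_reserve_free_extents()/fil_space_release_free_extents() pairing
follows its use elsewhere in InnoDB; exact signatures vary between
versions, so treat this only as an outline:

	ulint	n_reserved = 0;
	if (fsp_reserve_free_extents(&n_reserved, index->table->space,
				     1, FSP_NORMAL, mtr)) {
		buf_block_t*	new_block = btr_page_alloc(
			index, hint_page_no, FSP_UP, level, mtr, mtr);
		...use new_block...
		fil_space_release_free_extents(index->table->space,
					       n_reserved);
	}
*/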

/**************************************************************//**
Gets the number of pages in a B-tree.
@return number of pages, or ULINT_UNDEFINED if the index is unavailable */
ulint
btr_get_size(
/*=========*/
	dict_index_t*	index,	/*!< in: index */
	ulint		flag,	/*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
	mtr_t*		mtr)	/*!< in/out: mini-transaction where index
				is s-latched */
{
	fseg_header_t*	seg_header;
	page_t*		root;
	ulint		n = 0;
	ulint		dummy;

	ut_ad(srv_read_only_mode
	      || mtr_memo_contains(mtr, dict_index_get_lock(index),
				   MTR_MEMO_S_LOCK));

	if (index->page == FIL_NULL
	    || dict_index_is_online_ddl(index)
	    || !index->is_committed()) {
		return(ULINT_UNDEFINED);
	}

	root = btr_root_get(index, mtr);

	if (root) {
		if (flag == BTR_N_LEAF_PAGES) {
			seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;

			fseg_n_reserved_pages(seg_header, &n, mtr);

		} else if (flag == BTR_TOTAL_SIZE) {
			seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;

			n = fseg_n_reserved_pages(seg_header, &dummy, mtr);

			seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;

			n += fseg_n_reserved_pages(seg_header, &dummy, mtr);
		} else {
			ut_error;
		}
	} else {
		n = ULINT_UNDEFINED;
	}

	return(n);
}
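
/* For example, a caller that wants an approximate index size in bytes
can multiply the returned page count by the page size (sketch only;
"mtr" must hold the index S-latch as documented above):

	ulint	pages = btr_get_size(index, BTR_TOTAL_SIZE, &mtr);
	if (pages != ULINT_UNDEFINED) {
		ib_uint64_t	bytes = (ib_uint64_t) pages * srv_page_size;
	}
*/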

/**************************************************************//**
Gets the number of reserved and used pages in a B-tree.
@return number of pages reserved, or ULINT_UNDEFINED if the index
is unavailable */
UNIV_INTERN
ulint
btr_get_size_and_reserved(
/*======================*/
	dict_index_t*	index,	/*!< in: index */
	ulint		flag,	/*!< in: BTR_N_LEAF_PAGES or BTR_TOTAL_SIZE */
	ulint*		used,	/*!< out: number of pages used (<= reserved) */
	mtr_t*		mtr)	/*!< in/out: mini-transaction where index
				is s-latched */
{
	fseg_header_t*	seg_header;
	page_t*		root;
	ulint		n = ULINT_UNDEFINED;
	ulint		dummy;

	ut_ad(mtr_memo_contains(mtr, dict_index_get_lock(index),
				MTR_MEMO_S_LOCK));

	ut_a(flag == BTR_N_LEAF_PAGES || flag == BTR_TOTAL_SIZE);

	if (index->page == FIL_NULL
	    || dict_index_is_online_ddl(index)
	    || !index->is_committed()) {
		return(ULINT_UNDEFINED);
	}

	root = btr_root_get(index, mtr);
	*used = 0;

	if (root) {

		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;

		n = fseg_n_reserved_pages(seg_header, used, mtr);

		if (flag == BTR_TOTAL_SIZE) {
			seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;

			n += fseg_n_reserved_pages(seg_header, &dummy, mtr);
			*used += dummy;
		}
	}

	return(n);
}

/**************************************************************//**
Frees a page used in an ibuf tree. Puts the page to the free list of the
ibuf tree. */
static
void
btr_page_free_for_ibuf(
/*===================*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
	mtr_t*		mtr)	/*!< in: mtr */
{
	page_t*	root;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	root = btr_root_get(index, mtr);

	flst_add_first(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
		       buf_block_get_frame(block)
		       + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST_NODE, mtr);

	ut_ad(flst_validate(root + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
			    mtr));
}

/**************************************************************//**
Frees a file page used in an index tree. Can also be used for (BLOB)
external storage pages. */
void
btr_page_free_low(
/*==============*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
	ulint		level,	/*!< in: page level (ULINT_UNDEFINED=BLOB) */
	bool		blob,	/*!< in: blob page */
	mtr_t*		mtr)	/*!< in: mtr */
{
	fseg_header_t*	seg_header;
	page_t*		root;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	/* The page gets invalid for optimistic searches: increment the frame
	modify clock */

	buf_block_modify_clock_inc(block);

	if (blob) {
		ut_a(level == 0);
	}

	bool scrub = srv_immediate_scrub_data_uncompressed;
	/* scrub page */
	if (scrub && blob) {
		/* blob page: scrub entire page */
		// TODO(jonaso): scrub only what is actually needed
		page_t*	page = buf_block_get_frame(block);
		memset(page + PAGE_HEADER, 0,
		       srv_page_size - PAGE_HEADER);
#ifdef UNIV_DEBUG_SCRUBBING
		fprintf(stderr,
			"btr_page_free_low: scrub blob page %lu/%lu\n",
			buf_block_get_space(block),
			buf_block_get_page_no(block));
#endif /* UNIV_DEBUG_SCRUBBING */
	} else if (scrub) {
		/* scrub records on page */

		/* TODO(jonaso): in theory we could clear the full page,
		* but the page still remains in the buffer pool and gets
		* flushed etc., and lots of routines validate its
		* consistency. In order to remain structurally consistent
		* we clear each record on its own.
		*
		* NOTE: The TODO below mentions removing the page from the
		* buffer pool and removing redo entries; once that is done,
		* clearing full pages should be possible.
		*/
		uint	cnt = 0;
		ulint	bytes = 0;
		page_t*	page = buf_block_get_frame(block);
		mem_heap_t*	heap = NULL;
		ulint*	offsets = NULL;
		rec_t*	rec = page_rec_get_next(page_get_infimum_rec(page));
		while (!page_rec_is_supremum(rec)) {
			offsets = rec_get_offsets(rec, index, offsets,
						  page_is_leaf(page),
						  ULINT_UNDEFINED,
						  &heap);
			ulint size = rec_offs_data_size(offsets);
			memset(rec, 0, size);
			rec = page_rec_get_next(rec);
			cnt++;
			bytes += size;
		}
#ifdef UNIV_DEBUG_SCRUBBING
		fprintf(stderr,
			"btr_page_free_low: scrub %lu/%lu - "
			"%u records " ULINTPF " bytes\n",
			buf_block_get_space(block),
			buf_block_get_page_no(block),
			cnt, bytes);
#endif /* UNIV_DEBUG_SCRUBBING */
		if (heap) {
			mem_heap_free(heap);
		}
	}

#ifdef UNIV_DEBUG_SCRUBBING
	if (scrub == false) {
		fprintf(stderr,
			"btr_page_free_low %lu/%lu blob: %u\n",
			buf_block_get_space(block),
			buf_block_get_page_no(block),
			blob);
	}
#endif /* UNIV_DEBUG_SCRUBBING */

	if (dict_index_is_ibuf(index)) {

		btr_page_free_for_ibuf(index, block, mtr);

		return;
	}

	root = btr_root_get(index, mtr);

	if (level == 0 || level == ULINT_UNDEFINED) {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
	} else {
		seg_header = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
	}

#ifdef UNIV_GIS_DEBUG
	if (dict_index_is_spatial(index)) {
		fprintf(stderr, "GIS_DIAG: Freed %ld\n",
			(long) block->page.id.page_no());
	}
#endif

	if (scrub) {
		/* Reset the page type so that the scrub thread will not
		try to scrub it. */
		mlog_write_ulint(buf_block_get_frame(block) + FIL_PAGE_TYPE,
				 FIL_PAGE_TYPE_ALLOCATED, MLOG_2BYTES, mtr);
	}

	fseg_free_page(seg_header,
		       block->page.id.space(),
		       block->page.id.page_no(),
		       level != ULINT_UNDEFINED, mtr);

	/* The page was marked free in the allocation bitmap, but it
	should remain buffer-fixed until mtr_commit(mtr) or until it
	is explicitly freed from the mini-transaction. */
	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	/* TODO: Discard any operations on the page from the redo log
	and remove the block from the flush list and the buffer pool.
	This would free up buffer pool earlier and reduce writes to
	both the tablespace and the redo log. */
}

/**************************************************************//**
Frees a file page used in an index tree. NOTE: cannot free external field
storage (BLOB) pages, because the page must contain info on its level. */
void
btr_page_free(
/*==========*/
	dict_index_t*	index,	/*!< in: index tree */
	buf_block_t*	block,	/*!< in: block to be freed, x-latched */
	mtr_t*		mtr)	/*!< in: mtr */
{
	const page_t*	page = buf_block_get_frame(block);
	ulint		level = btr_page_get_level(page);

	ut_ad(fil_page_index_page_check(block->frame));
	ut_ad(level != ULINT_UNDEFINED);
	btr_page_free_low(index, block, level, false, mtr);
}

/**************************************************************//**
Sets the child node file address in a node pointer. */
UNIV_INLINE
void
btr_node_ptr_set_child_page_no(
/*===========================*/
	rec_t*		rec,	/*!< in: node pointer record */
	page_zip_des_t*	page_zip,/*!< in/out: compressed page whose uncompressed
				part will be updated, or NULL */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	ulint		page_no,/*!< in: child node address */
	mtr_t*		mtr)	/*!< in: mtr */
{
	byte*	field;
	ulint	len;

	ut_ad(rec_offs_validate(rec, NULL, offsets));
	ut_ad(!page_rec_is_leaf(rec));
	ut_ad(!rec_offs_comp(offsets) || rec_get_node_ptr_flag(rec));

	/* The child address is in the last field */
	field = rec_get_nth_field(rec, offsets,
				  rec_offs_n_fields(offsets) - 1, &len);

	ut_ad(len == REC_NODE_PTR_SIZE);

	if (page_zip) {
		page_zip_write_node_ptr(page_zip, rec,
					rec_offs_data_size(offsets),
					page_no, mtr);
	} else {
		mlog_write_ulint(field, page_no, MLOG_4BYTES, mtr);
	}
}

/************************************************************//**
Returns the child page of a node pointer and sx-latches it.
@return child page, sx-latched */
static
buf_block_t*
btr_node_ptr_get_child(
/*===================*/
	const rec_t*	node_ptr,/*!< in: node pointer */
	dict_index_t*	index,	/*!< in: index */
	const ulint*	offsets,/*!< in: array returned by rec_get_offsets() */
	mtr_t*		mtr)	/*!< in: mtr */
{
	ut_ad(rec_offs_validate(node_ptr, index, offsets));
	ut_ad(index->table->space->id
	      == page_get_space_id(page_align(node_ptr)));

	return btr_block_get(
		page_id_t(index->table->space->id,
			  btr_node_ptr_get_child_page_no(node_ptr, offsets)),
		page_size_t(index->table->space->flags),
		RW_SX_LATCH, index, mtr);
}

/************************************************************//**
Returns the upper level node pointer to a page. It is assumed that mtr holds
an sx-latch on the tree.
@return rec_get_offsets() of the node pointer record */
static
ulint*
btr_page_get_father_node_ptr_func(
/*==============================*/
	ulint*		offsets,/*!< in: work area for the return value */
	mem_heap_t*	heap,	/*!< in: memory heap to use */
	btr_cur_t*	cursor,	/*!< in: cursor pointing to user record,
				out: cursor on node pointer record,
				its page x-latched */
	ulint		latch_mode,/*!< in: BTR_CONT_MODIFY_TREE
				or BTR_CONT_SEARCH_TREE */
	const char*	file,	/*!< in: file name */
	unsigned	line,	/*!< in: line where called */
	mtr_t*		mtr)	/*!< in: mtr */
{
	dtuple_t*	tuple;
	rec_t*		user_rec;
	rec_t*		node_ptr;
	ulint		level;
	ulint		page_no;
	dict_index_t*	index;

	ut_ad(latch_mode == BTR_CONT_MODIFY_TREE
	      || latch_mode == BTR_CONT_SEARCH_TREE);

	page_no = btr_cur_get_block(cursor)->page.id.page_no();
	index = btr_cur_get_index(cursor);
	ut_ad(!dict_index_is_spatial(index));

	ut_ad(srv_read_only_mode
	      || mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
					   MTR_MEMO_X_LOCK
					   | MTR_MEMO_SX_LOCK));

	ut_ad(dict_index_get_page(index) != page_no);

	level = btr_page_get_level(btr_cur_get_page(cursor));

	user_rec = btr_cur_get_rec(cursor);
	ut_a(page_rec_is_user_rec(user_rec));

	tuple = dict_index_build_node_ptr(index, user_rec, 0, heap, level);

	dberr_t err = btr_cur_search_to_nth_level(
		index, level + 1, tuple,
		PAGE_CUR_LE, latch_mode, cursor, 0,
		file, line, mtr);

	if (err != DB_SUCCESS) {
		ib::warn() << "Error code: " << err
			   << " btr_page_get_father_node_ptr_func"
			   << " level: " << level + 1
			   << " called from file: "
			   << file << " line: " << line
			   << " table: " << index->table->name
			   << " index: " << index->name();
	}

	node_ptr = btr_cur_get_rec(cursor);

	offsets = rec_get_offsets(node_ptr, index, offsets, false,
				  ULINT_UNDEFINED, &heap);

	if (btr_node_ptr_get_child_page_no(node_ptr, offsets) != page_no) {
		rec_t*	print_rec;

		ib::error()
			<< "Corruption of an index tree: table "
			<< index->table->name
			<< " index " << index->name
			<< ", father ptr page no "
			<< btr_node_ptr_get_child_page_no(node_ptr, offsets)
			<< ", child page no " << page_no;

		print_rec = page_rec_get_next(
			page_get_infimum_rec(page_align(user_rec)));
		offsets = rec_get_offsets(print_rec, index, offsets,
					  page_rec_is_leaf(user_rec),
					  ULINT_UNDEFINED, &heap);
		page_rec_print(print_rec, offsets);
		offsets = rec_get_offsets(node_ptr, index, offsets, false,
					  ULINT_UNDEFINED, &heap);
		page_rec_print(node_ptr, offsets);

		ib::fatal()
			<< "You should dump + drop + reimport the table to"
			<< " fix the corruption. If the crash happens at"
			<< " database startup, " << FORCE_RECOVERY_MSG
			<< " and then dump + drop + reimport.";
	}

	return(offsets);
}

#define btr_page_get_father_node_ptr(of,heap,cur,mtr)			\
	btr_page_get_father_node_ptr_func(				\
		of,heap,cur,BTR_CONT_MODIFY_TREE,__FILE__,__LINE__,mtr)

#define btr_page_get_father_node_ptr_for_validate(of,heap,cur,mtr)	\
	btr_page_get_father_node_ptr_func(				\
		of,heap,cur,BTR_CONT_SEARCH_TREE,__FILE__,__LINE__,mtr)

/************************************************************//**
Returns the upper level node pointer to a page. It is assumed that mtr holds
an x-latch on the tree.
@return rec_get_offsets() of the node pointer record */
static
ulint*
btr_page_get_father_block(
/*======================*/
	ulint*		offsets,/*!< in: work area for the return value */
	mem_heap_t*	heap,	/*!< in: memory heap to use */
	dict_index_t*	index,	/*!< in: b-tree index */
	buf_block_t*	block,	/*!< in: child page in the index */
	mtr_t*		mtr,	/*!< in: mtr */
	btr_cur_t*	cursor)	/*!< out: cursor on node pointer record,
				its page x-latched */
{
	rec_t*	rec
		= page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
			block)));
	btr_cur_position(index, rec, block, cursor);
	return(btr_page_get_father_node_ptr(offsets, heap, cursor, mtr));
}

/************************************************************//**
Seeks to the upper level node pointer to a page.
It is assumed that mtr holds an x-latch on the tree. */
static
void
btr_page_get_father(
/*================*/
	dict_index_t*	index,	/*!< in: b-tree index */
	buf_block_t*	block,	/*!< in: child page in the index */
	mtr_t*		mtr,	/*!< in: mtr */
	btr_cur_t*	cursor)	/*!< out: cursor on node pointer record,
				its page x-latched */
{
	mem_heap_t*	heap;
	rec_t*		rec
		= page_rec_get_next(page_get_infimum_rec(buf_block_get_frame(
			block)));
	btr_cur_position(index, rec, block, cursor);

	heap = mem_heap_create(100);
	btr_page_get_father_node_ptr(NULL, heap, cursor, mtr);
	mem_heap_free(heap);
}

/** Free a B-tree root page. btr_free_but_not_root() must already
have been called.
In a persistent tablespace, the caller must invoke fsp_init_file_page()
before mtr.commit().
@param[in,out]	block	index root page
@param[in,out]	mtr	mini-transaction */
static
void
btr_free_root(
	buf_block_t*	block,
	mtr_t*		mtr)
{
	fseg_header_t*	header;

	ut_ad(mtr_memo_contains_flagged(mtr, block, MTR_MEMO_PAGE_X_FIX
					| MTR_MEMO_PAGE_SX_FIX));
	ut_ad(mtr->is_named_space(block->page.id.space()));

	btr_search_drop_page_hash_index(block);

	header = buf_block_get_frame(block) + PAGE_HEADER + PAGE_BTR_SEG_TOP;
#ifdef UNIV_BTR_DEBUG
	ut_a(btr_root_fseg_validate(header, block->page.id.space()));
#endif /* UNIV_BTR_DEBUG */

	while (!fseg_free_step(header, true, mtr)) {
		/* Free the entire segment in small steps. */
	}
}

/** PAGE_INDEX_ID value for freed index B-trees */
static const index_id_t	BTR_FREED_INDEX_ID = 0;

/** Invalidate an index root page so that btr_free_root_check()
will not find it.
@param[in,out]	block	index root page
@param[in,out]	mtr	mini-transaction */
static
void
btr_free_root_invalidate(
	buf_block_t*	block,
	mtr_t*		mtr)
{
	btr_page_set_index_id(
		buf_block_get_frame(block),
		buf_block_get_page_zip(block),
		BTR_FREED_INDEX_ID, mtr);
}

/** Prepare to free a B-tree.
@param[in]	page_id		page id
@param[in]	page_size	page size
@param[in]	index_id	PAGE_INDEX_ID contents
@param[in,out]	mtr		mini-transaction
@return root block, to invoke btr_free_but_not_root() and btr_free_root()
@retval NULL if the page is no longer a matching B-tree page */
static MY_ATTRIBUTE((warn_unused_result))
buf_block_t*
btr_free_root_check(
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	index_id_t		index_id,
	mtr_t*			mtr)
{
	ut_ad(page_id.space() != SRV_TMP_SPACE_ID);
	ut_ad(index_id != BTR_FREED_INDEX_ID);

	buf_block_t*	block = buf_page_get(
		page_id, page_size, RW_X_LATCH, mtr);

	if (block) {
		buf_block_dbg_add_level(block, SYNC_TREE_NODE);

		if (fil_page_index_page_check(block->frame)
		    && index_id == btr_page_get_index_id(block->frame)) {
			/* This should be a root page.
			It should not be possible to reassign the same
			index_id for some other index in the tablespace. */
			ut_ad(page_is_root(block->frame));
		} else {
			block = NULL;
		}
	}

	return(block);
}

/** Create the root node for a new index tree.
@param[in]	type			type of the index
@param[in,out]	space			tablespace where created
@param[in]	index_id		index id
@param[in]	index			index, or NULL when applying TRUNCATE
					log record during recovery
@param[in]	btr_redo_create_info	used for applying TRUNCATE log
					record during recovery
@param[in]	mtr			mini-transaction handle
@return page number of the created root, FIL_NULL if did not succeed */
ulint
btr_create(
	ulint			type,
	fil_space_t*		space,
	index_id_t		index_id,
	dict_index_t*		index,
	const btr_create_t*	btr_redo_create_info,
	mtr_t*			mtr)
{
	buf_block_t*	block;
	page_t*		page;
	page_zip_des_t*	page_zip;

	ut_ad(mtr->is_named_space(space));
	ut_ad(index_id != BTR_FREED_INDEX_ID);

	/* Create the two new segments (one, in the case of an ibuf tree) for
	the index tree; the segment headers are put on the allocated root page
	(for an ibuf tree, not in the root, but on a separate ibuf header
	page) */

	if (type & DICT_IBUF) {
		/* Allocate first the ibuf header page */
		buf_block_t*	ibuf_hdr_block = fseg_create(
			space, 0,
			IBUF_HEADER + IBUF_TREE_SEG_HEADER, mtr);

		if (ibuf_hdr_block == NULL) {
			return(FIL_NULL);
		}

		buf_block_dbg_add_level(
			ibuf_hdr_block, SYNC_IBUF_TREE_NODE_NEW);

		ut_ad(ibuf_hdr_block->page.id.page_no()
		      == IBUF_HEADER_PAGE_NO);
		/* Allocate then the next page to the segment: it will be the
		tree root page */

		block = fseg_alloc_free_page(
			buf_block_get_frame(ibuf_hdr_block)
			+ IBUF_HEADER + IBUF_TREE_SEG_HEADER,
			IBUF_TREE_ROOT_PAGE_NO,
			FSP_UP, mtr);

		if (block == NULL) {
			return(FIL_NULL);
		}

		ut_ad(block->page.id.page_no() == IBUF_TREE_ROOT_PAGE_NO);

		buf_block_dbg_add_level(block, SYNC_IBUF_TREE_NODE_NEW);

		flst_init(block->frame + PAGE_HEADER + PAGE_BTR_IBUF_FREE_LIST,
			  mtr);
	} else {
		block = fseg_create(space, 0,
				    PAGE_HEADER + PAGE_BTR_SEG_TOP, mtr);

		if (block == NULL) {
			return(FIL_NULL);
		}

		buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);

		if (!fseg_create(space, block->page.id.page_no(),
				 PAGE_HEADER + PAGE_BTR_SEG_LEAF, mtr)) {
			/* Not enough space for new segment, free root
			segment before return. */
			btr_free_root(block, mtr);
			if (!index->table->is_temporary()) {
				btr_free_root_invalidate(block, mtr);
			}

			return(FIL_NULL);
		}

		/* The fseg create acquires a second latch on the page,
		therefore we must declare it: */
		buf_block_dbg_add_level(block, SYNC_TREE_NODE_NEW);
	}

	/* Create a new index page on the allocated segment page */
	page_zip = buf_block_get_page_zip(block);

	if (page_zip) {
		if (index != NULL) {
			page = page_create_zip(block, index, 0, 0, NULL, mtr);
		} else {
			/* Create a compressed index page when applying
			TRUNCATE log record during recovery */
			ut_ad(btr_redo_create_info != NULL);

			redo_page_compress_t	page_comp_info;

			page_comp_info.type = type;

			page_comp_info.index_id = index_id;

			page_comp_info.n_fields =
				btr_redo_create_info->n_fields;

			page_comp_info.field_len =
				btr_redo_create_info->field_len;

			page_comp_info.fields = btr_redo_create_info->fields;

			page_comp_info.trx_id_pos =
				btr_redo_create_info->trx_id_pos;

			page = page_create_zip(block, NULL, 0, 0,
					       &page_comp_info, mtr);
		}
	} else {
		if (index != NULL) {
			page = page_create(block, mtr,
					   dict_table_is_comp(index->table),
					   dict_index_is_spatial(index));
		} else {
			ut_ad(btr_redo_create_info != NULL);
			page = page_create(
				block, mtr, btr_redo_create_info->format_flags,
				type == DICT_SPATIAL);
		}
		/* Set the level of the new index page */
		btr_page_set_level(page, NULL, 0, mtr);
	}

	/* Set the index id of the page */
	btr_page_set_index_id(page, page_zip, index_id, mtr);

	/* Set the next node and previous node fields */
	btr_page_set_next(page, page_zip, FIL_NULL, mtr);
	btr_page_set_prev(page, page_zip, FIL_NULL, mtr);

	/* We reset the free bits for the page to allow creation of several
	trees in the same mtr, otherwise the latch on a bitmap page would
	prevent it because of the latching order.

	index will be NULL if we are recreating the table during recovery
	on behalf of TRUNCATE.

	Note: Insert Buffering is disabled for temporary tables given that
	most temporary tables are smaller in size and short-lived. */
	if (!(type & DICT_CLUSTERED)
	    && (index == NULL || !index->table->is_temporary())) {

		ibuf_reset_free_bits(block);
	}

	/* In the following assertion we test that two records of maximum
	allowed size fit on the root page: this fact is needed to ensure
	correctness of split algorithms */

	ut_ad(page_get_max_insert_size(page, 2) > 2 * BTR_PAGE_MAX_REC_SIZE);

	return(block->page.id.page_no());
}

/** Free a B-tree except the root page. The root page MUST be freed after
this by calling btr_free_root.
@param[in,out]	block		root page
@param[in]	log_mode	mtr logging mode */
static
void
btr_free_but_not_root(
	buf_block_t*	block,
	mtr_log_t	log_mode)
{
	ibool	finished;
	mtr_t	mtr;

	ut_ad(page_is_root(block->frame));
leaf_loop:
	mtr_start(&mtr);
	mtr_set_log_mode(&mtr, log_mode);
	mtr.set_named_space_id(block->page.id.space());

	page_t*	root = block->frame;

	if (!root) {
		mtr_commit(&mtr);
		return;
	}

#ifdef UNIV_BTR_DEBUG
	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
				    + root, block->page.id.space()));
	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
				    + root, block->page.id.space()));
#endif /* UNIV_BTR_DEBUG */

	/* NOTE: page hash indexes are dropped when a page is freed inside
	fsp0fsp. */

	finished = fseg_free_step(root + PAGE_HEADER + PAGE_BTR_SEG_LEAF,
				  true, &mtr);
	mtr_commit(&mtr);

	if (!finished) {

		goto leaf_loop;
	}
top_loop:
	mtr_start(&mtr);
	mtr_set_log_mode(&mtr, log_mode);
	mtr.set_named_space_id(block->page.id.space());

	root = block->frame;

#ifdef UNIV_BTR_DEBUG
	ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
				    + root, block->page.id.space()));
#endif /* UNIV_BTR_DEBUG */

	finished = fseg_free_step_not_header(
		root + PAGE_HEADER + PAGE_BTR_SEG_TOP, true, &mtr);
	mtr_commit(&mtr);

	if (!finished) {
		goto top_loop;
	}
}

/** Free a persistent index tree if it exists.
@param[in]	page_id		root page id
@param[in]	page_size	page size
@param[in]	index_id	PAGE_INDEX_ID contents
@param[in,out]	mtr		mini-transaction */
void
btr_free_if_exists(
	const page_id_t&	page_id,
	const page_size_t&	page_size,
	index_id_t		index_id,
	mtr_t*			mtr)
{
	buf_block_t*	root = btr_free_root_check(
		page_id, page_size, index_id, mtr);

	if (root == NULL) {
		return;
	}

	ut_ad(page_is_root(root->frame));
	btr_free_but_not_root(root, mtr->get_log_mode());
	mtr->set_named_space_id(page_id.space());
	btr_free_root(root, mtr);
	btr_free_root_invalidate(root, mtr);
}

/** Free an index tree in a temporary tablespace or during TRUNCATE TABLE.
@param[in]	page_id		root page id
@param[in]	page_size	page size */
void
btr_free(
	const page_id_t&	page_id,
	const page_size_t&	page_size)
{
	mtr_t	mtr;
	mtr.start();
	mtr.set_log_mode(MTR_LOG_NO_REDO);

	buf_block_t*	block = buf_page_get(
		page_id, page_size, RW_X_LATCH, &mtr);

	if (block) {
		ut_ad(page_is_root(block->frame));

		btr_free_but_not_root(block, MTR_LOG_NO_REDO);
		btr_free_root(block, &mtr);
	}
	mtr.commit();
}

/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC.
@param[in,out]	index	clustered index
@return the last used AUTO_INCREMENT value
@retval	0 on error or if no AUTO_INCREMENT value was used yet */
ib_uint64_t
btr_read_autoinc(dict_index_t* index)
{
	ut_ad(index->is_primary());
	ut_ad(index->table->persistent_autoinc);
	ut_ad(!index->table->is_temporary());
	mtr_t		mtr;
	mtr.start();
	ib_uint64_t	autoinc;
	if (buf_block_t* block = buf_page_get(
		    page_id_t(index->table->space->id, index->page),
		    page_size_t(index->table->space->flags),
		    RW_S_LATCH, &mtr)) {
		autoinc = page_get_autoinc(block->frame);
	} else {
		autoinc = 0;
	}
	mtr.commit();
	return autoinc;
}
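
/* Minimal usage sketch (names as defined in this file; "wanted" is a
hypothetical caller-supplied value): read the persisted counter and, if
needed, persist a larger one. Per the parameter documentation of
btr_write_autoinc() below, passing reset=false will not lower a larger
value that already exists in the page:

	ib_uint64_t	autoinc = btr_read_autoinc(index);
	if (autoinc < wanted) {
		btr_write_autoinc(index, wanted, false);
	}
*/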

/** Read the last used AUTO_INCREMENT value from PAGE_ROOT_AUTO_INC,
or fall back to MAX(auto_increment_column).
@param[in]	table	table containing an AUTO_INCREMENT column
@param[in]	col_no	index of the AUTO_INCREMENT column
@return the AUTO_INCREMENT value
@retval	0 on error or if no AUTO_INCREMENT value was used yet */
ib_uint64_t
btr_read_autoinc_with_fallback(const dict_table_t* table, unsigned col_no)
{
	ut_ad(table->persistent_autoinc);
	ut_ad(!table->is_temporary());

	dict_index_t*	index = dict_table_get_first_index(table);

	if (index == NULL) {
		return 0;
	}

	mtr_t	mtr;
	mtr.start();
	buf_block_t*	block = buf_page_get(
		page_id_t(index->table->space->id, index->page),
		page_size_t(index->table->space->flags),
		RW_S_LATCH, &mtr);

	ib_uint64_t	autoinc = block ? page_get_autoinc(block->frame) : 0;
	const bool	retry = block && autoinc == 0
		&& !page_is_empty(block->frame);
	mtr.commit();

	if (retry) {
		/* This should be an old data file where
		PAGE_ROOT_AUTO_INC was initialized to 0.
		Fall back to reading MAX(autoinc_col).
		There should be an index on it. */
		const dict_col_t*	autoinc_col
			= dict_table_get_nth_col(table, col_no);
		while (index && index->fields[0].col != autoinc_col) {
			index = dict_table_get_next_index(index);
		}

		if (index) {
			autoinc = row_search_max_autoinc(index);
		}
	}

	return autoinc;
}

/** Write the next available AUTO_INCREMENT value to PAGE_ROOT_AUTO_INC.
@param[in,out]	index	clustered index
@param[in]	autoinc	the AUTO_INCREMENT value
@param[in]	reset	whether to reset the AUTO_INCREMENT
			to a possibly smaller value than currently
			exists in the page */
void
btr_write_autoinc(dict_index_t* index, ib_uint64_t autoinc, bool reset)
{
	ut_ad(index->is_primary());
	ut_ad(index->table->persistent_autoinc);
	ut_ad(!index->table->is_temporary());

	mtr_t		mtr;
	mtr.start();
	fil_space_t*	space = index->table->space;
	mtr.set_named_space(space);
	page_set_autoinc(buf_page_get(page_id_t(space->id, index->page),
				      page_size_t(space->flags),
				      RW_SX_LATCH, &mtr),
			 index, autoinc, &mtr, reset);
	mtr.commit();
}

/*************************************************************//**
Reorganizes an index page.

IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
if this is a compressed leaf page in a secondary index. This has to
be done either within the same mini-transaction, or by invoking
ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
IBUF_BITMAP_FREE is unaffected by reorganization.

@retval true if the operation was successful
@retval false if it is a compressed page, and recompression failed */
bool
btr_page_reorganize_low(
/*====================*/
	bool		recovery,/*!< in: true if called in recovery:
				locks should not be updated, i.e.,
				there cannot exist locks on the
				page, and a hash index should not be
				dropped: it cannot exist */
	ulint		z_level,/*!< in: compression level to be used
				if dealing with compressed page */
	page_cur_t*	cursor,	/*!< in/out: page cursor */
	dict_index_t*	index,	/*!< in: the index tree of the page */
	mtr_t*		mtr)	/*!< in/out: mini-transaction */
{
	buf_block_t*	block		= page_cur_get_block(cursor);
	buf_pool_t*	buf_pool	= buf_pool_from_bpage(&block->page);
	page_t*		page		= buf_block_get_frame(block);
	page_zip_des_t*	page_zip	= buf_block_get_page_zip(block);
	buf_block_t*	temp_block;
	page_t*		temp_page;
	ulint		data_size1;
	ulint		data_size2;
	ulint		max_ins_size1;
	ulint		max_ins_size2;
	bool		success		= false;
	ulint		pos;
	bool		log_compressed;
	bool		is_spatial;

	ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
	btr_assert_not_corrupted(block, index);
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */
	data_size1 = page_get_data_size(page);
	max_ins_size1 = page_get_max_insert_size_after_reorganize(page, 1);

	/* Turn logging off */
	mtr_log_t	log_mode = mtr_set_log_mode(mtr, MTR_LOG_NONE);

	temp_block = buf_block_alloc(buf_pool);
	temp_page = temp_block->frame;

	MONITOR_INC(MONITOR_INDEX_REORG_ATTEMPTS);

	/* This function can be called by log redo with a "dummy" index.
	In that case we trust the original page's type more than the
	index. */
	is_spatial = (fil_page_get_type(page) == FIL_PAGE_RTREE
		      || dict_index_is_spatial(index));

	/* Copy the old page to temporary space */
	buf_frame_copy(temp_page, page);

	if (!recovery) {
		btr_search_drop_page_hash_index(block);
	}

	/* Save the cursor position. */
	pos = page_rec_get_n_recs_before(page_cur_get_rec(cursor));

	/* Recreate the page: note that global data on page (possible
	segment headers, next page-field, etc.) is preserved intact */

	page_create(block, mtr, dict_table_is_comp(index->table), is_spatial);

	/* Copy the records from the temporary space to the recreated page;
	do not copy the lock bits yet */

	page_copy_rec_list_end_no_locks(block, temp_block,
					page_get_infimum_rec(temp_page),
					index, mtr);

	/* Copy the PAGE_MAX_TRX_ID or PAGE_ROOT_AUTO_INC. */
	memcpy(page + (PAGE_HEADER + PAGE_MAX_TRX_ID),
	       temp_page + (PAGE_HEADER + PAGE_MAX_TRX_ID), 8);
	/* PAGE_MAX_TRX_ID is unused in clustered index pages
	(other than the root where it is repurposed as PAGE_ROOT_AUTO_INC),
	non-leaf pages, and in temporary tables. It was always
	zero-initialized in page_create() in all InnoDB versions.
	PAGE_MAX_TRX_ID must be nonzero on dict_index_is_sec_or_ibuf()
	leaf pages.

	During redo log apply, dict_index_is_sec_or_ibuf() always
	holds, even for clustered indexes. */
	ut_ad(recovery || index->table->is_temporary()
	      || !page_is_leaf(temp_page)
	      || !dict_index_is_sec_or_ibuf(index)
	      || page_get_max_trx_id(page) != 0);
	/* PAGE_MAX_TRX_ID must be zero on non-leaf pages other than
	clustered index root pages. */
	ut_ad(recovery
	      || page_get_max_trx_id(page) == 0
	      || (dict_index_is_sec_or_ibuf(index)
		  ? page_is_leaf(temp_page)
		  : page_is_root(temp_page)));

	/* If innodb_log_compressed_pages is ON, page reorganize should log the
	compressed page image.*/
	log_compressed = page_zip && page_zip_log_pages;

	if (log_compressed) {
		mtr_set_log_mode(mtr, log_mode);
	}

	if (page_zip
	    && !page_zip_compress(page_zip, page, index, z_level, NULL, mtr)) {

		/* Restore the old page and exit. */
#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
		/* Check that the bytes that we skip are identical. */
		ut_a(!memcmp(page, temp_page, PAGE_HEADER));
		ut_a(!memcmp(PAGE_HEADER + PAGE_N_RECS + page,
			     PAGE_HEADER + PAGE_N_RECS + temp_page,
			     PAGE_DATA - (PAGE_HEADER + PAGE_N_RECS)));
		ut_a(!memcmp(srv_page_size - FIL_PAGE_DATA_END + page,
			     srv_page_size - FIL_PAGE_DATA_END + temp_page,
			     FIL_PAGE_DATA_END));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */

		memcpy(PAGE_HEADER + page, PAGE_HEADER + temp_page,
		       PAGE_N_RECS - PAGE_N_DIR_SLOTS);
		memcpy(PAGE_DATA + page, PAGE_DATA + temp_page,
		       srv_page_size - PAGE_DATA - FIL_PAGE_DATA_END);

#if defined UNIV_DEBUG || defined UNIV_ZIP_DEBUG
		ut_a(!memcmp(page, temp_page, srv_page_size));
#endif /* UNIV_DEBUG || UNIV_ZIP_DEBUG */

		goto func_exit;
	}

	if (!recovery && !dict_table_is_locking_disabled(index->table)) {
		/* Update the record lock bitmaps */
		lock_move_reorganize_page(block, temp_block);
	}

	data_size2 = page_get_data_size(page);
	max_ins_size2 = page_get_max_insert_size_after_reorganize(page, 1);

	if (data_size1 != data_size2 || max_ins_size1 != max_ins_size2) {
		ib::error()
			<< "Page old data size " << data_size1
			<< " new data size " << data_size2
			<< ", page old max ins size " << max_ins_size1
			<< " new max ins size " << max_ins_size2;

		ib::error() << BUG_REPORT_MSG;
		ut_ad(0);
	} else {
		success = true;
	}

	/* Restore the cursor position. */
	if (pos > 0) {
		cursor->rec = page_rec_get_nth(page, pos);
	} else {
		ut_ad(cursor->rec == page_get_infimum_rec(page));
	}

func_exit:
#ifdef UNIV_ZIP_DEBUG
	ut_a(!page_zip || page_zip_validate(page_zip, page, index));
#endif /* UNIV_ZIP_DEBUG */

	if (!recovery && page_is_root(temp_page)
	    && fil_page_get_type(temp_page) == FIL_PAGE_TYPE_INSTANT) {
		/* Preserve the PAGE_INSTANT information. */
		ut_ad(!page_zip);
		ut_ad(index->is_instant());
		memcpy(FIL_PAGE_TYPE + page, FIL_PAGE_TYPE + temp_page, 2);
		memcpy(PAGE_HEADER + PAGE_INSTANT + page,
		       PAGE_HEADER + PAGE_INSTANT + temp_page, 2);
	}

	buf_block_free(temp_block);

	/* Restore logging mode */
	mtr_set_log_mode(mtr, log_mode);

	if (success) {
		mlog_id_t	type;
		byte*		log_ptr;

		/* Write the log record */
		if (page_zip) {
			ut_ad(page_is_comp(page));
			type = MLOG_ZIP_PAGE_REORGANIZE;
		} else if (page_is_comp(page)) {
			type = MLOG_COMP_PAGE_REORGANIZE;
		} else {
			type = MLOG_PAGE_REORGANIZE;
		}

		log_ptr = log_compressed
			? NULL
			: mlog_open_and_write_index(
				mtr, page, index, type,
				page_zip ? 1 : 0);

		/* For compressed pages write the compression level. */
		if (log_ptr && page_zip) {
			mach_write_to_1(log_ptr, z_level);
			mlog_close(mtr, log_ptr + 1);
		}

		MONITOR_INC(MONITOR_INDEX_REORG_SUCCESSFUL);
	}

	if (UNIV_UNLIKELY(fil_page_get_type(page) == FIL_PAGE_TYPE_INSTANT)) {
		/* Log the PAGE_INSTANT information. */
		ut_ad(!page_zip);
		ut_ad(index->is_instant());
		ut_ad(!recovery);
		mlog_write_ulint(FIL_PAGE_TYPE + page, FIL_PAGE_TYPE_INSTANT,
				 MLOG_2BYTES, mtr);
		mlog_write_ulint(PAGE_HEADER + PAGE_INSTANT + page,
				 mach_read_from_2(PAGE_HEADER + PAGE_INSTANT
						  + page),
				 MLOG_2BYTES, mtr);
	}

	return(success);
}
1796
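/* Design note on the logging in btr_page_reorganize_low() above: when
innodb_log_compressed_pages is ON, page_zip_compress() has already logged
the full compressed page image within this mini-transaction, so the
logical MLOG_ZIP_PAGE_REORGANIZE record is skipped (log_ptr == NULL).
When it is OFF, only the logical reorganize record is written, and
recovery replays it via btr_parse_page_reorganize() below. */
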
1797/*************************************************************//**
1798Reorganizes an index page.
1799
1800IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
1801if this is a compressed leaf page in a secondary index. This has to
1802be done either within the same mini-transaction, or by invoking
1803ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
1804IBUF_BITMAP_FREE is unaffected by reorganization.
1805
1806@retval true if the operation was successful
1807@retval false if it is a compressed page, and recompression failed */
1808bool
1809btr_page_reorganize_block(
1810/*======================*/
1811 bool recovery,/*!< in: true if called in recovery:
1812 locks should not be updated, i.e.,
1813 there cannot exist locks on the
1814 page, and a hash index should not be
1815 dropped: it cannot exist */
1816 ulint z_level,/*!< in: compression level to be used
1817 if dealing with compressed page */
1818 buf_block_t* block, /*!< in/out: B-tree page */
1819 dict_index_t* index, /*!< in: the index tree of the page */
1820 mtr_t* mtr) /*!< in/out: mini-transaction */
1821{
1822 page_cur_t cur;
1823 page_cur_set_before_first(block, &cur);
1824
1825 return(btr_page_reorganize_low(recovery, z_level, &cur, index, mtr));
1826}
1827
1828/*************************************************************//**
1829Reorganizes an index page.
1830
1831IMPORTANT: On success, the caller will have to update IBUF_BITMAP_FREE
1832if this is a compressed leaf page in a secondary index. This has to
1833be done either within the same mini-transaction, or by invoking
1834ibuf_reset_free_bits() before mtr_commit(). On uncompressed pages,
1835IBUF_BITMAP_FREE is unaffected by reorganization.
1836
1837@retval true if the operation was successful
1838@retval false if it is a compressed page, and recompression failed */
1839bool
1840btr_page_reorganize(
1841/*================*/
1842 page_cur_t* cursor, /*!< in/out: page cursor */
1843 dict_index_t* index, /*!< in: the index tree of the page */
1844 mtr_t* mtr) /*!< in/out: mini-transaction */
1845{
1846 return(btr_page_reorganize_low(false, page_zip_level,
1847 cursor, index, mtr));
1848}
1849
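/* Illustrative usage (a sketch, not compiled code; the names "block",
"index" and "mtr" are hypothetical locals): a caller that already holds
block X-latched in mtr could reorganize the whole page with

	page_cur_t	cur;
	page_cur_set_before_first(block, &cur);
	if (!btr_page_reorganize(&cur, index, mtr)) {
		... recompression of a ROW_FORMAT=COMPRESSED page failed;
		the caller would typically fall back to splitting ...
	}

remembering the IBUF_BITMAP_FREE obligation documented above for
compressed secondary index leaf pages. */
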
1850/***********************************************************//**
1851Parses a redo log record of reorganizing a page.
1852@return end of log record or NULL */
1853byte*
1854btr_parse_page_reorganize(
1855/*======================*/
1856 byte* ptr, /*!< in: buffer */
1857 byte* end_ptr,/*!< in: buffer end */
1858 dict_index_t* index, /*!< in: record descriptor */
1859 bool compressed,/*!< in: true if compressed page */
1860 buf_block_t* block, /*!< in: page to be reorganized, or NULL */
1861 mtr_t* mtr) /*!< in: mtr or NULL */
1862{
1863 ulint level = page_zip_level;
1864
1865 ut_ad(ptr != NULL);
1866 ut_ad(end_ptr != NULL);
1867 ut_ad(index != NULL);
1868
1869 	/* If dealing with a compressed page, the record has the
1870 	compression level used during the original compression written in
1871 	one byte. Otherwise the record is empty. */
1872 if (compressed) {
1873 if (ptr == end_ptr) {
1874 return(NULL);
1875 }
1876
1877 level = mach_read_from_1(ptr);
1878
1879 ut_a(level <= 9);
1880 ++ptr;
1881 	}
1884
1885 if (block != NULL) {
1886 btr_page_reorganize_block(true, level, block, index, mtr);
1887 }
1888
1889 return(ptr);
1890}
1891
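/* For illustration, the redo record parsed above: the body of a
MLOG_ZIP_PAGE_REORGANIZE record is a single byte holding the zlib
compression level (0..9) that was used when the page was compressed,
while MLOG_PAGE_REORGANIZE and MLOG_COMP_PAGE_REORGANIZE carry an empty
body. So, for a page reorganized with z_level=6, this parser reads one
byte with value 6 and advances ptr by one; for uncompressed pages it
leaves ptr unchanged and falls back to page_zip_level. */
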
1892/** Empty an index page (possibly the root page). @see btr_page_create().
1893@param[in,out] block page to be emptied
1894@param[in,out] page_zip compressed page frame, or NULL
1895@param[in] index index of the page
1896@param[in] level B-tree level of the page (0=leaf)
1897@param[in,out] mtr mini-transaction */
1898void
1899btr_page_empty(
1900 buf_block_t* block,
1901 page_zip_des_t* page_zip,
1902 dict_index_t* index,
1903 ulint level,
1904 mtr_t* mtr)
1905{
1906 page_t* page = buf_block_get_frame(block);
1907
1908 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
1909 ut_ad(page_zip == buf_block_get_page_zip(block));
1910#ifdef UNIV_ZIP_DEBUG
1911 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
1912#endif /* UNIV_ZIP_DEBUG */
1913
1914 btr_search_drop_page_hash_index(block);
1915
1916 /* Recreate the page: note that global data on page (possible
1917 segment headers, next page-field, etc.) is preserved intact */
1918
1919 /* Preserve PAGE_ROOT_AUTO_INC when creating a clustered index
1920 root page. */
1921 const ib_uint64_t autoinc
1922 = dict_index_is_clust(index) && page_is_root(page)
1923 ? page_get_autoinc(page)
1924 : 0;
1925
1926 if (page_zip) {
1927 page_create_zip(block, index, level, autoinc, NULL, mtr);
1928 } else {
1929 page_create(block, mtr, dict_table_is_comp(index->table),
1930 dict_index_is_spatial(index));
1931 btr_page_set_level(page, NULL, level, mtr);
1932 if (autoinc) {
1933 mlog_write_ull(PAGE_HEADER + PAGE_MAX_TRX_ID + page,
1934 autoinc, mtr);
1935 }
1936 }
1937}
1938
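/* Worked example of the PAGE_ROOT_AUTO_INC handling above (values made
up): if a clustered index root page stores PAGE_ROOT_AUTO_INC=42 in the
PAGE_MAX_TRX_ID field position and is emptied, e.g. during a root raise,
page_get_autoinc() reads 42 before the page is re-created and the value
is written back afterwards, so the AUTO_INCREMENT counter survives. On
non-root or non-clustered pages, autoinc is 0 and the field stays
zero-initialized. */
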
1939/*************************************************************//**
1940Makes tree one level higher by splitting the root, and inserts
1941the tuple. It is assumed that mtr contains an x-latch on the tree.
1942 NOTE that the operation of this function must always succeed;
1943 we cannot reverse it: therefore enough free disk space must be
1944guaranteed to be available before this function is called.
1945@return inserted record */
1946rec_t*
1947btr_root_raise_and_insert(
1948/*======================*/
1949 ulint flags, /*!< in: undo logging and locking flags */
1950 btr_cur_t* cursor, /*!< in: cursor at which to insert: must be
1951 on the root page; when the function returns,
1952 the cursor is positioned on the predecessor
1953 of the inserted record */
1954 ulint** offsets,/*!< out: offsets on inserted record */
1955 mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
1956 const dtuple_t* tuple, /*!< in: tuple to insert */
1957 ulint n_ext, /*!< in: number of externally stored columns */
1958 mtr_t* mtr) /*!< in: mtr */
1959{
1960 dict_index_t* index;
1961 page_t* root;
1962 page_t* new_page;
1963 ulint new_page_no;
1964 rec_t* rec;
1965 dtuple_t* node_ptr;
1966 ulint level;
1967 rec_t* node_ptr_rec;
1968 page_cur_t* page_cursor;
1969 page_zip_des_t* root_page_zip;
1970 page_zip_des_t* new_page_zip;
1971 buf_block_t* root_block;
1972 buf_block_t* new_block;
1973
1974 root = btr_cur_get_page(cursor);
1975 root_block = btr_cur_get_block(cursor);
1976 root_page_zip = buf_block_get_page_zip(root_block);
1977 ut_ad(!page_is_empty(root));
1978 index = btr_cur_get_index(cursor);
1979 ut_ad(index->n_core_null_bytes <= UT_BITS_IN_BYTES(index->n_nullable));
1980#ifdef UNIV_ZIP_DEBUG
1981 ut_a(!root_page_zip || page_zip_validate(root_page_zip, root, index));
1982#endif /* UNIV_ZIP_DEBUG */
1983#ifdef UNIV_BTR_DEBUG
1984 if (!dict_index_is_ibuf(index)) {
1985 ulint space = index->table->space->id;
1986
1987 ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
1988 + root, space));
1989 ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
1990 + root, space));
1991 }
1992
1993 ut_a(dict_index_get_page(index) == page_get_page_no(root));
1994#endif /* UNIV_BTR_DEBUG */
1995 ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
1996 MTR_MEMO_X_LOCK
1997 | MTR_MEMO_SX_LOCK));
1998 ut_ad(mtr_memo_contains(mtr, root_block, MTR_MEMO_PAGE_X_FIX));
1999
2000 /* Allocate a new page to the tree. Root splitting is done by first
2001 moving the root records to the new page, emptying the root, putting
2002 a node pointer to the new page, and then splitting the new page. */
2003
2004 level = btr_page_get_level(root);
2005
2006 new_block = btr_page_alloc(index, 0, FSP_NO_DIR, level, mtr, mtr);
2007
2008 if (new_block == NULL && os_has_said_disk_full) {
2009 return(NULL);
2010 }
2011
2012 new_page = buf_block_get_frame(new_block);
2013 new_page_zip = buf_block_get_page_zip(new_block);
2014 ut_a(!new_page_zip == !root_page_zip);
2015 ut_a(!new_page_zip
2016 || page_zip_get_size(new_page_zip)
2017 == page_zip_get_size(root_page_zip));
2018
2019 btr_page_create(new_block, new_page_zip, index, level, mtr);
2020
2021 /* Set the next node and previous node fields of new page */
2022 btr_page_set_next(new_page, new_page_zip, FIL_NULL, mtr);
2023 btr_page_set_prev(new_page, new_page_zip, FIL_NULL, mtr);
2024
2025 /* Copy the records from root to the new page one by one. */
2026
2027 if (0
2028#ifdef UNIV_ZIP_COPY
2029 || new_page_zip
2030#endif /* UNIV_ZIP_COPY */
2031 || !page_copy_rec_list_end(new_block, root_block,
2032 page_get_infimum_rec(root),
2033 index, mtr)) {
2034 ut_a(new_page_zip);
2035
2036 /* Copy the page byte for byte. */
2037 page_zip_copy_recs(new_page_zip, new_page,
2038 root_page_zip, root, index, mtr);
2039
2040 /* Update the lock table and possible hash index. */
2041 lock_move_rec_list_end(new_block, root_block,
2042 page_get_infimum_rec(root));
2043
2044 /* Move any existing predicate locks */
2045 if (dict_index_is_spatial(index)) {
2046 lock_prdt_rec_move(new_block, root_block);
2047 } else {
2048 btr_search_move_or_delete_hash_entries(
2049 new_block, root_block);
2050 }
2051 }
2052
2053 if (dict_index_is_sec_or_ibuf(index)) {
2054 /* In secondary indexes and the change buffer,
2055 PAGE_MAX_TRX_ID can be reset on the root page, because
2056 the field only matters on leaf pages, and the root no
2057 longer is a leaf page. (Older versions of InnoDB did
2058 set PAGE_MAX_TRX_ID on all secondary index pages.) */
2059 if (root_page_zip) {
2060 byte* p = PAGE_HEADER + PAGE_MAX_TRX_ID + root;
2061 memset(p, 0, 8);
2062 page_zip_write_header(root_page_zip, p, 8, mtr);
2063 } else {
2064 mlog_write_ull(PAGE_HEADER + PAGE_MAX_TRX_ID
2065 + root, 0, mtr);
2066 }
2067 } else {
2068 /* PAGE_ROOT_AUTO_INC is only present in the clustered index
2069 root page; on other clustered index pages, we want to reserve
2070 the field PAGE_MAX_TRX_ID for future use. */
2071 if (new_page_zip) {
2072 byte* p = PAGE_HEADER + PAGE_MAX_TRX_ID + new_page;
2073 memset(p, 0, 8);
2074 page_zip_write_header(new_page_zip, p, 8, mtr);
2075 } else {
2076 mlog_write_ull(PAGE_HEADER + PAGE_MAX_TRX_ID
2077 + new_page, 0, mtr);
2078 }
2079 }
2080
2081 /* If this is a pessimistic insert which is actually done to
2082 perform a pessimistic update then we have stored the lock
2083 information of the record to be inserted on the infimum of the
2084 root page: we cannot discard the lock structs on the root page */
2085
2086 if (!dict_table_is_locking_disabled(index->table)) {
2087 lock_update_root_raise(new_block, root_block);
2088 }
2089
2090 /* Create a memory heap where the node pointer is stored */
2091 if (!*heap) {
2092 *heap = mem_heap_create(1000);
2093 }
2094
2095 rec = page_rec_get_next(page_get_infimum_rec(new_page));
2096 new_page_no = new_block->page.id.page_no();
2097
2098 /* Build the node pointer (= node key and page address) for the
2099 child */
2100 if (dict_index_is_spatial(index)) {
2101 rtr_mbr_t new_mbr;
2102
2103 rtr_page_cal_mbr(index, new_block, &new_mbr, *heap);
2104 node_ptr = rtr_index_build_node_ptr(
2105 index, &new_mbr, rec, new_page_no, *heap);
2106 } else {
2107 node_ptr = dict_index_build_node_ptr(
2108 index, rec, new_page_no, *heap, level);
2109 }
2110 /* The node pointer must be marked as the predefined minimum record,
2111 as there is no lower alphabetical limit to records in the leftmost
2112 node of a level: */
2113 dtuple_set_info_bits(node_ptr,
2114 dtuple_get_info_bits(node_ptr)
2115 | REC_INFO_MIN_REC_FLAG);
2116
2117 /* Rebuild the root page to get free space */
2118 btr_page_empty(root_block, root_page_zip, index, level + 1, mtr);
2119 /* btr_page_empty() is supposed to zero-initialize the field. */
2120 ut_ad(!page_get_instant(root_block->frame));
2121
2122 if (index->is_instant()) {
2123 ut_ad(!root_page_zip);
2124 byte* page_type = root_block->frame + FIL_PAGE_TYPE;
2125 ut_ad(mach_read_from_2(page_type) == FIL_PAGE_INDEX);
2126 mlog_write_ulint(page_type, FIL_PAGE_TYPE_INSTANT,
2127 MLOG_2BYTES, mtr);
2128 page_set_instant(root_block->frame, index->n_core_fields, mtr);
2129 }
2130
2131 /* Set the next node and previous node fields, although
2132 they should already have been set. The previous node field
2133 must be FIL_NULL if root_page_zip != NULL, because the
2134 REC_INFO_MIN_REC_FLAG (of the first user record) will be
2135 set if and only if !page_has_prev(). */
2136 btr_page_set_next(root, root_page_zip, FIL_NULL, mtr);
2137 btr_page_set_prev(root, root_page_zip, FIL_NULL, mtr);
2138
2139 page_cursor = btr_cur_get_page_cur(cursor);
2140
2141 /* Insert node pointer to the root */
2142
2143 page_cur_set_before_first(root_block, page_cursor);
2144
2145 node_ptr_rec = page_cur_tuple_insert(page_cursor, node_ptr,
2146 index, offsets, heap, 0, mtr);
2147
2148 /* The root page should only contain the node pointer
2149 to new_page at this point. Thus, the data should fit. */
2150 ut_a(node_ptr_rec);
2151
2152 /* We play safe and reset the free bits for the new page */
2153
2154 if (!dict_index_is_clust(index)
2155 && !index->table->is_temporary()) {
2156 ibuf_reset_free_bits(new_block);
2157 }
2158
2159 if (tuple != NULL) {
2160 /* Reposition the cursor to the child node */
2161 page_cur_search(new_block, index, tuple, page_cursor);
2162 } else {
2163 /* Set cursor to first record on child node */
2164 page_cur_set_before_first(new_block, page_cursor);
2165 }
2166
2167 /* Split the child and insert tuple */
2168 if (dict_index_is_spatial(index)) {
2169 /* Split rtree page and insert tuple */
2170 return(rtr_page_split_and_insert(flags, cursor, offsets, heap,
2171 tuple, n_ext, mtr));
2172 } else {
2173 return(btr_page_split_and_insert(flags, cursor, offsets, heap,
2174 tuple, n_ext, mtr));
2175 }
2176}
2177
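/* A sketch of the root raise performed above, for a root R at level L
holding records r1..rn:

	before:	[ R: r1..rn ]				(level L)

	after:	[ R: min-rec node pointer to N ]	(level L+1)
			|
		[ N: r1..rn ]				(level L)

The records are first copied from R to the newly allocated page N, R is
emptied and becomes the new top level, and the insert then proceeds by
splitting N. */
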
2178/*************************************************************//**
2179Decides if the page should be split at the convergence point of inserts
2180converging to the left.
2181@return TRUE if split recommended */
2182ibool
2183btr_page_get_split_rec_to_left(
2184/*===========================*/
2185 btr_cur_t* cursor, /*!< in: cursor at which to insert */
2186 rec_t** split_rec) /*!< out: if split recommended,
2187 the first record on upper half page,
2188 or NULL if tuple to be inserted should
2189 be first */
2190{
2191 page_t* page;
2192 rec_t* insert_point;
2193 rec_t* infimum;
2194
2195 page = btr_cur_get_page(cursor);
2196 insert_point = btr_cur_get_rec(cursor);
2197
2198 if (page_header_get_ptr(page, PAGE_LAST_INSERT)
2199 == page_rec_get_next(insert_point)) {
2200
2201 infimum = page_get_infimum_rec(page);
2202
2203 		/* If the convergence is in the middle of a page, also include
2204 		the record immediately before the new insert in the upper
2205 		page. Otherwise, we could repeatedly move from page to page
2206 lots of records smaller than the convergence point. */
2207
2208 if (infimum != insert_point
2209 && page_rec_get_next(infimum) != insert_point) {
2210
2211 *split_rec = insert_point;
2212 } else {
2213 *split_rec = page_rec_get_next(insert_point);
2214 }
2215
2216 return(TRUE);
2217 }
2218
2219 return(FALSE);
2220}
2221
2222/*************************************************************//**
2223Decides if the page should be split at the convergence point of inserts
2224converging to the right.
2225@return TRUE if split recommended */
2226ibool
2227btr_page_get_split_rec_to_right(
2228/*============================*/
2229 btr_cur_t* cursor, /*!< in: cursor at which to insert */
2230 rec_t** split_rec) /*!< out: if split recommended,
2231 the first record on upper half page,
2232 or NULL if tuple to be inserted should
2233 be first */
2234{
2235 page_t* page;
2236 rec_t* insert_point;
2237
2238 page = btr_cur_get_page(cursor);
2239 insert_point = btr_cur_get_rec(cursor);
2240
2241 /* We use eager heuristics: if the new insert would be right after
2242 the previous insert on the same page, we assume that there is a
2243 pattern of sequential inserts here. */
2244
2245 if (page_header_get_ptr(page, PAGE_LAST_INSERT) == insert_point) {
2246
2247 rec_t* next_rec;
2248
2249 next_rec = page_rec_get_next(insert_point);
2250
2251 if (page_rec_is_supremum(next_rec)) {
2252split_at_new:
2253 /* Split at the new record to insert */
2254 *split_rec = NULL;
2255 } else {
2256 rec_t* next_next_rec = page_rec_get_next(next_rec);
2257 if (page_rec_is_supremum(next_next_rec)) {
2258
2259 goto split_at_new;
2260 }
2261
2262 /* If there are >= 2 user records up from the insert
2263 point, split all but 1 off. We want to keep one because
2264 then sequential inserts can use the adaptive hash
2265 index, as they can do the necessary checks of the right
2266 search position just by looking at the records on this
2267 page. */
2268
2269 *split_rec = next_next_rec;
2270 }
2271
2272 return(TRUE);
2273 }
2274
2275 return(FALSE);
2276}
2277
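/* Worked example of the heuristic above (record names hypothetical):
suppose PAGE_LAST_INSERT and the cursor both point at r5, i.e. inserts
are sequential. If r5 is followed only by the supremum, or by exactly
one user record r6, the page is split at the record to be inserted
(*split_rec = NULL). If r5 is followed by r6 and r7, the split is made
at r7, so r6 stays on the lower page and the adaptive hash index can
keep validating search positions there. */
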
2278/*************************************************************//**
2279Calculates a split record such that the tuple will certainly fit on
2280its half-page when the split is performed. We assume in this function
2281only that the cursor page has at least one user record.
2282@return split record, or NULL if tuple will be the first record on
2283the lower or upper half-page (determined by btr_page_tuple_smaller()) */
2284static
2285rec_t*
2286btr_page_get_split_rec(
2287/*===================*/
2288 btr_cur_t* cursor, /*!< in: cursor at which insert should be made */
2289 const dtuple_t* tuple, /*!< in: tuple to insert */
2290 ulint n_ext) /*!< in: number of externally stored columns */
2291{
2292 page_t* page;
2293 page_zip_des_t* page_zip;
2294 ulint insert_size;
2295 ulint free_space;
2296 ulint total_data;
2297 ulint total_n_recs;
2298 ulint total_space;
2299 ulint incl_data;
2300 rec_t* ins_rec;
2301 rec_t* rec;
2302 rec_t* next_rec;
2303 ulint n;
2304 mem_heap_t* heap;
2305 ulint* offsets;
2306
2307 page = btr_cur_get_page(cursor);
2308
2309 insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
2310 free_space = page_get_free_space_of_empty(page_is_comp(page));
2311
2312 page_zip = btr_cur_get_page_zip(cursor);
2313 if (page_zip) {
2314 /* Estimate the free space of an empty compressed page. */
2315 ulint free_space_zip = page_zip_empty_size(
2316 cursor->index->n_fields,
2317 page_zip_get_size(page_zip));
2318
2319 if (free_space > (ulint) free_space_zip) {
2320 free_space = (ulint) free_space_zip;
2321 }
2322 }
2323
2324 /* free_space is now the free space of a created new page */
2325
2326 total_data = page_get_data_size(page) + insert_size;
2327 total_n_recs = ulint(page_get_n_recs(page)) + 1;
2328 ut_ad(total_n_recs >= 2);
2329 total_space = total_data + page_dir_calc_reserved_space(total_n_recs);
2330
2331 n = 0;
2332 incl_data = 0;
2333 ins_rec = btr_cur_get_rec(cursor);
2334 rec = page_get_infimum_rec(page);
2335
2336 heap = NULL;
2337 offsets = NULL;
2338
2339 	/* We include records in the left half one by one, and stop when
2340 	the space reserved by them exceeds half of total_space. If the
2341 	included records fit on the left page and something is left over
2342 	for the right page as well, the split is made there; otherwise
2343 	the last included record becomes the first record on the right
2344 	half-page. */
2345
2346 do {
2347 /* Decide the next record to include */
2348 if (rec == ins_rec) {
2349 rec = NULL; /* NULL denotes that tuple is
2350 now included */
2351 } else if (rec == NULL) {
2352 rec = page_rec_get_next(ins_rec);
2353 } else {
2354 rec = page_rec_get_next(rec);
2355 }
2356
2357 if (rec == NULL) {
2358 /* Include tuple */
2359 incl_data += insert_size;
2360 } else {
2361 offsets = rec_get_offsets(rec, cursor->index, offsets,
2362 page_is_leaf(page),
2363 ULINT_UNDEFINED, &heap);
2364 incl_data += rec_offs_size(offsets);
2365 }
2366
2367 n++;
2368 } while (incl_data + page_dir_calc_reserved_space(n)
2369 < total_space / 2);
2370
2371 if (incl_data + page_dir_calc_reserved_space(n) <= free_space) {
2372 /* The next record will be the first on
2373 the right half page if it is not the
2374 supremum record of page */
2375
2376 if (rec == ins_rec) {
2377 rec = NULL;
2378
2379 goto func_exit;
2380 } else if (rec == NULL) {
2381 next_rec = page_rec_get_next(ins_rec);
2382 } else {
2383 next_rec = page_rec_get_next(rec);
2384 }
2385 ut_ad(next_rec);
2386 if (!page_rec_is_supremum(next_rec)) {
2387 rec = next_rec;
2388 }
2389 }
2390
2391func_exit:
2392 if (heap) {
2393 mem_heap_free(heap);
2394 }
2395 return(rec);
2396}
2397
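/* A worked example of the arithmetic above, with made-up sizes: say
total_data = 8000 bytes over total_n_recs = 101 (the tuple included),
and directory overhead brings total_space to 8050. The loop accumulates
records (and the tuple, at its cursor position) into incl_data until
incl_data + page_dir_calc_reserved_space(n) reaches total_space / 2 =
4025, so roughly half of the payload falls on each half-page. The final
comparison against free_space then advances the split record past the
included set when that whole set would still fit on one empty (possibly
compressed) page, keeping all of it on the lower half. */
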
2398/*************************************************************//**
2399Returns TRUE if the insert fits on the appropriate half-page with the
2400chosen split_rec.
2401@return true if fits */
2402static MY_ATTRIBUTE((nonnull(1,3,4,6), warn_unused_result))
2403bool
2404btr_page_insert_fits(
2405/*=================*/
2406 btr_cur_t* cursor, /*!< in: cursor at which insert
2407 should be made */
2408 const rec_t* split_rec,/*!< in: suggestion for first record
2409 on upper half-page, or NULL if
2410 tuple to be inserted should be first */
2411 ulint** offsets,/*!< in: rec_get_offsets(
2412 split_rec, cursor->index); out: garbage */
2413 const dtuple_t* tuple, /*!< in: tuple to insert */
2414 ulint n_ext, /*!< in: number of externally stored columns */
2415 mem_heap_t** heap) /*!< in: temporary memory heap */
2416{
2417 page_t* page;
2418 ulint insert_size;
2419 ulint free_space;
2420 ulint total_data;
2421 ulint total_n_recs;
2422 const rec_t* rec;
2423 const rec_t* end_rec;
2424
2425 page = btr_cur_get_page(cursor);
2426
2427 ut_ad(!split_rec
2428 || !page_is_comp(page) == !rec_offs_comp(*offsets));
2429 ut_ad(!split_rec
2430 || rec_offs_validate(split_rec, cursor->index, *offsets));
2431
2432 insert_size = rec_get_converted_size(cursor->index, tuple, n_ext);
2433 free_space = page_get_free_space_of_empty(page_is_comp(page));
2434
2435 /* free_space is now the free space of a created new page */
2436
2437 total_data = page_get_data_size(page) + insert_size;
2438 total_n_recs = ulint(page_get_n_recs(page)) + 1;
2439
2440 /* We determine which records (from rec to end_rec, not including
2441 end_rec) will end up on the other half page from tuple when it is
2442 inserted. */
2443
2444 if (split_rec == NULL) {
2445 rec = page_rec_get_next(page_get_infimum_rec(page));
2446 end_rec = page_rec_get_next(btr_cur_get_rec(cursor));
2447
2448 } else if (cmp_dtuple_rec(tuple, split_rec, *offsets) >= 0) {
2449
2450 rec = page_rec_get_next(page_get_infimum_rec(page));
2451 end_rec = split_rec;
2452 } else {
2453 rec = split_rec;
2454 end_rec = page_get_supremum_rec(page);
2455 }
2456
2457 if (total_data + page_dir_calc_reserved_space(total_n_recs)
2458 <= free_space) {
2459
2460 /* Ok, there will be enough available space on the
2461 half page where the tuple is inserted */
2462
2463 return(true);
2464 }
2465
2466 while (rec != end_rec) {
2467 /* In this loop we calculate the amount of reserved
2468 space after rec is removed from page. */
2469
2470 *offsets = rec_get_offsets(rec, cursor->index, *offsets,
2471 page_is_leaf(page),
2472 ULINT_UNDEFINED, heap);
2473
2474 total_data -= rec_offs_size(*offsets);
2475 total_n_recs--;
2476
2477 if (total_data + page_dir_calc_reserved_space(total_n_recs)
2478 <= free_space) {
2479
2480 /* Ok, there will be enough available space on the
2481 half page where the tuple is inserted */
2482
2483 return(true);
2484 }
2485
2486 rec = page_rec_get_next_const(rec);
2487 }
2488
2489 return(false);
2490}
2491
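/* For illustration: the fast path above answers true when all current
data plus the new tuple would fit on a single empty page. Otherwise the
loop subtracts, record by record, the space of rows that will move to
the other half-page, and answers true as soon as the remainder fits.
The estimate is based on the free space of an empty uncompressed page,
so it can be too optimistic for ROW_FORMAT=COMPRESSED; this is why the
caller in btr_page_split_and_insert() combines the result with
!new_page_zip before releasing the tree latch. */
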
2492/*******************************************************//**
2493Inserts a data tuple to a tree on a non-leaf level. It is assumed
2494that mtr holds an x-latch on the tree. */
2495void
2496btr_insert_on_non_leaf_level_func(
2497/*==============================*/
2498 ulint flags, /*!< in: undo logging and locking flags */
2499 dict_index_t* index, /*!< in: index */
2500 ulint level, /*!< in: level, must be > 0 */
2501 dtuple_t* tuple, /*!< in: the record to be inserted */
2502 const char* file, /*!< in: file name */
2503 unsigned line, /*!< in: line where called */
2504 mtr_t* mtr) /*!< in: mtr */
2505{
2506 big_rec_t* dummy_big_rec;
2507 btr_cur_t cursor;
2508 dberr_t err;
2509 rec_t* rec;
2510 mem_heap_t* heap = NULL;
2511 ulint offsets_[REC_OFFS_NORMAL_SIZE];
2512 ulint* offsets = offsets_;
2513 rec_offs_init(offsets_);
2514 rtr_info_t rtr_info;
2515
2516 ut_ad(level > 0);
2517
2518 if (!dict_index_is_spatial(index)) {
2519 dberr_t err = btr_cur_search_to_nth_level(
2520 index, level, tuple, PAGE_CUR_LE,
2521 BTR_CONT_MODIFY_TREE,
2522 &cursor, 0, file, line, mtr);
2523
2524 if (err != DB_SUCCESS) {
2525 			ib::warn() << "Error code: " << err
2526 				<< " in btr_insert_on_non_leaf_level_func"
2527 << " level: " << level
2528 << " called from file: "
2529 << file << " line: " << line
2530 << " table: " << index->table->name
2531 << " index: " << index->name;
2532 }
2533 } else {
2534 /* For spatial index, initialize structures to track
2535 its parents etc. */
2536 rtr_init_rtr_info(&rtr_info, false, &cursor, index, false);
2537
2538 rtr_info_update_btr(&cursor, &rtr_info);
2539
2540 btr_cur_search_to_nth_level(index, level, tuple,
2541 PAGE_CUR_RTREE_INSERT,
2542 BTR_CONT_MODIFY_TREE,
2543 &cursor, 0, file, line, mtr);
2544 }
2545
2546 ut_ad(cursor.flag == BTR_CUR_BINARY);
2547
2548 err = btr_cur_optimistic_insert(
2549 flags
2550 | BTR_NO_LOCKING_FLAG
2551 | BTR_KEEP_SYS_FLAG
2552 | BTR_NO_UNDO_LOG_FLAG,
2553 &cursor, &offsets, &heap,
2554 tuple, &rec, &dummy_big_rec, 0, NULL, mtr);
2555
2556 if (err == DB_FAIL) {
2557 err = btr_cur_pessimistic_insert(flags
2558 | BTR_NO_LOCKING_FLAG
2559 | BTR_KEEP_SYS_FLAG
2560 | BTR_NO_UNDO_LOG_FLAG,
2561 &cursor, &offsets, &heap,
2562 tuple, &rec,
2563 &dummy_big_rec, 0, NULL, mtr);
2564 ut_a(err == DB_SUCCESS);
2565 }
2566
2567 if (heap != NULL) {
2568 mem_heap_free(heap);
2569 }
2570
2571 if (dict_index_is_spatial(index)) {
2572 ut_ad(cursor.rtr_info);
2573
2574 rtr_clean_rtr_info(&rtr_info, true);
2575 }
2576}
2577
2578/**************************************************************//**
2579Attaches the halves of an index page on the appropriate level in an
2580index tree. */
2581static MY_ATTRIBUTE((nonnull))
2582void
2583btr_attach_half_pages(
2584/*==================*/
2585 ulint flags, /*!< in: undo logging and
2586 locking flags */
2587 dict_index_t* index, /*!< in: the index tree */
2588 buf_block_t* block, /*!< in/out: page to be split */
2589 const rec_t* split_rec, /*!< in: first record on upper
2590 half page */
2591 buf_block_t* new_block, /*!< in/out: the new half page */
2592 ulint direction, /*!< in: FSP_UP or FSP_DOWN */
2593 mtr_t* mtr) /*!< in: mtr */
2594{
2595 ulint prev_page_no;
2596 ulint next_page_no;
2597 ulint level;
2598 page_t* page = buf_block_get_frame(block);
2599 page_t* lower_page;
2600 page_t* upper_page;
2601 ulint lower_page_no;
2602 ulint upper_page_no;
2603 page_zip_des_t* lower_page_zip;
2604 page_zip_des_t* upper_page_zip;
2605 dtuple_t* node_ptr_upper;
2606 mem_heap_t* heap;
2607 buf_block_t* prev_block = NULL;
2608 buf_block_t* next_block = NULL;
2609
2610 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2611 ut_ad(mtr_memo_contains(mtr, new_block, MTR_MEMO_PAGE_X_FIX));
2612
2613 /* Create a memory heap where the data tuple is stored */
2614 heap = mem_heap_create(1024);
2615
2616 /* Based on split direction, decide upper and lower pages */
2617 if (direction == FSP_DOWN) {
2618
2619 btr_cur_t cursor;
2620 ulint* offsets;
2621
2622 lower_page = buf_block_get_frame(new_block);
2623 lower_page_no = new_block->page.id.page_no();
2624 lower_page_zip = buf_block_get_page_zip(new_block);
2625 upper_page = buf_block_get_frame(block);
2626 upper_page_no = block->page.id.page_no();
2627 upper_page_zip = buf_block_get_page_zip(block);
2628
2629 /* Look up the index for the node pointer to page */
2630 offsets = btr_page_get_father_block(NULL, heap, index,
2631 block, mtr, &cursor);
2632
2633 /* Replace the address of the old child node (= page) with the
2634 address of the new lower half */
2635
2636 btr_node_ptr_set_child_page_no(
2637 btr_cur_get_rec(&cursor),
2638 btr_cur_get_page_zip(&cursor),
2639 offsets, lower_page_no, mtr);
2640 mem_heap_empty(heap);
2641 } else {
2642 lower_page = buf_block_get_frame(block);
2643 lower_page_no = block->page.id.page_no();
2644 lower_page_zip = buf_block_get_page_zip(block);
2645 upper_page = buf_block_get_frame(new_block);
2646 upper_page_no = new_block->page.id.page_no();
2647 upper_page_zip = buf_block_get_page_zip(new_block);
2648 }
2649
2650 /* Get the previous and next pages of page */
2651 prev_page_no = btr_page_get_prev(page, mtr);
2652 next_page_no = btr_page_get_next(page, mtr);
2653
2654 const ulint space = block->page.id.space();
2655
2656 /* for consistency, both blocks should be locked, before change */
2657 if (prev_page_no != FIL_NULL && direction == FSP_DOWN) {
2658 prev_block = btr_block_get(
2659 page_id_t(space, prev_page_no), block->page.size,
2660 RW_X_LATCH, index, mtr);
2661 }
2662 if (next_page_no != FIL_NULL && direction != FSP_DOWN) {
2663 next_block = btr_block_get(
2664 page_id_t(space, next_page_no), block->page.size,
2665 RW_X_LATCH, index, mtr);
2666 }
2667
2668 /* Get the level of the split pages */
2669 level = btr_page_get_level(buf_block_get_frame(block));
2670 ut_ad(level == btr_page_get_level(buf_block_get_frame(new_block)));
2671
2672 /* Build the node pointer (= node key and page address) for the upper
2673 half */
2674
2675 node_ptr_upper = dict_index_build_node_ptr(index, split_rec,
2676 upper_page_no, heap, level);
2677
2678 /* Insert it next to the pointer to the lower half. Note that this
2679 may generate recursion leading to a split on the higher level. */
2680
2681 btr_insert_on_non_leaf_level(flags, index, level + 1,
2682 node_ptr_upper, mtr);
2683
2684 /* Free the memory heap */
2685 mem_heap_free(heap);
2686
2687 /* Update page links of the level */
2688
2689 if (prev_block) {
2690#ifdef UNIV_BTR_DEBUG
2691 ut_a(page_is_comp(prev_block->frame) == page_is_comp(page));
2692 ut_a(btr_page_get_next(prev_block->frame, mtr)
2693 == block->page.id.page_no());
2694#endif /* UNIV_BTR_DEBUG */
2695
2696 btr_page_set_next(buf_block_get_frame(prev_block),
2697 buf_block_get_page_zip(prev_block),
2698 lower_page_no, mtr);
2699 }
2700
2701 if (next_block) {
2702#ifdef UNIV_BTR_DEBUG
2703 ut_a(page_is_comp(next_block->frame) == page_is_comp(page));
2704 ut_a(btr_page_get_prev(next_block->frame, mtr)
2705 == page_get_page_no(page));
2706#endif /* UNIV_BTR_DEBUG */
2707
2708 btr_page_set_prev(buf_block_get_frame(next_block),
2709 buf_block_get_page_zip(next_block),
2710 upper_page_no, mtr);
2711 }
2712
2713 if (direction == FSP_DOWN) {
2714 /* lower_page is new */
2715 btr_page_set_prev(lower_page, lower_page_zip,
2716 prev_page_no, mtr);
2717 } else {
2718 ut_ad(btr_page_get_prev(lower_page, mtr) == prev_page_no);
2719 }
2720
2721 btr_page_set_next(lower_page, lower_page_zip, upper_page_no, mtr);
2722 btr_page_set_prev(upper_page, upper_page_zip, lower_page_no, mtr);
2723
2724 if (direction != FSP_DOWN) {
2725 /* upper_page is new */
2726 btr_page_set_next(upper_page, upper_page_zip,
2727 next_page_no, mtr);
2728 } else {
2729 ut_ad(btr_page_get_next(upper_page, mtr) == next_page_no);
2730 }
2731}
2732
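/* Sketch of the sibling link updates above, for a page P split with old
neighbors A and B and a new page N:

	FSP_UP (N becomes the upper half):
		before:	A <-> P <-> B
		after:	A <-> P <-> N <-> B

	FSP_DOWN (N becomes the lower half):
		before:	A <-> P <-> B
		after:	A <-> N <-> P <-> B

Only the neighbor on the side of the new page (B for FSP_UP, A for
FSP_DOWN) needs to be X-latched and repointed; the links between the two
halves are always set, and the new half gets its outer sibling pointer. */
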
2733/*************************************************************//**
2734Determine if a tuple is smaller than any record on the page.
2735@return TRUE if smaller */
2736static MY_ATTRIBUTE((nonnull, warn_unused_result))
2737bool
2738btr_page_tuple_smaller(
2739/*===================*/
2740 btr_cur_t* cursor, /*!< in: b-tree cursor */
2741 const dtuple_t* tuple, /*!< in: tuple to consider */
2742 ulint** offsets,/*!< in/out: temporary storage */
2743 ulint n_uniq, /*!< in: number of unique fields
2744 in the index page records */
2745 mem_heap_t** heap) /*!< in/out: heap for offsets */
2746{
2747 buf_block_t* block;
2748 const rec_t* first_rec;
2749 page_cur_t pcur;
2750
2751 /* Read the first user record in the page. */
2752 block = btr_cur_get_block(cursor);
2753 page_cur_set_before_first(block, &pcur);
2754 page_cur_move_to_next(&pcur);
2755 first_rec = page_cur_get_rec(&pcur);
2756
2757 *offsets = rec_get_offsets(
2758 first_rec, cursor->index, *offsets, page_is_leaf(block->frame),
2759 n_uniq, heap);
2760
2761 return(cmp_dtuple_rec(tuple, first_rec, *offsets) < 0);
2762}
2763
2764/** Insert the tuple into the right sibling page, if the cursor is at the end
2765of a page.
2766@param[in] flags undo logging and locking flags
2767@param[in,out] cursor cursor at which to insert; when the function succeeds,
2768 the cursor is positioned before the insert point.
2769@param[out] offsets offsets on inserted record
2770@param[in,out] heap memory heap for allocating offsets
2771@param[in] tuple tuple to insert
2772@param[in] n_ext number of externally stored columns
2773@param[in,out] mtr mini-transaction
2774@return inserted record (first record on the right sibling page);
2775 the cursor will be positioned on the page infimum
2776@retval NULL if the operation was not performed */
2777static
2778rec_t*
2779btr_insert_into_right_sibling(
2780 ulint flags,
2781 btr_cur_t* cursor,
2782 ulint** offsets,
2783 mem_heap_t* heap,
2784 const dtuple_t* tuple,
2785 ulint n_ext,
2786 mtr_t* mtr)
2787{
2788 buf_block_t* block = btr_cur_get_block(cursor);
2789 page_t* page = buf_block_get_frame(block);
2790 ulint next_page_no = btr_page_get_next(page, mtr);
2791
2792 ut_ad(mtr_memo_contains_flagged(
2793 mtr, dict_index_get_lock(cursor->index),
2794 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
2795 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2796 ut_ad(heap);
2797
2798 if (next_page_no == FIL_NULL || !page_rec_is_supremum(
2799 page_rec_get_next(btr_cur_get_rec(cursor)))) {
2800
2801 return(NULL);
2802 }
2803
2804 page_cur_t next_page_cursor;
2805 buf_block_t* next_block;
2806 page_t* next_page;
2807 btr_cur_t next_father_cursor;
2808 rec_t* rec = NULL;
2809 ulint max_size;
2810
2811 const ulint space = block->page.id.space();
2812
2813 next_block = btr_block_get(
2814 page_id_t(space, next_page_no), block->page.size,
2815 RW_X_LATCH, cursor->index, mtr);
2816 next_page = buf_block_get_frame(next_block);
2817
2818 bool is_leaf = page_is_leaf(next_page);
2819
2820 btr_page_get_father(
2821 cursor->index, next_block, mtr, &next_father_cursor);
2822
2823 page_cur_search(
2824 next_block, cursor->index, tuple, PAGE_CUR_LE,
2825 &next_page_cursor);
2826
2827 max_size = page_get_max_insert_size_after_reorganize(next_page, 1);
2828
2829 	/* Extend the gap locks to the next page */
2830 if (!dict_table_is_locking_disabled(cursor->index->table)) {
2831 lock_update_split_left(next_block, block);
2832 }
2833
2834 rec = page_cur_tuple_insert(
2835 &next_page_cursor, tuple, cursor->index, offsets, &heap,
2836 n_ext, mtr);
2837
2838 if (rec == NULL) {
2839 if (is_leaf
2840 && next_block->page.size.is_compressed()
2841 && !dict_index_is_clust(cursor->index)
2842 && !cursor->index->table->is_temporary()) {
2843 /* Reset the IBUF_BITMAP_FREE bits, because
2844 page_cur_tuple_insert() will have attempted page
2845 reorganize before failing. */
2846 ibuf_reset_free_bits(next_block);
2847 }
2848 return(NULL);
2849 }
2850
2851 ibool compressed;
2852 dberr_t err;
2853 ulint level = btr_page_get_level(next_page);
2854
2855 /* adjust cursor position */
2856 *btr_cur_get_page_cur(cursor) = next_page_cursor;
2857
2858 ut_ad(btr_cur_get_rec(cursor) == page_get_infimum_rec(next_page));
2859 ut_ad(page_rec_get_next(page_get_infimum_rec(next_page)) == rec);
2860
2861 /* We have to change the parent node pointer */
2862
2863 compressed = btr_cur_pessimistic_delete(
2864 &err, TRUE, &next_father_cursor,
2865 BTR_CREATE_FLAG, false, mtr);
2866
2867 ut_a(err == DB_SUCCESS);
2868
2869 if (!compressed) {
2870 btr_cur_compress_if_useful(&next_father_cursor, FALSE, mtr);
2871 }
2872
2873 dtuple_t* node_ptr = dict_index_build_node_ptr(
2874 cursor->index, rec, next_block->page.id.page_no(),
2875 heap, level);
2876
2877 btr_insert_on_non_leaf_level(
2878 flags, cursor->index, level + 1, node_ptr, mtr);
2879
2880 ut_ad(rec_offs_validate(rec, cursor->index, *offsets));
2881
2882 if (is_leaf
2883 && !dict_index_is_clust(cursor->index)
2884 && !cursor->index->table->is_temporary()) {
2885 /* Update the free bits of the B-tree page in the
2886 insert buffer bitmap. */
2887
2888 if (next_block->page.size.is_compressed()) {
2889 ibuf_update_free_bits_zip(next_block, mtr);
2890 } else {
2891 ibuf_update_free_bits_if_full(
2892 next_block, max_size,
2893 rec_offs_size(*offsets) + PAGE_DIR_SLOT_SIZE);
2894 }
2895 }
2896
2897 return(rec);
2898}
2899
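/* Design note: the right-sibling insert above is an optimization for
ascending-key workloads. When the cursor sits at the end of a full page
and the new key sorts between the last record of this page and the first
record of the next, inserting directly into that sibling is much cheaper
than a split; the only tree-level work is updating the sibling's parent
node pointer (its first record changed), done via
btr_cur_pessimistic_delete() followed by btr_insert_on_non_leaf_level(). */
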
2900/*************************************************************//**
2901 Splits an index page in halves and inserts the tuple. It is assumed
2902 that mtr holds an x-latch on the index tree. NOTE: the tree x-latch may be
2903 released within this function! NOTE that the operation of this
2904 function must always succeed; we cannot reverse it: therefore enough
2905free disk space (2 pages) must be guaranteed to be available before
2906this function is called.
2907 NOTE: jonaso added support for calling this function with tuple == NULL,
2908 which causes it to only split a page.
2909
2910 @return inserted record or NULL if we run out of space */
2911rec_t*
2912btr_page_split_and_insert(
2913/*======================*/
2914 ulint flags, /*!< in: undo logging and locking flags */
2915 btr_cur_t* cursor, /*!< in: cursor at which to insert; when the
2916 function returns, the cursor is positioned
2917 on the predecessor of the inserted record */
2918 ulint** offsets,/*!< out: offsets on inserted record */
2919 mem_heap_t** heap, /*!< in/out: pointer to memory heap, or NULL */
2920 const dtuple_t* tuple, /*!< in: tuple to insert */
2921 ulint n_ext, /*!< in: number of externally stored columns */
2922 mtr_t* mtr) /*!< in: mtr */
2923{
2924 buf_block_t* block;
2925 page_t* page;
2926 page_zip_des_t* page_zip;
2927 ulint page_no;
2928 byte direction;
2929 ulint hint_page_no;
2930 buf_block_t* new_block;
2931 page_t* new_page;
2932 page_zip_des_t* new_page_zip;
2933 rec_t* split_rec;
2934 buf_block_t* left_block;
2935 buf_block_t* right_block;
2936 buf_block_t* insert_block;
2937 page_cur_t* page_cursor;
2938 rec_t* first_rec;
2939 byte* buf = 0; /* remove warning */
2940 rec_t* move_limit;
2941 ibool insert_will_fit;
2942 ibool insert_left;
2943 ulint n_iterations = 0;
2944 rec_t* rec;
2945 ulint n_uniq;
2946 dict_index_t* index;
2947
2948 index = btr_cur_get_index(cursor);
2949
2950 if (dict_index_is_spatial(index)) {
2951 /* Split rtree page and update parent */
2952 return(rtr_page_split_and_insert(flags, cursor, offsets, heap,
2953 tuple, n_ext, mtr));
2954 }
2955
2956 if (!*heap) {
2957 *heap = mem_heap_create(1024);
2958 }
2959 n_uniq = dict_index_get_n_unique_in_tree(cursor->index);
2960func_start:
2961 mem_heap_empty(*heap);
2962 *offsets = NULL;
2963
2964 ut_ad(mtr_memo_contains_flagged(mtr,
2965 dict_index_get_lock(cursor->index),
2966 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
2967 ut_ad(!dict_index_is_online_ddl(cursor->index)
2968 || (flags & BTR_CREATE_FLAG)
2969 || dict_index_is_clust(cursor->index));
2970 ut_ad(rw_lock_own_flagged(dict_index_get_lock(cursor->index),
2971 RW_LOCK_FLAG_X | RW_LOCK_FLAG_SX));
2972
2973 block = btr_cur_get_block(cursor);
2974 page = buf_block_get_frame(block);
2975 page_zip = buf_block_get_page_zip(block);
2976
2977 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
2978 ut_ad(!page_is_empty(page));
2979
2980 /* try to insert to the next page if possible before split */
2981 rec = btr_insert_into_right_sibling(
2982 flags, cursor, offsets, *heap, tuple, n_ext, mtr);
2983
2984 if (rec != NULL) {
2985 return(rec);
2986 }
2987
2988 page_no = block->page.id.page_no();
2989
2990 /* 1. Decide the split record; split_rec == NULL means that the
2991 tuple to be inserted should be the first record on the upper
2992 half-page */
2993 insert_left = FALSE;
2994
2995 if (tuple != NULL && n_iterations > 0) {
2996 direction = FSP_UP;
2997 hint_page_no = page_no + 1;
2998 split_rec = btr_page_get_split_rec(cursor, tuple, n_ext);
2999
3000 if (split_rec == NULL) {
3001 insert_left = btr_page_tuple_smaller(
3002 cursor, tuple, offsets, n_uniq, heap);
3003 }
3004 } else if (btr_page_get_split_rec_to_right(cursor, &split_rec)) {
3005 direction = FSP_UP;
3006 hint_page_no = page_no + 1;
3007
3008 } else if (btr_page_get_split_rec_to_left(cursor, &split_rec)) {
3009 direction = FSP_DOWN;
3010 hint_page_no = page_no - 1;
3011 ut_ad(split_rec);
3012 } else {
3013 direction = FSP_UP;
3014 hint_page_no = page_no + 1;
3015
3016 /* If there is only one record in the index page, we
3017 can't split the node in the middle by default. We need
3018 to determine whether the new record will be inserted
3019 to the left or right. */
3020
3021 if (page_get_n_recs(page) > 1) {
3022 split_rec = page_get_middle_rec(page);
3023 } else if (btr_page_tuple_smaller(cursor, tuple,
3024 offsets, n_uniq, heap)) {
3025 split_rec = page_rec_get_next(
3026 page_get_infimum_rec(page));
3027 } else {
3028 split_rec = NULL;
3029 }
3030 }
3031
3032 DBUG_EXECUTE_IF("disk_is_full",
3033 os_has_said_disk_full = true;
3034 return(NULL););
3035
3036 /* 2. Allocate a new page to the index */
3037 new_block = btr_page_alloc(cursor->index, hint_page_no, direction,
3038 btr_page_get_level(page), mtr, mtr);
3039
3040 if (new_block == NULL && os_has_said_disk_full) {
3041 return(NULL);
3042 }
3043
3044 new_page = buf_block_get_frame(new_block);
3045 new_page_zip = buf_block_get_page_zip(new_block);
3046 btr_page_create(new_block, new_page_zip, cursor->index,
3047 btr_page_get_level(page), mtr);
3048 /* Only record the leaf level page splits. */
3049 if (page_is_leaf(page)) {
3050 cursor->index->stat_defrag_n_page_split ++;
3051 cursor->index->stat_defrag_modified_counter ++;
3052 btr_defragment_save_defrag_stats_if_needed(cursor->index);
3053 }
3054
3055 /* 3. Calculate the first record on the upper half-page, and the
3056 first record (move_limit) on original page which ends up on the
3057 upper half */
3058
3059 if (split_rec) {
3060 first_rec = move_limit = split_rec;
3061
3062 *offsets = rec_get_offsets(split_rec, cursor->index, *offsets,
3063 page_is_leaf(page), n_uniq, heap);
3064
3065 if (tuple != NULL) {
3066 insert_left = cmp_dtuple_rec(
3067 tuple, split_rec, *offsets) < 0;
3068 } else {
3069 insert_left = 1;
3070 }
3071
3072 if (!insert_left && new_page_zip && n_iterations > 0) {
3073 /* If a compressed page has already been split,
3074 avoid further splits by inserting the record
3075 to an empty page. */
3076 split_rec = NULL;
3077 goto insert_empty;
3078 }
3079 } else if (insert_left) {
3080 ut_a(n_iterations > 0);
3081 first_rec = page_rec_get_next(page_get_infimum_rec(page));
3082 move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
3083 } else {
3084insert_empty:
3085 ut_ad(!split_rec);
3086 ut_ad(!insert_left);
3087 buf = UT_NEW_ARRAY_NOKEY(
3088 byte,
3089 rec_get_converted_size(cursor->index, tuple, n_ext));
3090
3091 first_rec = rec_convert_dtuple_to_rec(buf, cursor->index,
3092 tuple, n_ext);
3093 move_limit = page_rec_get_next(btr_cur_get_rec(cursor));
3094 }
3095
3096 /* 4. Do first the modifications in the tree structure */
3097
3098 btr_attach_half_pages(flags, cursor->index, block,
3099 first_rec, new_block, direction, mtr);
3100
3101 /* If the split is made on the leaf level and the insert will fit
3102 on the appropriate half-page, we may release the tree x-latch.
3103 We can then move the records after releasing the tree latch,
3104 thus reducing the tree latch contention. */
3105 if (tuple == NULL) {
3106 insert_will_fit = 1;
3107 	} else if (split_rec) {
3109 insert_will_fit = !new_page_zip
3110 && btr_page_insert_fits(cursor, split_rec,
3111 offsets, tuple, n_ext, heap);
3112 } else {
3113 if (!insert_left) {
3114 UT_DELETE_ARRAY(buf);
3115 buf = NULL;
3116 }
3117
3118 insert_will_fit = !new_page_zip
3119 && btr_page_insert_fits(cursor, NULL,
3120 offsets, tuple, n_ext, heap);
3121 }
3122
3123 if (!srv_read_only_mode
3124 && insert_will_fit
3125 && page_is_leaf(page)
3126 && !dict_index_is_online_ddl(cursor->index)) {
3127
3128 mtr->memo_release(
3129 dict_index_get_lock(cursor->index),
3130 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK);
3131
3132 		/* NOTE: We cannot release the root block latch here, because it
3133 		contains the segment headers and has usually been modified already. */
3134 }
3135
3136 /* 5. Move then the records to the new page */
3137 if (direction == FSP_DOWN) {
3138 /* fputs("Split left\n", stderr); */
3139
3140 if (0
3141#ifdef UNIV_ZIP_COPY
3142 || page_zip
3143#endif /* UNIV_ZIP_COPY */
3144 || !page_move_rec_list_start(new_block, block, move_limit,
3145 cursor->index, mtr)) {
3146 /* For some reason, compressing new_page failed,
3147 even though it should contain fewer records than
3148 the original page. Copy the page byte for byte
3149 and then delete the records from both pages
3150 as appropriate. Deleting will always succeed. */
3151 ut_a(new_page_zip);
3152
3153 page_zip_copy_recs(new_page_zip, new_page,
3154 page_zip, page, cursor->index, mtr);
3155 page_delete_rec_list_end(move_limit - page + new_page,
3156 new_block, cursor->index,
3157 ULINT_UNDEFINED,
3158 ULINT_UNDEFINED, mtr);
3159
3160 /* Update the lock table and possible hash index. */
3161 lock_move_rec_list_start(
3162 new_block, block, move_limit,
3163 new_page + PAGE_NEW_INFIMUM);
3164
3165 btr_search_move_or_delete_hash_entries(
3166 new_block, block);
3167
3168 /* Delete the records from the source page. */
3169
3170 page_delete_rec_list_start(move_limit, block,
3171 cursor->index, mtr);
3172 }
3173
3174 left_block = new_block;
3175 right_block = block;
3176
3177 if (!dict_table_is_locking_disabled(cursor->index->table)) {
3178 lock_update_split_left(right_block, left_block);
3179 }
3180 } else {
3181 /* fputs("Split right\n", stderr); */
3182
3183 if (0
3184#ifdef UNIV_ZIP_COPY
3185 || page_zip
3186#endif /* UNIV_ZIP_COPY */
3187 || !page_move_rec_list_end(new_block, block, move_limit,
3188 cursor->index, mtr)) {
3189 /* For some reason, compressing new_page failed,
3190 even though it should contain fewer records than
3191 the original page. Copy the page byte for byte
3192 and then delete the records from both pages
3193 as appropriate. Deleting will always succeed. */
3194 ut_a(new_page_zip);
3195
3196 page_zip_copy_recs(new_page_zip, new_page,
3197 page_zip, page, cursor->index, mtr);
3198 page_delete_rec_list_start(move_limit - page
3199 + new_page, new_block,
3200 cursor->index, mtr);
3201
3202 /* Update the lock table and possible hash index. */
3203 lock_move_rec_list_end(new_block, block, move_limit);
3204
3205 ut_ad(!dict_index_is_spatial(index));
3206
3207 btr_search_move_or_delete_hash_entries(
3208 new_block, block);
3209
3210 /* Delete the records from the source page. */
3211
3212 page_delete_rec_list_end(move_limit, block,
3213 cursor->index,
3214 ULINT_UNDEFINED,
3215 ULINT_UNDEFINED, mtr);
3216 }
3217
3218 left_block = block;
3219 right_block = new_block;
3220
3221 if (!dict_table_is_locking_disabled(cursor->index->table)) {
3222 lock_update_split_right(right_block, left_block);
3223 }
3224 }
3225
3226#ifdef UNIV_ZIP_DEBUG
3227 if (page_zip) {
3228 ut_a(page_zip_validate(page_zip, page, cursor->index));
3229 ut_a(page_zip_validate(new_page_zip, new_page, cursor->index));
3230 }
3231#endif /* UNIV_ZIP_DEBUG */
3232
3233 /* At this point, split_rec, move_limit and first_rec may point
3234 to garbage on the old page. */
3235
3236 /* 6. The split and the tree modification is now completed. Decide the
3237 page where the tuple should be inserted */
3238
3239 if (tuple == NULL) {
3240 rec = NULL;
3241 goto func_exit;
3242 }
3243
3244 if (insert_left) {
3245 insert_block = left_block;
3246 } else {
3247 insert_block = right_block;
3248 }
3249
3250 /* 7. Reposition the cursor for insert and try insertion */
3251 page_cursor = btr_cur_get_page_cur(cursor);
3252
3253 page_cur_search(insert_block, cursor->index, tuple, page_cursor);
3254
3255 rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
3256 offsets, heap, n_ext, mtr);
3257
3258#ifdef UNIV_ZIP_DEBUG
3259 {
3260 page_t* insert_page
3261 = buf_block_get_frame(insert_block);
3262
3263 page_zip_des_t* insert_page_zip
3264 = buf_block_get_page_zip(insert_block);
3265
3266 ut_a(!insert_page_zip
3267 || page_zip_validate(insert_page_zip, insert_page,
3268 cursor->index));
3269 }
3270#endif /* UNIV_ZIP_DEBUG */
3271
3272 if (rec != NULL) {
3273
3274 goto func_exit;
3275 }
3276
3277 /* 8. If insert did not fit, try page reorganization.
3278 For compressed pages, page_cur_tuple_insert() will have
3279 attempted this already. */
3280
3281 if (page_cur_get_page_zip(page_cursor)
3282 || !btr_page_reorganize(page_cursor, cursor->index, mtr)) {
3283
3284 goto insert_failed;
3285 }
3286
3287 rec = page_cur_tuple_insert(page_cursor, tuple, cursor->index,
3288 offsets, heap, n_ext, mtr);
3289
3290 if (rec == NULL) {
3291 /* The insert did not fit on the page: loop back to the
3292 start of the function for a new split */
3293insert_failed:
3294 /* We play safe and reset the free bits for new_page */
3295 if (!dict_index_is_clust(cursor->index)
3296 && !cursor->index->table->is_temporary()) {
3297 ibuf_reset_free_bits(new_block);
3298 ibuf_reset_free_bits(block);
3299 }
3300
3301 n_iterations++;
3302 ut_ad(n_iterations < 2
3303 || buf_block_get_page_zip(insert_block));
3304 ut_ad(!insert_will_fit);
3305
3306 goto func_start;
3307 }
3308
3309func_exit:
3310 /* Insert fit on the page: update the free bits for the
3311 left and right pages in the same mtr */
3312
3313 if (!dict_index_is_clust(cursor->index)
3314 && !cursor->index->table->is_temporary()
3315 && page_is_leaf(page)) {
3316
3317 ibuf_update_free_bits_for_two_pages_low(
3318 left_block, right_block, mtr);
3319 }
3320
3321 MONITOR_INC(MONITOR_INDEX_SPLIT);
3322
3323 ut_ad(page_validate(buf_block_get_frame(left_block), cursor->index));
3324 ut_ad(page_validate(buf_block_get_frame(right_block), cursor->index));
3325
3326 if (tuple == NULL) {
3327 ut_ad(rec == NULL);
3328 }
3329 ut_ad(!rec || rec_offs_validate(rec, cursor->index, *offsets));
3330 return(rec);
3331}
3332
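/* Design note on the retry loop above: when the tuple still does not
fit after the split and a reorganize attempt, control jumps back to
func_start and splits again with n_iterations > 0. On that pass the
split record is chosen by btr_page_get_split_rec() relative to the
tuple itself, and for a compressed page the tuple may be directed to a
completely empty new page (see insert_empty) to stop cascading splits.
The assertion n_iterations < 2 || buf_block_get_page_zip(insert_block)
records that only ROW_FORMAT=COMPRESSED pages are expected to need more
than one retry. */
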
3333/** Removes a page from the level list of pages.
3334@param[in] space space where removed
3335@param[in] page_size page size
3336@param[in,out] page page to remove
3337@param[in] index index tree
3338@param[in,out] mtr mini-transaction */
3339void
3340btr_level_list_remove_func(
3341 ulint space,
3342 const page_size_t& page_size,
3343 page_t* page,
3344 dict_index_t* index,
3345 mtr_t* mtr)
3346{
3347 ut_ad(page != NULL);
3348 ut_ad(mtr != NULL);
3349 ut_ad(mtr_memo_contains_page(mtr, page, MTR_MEMO_PAGE_X_FIX));
3350 ut_ad(space == page_get_space_id(page));
3351 /* Get the previous and next page numbers of page */
3352
3353 const ulint prev_page_no = btr_page_get_prev(page, mtr);
3354 const ulint next_page_no = btr_page_get_next(page, mtr);
3355
3356 /* Update page links of the level */
3357
3358 if (prev_page_no != FIL_NULL) {
3359 buf_block_t* prev_block
3360 = btr_block_get(page_id_t(space, prev_page_no),
3361 page_size, RW_X_LATCH, index, mtr);
3362
3363 page_t* prev_page
3364 = buf_block_get_frame(prev_block);
3365#ifdef UNIV_BTR_DEBUG
3366 ut_a(page_is_comp(prev_page) == page_is_comp(page));
3367 ut_a(btr_page_get_next(prev_page, mtr)
3368 == page_get_page_no(page));
3369#endif /* UNIV_BTR_DEBUG */
3370
3371 btr_page_set_next(prev_page,
3372 buf_block_get_page_zip(prev_block),
3373 next_page_no, mtr);
3374 }
3375
3376 if (next_page_no != FIL_NULL) {
3377 buf_block_t* next_block
3378 = btr_block_get(
3379 page_id_t(space, next_page_no), page_size,
3380 RW_X_LATCH, index, mtr);
3381
3382 page_t* next_page
3383 = buf_block_get_frame(next_block);
3384#ifdef UNIV_BTR_DEBUG
3385 ut_a(page_is_comp(next_page) == page_is_comp(page));
3386 ut_a(btr_page_get_prev(next_page, mtr)
3387 == page_get_page_no(page));
3388#endif /* UNIV_BTR_DEBUG */
3389
3390 btr_page_set_prev(next_page,
3391 buf_block_get_page_zip(next_block),
3392 prev_page_no, mtr);
3393 }
3394}
3395
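/* Sketch of the level list unlink above, for a page P with neighbors
A and B:

	before:	A <-> P <-> B
	after:	A <-> B

Either neighbor may be FIL_NULL, in which case only the side that
exists is repointed; each neighbor is X-latched via btr_block_get()
before its pointer is changed. */
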
3396/****************************************************************//**
3397Writes the redo log record for setting an index record as the predefined
3398minimum record. */
3399UNIV_INLINE
3400void
3401btr_set_min_rec_mark_log(
3402/*=====================*/
3403 rec_t* rec, /*!< in: record */
3404 mlog_id_t type, /*!< in: MLOG_COMP_REC_MIN_MARK or
3405 MLOG_REC_MIN_MARK */
3406 mtr_t* mtr) /*!< in: mtr */
3407{
3408 mlog_write_initial_log_record(rec, type, mtr);
3409
3410 /* Write rec offset as a 2-byte ulint */
3411 mlog_catenate_ulint(mtr, page_offset(rec), MLOG_2BYTES);
3412}
3413
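/* For illustration: the complete redo record written above is the usual
initial log record (type, space id, page number) followed by the 2-byte
offset of the record within its page; btr_parse_set_min_rec_mark() below
consumes exactly those two bytes. */
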
3414/****************************************************************//**
3415Parses the redo log record for setting an index record as the predefined
3416minimum record.
3417@return end of log record or NULL */
3418byte*
3419btr_parse_set_min_rec_mark(
3420/*=======================*/
3421 byte* ptr, /*!< in: buffer */
3422 byte* end_ptr,/*!< in: buffer end */
3423 ulint comp, /*!< in: nonzero=compact page format */
3424 page_t* page, /*!< in: page or NULL */
3425 mtr_t* mtr) /*!< in: mtr or NULL */
3426{
3427 rec_t* rec;
3428
3429 if (end_ptr < ptr + 2) {
3430
3431 return(NULL);
3432 }
3433
3434 if (page) {
3435 ut_a(!page_is_comp(page) == !comp);
3436
3437 rec = page + mach_read_from_2(ptr);
3438
3439 btr_set_min_rec_mark(rec, mtr);
3440 }
3441
3442 return(ptr + 2);
3443}
3444
3445/****************************************************************//**
3446Sets a record as the predefined minimum record. */
3447void
3448btr_set_min_rec_mark(
3449/*=================*/
3450 rec_t* rec, /*!< in: record */
3451 mtr_t* mtr) /*!< in: mtr */
3452{
3453 ulint info_bits;
3454
3455 if (page_rec_is_comp(rec)) {
3456 info_bits = rec_get_info_bits(rec, TRUE);
3457
3458 rec_set_info_bits_new(rec, info_bits | REC_INFO_MIN_REC_FLAG);
3459
3460 btr_set_min_rec_mark_log(rec, MLOG_COMP_REC_MIN_MARK, mtr);
3461 } else {
3462 info_bits = rec_get_info_bits(rec, FALSE);
3463
3464 rec_set_info_bits_old(rec, info_bits | REC_INFO_MIN_REC_FLAG);
3465
3466 btr_set_min_rec_mark_log(rec, MLOG_REC_MIN_MARK, mtr);
3467 }
3468}
3469
3470/*************************************************************//**
3471Deletes on the upper level the node pointer to a page. */
3472void
3473btr_node_ptr_delete(
3474/*================*/
3475 dict_index_t* index, /*!< in: index tree */
3476 buf_block_t* block, /*!< in: page whose node pointer is deleted */
3477 mtr_t* mtr) /*!< in: mtr */
3478{
3479 btr_cur_t cursor;
3480 ibool compressed;
3481 dberr_t err;
3482
3483 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3484
3485 /* Delete node pointer on father page */
3486 btr_page_get_father(index, block, mtr, &cursor);
3487
3488 compressed = btr_cur_pessimistic_delete(&err, TRUE, &cursor,
3489 BTR_CREATE_FLAG, false, mtr);
3490 ut_a(err == DB_SUCCESS);
3491
3492 if (!compressed) {
3493 btr_cur_compress_if_useful(&cursor, FALSE, mtr);
3494 }
3495}
3496
3497/*************************************************************//**
3498 If the page is the only one on its level, this function moves its records
3499 to the father page, thus reducing the tree height.
3500@return father block */
3502buf_block_t*
3503btr_lift_page_up(
3504/*=============*/
3505 dict_index_t* index, /*!< in: index tree */
3506 	buf_block_t*	block,	/*!< in: page which is the only one on its level;
3507 must not be empty: use
3508 btr_discard_only_page_on_level if the last
3509 record from the page should be removed */
3510 mtr_t* mtr) /*!< in: mtr */
3511{
3512 buf_block_t* father_block;
3513 page_t* father_page;
3514 ulint page_level;
3515 page_zip_des_t* father_page_zip;
3516 page_t* page = buf_block_get_frame(block);
3517 ulint root_page_no;
3518 buf_block_t* blocks[BTR_MAX_LEVELS];
3519 ulint n_blocks; /*!< last used index in blocks[] */
3520 ulint i;
3521 bool lift_father_up;
3522 buf_block_t* block_orig = block;
3523
3524 ut_ad(!page_has_siblings(page));
3525 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3526
3527 page_level = btr_page_get_level(page);
3528 root_page_no = dict_index_get_page(index);
3529
3530 {
3531 btr_cur_t cursor;
3532 ulint* offsets = NULL;
3533 mem_heap_t* heap = mem_heap_create(
3534 sizeof(*offsets)
3535 * (REC_OFFS_HEADER_SIZE + 1 + 1
3536 + unsigned(index->n_fields)));
3537 buf_block_t* b;
3538
3539 if (dict_index_is_spatial(index)) {
3540 offsets = rtr_page_get_father_block(
3541 NULL, heap, index, block, mtr,
3542 NULL, &cursor);
3543 } else {
3544 offsets = btr_page_get_father_block(offsets, heap,
3545 index, block,
3546 mtr, &cursor);
3547 }
3548 father_block = btr_cur_get_block(&cursor);
3549 father_page_zip = buf_block_get_page_zip(father_block);
3550 father_page = buf_block_get_frame(father_block);
3551
3552 n_blocks = 0;
3553
		/* Store all ancestor pages so we can reset their
		levels later on. We have to do all the searches on
		the tree now, because later on, after we have replaced
		the first level, the tree is in an inconsistent state
		and cannot be searched. */
3559 for (b = father_block;
3560 b->page.id.page_no() != root_page_no; ) {
3561 ut_a(n_blocks < BTR_MAX_LEVELS);
3562
3563 if (dict_index_is_spatial(index)) {
3564 offsets = rtr_page_get_father_block(
3565 NULL, heap, index, b, mtr,
3566 NULL, &cursor);
3567 } else {
3568 offsets = btr_page_get_father_block(offsets,
3569 heap,
3570 index, b,
3571 mtr,
3572 &cursor);
3573 }
3574
3575 blocks[n_blocks++] = b = btr_cur_get_block(&cursor);
3576 }
3577
3578 lift_father_up = (n_blocks && page_level == 0);
3579 if (lift_father_up) {
			/* The father page should also be the only page on
			its level (and not the root). We must lift up the
			father page first, because a leaf page may be lifted
			up directly only into the root page. The segment from
			which a page is freed is chosen based on its
			page_level (== 0 or != 0). If page_level changed from
			!= 0 to == 0, a later freeing of the page would not
			find the page allocation to be freed. */
3587
3588 block = father_block;
3589 page = buf_block_get_frame(block);
3590 page_level = btr_page_get_level(page);
3591
3592 ut_ad(!page_has_siblings(page));
3593 ut_ad(mtr_memo_contains(
3594 mtr, block, MTR_MEMO_PAGE_X_FIX));
3595
3596 father_block = blocks[0];
3597 father_page_zip = buf_block_get_page_zip(father_block);
3598 father_page = buf_block_get_frame(father_block);
3599 }
3600
3601 mem_heap_free(heap);
3602 }
3603
3604 btr_search_drop_page_hash_index(block);
3605
3606 /* Make the father empty */
3607 btr_page_empty(father_block, father_page_zip, index, page_level, mtr);
3608 /* btr_page_empty() is supposed to zero-initialize the field. */
3609 ut_ad(!page_get_instant(father_block->frame));
3610
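	/* If the whole level consisted of a single leaf page, the father
	is the root page, and it is about to become a leaf page itself.
	When the index carries instant ADD COLUMN metadata, the code below
	restores what btr_page_empty() cleared: the FIL_PAGE_TYPE_INSTANT
	page type and the number of core fields in the PAGE_INSTANT field
	of the root. */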
3611 if (page_level == 0 && index->is_instant()) {
3612 ut_ad(!father_page_zip);
3613 byte* page_type = father_block->frame + FIL_PAGE_TYPE;
3614 ut_ad(mach_read_from_2(page_type) == FIL_PAGE_INDEX);
3615 mlog_write_ulint(page_type, FIL_PAGE_TYPE_INSTANT,
3616 MLOG_2BYTES, mtr);
3617 page_set_instant(father_block->frame,
3618 index->n_core_fields, mtr);
3619 }
3620
3621 page_level++;
3622
3623 /* Copy the records to the father page one by one. */
3624 if (0
3625#ifdef UNIV_ZIP_COPY
3626 || father_page_zip
3627#endif /* UNIV_ZIP_COPY */
3628 || !page_copy_rec_list_end(father_block, block,
3629 page_get_infimum_rec(page),
3630 index, mtr)) {
3631 const page_zip_des_t* page_zip
3632 = buf_block_get_page_zip(block);
3633 ut_a(father_page_zip);
3634 ut_a(page_zip);
3635
3636 /* Copy the page byte for byte. */
3637 page_zip_copy_recs(father_page_zip, father_page,
3638 page_zip, page, index, mtr);
3639
3640 /* Update the lock table and possible hash index. */
3641
3642 lock_move_rec_list_end(father_block, block,
3643 page_get_infimum_rec(page));
3644
3645 /* Also update the predicate locks */
3646 if (dict_index_is_spatial(index)) {
3647 lock_prdt_rec_move(father_block, block);
3648 } else {
3649 btr_search_move_or_delete_hash_entries(
3650 father_block, block);
3651 }
3652 }
3653
3654 if (!dict_table_is_locking_disabled(index->table)) {
3655 /* Free predicate page locks on the block */
3656 if (dict_index_is_spatial(index)) {
3657 lock_mutex_enter();
3658 lock_prdt_page_free_from_discard(
3659 block, lock_sys.prdt_page_hash);
3660 lock_mutex_exit();
3661 }
3662 lock_update_copy_and_discard(father_block, block);
3663 }
3664
	/* Go upward towards the root page, decrementing the level of
	each ancestor by one. */
3666 for (i = lift_father_up ? 1 : 0; i < n_blocks; i++, page_level++) {
3667 page_t* page = buf_block_get_frame(blocks[i]);
3668 page_zip_des_t* page_zip= buf_block_get_page_zip(blocks[i]);
3669
3670 ut_ad(btr_page_get_level(page) == page_level + 1);
3671
3672 btr_page_set_level(page, page_zip, page_level, mtr);
3673#ifdef UNIV_ZIP_DEBUG
3674 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
3675#endif /* UNIV_ZIP_DEBUG */
3676 }
3677
3678 if (dict_index_is_spatial(index)) {
3679 rtr_check_discard_page(index, NULL, block);
3680 }
3681
3682 /* Free the file page */
3683 btr_page_free(index, block, mtr);
3684
3685 /* We play it safe and reset the free bits for the father */
3686 if (!dict_index_is_clust(index)
3687 && !index->table->is_temporary()) {
3688 ibuf_reset_free_bits(father_block);
3689 }
3690 ut_ad(page_validate(father_page, index));
3691 ut_ad(btr_check_node_ptr(index, father_block, mtr));
3692
3693 return(lift_father_up ? block_orig : father_block);
3694}
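
/* Illustration of btr_lift_page_up() (a sketch, not actual code):
if P is the only page on its level, its records are moved to its
father F, P is freed, and the level of every remaining ancestor is
decremented by one:

	F  (level n+1)			F  (level n, holds P's records)
	|			becomes
	P  (level n)			(P is freed)
*/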
3695
3696/*************************************************************//**
Tries to merge the page first with the immediate left brother, if such a
brother exists and the node pointers to the current page and to the brother
reside on the same page. If the left brother does not satisfy these
conditions, looks at the right brother. If the page is the only one on that
level, lifts the records of the page to the father page, thus reducing the
tree height. It is assumed that mtr holds an x-latch on the tree and on the
page. If cursor is on the leaf level, mtr must also hold x-latches on the
brothers, if they exist.
3705@return TRUE on success */
3706ibool
3707btr_compress(
3708/*=========*/
3709 btr_cur_t* cursor, /*!< in/out: cursor on the page to merge
3710 or lift; the page must not be empty:
3711 when deleting records, use btr_discard_page()
3712 if the page would become empty */
3713 ibool adjust, /*!< in: TRUE if should adjust the
3714 cursor position even if compression occurs */
3715 mtr_t* mtr) /*!< in/out: mini-transaction */
3716{
3717 dict_index_t* index;
3718 ulint left_page_no;
3719 ulint right_page_no;
3720 buf_block_t* merge_block;
3721 page_t* merge_page = NULL;
3722 page_zip_des_t* merge_page_zip;
3723 ibool is_left;
3724 buf_block_t* block;
3725 page_t* page;
3726 btr_cur_t father_cursor;
3727 mem_heap_t* heap;
3728 ulint* offsets;
3729 ulint nth_rec = 0; /* remove bogus warning */
3730 bool mbr_changed = false;
3731#ifdef UNIV_DEBUG
3732 bool leftmost_child;
3733#endif
3734 DBUG_ENTER("btr_compress");
3735
3736 block = btr_cur_get_block(cursor);
3737 page = btr_cur_get_page(cursor);
3738 index = btr_cur_get_index(cursor);
3739
3740 btr_assert_not_corrupted(block, index);
3741
3742#ifdef UNIV_DEBUG
3743 if (dict_index_is_spatial(index)) {
3744 ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
3745 MTR_MEMO_X_LOCK));
3746 } else {
3747 ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
3748 MTR_MEMO_X_LOCK
3749 | MTR_MEMO_SX_LOCK));
3750 }
3751#endif /* UNIV_DEBUG */
3752
3753 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
3754
3755 const page_size_t page_size(index->table->space->flags);
3756
3757 MONITOR_INC(MONITOR_INDEX_MERGE_ATTEMPTS);
3758
3759 left_page_no = btr_page_get_prev(page, mtr);
3760 right_page_no = btr_page_get_next(page, mtr);
3761
3762#ifdef UNIV_DEBUG
3763 if (!page_is_leaf(page) && left_page_no == FIL_NULL) {
3764 ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
3765 page_rec_get_next(page_get_infimum_rec(page)),
3766 page_is_comp(page)));
3767 }
3768#endif /* UNIV_DEBUG */
3769
3770 heap = mem_heap_create(100);
3771
3772 if (dict_index_is_spatial(index)) {
3773 offsets = rtr_page_get_father_block(
3774 NULL, heap, index, block, mtr, cursor, &father_cursor);
3775 ut_ad(cursor->page_cur.block->page.id.page_no()
3776 == block->page.id.page_no());
3777 rec_t* my_rec = father_cursor.page_cur.rec;
3778
3779 ulint page_no = btr_node_ptr_get_child_page_no(my_rec, offsets);
3780
3781 if (page_no != block->page.id.page_no()) {
3782 ib::info() << "father positioned on page "
				<< page_no << " instead of "
3784 << block->page.id.page_no();
3785 offsets = btr_page_get_father_block(
3786 NULL, heap, index, block, mtr, &father_cursor);
3787 }
3788 } else {
3789 offsets = btr_page_get_father_block(
3790 NULL, heap, index, block, mtr, &father_cursor);
3791 }
3792
3793 if (adjust) {
3794 nth_rec = page_rec_get_n_recs_before(btr_cur_get_rec(cursor));
3795 ut_ad(nth_rec > 0);
3796 }
3797
3798 if (left_page_no == FIL_NULL && right_page_no == FIL_NULL) {
3799 /* The page is the only one on the level, lift the records
3800 to the father */
3801
3802 merge_block = btr_lift_page_up(index, block, mtr);
3803 goto func_exit;
3804 }
3805
3806 ut_d(leftmost_child =
3807 left_page_no != FIL_NULL
3808 && (page_rec_get_next(
3809 page_get_infimum_rec(
3810 btr_cur_get_page(&father_cursor)))
3811 == btr_cur_get_rec(&father_cursor)));
3812
3813 /* Decide the page to which we try to merge and which will inherit
3814 the locks */
3815
3816 is_left = btr_can_merge_with_page(cursor, left_page_no,
3817 &merge_block, mtr);
3818
3819 DBUG_EXECUTE_IF("ib_always_merge_right", is_left = FALSE;);
3820retry:
3821 if (!is_left
3822 && !btr_can_merge_with_page(cursor, right_page_no, &merge_block,
3823 mtr)) {
3824 if (!merge_block) {
3825 merge_page = NULL;
3826 }
3827 goto err_exit;
3828 }
3829
3830 merge_page = buf_block_get_frame(merge_block);
3831
3832#ifdef UNIV_BTR_DEBUG
3833 if (is_left) {
3834 ut_a(btr_page_get_next(merge_page, mtr)
3835 == block->page.id.page_no());
3836 } else {
3837 ut_a(btr_page_get_prev(merge_page, mtr)
3838 == block->page.id.page_no());
3839 }
3840#endif /* UNIV_BTR_DEBUG */
3841
3842#ifdef UNIV_GIS_DEBUG
3843 if (dict_index_is_spatial(index)) {
3844 if (is_left) {
3845 fprintf(stderr, "GIS_DIAG: merge left %ld to %ld \n",
3846 (long) block->page.id.page_no(), left_page_no);
3847 } else {
3848 fprintf(stderr, "GIS_DIAG: merge right %ld to %ld\n",
3849 (long) block->page.id.page_no(), right_page_no);
3850 }
3851 }
3852#endif /* UNIV_GIS_DEBUG */
3853
3854 ut_ad(page_validate(merge_page, index));
3855
3856 merge_page_zip = buf_block_get_page_zip(merge_block);
3857#ifdef UNIV_ZIP_DEBUG
3858 if (merge_page_zip) {
3859 const page_zip_des_t* page_zip
3860 = buf_block_get_page_zip(block);
3861 ut_a(page_zip);
3862 ut_a(page_zip_validate(merge_page_zip, merge_page, index));
3863 ut_a(page_zip_validate(page_zip, page, index));
3864 }
3865#endif /* UNIV_ZIP_DEBUG */
3866
3867 /* Move records to the merge page */
3868 if (is_left) {
3869 btr_cur_t cursor2;
3870 rtr_mbr_t new_mbr;
3871 ulint* offsets2 = NULL;
3872
3873 /* For rtree, we need to update father's mbr. */
3874 if (dict_index_is_spatial(index)) {
			/* We only support merging pages that have the
			same parent page */
3877 if (!rtr_check_same_block(
3878 index, &cursor2,
3879 btr_cur_get_block(&father_cursor),
3880 merge_block, heap)) {
3881 is_left = false;
3882 goto retry;
3883 }
3884
3885 /* Set rtr_info for cursor2, since it is
3886 necessary in recursive page merge. */
3887 cursor2.rtr_info = cursor->rtr_info;
3888 cursor2.tree_height = cursor->tree_height;
3889
3890 offsets2 = rec_get_offsets(
3891 btr_cur_get_rec(&cursor2), index, NULL,
3892 page_is_leaf(cursor2.page_cur.block->frame),
3893 ULINT_UNDEFINED, &heap);
3894
3895 /* Check if parent entry needs to be updated */
3896 mbr_changed = rtr_merge_mbr_changed(
3897 &cursor2, &father_cursor,
3898 offsets2, offsets, &new_mbr);
3899 }
3900
3901 rec_t* orig_pred = page_copy_rec_list_start(
3902 merge_block, block, page_get_supremum_rec(page),
3903 index, mtr);
3904
3905 if (!orig_pred) {
3906 goto err_exit;
3907 }
3908
3909 btr_search_drop_page_hash_index(block);
3910
3911 /* Remove the page from the level list */
3912 btr_level_list_remove(index->table->space->id,
3913 page_size, page, index, mtr);
3914
3915 if (dict_index_is_spatial(index)) {
3916 rec_t* my_rec = father_cursor.page_cur.rec;
3917
3918 ulint page_no = btr_node_ptr_get_child_page_no(
3919 my_rec, offsets);
3920
3921 if (page_no != block->page.id.page_no()) {
3922
3923 ib::fatal() << "father positioned on "
3924 << page_no << " instead of "
3925 << block->page.id.page_no();
3926
3927 ut_ad(0);
3928 }
3929
3930 if (mbr_changed) {
3931#ifdef UNIV_DEBUG
3932 bool success = rtr_update_mbr_field(
3933 &cursor2, offsets2, &father_cursor,
3934 merge_page, &new_mbr, NULL, mtr);
3935
3936 ut_ad(success);
3937#else
3938 rtr_update_mbr_field(
3939 &cursor2, offsets2, &father_cursor,
3940 merge_page, &new_mbr, NULL, mtr);
3941#endif
3942 } else {
3943 rtr_node_ptr_delete(&father_cursor, mtr);
3944 }
3945
			/* No gap locks need to be worried about */
3947 lock_mutex_enter();
3948 lock_prdt_page_free_from_discard(
3949 block, lock_sys.prdt_page_hash);
3950 lock_rec_free_all_from_discard_page(block);
3951 lock_mutex_exit();
3952 } else {
3953 btr_node_ptr_delete(index, block, mtr);
3954 if (!dict_table_is_locking_disabled(index->table)) {
3955 lock_update_merge_left(
3956 merge_block, orig_pred, block);
3957 }
3958 }
3959
3960 if (adjust) {
3961 nth_rec += page_rec_get_n_recs_before(orig_pred);
3962 }
3963 } else {
3964 rec_t* orig_succ;
3965 ibool compressed;
3966 dberr_t err;
3967 btr_cur_t cursor2;
3968 /* father cursor pointing to node ptr
3969 of the right sibling */
3970#ifdef UNIV_BTR_DEBUG
3971 byte fil_page_prev[4];
3972#endif /* UNIV_BTR_DEBUG */
3973
3974 if (dict_index_is_spatial(index)) {
3975 cursor2.rtr_info = NULL;
3976
3977 /* For spatial index, we disallow merge of blocks
3978 with different parents, since the merge would need
3979 to update entry (for MBR and Primary key) in the
3980 parent of block being merged */
3981 if (!rtr_check_same_block(
3982 index, &cursor2,
3983 btr_cur_get_block(&father_cursor),
3984 merge_block, heap)) {
3985 goto err_exit;
3986 }
3987
3988 /* Set rtr_info for cursor2, since it is
3989 necessary in recursive page merge. */
3990 cursor2.rtr_info = cursor->rtr_info;
3991 cursor2.tree_height = cursor->tree_height;
3992 } else {
3993 btr_page_get_father(index, merge_block, mtr, &cursor2);
3994 }
3995
3996 if (merge_page_zip && left_page_no == FIL_NULL) {
3997
3998 /* The function page_zip_compress(), which will be
3999 invoked by page_copy_rec_list_end() below,
4000 requires that FIL_PAGE_PREV be FIL_NULL.
4001 Clear the field, but prepare to restore it. */
4002#ifdef UNIV_BTR_DEBUG
4003 memcpy(fil_page_prev, merge_page + FIL_PAGE_PREV, 4);
4004#endif /* UNIV_BTR_DEBUG */
4005 compile_time_assert(FIL_NULL == 0xffffffffU);
4006 memset(merge_page + FIL_PAGE_PREV, 0xff, 4);
4007 }
4008
4009 orig_succ = page_copy_rec_list_end(merge_block, block,
4010 page_get_infimum_rec(page),
4011 cursor->index, mtr);
4012
4013 if (!orig_succ) {
4014 ut_a(merge_page_zip);
4015#ifdef UNIV_BTR_DEBUG
4016 if (left_page_no == FIL_NULL) {
4017 /* FIL_PAGE_PREV was restored from
4018 merge_page_zip. */
4019 ut_a(!memcmp(fil_page_prev,
4020 merge_page + FIL_PAGE_PREV, 4));
4021 }
4022#endif /* UNIV_BTR_DEBUG */
4023 goto err_exit;
4024 }
4025
4026 btr_search_drop_page_hash_index(block);
4027
4028#ifdef UNIV_BTR_DEBUG
4029 if (merge_page_zip && left_page_no == FIL_NULL) {
4030
4031 /* Restore FIL_PAGE_PREV in order to avoid an assertion
4032 failure in btr_level_list_remove(), which will set
4033 the field again to FIL_NULL. Even though this makes
4034 merge_page and merge_page_zip inconsistent for a
4035 split second, it is harmless, because the pages
4036 are X-latched. */
4037 memcpy(merge_page + FIL_PAGE_PREV, fil_page_prev, 4);
4038 }
4039#endif /* UNIV_BTR_DEBUG */
4040
4041 /* Remove the page from the level list */
4042 btr_level_list_remove(index->table->space->id,
4043 page_size, page, index, mtr);
4044
4045 ut_ad(btr_node_ptr_get_child_page_no(
4046 btr_cur_get_rec(&father_cursor), offsets)
4047 == block->page.id.page_no());
4048
4049 /* Replace the address of the old child node (= page) with the
4050 address of the merge page to the right */
4051 btr_node_ptr_set_child_page_no(
4052 btr_cur_get_rec(&father_cursor),
4053 btr_cur_get_page_zip(&father_cursor),
4054 offsets, right_page_no, mtr);
4055
4056#ifdef UNIV_DEBUG
4057 if (!page_is_leaf(page) && left_page_no == FIL_NULL) {
4058 ut_ad(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
4059 page_rec_get_next(page_get_infimum_rec(
4060 buf_block_get_frame(merge_block))),
4061 page_is_comp(page)));
4062 }
4063#endif /* UNIV_DEBUG */
4064
4065 /* For rtree, we need to update father's mbr. */
4066 if (dict_index_is_spatial(index)) {
4067 ulint* offsets2;
4068 ulint rec_info;
4069
4070 offsets2 = rec_get_offsets(
4071 btr_cur_get_rec(&cursor2), index, NULL,
4072 page_is_leaf(cursor2.page_cur.block->frame),
4073 ULINT_UNDEFINED, &heap);
4074
4075 ut_ad(btr_node_ptr_get_child_page_no(
4076 btr_cur_get_rec(&cursor2), offsets2)
4077 == right_page_no);
4078
4079 rec_info = rec_get_info_bits(
4080 btr_cur_get_rec(&father_cursor),
4081 rec_offs_comp(offsets));
4082 if (rec_info & REC_INFO_MIN_REC_FLAG) {
				/* When the father node pointer is the
				minimum record, we keep it and delete the
				node pointer of the merge page. */
4086 rtr_merge_and_update_mbr(&father_cursor,
4087 &cursor2,
4088 offsets, offsets2,
4089 merge_page, mtr);
4090 } else {
				/* Otherwise, we keep the node pointer of
				the merge page and delete the father node
				pointer. This keeps the record order at the
				upper level. */
4095 rtr_merge_and_update_mbr(&cursor2,
4096 &father_cursor,
4097 offsets2, offsets,
4098 merge_page, mtr);
4099 }
4100 lock_mutex_enter();
4101 lock_prdt_page_free_from_discard(
4102 block, lock_sys.prdt_page_hash);
4103 lock_rec_free_all_from_discard_page(block);
4104 lock_mutex_exit();
4105 } else {
4106
4107 compressed = btr_cur_pessimistic_delete(&err, TRUE,
4108 &cursor2,
4109 BTR_CREATE_FLAG,
4110 false, mtr);
4111 ut_a(err == DB_SUCCESS);
4112
4113 if (!compressed) {
4114 btr_cur_compress_if_useful(&cursor2,
4115 FALSE,
4116 mtr);
4117 }
4118
4119 if (!dict_table_is_locking_disabled(index->table)) {
4120 lock_update_merge_right(
4121 merge_block, orig_succ, block);
4122 }
4123 }
4124 }
4125
4126 if (!dict_index_is_clust(index)
4127 && !index->table->is_temporary()
4128 && page_is_leaf(merge_page)) {
4129 /* Update the free bits of the B-tree page in the
4130 insert buffer bitmap. This has to be done in a
4131 separate mini-transaction that is committed before the
4132 main mini-transaction. We cannot update the insert
4133 buffer bitmap in this mini-transaction, because
4134 btr_compress() can be invoked recursively without
4135 committing the mini-transaction in between. Since
4136 insert buffer bitmap pages have a lower rank than
4137 B-tree pages, we must not access other pages in the
4138 same mini-transaction after accessing an insert buffer
4139 bitmap page. */
4140
4141 /* The free bits in the insert buffer bitmap must
4142 never exceed the free space on a page. It is safe to
4143 decrement or reset the bits in the bitmap in a
4144 mini-transaction that is committed before the
4145 mini-transaction that affects the free space. */
4146
4147 /* It is unsafe to increment the bits in a separately
4148 committed mini-transaction, because in crash recovery,
4149 the free bits could momentarily be set too high. */
4150
4151 if (page_size.is_compressed()) {
4152 /* Because the free bits may be incremented
4153 and we cannot update the insert buffer bitmap
4154 in the same mini-transaction, the only safe
4155 thing we can do here is the pessimistic
4156 approach: reset the free bits. */
4157 ibuf_reset_free_bits(merge_block);
4158 } else {
4159 /* On uncompressed pages, the free bits will
4160 never increase here. Thus, it is safe to
4161 write the bits accurately in a separate
4162 mini-transaction. */
4163 ibuf_update_free_bits_if_full(merge_block,
4164 srv_page_size,
4165 ULINT_UNDEFINED);
4166 }
4167 }
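
	/* In outline, the bitmap update above happens in a separate
	mini-transaction that is committed before this mini-transaction
	(a sketch of the ordering enforced inside the ibuf functions;
	bitmap_mtr is a hypothetical name, not the actual code):

		mtr_t	bitmap_mtr;
		mtr_start(&bitmap_mtr);
		...set the bits in the insert buffer bitmap page...
		mtr_commit(&bitmap_mtr);	(committed first)
		...
		mtr_commit(mtr);		(the free space change is
						committed afterwards)
	*/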
4168
4169 ut_ad(page_validate(merge_page, index));
4170#ifdef UNIV_ZIP_DEBUG
4171 ut_a(!merge_page_zip || page_zip_validate(merge_page_zip, merge_page,
4172 index));
4173#endif /* UNIV_ZIP_DEBUG */
4174
4175 if (dict_index_is_spatial(index)) {
4176#ifdef UNIV_GIS_DEBUG
4177 fprintf(stderr, "GIS_DIAG: compressed away %ld\n",
4178 (long) block->page.id.page_no());
4179 fprintf(stderr, "GIS_DIAG: merged to %ld\n",
4180 (long) merge_block->page.id.page_no());
4181#endif
4182
4183 rtr_check_discard_page(index, NULL, block);
4184 }
4185
4186 /* Free the file page */
4187 btr_page_free(index, block, mtr);
4188
	/* btr_check_node_ptr() needs the parent block latched.
	If merge_block's parent block is not the same,
	we cannot use btr_check_node_ptr(). */
4192 ut_ad(leftmost_child
4193 || btr_check_node_ptr(index, merge_block, mtr));
4194func_exit:
4195 mem_heap_free(heap);
4196
4197 if (adjust) {
4198 ut_ad(nth_rec > 0);
4199 btr_cur_position(
4200 index,
4201 page_rec_get_nth(merge_block->frame, nth_rec),
4202 merge_block, cursor);
4203 }
4204
4205 MONITOR_INC(MONITOR_INDEX_MERGE_SUCCESSFUL);
4206
4207 DBUG_RETURN(TRUE);
4208
4209err_exit:
4210 /* We play it safe and reset the free bits. */
4211 if (page_size.is_compressed()
4212 && merge_page
4213 && page_is_leaf(merge_page)
4214 && !dict_index_is_clust(index)) {
4215
4216 ibuf_reset_free_bits(merge_block);
4217 }
4218
4219 mem_heap_free(heap);
4220 DBUG_RETURN(FALSE);
4221}
4222
4223/*************************************************************//**
4224Discards a page that is the only page on its level. This will empty
4225the whole B-tree, leaving just an empty root page. This function
4226should never be reached, because btr_compress(), which is invoked in
4227delete operations, calls btr_lift_page_up() to flatten the B-tree. */
4228static
4229void
4230btr_discard_only_page_on_level(
4231/*===========================*/
4232 dict_index_t* index, /*!< in: index tree */
4233 buf_block_t* block, /*!< in: page which is the only on its level */
4234 mtr_t* mtr) /*!< in: mtr */
4235{
4236 ulint page_level = 0;
4237 trx_id_t max_trx_id;
4238
4239 /* Save the PAGE_MAX_TRX_ID from the leaf page. */
4240 max_trx_id = page_get_max_trx_id(buf_block_get_frame(block));
4241
4242 while (block->page.id.page_no() != dict_index_get_page(index)) {
4243 btr_cur_t cursor;
4244 buf_block_t* father;
4245 const page_t* page = buf_block_get_frame(block);
4246
4247 ut_a(page_get_n_recs(page) == 1);
4248 ut_a(page_level == btr_page_get_level(page));
4249 ut_a(!page_has_siblings(page));
4250
4251 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
4252 btr_search_drop_page_hash_index(block);
4253
4254 if (dict_index_is_spatial(index)) {
4255 /* Check any concurrent search having this page */
4256 rtr_check_discard_page(index, NULL, block);
4257 rtr_page_get_father(index, block, mtr, NULL, &cursor);
4258 } else {
4259 btr_page_get_father(index, block, mtr, &cursor);
4260 }
4261 father = btr_cur_get_block(&cursor);
4262
4263 if (!dict_table_is_locking_disabled(index->table)) {
4264 lock_update_discard(
4265 father, PAGE_HEAP_NO_SUPREMUM, block);
4266 }
4267
4268 /* Free the file page */
4269 btr_page_free(index, block, mtr);
4270
4271 block = father;
4272 page_level++;
4273 }
4274
4275 /* block is the root page, which must be empty, except
4276 for the node pointer to the (now discarded) block(s). */
4277 ut_ad(page_is_root(block->frame));
4278
4279#ifdef UNIV_BTR_DEBUG
4280 if (!dict_index_is_ibuf(index)) {
4281 const page_t* root = buf_block_get_frame(block);
4282 const ulint space = index->table->space->id;
4283 ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_LEAF
4284 + root, space));
4285 ut_a(btr_root_fseg_validate(FIL_PAGE_DATA + PAGE_BTR_SEG_TOP
4286 + root, space));
4287 }
4288#endif /* UNIV_BTR_DEBUG */
4289
4290 btr_page_empty(block, buf_block_get_page_zip(block), index, 0, mtr);
4291 ut_ad(page_is_leaf(buf_block_get_frame(block)));
4292 /* btr_page_empty() is supposed to zero-initialize the field. */
4293 ut_ad(!page_get_instant(block->frame));
4294
4295 if (index->is_primary()) {
4296 /* Concurrent access is prevented by the root_block->lock
4297 X-latch, so this should be safe. */
4298 index->remove_instant();
4299 } else if (!index->table->is_temporary()) {
4300 /* We play it safe and reset the free bits for the root */
4301 ibuf_reset_free_bits(block);
4302
4303 ut_a(max_trx_id);
4304 page_set_max_trx_id(block,
4305 buf_block_get_page_zip(block),
4306 max_trx_id, mtr);
4307 }
4308}
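
/* A sketch of btr_discard_only_page_on_level(): starting from the
given page, each page that is alone on its level is freed and its
father is visited, until the root is reached; the root is then emptied
and becomes a leaf page again:

	root -> A -> B (leaf)	becomes		root (an empty leaf)
*/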
4309
4310/*************************************************************//**
4311Discards a page from a B-tree. This is used to remove the last record from
4312a B-tree page: the whole page must be removed at the same time. This cannot
4313be used for the root page, which is allowed to be empty. */
4314void
4315btr_discard_page(
4316/*=============*/
4317 btr_cur_t* cursor, /*!< in: cursor on the page to discard: not on
4318 the root page */
4319 mtr_t* mtr) /*!< in: mtr */
4320{
4321 dict_index_t* index;
4322 ulint left_page_no;
4323 ulint right_page_no;
4324 buf_block_t* merge_block;
4325 page_t* merge_page;
4326 buf_block_t* block;
4327 page_t* page;
4328 rec_t* node_ptr;
4329#ifdef UNIV_DEBUG
4330 btr_cur_t parent_cursor;
4331 bool parent_is_different = false;
4332#endif
4333
4334 block = btr_cur_get_block(cursor);
4335 index = btr_cur_get_index(cursor);
4336
4337 ut_ad(dict_index_get_page(index) != block->page.id.page_no());
4338
4339 ut_ad(mtr_memo_contains_flagged(mtr, dict_index_get_lock(index),
4340 MTR_MEMO_X_LOCK | MTR_MEMO_SX_LOCK));
4341
4342 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
4343
4344 MONITOR_INC(MONITOR_INDEX_DISCARD);
4345
4346#ifdef UNIV_DEBUG
4347 if (dict_index_is_spatial(index)) {
4348 rtr_page_get_father(index, block, mtr, cursor, &parent_cursor);
4349 } else {
4350 btr_page_get_father(index, block, mtr, &parent_cursor);
4351 }
4352#endif
4353
4354 /* Decide the page which will inherit the locks */
4355
4356 left_page_no = btr_page_get_prev(buf_block_get_frame(block), mtr);
4357 right_page_no = btr_page_get_next(buf_block_get_frame(block), mtr);
4358
4359 const page_size_t page_size(index->table->space->flags);
4360
4361 if (left_page_no != FIL_NULL) {
4362 merge_block = btr_block_get(
4363 page_id_t(index->table->space->id, left_page_no),
4364 page_size, RW_X_LATCH, index, mtr);
4365
4366 merge_page = buf_block_get_frame(merge_block);
4367#ifdef UNIV_BTR_DEBUG
4368 ut_a(btr_page_get_next(merge_page, mtr)
4369 == block->page.id.page_no());
4370#endif /* UNIV_BTR_DEBUG */
4371 ut_d(parent_is_different =
4372 (page_rec_get_next(
4373 page_get_infimum_rec(
4374 btr_cur_get_page(
4375 &parent_cursor)))
4376 == btr_cur_get_rec(&parent_cursor)));
4377 } else if (right_page_no != FIL_NULL) {
4378 merge_block = btr_block_get(
4379 page_id_t(index->table->space->id, right_page_no),
4380 page_size, RW_X_LATCH, index, mtr);
4381
4382 merge_page = buf_block_get_frame(merge_block);
4383#ifdef UNIV_BTR_DEBUG
4384 ut_a(btr_page_get_prev(merge_page, mtr)
4385 == block->page.id.page_no());
4386#endif /* UNIV_BTR_DEBUG */
4387 ut_d(parent_is_different = page_rec_is_supremum(
4388 page_rec_get_next(btr_cur_get_rec(&parent_cursor))));
4389 } else {
4390 btr_discard_only_page_on_level(index, block, mtr);
4391
4392 return;
4393 }
4394
4395 page = buf_block_get_frame(block);
4396 ut_a(page_is_comp(merge_page) == page_is_comp(page));
4397 btr_search_drop_page_hash_index(block);
4398
4399 if (left_page_no == FIL_NULL && !page_is_leaf(page)) {
4400
4401 /* We have to mark the leftmost node pointer on the right
4402 side page as the predefined minimum record */
4403 node_ptr = page_rec_get_next(page_get_infimum_rec(merge_page));
4404
4405 ut_ad(page_rec_is_user_rec(node_ptr));
4406
4407 /* This will make page_zip_validate() fail on merge_page
4408 until btr_level_list_remove() completes. This is harmless,
4409 because everything will take place within a single
4410 mini-transaction and because writing to the redo log
4411 is an atomic operation (performed by mtr_commit()). */
4412 btr_set_min_rec_mark(node_ptr, mtr);
4413 }
4414
4415 if (dict_index_is_spatial(index)) {
4416 btr_cur_t father_cursor;
4417
		/* rtr_node_ptr_delete() does not look up the father
		node pointer itself, so we need to get the father node
		pointer first and then delete it. */
4421 rtr_page_get_father(index, block, mtr, cursor, &father_cursor);
4422 rtr_node_ptr_delete(&father_cursor, mtr);
4423 } else {
4424 btr_node_ptr_delete(index, block, mtr);
4425 }
4426
4427 /* Remove the page from the level list */
4428 btr_level_list_remove(index->table->space->id, page_size,
4429 page, index, mtr);
4430
4431#ifdef UNIV_ZIP_DEBUG
4432 {
4433 page_zip_des_t* merge_page_zip
4434 = buf_block_get_page_zip(merge_block);
4435 ut_a(!merge_page_zip
4436 || page_zip_validate(merge_page_zip, merge_page, index));
4437 }
4438#endif /* UNIV_ZIP_DEBUG */
4439
4440 if (!dict_table_is_locking_disabled(index->table)) {
4441 if (left_page_no != FIL_NULL) {
4442 lock_update_discard(merge_block, PAGE_HEAP_NO_SUPREMUM,
4443 block);
4444 } else {
4445 lock_update_discard(merge_block,
4446 lock_get_min_heap_no(merge_block),
4447 block);
4448 }
4449 }
4450
4451 if (dict_index_is_spatial(index)) {
4452 rtr_check_discard_page(index, cursor, block);
4453 }
4454
4455 /* Free the file page */
4456 btr_page_free(index, block, mtr);
4457
	/* btr_check_node_ptr() needs the parent block latched.
	If merge_block's parent block is not the same,
	we cannot use btr_check_node_ptr(). */
4461 ut_ad(parent_is_different
4462 || btr_check_node_ptr(index, merge_block, mtr));
4463}
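
/* Lock inheritance in btr_discard_page() (a summary of the calls
above): the record locks of the discarded page are inherited either by
the supremum record of the left sibling, or by the first user record of
the right sibling, matching the two lock_update_discard() calls. */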
4464
4465#ifdef UNIV_BTR_PRINT
4466/*************************************************************//**
4467Prints size info of a B-tree. */
4468void
4469btr_print_size(
4470/*===========*/
4471 dict_index_t* index) /*!< in: index tree */
4472{
4473 page_t* root;
4474 fseg_header_t* seg;
4475 mtr_t mtr;
4476
4477 if (dict_index_is_ibuf(index)) {
4478 fputs("Sorry, cannot print info of an ibuf tree:"
4479 " use ibuf functions\n", stderr);
4480
4481 return;
4482 }
4483
4484 mtr_start(&mtr);
4485
4486 root = btr_root_get(index, &mtr);
4487
4488 seg = root + PAGE_HEADER + PAGE_BTR_SEG_TOP;
4489
4490 fputs("INFO OF THE NON-LEAF PAGE SEGMENT\n", stderr);
4491 fseg_print(seg, &mtr);
4492
4493 if (!dict_index_is_ibuf(index)) {
4494
4495 seg = root + PAGE_HEADER + PAGE_BTR_SEG_LEAF;
4496
4497 fputs("INFO OF THE LEAF PAGE SEGMENT\n", stderr);
4498 fseg_print(seg, &mtr);
4499 }
4500
4501 mtr_commit(&mtr);
4502}
4503
4504/************************************************************//**
4505Prints recursively index tree pages. */
4506static
4507void
4508btr_print_recursive(
4509/*================*/
4510 dict_index_t* index, /*!< in: index tree */
4511 buf_block_t* block, /*!< in: index page */
4512 ulint width, /*!< in: print this many entries from start
4513 and end */
4514 mem_heap_t** heap, /*!< in/out: heap for rec_get_offsets() */
4515 ulint** offsets,/*!< in/out: buffer for rec_get_offsets() */
4516 mtr_t* mtr) /*!< in: mtr */
4517{
4518 const page_t* page = buf_block_get_frame(block);
4519 page_cur_t cursor;
4520 ulint n_recs;
4521 ulint i = 0;
4522 mtr_t mtr2;
4523
4524 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_SX_FIX));
4525
4526 ib::info() << "NODE ON LEVEL " << btr_page_get_level(page)
4527 << " page " << block->page.id;
4528
4529 page_print(block, index, width, width);
4530
4531 n_recs = page_get_n_recs(page);
4532
4533 page_cur_set_before_first(block, &cursor);
4534 page_cur_move_to_next(&cursor);
4535
4536 while (!page_cur_is_after_last(&cursor)) {
4537
4538 if (page_is_leaf(page)) {
4539
4540 /* If this is the leaf level, do nothing */
4541
4542 } else if ((i <= width) || (i >= n_recs - width)) {
4543
4544 const rec_t* node_ptr;
4545
4546 mtr_start(&mtr2);
4547
4548 node_ptr = page_cur_get_rec(&cursor);
4549
4550 *offsets = rec_get_offsets(
4551 node_ptr, index, *offsets, false,
4552 ULINT_UNDEFINED, heap);
4553 btr_print_recursive(index,
4554 btr_node_ptr_get_child(node_ptr,
4555 index,
4556 *offsets,
4557 &mtr2),
4558 width, heap, offsets, &mtr2);
4559 mtr_commit(&mtr2);
4560 }
4561
4562 page_cur_move_to_next(&cursor);
4563 i++;
4564 }
4565}
4566
4567/**************************************************************//**
4568Prints directories and other info of all nodes in the tree. */
4569void
4570btr_print_index(
4571/*============*/
4572 dict_index_t* index, /*!< in: index */
4573 ulint width) /*!< in: print this many entries from start
4574 and end */
4575{
4576 mtr_t mtr;
4577 buf_block_t* root;
4578 mem_heap_t* heap = NULL;
4579 ulint offsets_[REC_OFFS_NORMAL_SIZE];
4580 ulint* offsets = offsets_;
4581 rec_offs_init(offsets_);
4582
4583 fputs("--------------------------\n"
4584 "INDEX TREE PRINT\n", stderr);
4585
4586 mtr_start(&mtr);
4587
4588 root = btr_root_block_get(index, RW_SX_LATCH, &mtr);
4589
4590 btr_print_recursive(index, root, width, &heap, &offsets, &mtr);
4591 if (heap) {
4592 mem_heap_free(heap);
4593 }
4594
4595 mtr_commit(&mtr);
4596
4597 ut_ad(btr_validate_index(index, 0, false));
4598}
4599#endif /* UNIV_BTR_PRINT */
4600
4601#ifdef UNIV_DEBUG
4602/************************************************************//**
4603Checks that the node pointer to a page is appropriate.
4604@return TRUE */
4605ibool
4606btr_check_node_ptr(
4607/*===============*/
4608 dict_index_t* index, /*!< in: index tree */
4609 buf_block_t* block, /*!< in: index page */
4610 mtr_t* mtr) /*!< in: mtr */
4611{
4612 mem_heap_t* heap;
4613 dtuple_t* tuple;
4614 ulint* offsets;
4615 btr_cur_t cursor;
4616 page_t* page = buf_block_get_frame(block);
4617
4618 ut_ad(mtr_memo_contains(mtr, block, MTR_MEMO_PAGE_X_FIX));
4619
4620 if (dict_index_get_page(index) == block->page.id.page_no()) {
4621
4622 return(TRUE);
4623 }
4624
4625 heap = mem_heap_create(256);
4626
4627 if (dict_index_is_spatial(index)) {
4628 offsets = rtr_page_get_father_block(NULL, heap, index, block, mtr,
4629 NULL, &cursor);
4630 } else {
4631 offsets = btr_page_get_father_block(NULL, heap, index, block, mtr,
4632 &cursor);
4633 }
4634
4635 if (page_is_leaf(page)) {
4636
4637 goto func_exit;
4638 }
4639
4640 tuple = dict_index_build_node_ptr(
4641 index, page_rec_get_next(page_get_infimum_rec(page)), 0, heap,
4642 btr_page_get_level(page));
4643
	/* For a spatial index, the MBR in the parent record can differ
	from that of the first record of the child page; their
	relationship should be a "WITHIN" relationship */
4647 if (dict_index_is_spatial(index)) {
4648 ut_a(!cmp_dtuple_rec_with_gis(
4649 tuple, btr_cur_get_rec(&cursor),
4650 offsets, PAGE_CUR_WITHIN));
4651 } else {
4652 ut_a(!cmp_dtuple_rec(tuple, btr_cur_get_rec(&cursor), offsets));
4653 }
4654func_exit:
4655 mem_heap_free(heap);
4656
4657 return(TRUE);
4658}
4659#endif /* UNIV_DEBUG */
4660
4661/************************************************************//**
4662Display identification information for a record. */
4663static
4664void
4665btr_index_rec_validate_report(
4666/*==========================*/
4667 const page_t* page, /*!< in: index page */
4668 const rec_t* rec, /*!< in: index record */
4669 const dict_index_t* index) /*!< in: index */
4670{
4671 ib::info() << "Record in index " << index->name
4672 << " of table " << index->table->name
4673 << ", page " << page_id_t(page_get_space_id(page),
4674 page_get_page_no(page))
4675 << ", at offset " << page_offset(rec);
4676}
4677
4678/************************************************************//**
4679Checks the size and number of fields in a record based on the definition of
4680the index.
4681@return TRUE if ok */
4682ibool
4683btr_index_rec_validate(
4684/*===================*/
4685 const rec_t* rec, /*!< in: index record */
4686 const dict_index_t* index, /*!< in: index */
4687 ibool dump_on_error) /*!< in: TRUE if the function
4688 should print hex dump of record
4689 and page on error */
4690{
4691 ulint len;
4692 const page_t* page;
4693 mem_heap_t* heap = NULL;
4694 ulint offsets_[REC_OFFS_NORMAL_SIZE];
4695 ulint* offsets = offsets_;
4696 rec_offs_init(offsets_);
4697
4698 page = page_align(rec);
4699
4700 if (dict_index_is_ibuf(index)) {
4701 /* The insert buffer index tree can contain records from any
4702 other index: we cannot check the number of fields or
4703 their length */
4704
4705 return(TRUE);
4706 }
4707
4708#ifdef VIRTUAL_INDEX_DEBUG
4709 if (dict_index_has_virtual(index)) {
4710 fprintf(stderr, "index name is %s\n", index->name());
4711 }
4712#endif
4713 if ((ibool)!!page_is_comp(page) != dict_table_is_comp(index->table)) {
4714 btr_index_rec_validate_report(page, rec, index);
4715
4716 ib::error() << "Compact flag=" << !!page_is_comp(page)
4717 << ", should be " << dict_table_is_comp(index->table);
4718
4719 return(FALSE);
4720 }
4721
4722 if (!page_is_comp(page)) {
4723 const ulint n_rec_fields = rec_get_n_fields_old(rec);
4724 if (n_rec_fields == DICT_FLD__SYS_INDEXES__MERGE_THRESHOLD
4725 && index->id == DICT_INDEXES_ID) {
4726 /* A record for older SYS_INDEXES table
4727 (missing merge_threshold column) is acceptable. */
4728 } else if (n_rec_fields < index->n_core_fields
4729 || n_rec_fields > index->n_fields) {
4730 btr_index_rec_validate_report(page, rec, index);
4731
4732 ib::error() << "Has " << rec_get_n_fields_old(rec)
4733 << " fields, should have "
4734 << index->n_core_fields << ".."
4735 << index->n_fields;
4736
4737 if (dump_on_error) {
4738 fputs("InnoDB: corrupt record ", stderr);
4739 rec_print_old(stderr, rec);
4740 putc('\n', stderr);
4741 }
4742 return(FALSE);
4743 }
4744 }
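
	/* Example (an illustration; the exact counts depend on the
	table): if index->n_core_fields == 5 and two columns were later
	added by instant ADD COLUMN so that index->n_fields == 7, then
	records written before the instant ADD COLUMN carry 5 fields,
	newer records carry up to 7, and any count outside 5..7 is
	rejected above. */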
4745
4746 offsets = rec_get_offsets(rec, index, offsets, page_is_leaf(page),
4747 ULINT_UNDEFINED, &heap);
4748
4749 for (unsigned i = 0; i < index->n_fields; i++) {
4750 dict_field_t* field = dict_index_get_nth_field(index, i);
4751 ulint fixed_size = dict_col_get_fixed_size(
4752 dict_field_get_col(field),
4753 page_is_comp(page));
4754
4755 rec_get_nth_field_offs(offsets, i, &len);
4756
		/* Note that if fixed_size != 0, it equals the
		length of a fixed-size column in the clustered index.
		We should adjust it here.
		A prefix index of the column has a fixed, but
		different, length. When fixed_size == 0, prefix_len
		is the maximum length of the prefix index column. */
4763
4764 if (len_is_stored(len)
4765 && (field->prefix_len
4766 ? len > field->prefix_len
4767 : (fixed_size && len != fixed_size))) {
4768 btr_index_rec_validate_report(page, rec, index);
4769
4770 ib::error error;
4771
4772 error << "Field " << i << " len is " << len
4773 << ", should be " << fixed_size;
4774
4775 if (dump_on_error) {
4776 error << "; ";
4777 rec_print(error.m_oss, rec,
4778 rec_get_info_bits(
4779 rec, rec_offs_comp(offsets)),
4780 offsets);
4781 }
4782 if (heap) {
4783 mem_heap_free(heap);
4784 }
4785 return(FALSE);
4786 }
4787 }
4788
4789#ifdef VIRTUAL_INDEX_DEBUG
4790 if (dict_index_has_virtual(index)) {
4791 rec_print_new(stderr, rec, offsets);
4792 }
4793#endif
4794
4795 if (heap) {
4796 mem_heap_free(heap);
4797 }
4798 return(TRUE);
4799}
4800
4801/************************************************************//**
4802Checks the size and number of fields in records based on the definition of
4803the index.
4804@return TRUE if ok */
4805static
4806ibool
4807btr_index_page_validate(
4808/*====================*/
4809 buf_block_t* block, /*!< in: index page */
4810 dict_index_t* index) /*!< in: index */
4811{
4812 page_cur_t cur;
4813 ibool ret = TRUE;
4814#ifndef DBUG_OFF
4815 ulint nth = 1;
4816#endif /* !DBUG_OFF */
4817
4818 page_cur_set_before_first(block, &cur);
4819
4820 /* Directory slot 0 should only contain the infimum record. */
4821 DBUG_EXECUTE_IF("check_table_rec_next",
4822 ut_a(page_rec_get_nth_const(
4823 page_cur_get_page(&cur), 0)
4824 == cur.rec);
4825 ut_a(page_dir_slot_get_n_owned(
4826 page_dir_get_nth_slot(
4827 page_cur_get_page(&cur), 0))
4828 == 1););
4829
4830 page_cur_move_to_next(&cur);
4831
4832 for (;;) {
4833 if (page_cur_is_after_last(&cur)) {
4834
4835 break;
4836 }
4837
4838 if (!btr_index_rec_validate(cur.rec, index, TRUE)) {
4839
4840 return(FALSE);
4841 }
4842
4843 /* Verify that page_rec_get_nth_const() is correctly
4844 retrieving each record. */
4845 DBUG_EXECUTE_IF("check_table_rec_next",
4846 ut_a(cur.rec == page_rec_get_nth_const(
4847 page_cur_get_page(&cur),
4848 page_rec_get_n_recs_before(
4849 cur.rec)));
4850 ut_a(nth++ == page_rec_get_n_recs_before(
4851 cur.rec)););
4852
4853 page_cur_move_to_next(&cur);
4854 }
4855
4856 return(ret);
4857}
4858
4859/************************************************************//**
4860Report an error on one page of an index tree. */
4861static
4862void
4863btr_validate_report1(
4864/*=================*/
4865 dict_index_t* index, /*!< in: index */
4866 ulint level, /*!< in: B-tree level */
4867 const buf_block_t* block) /*!< in: index page */
4868{
4869 ib::error error;
4870 error << "In page " << block->page.id.page_no()
4871 << " of index " << index->name
4872 << " of table " << index->table->name;
4873
4874 if (level > 0) {
4875 error << ", index tree level " << level;
4876 }
4877}
4878
4879/************************************************************//**
4880Report an error on two pages of an index tree. */
4881static
4882void
4883btr_validate_report2(
4884/*=================*/
4885 const dict_index_t* index, /*!< in: index */
4886 ulint level, /*!< in: B-tree level */
4887 const buf_block_t* block1, /*!< in: first index page */
4888 const buf_block_t* block2) /*!< in: second index page */
4889{
4890 ib::error error;
4891 error << "In pages " << block1->page.id
4892 << " and " << block2->page.id << " of index " << index->name
4893 << " of table " << index->table->name;
4894
4895 if (level > 0) {
4896 error << ", index tree level " << level;
4897 }
4898}
4899
4900/************************************************************//**
4901Validates index tree level.
@return true if ok */
4903static
4904bool
4905btr_validate_level(
4906/*===============*/
4907 dict_index_t* index, /*!< in: index tree */
4908 const trx_t* trx, /*!< in: transaction or NULL */
4909 ulint level, /*!< in: level number */
4910 bool lockout)/*!< in: true if X-latch index is intended */
4911{
4912 buf_block_t* block;
4913 page_t* page;
4914 buf_block_t* right_block = 0; /* remove warning */
4915 page_t* right_page = 0; /* remove warning */
4916 page_t* father_page;
4917 btr_cur_t node_cur;
4918 btr_cur_t right_node_cur;
4919 rec_t* rec;
4920 ulint right_page_no;
4921 ulint left_page_no;
4922 page_cur_t cursor;
4923 dtuple_t* node_ptr_tuple;
4924 bool ret = true;
4925 mtr_t mtr;
4926 mem_heap_t* heap = mem_heap_create(256);
4927 ulint* offsets = NULL;
4928 ulint* offsets2= NULL;
4929#ifdef UNIV_ZIP_DEBUG
4930 page_zip_des_t* page_zip;
4931#endif /* UNIV_ZIP_DEBUG */
4932 ulint savepoint = 0;
4933 ulint savepoint2 = 0;
4934 ulint parent_page_no = FIL_NULL;
4935 ulint parent_right_page_no = FIL_NULL;
4936 bool rightmost_child = false;
4937
4938 mtr_start(&mtr);
4939
4940 if (!srv_read_only_mode) {
4941 if (lockout) {
4942 mtr_x_lock(dict_index_get_lock(index), &mtr);
4943 } else {
4944 mtr_sx_lock(dict_index_get_lock(index), &mtr);
4945 }
4946 }
4947
4948 block = btr_root_block_get(index, RW_SX_LATCH, &mtr);
4949 page = buf_block_get_frame(block);
4950
4951#ifdef UNIV_DEBUG
4952 if (dict_index_is_spatial(index)) {
4953 fprintf(stderr, "Root page no: %lu\n",
4954 (ulong) page_get_page_no(page));
4955 }
4956#endif
4957
4958 fil_space_t* space = index->table->space;
4959 const page_size_t table_page_size(
4960 dict_table_page_size(index->table));
4961 const page_size_t space_page_size(space->flags);
4962
4963 if (!table_page_size.equals_to(space_page_size)) {
4964
4965 ib::warn() << "Flags mismatch: table=" << index->table->flags
4966 << ", tablespace=" << space->flags;
4967
4968 mtr_commit(&mtr);
4969
4970 return(false);
4971 }
4972
4973 while (level != btr_page_get_level(page)) {
4974 const rec_t* node_ptr;
4975
4976 if (fseg_page_is_free(space, block->page.id.page_no())) {
4977
4978 btr_validate_report1(index, level, block);
4979
4980 ib::warn() << "Page is free";
4981
4982 ret = false;
4983 }
4984
4985 ut_a(index->table->space->id == block->page.id.space());
4986 ut_a(block->page.id.space() == page_get_space_id(page));
4987#ifdef UNIV_ZIP_DEBUG
4988 page_zip = buf_block_get_page_zip(block);
4989 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
4990#endif /* UNIV_ZIP_DEBUG */
4991 ut_a(!page_is_leaf(page));
4992
4993 page_cur_set_before_first(block, &cursor);
4994 page_cur_move_to_next(&cursor);
4995
4996 node_ptr = page_cur_get_rec(&cursor);
4997 offsets = rec_get_offsets(node_ptr, index, offsets, false,
4998 ULINT_UNDEFINED, &heap);
4999
5000 savepoint2 = mtr_set_savepoint(&mtr);
5001 block = btr_node_ptr_get_child(node_ptr, index, offsets, &mtr);
5002 page = buf_block_get_frame(block);
5003
		/* For an R-tree, since the record order might not be
		the same as the order of the linked index pages on the
		lower level, we need to traverse backwards to get the
		first page of this level. This is only used for index
		validation. A spatial index does not use such a scan
		for any of its DML or query operations. */
5010 if (dict_index_is_spatial(index)) {
5011 left_page_no = btr_page_get_prev(page, &mtr);
5012
5013 while (left_page_no != FIL_NULL) {
				/* To obey the latch order of tree blocks,
				we must release this block first in order
				to latch its left sibling. */
5017 mtr_release_block_at_savepoint(
5018 &mtr, savepoint2, block);
5019
5020 savepoint2 = mtr_set_savepoint(&mtr);
5021 block = btr_block_get(
5022 page_id_t(index->table->space->id,
5023 left_page_no),
5024 table_page_size,
5025 RW_SX_LATCH, index, &mtr);
5026 page = buf_block_get_frame(block);
5027 left_page_no = btr_page_get_prev(page, &mtr);
5028 }
5029 }
5030 }
5031
5032 /* Now we are on the desired level. Loop through the pages on that
5033 level. */
5034
5035loop:
5036 mem_heap_empty(heap);
5037 offsets = offsets2 = NULL;
5038 if (!srv_read_only_mode) {
5039 if (lockout) {
5040 mtr_x_lock(dict_index_get_lock(index), &mtr);
5041 } else {
5042 mtr_sx_lock(dict_index_get_lock(index), &mtr);
5043 }
5044 }
5045
5046#ifdef UNIV_ZIP_DEBUG
5047 page_zip = buf_block_get_page_zip(block);
5048 ut_a(!page_zip || page_zip_validate(page_zip, page, index));
5049#endif /* UNIV_ZIP_DEBUG */
5050
5051 ut_a(block->page.id.space() == index->table->space->id);
5052
5053 if (fseg_page_is_free(space, block->page.id.page_no())) {
5054
5055 btr_validate_report1(index, level, block);
5056
5057 ib::warn() << "Page is marked as free";
5058 ret = false;
5059
5060 } else if (btr_page_get_index_id(page) != index->id) {
5061
5062 ib::error() << "Page index id " << btr_page_get_index_id(page)
5063 << " != data dictionary index id " << index->id;
5064
5065 ret = false;
5066
5067 } else if (!page_validate(page, index)) {
5068
5069 btr_validate_report1(index, level, block);
5070 ret = false;
5071
5072 } else if (level == 0 && !btr_index_page_validate(block, index)) {
5073
5074 /* We are on level 0. Check that the records have the right
5075 number of fields, and field lengths are right. */
5076
5077 ret = false;
5078 }
5079
5080 ut_a(btr_page_get_level(page) == level);
5081
5082 right_page_no = btr_page_get_next(page, &mtr);
5083 left_page_no = btr_page_get_prev(page, &mtr);
5084
5085 ut_a(!page_is_empty(page)
5086 || (level == 0
5087 && page_get_page_no(page) == dict_index_get_page(index)));
5088
5089 if (right_page_no != FIL_NULL) {
5090 const rec_t* right_rec;
5091 savepoint = mtr_set_savepoint(&mtr);
5092
5093 right_block = btr_block_get(
5094 page_id_t(index->table->space->id, right_page_no),
5095 table_page_size,
5096 RW_SX_LATCH, index, &mtr);
5097
5098 right_page = buf_block_get_frame(right_block);
5099
5100 if (btr_page_get_prev(right_page, &mtr)
5101 != page_get_page_no(page)) {
5102
5103 btr_validate_report2(index, level, block, right_block);
5104 fputs("InnoDB: broken FIL_PAGE_NEXT"
5105 " or FIL_PAGE_PREV links\n", stderr);
5106
5107 ret = false;
5108 }
5109
5110 if (page_is_comp(right_page) != page_is_comp(page)) {
5111 btr_validate_report2(index, level, block, right_block);
5112 fputs("InnoDB: 'compact' flag mismatch\n", stderr);
5113
5114 ret = false;
5115
5116 goto node_ptr_fails;
5117 }
5118
5119 rec = page_rec_get_prev(page_get_supremum_rec(page));
5120 right_rec = page_rec_get_next(page_get_infimum_rec(
5121 right_page));
5122 offsets = rec_get_offsets(rec, index, offsets,
5123 page_is_leaf(page),
5124 ULINT_UNDEFINED, &heap);
5125 offsets2 = rec_get_offsets(right_rec, index, offsets2,
5126 page_is_leaf(right_page),
5127 ULINT_UNDEFINED, &heap);
5128
		/* For a spatial index, we cannot guarantee the key
		ordering across pages, so skip the record compare
		verification for now. This will be enhanced in a
		special R-tree index validation scheme. */
5132 if (!dict_index_is_spatial(index)
5133 && cmp_rec_rec(rec, right_rec,
5134 offsets, offsets2, index) >= 0) {
5135
5136 btr_validate_report2(index, level, block, right_block);
5137
5138 fputs("InnoDB: records in wrong order"
5139 " on adjacent pages\n", stderr);
5140
5141 fputs("InnoDB: record ", stderr);
5142 rec = page_rec_get_prev(page_get_supremum_rec(page));
5143 rec_print(stderr, rec, index);
5144 putc('\n', stderr);
5145 fputs("InnoDB: record ", stderr);
5146 rec = page_rec_get_next(
5147 page_get_infimum_rec(right_page));
5148 rec_print(stderr, rec, index);
5149 putc('\n', stderr);
5150
5151 ret = false;
5152 }
5153 }
5154
5155 if (level > 0 && left_page_no == FIL_NULL) {
5156 ut_a(REC_INFO_MIN_REC_FLAG & rec_get_info_bits(
5157 page_rec_get_next(page_get_infimum_rec(page)),
5158 page_is_comp(page)));
5159 }
5160
	/* Similarly, skip the father node check for spatial indexes
	for now, for a couple of reasons:
	1) As mentioned, there is no ordering relationship between
	records on the parent level and the linked pages on the child
	level.
	2) Searching for the parent from the root is very costly for an
	R-tree. A special validation mechanism for R-trees will be added
	later (WL #7520). */
5167 if (!dict_index_is_spatial(index)
5168 && block->page.id.page_no() != dict_index_get_page(index)) {
5169
5170 /* Check father node pointers */
5171 rec_t* node_ptr;
5172
5173 btr_cur_position(
5174 index, page_rec_get_next(page_get_infimum_rec(page)),
5175 block, &node_cur);
5176 offsets = btr_page_get_father_node_ptr_for_validate(
5177 offsets, heap, &node_cur, &mtr);
5178
5179 father_page = btr_cur_get_page(&node_cur);
5180 node_ptr = btr_cur_get_rec(&node_cur);
5181
5182 parent_page_no = page_get_page_no(father_page);
5183 parent_right_page_no = btr_page_get_next(father_page, &mtr);
5184 rightmost_child = page_rec_is_supremum(
5185 page_rec_get_next(node_ptr));
5186
5187 btr_cur_position(
5188 index,
5189 page_rec_get_prev(page_get_supremum_rec(page)),
5190 block, &node_cur);
5191
5192 offsets = btr_page_get_father_node_ptr_for_validate(
5193 offsets, heap, &node_cur, &mtr);
5194
5195 if (node_ptr != btr_cur_get_rec(&node_cur)
5196 || btr_node_ptr_get_child_page_no(node_ptr, offsets)
5197 != block->page.id.page_no()) {
5198
5199 btr_validate_report1(index, level, block);
5200
5201 fputs("InnoDB: node pointer to the page is wrong\n",
5202 stderr);
5203
5204 fputs("InnoDB: node ptr ", stderr);
5205 rec_print(stderr, node_ptr, index);
5206
5207 rec = btr_cur_get_rec(&node_cur);
5208 fprintf(stderr, "\n"
5209 "InnoDB: node ptr child page n:o "
5210 ULINTPF "\n",
5211 btr_node_ptr_get_child_page_no(rec, offsets));
5212
5213 fputs("InnoDB: record on page ", stderr);
5214 rec_print_new(stderr, rec, offsets);
5215 putc('\n', stderr);
5216 ret = false;
5217
5218 goto node_ptr_fails;
5219 }
5220
5221 if (!page_is_leaf(page)) {
5222 node_ptr_tuple = dict_index_build_node_ptr(
5223 index,
5224 page_rec_get_next(page_get_infimum_rec(page)),
5225 0, heap, btr_page_get_level(page));
5226
5227 if (cmp_dtuple_rec(node_ptr_tuple, node_ptr,
5228 offsets)) {
5229 const rec_t* first_rec = page_rec_get_next(
5230 page_get_infimum_rec(page));
5231
5232 btr_validate_report1(index, level, block);
5233
5234 ib::error() << "Node ptrs differ on levels > 0";
5235
5236 fputs("InnoDB: node ptr ",stderr);
5237 rec_print_new(stderr, node_ptr, offsets);
5238 fputs("InnoDB: first rec ", stderr);
5239 rec_print(stderr, first_rec, index);
5240 putc('\n', stderr);
5241 ret = false;
5242
5243 goto node_ptr_fails;
5244 }
5245 }
5246
5247 if (left_page_no == FIL_NULL) {
5248 ut_a(node_ptr == page_rec_get_next(
5249 page_get_infimum_rec(father_page)));
5250 ut_a(!page_has_prev(father_page));
5251 }
5252
5253 if (right_page_no == FIL_NULL) {
5254 ut_a(node_ptr == page_rec_get_prev(
5255 page_get_supremum_rec(father_page)));
5256 ut_a(!page_has_next(father_page));
5257 } else {
5258 const rec_t* right_node_ptr;
5259
5260 right_node_ptr = page_rec_get_next(node_ptr);
5261
5262 if (!lockout && rightmost_child) {
5263
				/* To obey the latch order of tree blocks,
				we must release right_block once in order
				to latch the uncle block. */
5267 mtr_release_block_at_savepoint(
5268 &mtr, savepoint, right_block);
5269
5270 btr_block_get(
5271 page_id_t(index->table->space->id,
5272 parent_right_page_no),
5273 table_page_size,
5274 RW_SX_LATCH, index, &mtr);
5275
5276 right_block = btr_block_get(
5277 page_id_t(index->table->space->id,
5278 right_page_no),
5279 table_page_size,
5280 RW_SX_LATCH, index, &mtr);
5281 }
5282
5283 btr_cur_position(
5284 index, page_rec_get_next(
5285 page_get_infimum_rec(
5286 buf_block_get_frame(
5287 right_block))),
5288 right_block, &right_node_cur);
5289
5290 offsets = btr_page_get_father_node_ptr_for_validate(
5291 offsets, heap, &right_node_cur, &mtr);
5292
5293 if (right_node_ptr
5294 != page_get_supremum_rec(father_page)) {
5295
5296 if (btr_cur_get_rec(&right_node_cur)
5297 != right_node_ptr) {
5298 ret = false;
5299 fputs("InnoDB: node pointer to"
5300 " the right page is wrong\n",
5301 stderr);
5302
5303 btr_validate_report1(index, level,
5304 block);
5305 }
5306 } else {
5307 page_t* right_father_page
5308 = btr_cur_get_page(&right_node_cur);
5309
5310 if (btr_cur_get_rec(&right_node_cur)
5311 != page_rec_get_next(
5312 page_get_infimum_rec(
5313 right_father_page))) {
5314 ret = false;
5315 fputs("InnoDB: node pointer 2 to"
5316 " the right page is wrong\n",
5317 stderr);
5318
5319 btr_validate_report1(index, level,
5320 block);
5321 }
5322
5323 if (page_get_page_no(right_father_page)
5324 != btr_page_get_next(father_page, &mtr)) {
5325
5326 ret = false;
5327 fputs("InnoDB: node pointer 3 to"
5328 " the right page is wrong\n",
5329 stderr);
5330
5331 btr_validate_report1(index, level,
5332 block);
5333 }
5334 }
5335 }
5336 }
5337
5338node_ptr_fails:
5339 /* Commit the mini-transaction to release the latch on 'page'.
5340 Re-acquire the latch on right_page, which will become 'page'
5341 on the next loop. The page has already been checked. */
5342 mtr_commit(&mtr);
5343
5344 if (trx_is_interrupted(trx)) {
5345 /* On interrupt, return the current status. */
5346 } else if (right_page_no != FIL_NULL) {
5347
5348 mtr_start(&mtr);
5349
5350 if (!lockout) {
5351 if (rightmost_child) {
5352 if (parent_right_page_no != FIL_NULL) {
5353 btr_block_get(
5354 page_id_t(
5355 index->table->space->id,
5356 parent_right_page_no),
5357 table_page_size,
5358 RW_SX_LATCH, index, &mtr);
5359 }
5360 } else if (parent_page_no != FIL_NULL) {
5361 btr_block_get(
5362 page_id_t(index->table->space->id,
5363 parent_page_no),
5364 table_page_size,
5365 RW_SX_LATCH, index, &mtr);
5366 }
5367 }
5368
5369 block = btr_block_get(
5370 page_id_t(index->table->space->id, right_page_no),
5371 table_page_size,
5372 RW_SX_LATCH, index, &mtr);
5373
5374 page = buf_block_get_frame(block);
5375
5376 goto loop;
5377 }
5378
5379 mem_heap_free(heap);
5380
5381 return(ret);
5382}
5383
5384/**************************************************************//**
Do an index level validation of a spatial index tree.
5386@return true if no error found */
5387static
5388bool
5389btr_validate_spatial_index(
5390/*=======================*/
5391 dict_index_t* index, /*!< in: index */
5392 const trx_t* trx) /*!< in: transaction or NULL */
5393{
5394
5395 mtr_t mtr;
	bool	ok = true;

	mtr_start(&mtr);

	mtr_x_lock(dict_index_get_lock(index), &mtr);

	page_t*	root = btr_root_get(index, &mtr);
	ulint	n = btr_page_get_level(root);

#ifdef UNIV_RTR_DEBUG
	fprintf(stderr, "R-tree level is %lu\n", n);
#endif /* UNIV_RTR_DEBUG */

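	/* Validate each level in turn, from the root (level n)
	down to the leaf level (0). */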
	for (ulint i = 0; i <= n; ++i) {
#ifdef UNIV_RTR_DEBUG
		fprintf(stderr, "Level %lu:\n", n - i);
#endif /* UNIV_RTR_DEBUG */

		if (!btr_validate_level(index, trx, n - i, true)) {
			ok = false;
			break;
		}
	}

	mtr_commit(&mtr);

	return(ok);
}

/**************************************************************//**
Checks the consistency of an index tree.
@return DB_SUCCESS if ok, error code if not */
dberr_t
btr_validate_index(
/*===============*/
	dict_index_t*	index,	/*!< in: index */
	const trx_t*	trx,	/*!< in: transaction or NULL */
	bool		lockout)/*!< in: true if X-latch index is intended */
{
	dberr_t	err = DB_SUCCESS;

	/* Full-text indexes are implemented by auxiliary tables,
	not by the B-tree. */
	if (dict_index_is_online_ddl(index) || (index->type & DICT_FTS)) {
		return(err);
	}

	if (dict_index_is_spatial(index)) {
		if (!btr_validate_spatial_index(index, trx)) {
			err = DB_ERROR;
		}
		return(err);
	}

	mtr_t	mtr;

	mtr_start(&mtr);

	if (!srv_read_only_mode) {
		if (lockout) {
			mtr_x_lock(dict_index_get_lock(index), &mtr);
		} else {
			mtr_sx_lock(dict_index_get_lock(index), &mtr);
		}
	}

	page_t*	root = btr_root_get(index, &mtr);

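	/* btr_root_get() returns NULL when the root page cannot be
	read; if the index is also flagged unreadable, the cause is
	a decryption failure. */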
	if (root == NULL && !index->is_readable()) {
		err = DB_DECRYPTION_FAILED;
		mtr_commit(&mtr);
		return err;
	}

	ulint	n = btr_page_get_level(root);

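	/* Check each level of the tree, from the root (level n)
	down to the leaf level (0). */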
	for (ulint i = 0; i <= n; ++i) {

		if (!btr_validate_level(index, trx, n - i, lockout)) {
			err = DB_CORRUPTION;
			break;
		}
	}

	mtr_commit(&mtr);

	return(err);
}

/**************************************************************//**
Checks if the page in the cursor can be merged with given page.
If necessary, re-organize the merge_page.
@return true if possible to merge. */
static
bool
btr_can_merge_with_page(
/*====================*/
	btr_cur_t*	cursor,		/*!< in: cursor on the page to merge */
	ulint		page_no,	/*!< in: a sibling page */
	buf_block_t**	merge_block,	/*!< out: the merge block */
	mtr_t*		mtr)		/*!< in: mini-transaction */
{
	dict_index_t*	index;
	page_t*		page;
	ulint		n_recs;
	ulint		data_size;
	ulint		max_ins_size_reorg;
	ulint		max_ins_size;
	buf_block_t*	mblock;
	page_t*		mpage;
	DBUG_ENTER("btr_can_merge_with_page");

	if (page_no == FIL_NULL) {
		*merge_block = NULL;
		DBUG_RETURN(false);
	}

	index = btr_cur_get_index(cursor);
	page = btr_cur_get_page(cursor);

	const page_id_t		page_id(index->table->space->id, page_no);
	const page_size_t	page_size(index->table->space->flags);

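	/* X-latch the candidate sibling page; the latch is held by
	the caller's mini-transaction until it commits. */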
	mblock = btr_block_get(page_id, page_size, RW_X_LATCH, index, mtr);
	mpage = buf_block_get_frame(mblock);

	n_recs = page_get_n_recs(page);
	data_size = page_get_data_size(page);

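	/* First compare against the insert size that would be
	available in mpage after a reorganization; if even that
	cannot accommodate the data_size bytes of records on
	'page', the pages cannot be merged. */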
	max_ins_size_reorg = page_get_max_insert_size_after_reorganize(
		mpage, n_recs);

	if (data_size > max_ins_size_reorg) {
		goto error;
	}

	/* If the compression padding heuristic indicates that merging
	would produce a page packed so densely that compression is
	likely to fail, do not merge the pages. */
	if (page_size.is_compressed() && page_is_leaf(mpage)
	    && (page_get_data_size(mpage) + data_size
		>= dict_index_zip_pad_optimal_page_size(index))) {

		goto error;
	}

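	/* Next, check whether the records would fit without
	reorganizing mpage; if not, attempt a reorganization
	below. */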
	max_ins_size = page_get_max_insert_size(mpage, n_recs);

	if (data_size > max_ins_size) {

		/* We have to reorganize mpage */

		if (!btr_page_reorganize_block(
			    false, page_zip_level, mblock, index, mtr)) {

			goto error;
		}

		max_ins_size = page_get_max_insert_size(mpage, n_recs);

		ut_ad(page_validate(mpage, index));
		ut_ad(max_ins_size == max_ins_size_reorg);

		if (data_size > max_ins_size) {

			/* This should never happen: after a successful
			reorganization, max_ins_size must equal
			max_ins_size_reorg, which we already checked
			above. Keep the check for fault tolerance. */

			goto error;
		}
	}

	*merge_block = mblock;
	DBUG_RETURN(true);

error:
	*merge_block = NULL;
	DBUG_RETURN(false);
}
